All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 00/29] SGI enhancedNFS patches
@ 2009-03-31 20:28 Greg Banks
  2009-03-31 20:28 ` [patch 01/29] knfsd: Add infrastructure for measuring RPC service times Greg Banks
                   ` (29 more replies)
  0 siblings, 30 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

This patchset is a selection of the useful parts of the NFS server
patches which comprise the SGI enhancedNFS product, forward ported,
merged and reorganised.

Bruce: all of these are potentially candidates for 2.6.30.

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 01/29] knfsd: Add infrastructure for measuring RPC service times.
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-04-25  2:13   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 02/29] knfsd: Add stats table infrastructure Greg Banks
                   ` (28 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Two new functions; svc_time_mark() remembers the current time
in a struct svc_time; svc_time_elapsed() calculates and returns
the time since a svc_time was marked.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 include/linux/sunrpc/svc.h |   12 ++++++++++++
 net/sunrpc/svc.c           |   25 +++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

Index: bfields/include/linux/sunrpc/svc.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc.h
+++ bfields/include/linux/sunrpc/svc.h
@@ -18,6 +18,16 @@
 #include <linux/sunrpc/svcauth.h>
 #include <linux/wait.h>
 #include <linux/mm.h>
+#include <linux/time.h>
+
+/*
+ * Structure used to implement a fast lockless elapsed time measure.
+ */
+struct svc_time
+{
+	struct timespec	st_spec;
+};
+
 
 /*
  * This is the RPC server thread function prototype
@@ -419,6 +429,8 @@ int		   svc_register(const struct svc_se
 void		   svc_wake_up(struct svc_serv *);
 void		   svc_reserve(struct svc_rqst *rqstp, int space);
 struct svc_pool *  svc_pool_for_cpu(struct svc_serv *serv, int cpu);
+void		   svc_time_mark(struct svc_time *);
+int		   svc_time_elapsed(const struct svc_time *, struct timespec *);
 char *		   svc_print_addr(struct svc_rqst *, char *, size_t);
 
 #define	RPC_MAX_ADDRBUFLEN	(63U)
Index: bfields/net/sunrpc/svc.c
===================================================================
--- bfields.orig/net/sunrpc/svc.c
+++ bfields/net/sunrpc/svc.c
@@ -1232,3 +1232,28 @@ u32 svc_max_payload(const struct svc_rqs
 	return max;
 }
 EXPORT_SYMBOL_GPL(svc_max_payload);
+
+
+void
+svc_time_mark(struct svc_time *st)
+{
+	getnstimeofday(&st->st_spec);
+}
+EXPORT_SYMBOL(svc_time_mark);
+
+int
+svc_time_elapsed(const struct svc_time *mark, struct timespec *ts)
+{
+	struct svc_time now;
+
+	svc_time_mark(&now);
+
+	if (now.st_spec.tv_sec < mark->st_spec.tv_sec)
+		return -EINVAL;	/* time going backwards */
+
+	*ts = timespec_sub(now.st_spec, mark->st_spec);
+
+	return 0;
+}
+EXPORT_SYMBOL(svc_time_elapsed);
+

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 02/29] knfsd: Add stats table infrastructure.
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
  2009-03-31 20:28 ` [patch 01/29] knfsd: Add infrastructure for measuring RPC service times Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-04-25  3:56   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 03/29] knfsd: add userspace controls for stats tables Greg Banks
                   ` (27 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

This infrastructure will be used to implement per-client and per-export
serverside stats.  Multiple stats objects are kept in a hashtable,
keyed by a string name (e.g. client IP address or export path).
Old entries are pruned from the table using a timer.  The function
nfsd_stats_find() can be used to find an entry and create it if
necessary.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/stats.c            |  231 ++++++++++++++++++++++++++++++++++
 include/linux/nfsd/debug.h |    1 
 include/linux/nfsd/stats.h |   43 ++++++
 3 files changed, 275 insertions(+)

Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -29,17 +29,29 @@
 #include <linux/seq_file.h>
 #include <linux/stat.h>
 #include <linux/module.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/swap.h>
+#include <linux/log2.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/stats.h>
 
+#define NFSDDBG_FACILITY		NFSDDBG_STATS
+
+#define hentry_from_hnode(hn) \
+	hlist_entry((hn), nfsd_stats_hentry_t, se_node)
+
 struct nfsd_stats	nfsdstats;
 struct svc_stat		nfsd_svcstats = {
 	.program	= &nfsd_program,
 };
 
+int nfsd_stats_enabled = 1;
+int nfsd_stats_prune_period = 2*86400;
+
 static int nfsd_proc_show(struct seq_file *seq, void *v)
 {
 	int i;
@@ -98,6 +110,225 @@ static const struct file_operations nfsd
 	.release = single_release,
 };
 
+
+/*
+ * Stats hash pruning works thus.  A scan is run every prune period.
+ * On every scan, hentries with the OLD flag are detached and
+ * a reference dropped (usually that will be the last reference
+ * and the hentry will be deleted).  Hentries without the OLD flag
+ * have the OLD flag set; the flag is reset in nfsd_stats_get().
+ * So hentries with active traffic in the last 2 prune periods
+ * are not candidates for pruning.
+ */
+static void nfsd_stats_prune(unsigned long closure)
+{
+	nfsd_stats_hash_t *sh = (nfsd_stats_hash_t *)closure;
+	unsigned int i;
+	nfsd_stats_hentry_t *se;
+	struct hlist_node *hn, *next;
+	struct hlist_head to_be_dropped = HLIST_HEAD_INIT;
+
+	dprintk("nfsd_stats_prune\n");
+
+	if (!down_write_trylock(&sh->sh_sem)) {
+		/* hash is busy...try again in a second */
+		dprintk("nfsd_stats_prune: busy\n");
+		mod_timer(&sh->sh_prune_timer, jiffies + HZ);
+		return;
+	}
+
+	for (i = 0 ; i < sh->sh_size ; i++) {
+		hlist_for_each_entry_safe(se, hn, next, &sh->sh_hash[i], se_node) {
+			if (!test_and_set_bit(NFSD_STATS_HENTRY_OLD, &se->se_flags))
+				continue;
+			hlist_del_init(&se->se_node);
+			hlist_add_head(&se->se_node, &to_be_dropped);
+		}
+	}
+
+	up_write(&sh->sh_sem);
+
+	dprintk("nfsd_stats_prune: deleting\n");
+	hlist_for_each_entry_safe(se, hn, next, &to_be_dropped, se_node)
+		nfsd_stats_put(se);
+
+	mod_timer(&sh->sh_prune_timer, jiffies + nfsd_stats_prune_period * HZ);
+}
+
+/*
+ * Initialise a stats hash.  Array size scales with
+ * server memory, as a loose heuristic for how many
+ * clients or exports a server is likely to have.
+ */
+static void nfsd_stats_hash_init(nfsd_stats_hash_t *sh, const char *which)
+{
+	unsigned int nbits;
+	unsigned int i;
+
+	init_rwsem(&sh->sh_sem);
+
+	nbits = 5 + ilog2(totalram_pages >> (30-PAGE_SHIFT));
+	sh->sh_size = (1<<nbits);
+	sh->sh_mask = (sh->sh_size-1);
+
+	sh->sh_hash = kmalloc(sizeof(struct hlist_head) * sh->sh_size, GFP_KERNEL);
+	if (sh->sh_hash == NULL) {
+		printk(KERN_ERR "failed to allocate knfsd %s stats hashtable\n", which);
+		/* struggle on... */
+		return;
+	}
+	printk(KERN_INFO "knfsd %s stats hashtable, %u entries\n", which, sh->sh_size);
+
+	for (i = 0 ; i < sh->sh_size ; i++)
+		INIT_HLIST_HEAD(&sh->sh_hash[i]);
+
+	/* start the prune timer */
+	init_timer(&sh->sh_prune_timer);
+	sh->sh_prune_timer.function = nfsd_stats_prune;
+	sh->sh_prune_timer.expires = jiffies + nfsd_stats_prune_period * HZ;
+	sh->sh_prune_timer.data = (unsigned long)sh;
+}
+
+/*
+ * Destroy a stats hash.  Drop what should be the last
+ * reference on all hentries, clean up the timer, and
+ * free the hash array.
+ */
+static void nfsd_stats_hash_destroy(nfsd_stats_hash_t *sh)
+{
+	unsigned int i;
+	nfsd_stats_hentry_t *se;
+
+	del_timer_sync(&sh->sh_prune_timer);
+
+	/* drop the last reference for all remaining hentries */
+	for (i = 0 ; i < sh->sh_size ; i++) {
+		struct hlist_head *hh = &sh->sh_hash[i];
+
+		while (hh->first != NULL) {
+			se = hentry_from_hnode(hh->first);
+			BUG_ON(atomic_read(&se->se_refcount) != 1);
+			nfsd_stats_put(se);
+		}
+	}
+
+	if (sh->sh_hash != NULL) {
+		kfree(sh->sh_hash);
+	}
+}
+
+/*
+ * Find and return a hentry for the given name, with a new refcount,
+ * creating it if necessary.  Will only return NULL on OOM or if
+ * stats are disabled.  Does its own locking using the hash rwsem;
+ * may sleep.
+ */
+nfsd_stats_hentry_t *nfsd_stats_find(nfsd_stats_hash_t *sh,
+				     const char *name, int len)
+{
+	u32 hash;
+	nfsd_stats_hentry_t *se, *new = NULL;
+	struct hlist_node *hn;
+
+	dprintk("nfsd_stats_find: name %s len %d\n", name, len);
+
+	if (!nfsd_stats_enabled || sh->sh_hash == NULL)
+		return NULL;
+
+
+	/* search the hash table */
+	hash = jhash(name, len, 0xfeedbeef) & sh->sh_mask;
+	down_read(&sh->sh_sem);
+	hlist_for_each_entry(se, hn, &sh->sh_hash[hash], se_node) {
+		if (!strcmp(se->se_name, name)) {
+			/* found matching */
+			dprintk("nfsd_stats_find: found %s\n", se->se_name);
+			nfsd_stats_get(se);
+			up_read(&sh->sh_sem);
+			return se;
+		}
+	}
+	up_read(&sh->sh_sem);
+
+	/* not found, create a new one */
+	dprintk("nfsd_stats_find: allocating new for %s\n", name);
+	new = (nfsd_stats_hentry_t *)kmalloc(sizeof(*new), GFP_KERNEL);
+	if (new == NULL)
+		return NULL;
+	/* initialise */
+
+	new->se_name = kmalloc(len+1, GFP_KERNEL);
+	if (new->se_name == NULL) {
+		kfree(new);
+		return NULL;
+	}
+
+	memcpy(new->se_name, name, len+1);
+	atomic_set(&new->se_refcount, 2);/* 1 for the caller, 1 for the hash */
+	new->se_hash = sh;
+	new->se_flags = 0;
+	INIT_HLIST_NODE(&new->se_node);
+	memset(&new->se_data, 0, sizeof(new->se_data));
+
+	/* attach to the hash datastructure */
+
+	/*
+	 * first check to see if we lost a race and some
+	 * other thread already added a matching hentry.
+	 */
+	down_write(&sh->sh_sem);
+	hlist_for_each_entry(se, hn, &sh->sh_hash[hash], se_node) {
+		if (!strcmp(se->se_name, name)) {
+			/* found matching, use that instead */
+			dprintk("nfsd_stats_find: found(2) %s\n", name);
+			kfree(new->se_name);
+			kfree(new);
+			nfsd_stats_get(se);
+			up_write(&sh->sh_sem);
+			return se;
+		}
+	}
+	/* still not there, insert new one into the hash */
+	hlist_add_head(&new->se_node, &sh->sh_hash[hash]);
+
+	up_write(&sh->sh_sem);
+	return new;
+}
+
+/*
+ * Drop a reference to a hentry, deleting the hentry if this
+ * was the last reference.  Does its own locking using the
+ * hash rwsem; may sleep.
+ */
+void
+nfsd_stats_put(nfsd_stats_hentry_t *se)
+{
+	nfsd_stats_hash_t *sh = se->se_hash;
+
+	if (!atomic_dec_and_test(&se->se_refcount))
+		return;
+
+	/* just dropped the last reference */
+	down_write(&sh->sh_sem);
+
+	if (atomic_read(&se->se_refcount)) {
+		/*
+		 * We lost a race getting the write lock, and
+		 * now there's a reference again.  Whatever.
+		 */
+		goto out_unlock;
+	}
+
+	dprintk("nfsd_stats_put: freeing %s\n", se->se_name);
+	hlist_del(&se->se_node);
+	kfree(se->se_name);
+	kfree(se);
+
+out_unlock:
+	up_write(&sh->sh_sem);
+}
+
+
 void
 nfsd_stat_init(void)
 {
Index: bfields/include/linux/nfsd/stats.h
===================================================================
--- bfields.orig/include/linux/nfsd/stats.h
+++ bfields/include/linux/nfsd/stats.h
@@ -40,6 +40,37 @@ struct nfsd_stats {
 
 };
 
+struct nfsd_op_stats {
+	/* nothing to see here, yet */
+};
+
+
+typedef struct nfsd_stats_hash		nfsd_stats_hash_t;
+typedef struct nfsd_stats_hentry	nfsd_stats_hentry_t;
+
+/* Entry in the export and client stats hashtables */
+struct nfsd_stats_hentry {
+	struct hlist_node	se_node;	/* links hash chains */
+	char			*se_name;
+	atomic_t		se_refcount;	/* 1 for each user + 1 for hash */
+#define NFSD_STATS_HENTRY_OLD	0
+	unsigned long		se_flags;
+	nfsd_stats_hash_t	*se_hash;
+	struct nfsd_op_stats	se_data;
+};
+
+/*
+ * Hashtable structure for export and client stats.
+ * Table width is chosen at boot time to scale with
+ * the size of the server.
+ */
+struct nfsd_stats_hash {
+	struct rw_semaphore	sh_sem;
+	unsigned int		sh_size;
+	unsigned int		sh_mask;
+	struct hlist_head	*sh_hash;
+	struct timer_list	sh_prune_timer;
+};
 
 extern struct nfsd_stats	nfsdstats;
 extern struct svc_stat		nfsd_svcstats;
@@ -47,5 +78,17 @@ extern struct svc_stat		nfsd_svcstats;
 void	nfsd_stat_init(void);
 void	nfsd_stat_shutdown(void);
 
+extern nfsd_stats_hentry_t *nfsd_stats_find(nfsd_stats_hash_t *,
+					    const char *name, int len);
+static inline void
+nfsd_stats_get(nfsd_stats_hentry_t *se)
+{
+	atomic_inc(&se->se_refcount);
+	clear_bit(NFSD_STATS_HENTRY_OLD, &se->se_flags);
+}
+extern void nfsd_stats_put(nfsd_stats_hentry_t *se);
+
+
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_NFSD_STATS_H */
Index: bfields/include/linux/nfsd/debug.h
===================================================================
--- bfields.orig/include/linux/nfsd/debug.h
+++ bfields/include/linux/nfsd/debug.h
@@ -32,6 +32,7 @@
 #define NFSDDBG_REPCACHE	0x0080
 #define NFSDDBG_XDR		0x0100
 #define NFSDDBG_LOCKD		0x0200
+#define NFSDDBG_STATS		0x0400
 #define NFSDDBG_ALL		0x7FFF
 #define NFSDDBG_NOCHANGE	0xFFFF
 

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 03/29] knfsd: add userspace controls for stats tables
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
  2009-03-31 20:28 ` [patch 01/29] knfsd: Add infrastructure for measuring RPC service times Greg Banks
  2009-03-31 20:28 ` [patch 02/29] knfsd: Add stats table infrastructure Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-04-25 21:57   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 04/29] knfsd: Add stats updating API Greg Banks
                   ` (26 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Add two control files to /proc/fs/nfsd:

* "stats_enabled" can be used to disable or enable the gathering
   of per-client and per-export statistics in the server.

* "stats_prune_period" can be used to set the period at
   which the pruning timer runs, in seconds.  Unused stats
   entries will survive at most twice that time.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfsctl.c |   99 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

Index: bfields/fs/nfsd/nfsctl.c
===================================================================
--- bfields.orig/fs/nfsd/nfsctl.c
+++ bfields/fs/nfsd/nfsctl.c
@@ -64,6 +64,8 @@ enum {
 	NFSD_Versions,
 	NFSD_Ports,
 	NFSD_MaxBlkSize,
+	NFSD_Stats_Enabled,
+	NFSD_Stats_Prune_Period,
 	/*
 	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -92,6 +94,8 @@ static ssize_t write_pool_threads(struct
 static ssize_t write_versions(struct file *file, char *buf, size_t size);
 static ssize_t write_ports(struct file *file, char *buf, size_t size);
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
+static ssize_t write_stats_enabled(struct file *file, char *buf, size_t size);
+static ssize_t write_stats_prune_period(struct file *file, char *buf, size_t size);
 #ifdef CONFIG_NFSD_V4
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
@@ -113,6 +117,8 @@ static ssize_t (*write_op[])(struct file
 	[NFSD_Versions] = write_versions,
 	[NFSD_Ports] = write_ports,
 	[NFSD_MaxBlkSize] = write_maxblksize,
+	[NFSD_Stats_Enabled] = write_stats_enabled,
+	[NFSD_Stats_Prune_Period] = write_stats_prune_period,
 #ifdef CONFIG_NFSD_V4
 	[NFSD_Leasetime] = write_leasetime,
 	[NFSD_RecoveryDir] = write_recoverydir,
@@ -1121,6 +1127,97 @@ static ssize_t write_maxblksize(struct f
 	return sprintf(buf, "%d\n", nfsd_max_blksize);
 }
 
+extern int nfsd_stats_enabled;
+
+/**
+ * write_stats_enabled - Set or report whether per-client/
+ *			 per-export stats are enabled.
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing an unsigned
+ * 					integer value representing the new value
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing numeric value of the current setting
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_stats_enabled(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	if (size > 0) {
+		int enabled;
+		int rv = get_int(&mesg, &enabled);
+		if (rv)
+			return rv;
+		/* check `enabled' against allowed range */
+		if (enabled < 0 || enabled > 1)
+			return -EINVAL;
+		/*
+		 * We can change the enabled flag at any time without
+		 * locking.  All it controls is whether stats are
+		 * gathered for new incoming NFS calls.  Old gathered
+		 * stats still sit around in the hash tables until
+		 * naturally pruned.
+		 */
+		nfsd_stats_enabled = enabled;
+	}
+	return sprintf(buf, "%d\n", nfsd_stats_enabled);
+}
+
+extern int nfsd_stats_prune_period;
+
+/**
+ * write_stats_prune_period - Set or report the period for pruning
+ *			      old per-client/per-export stats entries,
+ *			      in seconds.
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing an unsigned
+ * 					integer value representing the new value
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing numeric value of the current setting
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
+static ssize_t write_stats_prune_period(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	if (size > 0) {
+		int period;
+		int rv = get_int(&mesg, &period);
+		if (rv)
+			return rv;
+		/* check `period' against allowed range */
+		if (period < 10 || period > 14*86400)
+			return -EINVAL;
+		/*
+		 * We can change the period at any time without
+		 * locking.  All it controls is the timeout on the
+		 * next run of the prune timer.  This might cause
+		 * some unexpected behaviour if the period is
+		 * changed from really high to really low.
+		 */
+		nfsd_stats_prune_period = period;
+	}
+	return sprintf(buf, "%d\n", nfsd_stats_prune_period);
+}
+
 #ifdef CONFIG_NFSD_V4
 extern time_t nfs4_leasetime(void);
 
@@ -1263,6 +1360,8 @@ static int nfsd_fill_super(struct super_
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_Stats_Enabled] = {"stats_enabled", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_Stats_Prune_Period] = {"stats_prune_period", &transaction_ops, S_IWUSR|S_IRUGO},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 04/29] knfsd: Add stats updating API
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (2 preceding siblings ...)
  2009-03-31 20:28 ` [patch 03/29] knfsd: add userspace controls for stats tables Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 05/29] knfsd: Infrastructure for providing stats to userspace Greg Banks
                   ` (25 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Add APIs for the NFS server code to use to update per-client
and per-export statistics.  Functions nfsd_stats_pre() and
nfsd_stats_post() are called before and after each NFS call.  Function
nfsd_stats_update_op() is provided to update the stats for a particular
NFS operation, and several convenience wrappers for it are provided.

Note that the NFS server code needs to be instrumented to call the
nfsd_stats_update_op() function at appropriate times, rather than
have the generic RPC code do those updates.  This is necessary if we
are to support per-export stats, because the operation's file handle
(which tells us the export in use) is not available until after
NFS-specific XDR decoding.

Note also that the API transparently handles the complex way
that NFSv4 maps operations to calls, compared to the old simple
NFSv2/v3 arrangement.  The NFS code just calls nfsd_stats_update_op()
once for each operation.  The first time this happens in each NFS
call, some per-call accounting is done as well as per-op accounting.
The second and subsequent calls to nfsd_stats_update_op() in an NFS
call will only do per-op accounting.  This is designed to simplify
the instrumentation needed in each NFS call code.

Also, fill out the struct nfsd_op_stats with the actual counters.
Note that the "ops" used to store the stats are neither precisely the
NFSv2/3 calls nor the NFSv4 operations.  Instead they're an abstract
set of operations, with some NFS operations merged together where
we're not really interested in the differences (e.g. NFSv3 CREATE,
MKDIR, SYMLINK, and several other calls are lumped together into a
"MKINODE" op).  Conversely, the WRITE call is split into two "ops"
depending on whether it's marked STABLE, because this has a significant
performance effect.

Contains code based on patches from Harshula Jayasuriya <harshula@sgi.com>
and Peter Leckie <pleckie@sgi.com>.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfssvc.c           |    4 +
 fs/nfsd/stats.c            |   97 ++++++++++++++++++++++++++++++
 include/linux/nfsd/stats.h |  107 +++++++++++++++++++++++++++++++++-
 include/linux/sunrpc/svc.h |    1 
 4 files changed, 208 insertions(+), 1 deletion(-)

Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -328,6 +328,103 @@ out_unlock:
 	up_write(&sh->sh_sem);
 }
 
+static void __nfsd_stats_begin_call(struct svc_rqst *rqstp,
+				    nfsd_stats_hentry_t *se)
+{
+	struct nfsd_op_stats *os = &se->se_data;
+
+	os->os_bytes_in += rqstp->rq_arg.len;
+}
+
+static void __nfsd_stats_end_call(struct svc_rqst *rqstp,
+				  nfsd_stats_hentry_t *se,
+				  int tb)
+{
+	struct nfsd_op_stats *os = &se->se_data;
+
+	if (tb >= 0)
+		os->os_service_times[tb]++;
+	os->os_bytes_out += rqstp->rq_res.len;
+
+	if (rqstp->rq_prog == NFS_PROGRAM) {
+		if (rqstp->rq_vers >= 2 && rqstp->rq_vers <= 4)
+			os->os_versions[rqstp->rq_vers-2]++;
+	}
+
+	switch (rqstp->rq_prot)
+	{
+	case IPPROTO_TCP:
+		os->os_transports[NFSD_STATS_TRANSPORT_TCP]++;
+		break;
+	case IPPROTO_UDP:
+		os->os_transports[NFSD_STATS_TRANSPORT_UDP]++;
+		break;
+#if defined(CONFIG_SUNRPC_XPRT_RDMA) || defined(CONFIG_SUNRPC_XPRT_RDMA_MODULE)
+	case IPPROTO_MAX:
+		os->os_transports[NFSD_STATS_TRANSPORT_RDMA]++;
+		break;
+#endif
+	}
+}
+
+static void __nfsd_stats_op(struct svc_rqst *rqstp,
+			    nfsd_stats_hentry_t *se,
+			    int rbucket, int wbucket, int op)
+{
+	struct nfsd_op_stats *os = &se->se_data;
+
+	if (rbucket >= 0)
+		os->os_read_sizes[rbucket]++;
+	if (wbucket >= 0)
+		os->os_write_sizes[wbucket]++;
+
+	os->os_ops[op]++;
+}
+
+void nfsd_stats_update_op(struct svc_rqst *rqstp, struct svc_fh *fh,
+			  int rbucket, int wbucket, int op)
+{
+	if (!nfsd_stats_enabled)
+		return;
+
+	/* interesting things happen here 2 patches hence */
+}
+
+void nfsd_stats_pre(struct svc_rqst *rqstp)
+{
+	svc_time_mark(&rqstp->rq_start_time);
+}
+
+static inline int time_bucket(const struct timespec *ts)
+{
+	int i = 0;
+	unsigned long ns = ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
+
+	if (ns) {
+		ns = (ns-1) >> 11;	/* smallest bucket is 256 usec */
+		while (i < NFSD_STATS_SVCTIME_NUM)
+		{
+			if (!ns)
+				break;
+			i++;
+			ns >>= 1;
+		}
+	}
+	return i;
+}
+
+void nfsd_stats_post(struct svc_rqst *rqstp)
+{
+	int tb = -1;
+	struct timespec svctime;
+
+	/* calculate service time and update the stats */
+	if (svc_time_elapsed(&rqstp->rq_start_time, &svctime) == 0)
+		tb = time_bucket(&svctime);
+
+	/* interesting things happen here 2 patches hence */
+}
+
 
 void
 nfsd_stat_init(void)
Index: bfields/include/linux/nfsd/stats.h
===================================================================
--- bfields.orig/include/linux/nfsd/stats.h
+++ bfields/include/linux/nfsd/stats.h
@@ -41,7 +41,60 @@ struct nfsd_stats {
 };
 
 struct nfsd_op_stats {
-	/* nothing to see here, yet */
+#define NFSD_STATS_OP_FSINFO	0	/* includes NULLPROC,FSSTAT,FSINFO,
+					 * PATHCONF,ROOT(v2),WRITECACHE(v2) */
+#define NFSD_STATS_OP_MKINODE	1	/* includes CREATE,MKDIR,SYMLINK,
+					 * MKNOD,RENAME,LINK */
+#define NFSD_STATS_OP_GETATTR	2
+#define NFSD_STATS_OP_SETATTR	3
+#define NFSD_STATS_OP_LOOKUP	4
+#define NFSD_STATS_OP_ACCESS	5
+#define NFSD_STATS_OP_READ	6	/* includes READ,READLINK */
+#define NFSD_STATS_OP_SWRITE	7	/* sync WRITEs */
+#define NFSD_STATS_OP_AWRITE	8	/* async WRITEs */
+#define NFSD_STATS_OP_COMMIT	9
+#define NFSD_STATS_OP_REMOVE	10
+#define NFSD_STATS_OP_RMDIR	11
+#define NFSD_STATS_OP_READDIR	12
+#define NFSD_STATS_OP_READDIRP	13
+#define NFSD_STATS_OP_XATTR	14	/* includes all NFSACL ops too */
+#define NFSD_STATS_OP_LOCKD	15	/* all LOCKD ops except: */
+#define NFSD_STATS_OP_SHARE	16	/* includes LOCKD's SHARE,UNSHARE */
+#define NFSD_STATS_OP_GRANTED	17	/* includes LOCKD's GRANTED */
+#define NFSD_STATS_OP_NUM	18
+	unsigned long	os_ops[NFSD_STATS_OP_NUM];
+	unsigned long	os_bytes_in;
+	unsigned long	os_bytes_out;
+#define NFSD_STATS_SIZE_LE4K		0
+#define NFSD_STATS_SIZE_LE8K		1
+#define NFSD_STATS_SIZE_LE16K		2
+#define NFSD_STATS_SIZE_LE32K		3
+#define NFSD_STATS_SIZE_LE64K		4
+#define NFSD_STATS_SIZE_LE128K		5
+#define NFSD_STATS_SIZE_LE256K		6
+#define NFSD_STATS_SIZE_LE512K		7
+#define NFSD_STATS_SIZE_GT512K		8
+#define NFSD_STATS_SIZE_NUM		9
+	unsigned long	os_read_sizes[NFSD_STATS_SIZE_NUM];
+	unsigned long	os_write_sizes[NFSD_STATS_SIZE_NUM];
+#define NFSD_STATS_TRANSPORT_UDP	0
+#define NFSD_STATS_TRANSPORT_TCP	1
+#define NFSD_STATS_TRANSPORT_RDMA	2
+#define NFSD_STATS_TRANSPORT_NUM	3
+	unsigned long	os_transports[NFSD_STATS_TRANSPORT_NUM];
+#define NFSD_STATS_VERSION_V2		0
+#define NFSD_STATS_VERSION_V3		1
+#define NFSD_STATS_VERSION_V4		2
+#define NFSD_STATS_VERSION_NUM		3
+	unsigned long	os_versions[NFSD_STATS_VERSION_NUM];
+#define NFSD_STATS_SVCTIME_LE256US	0
+#define NFSD_STATS_SVCTIME_LE1MS	1
+#define NFSD_STATS_SVCTIME_LE4MS	2
+#define NFSD_STATS_SVCTIME_LE16MS	3
+#define NFSD_STATS_SVCTIME_LE64MS	4
+#define NFSD_STATS_SVCTIME_GT64MS	5
+#define NFSD_STATS_SVCTIME_NUM		6
+	unsigned long	os_service_times[NFSD_STATS_SVCTIME_NUM];
 };
 
 
@@ -88,6 +141,58 @@ nfsd_stats_get(nfsd_stats_hentry_t *se)
 }
 extern void nfsd_stats_put(nfsd_stats_hentry_t *se);
 
+extern void nfsd_stats_update_op(struct svc_rqst *rqstp, struct svc_fh *fh,
+				 int rbucket, int wbucket, int op);
+
+static inline void nfsd_stats_update(struct svc_rqst *rqstp, struct svc_fh *fh,
+				     int op)
+{
+	nfsd_stats_update_op(rqstp, fh, -1, -1, op);
+}
+
+static inline int nfsd_stats_size_bucket(unsigned int size)
+{
+	int i = 0;
+
+	if (size) {
+		size = (size-1) >> 12;	/* smallest bucket is 4K */
+		while (i < NFSD_STATS_SIZE_NUM-1)
+		{
+			if (!size)
+				break;
+			i++;
+			size >>= 1;
+		}
+	}
+	return i;
+}
+
+static inline void nfsd_stats_update_read(struct svc_rqst *rqstp,
+					  struct svc_fh *fh,
+					  unsigned int size)
+{
+	nfsd_stats_update_op(rqstp,fh,
+			     nfsd_stats_size_bucket(size),
+			     /*wbucket*/-1,
+			     NFSD_STATS_OP_READ);
+}
+
+static inline void nfsd_stats_update_write(struct svc_rqst *rqstp,
+					   struct svc_fh *fh,
+					   unsigned int size, int stable)
+{
+	nfsd_stats_update_op(rqstp,fh,
+			     /*rbucket*/-1,
+			     nfsd_stats_size_bucket(size),
+			     (stable ? NFSD_STATS_OP_SWRITE :
+				       NFSD_STATS_OP_AWRITE));
+}
+/* nfsd calls this before servicing a request */
+void nfsd_stats_pre(struct svc_rqst *rqstp);
+/* nfsd calls this after servicing a request */
+void nfsd_stats_post(struct svc_rqst *rqstp);
+
+
 
 
 #endif /* __KERNEL__ */
Index: bfields/include/linux/sunrpc/svc.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc.h
+++ bfields/include/linux/sunrpc/svc.h
@@ -290,6 +290,7 @@ struct svc_rqst {
 	wait_queue_head_t	rq_wait;	/* synchronization */
 	struct task_struct	*rq_task;	/* service thread */
 	int			rq_waking;	/* 1 if thread is being woken */
+	struct svc_time		rq_start_time;
 };
 
 /*
Index: bfields/fs/nfsd/nfssvc.c
===================================================================
--- bfields.orig/fs/nfsd/nfssvc.c
+++ bfields/fs/nfsd/nfssvc.c
@@ -468,8 +468,12 @@ nfsd(void *vrqstp)
 		/* Lock the export hash tables for reading. */
 		exp_readlock();
 
+		nfsd_stats_pre(rqstp);
+
 		svc_process(rqstp);
 
+		nfsd_stats_post(rqstp);
+
 		/* Unlock export hash tables */
 		exp_readunlock();
 	}

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 05/29] knfsd: Infrastructure for providing stats to userspace
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (3 preceding siblings ...)
  2009-03-31 20:28 ` [patch 04/29] knfsd: Add stats updating API Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-04-01  0:28   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 06/29] knfsd: Gather per-export stats Greg Banks
                   ` (24 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Added iteration and seq_file infrastructure to allow implementing
a /proc file which exports all the entries in a stats hashtable as
text to userspace.  Function nfsd_stats_open() is called in the /proc
file's open method and handles all the subsequent details.

Like all RPC statistics, the format is designed to be easy to parse
in shell scripts and C code.  Counter values are presented in text
form, grouped into lines which start with a two-letter keyword.
For example, the line "by 2680 487656" shows that 2680 bytes of NFS
calls have been received and 487656 bytes of replies have been sent.
The special "nm" keyword starts a new entry and shows it's internal
name, e.g. "nm 192.168.67.45" in the per-client statistics file will
begin the entry for the client whose IP address is 192.168.67.45.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/stats.c            |  173 ++++++++++++++++++++++++++++++++++
 include/linux/nfsd/stats.h |   11 ++
 2 files changed, 184 insertions(+)

Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -426,6 +426,179 @@ void nfsd_stats_post(struct svc_rqst *rq
 }
 
 
+static nfsd_stats_hentry_t *nfsd_stats_hiter_first(nfsd_stats_hiter_t *itr)
+{
+	for (itr->bucket = 0 ;
+	     itr->bucket < itr->sh->sh_size ;
+	     itr->bucket++) {
+		struct hlist_head *hh = &itr->sh->sh_hash[itr->bucket];
+		if (hh->first != NULL)
+			return hentry_from_hnode(hh->first);
+	}
+	return NULL;
+}
+
+static nfsd_stats_hentry_t *nfsd_stats_hiter_next(nfsd_stats_hiter_t *itr,
+						  nfsd_stats_hentry_t *se)
+{
+	struct hlist_head *hh;
+
+	for (;;) {
+		if (se->se_node.next != NULL)
+			return hentry_from_hnode(se->se_node.next);
+		if (++itr->bucket >= itr->sh->sh_size)
+			return NULL;	/* finished iterating */
+		hh = &itr->sh->sh_hash[itr->bucket];
+		if (hh->first != NULL)
+			return hentry_from_hnode(hh->first);
+	}
+}
+
+static nfsd_stats_hentry_t *nfsd_stats_hiter_seek(nfsd_stats_hiter_t *itr,
+						  loff_t pos)
+{
+	nfsd_stats_hentry_t *se;
+
+	for (se = nfsd_stats_hiter_first(itr) ;
+	     se != NULL ;
+	     se = nfsd_stats_hiter_next(itr, se)) {
+		if (!--pos)
+			return se;
+	}
+	return NULL;
+}
+
+static void *nfsd_stats_start(struct seq_file *m, loff_t *pos)
+{
+	nfsd_stats_hiter_t *itr = m->private;
+
+	dprintk("nfsd_stats_start, *pos=%d\n", (int)*pos);
+	down_read(&itr->sh->sh_sem);
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	return nfsd_stats_hiter_seek(itr, *pos);
+}
+
+static void *nfsd_stats_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	nfsd_stats_hiter_t *itr = m->private;
+	nfsd_stats_hentry_t *se = p;
+
+	dprintk("nfsd_stats_next, *pos=%llu bucket=%d\n", *pos, itr->bucket);
+
+	if (p == SEQ_START_TOKEN)
+		se = nfsd_stats_hiter_first(itr);
+	else
+		se = nfsd_stats_hiter_next(itr, se);
+	++*pos;
+	return se;
+}
+
+static void nfsd_stats_stop(struct seq_file *m, void *p)
+{
+	nfsd_stats_hiter_t *itr = m->private;
+
+	up_read(&itr->sh->sh_sem);
+}
+
+static int nfsd_stats_show(struct seq_file *m, void *p)
+{
+	nfsd_stats_hentry_t *se = p;
+	struct nfsd_op_stats *os = &se->se_data;
+	int i;
+
+	if (p == SEQ_START_TOKEN) {
+		seq_puts(m, "# Version 1.0\n");
+		return 0;
+	}
+
+	dprintk("nfsd_stats_show %s\n",  se->se_name);
+
+	seq_puts(m, "nm ");
+	seq_escape(m, se->se_name, " \t\n\\");
+	seq_printf(m, "\n");
+
+	/* histogram of operations */
+	seq_puts(m, "op");
+	for (i = 0 ; i < NFSD_STATS_OP_NUM ; i++)
+		seq_printf(m, " %lu", os->os_ops[i]);
+	seq_putc(m, '\n');
+
+	/* bytes in and out */
+	seq_printf(m, "by %lu %lu\n", os->os_bytes_in, os->os_bytes_out);
+
+	/* histogram of read sizes */
+	seq_puts(m, "rs");
+	for (i = 0 ; i < NFSD_STATS_SIZE_NUM ; i++)
+		seq_printf(m, " %lu", os->os_read_sizes[i]);
+	seq_putc(m, '\n');
+
+	/* histogram of write sizes */
+	seq_puts(m, "ws");
+	for (i = 0 ; i < NFSD_STATS_SIZE_NUM ; i++)
+		seq_printf(m, " %lu", os->os_write_sizes[i]);
+	seq_putc(m, '\n');
+
+	/* counts of operations by transport */
+	seq_printf(m, "tr udp %lu\n",
+		   os->os_transports[NFSD_STATS_TRANSPORT_UDP]);
+	seq_printf(m, "tr tcp %lu\n",
+		   os->os_transports[NFSD_STATS_TRANSPORT_TCP]);
+#if defined(CONFIG_NFSD_RDMA) || defined(CONFIG_NFSD_RDMA_MODULE)
+	seq_printf(m, "tr rdma %lu\n",
+		   os->os_transports[NFSD_STATS_TRANSPORT_RDMA]);
+#endif
+
+	/* counts of operations by version */
+	seq_printf(m, "ve 2 %lu\n",
+		   os->os_versions[NFSD_STATS_VERSION_V2]);
+	seq_printf(m, "ve 3 %lu\n",
+		   os->os_versions[NFSD_STATS_VERSION_V3]);
+	seq_printf(m, "ve 4 %lu\n",
+		   os->os_versions[NFSD_STATS_VERSION_V4]);
+
+	/* histogram of service times */
+	seq_puts(m, "st");
+	for (i = 0 ; i < NFSD_STATS_SVCTIME_NUM ; i++)
+		seq_printf(m, " %lu", os->os_service_times[i]);
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static struct seq_operations nfsd_stats_seq_ops = {
+	.start	= nfsd_stats_start,
+	.next	= nfsd_stats_next,
+	.stop	= nfsd_stats_stop,
+	.show	= nfsd_stats_show,
+};
+
+int nfsd_stats_open(struct file *file, nfsd_stats_hash_t *sh)
+{
+	int err;
+	nfsd_stats_hiter_t *itr;
+
+	if (sh->sh_hash == NULL)
+		return -ENOENT;
+
+	if ((itr = kmalloc(sizeof(*itr), GFP_KERNEL)) == NULL)
+		return -ENOMEM;
+
+	if ((err = seq_open(file, &nfsd_stats_seq_ops))) {
+		kfree(itr);
+		return err;
+	}
+
+	itr->sh = sh;
+	itr->bucket = 0;
+	((struct seq_file *) file->private_data)->private = itr;
+
+	return 0;
+}
+
+
 void
 nfsd_stat_init(void)
 {
Index: bfields/include/linux/nfsd/stats.h
===================================================================
--- bfields.orig/include/linux/nfsd/stats.h
+++ bfields/include/linux/nfsd/stats.h
@@ -100,6 +100,7 @@ struct nfsd_op_stats {
 
 typedef struct nfsd_stats_hash		nfsd_stats_hash_t;
 typedef struct nfsd_stats_hentry	nfsd_stats_hentry_t;
+typedef struct nfsd_stats_hiter		nfsd_stats_hiter_t;
 
 /* Entry in the export and client stats hashtables */
 struct nfsd_stats_hentry {
@@ -125,6 +126,13 @@ struct nfsd_stats_hash {
 	struct timer_list	sh_prune_timer;
 };
 
+/* Hashtable iteration state used during seq_file traversal */
+struct nfsd_stats_hiter {
+	nfsd_stats_hash_t *sh;
+	int bucket;
+};
+
+
 extern struct nfsd_stats	nfsdstats;
 extern struct svc_stat		nfsd_svcstats;
 
@@ -192,6 +200,9 @@ void nfsd_stats_pre(struct svc_rqst *rqs
 /* nfsd calls this after servicing a request */
 void nfsd_stats_post(struct svc_rqst *rqstp);
 
+/* open the hash for a seq_file pass to userspace */
+int nfsd_stats_open(struct file *file, nfsd_stats_hash_t *sh);
+
 
 
 

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 06/29] knfsd: Gather per-export stats
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (4 preceding siblings ...)
  2009-03-31 20:28 ` [patch 05/29] knfsd: Infrastructure for providing stats to userspace Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 07/29] knfsd: Prefetch the per-export stats entry Greg Banks
                   ` (23 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Uses the new stats infrastructure to record and export NFS statistics
on a per-export basis.  The export is chosen according to the first
filehandle presented in the incoming call.  If an NFSv4 call references
filehandles on multiple exports, all statistics will be recorded
against the first one.  If the call does not reference any filehandles
(e.g. NFSv3 NULL call), it will not be counted in the per-export stats
(although later it will be counted in the per-client stats).

A file /proc/fs/nfsd/export_stats is provided to allow userspace
programs to read the statistics.

To avoid a hash lookup in a locked global table on every operation,
the stats entry is cached on the struct svc_export.

Contains code based on a patch from Harshula Jayasuriya <harshula@sgi.com>.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/export.c            |   33 +++++++++++++++++++++++++++++++
 fs/nfsd/nfsctl.c            |   15 ++++++++++++++
 fs/nfsd/stats.c             |   35 +++++++++++++++++++++++++++++++--
 include/linux/nfsd/export.h |    1 
 include/linux/nfsd/stats.h  |    1 
 include/linux/sunrpc/svc.h  |    1 
 6 files changed, 84 insertions(+), 2 deletions(-)

Index: bfields/fs/nfsd/export.c
===================================================================
--- bfields.orig/fs/nfsd/export.c
+++ bfields/fs/nfsd/export.c
@@ -27,6 +27,7 @@
 #include <linux/hash.h>
 #include <linux/module.h>
 #include <linux/exportfs.h>
+#include <linux/list.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -44,6 +45,7 @@ typedef struct svc_export	svc_export;
 
 static void		exp_do_unexport(svc_export *unexp);
 static int		exp_verify_string(char *cp, int max);
+static nfsd_stats_hentry_t *exp_stats_find(struct path *);
 
 /*
  * We have two caches.
@@ -333,6 +335,8 @@ static void svc_export_put(struct kref *
 	auth_domain_put(exp->ex_client);
 	kfree(exp->ex_pathname);
 	nfsd4_fslocs_free(&exp->ex_fslocs);
+	if (exp->ex_stats)
+		nfsd_stats_put(exp->ex_stats);
 	kfree(exp);
 }
 
@@ -673,6 +677,34 @@ static int svc_export_match(struct cache
 		orig->ex_path.mnt == new->ex_path.mnt;
 }
 
+/*
+ * Find and return a stats hentry in the export stats hash,
+ * given the mount+dentry for the export, creating it if
+ * necessary.  Will return NULL on OOM or if stats disabled.
+ */
+static nfsd_stats_hentry_t *exp_stats_find(struct path *pp)
+{
+	char *buf, *pathname;
+	int len;
+	nfsd_stats_hentry_t *se = NULL;
+
+	dprintk("exp_stats_find: mnt %p dentry %p\n", pp->mnt, pp->dentry);
+
+	/* construct the export's path in a temporary page */
+	buf = (char *)__get_free_page(GFP_KERNEL);
+	if (buf == NULL)
+		return NULL;
+
+	pathname = d_path(pp, buf, PAGE_SIZE);
+	if (!IS_ERR(pathname)) {
+		len = buf + PAGE_SIZE - 1 - pathname;
+		se = nfsd_stats_find(&nfsd_export_stats_hash, pathname, len);
+	}
+
+	free_page((unsigned long)buf);
+	return se;
+}
+
 static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 {
 	struct svc_export *new = container_of(cnew, struct svc_export, h);
@@ -686,6 +718,7 @@ static void svc_export_init(struct cache
 	new->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = 0;
+	new->ex_stats = exp_stats_find(&new->ex_path);
 }
 
 static void export_update(struct cache_head *cnew, struct cache_head *citem)
Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -49,6 +49,7 @@ struct svc_stat		nfsd_svcstats = {
 	.program	= &nfsd_program,
 };
 
+nfsd_stats_hash_t nfsd_export_stats_hash;
 int nfsd_stats_enabled = 1;
 int nfsd_stats_prune_period = 2*86400;
 
@@ -384,15 +385,34 @@ static void __nfsd_stats_op(struct svc_r
 void nfsd_stats_update_op(struct svc_rqst *rqstp, struct svc_fh *fh,
 			  int rbucket, int wbucket, int op)
 {
+    	nfsd_stats_hentry_t *se;
+
 	if (!nfsd_stats_enabled)
 		return;
 
-	/* interesting things happen here 2 patches hence */
+	/* first op in the call: find and cache per-export stats */
+	if (fh != NULL &&
+	    fh->fh_export != NULL &&
+	    (se = fh->fh_export->ex_stats) != NULL &&
+	    rqstp->rq_export_stats == NULL) {
+		/*
+		 * We want the stats to survive fh_put() of the filehandle
+		 * so we can update os_bytes_out and service time in
+		 * nfsd_stats_post().  So grab a reference here.
+		 */
+		nfsd_stats_get(se);
+		rqstp->rq_export_stats = se;
+		__nfsd_stats_begin_call(rqstp, se);
+	}
+	/* all ops in the call: update per-export stats */
+	if (rqstp->rq_export_stats)
+		__nfsd_stats_op(rqstp, rqstp->rq_export_stats, rbucket, wbucket, op);
 }
 
 void nfsd_stats_pre(struct svc_rqst *rqstp)
 {
 	svc_time_mark(&rqstp->rq_start_time);
+	rqstp->rq_export_stats = NULL;
 }
 
 static inline int time_bucket(const struct timespec *ts)
@@ -418,11 +438,18 @@ void nfsd_stats_post(struct svc_rqst *rq
 	int tb = -1;
 	struct timespec svctime;
 
+	if (rqstp->rq_export_stats == NULL)
+		return;
+
 	/* calculate service time and update the stats */
 	if (svc_time_elapsed(&rqstp->rq_start_time, &svctime) == 0)
 		tb = time_bucket(&svctime);
 
-	/* interesting things happen here 2 patches hence */
+	if (rqstp->rq_export_stats != NULL) {
+		__nfsd_stats_end_call(rqstp, rqstp->rq_export_stats, tb);
+		nfsd_stats_put(rqstp->rq_export_stats);
+		rqstp->rq_export_stats = NULL;
+	}
 }
 
 
@@ -603,10 +630,14 @@ void
 nfsd_stat_init(void)
 {
 	svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops);
+
+	nfsd_stats_hash_init(&nfsd_export_stats_hash, "export");
 }
 
 void
 nfsd_stat_shutdown(void)
 {
 	svc_proc_unregister("nfsd");
+
+	nfsd_stats_hash_destroy(&nfsd_export_stats_hash);
 }
Index: bfields/include/linux/nfsd/export.h
===================================================================
--- bfields.orig/include/linux/nfsd/export.h
+++ bfields/include/linux/nfsd/export.h
@@ -92,6 +92,7 @@ struct svc_export {
 	struct nfsd4_fs_locations ex_fslocs;
 	int			ex_nflavors;
 	struct exp_flavor_info	ex_flavors[MAX_SECINFO_LIST];
+	struct nfsd_stats_hentry *ex_stats;
 };
 
 /* an "export key" (expkey) maps a filehandlefragement to an
Index: bfields/include/linux/nfsd/stats.h
===================================================================
--- bfields.orig/include/linux/nfsd/stats.h
+++ bfields/include/linux/nfsd/stats.h
@@ -135,6 +135,7 @@ struct nfsd_stats_hiter {
 
 extern struct nfsd_stats	nfsdstats;
 extern struct svc_stat		nfsd_svcstats;
+extern nfsd_stats_hash_t	nfsd_export_stats_hash;
 
 void	nfsd_stat_init(void);
 void	nfsd_stat_shutdown(void);
Index: bfields/fs/nfsd/nfsctl.c
===================================================================
--- bfields.orig/fs/nfsd/nfsctl.c
+++ bfields/fs/nfsd/nfsctl.c
@@ -66,6 +66,7 @@ enum {
 	NFSD_MaxBlkSize,
 	NFSD_Stats_Enabled,
 	NFSD_Stats_Prune_Period,
+	NFSD_Export_Stats,
 	/*
 	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -179,6 +180,19 @@ static const struct file_operations expo
 	.owner		= THIS_MODULE,
 };
 
+static int export_stats_open(struct inode *inode, struct file *file)
+{
+	return nfsd_stats_open(file, &nfsd_export_stats_hash);
+}
+
+static struct file_operations export_stats_operations = {
+	.open		= export_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+	.owner		= THIS_MODULE,
+};
+
 extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
 
 static struct file_operations pool_stats_operations = {
@@ -1362,6 +1376,7 @@ static int nfsd_fill_super(struct super_
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_Stats_Enabled] = {"stats_enabled", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_Stats_Prune_Period] = {"stats_prune_period", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_Export_Stats] = {"export_stats", &export_stats_operations, S_IRUGO},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
Index: bfields/include/linux/sunrpc/svc.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc.h
+++ bfields/include/linux/sunrpc/svc.h
@@ -291,6 +291,7 @@ struct svc_rqst {
 	struct task_struct	*rq_task;	/* service thread */
 	int			rq_waking;	/* 1 if thread is being woken */
 	struct svc_time		rq_start_time;
+	struct nfsd_stats_hentry *rq_export_stats;
 };
 
 /*

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 07/29] knfsd: Prefetch the per-export stats entry
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (5 preceding siblings ...)
  2009-03-31 20:28 ` [patch 06/29] knfsd: Gather per-export stats Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 08/29] knfsd: Gather per-client stats Greg Banks
                   ` (22 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

On most large NFS servers, there are very few per-export stats
entries and each one will be used on multiple CPUs.  This tends to
make them a contention point on high call rate workloads (which is
why the per-export and per-client stats are designed to be disabled
at runtime).  Experiment showed that prefetching the stats entry had
a small positive benefit on this contention.

If I was to do this properly, the per-export stats objects would
be stored per-cpu and stitched together on demand for export to
userspace.  However, that would make the reference counting and
pruning semantics....interesting.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/stats.c |   11 +++++++++++
 1 file changed, 11 insertions(+)

Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -33,6 +33,7 @@
 #include <linux/list.h>
 #include <linux/swap.h>
 #include <linux/log2.h>
+#include <linux/prefetch.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
@@ -44,6 +45,15 @@
 #define hentry_from_hnode(hn) \
 	hlist_entry((hn), nfsd_stats_hentry_t, se_node)
 
+static inline void nfsd_stats_prefetch(nfsd_stats_hentry_t *se)
+{
+	unsigned int i;
+
+	for (i = 0 ; i < sizeof(nfsd_stats_hentry_t) ; i += PREFETCH_STRIDE)
+		prefetch((char *)se+i);
+}
+
+
 struct nfsd_stats	nfsdstats;
 struct svc_stat		nfsd_svcstats = {
 	.program	= &nfsd_program,
@@ -395,6 +405,7 @@ void nfsd_stats_update_op(struct svc_rqs
 	    fh->fh_export != NULL &&
 	    (se = fh->fh_export->ex_stats) != NULL &&
 	    rqstp->rq_export_stats == NULL) {
+	    	nfsd_stats_prefetch(se);
 		/*
 		 * We want the stats to survive fh_put() of the filehandle
 		 * so we can update os_bytes_out and service time in

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 08/29] knfsd: Gather per-client stats
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (6 preceding siblings ...)
  2009-03-31 20:28 ` [patch 07/29] knfsd: Prefetch the per-export stats entry Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 09/29] knfsd: Cache per-client stats entry on TCP transports Greg Banks
                   ` (21 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Uses the new stats infrastructure to record and export NFS statistics
on a per-client basis.  

A file /proc/fs/nfsd/client_stats is provided to allow userspace
programs to read the statistics.

Unfortunately, we use a hash lookup in a locked global table on
every operation.  The next patch avoids that for the common TCP case.

Contains code based on a patch from Harshula Jayasuriya <harshula@sgi.com>.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfsctl.c                |   15 ++++++++++++++
 fs/nfsd/stats.c                 |   30 ++++++++++++++++++++++++++++-
 include/linux/nfsd/stats.h      |    1 
 include/linux/sunrpc/svc.h      |    1 
 include/linux/sunrpc/svc_xprt.h |   28 +++++++++++++++++++++++++++
 5 files changed, 74 insertions(+), 1 deletion(-)

Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -37,6 +37,7 @@
 
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/stats.h>
 
@@ -60,6 +61,7 @@ struct svc_stat		nfsd_svcstats = {
 };
 
 nfsd_stats_hash_t nfsd_export_stats_hash;
+nfsd_stats_hash_t nfsd_client_stats_hash;
 int nfsd_stats_enabled = 1;
 int nfsd_stats_prune_period = 2*86400;
 
@@ -418,12 +420,30 @@ void nfsd_stats_update_op(struct svc_rqs
 	/* all ops in the call: update per-export stats */
 	if (rqstp->rq_export_stats)
 		__nfsd_stats_op(rqstp, rqstp->rq_export_stats, rbucket, wbucket, op);
+
+	/* first op in the call: find and cache per-client stats */
+	if (rqstp->rq_client_stats == NULL) {
+		char *client, buf[SVC_FORMAT_ADDR_MAX];
+		se = NULL;
+		client = __svc_format_addr(svc_addr(rqstp), buf, sizeof(buf));
+		if (client != NULL)
+			se = nfsd_stats_find(&nfsd_client_stats_hash, client, strlen(client));
+		if (se != NULL) {
+			/* take over the new reference from nfsd_stats_find() */
+			rqstp->rq_client_stats = se;
+			__nfsd_stats_begin_call(rqstp, se);
+		}
+	}
+	/* all ops in the call: update per-client stats */
+	if (rqstp->rq_client_stats)
+		__nfsd_stats_op(rqstp, rqstp->rq_client_stats, rbucket, wbucket, op);
 }
 
 void nfsd_stats_pre(struct svc_rqst *rqstp)
 {
 	svc_time_mark(&rqstp->rq_start_time);
 	rqstp->rq_export_stats = NULL;
+	rqstp->rq_client_stats = NULL;
 }
 
 static inline int time_bucket(const struct timespec *ts)
@@ -449,7 +469,7 @@ void nfsd_stats_post(struct svc_rqst *rq
 	int tb = -1;
 	struct timespec svctime;
 
-	if (rqstp->rq_export_stats == NULL)
+	if (rqstp->rq_export_stats == NULL && rqstp->rq_client_stats == NULL)
 		return;
 
 	/* calculate service time and update the stats */
@@ -461,6 +481,12 @@ void nfsd_stats_post(struct svc_rqst *rq
 		nfsd_stats_put(rqstp->rq_export_stats);
 		rqstp->rq_export_stats = NULL;
 	}
+
+	if (rqstp->rq_client_stats != NULL) {
+		__nfsd_stats_end_call(rqstp, rqstp->rq_client_stats, tb);
+		nfsd_stats_put(rqstp->rq_client_stats);
+		rqstp->rq_client_stats = NULL;
+	}
 }
 
 
@@ -643,6 +669,7 @@ nfsd_stat_init(void)
 	svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops);
 
 	nfsd_stats_hash_init(&nfsd_export_stats_hash, "export");
+	nfsd_stats_hash_init(&nfsd_client_stats_hash, "client");
 }
 
 void
@@ -651,4 +678,5 @@ nfsd_stat_shutdown(void)
 	svc_proc_unregister("nfsd");
 
 	nfsd_stats_hash_destroy(&nfsd_export_stats_hash);
+	nfsd_stats_hash_destroy(&nfsd_client_stats_hash);
 }
Index: bfields/include/linux/nfsd/stats.h
===================================================================
--- bfields.orig/include/linux/nfsd/stats.h
+++ bfields/include/linux/nfsd/stats.h
@@ -136,6 +136,7 @@ struct nfsd_stats_hiter {
 extern struct nfsd_stats	nfsdstats;
 extern struct svc_stat		nfsd_svcstats;
 extern nfsd_stats_hash_t	nfsd_export_stats_hash;
+extern nfsd_stats_hash_t	nfsd_client_stats_hash;
 
 void	nfsd_stat_init(void);
 void	nfsd_stat_shutdown(void);
Index: bfields/fs/nfsd/nfsctl.c
===================================================================
--- bfields.orig/fs/nfsd/nfsctl.c
+++ bfields/fs/nfsd/nfsctl.c
@@ -67,6 +67,7 @@ enum {
 	NFSD_Stats_Enabled,
 	NFSD_Stats_Prune_Period,
 	NFSD_Export_Stats,
+	NFSD_Client_Stats,
 	/*
 	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -1090,6 +1091,19 @@ static ssize_t write_ports(struct file *
 	return rv;
 }
 
+static int client_stats_open(struct inode *inode, struct file *file)
+{
+	return nfsd_stats_open(file, &nfsd_client_stats_hash);
+}
+
+static struct file_operations client_stats_operations = {
+	.open		= client_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+	.owner		= THIS_MODULE,
+};
+
 
 int nfsd_max_blksize;
 
@@ -1377,6 +1391,7 @@ static int nfsd_fill_super(struct super_
 		[NFSD_Stats_Enabled] = {"stats_enabled", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_Stats_Prune_Period] = {"stats_prune_period", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_Export_Stats] = {"export_stats", &export_stats_operations, S_IRUGO},
+		[NFSD_Client_Stats] = {"client_stats", &client_stats_operations, S_IRUGO},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
Index: bfields/include/linux/sunrpc/svc.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc.h
+++ bfields/include/linux/sunrpc/svc.h
@@ -292,6 +292,7 @@ struct svc_rqst {
 	int			rq_waking;	/* 1 if thread is being woken */
 	struct svc_time		rq_start_time;
 	struct nfsd_stats_hentry *rq_export_stats;
+	struct nfsd_stats_hentry *rq_client_stats;
 };
 
 /*
Index: bfields/include/linux/sunrpc/svc_xprt.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc_xprt.h
+++ bfields/include/linux/sunrpc/svc_xprt.h
@@ -164,4 +164,32 @@ static inline char *__svc_print_addr(con
 
 	return buf;
 }
+
+/*
+ * Build a string describing the sockaddr, which is
+ * suitable for use as a unique client key.  Ignores
+ * the port.  Unlike __svc_print_addr(), may return
+ * NULL if the address cannot be decoded.
+ */
+#define SVC_FORMAT_ADDR_MAX	    42
+static inline char *__svc_format_addr(struct sockaddr *sa,
+				      char *buf, size_t len)
+{
+	switch (sa->sa_family) {
+	case AF_INET:
+		snprintf(buf, len, "%pI4",
+			&((struct sockaddr_in *)sa)->sin_addr);
+		break;
+
+	case AF_INET6:
+		snprintf(buf, len, "%pI6",
+			 &((struct sockaddr_in6 *)sa)->sin6_addr);
+		break;
+
+	default:
+		return NULL;
+	}
+	return buf;
+}
+
 #endif /* SUNRPC_SVC_XPRT_H */

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 09/29] knfsd: Cache per-client stats entry on TCP transports.
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (7 preceding siblings ...)
  2009-03-31 20:28 ` [patch 08/29] knfsd: Gather per-client stats Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 10/29] knfsd: Update per-client & per-export stats from NFSv3 Greg Banks
                   ` (20 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

TCP transports are connection-oriented and every call arriving on
such a transport will use the same per-client stats entry.  So we
can avoid doing a hash lookup in a locked global hashtable on every
NFS call by caching the result of nfsd_stats_find() on the transport.

The same is true of RDMA transports, but this patch doesn't address
that.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/stats.c                 |   45 ++++++++++++++++++++++++++---
 include/linux/sunrpc/svc_xprt.h |    3 +
 net/sunrpc/svc_xprt.c           |    4 ++
 net/sunrpc/svcsock.c            |    1 
 4 files changed, 49 insertions(+), 4 deletions(-)

Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -394,6 +394,39 @@ static void __nfsd_stats_op(struct svc_r
 	os->os_ops[op]++;
 }
 
+static inline nfsd_stats_hentry_t *
+nfsd_stats_xprt_cached_get(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	nfsd_stats_hentry_t *se = NULL;
+
+	if (test_bit(XPT_CACHE_STATS, &xprt->xpt_flags)) {
+		spin_lock(&xprt->xpt_lock);
+		se = xprt->xpt_stats_cache;
+		if (se != NULL)
+			nfsd_stats_get(se);
+		spin_unlock(&xprt->xpt_lock);
+	}
+	return se;
+}
+
+static inline void
+nfsd_stats_xprt_cached_set(struct svc_rqst *rqstp, nfsd_stats_hentry_t *se)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+
+	if (test_bit(XPT_CACHE_STATS, &xprt->xpt_flags)) {
+		spin_lock(&xprt->xpt_lock);
+		if (xprt->xpt_stats_cache == NULL) {
+			xprt->xpt_stats_cache = se;
+			xprt->xpt_stats_cache_release = (void (*)(void*))nfsd_stats_put;
+			/* take a reference for the cached pointer */
+			nfsd_stats_get(se);
+		}
+		spin_unlock(&xprt->xpt_lock);
+	}
+}
+
 void nfsd_stats_update_op(struct svc_rqst *rqstp, struct svc_fh *fh,
 			  int rbucket, int wbucket, int op)
 {
@@ -424,10 +457,14 @@ void nfsd_stats_update_op(struct svc_rqs
 	/* first op in the call: find and cache per-client stats */
 	if (rqstp->rq_client_stats == NULL) {
 		char *client, buf[SVC_FORMAT_ADDR_MAX];
-		se = NULL;
-		client = __svc_format_addr(svc_addr(rqstp), buf, sizeof(buf));
-		if (client != NULL)
-			se = nfsd_stats_find(&nfsd_client_stats_hash, client, strlen(client));
+		se = nfsd_stats_xprt_cached_get(rqstp);
+		if (se == NULL) {
+			client = __svc_format_addr(svc_addr(rqstp), buf, sizeof(buf));
+			if (client != NULL)
+				se = nfsd_stats_find(&nfsd_client_stats_hash, client, strlen(client));
+			if (se != NULL)
+				nfsd_stats_xprt_cached_set(rqstp, se);
+		}
 		if (se != NULL) {
 			/* take over the new reference from nfsd_stats_find() */
 			rqstp->rq_client_stats = se;
Index: bfields/include/linux/sunrpc/svc_xprt.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc_xprt.h
+++ bfields/include/linux/sunrpc/svc_xprt.h
@@ -51,6 +51,7 @@ struct svc_xprt {
 #define	XPT_DETACHED	10		/* detached from tempsocks list */
 #define XPT_LISTENER	11		/* listening endpoint */
 #define XPT_CACHE_AUTH	12		/* cache auth info */
+#define XPT_CACHE_STATS	13		/* cache stats info */
 
 	struct svc_pool		*xpt_pool;	/* current pool iff queued */
 	struct svc_serv		*xpt_server;	/* service for transport */
@@ -59,6 +60,8 @@ struct svc_xprt {
 	spinlock_t		xpt_lock;	/* protects sk_deferred
 						 * and xpt_auth_cache */
 	void			*xpt_auth_cache;/* auth cache */
+	void			*xpt_stats_cache;
+	void			(*xpt_stats_cache_release)(void *);
 	struct list_head	xpt_deferred;	/* deferred requests that need
 						 * to be revisted */
 	struct sockaddr_storage	xpt_local;	/* local address */
Index: bfields/net/sunrpc/svc_xprt.c
===================================================================
--- bfields.orig/net/sunrpc/svc_xprt.c
+++ bfields/net/sunrpc/svc_xprt.c
@@ -130,6 +130,10 @@ static void svc_xprt_free(struct kref *k
 	if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)
 	    && xprt->xpt_auth_cache != NULL)
 		svcauth_unix_info_release(xprt->xpt_auth_cache);
+	if (test_bit(XPT_CACHE_STATS, &xprt->xpt_flags) &&
+	    xprt->xpt_stats_cache != NULL &&
+	    xprt->xpt_stats_cache_release != NULL)
+		xprt->xpt_stats_cache_release(xprt->xpt_stats_cache);
 	xprt->xpt_ops->xpo_free(xprt);
 	module_put(owner);
 }
Index: bfields/net/sunrpc/svcsock.c
===================================================================
--- bfields.orig/net/sunrpc/svcsock.c
+++ bfields/net/sunrpc/svcsock.c
@@ -1108,6 +1108,7 @@ static void svc_tcp_init(struct svc_sock
 
 	svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
 	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+	set_bit(XPT_CACHE_STATS, &svsk->sk_xprt.xpt_flags);
 	if (sk->sk_state == TCP_LISTEN) {
 		dprintk("setting up TCP socket for listening\n");
 		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 10/29] knfsd: Update per-client & per-export stats from NFSv3
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (8 preceding siblings ...)
  2009-03-31 20:28 ` [patch 09/29] knfsd: Cache per-client stats entry on TCP transports Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 11/29] knfsd: Update per-client & per-export stats from NFSv2 Greg Banks
                   ` (19 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Add instrumentation to the NFSv3 server procedures.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfs3acl.c  |    2 ++
 fs/nfsd/nfs3proc.c |   26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

Index: bfields/fs/nfsd/nfs3proc.c
===================================================================
--- bfields.orig/fs/nfsd/nfs3proc.c
+++ bfields/fs/nfsd/nfs3proc.c
@@ -47,6 +47,7 @@ static int	nfs3_ftypes[] = {
 static __be32
 nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
+	nfsd_stats_update(rqstp, NULL, NFSD_STATS_OP_FSINFO);
 	return nfs_ok;
 }
 
@@ -73,6 +74,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqst
 			  resp->fh.fh_dentry, &resp->stat);
 	nfserr = nfserrno(err);
 
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_GETATTR);
 	RETURN_STATUS(nfserr);
 }
 
@@ -91,6 +93,7 @@ nfsd3_proc_setattr(struct svc_rqst *rqst
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,
 			      argp->check_guard, argp->guardtime);
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_SETATTR);
 	RETURN_STATUS(nfserr);
 }
 
@@ -115,6 +118,7 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp
 				    argp->name,
 				    argp->len,
 				    &resp->fh);
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_LOOKUP);
 	RETURN_STATUS(nfserr);
 }
 
@@ -134,6 +138,7 @@ nfsd3_proc_access(struct svc_rqst *rqstp
 	fh_copy(&resp->fh, &argp->fh);
 	resp->access = argp->access;
 	nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_ACCESS);
 	RETURN_STATUS(nfserr);
 }
 
@@ -152,6 +157,7 @@ nfsd3_proc_readlink(struct svc_rqst *rqs
 	fh_copy(&resp->fh, &argp->fh);
 	resp->len = NFS3_MAXPATHLEN;
 	nfserr = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len);
+	nfsd_stats_update_read(rqstp, &resp->fh, resp->len);
 	RETURN_STATUS(nfserr);
 }
 
@@ -192,6 +198,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp, 
 		resp->eof = (argp->offset + resp->count) >= inode->i_size;
 	}
 
+	nfsd_stats_update_read(rqstp, &resp->fh, resp->count);
+
 	RETURN_STATUS(nfserr);
 }
 
@@ -219,6 +227,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp,
 				   &cnt,
 				   &resp->committed);
 	resp->count = cnt;
+	nfsd_stats_update_write(rqstp, &resp->fh, resp->count, resp->committed);
 	RETURN_STATUS(nfserr);
 }
 
@@ -263,6 +272,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp
 				attr, newfhp,
 				argp->createmode, argp->verf, NULL, NULL);
 
+	nfsd_stats_update(rqstp, dirfhp, NFSD_STATS_OP_MKINODE);
 	RETURN_STATUS(nfserr);
 }
 
@@ -286,6 +296,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp,
 	nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
 				    &argp->attrs, S_IFDIR, 0, &resp->fh);
 
+	nfsd_stats_update(rqstp, &resp->dirfh, NFSD_STATS_OP_MKINODE);
 	RETURN_STATUS(nfserr);
 }
 
@@ -305,6 +316,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqst
 	nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen,
 						   argp->tname, argp->tlen,
 						   &resp->fh, &argp->attrs);
+	nfsd_stats_update(rqstp, &resp->dirfh, NFSD_STATS_OP_MKINODE);
 	RETURN_STATUS(nfserr);
 }
 
@@ -342,6 +354,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp,
 	nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
 				    &argp->attrs, type, rdev, &resp->fh);
 
+	nfsd_stats_update(rqstp, &resp->dirfh, NFSD_STATS_OP_MKINODE);
 	RETURN_STATUS(nfserr);
 }
 
@@ -362,6 +375,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp
 	/* Unlink. -S_IFDIR means file must not be a directory */
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len);
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_REMOVE);
 	RETURN_STATUS(nfserr);
 }
 
@@ -381,6 +395,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp,
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len);
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_RMDIR);
 	RETURN_STATUS(nfserr);
 }
 
@@ -403,6 +418,7 @@ nfsd3_proc_rename(struct svc_rqst *rqstp
 	fh_copy(&resp->tfh, &argp->tfh);
 	nfserr = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen,
 				    &resp->tfh, argp->tname, argp->tlen);
+	nfsd_stats_update(rqstp, &resp->ffh, NFSD_STATS_OP_MKINODE);
 	RETURN_STATUS(nfserr);
 }
 
@@ -423,6 +439,7 @@ nfsd3_proc_link(struct svc_rqst *rqstp, 
 	fh_copy(&resp->tfh, &argp->tfh);
 	nfserr = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen,
 				  &resp->fh);
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_MKINODE);
 	RETURN_STATUS(nfserr);
 }
 
@@ -458,6 +475,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqst
 	if (resp->offset)
 		xdr_encode_hyper(resp->offset, argp->cookie);
 
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_READDIR);
 	RETURN_STATUS(nfserr);
 }
 
@@ -518,6 +536,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *
 		}
 	}
 
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_READDIRP);
 	RETURN_STATUS(nfserr);
 }
 
@@ -534,6 +553,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqst
 				SVCFH_fmt(&argp->fh));
 
 	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
+	nfsd_stats_update(rqstp, &argp->fh, NFSD_STATS_OP_FSINFO);
 	fh_put(&argp->fh);
 	RETURN_STATUS(nfserr);
 }
@@ -575,6 +595,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqst
 			resp->f_properties = NFS3_FSF_BILLYBOY;
 		}
 		resp->f_maxfilesize = sb->s_maxbytes;
+
+		nfsd_stats_update(rqstp, &argp->fh, NFSD_STATS_OP_FSINFO);
 	}
 
 	fh_put(&argp->fh);
@@ -617,6 +639,8 @@ nfsd3_proc_pathconf(struct svc_rqst * rq
 			resp->p_case_preserving  = 0;
 			break;
 		}
+
+		nfsd_stats_update(rqstp, &argp->fh, NFSD_STATS_OP_FSINFO);
 	}
 
 	fh_put(&argp->fh);
@@ -644,6 +668,8 @@ nfsd3_proc_commit(struct svc_rqst * rqst
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count);
 
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_COMMIT);
+
 	RETURN_STATUS(nfserr);
 }
 
Index: bfields/fs/nfsd/nfs3acl.c
===================================================================
--- bfields.orig/fs/nfsd/nfs3acl.c
+++ bfields/fs/nfsd/nfs3acl.c
@@ -82,6 +82,7 @@ static __be32 nfsd3_proc_getacl(struct s
 		resp->acl_default = acl;
 	}
 
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_XATTR);
 	/* resp->acl_{access,default} are released in nfs3svc_release_getacl. */
 	RETURN_STATUS(0);
 
@@ -117,6 +118,7 @@ static __be32 nfsd3_proc_setacl(struct s
 	   nfs3svc_decode_setaclargs. */
 	posix_acl_release(argp->acl_access);
 	posix_acl_release(argp->acl_default);
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_XATTR);
 	RETURN_STATUS(nfserr);
 }
 

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 11/29] knfsd: Update per-client & per-export stats from NFSv2
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (9 preceding siblings ...)
  2009-03-31 20:28 ` [patch 10/29] knfsd: Update per-client & per-export stats from NFSv3 Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 12/29] knfsd: Update per-client & per-export stats from NFSv4 Greg Banks
                   ` (18 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Add instrumentation to the NFSv2 server procedures.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfs2acl.c |    2 ++
 fs/nfsd/nfsproc.c |   28 ++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

Index: bfields/fs/nfsd/nfsproc.c
===================================================================
--- bfields.orig/fs/nfsd/nfsproc.c
+++ bfields/fs/nfsd/nfsproc.c
@@ -34,6 +34,7 @@ typedef struct svc_buf	svc_buf;
 static __be32
 nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
+	nfsd_stats_update(rqstp, NULL, NFSD_STATS_OP_GETATTR);
 	return nfs_ok;
 }
 
@@ -67,6 +68,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0,
 			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_GETATTR);
 	return nfsd_return_attrs(nfserr, resp);
 }
 
@@ -85,6 +88,8 @@ nfsd_proc_setattr(struct svc_rqst *rqstp
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0);
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_GETATTR);
 	return nfsd_return_attrs(nfserr, resp);
 }
 
@@ -107,6 +112,7 @@ nfsd_proc_lookup(struct svc_rqst *rqstp,
 	nfserr = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len,
 				 &resp->fh);
 
+	nfsd_stats_update(rqstp, &argp->fh, NFSD_STATS_OP_LOOKUP);
 	fh_put(&argp->fh);
 	return nfsd_return_dirop(nfserr, resp);
 }
@@ -126,6 +132,7 @@ nfsd_proc_readlink(struct svc_rqst *rqst
 	resp->len = NFS_MAXPATHLEN;
 	nfserr = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len);
 
+	nfsd_stats_update_read(rqstp, &argp->fh, resp->len);
 	fh_put(&argp->fh);
 	return nfserr;
 }
@@ -163,6 +170,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, s
 				  argp->offset,
 			   	  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
+	nfsd_stats_update_read(rqstp, &resp->fh, resp->count);
 
 	if (nfserr) return nfserr;
 	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt,
@@ -191,6 +199,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, 
 				   rqstp->rq_vec, argp->vlen,
 			           &cnt,
 				   &stable);
+	nfsd_stats_update_write(rqstp, &resp->fh, cnt, stable);
 	return nfsd_return_attrs(nfserr, resp);
 }
 
@@ -349,6 +358,8 @@ out_unlock:
 	fh_unlock(dirfhp);
 
 done:
+	if (!nfserr)
+		nfsd_stats_update(rqstp, dirfhp, NFSD_STATS_OP_MKINODE);
 	fh_put(dirfhp);
 	return nfsd_return_dirop(nfserr, resp);
 }
@@ -364,6 +375,8 @@ nfsd_proc_remove(struct svc_rqst *rqstp,
 
 	/* Unlink. -SIFDIR means file must not be a directory */
 	nfserr = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len);
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &argp->fh, NFSD_STATS_OP_REMOVE);
 	fh_put(&argp->fh);
 	return nfserr;
 }
@@ -381,6 +394,8 @@ nfsd_proc_rename(struct svc_rqst *rqstp,
 
 	nfserr = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen,
 				    &argp->tfh, argp->tname, argp->tlen);
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &argp->ffh, NFSD_STATS_OP_MKINODE);
 	fh_put(&argp->ffh);
 	fh_put(&argp->tfh);
 	return nfserr;
@@ -401,6 +416,8 @@ nfsd_proc_link(struct svc_rqst *rqstp, s
 
 	nfserr = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen,
 				  &argp->ffh);
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &argp->ffh, NFSD_STATS_OP_MKINODE);
 	fh_put(&argp->ffh);
 	fh_put(&argp->tfh);
 	return nfserr;
@@ -425,6 +442,8 @@ nfsd_proc_symlink(struct svc_rqst *rqstp
 						 argp->tname, argp->tlen,
 				 		 &newfh, &argp->attrs);
 
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &argp->ffh, NFSD_STATS_OP_MKINODE);
 
 	fh_put(&argp->ffh);
 	fh_put(&newfh);
@@ -452,6 +471,8 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp, 
 	fh_init(&resp->fh, NFS_FHSIZE);
 	nfserr = nfsd_create(rqstp, &argp->fh, argp->name, argp->len,
 				    &argp->attrs, S_IFDIR, 0, &resp->fh);
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &argp->fh, NFSD_STATS_OP_MKINODE);
 	fh_put(&argp->fh);
 	return nfsd_return_dirop(nfserr, resp);
 }
@@ -468,6 +489,8 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp, 
 	dprintk("nfsd: RMDIR    %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
 
 	nfserr = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len);
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &argp->fh, NFSD_STATS_OP_RMDIR);
 	fh_put(&argp->fh);
 	return nfserr;
 }
@@ -508,6 +531,9 @@ nfsd_proc_readdir(struct svc_rqst *rqstp
 	if (resp->offset)
 		*resp->offset = htonl(offset);
 
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &argp->fh, NFSD_STATS_OP_READDIR);
+
 	fh_put(&argp->fh);
 	return nfserr;
 }
@@ -525,6 +551,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp
 
 	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
 			NFSD_MAY_BYPASS_GSS_ON_ROOT);
+	if (!nfserr)
+		nfsd_stats_update(rqstp, &argp->fh, NFSD_STATS_OP_FSINFO);
 	fh_put(&argp->fh);
 	return nfserr;
 }
Index: bfields/fs/nfsd/nfs2acl.c
===================================================================
--- bfields.orig/fs/nfsd/nfs2acl.c
+++ bfields/fs/nfsd/nfs2acl.c
@@ -86,6 +86,7 @@ static __be32 nfsacld_proc_getacl(struct
 		resp->acl_default = acl;
 	}
 
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_XATTR);
 	/* resp->acl_{access,default} are released in nfssvc_release_getacl. */
 	RETURN_STATUS(0);
 
@@ -123,6 +124,7 @@ static __be32 nfsacld_proc_setacl(struct
 	   nfssvc_decode_setaclargs. */
 	posix_acl_release(argp->acl_access);
 	posix_acl_release(argp->acl_default);
+	nfsd_stats_update(rqstp, &resp->fh, NFSD_STATS_OP_XATTR);
 	return nfserr;
 }
 

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 12/29] knfsd: Update per-client & per-export stats from NFSv4
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (10 preceding siblings ...)
  2009-03-31 20:28 ` [patch 11/29] knfsd: Update per-client & per-export stats from NFSv2 Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 13/29] knfsd: reply cache cleanups Greg Banks
                   ` (17 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Add instrumentation to the NFSv4 server procedures.

Contains code based on a patch from Harshula Jayasuriya <harshula@sgi.com>.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfs4proc.c  |   40 +++++++++++++++++++++++++++++++++-------
 fs/nfsd/nfs4state.c |   34 ++++++++++++++++++++++++++++++++--
 fs/nfsd/nfs4xdr.c   |    7 +++++++
 3 files changed, 72 insertions(+), 9 deletions(-)

Index: bfields/fs/nfsd/nfs4proc.c
===================================================================
--- bfields.orig/fs/nfsd/nfs4proc.c
+++ bfields/fs/nfsd/nfs4proc.c
@@ -192,6 +192,9 @@ nfsd4_open(struct svc_rqst *rqstp, struc
 	   struct nfsd4_open *open)
 {
 	__be32 status;
+
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_ACCESS);
+
 	dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
 		(int)open->op_fname.len, open->op_fname.data,
 		open->op_stateowner);
@@ -350,12 +353,16 @@ static __be32
 nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	     struct nfsd4_access *access)
 {
+	__be32 status;
+
 	if (access->ac_req_access & ~NFS3_ACCESS_FULL)
 		return nfserr_inval;
 
 	access->ac_resp_access = access->ac_req_access;
-	return nfsd_access(rqstp, &cstate->current_fh, &access->ac_resp_access,
-			   &access->ac_supported);
+	status = nfsd_access(rqstp, &cstate->current_fh, &access->ac_resp_access,
+			     &access->ac_supported);
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_ACCESS);
+	return status;
 }
 
 static __be32
@@ -372,6 +379,7 @@ nfsd4_commit(struct svc_rqst *rqstp, str
 			     commit->co_count);
 	if (status == nfserr_symlink)
 		status = nfserr_inval;
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_COMMIT);
 	return status;
 }
 
@@ -452,6 +460,8 @@ nfsd4_create(struct svc_rqst *rqstp, str
 		status = nfserr_badtype;
 	}
 
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_MKINODE);
+
 	if (!status) {
 		fh_unlock(&cstate->current_fh);
 		set_change_info(&create->cr_cinfo, &cstate->current_fh);
@@ -480,6 +490,7 @@ nfsd4_getattr(struct svc_rqst *rqstp, st
 	getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	getattr->ga_fhp = &cstate->current_fh;
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_GETATTR);
 	return nfs_ok;
 }
 
@@ -495,6 +506,7 @@ nfsd4_link(struct svc_rqst *rqstp, struc
 			   link->li_name, link->li_namelen, &cstate->save_fh);
 	if (!status)
 		set_change_info(&link->li_cinfo, &cstate->current_fh);
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_MKINODE);
 	return status;
 }
 
@@ -514,17 +526,22 @@ nfsd4_lookupp(struct svc_rqst *rqstp, st
 		return nfserr_noent;
 	}
 	fh_put(&tmp_fh);
-	return nfsd_lookup(rqstp, &cstate->current_fh,
-			   "..", 2, &cstate->current_fh);
+	ret = nfsd_lookup(rqstp, &cstate->current_fh,
+			  "..", 2, &cstate->current_fh);
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_LOOKUP);
+	return ret;
 }
 
 static __be32
 nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	     struct nfsd4_lookup *lookup)
 {
-	return nfsd_lookup(rqstp, &cstate->current_fh,
-			   lookup->lo_name, lookup->lo_len,
-			   &cstate->current_fh);
+	__be32 status;
+	status = nfsd_lookup(rqstp, &cstate->current_fh,
+			     lookup->lo_name, lookup->lo_len,
+			     &cstate->current_fh);
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_LOOKUP);
+	return status;
 }
 
 static __be32
@@ -604,6 +621,7 @@ nfsd4_remove(struct svc_rqst *rqstp, str
 		return nfserr_grace;
 	status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
 			     remove->rm_name, remove->rm_namelen);
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_REMOVE);
 	if (status == nfserr_symlink)
 		return nfserr_notdir;
 	if (!status) {
@@ -628,6 +646,8 @@ nfsd4_rename(struct svc_rqst *rqstp, str
 			     rename->rn_snamelen, &cstate->current_fh,
 			     rename->rn_tname, rename->rn_tnamelen);
 
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_MKINODE);
+
 	/* the underlying filesystem returns different error's than required
 	 * by NFSv4. both save_fh and current_fh have been verified.. */
 	if (status == nfserr_isdir)
@@ -667,6 +687,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, st
 	} else
 		secinfo->si_exp = exp;
 	dput(dentry);
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_FSINFO);
 	return err;
 }
 
@@ -700,6 +721,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, st
 		goto out;
 	status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
 				0, (time_t)0);
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_SETATTR);
 out:
 	mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt);
 	return status;
@@ -751,6 +773,8 @@ nfsd4_write(struct svc_rqst *rqstp, stru
 
 	if (status == nfserr_symlink)
 		status = nfserr_inval;
+	nfsd_stats_update_write(rqstp, &cstate->current_fh, write->wr_bytes_written,
+				write->wr_how_written);
 	return status;
 }
 
@@ -809,6 +833,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, st
 	if (!memcmp(p, verify->ve_attrval, verify->ve_attrlen))
 		status = nfserr_same;
 
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_ACCESS);
+
 out_kfree:
 	kfree(buf);
 	return status;
Index: bfields/fs/nfsd/nfs4state.c
===================================================================
--- bfields.orig/fs/nfsd/nfs4state.c
+++ bfields/fs/nfsd/nfs4state.c
@@ -1217,6 +1217,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp
 		.data = exid->id,
 	};
 
+	nfsd_stats_update(rqstp, NULL, NFSD_STATS_OP_FSINFO);
+
 	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
 		" ip_addr=%u flags %x, spa_how %d\n",
 		__func__, rqstp, exid, clname.len, clname.data,
@@ -1368,6 +1370,8 @@ nfsd4_create_session(struct svc_rqst *rq
 	struct nfsd4_slot *slot = NULL;
 	int status = 0;
 
+	nfsd_stats_update(rqstp, NULL, NFSD_STATS_OP_FSINFO);
+
 	nfs4_lock_state();
 	unconf = find_unconfirmed_client(&cr_ses->clientid);
 	conf = find_confirmed_client(&cr_ses->clientid);
@@ -1456,6 +1460,8 @@ nfsd4_destroy_session(struct svc_rqst *r
 	struct nfsd4_session *ses;
 	u32 status = nfserr_badsession;
 
+	nfsd_stats_update(rqstp, NULL, NFSD_STATS_OP_FSINFO);
+
 	/* Notes:
 	 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
 	 * - Should we return nfserr_back_chan_busy if waiting for
@@ -1493,6 +1499,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	struct nfsd4_slot *slot;
 	int status;
 
+	/* No nfsd_stats_update() call here, for the same reasons
+	 * as for PUTFH and GETFH */
+
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
 
@@ -1560,7 +1569,9 @@ nfsd4_setclientid(struct svc_rqst *rqstp
 	__be32 			status;
 	char			*princ;
 	char                    dname[HEXDIR_LEN];
-	
+
+	nfsd_stats_update(rqstp, NULL, NFSD_STATS_OP_FSINFO);
+
 	if (!check_name(clname))
 		return nfserr_inval;
 
@@ -1684,6 +1695,8 @@ nfsd4_setclientid_confirm(struct svc_rqs
 	clientid_t * clid = &setclientid_confirm->sc_clientid;
 	__be32 status;
 
+	nfsd_stats_update(rqstp, NULL, NFSD_STATS_OP_FSINFO);
+
 	if (STALE_CLIENTID(clid))
 		return nfserr_stale_clientid;
 	/* 
@@ -2629,6 +2642,8 @@ nfsd4_renew(struct svc_rqst *rqstp, stru
 	struct nfs4_client *clp;
 	__be32 status;
 
+	nfsd_stats_update(rqstp, NULL, NFSD_STATS_OP_FSINFO);
+
 	nfs4_lock_state();
 	dprintk("process_renew(%08x/%08x): starting\n", 
 			clid->cl_boot, clid->cl_id);
@@ -3057,6 +3072,8 @@ nfsd4_open_confirm(struct svc_rqst *rqst
 	struct nfs4_stateowner *sop;
 	struct nfs4_stateid *stp;
 
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_ACCESS);
+
 	dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
@@ -3129,6 +3146,8 @@ nfsd4_open_downgrade(struct svc_rqst *rq
 	unsigned int share_access;
 	int flags = OPEN_STATE;
 
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_ACCESS);
+
 	dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", 
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
@@ -3188,6 +3207,8 @@ nfsd4_close(struct svc_rqst *rqstp, stru
 	struct nfs4_stateid *stp;
 	int flags = OPEN_STATE | CLOSE_STATE;
 
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_ACCESS);
+
 	dprintk("NFSD: nfsd4_close on file %.*s\n", 
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
@@ -3259,6 +3280,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp
 	renew_client(dp->dl_client);
 
 	unhash_delegation(dp);
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_FSINFO);
 out:
 	nfs4_unlock_state();
 
@@ -3517,6 +3539,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
 	unsigned int cmd;
 	int err, flags = 0;
 
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_LOCKD);
+
 	dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
 		(long long) lock->lk_offset,
 		(long long) lock->lk_length);
@@ -3695,6 +3719,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, stru
 	int error;
 	__be32 status;
 
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_LOCKD);
+
 	if (locks_in_grace())
 		return nfserr_grace;
 
@@ -3768,7 +3794,9 @@ nfsd4_locku(struct svc_rqst *rqstp, stru
 	struct file_lock file_lock;
 	__be32 status;
 	int err, flags = LOCK_STATE;
-						        
+
+	nfsd_stats_update(rqstp, &cstate->current_fh, NFSD_STATS_OP_LOCKD);
+
 	dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
 		(long long) locku->lu_offset,
 		(long long) locku->lu_length);
@@ -3865,6 +3893,8 @@ nfsd4_release_lockowner(struct svc_rqst 
 	int i;
 	__be32 status;
 
+	nfsd_stats_update(rqstp, NULL, NFSD_STATS_OP_LOCKD);
+
 	dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
 		clid->cl_boot, clid->cl_id);
 
Index: bfields/fs/nfsd/nfs4xdr.c
===================================================================
--- bfields.orig/fs/nfsd/nfs4xdr.c
+++ bfields/fs/nfsd/nfs4xdr.c
@@ -2641,6 +2641,8 @@ nfsd4_encode_read(struct nfsd4_compoundr
 			read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
 			&maxcount);
 
+	nfsd_stats_update_read(read->rd_rqstp, read->rd_fhp, maxcount);
+
 	if (nfserr == nfserr_symlink)
 		nfserr = nfserr_inval;
 	if (nfserr)
@@ -2692,6 +2694,7 @@ nfsd4_encode_readlink(struct nfsd4_compo
 	 * assume that truncation occurred, and return NFS4ERR_RESOURCE.
 	 */
 	nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, page, &maxcount);
+	nfsd_stats_update_read(readlink->rl_rqstp, readlink->rl_fhp, maxcount);
 	if (nfserr == nfserr_isdir)
 		return nfserr_inval;
 	if (nfserr)
@@ -2764,6 +2767,10 @@ nfsd4_encode_readdir(struct nfsd4_compou
 	nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp,
 			      &offset,
 			      &readdir->common, nfsd4_encode_dirent);
+
+	nfsd_stats_update(readdir->rd_rqstp, readdir->rd_fhp,
+			  NFSD_STATS_OP_READDIRP);
+
 	if (nfserr == nfs_ok &&
 	    readdir->common.err == nfserr_toosmall &&
 	    readdir->buffer == page) 

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 13/29] knfsd: reply cache cleanups
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (11 preceding siblings ...)
  2009-03-31 20:28 ` [patch 12/29] knfsd: Update per-client & per-export stats from NFSv4 Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-05-12 19:54   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 14/29] knfsd: better hashing in the reply cache Greg Banks
                   ` (16 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Make REQHASH() an inline function.  Rename hash_list to cache_hash.
Fix an obsolete comment.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfscache.c         |   29 +++++++++++++++++++----------
 include/linux/nfsd/cache.h |    3 +--
 2 files changed, 20 insertions(+), 12 deletions(-)

Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -29,15 +29,24 @@
  */
 #define CACHESIZE		1024
 #define HASHSIZE		64
-#define REQHASH(xid)		(((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
 
-static struct hlist_head *	hash_list;
+static struct hlist_head *	cache_hash;
 static struct list_head 	lru_head;
 static int			cache_disabled = 1;
 
+/*
+ * Calculate the hash index from an XID.
+ */
+static inline u32 request_hash(u32 xid)
+{
+	u32 h = xid;
+	h ^= (xid >> 24);
+	return h & (HASHSIZE-1);
+}
+
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 
-/* 
+/*
  * locking for the reply cache:
  * A cache entry is "single use" if c_state == RC_INPROG
  * Otherwise, it when accessing _prev or _next, the lock must be held.
@@ -62,8 +71,8 @@ int nfsd_reply_cache_init(void)
 		i--;
 	}
 
-	hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
-	if (!hash_list)
+	cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+	if (!cache_hash)
 		goto out_nomem;
 
 	cache_disabled = 0;
@@ -88,8 +97,8 @@ void nfsd_reply_cache_shutdown(void)
 
 	cache_disabled = 1;
 
-	kfree (hash_list);
-	hash_list = NULL;
+	kfree (cache_hash);
+	cache_hash = NULL;
 }
 
 /*
@@ -108,7 +117,7 @@ static void
 hash_refile(struct svc_cacherep *rp)
 {
 	hlist_del_init(&rp->c_hash);
-	hlist_add_head(&rp->c_hash, hash_list + REQHASH(rp->c_xid));
+	hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
 }
 
 /*
@@ -138,7 +147,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	spin_lock(&cache_lock);
 	rtn = RC_DOIT;
 
-	rh = &hash_list[REQHASH(xid)];
+	rh = &cache_hash[request_hash(xid)];
 	hlist_for_each_entry(rp, hn, rh, c_hash) {
 		if (rp->c_state != RC_UNUSED &&
 		    xid == rp->c_xid && proc == rp->c_proc &&
@@ -264,7 +273,7 @@ nfsd_cache_update(struct svc_rqst *rqstp
 
 	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
 	len >>= 2;
-	
+
 	/* Don't cache excessive amounts of data and XDR failures */
 	if (!statp || len > (256 >> 2)) {
 		rp->c_state = RC_UNUSED;
Index: bfields/include/linux/nfsd/cache.h
===================================================================
--- bfields.orig/include/linux/nfsd/cache.h
+++ bfields/include/linux/nfsd/cache.h
@@ -14,8 +14,7 @@
 #include <linux/uio.h>
 
 /*
- * Representation of a reply cache entry. The first two members *must*
- * be hash_next and hash_prev.
+ * Representation of a reply cache entry.
  */
 struct svc_cacherep {
 	struct hlist_node	c_hash;

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 14/29] knfsd: better hashing in the reply cache
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (12 preceding siblings ...)
  2009-03-31 20:28 ` [patch 13/29] knfsd: reply cache cleanups Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-05-08 22:01   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 15/29] knfsd: fix reply cache memory corruption Greg Banks
                   ` (15 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Improve the hash function to handle clients which increment the XID
in an unexpected byte order, by folding down the top bits of the XID.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfscache.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -35,12 +35,15 @@ static struct list_head 	lru_head;
 static int			cache_disabled = 1;
 
 /*
- * Calculate the hash index from an XID.
+ * Calculate the hash index from an XID.  Note, some clients increment
+ * their XIDs in host order, which can result in all the variation being
+ * in the top bits we see here.  So we fold those bits down.
  */
 static inline u32 request_hash(u32 xid)
 {
 	u32 h = xid;
 	h ^= (xid >> 24);
+	h ^= ((xid & 0xff0000) >> 8);
 	return h & (HASHSIZE-1);
 }
 

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 15/29] knfsd: fix reply cache memory corruption
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (13 preceding siblings ...)
  2009-03-31 20:28 ` [patch 14/29] knfsd: better hashing in the reply cache Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-05-12 19:55   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 16/29] knfsd: use client IPv4 address in reply cache hash Greg Banks
                   ` (14 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Fix a regression in the reply cache introduced when the code was
converted to use proper Linux lists.  When a new entry needs to be
inserted, the case where all the entries are currently being used
by threads is not correctly detected.  This can result in memory
corruption and a crash.  In the current code this is an extremely
unlikely corner case; it would require the machine to have 1024
nfsd threads and all of them to be busy at the same time.  However,
upcoming reply cache changes make this more likely; a crash due to
this problem was actually observed in field.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfscache.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -177,8 +177,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	}
 	}
 
-	/* This should not happen */
-	if (rp == NULL) {
+	/* All entries on the LRU are in-progress. This should not happen */
+	if (&rp->c_lru == &lru_head) {
 		static int	complaints;
 
 		printk(KERN_WARNING "nfsd: all repcache entries locked!\n");

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 16/29] knfsd: use client IPv4 address in reply cache hash
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (14 preceding siblings ...)
  2009-03-31 20:28 ` [patch 15/29] knfsd: fix reply cache memory corruption Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-05-11 21:48   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 17/29] knfsd: make the reply cache SMP-friendly Greg Banks
                   ` (13 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Use the IPv4 address of the client in the reply cache hash function.
This can help improve the distribution of the hash function when the
workload includes a large number of clients which mounted their NFS
filesystems at nearly the same time and are doing similar sequences
of NFS calls, a pattern seen with large compute clusters.

This code predates the IPv6 support in the current NFS server but
should be harmless with IPv6 clients.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfscache.c |   27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -38,12 +38,17 @@ static int			cache_disabled = 1;
  * Calculate the hash index from an XID.  Note, some clients increment
  * their XIDs in host order, which can result in all the variation being
  * in the top bits we see here.  So we fold those bits down.
+ *
+ * Experiment shows that using the Jenkins hash improves the spectral
+ * properties of this hash, but the CPU cost of calculating it outweighs
+ * the advantages.
  */
-static inline u32 request_hash(u32 xid)
+static inline u32 request_hash(u32 xid, const struct sockaddr_in *sin)
 {
 	u32 h = xid;
 	h ^= (xid >> 24);
 	h ^= ((xid & 0xff0000) >> 8);
+	h ^= sin->sin_addr.s_addr;
 	return h & (HASHSIZE-1);
 }
 
@@ -114,16 +119,6 @@ lru_put_end(struct svc_cacherep *rp)
 }
 
 /*
- * Move a cache entry from one hash list to another
- */
-static void
-hash_refile(struct svc_cacherep *rp)
-{
-	hlist_del_init(&rp->c_hash);
-	hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
-}
-
-/*
  * Try to find an entry matching the current call in the cache. When none
  * is found, we grab the oldest unlocked entry off the LRU list.
  * Note that no operation within the loop may sleep.
@@ -137,7 +132,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	__be32			xid = rqstp->rq_xid;
 	u32			proto =  rqstp->rq_prot,
 				vers = rqstp->rq_vers,
-				proc = rqstp->rq_proc;
+				proc = rqstp->rq_proc,
+				h;
 	unsigned long		age;
 	int rtn;
 
@@ -146,11 +142,12 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 		nfsdstats.rcnocache++;
 		return RC_DOIT;
 	}
+	h = request_hash(xid, svc_addr_in(rqstp));
 
 	spin_lock(&cache_lock);
 	rtn = RC_DOIT;
 
-	rh = &cache_hash[request_hash(xid)];
+	rh = &cache_hash[h];
 	hlist_for_each_entry(rp, hn, rh, c_hash) {
 		if (rp->c_state != RC_UNUSED &&
 		    xid == rp->c_xid && proc == rp->c_proc &&
@@ -198,7 +195,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	rp->c_vers = vers;
 	rp->c_timestamp = jiffies;
 
-	hash_refile(rp);
+	/* Move the cache entry from one hash list to another */
+	hlist_del_init(&rp->c_hash);
+	hlist_add_head(&rp->c_hash, cache_hash + h);
 
 	/* release any buffer */
 	if (rp->c_type == RC_REPLBUFF) {

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 17/29] knfsd: make the reply cache SMP-friendly
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (15 preceding siblings ...)
  2009-03-31 20:28 ` [patch 16/29] knfsd: use client IPv4 address in reply cache hash Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 18/29] knfsd: dynamically expand the reply cache Greg Banks
                   ` (12 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Make the reply cache scale better on large multiprocessor NFS servers,
by splitting the global lock into multiple locks, one in each hash
bucket.  To avoid introducing a new global contention point, the
global LRU list of reply cache entries is split into multiple LRU
lists, one per hash bucket.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfscache.c |  166 ++++++++++++++++++++++++++++++------------
 1 file changed, 121 insertions(+), 45 deletions(-)

Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -8,6 +8,9 @@
  * it does things a bit differently.
  *
  * Copyright (C) 1995, 1996 Olaf Kirch <okir-pn4DOG8n3UYbFoVRYvo4fw@public.gmane.org>
+ *
+ * SMP lock splitting by Greg Banks <gnb@sgi.com>
+ *     Copyright (c) 2005-2009 Silicon Graphics, Inc.
  */
 
 #include <linux/kernel.h>
@@ -27,11 +30,43 @@
  * Solaris2:	1024
  * DEC Unix:	512-4096
  */
-#define CACHESIZE		1024
-#define HASHSIZE		64
+/* number of buckets used to manage LRU lists and cache locks (power of 2) */
+#ifdef CONFIG_SMP
+#define CACHE_NUM_BUCKETS	64
+#else
+#define CACHE_NUM_BUCKETS	1
+#endif
+/* number of entries in all LRU lists (power of 2) */
+#define CACHE_SIZE		(1024)
+/* largest possible number of entries in LRU per bucket */
+#define CACHE_BUCKET_MAX_SIZE	(CACHE_SIZE/CACHE_NUM_BUCKETS)
+/* log2 of largest desired hash chain length */
+#define MAX_CHAIN_ORDER		2
+/* initial and maximum size of the per-bucket hash table */
+#define HASHSIZE		((CACHE_SIZE>>MAX_CHAIN_ORDER)/CACHE_NUM_BUCKETS)
+
+/*
+ * locking for the reply cache:
+ * A cache entry is "single use" if c_state == RC_INPROG
+ * Otherwise, it when accessing _prev or _next, the lock for the
+ * appropriate bucket must be held.
+ *
+ * On uniprocessor systems, we have a single global lock and LRU
+ * list.  However on multiprocessor systems, to reduce contention
+ * on that spinlock under heavy nonidempotent load (think chmod -R)
+ * we use multiple buckets.  The lower few bits of the hash index
+ * are used to index the buckets.
+ */
+struct svc_cache_bucket
+{
+	spinlock_t lock;
+	struct list_head lru;
+	unsigned int size;
+	struct hlist_head *hash;
+} ____cacheline_aligned_in_smp;
 
-static struct hlist_head *	cache_hash;
-static struct list_head 	lru_head;
+static struct svc_cache_bucket	cache_buckets[CACHE_NUM_BUCKETS];
+#define bucket_for_hash(hash)	(&cache_buckets[(hash) & (CACHE_NUM_BUCKETS-1)])
 static int			cache_disabled = 1;
 
 /*
@@ -49,39 +84,70 @@ static inline u32 request_hash(u32 xid, 
 	h ^= (xid >> 24);
 	h ^= ((xid & 0xff0000) >> 8);
 	h ^= sin->sin_addr.s_addr;
-	return h & (HASHSIZE-1);
+	return h;
 }
 
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 
 /*
- * locking for the reply cache:
- * A cache entry is "single use" if c_state == RC_INPROG
- * Otherwise, it when accessing _prev or _next, the lock must be held.
+ * Initialise the reply cache data structures.
+ * Called without cache_lock, uses it internally.  Returns
+ * 0 on success, an error otherwise.
  */
-static DEFINE_SPINLOCK(cache_lock);
-
-int nfsd_reply_cache_init(void)
+static int nfsd_cache_bucket_init(struct svc_cache_bucket *b, unsigned int num)
 {
-	struct svc_cacherep	*rp;
-	int			i;
+	struct svc_cacherep *rp;
+	unsigned int i;
+	LIST_HEAD(lru);
 
-	INIT_LIST_HEAD(&lru_head);
-	i = CACHESIZE;
+	/* allocate new entries without the lock, keep them on their own list */
+	i = num;
 	while (i) {
 		rp = kmalloc(sizeof(*rp), GFP_KERNEL);
 		if (!rp)
 			goto out_nomem;
-		list_add(&rp->c_lru, &lru_head);
+		list_add(&rp->c_lru, &lru);
 		rp->c_state = RC_UNUSED;
 		rp->c_type = RC_NOCACHE;
 		INIT_HLIST_NODE(&rp->c_hash);
 		i--;
 	}
 
-	cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
-	if (!cache_hash)
-		goto out_nomem;
+	/* add the new entries */
+	spin_lock(&b->lock);
+
+	b->size = num;
+	list_splice(&lru, &b->lru);
+
+	spin_unlock(&b->lock);
+	return 0;
+
+out_nomem:
+	/* free any entries we've allocated but not spliced in */
+	while (!list_empty(&lru)) {
+		rp = list_entry(lru.next, struct svc_cacherep, c_lru);
+		list_del(&rp->c_lru);
+		kfree (rp);
+	}
+	return -ENOMEM;
+}
+
+int nfsd_reply_cache_init(void)
+{
+	unsigned int bucket;
+
+	for (bucket = 0 ; bucket < CACHE_NUM_BUCKETS ; bucket++)
+	{
+		struct svc_cache_bucket *b = &cache_buckets[bucket];
+
+		INIT_LIST_HEAD(&b->lru);
+		spin_lock_init(&b->lock);
+		if (nfsd_cache_bucket_init(b, CACHE_SIZE/CACHE_NUM_BUCKETS))
+			goto out_nomem;
+		b->hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+		if (!b->hash)
+			goto out_nomem;
+	}
 
 	cache_disabled = 0;
 	return 0;
@@ -94,28 +160,32 @@ out_nomem:
 void nfsd_reply_cache_shutdown(void)
 {
 	struct svc_cacherep	*rp;
+	unsigned int bucket;
 
-	while (!list_empty(&lru_head)) {
-		rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
-		if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF)
-			kfree(rp->c_replvec.iov_base);
-		list_del(&rp->c_lru);
-		kfree(rp);
+	for (bucket = 0 ; bucket < CACHE_NUM_BUCKETS ; bucket++)
+	{
+		struct svc_cache_bucket *b = &cache_buckets[bucket];
+
+		while (!list_empty(&b->lru)) {
+			rp = list_entry(b->lru.next, struct svc_cacherep, c_lru);
+			if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF)
+				kfree(rp->c_replvec.iov_base);
+			list_del(&rp->c_lru);
+			kfree(rp);
+		}
+		kfree (b->hash);
+		b->hash = NULL;
 	}
 
 	cache_disabled = 1;
-
-	kfree (cache_hash);
-	cache_hash = NULL;
 }
 
 /*
  * Move cache entry to end of LRU list
  */
-static void
-lru_put_end(struct svc_cacherep *rp)
+static inline void lru_put_end(struct svc_cache_bucket *b, struct svc_cacherep *rp)
 {
-	list_move_tail(&rp->c_lru, &lru_head);
+	list_move_tail(&rp->c_lru, &b->lru);
 }
 
 /*
@@ -134,8 +204,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 				vers = rqstp->rq_vers,
 				proc = rqstp->rq_proc,
 				h;
+	struct svc_cache_bucket *b;
 	unsigned long		age;
-	int rtn;
+	int			rtn;
 
 	rqstp->rq_cacherep = NULL;
 	if (cache_disabled || type == RC_NOCACHE) {
@@ -143,11 +214,13 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 		return RC_DOIT;
 	}
 	h = request_hash(xid, svc_addr_in(rqstp));
+	b = bucket_for_hash(h);
+	h = (h / CACHE_NUM_BUCKETS) & (HASHSIZE-1);
 
-	spin_lock(&cache_lock);
+	spin_lock(&b->lock);
 	rtn = RC_DOIT;
 
-	rh = &cache_hash[h];
+	rh = &b->hash[h];
 	hlist_for_each_entry(rp, hn, rh, c_hash) {
 		if (rp->c_state != RC_UNUSED &&
 		    xid == rp->c_xid && proc == rp->c_proc &&
@@ -163,10 +236,10 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	/* This loop shouldn't take more than a few iterations normally */
 	{
 	int	safe = 0;
-	list_for_each_entry(rp, &lru_head, c_lru) {
+	list_for_each_entry(rp, &b->lru, c_lru) {
 		if (rp->c_state != RC_INPROG)
 			break;
-		if (safe++ > CACHESIZE) {
+		if (safe++ > b->size) {
 			printk("nfsd: loop in repcache LRU list\n");
 			cache_disabled = 1;
 			goto out;
@@ -175,7 +248,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	}
 
 	/* All entries on the LRU are in-progress. This should not happen */
-	if (&rp->c_lru == &lru_head) {
+	if (&rp->c_lru == &b->lru) {
 		static int	complaints;
 
 		printk(KERN_WARNING "nfsd: all repcache entries locked!\n");
@@ -197,7 +270,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 
 	/* Move the cache entry from one hash list to another */
 	hlist_del_init(&rp->c_hash);
-	hlist_add_head(&rp->c_hash, cache_hash + h);
+	hlist_add_head(&rp->c_hash, b->hash + h);
 
 	/* release any buffer */
 	if (rp->c_type == RC_REPLBUFF) {
@@ -206,14 +279,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	}
 	rp->c_type = RC_NOCACHE;
  out:
-	spin_unlock(&cache_lock);
+	spin_unlock(&b->lock);
 	return rtn;
 
 found_entry:
 	/* We found a matching entry which is either in progress or done. */
 	age = jiffies - rp->c_timestamp;
 	rp->c_timestamp = jiffies;
-	lru_put_end(rp);
+	lru_put_end(b, rp);
 
 	rtn = RC_DROPIT;
 	/* Request being processed or excessive rexmits */
@@ -269,10 +342,13 @@ nfsd_cache_update(struct svc_rqst *rqstp
 	struct svc_cacherep *rp;
 	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
 	int		len;
+	struct svc_cache_bucket *b;
 
 	if (!(rp = rqstp->rq_cacherep) || cache_disabled)
 		return;
 
+	b = bucket_for_hash(request_hash(rp->c_xid, &rp->c_addr));
+
 	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
 	len >>= 2;
 
@@ -292,22 +368,22 @@ nfsd_cache_update(struct svc_rqst *rqstp
 		cachv = &rp->c_replvec;
 		cachv->iov_base = kmalloc(len << 2, GFP_KERNEL);
 		if (!cachv->iov_base) {
-			spin_lock(&cache_lock);
+			spin_lock(&b->lock);
 			rp->c_state = RC_UNUSED;
-			spin_unlock(&cache_lock);
+			spin_unlock(&b->lock);
 			return;
 		}
 		cachv->iov_len = len << 2;
 		memcpy(cachv->iov_base, statp, len << 2);
 		break;
 	}
-	spin_lock(&cache_lock);
-	lru_put_end(rp);
+	spin_lock(&b->lock);
+	lru_put_end(b, rp);
 	rp->c_secure = rqstp->rq_secure;
 	rp->c_type = cachetype;
 	rp->c_state = RC_DONE;
 	rp->c_timestamp = jiffies;
-	spin_unlock(&cache_lock);
+	spin_unlock(&b->lock);
 	return;
 }
 

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 18/29] knfsd: dynamically expand the reply cache
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (16 preceding siblings ...)
  2009-03-31 20:28 ` [patch 17/29] knfsd: make the reply cache SMP-friendly Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-05-26 18:57   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 19/29] knfsd: faster probing in " Greg Banks
                   ` (11 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Allow the reply cache to expand under nonidempotent NFS call load.
The current fixed limit on reply cache entries is actually so small
as to make the reply cache utterly ineffectual (see the comment in
nfscache.c for details).

This is a simpler version of an older more complicated patch which
dynamically expanded the hash index using lazy rehashing.  Here we
allocate a hash index which is too large for the initial size of the
reply cache, and don't ever resize it.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfscache.c |   76 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 10 deletions(-)

Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -9,7 +9,7 @@
  *
  * Copyright (C) 1995, 1996 Olaf Kirch <okir-pn4DOG8n3UYbFoVRYvo4fw@public.gmane.org>
  *
- * SMP lock splitting by Greg Banks <gnb@sgi.com>
+ * Dynamic expansion and SMP lock splitting by Greg Banks <gnb@sgi.com>
  *     Copyright (c) 2005-2009 Silicon Graphics, Inc.
  */
 
@@ -24,11 +24,21 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/cache.h>
 
-/* Size of reply cache. Common values are:
+/* Initial size of reply cache. Common values are:
  * 4.3BSD:	128
  * 4.4BSD:	256
  * Solaris2:	1024
  * DEC Unix:	512-4096
+ *
+ * All these values reflect network packet rates and NFS loads common
+ * somewhen around 1990, and are utterly inadequate for modern NFS
+ * servers.  To be at all effective the reply cache needs to hold all
+ * NFS calls seen by the server for at least a client RPC timeout period
+ * (typically 1.1 seconds), and to handle weird IP routing issues should
+ * really hold 120 seconds of traffic.  A modern NFS server can be
+ * fielding upwards of 10,000 calls per second, which means the default
+ * cache size of 1024 holds about 102 milliseconds'  traffic, i.e. the
+ * default size is three orders of magnitude too small.
  */
 /* number of buckets used to manage LRU lists and cache locks (power of 2) */
 #ifdef CONFIG_SMP
@@ -36,14 +46,22 @@
 #else
 #define CACHE_NUM_BUCKETS	1
 #endif
-/* number of entries in all LRU lists (power of 2) */
+/* initial number of entries in all LRU lists (power of 2) */
 #define CACHE_SIZE		(1024)
+/* largest possible number of entries in all LRU lists (power of 2) */
+#define CACHE_MAX_SIZE		(16*1024*CACHE_NUM_BUCKETS)
 /* largest possible number of entries in LRU per bucket */
-#define CACHE_BUCKET_MAX_SIZE	(CACHE_SIZE/CACHE_NUM_BUCKETS)
+#define CACHE_BUCKET_MAX_SIZE	(CACHE_MAX_SIZE/CACHE_NUM_BUCKETS)
+/* number of entries each bucket will expand by */
+#define CACHE_BUCKET_INCREMENT	(1024/CACHE_NUM_BUCKETS)
 /* log2 of largest desired hash chain length */
 #define MAX_CHAIN_ORDER		2
 /* initial and maximum size of the per-bucket hash table */
-#define HASHSIZE		((CACHE_SIZE>>MAX_CHAIN_ORDER)/CACHE_NUM_BUCKETS)
+#define HASHSIZE		((CACHE_MAX_SIZE>>MAX_CHAIN_ORDER)/CACHE_NUM_BUCKETS)
+/* the cache attempts to expand if an entry younger than this is evicted */
+#define CACHE_THRESH_AGE	(11 * HZ / 10)  /* in jiffies */
+/* parameters for rate limiting cache expansion */
+#define CACHE_RATE_JIFFIES	(HZ/2)
 
 /*
  * locking for the reply cache:
@@ -63,6 +81,9 @@ struct svc_cache_bucket
 	struct list_head lru;
 	unsigned int size;
 	struct hlist_head *hash;
+	/* parameters for expand rate limiting */
+	unsigned long last;
+	unsigned long nhits;
 } ____cacheline_aligned_in_smp;
 
 static struct svc_cache_bucket	cache_buckets[CACHE_NUM_BUCKETS];
@@ -90,18 +111,18 @@ static inline u32 request_hash(u32 xid, 
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 
 /*
- * Initialise the reply cache data structures.
+ * Expand (or initialise) the reply cache data structures.
  * Called without cache_lock, uses it internally.  Returns
  * 0 on success, an error otherwise.
  */
-static int nfsd_cache_bucket_init(struct svc_cache_bucket *b, unsigned int num)
+static int nfsd_cache_bucket_expand(struct svc_cache_bucket *b, unsigned int increment)
 {
 	struct svc_cacherep *rp;
 	unsigned int i;
 	LIST_HEAD(lru);
 
 	/* allocate new entries without the lock, keep them on their own list */
-	i = num;
+	i = increment;
 	while (i) {
 		rp = kmalloc(sizeof(*rp), GFP_KERNEL);
 		if (!rp)
@@ -116,7 +137,7 @@ static int nfsd_cache_bucket_init(struct
 	/* add the new entries */
 	spin_lock(&b->lock);
 
-	b->size = num;
+	b->size += increment;
 	list_splice(&lru, &b->lru);
 
 	spin_unlock(&b->lock);
@@ -142,7 +163,7 @@ int nfsd_reply_cache_init(void)
 
 		INIT_LIST_HEAD(&b->lru);
 		spin_lock_init(&b->lock);
-		if (nfsd_cache_bucket_init(b, CACHE_SIZE/CACHE_NUM_BUCKETS))
+		if (nfsd_cache_bucket_expand(b, CACHE_SIZE/CACHE_NUM_BUCKETS))
 			goto out_nomem;
 		b->hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
 		if (!b->hash)
@@ -189,6 +210,26 @@ static inline void lru_put_end(struct sv
 }
 
 /*
+ * Decide whether it is time to expand the cache.  Returns 1 iff
+ * the cache is to be expanded.  Called with bucket lock held.
+ */
+static int nfsd_cache_expand_ratelimit(struct svc_cache_bucket *b)
+{
+	unsigned long now = jiffies;
+
+	b->nhits++;
+	if (b->last == 0) {
+		b->last = now;
+	} else if ((now - b->last) > CACHE_RATE_JIFFIES &&
+		   b->nhits > (b->size >> 4)) {
+		b->nhits = 0;
+		b->last = now;
+		return 1;
+	}
+	return 0;
+}
+
+/*
  * Try to find an entry matching the current call in the cache. When none
  * is found, we grab the oldest unlocked entry off the LRU list.
  * Note that no operation within the loop may sleep.
@@ -207,6 +248,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	struct svc_cache_bucket *b;
 	unsigned long		age;
 	int			rtn;
+	int			expand = 0;
 
 	rqstp->rq_cacherep = NULL;
 	if (cache_disabled || type == RC_NOCACHE) {
@@ -259,6 +301,18 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 		goto out;
 	}
 
+	if (rp->c_state != RC_UNUSED) {
+		/* reusing an existing cache entry */
+		age = jiffies - rp->c_timestamp;
+		if (age < CACHE_THRESH_AGE &&
+		    b->size < CACHE_BUCKET_MAX_SIZE &&
+		    nfsd_cache_expand_ratelimit(b)) {
+			expand = CACHE_BUCKET_INCREMENT;
+			if (b->size + expand > CACHE_BUCKET_MAX_SIZE)
+				expand = CACHE_BUCKET_MAX_SIZE - b->size;
+		}
+	}
+
 	rqstp->rq_cacherep = rp;
 	rp->c_state = RC_INPROG;
 	rp->c_xid = xid;
@@ -280,6 +334,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	rp->c_type = RC_NOCACHE;
  out:
 	spin_unlock(&b->lock);
+	if (expand)
+		nfsd_cache_bucket_expand(b, expand);
 	return rtn;
 
 found_entry:

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 19/29] knfsd: faster probing in the reply cache
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (17 preceding siblings ...)
  2009-03-31 20:28 ` [patch 18/29] knfsd: dynamically expand the reply cache Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 20/29] knfsd: add extended reply cache stats Greg Banks
                   ` (10 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Slightly reorganise the CPU-intensive hash chain probing to check
for more commonly varying entry keys first, and to reduce arithmetic
operations by hoisting out an invariant.  This slightly reduces the
CPU time spent in nfsd_cache_lookup() under heavy nonidempotent loads.

This code predates the IPv6 support in the NFS server but should be
harmless with IPv6 clients.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfscache.c |   25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -108,6 +108,23 @@ static inline u32 request_hash(u32 xid, 
 	return h;
 }
 
+
+/*
+ * Fast compare of two sockaddr_in's.  Ignore the zero padding at
+ * the end of a sockaddr_in; compares sin_family and sin_port in
+ * one comparison.  Returns 1 if identical, 0 otherwise.
+ *
+ * Alignment issues prevent us from comparing all the relevant
+ * fields in a single 64b comparison on 64b platforms: the most
+ * we can assume is that sin_addr is 32b-aligned.
+ */
+static inline int
+compare_sockaddr_in(const struct sockaddr_in *sina, const struct sockaddr_in *sinb)
+{
+	return ((*(u32 *)&sina->sin_family == *(u32 *)&sinb->sin_family) &&
+		(sina->sin_addr.s_addr == sinb->sin_addr.s_addr));
+}
+
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 
 /*
@@ -263,12 +280,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	rtn = RC_DOIT;
 
 	rh = &b->hash[h];
+	age = jiffies - 120*HZ;
 	hlist_for_each_entry(rp, hn, rh, c_hash) {
 		if (rp->c_state != RC_UNUSED &&
-		    xid == rp->c_xid && proc == rp->c_proc &&
+		    xid == rp->c_xid &&
+		    compare_sockaddr_in(svc_addr_in(rqstp), &rp->c_addr) &&
+		    proc == rp->c_proc &&
 		    proto == rp->c_prot && vers == rp->c_vers &&
-		    time_before(jiffies, rp->c_timestamp + 120*HZ) &&
-		    memcmp((char*)&rqstp->rq_addr, (char*)&rp->c_addr, sizeof(rp->c_addr))==0) {
+		    time_after(rp->c_timestamp, age)) {
 			nfsdstats.rchits++;
 			goto found_entry;
 		}

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 20/29] knfsd: add extended reply cache stats
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (18 preceding siblings ...)
  2009-03-31 20:28 ` [patch 19/29] knfsd: faster probing in " Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 21/29] knfsd: remove unreported filehandle stats counters Greg Banks
                   ` (9 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Add more statistics to /proc/net/rpc/nfsd which track the behaviour
of the reply cache, in particular hashing efficiency and memory usage.
A new line starting with the keyword "rc2" is added.

Note: the nfsdstats structure is currently a global contention point
in heavy multiprocessor NFS workloads, so this patch will actually
slow down the reply cache slightly.  That problem is addressed in a
later patch.

Signed-off-by: Greg Banks <gnb@sgi.com>
---

 fs/nfsd/nfscache.c         |   17 +++++++++++++++++
 fs/nfsd/stats.c            |   10 ++++++++++
 include/linux/nfsd/stats.h |   10 +++++++++-
 3 files changed, 36 insertions(+), 1 deletion(-)

Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -107,6 +107,16 @@ static int nfsd_proc_show(struct seq_fil
 	seq_putc(seq, '\n');
 #endif
 
+	/* extended repcache stats */
+	seq_printf(seq, "rc2 %u %u %u %u %u %u %u\n",
+			nfsdstats.rcprobes,
+			nfsdstats.rcexpands,
+			nfsdstats.rcrehash,
+			nfsdstats.rcentries,
+			nfsdstats.rcmem,
+			nfsdstats.rchashsize,
+			nfsdstats.rcage);
+
 	return 0;
 }
 
Index: bfields/include/linux/nfsd/stats.h
===================================================================
--- bfields.orig/include/linux/nfsd/stats.h
+++ bfields/include/linux/nfsd/stats.h
@@ -37,7 +37,15 @@ struct nfsd_stats {
 #ifdef CONFIG_NFSD_V4
 	unsigned int	nfs4_opcount[LAST_NFS4_OP + 1];	/* count of individual nfsv4 operations */
 #endif
-
+	/* extended repcache stats */
+	unsigned int    rcprobes;       /* counter: walks down hash chains */
+	unsigned int    rcexpands;      /* counter: when the cache is expanded */
+	unsigned int    rcrehash;       /* counter: when the cache index is expanded */
+	unsigned int    rcentries;      /* instant: # entries */
+	unsigned int    rcmem;          /* instant: bytes of memory used */
+	unsigned int    rchashsize;     /* instant: # chains in index */
+	unsigned int    rcage;          /* instant: age in milliseconds of last
+					 * entry reused from the LRU list */
 };
 
 struct nfsd_op_stats {
Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -155,6 +155,8 @@ static int nfsd_cache_bucket_expand(stru
 	spin_lock(&b->lock);
 
 	b->size += increment;
+	nfsdstats.rcentries += increment;
+	nfsdstats.rcmem += increment * sizeof(struct svc_cacherep);
 	list_splice(&lru, &b->lru);
 
 	spin_unlock(&b->lock);
@@ -185,8 +187,11 @@ int nfsd_reply_cache_init(void)
 		b->hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
 		if (!b->hash)
 			goto out_nomem;
+
+		nfsdstats.rcmem += HASHSIZE * sizeof(struct hlist_head);
 	}
 
+	nfsdstats.rchashsize = HASHSIZE;
 	cache_disabled = 0;
 	return 0;
 out_nomem:
@@ -266,6 +271,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	unsigned long		age;
 	int			rtn;
 	int			expand = 0;
+	unsigned int		nprobes = 0;
 
 	rqstp->rq_cacherep = NULL;
 	if (cache_disabled || type == RC_NOCACHE) {
@@ -282,6 +288,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	rh = &b->hash[h];
 	age = jiffies - 120*HZ;
 	hlist_for_each_entry(rp, hn, rh, c_hash) {
+		nprobes++;
 		if (rp->c_state != RC_UNUSED &&
 		    xid == rp->c_xid &&
 		    compare_sockaddr_in(svc_addr_in(rqstp), &rp->c_addr) &&
@@ -289,10 +296,12 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 		    proto == rp->c_prot && vers == rp->c_vers &&
 		    time_after(rp->c_timestamp, age)) {
 			nfsdstats.rchits++;
+			nfsdstats.rcprobes += nprobes;
 			goto found_entry;
 		}
 	}
 	nfsdstats.rcmisses++;
+	nfsdstats.rcprobes += nprobes;
 
 	/* This loop shouldn't take more than a few iterations normally */
 	{
@@ -323,12 +332,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	if (rp->c_state != RC_UNUSED) {
 		/* reusing an existing cache entry */
 		age = jiffies - rp->c_timestamp;
+		nfsdstats.rcage = age;
 		if (age < CACHE_THRESH_AGE &&
 		    b->size < CACHE_BUCKET_MAX_SIZE &&
 		    nfsd_cache_expand_ratelimit(b)) {
 			expand = CACHE_BUCKET_INCREMENT;
 			if (b->size + expand > CACHE_BUCKET_MAX_SIZE)
 				expand = CACHE_BUCKET_MAX_SIZE - b->size;
+			nfsdstats.rcexpands++;
 		}
 	}
 
@@ -349,6 +360,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	if (rp->c_type == RC_REPLBUFF) {
 		kfree(rp->c_replvec.iov_base);
 		rp->c_replvec.iov_base = NULL;
+		nfsdstats.rcmem -= rp->c_replvec.iov_len;
 	}
 	rp->c_type = RC_NOCACHE;
  out:
@@ -418,6 +430,7 @@ nfsd_cache_update(struct svc_rqst *rqstp
 	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
 	int		len;
 	struct svc_cache_bucket *b;
+	unsigned int	moremem = 0;
 
 	if (!(rp = rqstp->rq_cacherep) || cache_disabled)
 		return;
@@ -450,6 +463,7 @@ nfsd_cache_update(struct svc_rqst *rqstp
 		}
 		cachv->iov_len = len << 2;
 		memcpy(cachv->iov_base, statp, len << 2);
+		moremem = len << 2;
 		break;
 	}
 	spin_lock(&b->lock);
@@ -458,6 +472,8 @@ nfsd_cache_update(struct svc_rqst *rqstp
 	rp->c_type = cachetype;
 	rp->c_state = RC_DONE;
 	rp->c_timestamp = jiffies;
+	if (moremem)
+		nfsdstats.rcmem += moremem;
 	spin_unlock(&b->lock);
 	return;
 }
@@ -481,3 +497,4 @@ nfsd_cache_append(struct svc_rqst *rqstp
 	vec->iov_len += data->iov_len;
 	return 1;
 }
+

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 21/29] knfsd: remove unreported filehandle stats counters
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (19 preceding siblings ...)
  2009-03-31 20:28 ` [patch 20/29] knfsd: add extended reply cache stats Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-05-12 20:00   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 22/29] knfsd: make svc_authenticate() scale Greg Banks
                   ` (8 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

The file nfsfh.c contains two static variables nfsd_nr_verified and
nfsd_nr_put.  These are counters which are incremented as a side
effect of the fh_verify() fh_compose() and fh_put() operations,
i.e. at least twice per NFS call for any non-trivial workload.
Needless to say this makes the cacheline that contains them (and any
other innocent victims) a very hot contention point indeed under high
call-rate workloads on a multiprocessor NFS server.  It also turns out
that these counters are not used anywhere.  They're not reported to
userspace, they're not used in logic, they're not even exported from
the object file (let alone the module).  All they do is waste CPU time.

So this patch removes them.

Tests on a 16 CPU Altix A4700 with 2 10gige Myricom cards, configured
separately (no bonding).  Workload is 640 client threads doing directory
traversals with random small reads, from server RAM.

Before
======

Kernel profile:

  %   cumulative   self              self     total
 time   samples   samples    calls   1/call   1/call  name
  6.05   2716.00  2716.00    30406     0.09     1.02  svc_process
  4.44   4706.00  1990.00     1975     1.01     1.01  spin_unlock_irqrestore
  3.72   6376.00  1670.00     1666     1.00     1.00  svc_export_put
  3.41   7907.00  1531.00     1786     0.86     1.02  nfsd_ofcache_lookup
  3.25   9363.00  1456.00    10965     0.13     1.01  nfsd_dispatch
  3.10  10752.00  1389.00     1376     1.01     1.01  nfsd_cache_lookup
  2.57  11907.00  1155.00     4517     0.26     1.03  svc_tcp_recvfrom
  ...
  2.21  15352.00  1003.00     1081     0.93     1.00  nfsd_choose_ofc  <----
  ^^^^ 

Here the function nfsd_choose_ofc() reads a global variable
which by accident happened to be located in the same cacheline as
nfsd_nr_verified.

Call rate:

nullarbor:~ # pmdumptext nfs3.server.calls
...
Thu Dec 13 00:15:27     184780.663
Thu Dec 13 00:15:28     184885.881
Thu Dec 13 00:15:29     184449.215
Thu Dec 13 00:15:30     184971.058
Thu Dec 13 00:15:31     185036.052
Thu Dec 13 00:15:32     185250.475
Thu Dec 13 00:15:33     184481.319
Thu Dec 13 00:15:34     185225.737
Thu Dec 13 00:15:35     185408.018
Thu Dec 13 00:15:36     185335.764

 
After
=====

kernel profile:

  %   cumulative   self              self     total
 time   samples   samples    calls   1/call   1/call  name
  6.33   2813.00  2813.00    29979     0.09     1.01  svc_process
  4.66   4883.00  2070.00     2065     1.00     1.00  spin_unlock_irqrestore
  4.06   6687.00  1804.00     2182     0.83     1.00  nfsd_ofcache_lookup
  3.20   8110.00  1423.00    10932     0.13     1.00  nfsd_dispatch
  3.03   9456.00  1346.00     1343     1.00     1.00  nfsd_cache_lookup
  2.62  10622.00  1166.00     4645     0.25     1.01  svc_tcp_recvfrom
[...]
  0.10  42586.00    44.00       74     0.59     1.00  nfsd_choose_ofc  <--- HA!!
  ^^^^

Call rate:

nullarbor:~ # pmdumptext nfs3.server.calls
...
Thu Dec 13 01:45:28     194677.118
Thu Dec 13 01:45:29     193932.692
Thu Dec 13 01:45:30     194294.364
Thu Dec 13 01:45:31     194971.276
Thu Dec 13 01:45:32     194111.207
Thu Dec 13 01:45:33     194999.635
Thu Dec 13 01:45:34     195312.594
Thu Dec 13 01:45:35     195707.293
Thu Dec 13 01:45:36     194610.353
Thu Dec 13 01:45:37     195913.662
Thu Dec 13 01:45:38     194808.675

i.e. about a 5.3% improvement in call rate.

Signed-off-by: Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
Reviewed-by: David Chinner <dgc@sgi.com>
---

 fs/nfsd/nfsfh.c |    6 ------
 1 file changed, 6 deletions(-)

Index: bfields/fs/nfsd/nfsfh.c
===================================================================
--- bfields.orig/fs/nfsd/nfsfh.c
+++ bfields/fs/nfsd/nfsfh.c
@@ -27,9 +27,6 @@
 #define NFSDDBG_FACILITY		NFSDDBG_FH
 
 
-static int nfsd_nr_verified;
-static int nfsd_nr_put;
-
 /*
  * our acceptability function.
  * if NOSUBTREECHECK, accept anything
@@ -251,7 +248,6 @@ static __be32 nfsd_set_fh_dentry(struct 
 
 	fhp->fh_dentry = dentry;
 	fhp->fh_export = exp;
-	nfsd_nr_verified++;
 	return 0;
 out:
 	exp_put(exp);
@@ -552,7 +548,6 @@ fh_compose(struct svc_fh *fhp, struct sv
 			return nfserr_opnotsupp;
 	}
 
-	nfsd_nr_verified++;
 	return 0;
 }
 
@@ -609,7 +604,6 @@ fh_put(struct svc_fh *fhp)
 		fhp->fh_pre_saved = 0;
 		fhp->fh_post_saved = 0;
 #endif
-		nfsd_nr_put++;
 	}
 	if (exp) {
 		cache_put(&exp->h, &svc_export_cache);

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 22/29] knfsd: make svc_authenticate() scale
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (20 preceding siblings ...)
  2009-03-31 20:28 ` [patch 21/29] knfsd: remove unreported filehandle stats counters Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-05-12 21:24   ` J. Bruce Fields
  2009-03-31 20:28 ` [patch 23/29] knfsd: introduce SVC_INC_STAT Greg Banks
                   ` (7 subsequent siblings)
  29 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Replace the global spinlock which protects the table of registered
RPC authentication flavours, with an RCU scheme.  The spinlock was
taken by nfsd on every CPU for every NFS call, resulting in lots
of spinlock contention and one very hot and bouncy cacheline.

Tests on a 16 CPU Altix A4700 with 2 10gige Myricom cards, configured
separately (no bonding).  Workload is 640 client threads doing directory
traversals with random small reads, from server RAM.

Before: 242 KIOPS, with an oprofile like:
  %   cumulative   self              self     total
 time   samples   samples    calls   1/call   1/call  name
  5.01   2276.00  2276.00     2666     0.85     1.00  nfsd_ofcache_lookup
  4.61   4370.00  2094.00     2092     1.00     1.00  ia64_spinlock_contention	<----
  4.20   6279.00  1909.00     3141     0.61     0.78  svc_sock_enqueue
  4.03   8108.00  1829.00     1824     1.00     1.00  spin_unlock_irqrestore
  3.32   9618.00  1510.00     3588     0.42     1.00  spin_lock

             2090.00    0.00    2088/2092        spin_lock [22]
[40]     4.6 2094.00    0.00    2092         ia64_spinlock_contention [40]

             1473.39 2039.32    3501/3588        svc_authenticate [21]
[22]     7.9 1510.00 2090.00    3588         spin_lock [22]

After: 253 KIOPS, with a oprofile like:
  %   cumulative   self              self     total
 time   samples   samples    calls   1/call   1/call  name
  5.20   2250.00  2250.00     2638     0.85     1.00  nfsd_ofcache_lookup
  4.31   4117.00  1867.00     1863     1.00     1.00  spin_unlock_irqrestore
  3.13   5470.00  1353.00     1447     0.94     1.01  svcauth_unix_set_client
  2.79   6677.00  1207.00     1203     1.00     1.00  exp_readunlock
  2.77   7875.00  1198.00     1186     1.01     1.01  svc_export_put
  ...
  0.03  43095.00    13.00       13     1.00     1.00  ia64_spinlock_contention	<----

Before anyone asks, going to a rwlock_t kept similar performance and
just turned the time spent spinning on the lock to time spent waiting
for the cacheline to bounce.

Signed-off-by: Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
Reviewed-by: David Chinner <dgc@sgi.com>
---

 net/sunrpc/svcauth.c |   26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

Index: bfields/net/sunrpc/svcauth.c
===================================================================
--- bfields.orig/net/sunrpc/svcauth.c
+++ bfields/net/sunrpc/svcauth.c
@@ -42,17 +42,19 @@ svc_authenticate(struct svc_rqst *rqstp,
 	*authp = rpc_auth_ok;
 
 	flavor = svc_getnl(&rqstp->rq_arg.head[0]);
+	if (flavor >= RPC_AUTH_MAXFLAVOR)
+		return SVC_DENIED;
 
 	dprintk("svc: svc_authenticate (%d)\n", flavor);
 
-	spin_lock(&authtab_lock);
-	if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor])
-			|| !try_module_get(aops->owner)) {
-		spin_unlock(&authtab_lock);
+	rcu_read_lock();
+	aops = rcu_dereference(authtab[flavor]);
+	if (!aops || !try_module_get(aops->owner)) {
+		rcu_read_unlock();
 		*authp = rpc_autherr_badcred;
 		return SVC_DENIED;
 	}
-	spin_unlock(&authtab_lock);
+	rcu_read_unlock();
 
 	rqstp->rq_authop = aops;
 	return aops->accept(rqstp, authp);
@@ -87,9 +89,13 @@ int
 svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
 {
 	int rv = -EINVAL;
+
+	if (flavor >= RPC_AUTH_MAXFLAVOR)
+		return -EINVAL;
+
 	spin_lock(&authtab_lock);
-	if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) {
-		authtab[flavor] = aops;
+	if (authtab[flavor] == NULL) {
+		rcu_assign_pointer(authtab[flavor], aops);
 		rv = 0;
 	}
 	spin_unlock(&authtab_lock);
@@ -100,9 +106,11 @@ EXPORT_SYMBOL_GPL(svc_auth_register);
 void
 svc_auth_unregister(rpc_authflavor_t flavor)
 {
+	if (flavor >= RPC_AUTH_MAXFLAVOR)
+		return;
+
 	spin_lock(&authtab_lock);
-	if (flavor < RPC_AUTH_MAXFLAVOR)
-		authtab[flavor] = NULL;
+	rcu_assign_pointer(authtab[flavor], NULL);
 	spin_unlock(&authtab_lock);
 }
 EXPORT_SYMBOL_GPL(svc_auth_unregister);

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 23/29] knfsd: introduce SVC_INC_STAT
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (21 preceding siblings ...)
  2009-03-31 20:28 ` [patch 22/29] knfsd: make svc_authenticate() scale Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 24/29] knfsd: remove the program field from struct svc_stat Greg Banks
                   ` (6 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Replace direct increments of counters in the svc_stat structure
attached to each svc_serv, with the macro SVC_INC_STAT().  This doesn't
change any functionality but prepares for a subsequent patch which
changes the definitions of those macros to make the sv_stat counters
per-cpu for performance.

Signed-off-by: Greg Banks <gnb@sgi.com>
Reviewed-by: David Chinner <dgc@sgi.com>
Reviewed-by: Peter Leckie <pleckie-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
---

 include/linux/sunrpc/svc.h |    3 +++
 net/sunrpc/svc.c           |   18 +++++++++---------
 net/sunrpc/svcsock.c       |   12 ++++--------
 3 files changed, 16 insertions(+), 17 deletions(-)

Index: bfields/include/linux/sunrpc/svc.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc.h
+++ bfields/include/linux/sunrpc/svc.h
@@ -109,6 +109,9 @@ struct svc_serv {
 	unsigned int		sv_drc_pages_used;/* DRC pages used */
 #endif /* CONFIG_NFSD_V4_1 */
 };
+#define SVC_INC_STAT(serv, field) \
+	((serv)->sv_stats ? \
+		++((serv)->sv_stats->field) : 0)
 
 /*
  * We use sv_nrthreads as a reference count.  svc_destroy() drops
Index: bfields/net/sunrpc/svc.c
===================================================================
--- bfields.orig/net/sunrpc/svc.c
+++ bfields/net/sunrpc/svc.c
@@ -1084,7 +1084,7 @@ svc_process(struct svc_rqst *rqstp)
 	rqstp->rq_procinfo = procp;
 
 	/* Syntactic check complete */
-	serv->sv_stats->rpccnt++;
+	SVC_INC_STAT(serv, rpccnt);
 
 	/* Build the reply header. */
 	statp = resv->iov_base +resv->iov_len;
@@ -1121,7 +1121,7 @@ svc_process(struct svc_rqst *rqstp)
 		if (*statp == rpc_success && (xdr = procp->pc_encode)
 		 && !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
 			dprintk("svc: failed to encode reply\n");
-			/* serv->sv_stats->rpcsystemerr++; */
+			/* SVC_INC_STAT(serv, rpcsystemerr); */
 			*statp = rpc_system_err;
 		}
 	} else {
@@ -1165,11 +1165,11 @@ err_short_len:
 err_bad_dir:
 	svc_printk(rqstp, "bad direction %d, dropping request\n", dir);
 
-	serv->sv_stats->rpcbadfmt++;
+	SVC_INC_STAT(serv, rpcbadfmt);
 	goto dropit;			/* drop request */
 
 err_bad_rpc:
-	serv->sv_stats->rpcbadfmt++;
+	SVC_INC_STAT(serv, rpcbadfmt);
 	svc_putnl(resv, 1);	/* REJECT */
 	svc_putnl(resv, 0);	/* RPC_MISMATCH */
 	svc_putnl(resv, 2);	/* Only RPCv2 supported */
@@ -1178,7 +1178,7 @@ err_bad_rpc:
 
 err_bad_auth:
 	dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat));
-	serv->sv_stats->rpcbadauth++;
+	SVC_INC_STAT(serv, rpcbadauth);
 	/* Restore write pointer to location of accept status: */
 	xdr_ressize_check(rqstp, reply_statp);
 	svc_putnl(resv, 1);	/* REJECT */
@@ -1188,7 +1188,7 @@ err_bad_auth:
 
 err_bad_prog:
 	dprintk("svc: unknown program %d\n", prog);
-	serv->sv_stats->rpcbadfmt++;
+	SVC_INC_STAT(serv, rpcbadfmt);
 	svc_putnl(resv, RPC_PROG_UNAVAIL);
 	goto sendit;
 
@@ -1196,7 +1196,7 @@ err_bad_vers:
 	svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n",
 		       vers, prog, progp->pg_name);
 
-	serv->sv_stats->rpcbadfmt++;
+	SVC_INC_STAT(serv, rpcbadfmt);
 	svc_putnl(resv, RPC_PROG_MISMATCH);
 	svc_putnl(resv, progp->pg_lovers);
 	svc_putnl(resv, progp->pg_hivers);
@@ -1205,7 +1205,7 @@ err_bad_vers:
 err_bad_proc:
 	svc_printk(rqstp, "unknown procedure (%d)\n", proc);
 
-	serv->sv_stats->rpcbadfmt++;
+	SVC_INC_STAT(serv, rpcbadfmt);
 	svc_putnl(resv, RPC_PROC_UNAVAIL);
 	goto sendit;
 
@@ -1214,7 +1214,7 @@ err_garbage:
 
 	rpc_stat = rpc_garbage_args;
 err_bad:
-	serv->sv_stats->rpcbadfmt++;
+	SVC_INC_STAT(serv, rpcbadfmt);
 	svc_putnl(resv, ntohl(rpc_stat));
 	goto sendit;
 }
Index: bfields/net/sunrpc/svcsock.c
===================================================================
--- bfields.orig/net/sunrpc/svcsock.c
+++ bfields/net/sunrpc/svcsock.c
@@ -531,8 +531,7 @@ static int svc_udp_recvfrom(struct svc_r
 			DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE);
 	}
 
-	if (serv->sv_stats)
-		serv->sv_stats->netudpcnt++;
+	SVC_INC_STAT(serv, netudpcnt);
 
 	return len;
 }
@@ -770,8 +769,7 @@ static struct svc_xprt *svc_tcp_accept(s
 	}
 	svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
 
-	if (serv->sv_stats)
-		serv->sv_stats->nettcpconn++;
+	SVC_INC_STAT(serv, nettcpconn);
 
 	return &newsvsk->sk_xprt;
 
@@ -973,8 +971,7 @@ out:
 
 	svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
 	svc_xprt_received(&svsk->sk_xprt);
-	if (serv->sv_stats)
-		serv->sv_stats->nettcpcnt++;
+	SVC_INC_STAT(serv, nettcpcnt);
 
 	return len;
 
Index: bfields/net/sunrpc/svc_xprt.c
===================================================================
--- bfields.orig/net/sunrpc/svc_xprt.c
+++ bfields/net/sunrpc/svc_xprt.c
@@ -765,8 +765,7 @@ int svc_recv(struct svc_rqst *rqstp, lon
 	rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
 	rqstp->rq_chandle.defer = svc_defer;
 
-	if (serv->sv_stats)
-		serv->sv_stats->netcnt++;
+	SVC_INC_STAT(serv, netcnt);
 	return len;
 }
 EXPORT_SYMBOL_GPL(svc_recv);

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 24/29] knfsd: remove the program field from struct svc_stat
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (22 preceding siblings ...)
  2009-03-31 20:28 ` [patch 23/29] knfsd: introduce SVC_INC_STAT Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 25/29] knfsd: allocate svc_serv.sv_stats dynamically Greg Banks
                   ` (5 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Remove the `program' field from the svc_stat structure.  This field
is an anachronism dating from the days when each svc_serv could
only support a single svc_program, and doesn't make sense anymore.
Change the arguments of svc_proc_register() and svc_seq_show() to
remove the uses of svc_stat.program.

This is a preliminary step towards making the NFS server statistics
be gathered per-CPU.

Signed-off-by: Greg Banks <gnb@sgi.com>
Reviewed-by: David Chinner <dgc@sgi.com>
Reviewed-by: Peter Leckie <pleckie-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
---

 fs/nfsd/nfssvc.c             |    4 +---
 fs/nfsd/stats.c              |    9 ++++-----
 include/linux/sunrpc/stats.h |   12 ++++++------
 net/sunrpc/stats.c           |   11 +++++++----
 4 files changed, 18 insertions(+), 18 deletions(-)

Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -56,9 +56,7 @@ static inline void nfsd_stats_prefetch(n
 
 
 struct nfsd_stats	nfsdstats;
-struct svc_stat		nfsd_svcstats = {
-	.program	= &nfsd_program,
-};
+struct svc_stat		nfsd_svcstats;
 
 nfsd_stats_hash_t nfsd_export_stats_hash;
 nfsd_stats_hash_t nfsd_client_stats_hash;
@@ -95,7 +93,8 @@ static int nfsd_proc_show(struct seq_fil
 	seq_putc(seq, '\n');
 	
 	/* show my rpc info */
-	svc_seq_show(seq, &nfsd_svcstats);
+	if (nfsd_serv)
+		svc_seq_show(seq, nfsd_serv);
 
 #ifdef CONFIG_NFSD_V4
 	/* Show count for individual nfsv4 operations */
@@ -713,7 +712,7 @@ int nfsd_stats_open(struct file *file, n
 void
 nfsd_stat_init(void)
 {
-	svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops);
+	svc_proc_register("nfsd", &nfsd_proc_fops);
 
 	nfsd_stats_hash_init(&nfsd_export_stats_hash, "export");
 	nfsd_stats_hash_init(&nfsd_client_stats_hash, "client");
Index: bfields/include/linux/sunrpc/stats.h
===================================================================
--- bfields.orig/include/linux/sunrpc/stats.h
+++ bfields/include/linux/sunrpc/stats.h
@@ -26,8 +26,6 @@ struct rpc_stat {
 };
 
 struct svc_stat {
-	struct svc_program *	program;
-
 	unsigned int		netcnt,
 				netudpcnt,
 				nettcpcnt,
@@ -44,16 +42,18 @@ void			rpc_proc_exit(void);
 void			rpc_modcount(struct inode *, int);
 #endif
 
+struct svc_serv;    /* forward declare to shut up compiler */
+
 #ifdef CONFIG_PROC_FS
 struct proc_dir_entry *	rpc_proc_register(struct rpc_stat *);
 void			rpc_proc_unregister(const char *);
 void			rpc_proc_zero(struct rpc_program *);
-struct proc_dir_entry *	svc_proc_register(struct svc_stat *,
+struct proc_dir_entry *	svc_proc_register(const char *,
 					  const struct file_operations *);
 void			svc_proc_unregister(const char *);
 
 void			svc_seq_show(struct seq_file *,
-				     const struct svc_stat *);
+				     const struct svc_serv *);
 
 extern struct proc_dir_entry	*proc_net_rpc;
 
@@ -63,12 +63,12 @@ static inline struct proc_dir_entry *rpc
 static inline void rpc_proc_unregister(const char *p) {}
 static inline void rpc_proc_zero(struct rpc_program *p) {}
 
-static inline struct proc_dir_entry *svc_proc_register(struct svc_stat *s,
+static inline struct proc_dir_entry *svc_proc_register(const char *s,
 						       const struct file_operations *f) { return NULL; }
 static inline void svc_proc_unregister(const char *p) {}
 
 static inline void svc_seq_show(struct seq_file *seq,
-				const struct svc_stat *st) {}
+				const struct svc_serv *) {}
 
 #define proc_net_rpc NULL
 
Index: bfields/net/sunrpc/stats.c
===================================================================
--- bfields.orig/net/sunrpc/stats.c
+++ bfields/net/sunrpc/stats.c
@@ -77,8 +77,11 @@ static const struct file_operations rpc_
 /*
  * Get RPC server stats
  */
-void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
-	const struct svc_program *prog = statp->program;
+void svc_seq_show(struct seq_file *seq, const struct svc_serv *serv)
+{
+	/* TODO: report call counts from the non-primary programs */
+	const struct svc_program *prog = serv->sv_program;
+	struct svc_stat *statp = serv->sv_stats;
 	const struct svc_procedure *proc;
 	const struct svc_version *vers;
 	unsigned int i, j;
@@ -245,9 +248,9 @@ rpc_proc_unregister(const char *name)
 EXPORT_SYMBOL_GPL(rpc_proc_unregister);
 
 struct proc_dir_entry *
-svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
+svc_proc_register(const char *name, const struct file_operations *fops)
 {
-	return do_register(statp->program->pg_name, statp, fops);
+	return do_register(name, NULL, fops);
 }
 EXPORT_SYMBOL_GPL(svc_proc_register);
 
Index: bfields/fs/nfsd/nfssvc.c
===================================================================
--- bfields.orig/fs/nfsd/nfssvc.c
+++ bfields/fs/nfsd/nfssvc.c
@@ -88,9 +88,7 @@ static struct svc_program	nfsd_acl_progr
 	.pg_authenticate	= &svc_set_client,
 };
 
-static struct svc_stat	nfsd_acl_svcstats = {
-	.program	= &nfsd_acl_program,
-};
+static struct svc_stat	nfsd_acl_svcstats;
 #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
 
 static struct svc_version *	nfsd_version[] = {

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 25/29] knfsd: allocate svc_serv.sv_stats dynamically
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (23 preceding siblings ...)
  2009-03-31 20:28 ` [patch 24/29] knfsd: remove the program field from struct svc_stat Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 26/29] knfsd: make svc_serv.sv_stats per-CPU Greg Banks
                   ` (4 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Remove some more anachronisms from the days when each svc_serv could
only handle a single svc_program.

One svc_stat statistics structure was pointed to by svc_serv.sv_stats
and updated as calls flowed.  However, a statically allocated svc_stat
was also pointed to by each svc_program.pg_stats, and the svc_serv
would take over the svc_stat of the first svc_program added to it,
ignoring the others.

This patch removes the svc_program.pg_stats field and the statically
allocated svc_stat structures.  A single svc_stat structure is
allocated dynamically when the svc_serv starts.

This is a preliminary step towards making the NFS server statistics
be gathered per-CPU.

Signed-off-by: Greg Banks <gnb@sgi.com>
Reviewed-by: David Chinner <dgc@sgi.com>
Reviewed-by: Peter Leckie <pleckie-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
---

 fs/lockd/svc.c             |    6 ------
 fs/nfs/callback.c          |    3 ---
 fs/nfsd/nfssvc.c           |    4 ----
 fs/nfsd/stats.c            |    1 -
 include/linux/nfsd/stats.h |    1 -
 include/linux/sunrpc/svc.h |    1 -
 net/sunrpc/svc.c           |    9 ++++++++-
 7 files changed, 8 insertions(+), 17 deletions(-)

Index: bfields/fs/lockd/svc.c
===================================================================
--- bfields.orig/fs/lockd/svc.c
+++ bfields/fs/lockd/svc.c
@@ -569,8 +569,6 @@ static struct svc_version *	nlmsvc_versi
 #endif
 };
 
-static struct svc_stat		nlmsvc_stats;
-
 #define NLM_NRVERS	ARRAY_SIZE(nlmsvc_version)
 static struct svc_program	nlmsvc_program = {
 	.pg_prog		= NLM_PROGRAM,		/* program number */
@@ -578,6 +576,5 @@ static struct svc_program	nlmsvc_program
 	.pg_vers		= nlmsvc_version,	/* version table */
 	.pg_name		= "lockd",		/* service name */
 	.pg_class		= "nfsd",		/* share authentication with nfsd */
-	.pg_stats		= &nlmsvc_stats,	/* stats table */
 	.pg_authenticate = &lockd_authenticate	/* export authentication */
 };
Index: bfields/fs/nfs/callback.c
===================================================================
--- bfields.orig/fs/nfs/callback.c
+++ bfields/fs/nfs/callback.c
@@ -245,14 +245,11 @@ static struct svc_version *nfs4_callback
 	[1] = &nfs4_callback_version1,
 };
 
-static struct svc_stat nfs4_callback_stats;
-
 static struct svc_program nfs4_callback_program = {
 	.pg_prog = NFS4_CALLBACK,			/* RPC service number */
 	.pg_nvers = ARRAY_SIZE(nfs4_callback_version),	/* Number of entries */
 	.pg_vers = nfs4_callback_version,		/* version table */
 	.pg_name = "NFSv4 callback",			/* service name */
 	.pg_class = "nfs",				/* authentication class */
-	.pg_stats = &nfs4_callback_stats,
 	.pg_authenticate = nfs_callback_authenticate,
 };
Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -56,7 +56,6 @@ static inline void nfsd_stats_prefetch(n
 
 
 struct nfsd_stats	nfsdstats;
-struct svc_stat		nfsd_svcstats;
 
 nfsd_stats_hash_t nfsd_export_stats_hash;
 nfsd_stats_hash_t nfsd_client_stats_hash;
Index: bfields/fs/nfsd/nfssvc.c
===================================================================
--- bfields.orig/fs/nfsd/nfssvc.c
+++ bfields/fs/nfsd/nfssvc.c
@@ -68,7 +68,6 @@ DEFINE_MUTEX(nfsd_mutex);
 struct svc_serv 		*nfsd_serv;
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-static struct svc_stat	nfsd_acl_svcstats;
 static struct svc_version *	nfsd_acl_version[] = {
 	[2] = &nfsd_acl_version2,
 	[3] = &nfsd_acl_version3,
@@ -84,11 +83,9 @@ static struct svc_program	nfsd_acl_progr
 	.pg_vers		= nfsd_acl_versions,
 	.pg_name		= "nfsacl",
 	.pg_class		= "nfsd",
-	.pg_stats		= &nfsd_acl_svcstats,
 	.pg_authenticate	= &svc_set_client,
 };
 
-static struct svc_stat	nfsd_acl_svcstats;
 #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
 
 static struct svc_version *	nfsd_version[] = {
@@ -114,7 +111,6 @@ struct svc_program		nfsd_program = {
 	.pg_vers		= nfsd_versions,	/* version table */
 	.pg_name		= "nfsd",		/* program name */
 	.pg_class		= "nfsd",		/* authentication class */
-	.pg_stats		= &nfsd_svcstats,	/* version table */
 	.pg_authenticate	= &svc_set_client,	/* export authentication */
 
 };
Index: bfields/include/linux/sunrpc/svc.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc.h
+++ bfields/include/linux/sunrpc/svc.h
@@ -375,7 +375,6 @@ struct svc_program {
 	struct svc_version **	pg_vers;	/* version array */
 	char *			pg_name;	/* service name */
 	char *			pg_class;	/* class name: services sharing authentication */
-	struct svc_stat *	pg_stats;	/* rpc statistics */
 	int			(*pg_authenticate)(struct svc_rqst *);
 };
 
Index: bfields/net/sunrpc/svc.c
===================================================================
--- bfields.orig/net/sunrpc/svc.c
+++ bfields/net/sunrpc/svc.c
@@ -371,7 +371,6 @@ __svc_create(struct svc_program *prog, u
 	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
 	serv->sv_nrthreads = 1;
-	serv->sv_stats     = prog->pg_stats;
 	if (bufsize > RPCSVC_MAXPAYLOAD)
 		bufsize = RPCSVC_MAXPAYLOAD;
 	serv->sv_max_payload = bufsize? bufsize : 4096;
@@ -396,11 +395,18 @@ __svc_create(struct svc_program *prog, u
 	init_timer(&serv->sv_temptimer);
 	spin_lock_init(&serv->sv_lock);
 
+	serv->sv_stats = kzalloc(sizeof(struct svc_stat), GFP_KERNEL);
+	if (!serv->sv_stats) {
+		kfree(serv);
+		return NULL;
+	}
+
 	serv->sv_nrpools = npools;
 	serv->sv_pools =
 		kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
 			GFP_KERNEL);
 	if (!serv->sv_pools) {
+		kfree(serv->sv_stats);
 		kfree(serv);
 		return NULL;
 	}
@@ -489,6 +495,7 @@ svc_destroy(struct svc_serv *serv)
 
 	svc_unregister(serv);
 	kfree(serv->sv_pools);
+	kfree(serv->sv_stats);
 	kfree(serv);
 }
 EXPORT_SYMBOL_GPL(svc_destroy);
Index: bfields/include/linux/nfsd/stats.h
===================================================================
--- bfields.orig/include/linux/nfsd/stats.h
+++ bfields/include/linux/nfsd/stats.h
@@ -142,7 +142,6 @@ struct nfsd_stats_hiter {
 
 
 extern struct nfsd_stats	nfsdstats;
-extern struct svc_stat		nfsd_svcstats;
 extern nfsd_stats_hash_t	nfsd_export_stats_hash;
 extern nfsd_stats_hash_t	nfsd_client_stats_hash;
 

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 26/29] knfsd: make svc_serv.sv_stats per-CPU
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (24 preceding siblings ...)
  2009-03-31 20:28 ` [patch 25/29] knfsd: allocate svc_serv.sv_stats dynamically Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 27/29] knfsd: move hot procedure count field out of svc_procedure Greg Banks
                   ` (3 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Split the svc_stat structure pointed to by svc_serv.sv_stats into
a structure per-cpu, and aggregate all the per-cpu structures just
before emitting them to userspace via /proc.  This avoids bouncing
cachelines when updating stats.  See next patch for performance
numbers.

Signed-off-by: Greg Banks <gnb@sgi.com>
Reviewed-by: David Chinner <dgc@sgi.com>
Reviewed-by: Peter Leckie <pleckie-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
---

 include/linux/sunrpc/svc.h |    6 +++---
 net/sunrpc/stats.c         |   34 +++++++++++++++++++++++-----------
 net/sunrpc/svc.c           |    8 ++++----
 3 files changed, 30 insertions(+), 18 deletions(-)

Index: bfields/include/linux/sunrpc/svc.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc.h
+++ bfields/include/linux/sunrpc/svc.h
@@ -76,7 +76,7 @@ struct svc_pool {
  */
 struct svc_serv {
 	struct svc_program *	sv_program;	/* RPC program */
-	struct svc_stat *	sv_stats;	/* RPC statistics */
+	struct svc_stat *	sv_stats_percpu;/* RPC statistics */
 	spinlock_t		sv_lock;
 	unsigned int		sv_nrthreads;	/* # of server threads */
 	unsigned int		sv_maxconn;	/* max connections allowed or
@@ -110,8 +110,8 @@ struct svc_serv {
 #endif /* CONFIG_NFSD_V4_1 */
 };
 #define SVC_INC_STAT(serv, field) \
-	((serv)->sv_stats ? \
-		++((serv)->sv_stats->field) : 0)
+	((serv)->sv_stats_percpu ? \
+		++(per_cpu_ptr((serv)->sv_stats_percpu, smp_processor_id())->field) : 0)
 
 /*
  * We use sv_nrthreads as a reference count.  svc_destroy() drops
Index: bfields/net/sunrpc/stats.c
===================================================================
--- bfields.orig/net/sunrpc/stats.c
+++ bfields/net/sunrpc/stats.c
@@ -75,17 +75,48 @@ static const struct file_operations rpc_
 };
 
 /*
+ * Accumulate all the per-cpu struct svc_stat
+ * into one global total for emission to userspace.
+ * Relies on struct svc_stat being composed of
+ * unsigned ints without gaps, so it can be treated
+ * as an array of unsigned ints.
+ *
+ * Note: we iterate over all possible CPUs instead
+ * of just the online ones to avoid counters going
+ * backwards when CPUs go offline.
+ */
+static void svc_stat_accum(const struct svc_serv *serv,
+			   struct svc_stat *sp)
+{
+	unsigned int *usp = (unsigned int *)sp;
+	int cpu;
+	int i;
+
+	memset(sp, 0, sizeof(*sp));
+	for_each_possible_cpu(cpu) {
+		unsigned int *ucsp = (unsigned int *)
+					per_cpu_ptr(serv->sv_stats_percpu, cpu);
+		for (i = 0 ; i < sizeof(*sp)/sizeof(unsigned int) ; i++)
+			usp[i] += ucsp[i];
+	}
+}
+
+
+/*
  * Get RPC server stats
  */
 void svc_seq_show(struct seq_file *seq, const struct svc_serv *serv)
 {
 	/* TODO: report call counts from the non-primary programs */
 	const struct svc_program *prog = serv->sv_program;
-	struct svc_stat *statp = serv->sv_stats;
+	struct svc_stat accum;
+	struct svc_stat *statp = &accum;
 	const struct svc_procedure *proc;
 	const struct svc_version *vers;
 	unsigned int i, j;
 
+	svc_stat_accum(serv, &accum);
+
 	seq_printf(seq,
 		"net %u %u %u %u\n",
 			statp->netcnt,
Index: bfields/net/sunrpc/svc.c
===================================================================
--- bfields.orig/net/sunrpc/svc.c
+++ bfields/net/sunrpc/svc.c
@@ -395,8 +395,9 @@ __svc_create(struct svc_program *prog, u
 	init_timer(&serv->sv_temptimer);
 	spin_lock_init(&serv->sv_lock);
 
-	serv->sv_stats = kzalloc(sizeof(struct svc_stat), GFP_KERNEL);
-	if (!serv->sv_stats) {
+	serv->sv_stats_percpu = __alloc_percpu(sizeof(struct svc_stat),
+					       __alignof__(struct svc_stat));
+	if (!serv->sv_stats_percpu) {
 		kfree(serv);
 		return NULL;
 	}
@@ -406,7 +407,7 @@ __svc_create(struct svc_program *prog, u
 		kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
 			GFP_KERNEL);
 	if (!serv->sv_pools) {
-		kfree(serv->sv_stats);
+		free_percpu(serv->sv_stats_percpu);
 		kfree(serv);
 		return NULL;
 	}
@@ -495,7 +496,7 @@ svc_destroy(struct svc_serv *serv)
 
 	svc_unregister(serv);
 	kfree(serv->sv_pools);
-	kfree(serv->sv_stats);
+	free_percpu(serv->sv_stats_percpu);
 	kfree(serv);
 }
 EXPORT_SYMBOL_GPL(svc_destroy);

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 27/29] knfsd: move hot procedure count field out of svc_procedure
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (25 preceding siblings ...)
  2009-03-31 20:28 ` [patch 26/29] knfsd: make svc_serv.sv_stats per-CPU Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 28/29] knfsd: introduce NFSD_INC_STAT() Greg Banks
                   ` (2 subsequent siblings)
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

The svc_procedure structure contains a number of very static fields
which describe each RPC call that can be made.  However the pc_count
field is a counter of calls received.  For any given workload there
will be some of these that are very hot cachelines indeed.

This patch moves the counters to a dynamically allocated per-cpu
area at the end of the svc_stat structure attached to the svc_serv.
The pc_count field is used only as an index into that area, so that the
svc_procedure cachelines remain constant after service initialisation.

The result is dramatically less time spent in svc_process() and
nfsd_dispatch() waiting for svc_procedure cachelines to bounce.

Tests on a 16 CPU Altix A4700 with 2 10gige Myricom cards, configured
separately (no bonding).  Workload is 640 client threads doing directory
traversals with random small reads, from server RAM.

Before
======

Kernel profile:

  %   cumulative   self              self     total
 time   samples   samples    calls   1/call   1/call  name
  6.33   2813.00  2813.00    29979     0.09     1.01  svc_process	<----
  ^^^^
  4.66   4883.00  2070.00     2065     1.00     1.00  spin_unlock_irqrestore
  4.06   6687.00  1804.00     2182     0.83     1.00  nfsd_ofcache_lookup
  3.20   8110.00  1423.00    10932     0.13     1.00  nfsd_dispatch	<----
  ^^^^
  3.03   9456.00  1346.00     1343     1.00     1.00  nfsd_cache_lookup
  2.62  10622.00  1166.00     4645     0.25     1.01  svc_tcp_recvfrom
  2.47  11720.00  1098.00     1096     1.00     1.00  ia64_spinlock_contention

Call rate:

nullarbor:~ # pmdumptext nfs3.server.calls
...
Thu Dec 13 01:45:27     194796.183
Thu Dec 13 01:45:28     194677.118
Thu Dec 13 01:45:29     193932.692
Thu Dec 13 01:45:30     194294.364
Thu Dec 13 01:45:31     194971.276
Thu Dec 13 01:45:32     194111.207
Thu Dec 13 01:45:33     194999.635
Thu Dec 13 01:45:34     195312.594
Thu Dec 13 01:45:35     195707.293
Thu Dec 13 01:45:36     194610.353
Thu Dec 13 01:45:37     195913.662


After
=====

Kernel profile:

  %   cumulative   self              self     total
 time   samples   samples    calls   1/call   1/call  name
  5.32   2420.00  2420.00     2793     0.87     1.00  nfsd_ofcache_lookup
  4.21   4337.00  1917.00     1894     1.01     1.01  spin_unlock_irqrestore
  3.05   5723.00  1386.00     1375     1.01     1.01  ia64_spinlock_contention
  2.76   6977.00  1254.00     1250     1.00     1.00  svc_export_put
  2.67   8193.00  1216.00     1210     1.00     1.01  find_get_page
  2.57   9362.00  1169.00     1247     0.94     1.01  svcauth_unix_set_client
  ...
  0.93  29904.00   425.00    29154     0.01     1.02  svc_process	<----
  ^^^^
  ...
  0.35  38663.00   159.00    11859     0.01     1.01  nfsd_dispatch	<----
  ^^^^

call rate:

nullarbor:~ # pmdumptext nfs3.server.calls
...
Thu Dec 13 06:35:43     242547.513
Thu Dec 13 06:35:44     242257.033
Thu Dec 13 06:35:45     242144.719
Thu Dec 13 06:35:46     242857.100
Thu Dec 13 06:35:47     241464.156
Thu Dec 13 06:35:48     241182.933
Thu Dec 13 06:35:49     241294.968
Thu Dec 13 06:35:50     241606.887

i.e. about a 24.2% improvement in call rate.  Note, this includes the
performance gain from the previous patch which made svc_stat per-cpu.

Signed-off-by: Greg Banks <gnb@sgi.com>
Reviewed-by: David Chinner <dgc@sgi.com>
Reviewed-by: Peter Leckie <pleckie-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
---

 include/linux/sunrpc/stats.h |    1 +
 include/linux/sunrpc/svc.h   |    2 +-
 net/sunrpc/stats.c           |    2 +-
 net/sunrpc/svc.c             |   10 +++++++---
 4 files changed, 10 insertions(+), 5 deletions(-)

Index: bfields/include/linux/sunrpc/stats.h
===================================================================
--- bfields.orig/include/linux/sunrpc/stats.h
+++ bfields/include/linux/sunrpc/stats.h
@@ -34,6 +34,7 @@ struct svc_stat {
 				rpcbadfmt,
 				rpcbadauth,
 				rpcbadclnt;
+	unsigned int		callcnt[];
 };
 
 void			rpc_proc_init(void);
Index: bfields/include/linux/sunrpc/svc.h
===================================================================
--- bfields.orig/include/linux/sunrpc/svc.h
+++ bfields/include/linux/sunrpc/svc.h
@@ -77,6 +77,7 @@ struct svc_pool {
 struct svc_serv {
 	struct svc_program *	sv_program;	/* RPC program */
 	struct svc_stat *	sv_stats_percpu;/* RPC statistics */
+	unsigned int		sv_stats_ncalls;/* how many slots in svc_stat.callcnt[] */
 	spinlock_t		sv_lock;
 	unsigned int		sv_nrthreads;	/* # of server threads */
 	unsigned int		sv_maxconn;	/* max connections allowed or
@@ -408,7 +409,7 @@ struct svc_procedure {
 	kxdrproc_t		pc_release;	/* XDR free result */
 	unsigned int		pc_argsize;	/* argument struct size */
 	unsigned int		pc_ressize;	/* result struct size */
-	unsigned int		pc_count;	/* call count */
+	unsigned int		pc_countidx;	/* index into svc_stat.callcnt[] */
 	unsigned int		pc_cachetype;	/* cache info (NFS) */
 	unsigned int		pc_xdrressize;	/* maximum size of XDR reply */
 };
Index: bfields/net/sunrpc/stats.c
===================================================================
--- bfields.orig/net/sunrpc/stats.c
+++ bfields/net/sunrpc/stats.c
@@ -91,12 +91,13 @@ static void svc_stat_accum(const struct 
 	unsigned int *usp = (unsigned int *)sp;
 	int cpu;
 	int i;
+	int n = sizeof(*sp)/sizeof(unsigned int) + serv->sv_stats_ncalls;
 
 	memset(sp, 0, sizeof(*sp));
 	for_each_possible_cpu(cpu) {
 		unsigned int *ucsp = (unsigned int *)
 					per_cpu_ptr(serv->sv_stats_percpu, cpu);
-		for (i = 0 ; i < sizeof(*sp)/sizeof(unsigned int) ; i++)
+		for (i = 0 ; i < n ; i++)
 			usp[i] += ucsp[i];
 	}
 }
@@ -109,13 +110,17 @@ void svc_seq_show(struct seq_file *seq, 
 {
 	/* TODO: report call counts from the non-primary programs */
 	const struct svc_program *prog = serv->sv_program;
-	struct svc_stat accum;
-	struct svc_stat *statp = &accum;
+	struct svc_stat *statp;
 	const struct svc_procedure *proc;
 	const struct svc_version *vers;
 	unsigned int i, j;
 
-	svc_stat_accum(serv, &accum);
+	statp = kzalloc(sizeof(struct svc_stat) +
+			sizeof(unsigned int) * serv->sv_stats_ncalls,
+			GFP_KERNEL);
+	if (!statp)
+		return;
+	svc_stat_accum(serv, statp);
 
 	seq_printf(seq,
 		"net %u %u %u %u\n",
@@ -136,9 +141,11 @@ void svc_seq_show(struct seq_file *seq, 
 			continue;
 		seq_printf(seq, "proc%d %u", i, vers->vs_nproc);
 		for (j = 0; j < vers->vs_nproc; j++, proc++)
-			seq_printf(seq, " %u", proc->pc_count);
+			seq_printf(seq, " %u", statp->callcnt[proc->pc_countidx]);
 		seq_putc(seq, '\n');
 	}
+
+	kfree(statp);
 }
 EXPORT_SYMBOL_GPL(svc_seq_show);
 
Index: bfields/net/sunrpc/svc.c
===================================================================
--- bfields.orig/net/sunrpc/svc.c
+++ bfields/net/sunrpc/svc.c
@@ -365,6 +365,7 @@ __svc_create(struct svc_program *prog, u
 	unsigned int vers;
 	unsigned int xdrsize;
 	unsigned int i;
+	unsigned int countidx = 0;
 
 	if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
 		return NULL;
@@ -386,6 +387,8 @@ __svc_create(struct svc_program *prog, u
 					prog->pg_lovers = vers;
 				if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
 					xdrsize = prog->pg_vers[vers]->vs_xdrsize;
+				for (i = 0 ; i < prog->pg_vers[vers]->vs_nproc ; i++)
+					prog->pg_vers[vers]->vs_proc[i].pc_countidx = countidx++;
 			}
 		prog = prog->pg_next;
 	}
@@ -395,7 +398,9 @@ __svc_create(struct svc_program *prog, u
 	init_timer(&serv->sv_temptimer);
 	spin_lock_init(&serv->sv_lock);
 
-	serv->sv_stats_percpu = __alloc_percpu(sizeof(struct svc_stat),
+	serv->sv_stats_ncalls = countidx;
+	serv->sv_stats_percpu = __alloc_percpu(sizeof(struct svc_stat) +
+					       sizeof(unsigned int) * countidx,
 					       __alignof__(struct svc_stat));
 	if (!serv->sv_stats_percpu) {
 		kfree(serv);
@@ -1098,8 +1103,8 @@ svc_process(struct svc_rqst *rqstp)
 	statp = resv->iov_base +resv->iov_len;
 	svc_putnl(resv, RPC_SUCCESS);
 
-	/* Bump per-procedure stats counter */
-	procp->pc_count++;
+	/* Bump per-procedure per-cpu stats counter */
+	SVC_INC_STAT(serv, callcnt[procp->pc_countidx]);
 
 	/* Initialize storage for argp and resp */
 	memset(rqstp->rq_argp, 0, procp->pc_argsize);

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 28/29] knfsd: introduce NFSD_INC_STAT()
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (26 preceding siblings ...)
  2009-03-31 20:28 ` [patch 27/29] knfsd: move hot procedure count field out of svc_procedure Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-03-31 20:28 ` [patch 29/29] knfsd: make nfsdstats per-CPU Greg Banks
  2009-04-01  0:23 ` [patch 00/29] SGI enhancedNFS patches J. Bruce Fields
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Replace direct adds and increments of counters in the global nfsdstats
structure with the two macros NFSD_INC_STAT() and NFSD_ADD_STAT().
This doesn't change any functionality but prepares for a subsequent
patch which changes the definitions of those macros to make the
nfsdstats counters per-cpu for performance.

This is a preliminary step towards making the NFS server statistics
be gathered per-CPU.

Signed-off-by: Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
Reviewed-by: David Chinner <dgc@sgi.com>
Reviewed-by: Peter Leckie <pleckie-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
---

 fs/nfsd/nfs4proc.c         |    2 +-
 fs/nfsd/nfsfh.c            |    2 +-
 fs/nfsd/vfs.c              |    4 ++--
 include/linux/nfsd/stats.h |    5 +++++
 4 files changed, 9 insertions(+), 4 deletions(-)

Index: bfields/fs/nfsd/nfs4proc.c
===================================================================
--- bfields.orig/fs/nfsd/nfs4proc.c
+++ bfields/fs/nfsd/nfs4proc.c
@@ -872,7 +872,7 @@ nfsd4_proc_null(struct svc_rqst *rqstp, 
 static inline void nfsd4_increment_op_stats(u32 opnum)
 {
 	if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP)
-		nfsdstats.nfs4_opcount[opnum]++;
+		NFSD_INC_STAT(nfs4_opcount[opnum]);
 }
 
 typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
Index: bfields/fs/nfsd/nfsfh.c
===================================================================
--- bfields.orig/fs/nfsd/nfsfh.c
+++ bfields/fs/nfsd/nfsfh.c
@@ -354,7 +354,7 @@ skip_pseudoflavor_check:
 	}
 out:
 	if (error == nfserr_stale)
-		nfsdstats.fh_stale++;
+		NFSD_INC_STAT(fh_stale);
 	return error;
 }
 
Index: bfields/fs/nfsd/vfs.c
===================================================================
--- bfields.orig/fs/nfsd/vfs.c
+++ bfields/fs/nfsd/vfs.c
@@ -87,6 +87,7 @@ struct raparm_hbucket {
 #define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
 #define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
 static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
+static int raparm_cache_size;
 
 /* 
  * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
@@ -809,7 +810,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
 		if (ra->p_count == 0)
 			frap = rap;
 	}
-	depth = nfsdstats.ra_size*11/10;
+	depth = raparm_cache_size*11/10;
 	if (!frap) {	
 		spin_unlock(&rab->pb_lock);
 		return NULL;
@@ -827,7 +828,7 @@ found:
 		rab->pb_head = ra;
 	}
 	ra->p_count++;
-	nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
+	NFSD_INC_STAT(ra_depth[depth*10/raparm_cache_size]);
 	spin_unlock(&rab->pb_lock);
 	return ra;
 }
@@ -938,7 +939,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
 	}
 
 	if (host_err >= 0) {
-		nfsdstats.io_read += host_err;
+		NFSD_ADD_STAT(io_read, host_err);
 		*count = host_err;
 		err = 0;
 		fsnotify_access(file->f_path.dentry);
@@ -1010,7 +1011,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, s
 	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
 	set_fs(oldfs);
 	if (host_err >= 0) {
-		nfsdstats.io_write += host_err;
+		NFSD_ADD_STAT(io_write, host_err);
 		fsnotify_modify(file->f_path.dentry);
 	}
 
@@ -2111,6 +2112,7 @@ nfsd_racache_init(int cache_size)
 	int	nperbucket;
 	struct raparms **raparm = NULL;
 
+	raparm_cache_size = cache_size;
 
 	if (raparm_hash[0].pb_head)
 		return 0;
@@ -2134,7 +2136,7 @@ nfsd_racache_init(int cache_size)
 		*raparm = NULL;
 	}
 
-	nfsdstats.ra_size = cache_size;
+	NFSD_SET_STAT(ra_size, cache_size);
 	return 0;
 
 out_nomem:
Index: bfields/include/linux/nfsd/stats.h
===================================================================
--- bfields.orig/include/linux/nfsd/stats.h
+++ bfields/include/linux/nfsd/stats.h
@@ -47,6 +47,13 @@ struct nfsd_stats {
 	unsigned int    rcage;          /* instant: age in milliseconds of last
 					 * entry reused from the LRU list */
 };
+#define NFSD_INC_STAT(field) \
+	(nfsdstats.field++)
+#define NFSD_ADD_STAT(field, v) \
+	(nfsdstats.field += (v))
+#define NFSD_SET_STAT(field, v) \
+	(nfsdstats.field = (v))
+
 
 struct nfsd_op_stats {
 #define NFSD_STATS_OP_FSINFO	0	/* includes NULLPROC,FSSTAT,FSINFO,
Index: bfields/fs/nfsd/nfscache.c
===================================================================
--- bfields.orig/fs/nfsd/nfscache.c
+++ bfields/fs/nfsd/nfscache.c
@@ -155,8 +155,8 @@ static int nfsd_cache_bucket_expand(stru
 	spin_lock(&b->lock);
 
 	b->size += increment;
-	nfsdstats.rcentries += increment;
-	nfsdstats.rcmem += increment * sizeof(struct svc_cacherep);
+	NFSD_ADD_STAT(rcentries, increment);
+	NFSD_ADD_STAT(rcmem, increment * sizeof(struct svc_cacherep));
 	list_splice(&lru, &b->lru);
 
 	spin_unlock(&b->lock);
@@ -188,10 +188,10 @@ int nfsd_reply_cache_init(void)
 		if (!b->hash)
 			goto out_nomem;
 
-		nfsdstats.rcmem += HASHSIZE * sizeof(struct hlist_head);
+		NFSD_ADD_STAT(rcmem, HASHSIZE * sizeof(struct hlist_head));
 	}
 
-	nfsdstats.rchashsize = HASHSIZE;
+	NFSD_SET_STAT(rchashsize, HASHSIZE);
 	cache_disabled = 0;
 	return 0;
 out_nomem:
@@ -275,7 +275,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 
 	rqstp->rq_cacherep = NULL;
 	if (cache_disabled || type == RC_NOCACHE) {
-		nfsdstats.rcnocache++;
+		NFSD_INC_STAT(rcnocache);
 		return RC_DOIT;
 	}
 	h = request_hash(xid, svc_addr_in(rqstp));
@@ -295,13 +295,13 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 		    proc == rp->c_proc &&
 		    proto == rp->c_prot && vers == rp->c_vers &&
 		    time_after(rp->c_timestamp, age)) {
-			nfsdstats.rchits++;
-			nfsdstats.rcprobes += nprobes;
+			NFSD_INC_STAT(rchits);
+			NFSD_ADD_STAT(rcprobes, nprobes);
 			goto found_entry;
 		}
 	}
-	nfsdstats.rcmisses++;
-	nfsdstats.rcprobes += nprobes;
+	NFSD_INC_STAT(rcmisses);
+	NFSD_ADD_STAT(rcprobes, nprobes);
 
 	/* This loop shouldn't take more than a few iterations normally */
 	{
@@ -332,14 +332,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	if (rp->c_state != RC_UNUSED) {
 		/* reusing an existing cache entry */
 		age = jiffies - rp->c_timestamp;
-		nfsdstats.rcage = age;
+		NFSD_SET_STAT(rcage, age);
 		if (age < CACHE_THRESH_AGE &&
 		    b->size < CACHE_BUCKET_MAX_SIZE &&
 		    nfsd_cache_expand_ratelimit(b)) {
 			expand = CACHE_BUCKET_INCREMENT;
 			if (b->size + expand > CACHE_BUCKET_MAX_SIZE)
 				expand = CACHE_BUCKET_MAX_SIZE - b->size;
-			nfsdstats.rcexpands++;
+			NFSD_INC_STAT(rcexpands);
 		}
 	}
 
@@ -360,7 +360,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
 	if (rp->c_type == RC_REPLBUFF) {
 		kfree(rp->c_replvec.iov_base);
 		rp->c_replvec.iov_base = NULL;
-		nfsdstats.rcmem -= rp->c_replvec.iov_len;
+		NFSD_ADD_STAT(rcmem, -rp->c_replvec.iov_len);
 	}
 	rp->c_type = RC_NOCACHE;
  out:
@@ -473,7 +473,7 @@ nfsd_cache_update(struct svc_rqst *rqstp
 	rp->c_state = RC_DONE;
 	rp->c_timestamp = jiffies;
 	if (moremem)
-		nfsdstats.rcmem += moremem;
+		NFSD_ADD_STAT(rcmem, moremem);
 	spin_unlock(&b->lock);
 	return;
 }
Index: bfields/fs/nfsd/nfssvc.c
===================================================================
--- bfields.orig/fs/nfsd/nfssvc.c
+++ bfields/fs/nfsd/nfssvc.c
@@ -425,7 +425,7 @@ nfsd(void *vrqstp)
 	allow_signal(SIGINT);
 	allow_signal(SIGQUIT);
 
-	nfsdstats.th_cnt++;
+	NFSD_INC_STAT(th_cnt);
 	mutex_unlock(&nfsd_mutex);
 
 	/*
@@ -476,7 +476,7 @@ nfsd(void *vrqstp)
 	flush_signals(current);
 
 	mutex_lock(&nfsd_mutex);
-	nfsdstats.th_cnt --;
+	NFSD_ADD_STAT(th_cnt, -1);
 
 out:
 	/* Release the thread */

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [patch 29/29] knfsd: make nfsdstats per-CPU
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (27 preceding siblings ...)
  2009-03-31 20:28 ` [patch 28/29] knfsd: introduce NFSD_INC_STAT() Greg Banks
@ 2009-03-31 20:28 ` Greg Banks
  2009-04-01  0:23 ` [patch 00/29] SGI enhancedNFS patches J. Bruce Fields
  29 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-03-31 20:28 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

Make the global nfsdstats structure per-cpu.  Fields in this struct are
incremented once per READ and WRITE NFS call, and also in various
other more obscure situations.  When the workload is doing READs to
every nfsd on every CPU, it's a very hot and bouncy cacheline indeed.

Tests on a 16 CPU Altix A4700 with 2 10gige Myricom cards, configured
separately (no bonding).  Workload is 640 client threads doing directory
traversals with random small reads, from server RAM.

Before
======

Kernel profile:

  %   cumulative   self              self     total
 time   samples   samples    calls   1/call   1/call  name
  5.45   2484.00  2484.00     2883     0.86     1.00  nfsd_ofcache_lookup
  4.91   4720.00  2236.00     2231     1.00     1.00  spin_unlock_irqrestore
  2.80   5994.00  1274.00     1262     1.01     1.01  svc_export_put
  2.74   7242.00  1248.00     3650     0.34     1.00  nfsd_vfs_read	<----
  2.58   8417.00  1175.00     1281     0.92     1.01  svcauth_unix_set_client

After
=====

Kernel profile:

  %   cumulative   self              self     total
 time   samples   samples    calls   1/call   1/call  name
  5.01   2276.00  2276.00     2666     0.85     1.00  nfsd_ofcache_lookup
  4.61   4370.00  2094.00     2092     1.00     1.00  ia64_spinlock_contention
  4.20   6279.00  1909.00     3141     0.61     0.78  svc_sock_enqueue
  4.03   8108.00  1829.00     1824     1.00     1.00  spin_unlock_irqrestore
  3.32   9618.00  1510.00     3588     0.42     1.00  spin_lock
  ...
  0.54  36665.00   246.00     2211     0.11     1.00  nfsd_vfs_read	<----

In this case, the throughput did not actually improve until the next
problem was solved (patch knfsd-make-svc-authenticate-scale-2).

Signed-off-by: Greg Banks <gnb@sgi.com>
Reviewed-by: David Chinner <dgc@sgi.com>
Reviewed-by: Peter Leckie <pleckie-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
---

 fs/nfsd/nfscache.c         |   19 ++++++++++---------
 fs/nfsd/stats.c            |   29 ++++++++++++++++++++++++++++-
 include/linux/nfsd/stats.h |    8 +++++---
 3 files changed, 43 insertions(+), 13 deletions(-)

Index: bfields/fs/nfsd/stats.c
===================================================================
--- bfields.orig/fs/nfsd/stats.c
+++ bfields/fs/nfsd/stats.c
@@ -55,16 +55,51 @@ static inline void nfsd_stats_prefetch(n
 }
 
 
-struct nfsd_stats	nfsdstats;
+struct nfsd_stats	*nfsdstats_percpu;
 
 nfsd_stats_hash_t nfsd_export_stats_hash;
 nfsd_stats_hash_t nfsd_client_stats_hash;
 int nfsd_stats_enabled = 1;
 int nfsd_stats_prune_period = 2*86400;
 
+/*
+ * Accumulate all the per-cpu struct nfsd_stats
+ * into one global total for emission to userspace.
+ * Relies on struct nfsd_stats being composed of
+ * unsigned ints without gaps, so it can be treated
+ * as an array of unsigned ints.
+ *
+ * Note: we iterate over all possible CPUs instead
+ * of just the online ones to avoid counters going
+ * backwards when CPUs go offline.
+ *
+ * Note: the rcage field needs to be accumulated as
+ * a minimum across all the CPUs, not a sum.
+ */
+static void nfsd_stat_accum(struct nfsd_stats *sp)
+{
+	unsigned int *usp = (unsigned int *)sp;
+	int cpu;
+	int i;
+	unsigned int rcage = ~0;
+
+	memset(sp, 0, sizeof(*sp));
+	for_each_possible_cpu(cpu) {
+		struct nfsd_stats *csp = per_cpu_ptr(nfsdstats_percpu, cpu);
+		unsigned int *ucsp = (unsigned int *)csp;
+		for (i = 0 ; i < sizeof(*sp)/sizeof(unsigned int) ; i++)
+			usp[i] += ucsp[i];
+		rcage = min_t(unsigned int, rcage, csp->rcage);
+	}
+	sp->rcage = rcage;
+}
+
 static int nfsd_proc_show(struct seq_file *seq, void *v)
 {
 	int i;
+	struct nfsd_stats nfsdstats;
+
+	nfsd_stat_accum(&nfsdstats);
 
 	seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n",
 		      nfsdstats.rchits,
@@ -715,6 +750,7 @@ nfsd_stat_init(void)
 
 	nfsd_stats_hash_init(&nfsd_export_stats_hash, "export");
 	nfsd_stats_hash_init(&nfsd_client_stats_hash, "client");
+	nfsdstats_percpu = alloc_percpu(struct nfsd_stats);
 }
 
 void
@@ -724,4 +760,5 @@ nfsd_stat_shutdown(void)
 
 	nfsd_stats_hash_destroy(&nfsd_export_stats_hash);
 	nfsd_stats_hash_destroy(&nfsd_client_stats_hash);
+	free_percpu(nfsdstats_percpu);
 }
Index: bfields/include/linux/nfsd/stats.h
===================================================================
--- bfields.orig/include/linux/nfsd/stats.h
+++ bfields/include/linux/nfsd/stats.h
@@ -48,11 +48,17 @@ struct nfsd_stats {
 					 * entry reused from the LRU list */
 };
 #define NFSD_INC_STAT(field) \
-	(nfsdstats.field++)
+	(nfsdstats_percpu ? \
+		++(per_cpu_ptr(nfsdstats_percpu, smp_processor_id())->field) : 0)
 #define NFSD_ADD_STAT(field, v) \
-	(nfsdstats.field += (v))
+	(nfsdstats_percpu ? \
+		(per_cpu_ptr(nfsdstats_percpu, smp_processor_id())->field) += (v) : 0)
+/* Note that SET_STAT() always proceeds on per-cpu slot 0 to
+ * preserve the value semantics; this means you CANNOT mix
+ * SET_STAT() with INC_STAT() etc */
 #define NFSD_SET_STAT(field, v) \
-	(nfsdstats.field = (v))
+	(nfsdstats_percpu ? \
+		(per_cpu_ptr(nfsdstats_percpu, 0)->field) = (v) : 0)
 
 
 struct nfsd_op_stats {
@@ -148,7 +154,7 @@ struct nfsd_stats_hiter {
 };
 
 
-extern struct nfsd_stats	nfsdstats;
+extern struct nfsd_stats	*nfsdstats_percpu;
 extern nfsd_stats_hash_t	nfsd_export_stats_hash;
 extern nfsd_stats_hash_t	nfsd_client_stats_hash;
 

--
Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 00/29] SGI enhancedNFS patches
  2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
                   ` (28 preceding siblings ...)
  2009-03-31 20:28 ` [patch 29/29] knfsd: make nfsdstats per-CPU Greg Banks
@ 2009-04-01  0:23 ` J. Bruce Fields
  2009-04-01  3:32   ` Greg Banks
  29 siblings, 1 reply; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-01  0:23 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:00AM +1100, Greg Banks wrote:
> This patchset is a selection of the useful parts of the NFS server
> patches which comprise the SGI enhancedNFS product, forward ported,
> merged and reorganised.

OK, thanks.

> Bruce: all of these are potentially candidates for 2.6.30.

It's probably too late for 2.6.30 (the 4.1 stuff I've promised to try to
make a serious attempt at, but that's it).  I'll publish a for-2.6.31
branch as soon as I can....  (But of course anything that looks like a
bugfix I'll keep considering for 2.6.30.)

--b.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 05/29] knfsd: Infrastructure for providing stats to userspace
  2009-03-31 20:28 ` [patch 05/29] knfsd: Infrastructure for providing stats to userspace Greg Banks
@ 2009-04-01  0:28   ` J. Bruce Fields
  2009-04-01  3:43     ` Greg Banks
  0 siblings, 1 reply; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-01  0:28 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:05AM +1100, Greg Banks wrote:
> Added iteration and seq_file infrastructure to allow implementing
> a /proc file which exports all the entries in a stats hashtable as
> text to userspace.  Function nfsd_stats_open() is called in the /proc
> file's open method and handles all the subsequent details.
> 
> Like all RPC statistics, the format is designed to be easy to parse
> in shell scripts and C code.  Counter values are presented in text
> form, grouped into lines which start with a two-letter keyword.
> For example, the line "by 2680 487656" shows that 2680 bytes of NFS
> calls have been received and 487656 bytes of replies have been sent.
> The special "nm" keyword starts a new entry and shows its internal
> name, e.g. "nm 192.168.67.45" in the per-client statistics file will
> begin the entry for the client whose IP address is 192.168.67.45.

OK, so the rules for a userland script are that they should ignore any
line which starts with a two-character code that they don't recognize,
allowing us to add more of those later if necessary?

I've also occasionally wanted something to expose per-client
troubleshooting information of various sorts.  (For example: are
callbacks to a given v4.0 client currently working, and if not, why
not?)  So it'd be interesting if it could also be extended to do that
sort of thing.

--b.

> 
> Signed-off-by: Greg Banks <gnb@sgi.com>
> ---
> 
>  fs/nfsd/stats.c            |  173 ++++++++++++++++++++++++++++++++++
>  include/linux/nfsd/stats.h |   11 ++
>  2 files changed, 184 insertions(+)
> 
> Index: bfields/fs/nfsd/stats.c
> ===================================================================
> --- bfields.orig/fs/nfsd/stats.c
> +++ bfields/fs/nfsd/stats.c
> @@ -426,6 +426,179 @@ void nfsd_stats_post(struct svc_rqst *rq
>  }
>  
>  
> +static nfsd_stats_hentry_t *nfsd_stats_hiter_first(nfsd_stats_hiter_t *itr)
> +{
> +	for (itr->bucket = 0 ;
> +	     itr->bucket < itr->sh->sh_size ;
> +	     itr->bucket++) {
> +		struct hlist_head *hh = &itr->sh->sh_hash[itr->bucket];
> +		if (hh->first != NULL)
> +			return hentry_from_hnode(hh->first);
> +	}
> +	return NULL;
> +}
> +
> +static nfsd_stats_hentry_t *nfsd_stats_hiter_next(nfsd_stats_hiter_t *itr,
> +						  nfsd_stats_hentry_t *se)
> +{
> +	struct hlist_head *hh;
> +
> +	for (;;) {
> +		if (se->se_node.next != NULL)
> +			return hentry_from_hnode(se->se_node.next);
> +		if (++itr->bucket >= itr->sh->sh_size)
> +			return NULL;	/* finished iterating */
> +		hh = &itr->sh->sh_hash[itr->bucket];
> +		if (hh->first != NULL)
> +			return hentry_from_hnode(hh->first);
> +	}
> +}
> +
> +static nfsd_stats_hentry_t *nfsd_stats_hiter_seek(nfsd_stats_hiter_t *itr,
> +						  loff_t pos)
> +{
> +	nfsd_stats_hentry_t *se;
> +
> +	for (se = nfsd_stats_hiter_first(itr) ;
> +	     se != NULL ;
> +	     se = nfsd_stats_hiter_next(itr, se)) {
> +		if (!--pos)
> +			return se;
> +	}
> +	return NULL;
> +}
> +
> +static void *nfsd_stats_start(struct seq_file *m, loff_t *pos)
> +{
> +	nfsd_stats_hiter_t *itr = m->private;
> +
> +	dprintk("nfsd_stats_start, *pos=%d\n", (int)*pos);
> +	down_read(&itr->sh->sh_sem);
> +
> +	if (!*pos)
> +		return SEQ_START_TOKEN;
> +
> +	return nfsd_stats_hiter_seek(itr, *pos);
> +}
> +
> +static void *nfsd_stats_next(struct seq_file *m, void *p, loff_t *pos)
> +{
> +	nfsd_stats_hiter_t *itr = m->private;
> +	nfsd_stats_hentry_t *se = p;
> +
> +	dprintk("nfsd_stats_next, *pos=%llu bucket=%d\n", *pos, itr->bucket);
> +
> +	if (p == SEQ_START_TOKEN)
> +		se = nfsd_stats_hiter_first(itr);
> +	else
> +		se = nfsd_stats_hiter_next(itr, se);
> +	++*pos;
> +	return se;
> +}
> +
> +static void nfsd_stats_stop(struct seq_file *m, void *p)
> +{
> +	nfsd_stats_hiter_t *itr = m->private;
> +
> +	up_read(&itr->sh->sh_sem);
> +}
> +
> +static int nfsd_stats_show(struct seq_file *m, void *p)
> +{
> +	nfsd_stats_hentry_t *se = p;
> +	struct nfsd_op_stats *os = &se->se_data;
> +	int i;
> +
> +	if (p == SEQ_START_TOKEN) {
> +		seq_puts(m, "# Version 1.0\n");
> +		return 0;
> +	}
> +
> +	dprintk("nfsd_stats_show %s\n",  se->se_name);
> +
> +	seq_puts(m, "nm ");
> +	seq_escape(m, se->se_name, " \t\n\\");
> +	seq_printf(m, "\n");
> +
> +	/* histogram of operations */
> +	seq_puts(m, "op");
> +	for (i = 0 ; i < NFSD_STATS_OP_NUM ; i++)
> +		seq_printf(m, " %lu", os->os_ops[i]);
> +	seq_putc(m, '\n');
> +
> +	/* bytes in and out */
> +	seq_printf(m, "by %lu %lu\n", os->os_bytes_in, os->os_bytes_out);
> +
> +	/* histogram of read sizes */
> +	seq_puts(m, "rs");
> +	for (i = 0 ; i < NFSD_STATS_SIZE_NUM ; i++)
> +		seq_printf(m, " %lu", os->os_read_sizes[i]);
> +	seq_putc(m, '\n');
> +
> +	/* histogram of write sizes */
> +	seq_puts(m, "ws");
> +	for (i = 0 ; i < NFSD_STATS_SIZE_NUM ; i++)
> +		seq_printf(m, " %lu", os->os_write_sizes[i]);
> +	seq_putc(m, '\n');
> +
> +	/* counts of operations by transport */
> +	seq_printf(m, "tr udp %lu\n",
> +		   os->os_transports[NFSD_STATS_TRANSPORT_UDP]);
> +	seq_printf(m, "tr tcp %lu\n",
> +		   os->os_transports[NFSD_STATS_TRANSPORT_TCP]);
> +#if defined(CONFIG_NFSD_RDMA) || defined(CONFIG_NFSD_RDMA_MODULE)
> +	seq_printf(m, "tr rdma %lu\n",
> +		   os->os_transports[NFSD_STATS_TRANSPORT_RDMA]);
> +#endif
> +
> +	/* counts of operations by version */
> +	seq_printf(m, "ve 2 %lu\n",
> +		   os->os_versions[NFSD_STATS_VERSION_V2]);
> +	seq_printf(m, "ve 3 %lu\n",
> +		   os->os_versions[NFSD_STATS_VERSION_V3]);
> +	seq_printf(m, "ve 4 %lu\n",
> +		   os->os_versions[NFSD_STATS_VERSION_V4]);
> +
> +	/* histogram of service times */
> +	seq_puts(m, "st");
> +	for (i = 0 ; i < NFSD_STATS_SVCTIME_NUM ; i++)
> +		seq_printf(m, " %lu", os->os_service_times[i]);
> +	seq_putc(m, '\n');
> +
> +	return 0;
> +}
> +
> +static struct seq_operations nfsd_stats_seq_ops = {
> +	.start	= nfsd_stats_start,
> +	.next	= nfsd_stats_next,
> +	.stop	= nfsd_stats_stop,
> +	.show	= nfsd_stats_show,
> +};
> +
> +int nfsd_stats_open(struct file *file, nfsd_stats_hash_t *sh)
> +{
> +	int err;
> +	nfsd_stats_hiter_t *itr;
> +
> +	if (sh->sh_hash == NULL)
> +		return -ENOENT;
> +
> +	if ((itr = kmalloc(sizeof(*itr), GFP_KERNEL)) == NULL)
> +		return -ENOMEM;
> +
> +	if ((err = seq_open(file, &nfsd_stats_seq_ops))) {
> +		kfree(itr);
> +		return err;
> +	}
> +
> +	itr->sh = sh;
> +	itr->bucket = 0;
> +	((struct seq_file *) file->private_data)->private = itr;
> +
> +	return 0;
> +}
> +
> +
>  void
>  nfsd_stat_init(void)
>  {
> Index: bfields/include/linux/nfsd/stats.h
> ===================================================================
> --- bfields.orig/include/linux/nfsd/stats.h
> +++ bfields/include/linux/nfsd/stats.h
> @@ -100,6 +100,7 @@ struct nfsd_op_stats {
>  
>  typedef struct nfsd_stats_hash		nfsd_stats_hash_t;
>  typedef struct nfsd_stats_hentry	nfsd_stats_hentry_t;
> +typedef struct nfsd_stats_hiter		nfsd_stats_hiter_t;
>  
>  /* Entry in the export and client stats hashtables */
>  struct nfsd_stats_hentry {
> @@ -125,6 +126,13 @@ struct nfsd_stats_hash {
>  	struct timer_list	sh_prune_timer;
>  };
>  
> +/* Hashtable iteration state used during seq_file traversal */
> +struct nfsd_stats_hiter {
> +	nfsd_stats_hash_t *sh;
> +	int bucket;
> +};
> +
> +
>  extern struct nfsd_stats	nfsdstats;
>  extern struct svc_stat		nfsd_svcstats;
>  
> @@ -192,6 +200,9 @@ void nfsd_stats_pre(struct svc_rqst *rqs
>  /* nfsd calls this after servicing a request */
>  void nfsd_stats_post(struct svc_rqst *rqstp);
>  
> +/* open the hash for a seq_file pass to userspace */
> +int nfsd_stats_open(struct file *file, nfsd_stats_hash_t *sh);
> +
>  
>  
>  
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 00/29] SGI enhancedNFS patches
  2009-04-01  0:23 ` [patch 00/29] SGI enhancedNFS patches J. Bruce Fields
@ 2009-04-01  3:32   ` Greg Banks
       [not found]     ` <ac442c870903312032t34630c6dvdbb644cb510f8079-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 63+ messages in thread
From: Greg Banks @ 2009-04-01  3:32 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Greg Banks, Linux NFS ML

On Wed, Apr 1, 2009 at 11:23 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Wed, Apr 01, 2009 at 07:28:00AM +1100, Greg Banks wrote:

>> Bruce: all of these are potentially candidates for 2.6.30.
>
> It's probably too late for 2.6.30 (the 4.1 stuff I've promised to try to
> make a serious attempt at, but that's it).  I'll publish a for-2.6.31
> branch as soon as I can....  (But of course anything that looks like a
> bugfix I'll keep considering for 2.6.30.)

No worries.  I figured as much, but the patches really did need to be
posted this week.

-- 
Greg.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 05/29] knfsd: Infrastructure for providing stats to userspace
  2009-04-01  0:28   ` J. Bruce Fields
@ 2009-04-01  3:43     ` Greg Banks
  0 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-04-01  3:43 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Greg Banks, Linux NFS ML

On Wed, Apr 1, 2009 at 11:28 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Wed, Apr 01, 2009 at 07:28:05AM +1100, Greg Banks wrote:

>> The special "nm" keyword starts a new entry and shows its internal
>> name, e.g. "nm 192.168.67.45" in the per-client statistics file will
>> begin the entry for the client whose IP address is 192.168.67.45.
>
> OK, so the rules for a userland script are that they should ignore any
> line which starts with a two-character code that they don't recognize,
> allowing us to add more of those later if necessary?

Yes, just like the existing /proc/net/rpc/nfsd (except that in both
cases it's really the first whitespace-separated word and that only
happens to be 2 chars long in most cases).

I expect to be documenting the format properly sometime later.  Also,
SGI has some software that reads the two new files and provides
metrics for the PCP (Performance Co-Pilot) monitoring package; I have
permission to release that as open source and expect to do so in the
next few days.

> I've also occasionally wanted something to expose per-client
> troubleshooting information of various sorts.  (For example: are
> callbacks to a given v4.0 client currently working, and if not, why
> not?)  So it'd be interesting if it could also be extended to do that
> sort of thing.

Sure.  This would be the ideal infrastructure for adding such counters.

-- 
Greg.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 00/29] SGI enhancedNFS patches
       [not found]     ` <ac442c870903312032t34630c6dvdbb644cb510f8079-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2009-04-01  6:34       ` Jeff Garzik
  2009-04-01  6:41         ` Greg Banks
  0 siblings, 1 reply; 63+ messages in thread
From: Jeff Garzik @ 2009-04-01  6:34 UTC (permalink / raw)
  To: Greg Banks; +Cc: J. Bruce Fields, Greg Banks, Linux NFS ML

Greg Banks wrote:
> On Wed, Apr 1, 2009 at 11:23 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
>> On Wed, Apr 01, 2009 at 07:28:00AM +1100, Greg Banks wrote:
> 
>>> Bruce: all of these are potentially candidates for 2.6.30.
>> It's probably too late for 2.6.30 (the 4.1 stuff I've promised to try to
>> make a serious attempt at, but that's it).  I'll publish a for-2.6.31
>> branch as soon as I can....  (But of course anything that looks like a
>> bugfix I'll keep considering for 2.6.30.)
> 
> No worries.  I figured as much, but the patches really did need to be
> posted this week.

So, what is enhancedNFS?  Does enhancedNFS comply with current RFCs, or 
deviate?

What features does it add?

	Jeff




^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 00/29] SGI enhancedNFS patches
  2009-04-01  6:34       ` Jeff Garzik
@ 2009-04-01  6:41         ` Greg Banks
  0 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-04-01  6:41 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: J. Bruce Fields, Greg Banks, Linux NFS ML

On Wed, Apr 1, 2009 at 5:34 PM, Jeff Garzik <jeff@garzik.org> wrote:
> Greg Banks wrote:
>>
>> On Wed, Apr 1, 2009 at 11:23 AM, J. Bruce Fields <bfields@fieldses.org>
>> wrote:
>>>
>>> On Wed, Apr 01, 2009 at 07:28:00AM +1100, Greg Banks wrote:
>>
>>>> Bruce: all of these are potentially candidates for 2.6.30.
>>>
>>> It's probably too late for 2.6.30 (the 4.1 stuff I've promised to try to
>>> make a serious attempt at, but that's it).  I'll publish a for-2.6.31
>>> branch as soon as I can....  (But of course anything that looks like a
>>> bugfix I'll keep considering for 2.6.30.)
>>
>> No worries.  I figured as much, but the patches really did need to be
>> posted this week.
>
> So, what is enhancedNFS?  Does enhancedNFS comply with current RFCs, or
> deviate?
>
> What features does it add?

It uses the same protocols.  The server is faster and more scalable
and has more statistics to drive the web-based management UI.  The
name is perhaps poorly chosen.

-- 
Greg.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 01/29] knfsd: Add infrastructure for measuring RPC service times.
  2009-03-31 20:28 ` [patch 01/29] knfsd: Add infrastructure for measuring RPC service times Greg Banks
@ 2009-04-25  2:13   ` J. Bruce Fields
  2009-04-25  2:14     ` J. Bruce Fields
  2009-04-25  2:52     ` Greg Banks
  0 siblings, 2 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-25  2:13 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:01AM +1100, Greg Banks wrote:
> Two new functions; svc_time_mark() remembers the current time
> in a struct svc_time; svc_time_elapsed() calculates and returns
> the time since a svc_time was marked.
> 
> Signed-off-by: Greg Banks <gnb@sgi.com>
> ---
> 
>  include/linux/sunrpc/svc.h |   12 ++++++++++++
>  net/sunrpc/svc.c           |   25 +++++++++++++++++++++++++
>  2 files changed, 37 insertions(+)
> 
> Index: bfields/include/linux/sunrpc/svc.h
> ===================================================================
> --- bfields.orig/include/linux/sunrpc/svc.h
> +++ bfields/include/linux/sunrpc/svc.h
> @@ -18,6 +18,16 @@
>  #include <linux/sunrpc/svcauth.h>
>  #include <linux/wait.h>
>  #include <linux/mm.h>
> +#include <linux/time.h>
> +
> +/*
> + * Structure used to implement a fast lockless elapsed time measure.
> + */
> +struct svc_time
> +{
> +	struct timespec	st_spec;
> +};

Are struct svc_time, ...

> +void
> +svc_time_mark(struct svc_time *st)
> +{
> +	getnstimeofday(&st->st_spec);
> +}
> +EXPORT_SYMBOL(svc_time_mark);

... and this function really necessary?  If you're not too attached to
them: it would seem simpler just to use struct timespec and
getnstimeofday directly.  (Well, at least simpler to read for someone
familiar with the kernel but not with the nfs code.)

--b.

> +
> +int
> +svc_time_elapsed(const struct svc_time *mark, struct timespec *ts)
> +{
> +	struct svc_time now;
> +
> +	svc_time_mark(&now);
> +
> +	if (now.st_spec.tv_sec < mark->st_spec.tv_sec)
> +		return -EINVAL;	/* time going backwards */
> +
> +	*ts = timespec_sub(now.st_spec, mark->st_spec);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(svc_time_elapsed);
> +
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 01/29] knfsd: Add infrastructure for measuring RPC service times.
  2009-04-25  2:13   ` J. Bruce Fields
@ 2009-04-25  2:14     ` J. Bruce Fields
  2009-04-25  2:52     ` Greg Banks
  1 sibling, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-25  2:14 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

Whoops--correcting Greg's email address.

On Fri, Apr 24, 2009 at 10:13:15PM -0400, bfields wrote:
> On Wed, Apr 01, 2009 at 07:28:01AM +1100, Greg Banks wrote:
> > Two new functions; svc_time_mark() remembers the current time
> > in a struct svc_time; svc_time_elapsed() calculates and returns
> > the time since a svc_time was marked.
> > 
> > Signed-off-by: Greg Banks <gnb@sgi.com>
> > ---
> > 
> >  include/linux/sunrpc/svc.h |   12 ++++++++++++
> >  net/sunrpc/svc.c           |   25 +++++++++++++++++++++++++
> >  2 files changed, 37 insertions(+)
> > 
> > Index: bfields/include/linux/sunrpc/svc.h
> > ===================================================================
> > --- bfields.orig/include/linux/sunrpc/svc.h
> > +++ bfields/include/linux/sunrpc/svc.h
> > @@ -18,6 +18,16 @@
> >  #include <linux/sunrpc/svcauth.h>
> >  #include <linux/wait.h>
> >  #include <linux/mm.h>
> > +#include <linux/time.h>
> > +
> > +/*
> > + * Structure used to implement a fast lockless elapsed time measure.
> > + */
> > +struct svc_time
> > +{
> > +	struct timespec	st_spec;
> > +};
> 
> Are struct svc_time, ...
> 
> > +void
> > +svc_time_mark(struct svc_time *st)
> > +{
> > +	getnstimeofday(&st->st_spec);
> > +}
> > +EXPORT_SYMBOL(svc_time_mark);
> 
> ... and this function really necessary?  If you're not too attached to
> them: it would seem simpler just to use struct timespec and
> getnstimeofday directly.  (Well, at least simpler to read for someone
> familiar with the kernel but not with the nfs code.)
> 
> --b.
> 
> > +
> > +int
> > +svc_time_elapsed(const struct svc_time *mark, struct timespec *ts)
> > +{
> > +	struct svc_time now;
> > +
> > +	svc_time_mark(&now);
> > +
> > +	if (now.st_spec.tv_sec < mark->st_spec.tv_sec)
> > +		return -EINVAL;	/* time going backwards */
> > +
> > +	*ts = timespec_sub(now.st_spec, mark->st_spec);
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL(svc_time_elapsed);
> > +
> > 
> > --
> > Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 01/29] knfsd: Add infrastructure for measuring RPC service times.
  2009-04-25  2:13   ` J. Bruce Fields
  2009-04-25  2:14     ` J. Bruce Fields
@ 2009-04-25  2:52     ` Greg Banks
  1 sibling, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-04-25  2:52 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

On Sat, Apr 25, 2009 at 12:13 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Wed, Apr 01, 2009 at 07:28:01AM +1100, Greg Banks wrote:
>> Two new functions; svc_time_mark() remembers the current time
>> in a struct svc_time; svc_time_elapsed() calculates and returns
>> the time since a svc_time was marked.
>>
>> Signed-off-by: Greg Banks <gnb@sgi.com>
>> ---
>>
>>  include/linux/sunrpc/svc.h |   12 ++++++++++++
>>  net/sunrpc/svc.c           |   25 +++++++++++++++++++++++++
>>  2 files changed, 37 insertions(+)
>>
>> Index: bfields/include/linux/sunrpc/svc.h
>> ===================================================================
>> --- bfields.orig/include/linux/sunrpc/svc.h
>> +++ bfields/include/linux/sunrpc/svc.h
>> @@ -18,6 +18,16 @@
>>  #include <linux/sunrpc/svcauth.h>
>>  #include <linux/wait.h>
>>  #include <linux/mm.h>
>> +#include <linux/time.h>
>> +
>> +/*
>> + * Structure used to implement a fast lockless elapsed time measure.
>> + */
>> +struct svc_time
>> +{
>> +     struct timespec st_spec;
>> +};
>
> Are struct svc_time, ...
>
>> +void
>> +svc_time_mark(struct svc_time *st)
>> +{
>> +     getnstimeofday(&st->st_spec);
>> +}
>> +EXPORT_SYMBOL(svc_time_mark);
>
> ... and this function really necessary?  If you're not too attached to
> them: it would seem simpler just to use struct timespec and
> getnstimeofday directly.  (Well, at least simpler to read for someone
>> familiar with the kernel but not with the nfs code.)

To be quite frank, no.  They're historical; they were very necessary
in the original SLES10-based version of these patches because that
ancient kernel was missing some important time infrastructure bits.
In that code, the structure and functions did some very hairy and
arch-specific things involving jiffies and ITC/TSC registers.  I was
being lazy by not eliding them during forward porting, and I'll do so
in the next version.

-- 
Greg.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 02/29] knfsd: Add stats table infrastructure.
  2009-03-31 20:28 ` [patch 02/29] knfsd: Add stats table infrastructure Greg Banks
@ 2009-04-25  3:56   ` J. Bruce Fields
  2009-04-26  4:12     ` Greg Banks
  0 siblings, 1 reply; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-25  3:56 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:02AM +1100, Greg Banks wrote:
> This infrastructure will be used to implement per-client and per-export
> serverside stats.  Multiple stats objects are kept in a hashtable,
> keyed by a string name (e.g. client IP address or export path).
> Old entries are pruned from the table using a timer.  The function
> nfsd_stats_find() can be used to find an entry and create it if
> necessary.
> 
> Signed-off-by: Greg Banks <gnb@sgi.com>
> ---
> 
>  fs/nfsd/stats.c            |  231 ++++++++++++++++++++++++++++++++++
>  include/linux/nfsd/debug.h |    1 
>  include/linux/nfsd/stats.h |   43 ++++++
>  3 files changed, 275 insertions(+)
> 
> Index: bfields/fs/nfsd/stats.c
> ===================================================================
> --- bfields.orig/fs/nfsd/stats.c
> +++ bfields/fs/nfsd/stats.c
> @@ -29,17 +29,29 @@
>  #include <linux/seq_file.h>
>  #include <linux/stat.h>
>  #include <linux/module.h>
> +#include <linux/jhash.h>
> +#include <linux/list.h>
> +#include <linux/swap.h>
> +#include <linux/log2.h>
>  
>  #include <linux/sunrpc/svc.h>
>  #include <linux/sunrpc/stats.h>
>  #include <linux/nfsd/nfsd.h>
>  #include <linux/nfsd/stats.h>
>  
> +#define NFSDDBG_FACILITY		NFSDDBG_STATS
> +
> +#define hentry_from_hnode(hn) \
> +	hlist_entry((hn), nfsd_stats_hentry_t, se_node)
> +
>  struct nfsd_stats	nfsdstats;
>  struct svc_stat		nfsd_svcstats = {
>  	.program	= &nfsd_program,
>  };
>  
> +int nfsd_stats_enabled = 1;
> +int nfsd_stats_prune_period = 2*86400;

For those of us that don't immediately recognize 86400 as the number of
seconds in a day, writing that out as " = 2*24*60*60;" could be a useful
hint.

Also nice: a comment with any rationale (however minor) for the choice
of period.

> +
>  static int nfsd_proc_show(struct seq_file *seq, void *v)
>  {
>  	int i;
> @@ -98,6 +110,225 @@ static const struct file_operations nfsd
>  	.release = single_release,
>  };
>  
> +
> +/*
> + * Stats hash pruning works thus.  A scan is run every prune period.
> + * On every scan, hentries with the OLD flag are detached and
> + * a reference dropped (usually that will be the last reference
> + * and the hentry will be deleted).  Hentries without the OLD flag
> + * have the OLD flag set; the flag is reset in nfsd_stats_get().
> + * So hentries with active traffic in the last 2 prune periods
> + * are not candidates for pruning.

s/2 prune periods/prune period/ ?

(From the description above: on exit from nfsd_stats_prune() all
remaining entries have OLD set.  Therefore if an entry is not touched in
the single period between two nfsd_stats_prune()'s, the second
nfsd_stats_prune() run will drop it.)

> + */
> +static void nfsd_stats_prune(unsigned long closure)
> +{
> +	nfsd_stats_hash_t *sh = (nfsd_stats_hash_t *)closure;
> +	unsigned int i;
> +	nfsd_stats_hentry_t *se;
> +	struct hlist_node *hn, *next;
> +	struct hlist_head to_be_dropped = HLIST_HEAD_INIT;
> +
> +	dprintk("nfsd_stats_prune\n");
> +
> +	if (!down_write_trylock(&sh->sh_sem)) {
> +		/* hash is busy...try again in a second */
> +		dprintk("nfsd_stats_prune: busy\n");
> +		mod_timer(&sh->sh_prune_timer, jiffies + HZ);

Could we make sh_sem a spinlock?  It doesn't look the the critical
sections ever need to sleep.

(Or even consider rcu, if we need the read lock on every rpc?  OK, I'm
mostly ignorant of rcu.)

> +		return;
> +	}
> +
> +	for (i = 0 ; i < sh->sh_size ; i++) {
> +		hlist_for_each_entry_safe(se, hn, next, &sh->sh_hash[i], se_node) {
> +			if (!test_and_set_bit(NFSD_STATS_HENTRY_OLD, &se->se_flags))

It looks like this is only ever used under the lock, so the
test_and_set_bit() is overkill.

> +				continue;
> +			hlist_del_init(&se->se_node);
> +			hlist_add_head(&se->se_node, &to_be_dropped);

Replace those two by hlist_move_list?

> +		}
> +	}
> +
> +	up_write(&sh->sh_sem);
> +
> +	dprintk("nfsd_stats_prune: deleting\n");
> +	hlist_for_each_entry_safe(se, hn, next, &to_be_dropped, se_node)
> +		nfsd_stats_put(se);

nfsd_stats_put() can down a semaphore, which we probably don't want in a
timer.  (So: make sh_sem a spinlock?)

> +
> +	mod_timer(&sh->sh_prune_timer, jiffies + nfsd_stats_prune_period * HZ);
> +}
> +
> +/*
> + * Initialise a stats hash.  Array size scales with
> + * server memory, as a loose heuristic for how many
> + * clients or exports a server is likely to have.
> + */
> +static void nfsd_stats_hash_init(nfsd_stats_hash_t *sh, const char *which)
> +{
> +	unsigned int nbits;
> +	unsigned int i;
> +
> +	init_rwsem(&sh->sh_sem);
> +
> +	nbits = 5 + ilog2(totalram_pages >> (30-PAGE_SHIFT));
> +	sh->sh_size = (1<<nbits);
> +	sh->sh_mask = (sh->sh_size-1);

Some comment on the choice of scale factor?  Also, see:

	http://marc.info/?l=linux-kernel&m=118299825922287&w=2

and followups.

Might consider a little helper function to do this kind of
fraction-of-total-memory calculation since I think the server does it in
3 or 4 places.

> +
> +	sh->sh_hash = kmalloc(sizeof(struct hlist_head) * sh->sh_size, GFP_KERNEL);

Can this be a more than a page?  (If so, could we just cap it at that
size to avoid >order-0 allocations and keep the kmalloc failure
unlikely?)

> +	if (sh->sh_hash == NULL) {
> +		printk(KERN_ERR "failed to allocate knfsd %s stats hashtable\n", which);
> +		/* struggle on... */
> +		return;
> +	}
> +	printk(KERN_INFO "knfsd %s stats hashtable, %u entries\n", which, sh->sh_size);

Eh.  Make it a dprintk?  Or maybe expose this in the nfsd filesystem if
it's not already?

> +
> +	for (i = 0 ; i < sh->sh_size ; i++)
> +		INIT_HLIST_HEAD(&sh->sh_hash[i]);
> +
> +	/* start the prune timer */
> +	init_timer(&sh->sh_prune_timer);
> +	sh->sh_prune_timer.function = nfsd_stats_prune;
> +	sh->sh_prune_timer.expires = jiffies + nfsd_stats_prune_period * HZ;
> +	sh->sh_prune_timer.data = (unsigned long)sh;
> +}
> +
> +/*
> + * Destroy a stats hash.  Drop what should be the last
> + * reference on all hentries, clean up the timer, and
> + * free the hash array.
> + */
> +static void nfsd_stats_hash_destroy(nfsd_stats_hash_t *sh)
> +{
> +	unsigned int i;
> +	nfsd_stats_hentry_t *se;
> +
> +	del_timer_sync(&sh->sh_prune_timer);
> +
> +	/* drop the last reference for all remaining hentries */
> +	for (i = 0 ; i < sh->sh_size ; i++) {
> +		struct hlist_head *hh = &sh->sh_hash[i];
> +
> +		while (hh->first != NULL) {
> +			se = hentry_from_hnode(hh->first);
> +			BUG_ON(atomic_read(&se->se_refcount) != 1);
> +			nfsd_stats_put(se);
> +		}
> +	}
> +
> +	if (sh->sh_hash != NULL) {

Drop the NULL check.

> +		kfree(sh->sh_hash);
> +	}
> +}
> +
> +/*
> + * Find and return a hentry for the given name, with a new refcount,
> + * creating it if necessary.  Will only return NULL on OOM or if
> + * stats are disabled.  Does it's own locking using the hash rwsem;
> + * may sleep.
> + */
> +nfsd_stats_hentry_t *nfsd_stats_find(nfsd_stats_hash_t *sh,
> +				     const char *name, int len)
> +{
> +	u32 hash;
> +	nfsd_stats_hentry_t *se, *new = NULL;
> +	struct hlist_node *hn;
> +
> +	dprintk("nfsd_stats_find: name %s len %d\n", name, len);
> +
> +	if (!nfsd_stats_enabled || sh->sh_hash == NULL)
> +		return NULL;
> +
> +
> +	/* search the hash table */
> +	hash = jhash(name, len, 0xfeedbeef) & sh->sh_mask;
> +	down_read(&sh->sh_sem);
> +	hlist_for_each_entry(se, hn, &sh->sh_hash[hash], se_node) {
> +		if (!strcmp(se->se_name, name)) {
> +			/* found matching */
> +			dprintk("nfsd_stats_find: found %s\n", se->se_name);
> +			nfsd_stats_get(se);
> +			up_read(&sh->sh_sem);
> +			return se;
> +		}
> +	}
> +	up_read(&sh->sh_sem);
> +
> +	/* not found, create a new one */
> +	dprintk("nfsd_stats_find: allocating new for %s\n", name);
> +	new = (nfsd_stats_hentry_t *)kmalloc(sizeof(*new), GFP_KERNEL);
> +	if (new == NULL)
> +		return NULL;
> +	/* initialise */
> +
> +	new->se_name = kmalloc(len+1, GFP_KERNEL);
> +	if (new->se_name == NULL) {
> +		kfree(new);
> +		return NULL;
> +	}
> +
> +	memcpy(new->se_name, name, len+1);
> +	atomic_set(&new->se_refcount, 2);/* 1 for the caller, 1 for the hash */
> +	new->se_hash = sh;
> +	new->se_flags = 0;
> +	INIT_HLIST_NODE(&new->se_node);
> +	memset(&new->se_data, 0, sizeof(new->se_data));
> +
> +	/* attach to the hash datastructure */
> +
> +	/*
> +	 * first check to see if we lost a race and some
> +	 * other thread already added a matching hentry.
> +	 */
> +	down_write(&sh->sh_sem);
> +	hlist_for_each_entry(se, hn, &sh->sh_hash[hash], se_node) {
> +		if (!strcmp(se->se_name, name)) {
> +			/* found matching, use that instead */
> +			dprintk("nfsd_stats_find: found(2) %s\n", name);
> +			kfree(new->se_name);
> +			kfree(new);
> +			nfsd_stats_get(se);
> +			up_write(&sh->sh_sem);
> +			return se;
> +		}
> +	}
> +	/* still not there, insert new one into the hash */
> +	hlist_add_head(&new->se_node, &sh->sh_hash[hash]);
> +
> +	up_write(&sh->sh_sem);
> +	return new;
> +}
> +
> +/*
> + * Drop a reference to a hentry, deleting the hentry if this
> + * was the last reference.  Does it's own locking using the

s/it's/its/

(Contending for the nitpick-of-the-day award.)

> + * hash rwsem; may sleep.
> + */
> +void
> +nfsd_stats_put(nfsd_stats_hentry_t *se)
> +{
> +	nfsd_stats_hash_t *sh = se->se_hash;
> +
> +	if (!atomic_dec_and_test(&se->se_refcount))
> +		return;
> +
> +	/* just dropped the last reference */
> +	down_write(&sh->sh_sem);
> +
> +	if (atomic_read(&se->se_refcount)) {
> +		/*
> +		 * We lost a race getting the write lock, and
> +		 * now there's a reference again.  Whatever.
> +		 */

Some kind of atomic_dec_and_lock() might close the race.

> +		goto out_unlock;
> +	}
> +
> +	dprintk("nfsd_stats_put: freeing %s\n", se->se_name);
> +	hlist_del(&se->se_node);
> +	kfree(se->se_name);
> +	kfree(se);
> +
> +out_unlock:
> +	up_write(&sh->sh_sem);
> +}
> +
> +
>  void
>  nfsd_stat_init(void)
>  {
> Index: bfields/include/linux/nfsd/stats.h
> ===================================================================
> --- bfields.orig/include/linux/nfsd/stats.h
> +++ bfields/include/linux/nfsd/stats.h
> @@ -40,6 +40,37 @@ struct nfsd_stats {
>  
>  };
>  
> +struct nfsd_op_stats {
> +	/* nothing to see here, yet */
> +};
> +
> +
> +typedef struct nfsd_stats_hash		nfsd_stats_hash_t;
> +typedef struct nfsd_stats_hentry	nfsd_stats_hentry_t;

Absent unusual circumstances, standard kernel style is to drop the
typedefs and use "struct nfsd_stats_{hash,hentry}" everywhere.

--b.

> +
> +/* Entry in the export and client stats hashtables */
> +struct nfsd_stats_hentry {
> +	struct hlist_node	se_node;	/* links hash chains */
> +	char			*se_name;
> +	atomic_t		se_refcount;	/* 1 for each user + 1 for hash */
> +#define NFSD_STATS_HENTRY_OLD	0
> +	unsigned long		se_flags;
> +	nfsd_stats_hash_t	*se_hash;
> +	struct nfsd_op_stats	se_data;
> +};
> +
> +/*
> + * Hashtable structure for export and client stats.
> + * Table width is chosen at boot time to scale with
> + * the size of the server.
> + */
> +struct nfsd_stats_hash {
> +	struct rw_semaphore	sh_sem;
> +	unsigned int		sh_size;
> +	unsigned int		sh_mask;
> +	struct hlist_head	*sh_hash;
> +	struct timer_list	sh_prune_timer;
> +};
>  
>  extern struct nfsd_stats	nfsdstats;
>  extern struct svc_stat		nfsd_svcstats;
> @@ -47,5 +78,17 @@ extern struct svc_stat		nfsd_svcstats;
>  void	nfsd_stat_init(void);
>  void	nfsd_stat_shutdown(void);
>  
> +extern nfsd_stats_hentry_t *nfsd_stats_find(nfsd_stats_hash_t *,
> +					    const char *name, int len);
> +static inline void
> +nfsd_stats_get(nfsd_stats_hentry_t *se)
> +{
> +	atomic_inc(&se->se_refcount);
> +	clear_bit(NFSD_STATS_HENTRY_OLD, &se->se_flags);
> +}
> +extern void nfsd_stats_put(nfsd_stats_hentry_t *se);
> +
> +
> +
>  #endif /* __KERNEL__ */
>  #endif /* LINUX_NFSD_STATS_H */
> Index: bfields/include/linux/nfsd/debug.h
> ===================================================================
> --- bfields.orig/include/linux/nfsd/debug.h
> +++ bfields/include/linux/nfsd/debug.h
> @@ -32,6 +32,7 @@
>  #define NFSDDBG_REPCACHE	0x0080
>  #define NFSDDBG_XDR		0x0100
>  #define NFSDDBG_LOCKD		0x0200
> +#define NFSDDBG_STATS		0x0400
>  #define NFSDDBG_ALL		0x7FFF
>  #define NFSDDBG_NOCHANGE	0xFFFF
>  
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-03-31 20:28 ` [patch 03/29] knfsd: add userspace controls for stats tables Greg Banks
@ 2009-04-25 21:57   ` J. Bruce Fields
  2009-04-25 22:03     ` J. Bruce Fields
  2009-04-26  4:14     ` Greg Banks
  0 siblings, 2 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-25 21:57 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:03AM +1100, Greg Banks wrote:
> Add two control files to /proc/fs/nfsd:
> 
> * "stats_enabled" can be used to disable or enable the gathering
>    of per-client and per-export statistics in the server.
> 
> * "stats_prune_period" can be used to set the period at
>    which the pruning timer runs, in seconds.  Unused stats
>    entries will survive at most twice that time.
> 
> Signed-off-by: Greg Banks <gnb@sgi.com>
> ---
> 
>  fs/nfsd/nfsctl.c |   99 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 99 insertions(+)
> 
> Index: bfields/fs/nfsd/nfsctl.c
> ===================================================================
> --- bfields.orig/fs/nfsd/nfsctl.c
> +++ bfields/fs/nfsd/nfsctl.c
> @@ -64,6 +64,8 @@ enum {
>  	NFSD_Versions,
>  	NFSD_Ports,
>  	NFSD_MaxBlkSize,
> +	NFSD_Stats_Enabled,
> +	NFSD_Stats_Prune_Period,
>  	/*
>  	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
>  	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
> @@ -92,6 +94,8 @@ static ssize_t write_pool_threads(struct
>  static ssize_t write_versions(struct file *file, char *buf, size_t size);
>  static ssize_t write_ports(struct file *file, char *buf, size_t size);
>  static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
> +static ssize_t write_stats_enabled(struct file *file, char *buf, size_t size);
> +static ssize_t write_stats_prune_period(struct file *file, char *buf, size_t size);
>  #ifdef CONFIG_NFSD_V4
>  static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
>  static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
> @@ -113,6 +117,8 @@ static ssize_t (*write_op[])(struct file
>  	[NFSD_Versions] = write_versions,
>  	[NFSD_Ports] = write_ports,
>  	[NFSD_MaxBlkSize] = write_maxblksize,
> +	[NFSD_Stats_Enabled] = write_stats_enabled,
> +	[NFSD_Stats_Prune_Period] = write_stats_prune_period,
>  #ifdef CONFIG_NFSD_V4
>  	[NFSD_Leasetime] = write_leasetime,
>  	[NFSD_RecoveryDir] = write_recoverydir,
> @@ -1121,6 +1127,97 @@ static ssize_t write_maxblksize(struct f
>  	return sprintf(buf, "%d\n", nfsd_max_blksize);
>  }
>  
> +extern int nfsd_stats_enabled;
> +
> +/**
> + * write_stats_enabled - Set or report whether per-client/
> + *			 per-export stats are enabled.
> + *
> + * Input:
> + *			buf:		ignored
> + *			size:		zero
> + *
> + * OR
> + *
> + * Input:
> + * 			buf:		C string containing an unsigned
> + * 					integer value representing the new value
> + *			size:		non-zero length of C string in @buf
> + * Output:
> + *	On success:	passed-in buffer filled with '\n'-terminated C string
> + *			containing numeric value of the current setting
> + *			return code is the size in bytes of the string
> + *	On error:	return code is zero or a negative errno value
> + */
> +static ssize_t write_stats_enabled(struct file *file, char *buf, size_t size)
> +{
> +	char *mesg = buf;
> +	if (size > 0) {
> +		int enabled;
> +		int rv = get_int(&mesg, &enabled);
> +		if (rv)
> +			return rv;
> +		/* check `enabled' against allowed range */
> +		if (enabled < 0 || enabled > 1)
> +			return -EINVAL;
> +		/*
> +		 * We can change the enabled flag at any time without
> +		 * locking.  All it controls is whether stats are
> +		 * gathered for new incoming NFS calls.  Old gathered
> +		 * stats still sit around in the hash tables until
> +		 * naturally pruned.
> +		 */
> +		nfsd_stats_enabled = enabled;
> +	}
> +	return sprintf(buf, "%d\n", nfsd_stats_enabled);
> +}
> +
> +extern int nfsd_stats_prune_period;
> +
> +/**
> + * write_stats_prune_period - Set or report the period for pruning
> + *			      old per-client/per-export stats entries,
> + *			      in seconds.
> + *
> + * Input:
> + *			buf:		ignored
> + *			size:		zero
> + *
> + * OR
> + *
> + * Input:
> + * 			buf:		C string containing an unsigned
> + * 					integer value representing the new value
> + *			size:		non-zero length of C string in @buf
> + * Output:
> + *	On success:	passed-in buffer filled with '\n'-terminated C string
> + *			containing numeric value of the current setting
> + *			return code is the size in bytes of the string
> + *	On error:	return code is zero or a negative errno value
> + */

Just an idle remark, don't worry about this for now, but: we might want
to rein in this write_*() comment format a little some day.  A lot of
the content seems duplicated.

--b.

> +static ssize_t write_stats_prune_period(struct file *file, char *buf, size_t size)
> +{
> +	char *mesg = buf;
> +	if (size > 0) {
> +		int period;
> +		int rv = get_int(&mesg, &period);
> +		if (rv)
> +			return rv;
> +		/* check `period' against allowed range */
> +		if (period < 10 || period > 14*86400)
> +			return -EINVAL;
> +		/*
> +		 * We can change the period at any time without
> +		 * locking.  All it controls is the timeout on the
> +		 * next run of the prune timer.  This might cause
> +		 * some unexpected behaviour if the period is
> +		 * changed from really high to really low.
> +		 */
> +		nfsd_stats_prune_period = period;
> +	}
> +	return sprintf(buf, "%d\n", nfsd_stats_prune_period);
> +}
> +
>  #ifdef CONFIG_NFSD_V4
>  extern time_t nfs4_leasetime(void);
>  
> @@ -1263,6 +1360,8 @@ static int nfsd_fill_super(struct super_
>  		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
>  		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
>  		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
> +		[NFSD_Stats_Enabled] = {"stats_enabled", &transaction_ops, S_IWUSR|S_IRUGO},
> +		[NFSD_Stats_Prune_Period] = {"stats_prune_period", &transaction_ops, S_IWUSR|S_IRUGO},
>  #ifdef CONFIG_NFSD_V4
>  		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
>  		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-04-25 21:57   ` J. Bruce Fields
@ 2009-04-25 22:03     ` J. Bruce Fields
  2009-04-27 16:06       ` Chuck Lever
  2009-04-26  4:14     ` Greg Banks
  1 sibling, 1 reply; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-25 22:03 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

Pfft, did it again.

--b.

On Sat, Apr 25, 2009 at 05:57:45PM -0400, bfields wrote:
> On Wed, Apr 01, 2009 at 07:28:03AM +1100, Greg Banks wrote:
> > Add two control files to /proc/fs/nfsd:
> > 
> > * "stats_enabled" can be used to disable or enable the gathering
> >    of per-client and per-export statistics in the server.
> > 
> > * "stats_prune_period" can be used to set the period at
> >    which the pruning timer runs, in seconds.  Unused stats
> >    entries will survive at most twice that time.
> > 
> > Signed-off-by: Greg Banks <gnb@sgi.com>
> > ---
> > 
> >  fs/nfsd/nfsctl.c |   99 ++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 99 insertions(+)
> > 
> > Index: bfields/fs/nfsd/nfsctl.c
> > ===================================================================
> > --- bfields.orig/fs/nfsd/nfsctl.c
> > +++ bfields/fs/nfsd/nfsctl.c
> > @@ -64,6 +64,8 @@ enum {
> >  	NFSD_Versions,
> >  	NFSD_Ports,
> >  	NFSD_MaxBlkSize,
> > +	NFSD_Stats_Enabled,
> > +	NFSD_Stats_Prune_Period,
> >  	/*
> >  	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
> >  	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
> > @@ -92,6 +94,8 @@ static ssize_t write_pool_threads(struct
> >  static ssize_t write_versions(struct file *file, char *buf, size_t size);
> >  static ssize_t write_ports(struct file *file, char *buf, size_t size);
> >  static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
> > +static ssize_t write_stats_enabled(struct file *file, char *buf, size_t size);
> > +static ssize_t write_stats_prune_period(struct file *file, char *buf, size_t size);
> >  #ifdef CONFIG_NFSD_V4
> >  static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
> >  static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
> > @@ -113,6 +117,8 @@ static ssize_t (*write_op[])(struct file
> >  	[NFSD_Versions] = write_versions,
> >  	[NFSD_Ports] = write_ports,
> >  	[NFSD_MaxBlkSize] = write_maxblksize,
> > +	[NFSD_Stats_Enabled] = write_stats_enabled,
> > +	[NFSD_Stats_Prune_Period] = write_stats_prune_period,
> >  #ifdef CONFIG_NFSD_V4
> >  	[NFSD_Leasetime] = write_leasetime,
> >  	[NFSD_RecoveryDir] = write_recoverydir,
> > @@ -1121,6 +1127,97 @@ static ssize_t write_maxblksize(struct f
> >  	return sprintf(buf, "%d\n", nfsd_max_blksize);
> >  }
> >  
> > +extern int nfsd_stats_enabled;
> > +
> > +/**
> > + * write_stats_enabled - Set or report whether per-client/
> > + *			 per-export stats are enabled.
> > + *
> > + * Input:
> > + *			buf:		ignored
> > + *			size:		zero
> > + *
> > + * OR
> > + *
> > + * Input:
> > + * 			buf:		C string containing an unsigned
> > + * 					integer value representing the new value
> > + *			size:		non-zero length of C string in @buf
> > + * Output:
> > + *	On success:	passed-in buffer filled with '\n'-terminated C string
> > + *			containing numeric value of the current setting
> > + *			return code is the size in bytes of the string
> > + *	On error:	return code is zero or a negative errno value
> > + */
> > +static ssize_t write_stats_enabled(struct file *file, char *buf, size_t size)
> > +{
> > +	char *mesg = buf;
> > +	if (size > 0) {
> > +		int enabled;
> > +		int rv = get_int(&mesg, &enabled);
> > +		if (rv)
> > +			return rv;
> > +		/* check `enabled' against allowed range */
> > +		if (enabled < 0 || enabled > 1)
> > +			return -EINVAL;
> > +		/*
> > +		 * We can change the enabled flag at any time without
> > +		 * locking.  All it controls is whether stats are
> > +		 * gathered for new incoming NFS calls.  Old gathered
> > +		 * stats still sit around in the hash tables until
> > +		 * naturally pruned.
> > +		 */
> > +		nfsd_stats_enabled = enabled;
> > +	}
> > +	return sprintf(buf, "%d\n", nfsd_stats_enabled);
> > +}
> > +
> > +extern int nfsd_stats_prune_period;
> > +
> > +/**
> > + * write_stats_prune_period - Set or report the period for pruning
> > + *			      old per-client/per-export stats entries,
> > + *			      in seconds.
> > + *
> > + * Input:
> > + *			buf:		ignored
> > + *			size:		zero
> > + *
> > + * OR
> > + *
> > + * Input:
> > + * 			buf:		C string containing an unsigned
> > + * 					integer value representing the new value
> > + *			size:		non-zero length of C string in @buf
> > + * Output:
> > + *	On success:	passed-in buffer filled with '\n'-terminated C string
> > + *			containing numeric value of the current setting
> > + *			return code is the size in bytes of the string
> > + *	On error:	return code is zero or a negative errno value
> > + */
> 
> Just an idle remark, don't worry about this for now, but: we might want
> to rein in this write_*() comment format a little some day.  A lot of
> the content seems duplicated.
> 
> --b.
> 
> > +static ssize_t write_stats_prune_period(struct file *file, char *buf, size_t size)
> > +{
> > +	char *mesg = buf;
> > +	if (size > 0) {
> > +		int period;
> > +		int rv = get_int(&mesg, &period);
> > +		if (rv)
> > +			return rv;
> > +		/* check `period' against allowed range */
> > +		if (period < 10 || period > 14*86400)
> > +			return -EINVAL;
> > +		/*
> > +		 * We can change the period at any time without
> > +		 * locking.  All it controls is the timeout on the
> > +		 * next run of the prune timer.  This might cause
> > +		 * some unexpected behaviour if the period is
> > +		 * changed from really high to really low.
> > +		 */
> > +		nfsd_stats_prune_period = period;
> > +	}
> > +	return sprintf(buf, "%d\n", nfsd_stats_prune_period);
> > +}
> > +
> >  #ifdef CONFIG_NFSD_V4
> >  extern time_t nfs4_leasetime(void);
> >  
> > @@ -1263,6 +1360,8 @@ static int nfsd_fill_super(struct super_
> >  		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
> >  		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
> >  		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
> > +		[NFSD_Stats_Enabled] = {"stats_enabled", &transaction_ops, S_IWUSR|S_IRUGO},
> > +		[NFSD_Stats_Prune_Period] = {"stats_prune_period", &transaction_ops, S_IWUSR|S_IRUGO},
> >  #ifdef CONFIG_NFSD_V4
> >  		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
> >  		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
> > 
> > --
> > Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 02/29] knfsd: Add stats table infrastructure.
  2009-04-25  3:56   ` J. Bruce Fields
@ 2009-04-26  4:12     ` Greg Banks
  0 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-04-26  4:12 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

On Sat, Apr 25, 2009 at 1:56 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Wed, Apr 01, 2009 at 07:28:02AM +1100, Greg Banks wrote:

>> +int nfsd_stats_enabled = 1;
>> +int nfsd_stats_prune_period = 2*86400;
>
> For those of us that don't immediately recognize 86400 as the number of
> seconds in a day, writing that out as " = 2*24*60*60;" could be a useful
> hint.

Done.

>
> Also nice: a comment with any rationale (however minor) for the choice
> of period.

I've added this comment

/*
 * This number provides a bound on how long a record for a particular
 * stats entry survives after its last use (an entry will die between
 * 1x and 2x the prune period after its last use).  This is really only
 * particularly useful if a system admin is going to be trawling through
 * the /proc files manually and wants to see entries for (e.g.) clients
 * which have since unmounted.  If instead he uses some userspace
 * stats infrastructure which can handle rate conversion and instance
 * management, the prune period doesn't really matter.  The choice of
 * 2 days is really quite arbitrary.
 */



>> + * Stats hash pruning works thus.  A scan is run every prune period.
>> + * On every scan, hentries with the OLD flag are detached and
>> + * a reference dropped (usually that will be the last reference
>> + * and the hentry will be deleted).  Hentries without the OLD flag
>> + * have the OLD flag set; the flag is reset in nfsd_stats_get().
>> + * So hentries with active traffic in the last 2 prune periods
>> + * are not candidates for pruning.
>
> s/2 prune periods/prune period/ ?
>
> (From the description above: on exit from nfsd_stats_prune() all
> remaining entries have OLD set.  Therefore if an entry is not touched in
> the single period between two nfsd_stats_prune()'s, the second
> nfsd_stats_prune() run will drop it.)

Yeah, that was poorly phrased.  Fixed.

>
>> + */
>> +static void nfsd_stats_prune(unsigned long closure)
>> +{
>> +     nfsd_stats_hash_t *sh = (nfsd_stats_hash_t *)closure;
>> +     unsigned int i;
>> +     nfsd_stats_hentry_t *se;
>> +     struct hlist_node *hn, *next;
>> +     struct hlist_head to_be_dropped = HLIST_HEAD_INIT;
>> +
>> +     dprintk("nfsd_stats_prune\n");
>> +
>> +     if (!down_write_trylock(&sh->sh_sem)) {
>> +             /* hash is busy...try again in a second */
>> +             dprintk("nfsd_stats_prune: busy\n");
>> +             mod_timer(&sh->sh_prune_timer, jiffies + HZ);
>
> Could we make sh_sem a spinlock?  It doesn't look the the critical
> sections ever need to sleep.
>
> (Or even consider rcu, if we need the read lock on every rpc?  OK, I'm
> mostly ignorant of rcu.)

So was I way back when I wrote this patch, and it was written for an
antique kernel which was missing some useful locking bits.  So I'm not
too surprised that the locking scheme could do with a rethink.  I'll
take another look and get back to you.

>
>> +             return;
>> +     }
>> +
>> +     for (i = 0 ; i < sh->sh_size ; i++) {
>> +             hlist_for_each_entry_safe(se, hn, next, &sh->sh_hash[i], se_node) {
>> +                     if (!test_and_set_bit(NFSD_STATS_HENTRY_OLD, &se->se_flags))
>
> It looks like this is only ever used under the lock, so the
> test_and_set_bit() is overkill.

It's cleared in nfsd_stats_get() without the sh_sem lock.

>
>> +                             continue;
>> +                     hlist_del_init(&se->se_node);
>> +                     hlist_add_head(&se->se_node, &to_be_dropped);
>
> Replace those two by hlist_move_list?

If I read hlist_move_list() correctly, it moves an entire chain from
one hlist_head to another.  Here we want instead to move a single
hlist_node from one chain to another.  So, no.

>
>> +             }
>> +     }
>> +
>> +     up_write(&sh->sh_sem);
>> +
>> +     dprintk("nfsd_stats_prune: deleting\n");
>> +     hlist_for_each_entry_safe(se, hn, next, &to_be_dropped, se_node)
>> +             nfsd_stats_put(se);
>
> nfsd_stats_put() can down a semaphore, which we probably don't want in a
> timer.

Ouch.  What the hell was I thinking <kicks self>

>
>> +
>> +     mod_timer(&sh->sh_prune_timer, jiffies + nfsd_stats_prune_period * HZ);
>> +}
>> +
>> +/*
>> + * Initialise a stats hash.  Array size scales with
>> + * server memory, as a loose heuristic for how many
>> + * clients or exports a server is likely to have.
>> + */
>> +static void nfsd_stats_hash_init(nfsd_stats_hash_t *sh, const char *which)
>> +{
>> +     unsigned int nbits;
>> +     unsigned int i;
>> +
>> +     init_rwsem(&sh->sh_sem);
>> +
>> +     nbits = 5 + ilog2(totalram_pages >> (30-PAGE_SHIFT));
>> +     sh->sh_size = (1<<nbits);
>> +     sh->sh_mask = (sh->sh_size-1);
>
> Some comment on the choice of scale factor?  Also, see:
>
>        http://marc.info/?l=linux-kernel&m=118299825922287&w=2
>
> and followups.

Ok, I'll look into those.

>
> Might consider a little helper function to do this kind of
> fraction-of-total-memory calculation since I think the server does it in
> 3 or 4 places.
>
>> +
>> +     sh->sh_hash = kmalloc(sizeof(struct hlist_head) * sh->sh_size, GFP_KERNEL);
>
> Can this be a more than a page?

Yes, but it would need to be a fairly large-memory machine.  With 4K
pages and 8B pointers, totalram_pages would need to be 16G.  With 4B
pointers, we'd need 32G.

> (If so, could we just cap it at that
> size to avoid >order-0 allocations and keep the kmalloc failure
> unlikely?)

Well...I have no problem with capping it, but I don't think it's a
likely failure mode.  Firstly, there are *two* allocations, which are
probably only order 1, and they happen at nfsd module load time.
Secondly, the allocation order scales, really quite slowly, with
available RAM.  Thirdly, machines which have a lowmem split will hit
the >0 order later than more modern machines with flat address spaces.

>
>> +     if (sh->sh_hash == NULL) {
>> +             printk(KERN_ERR "failed to allocate knfsd %s stats hashtable\n", which);
>> +             /* struggle on... */
>> +             return;
>> +     }
>> +     printk(KERN_INFO "knfsd %s stats hashtable, %u entries\n", which, sh->sh_size);
>
> Eh.  Make it a dprintk?

I don't think a dprintk() is useful.  This happens once during nfsd
module load, so there's no chance for an admin to enable dprintks
before it happens.

>  Or maybe expose this in the nfsd filesystem if
> it's not already?

There will be two files in the nfsd filesystem.  I'll remove the printk()


>> +     if (sh->sh_hash != NULL) {
>
> Drop the NULL check.

Done.


>> + * Drop a reference to a hentry, deleting the hentry if this
>> + * was the last reference.  Does it's own locking using the
>
> s/it's/its/

Done.

>
> (Contending for the nitpick-of-the-day award.)

:-)


>> +
>> +     if (atomic_read(&se->se_refcount)) {
>> +             /*
>> +              * We lost a race getting the write lock, and
>> +              * now there's a reference again.  Whatever.
>> +              */
>
> Some kind of atomic_dec_and_lock() might close the race.

Yep.  I'll address this when I rethink locking.

>> +
>> +typedef struct nfsd_stats_hash               nfsd_stats_hash_t;
>> +typedef struct nfsd_stats_hentry     nfsd_stats_hentry_t;
>
> Absent unusual circumstances, standard kernel style is to drop the
> typedefs and use "struct nfsd_stats_{hash,hentry}" everywhere.

Sorry, it's a disgusting habit and I'll stop it right now.

-- 
Greg.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-04-25 21:57   ` J. Bruce Fields
  2009-04-25 22:03     ` J. Bruce Fields
@ 2009-04-26  4:14     ` Greg Banks
  1 sibling, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-04-26  4:14 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

On Sun, Apr 26, 2009 at 7:57 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Wed, Apr 01, 2009 at 07:28:03AM +1100, Greg Banks wrote:

>> +
>> +/**
>> + * write_stats_prune_period - Set or report the period for pruning
>> + *                         old per-client/per-export stats entries,
>> + *                         in seconds.
>> + *
>> + * Input:
>> + *                   buf:            ignored
>> + *                   size:           zero
>> + *
>> + * OR
>> + *
>> + * Input:
>> + *                   buf:            C string containing an unsigned
>> + *                                   integer value representing the new value
>> + *                   size:           non-zero length of C string in @buf
>> + * Output:
>> + *   On success:     passed-in buffer filled with '\n'-terminated C string
>> + *                   containing numeric value of the current setting
>> + *                   return code is the size in bytes of the string
>> + *   On error:       return code is zero or a negative errno value
>> + */
>
> Just an idle remark, don't worry about this for now, but: we might want
> to rein in this write_*() comment format a little some day.  A lot of
> the content seems duplicated.
>

Fair comment.  Noted for later.

-- 
Greg.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-04-25 22:03     ` J. Bruce Fields
@ 2009-04-27 16:06       ` Chuck Lever
  2009-04-27 23:22         ` J. Bruce Fields
       [not found]         ` <ac442c870904271827w6041a67ew82fe36a843beeac3@mail.gmail.com>
  0 siblings, 2 replies; 63+ messages in thread
From: Chuck Lever @ 2009-04-27 16:06 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Greg Banks, Linux NFS ML

On Apr 25, 2009, at 6:03 PM, J. Bruce Fields wrote:
> Pfft, did it again.
>
> --b.
>
> On Sat, Apr 25, 2009 at 05:57:45PM -0400, bfields wrote:
>> On Wed, Apr 01, 2009 at 07:28:03AM +1100, Greg Banks wrote:
>>> Add two control files to /proc/fs/nfsd:
>>>
>>> * "stats_enabled" can be used to disable or enable the gathering
>>>   of per-client and per-export statistics in the server.
>>>
>>> * "stats_prune_period" can be used to set the period at
>>>   which the pruning timer runs, in seconds.  Unused stats
>>>   entries will survive at most twice that time.
>>>
>>> Signed-off-by: Greg Banks <gnb@sgi.com>
>>> ---
>>>
>>> fs/nfsd/nfsctl.c |   99 ++++++++++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 99 insertions(+)
>>>
>>> Index: bfields/fs/nfsd/nfsctl.c
>>> ===================================================================
>>> --- bfields.orig/fs/nfsd/nfsctl.c
>>> +++ bfields/fs/nfsd/nfsctl.c
>>> @@ -64,6 +64,8 @@ enum {
>>> 	NFSD_Versions,
>>> 	NFSD_Ports,
>>> 	NFSD_MaxBlkSize,
>>> +	NFSD_Stats_Enabled,
>>> +	NFSD_Stats_Prune_Period,
>>> 	/*
>>> 	 * The below MUST come last.  Otherwise we leave a hole in  
>>> nfsd_files[]
>>> 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
>>> @@ -92,6 +94,8 @@ static ssize_t write_pool_threads(struct
>>> static ssize_t write_versions(struct file *file, char *buf, size_t  
>>> size);
>>> static ssize_t write_ports(struct file *file, char *buf, size_t  
>>> size);
>>> static ssize_t write_maxblksize(struct file *file, char *buf,  
>>> size_t size);
>>> +static ssize_t write_stats_enabled(struct file *file, char *buf,  
>>> size_t size);
>>> +static ssize_t write_stats_prune_period(struct file *file, char  
>>> *buf, size_t size);
>>> #ifdef CONFIG_NFSD_V4
>>> static ssize_t write_leasetime(struct file *file, char *buf,  
>>> size_t size);
>>> static ssize_t write_recoverydir(struct file *file, char *buf,  
>>> size_t size);
>>> @@ -113,6 +117,8 @@ static ssize_t (*write_op[])(struct file
>>> 	[NFSD_Versions] = write_versions,
>>> 	[NFSD_Ports] = write_ports,
>>> 	[NFSD_MaxBlkSize] = write_maxblksize,
>>> +	[NFSD_Stats_Enabled] = write_stats_enabled,
>>> +	[NFSD_Stats_Prune_Period] = write_stats_prune_period,
>>> #ifdef CONFIG_NFSD_V4
>>> 	[NFSD_Leasetime] = write_leasetime,
>>> 	[NFSD_RecoveryDir] = write_recoverydir,
>>> @@ -1121,6 +1127,97 @@ static ssize_t write_maxblksize(struct f
>>> 	return sprintf(buf, "%d\n", nfsd_max_blksize);
>>> }
>>>
>>> +extern int nfsd_stats_enabled;
>>> +
>>> +/**
>>> + * write_stats_enabled - Set or report whether per-client/
>>> + *			 per-export stats are enabled.
>>> + *
>>> + * Input:
>>> + *			buf:		ignored
>>> + *			size:		zero
>>> + *
>>> + * OR
>>> + *
>>> + * Input:
>>> + * 			buf:		C string containing an unsigned
>>> + * 					integer value representing the new value
>>> + *			size:		non-zero length of C string in @buf
>>> + * Output:
>>> + *	On success:	passed-in buffer filled with '\n'-terminated C  
>>> string
>>> + *			containing numeric value of the current setting
>>> + *			return code is the size in bytes of the string
>>> + *	On error:	return code is zero or a negative errno value
>>> + */
>>> +static ssize_t write_stats_enabled(struct file *file, char *buf,  
>>> size_t size)
>>> +{
>>> +	char *mesg = buf;
>>> +	if (size > 0) {
>>> +		int enabled;
>>> +		int rv = get_int(&mesg, &enabled);
>>> +		if (rv)
>>> +			return rv;
>>> +		/* check `enabled' against allowed range */
>>> +		if (enabled < 0 || enabled > 1)
>>> +			return -EINVAL;
>>> +		/*
>>> +		 * We can change the enabled flag at any time without
>>> +		 * locking.  All it controls is whether stats are
>>> +		 * gathered for new incoming NFS calls.  Old gathered
>>> +		 * stats still sit around in the hash tables until
>>> +		 * naturally pruned.
>>> +		 */
>>> +		nfsd_stats_enabled = enabled;
>>> +	}
>>> +	return sprintf(buf, "%d\n", nfsd_stats_enabled);
>>> +}
>>> +
>>> +extern int nfsd_stats_prune_period;
>>> +
>>> +/**
>>> + * write_stats_prune_period - Set or report the period for pruning
>>> + *			      old per-client/per-export stats entries,
>>> + *			      in seconds.
>>> + *
>>> + * Input:
>>> + *			buf:		ignored
>>> + *			size:		zero
>>> + *
>>> + * OR
>>> + *
>>> + * Input:
>>> + * 			buf:		C string containing an unsigned
>>> + * 					integer value representing the new value
>>> + *			size:		non-zero length of C string in @buf
>>> + * Output:
>>> + *	On success:	passed-in buffer filled with '\n'-terminated C  
>>> string
>>> + *			containing numeric value of the current setting
>>> + *			return code is the size in bytes of the string
>>> + *	On error:	return code is zero or a negative errno value
>>> + */
>>
>> Just an idle remark, don't worry about this for now, but: we might  
>> want
>> to rein in this write_*() comment format a little some day.  A lot of
>> the content seems duplicated.

I disagree.

The present nfsctl user space API is entirely ad hoc.

Although sometimes the behavior is the same, each function/file can  
behave slightly differently than the others.  We have to be very  
specific about this here because the comments serve as both "user"  
documentation and as code/API specification.

Because this code wasn't adequately documented, features have been  
added over time without a close examination of the operation of other  
parts of this code.

If you really want to simplify the comments, we should consider  
simplifying the API first, imo.

>> --b.
>>
>>> +static ssize_t write_stats_prune_period(struct file *file, char  
>>> *buf, size_t size)
>>> +{
>>> +	char *mesg = buf;
>>> +	if (size > 0) {
>>> +		int period;
>>> +		int rv = get_int(&mesg, &period);
>>> +		if (rv)
>>> +			return rv;
>>> +		/* check `period' against allowed range */
>>> +		if (period < 10 || period > 14*86400)
>>> +			return -EINVAL;
>>> +		/*
>>> +		 * We can change the period at any time without
>>> +		 * locking.  All it controls is the timeout on the
>>> +		 * next run of the prune timer.  This might cause
>>> +		 * some unexpected behaviour if the period is
>>> +		 * changed from really high to really low.
>>> +		 */
>>> +		nfsd_stats_prune_period = period;
>>> +	}
>>> +	return sprintf(buf, "%d\n", nfsd_stats_prune_period);
>>> +}
>>> +
>>> #ifdef CONFIG_NFSD_V4
>>> extern time_t nfs4_leasetime(void);
>>>
>>> @@ -1263,6 +1360,8 @@ static int nfsd_fill_super(struct super_
>>> 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
>>> 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
>>> 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR| 
>>> S_IRUGO},
>>> +		[NFSD_Stats_Enabled] = {"stats_enabled", &transaction_ops,  
>>> S_IWUSR|S_IRUGO},
>>> +		[NFSD_Stats_Prune_Period] = {"stats_prune_period",  
>>> &transaction_ops, S_IWUSR|S_IRUGO},
>>> #ifdef CONFIG_NFSD_V4
>>> 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR| 
>>> S_IRUSR},
>>> 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops,  
>>> S_IWUSR|S_IRUSR},
>>>
>>> --
>>> Greg
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs"  
> in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Chuck Lever
chuck[dot]lever[at]oracle[dot]com





^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-04-27 16:06       ` Chuck Lever
@ 2009-04-27 23:22         ` J. Bruce Fields
  2009-04-28 15:37           ` Chuck Lever
       [not found]         ` <ac442c870904271827w6041a67ew82fe36a843beeac3@mail.gmail.com>
  1 sibling, 1 reply; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-27 23:22 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Greg Banks, Linux NFS ML

On Mon, Apr 27, 2009 at 12:06:18PM -0400, Chuck Lever wrote:
> On Apr 25, 2009, at 6:03 PM, J. Bruce Fields wrote:
>> Pfft, did it again.
>>
>> --b.
>>
>> On Sat, Apr 25, 2009 at 05:57:45PM -0400, bfields wrote:
>>> On Wed, Apr 01, 2009 at 07:28:03AM +1100, Greg Banks wrote:
>>>> Add two control files to /proc/fs/nfsd:
>>>>
>>>> * "stats_enabled" can be used to disable or enable the gathering
>>>>   of per-client and per-export statistics in the server.
>>>>
>>>> * "stats_prune_period" can be used to set the period at
>>>>   which the pruning timer runs, in seconds.  Unused stats
>>>>   entries will survive at most twice that time.
>>>>
>>>> Signed-off-by: Greg Banks <gnb@sgi.com>
>>>> ---
>>>>
>>>> fs/nfsd/nfsctl.c |   99 ++++++++++++++++++++++++++++++++++++++++++++
>>>> 1 file changed, 99 insertions(+)
>>>>
>>>> Index: bfields/fs/nfsd/nfsctl.c
>>>> ===================================================================
>>>> --- bfields.orig/fs/nfsd/nfsctl.c
>>>> +++ bfields/fs/nfsd/nfsctl.c
>>>> @@ -64,6 +64,8 @@ enum {
>>>> 	NFSD_Versions,
>>>> 	NFSD_Ports,
>>>> 	NFSD_MaxBlkSize,
>>>> +	NFSD_Stats_Enabled,
>>>> +	NFSD_Stats_Prune_Period,
>>>> 	/*
>>>> 	 * The below MUST come last.  Otherwise we leave a hole in  
>>>> nfsd_files[]
>>>> 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
>>>> @@ -92,6 +94,8 @@ static ssize_t write_pool_threads(struct
>>>> static ssize_t write_versions(struct file *file, char *buf, size_t  
>>>> size);
>>>> static ssize_t write_ports(struct file *file, char *buf, size_t  
>>>> size);
>>>> static ssize_t write_maxblksize(struct file *file, char *buf,  
>>>> size_t size);
>>>> +static ssize_t write_stats_enabled(struct file *file, char *buf,  
>>>> size_t size);
>>>> +static ssize_t write_stats_prune_period(struct file *file, char  
>>>> *buf, size_t size);
>>>> #ifdef CONFIG_NFSD_V4
>>>> static ssize_t write_leasetime(struct file *file, char *buf,  
>>>> size_t size);
>>>> static ssize_t write_recoverydir(struct file *file, char *buf,  
>>>> size_t size);
>>>> @@ -113,6 +117,8 @@ static ssize_t (*write_op[])(struct file
>>>> 	[NFSD_Versions] = write_versions,
>>>> 	[NFSD_Ports] = write_ports,
>>>> 	[NFSD_MaxBlkSize] = write_maxblksize,
>>>> +	[NFSD_Stats_Enabled] = write_stats_enabled,
>>>> +	[NFSD_Stats_Prune_Period] = write_stats_prune_period,
>>>> #ifdef CONFIG_NFSD_V4
>>>> 	[NFSD_Leasetime] = write_leasetime,
>>>> 	[NFSD_RecoveryDir] = write_recoverydir,
>>>> @@ -1121,6 +1127,97 @@ static ssize_t write_maxblksize(struct f
>>>> 	return sprintf(buf, "%d\n", nfsd_max_blksize);
>>>> }
>>>>
>>>> +extern int nfsd_stats_enabled;
>>>> +
>>>> +/**
>>>> + * write_stats_enabled - Set or report whether per-client/
>>>> + *			 per-export stats are enabled.
>>>> + *
>>>> + * Input:
>>>> + *			buf:		ignored
>>>> + *			size:		zero
>>>> + *
>>>> + * OR
>>>> + *
>>>> + * Input:
>>>> + * 			buf:		C string containing an unsigned
>>>> + * 					integer value representing the new value
>>>> + *			size:		non-zero length of C string in @buf
>>>> + * Output:
>>>> + *	On success:	passed-in buffer filled with '\n'-terminated C  
>>>> string
>>>> + *			containing numeric value of the current setting
>>>> + *			return code is the size in bytes of the string
>>>> + *	On error:	return code is zero or a negative errno value
>>>> + */
>>>> +static ssize_t write_stats_enabled(struct file *file, char *buf,  
>>>> size_t size)
>>>> +{
>>>> +	char *mesg = buf;
>>>> +	if (size > 0) {
>>>> +		int enabled;
>>>> +		int rv = get_int(&mesg, &enabled);
>>>> +		if (rv)
>>>> +			return rv;
>>>> +		/* check `enabled' against allowed range */
>>>> +		if (enabled < 0 || enabled > 1)
>>>> +			return -EINVAL;
>>>> +		/*
>>>> +		 * We can change the enabled flag at any time without
>>>> +		 * locking.  All it controls is whether stats are
>>>> +		 * gathered for new incoming NFS calls.  Old gathered
>>>> +		 * stats still sit around in the hash tables until
>>>> +		 * naturally pruned.
>>>> +		 */
>>>> +		nfsd_stats_enabled = enabled;
>>>> +	}
>>>> +	return sprintf(buf, "%d\n", nfsd_stats_enabled);
>>>> +}
>>>> +
>>>> +extern int nfsd_stats_prune_period;
>>>> +
>>>> +/**
>>>> + * write_stats_prune_period - Set or report the period for pruning
>>>> + *			      old per-client/per-export stats entries,
>>>> + *			      in seconds.
>>>> + *
>>>> + * Input:
>>>> + *			buf:		ignored
>>>> + *			size:		zero
>>>> + *
>>>> + * OR
>>>> + *
>>>> + * Input:
>>>> + * 			buf:		C string containing an unsigned
>>>> + * 					integer value representing the new value
>>>> + *			size:		non-zero length of C string in @buf
>>>> + * Output:
>>>> + *	On success:	passed-in buffer filled with '\n'-terminated C  
>>>> string
>>>> + *			containing numeric value of the current setting
>>>> + *			return code is the size in bytes of the string
>>>> + *	On error:	return code is zero or a negative errno value
>>>> + */
>>>
>>> Just an idle remark, don't worry about this for now, but: we might  
>>> want
>>> to rein in this write_*() comment format a little some day.  A lot of
>>> the content seems duplicated.
>
> I disagree.

How?  The below seems to be an arguing against *removing* the comments,
or removing information from them, neither of which I'd be in favor of.

--b.

> The present nfsctl user space API is entirely ad hoc.
>
> Although sometimes the behavior is the same, each function/file can  
> behave slightly differently than the others.  We have to be very  
> specific about this here because the comments serve as both "user"  
> documentation and as code/API specification.
>
> Because this code wasn't adequately documented, features have been added 
> over time without a close examination of the operation of other parts of 
> this code.
>
> If you really want to simplify the comments, we should consider  
> simplifying the API first, imo.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
       [not found]           ` <ac442c870904271827w6041a67ew82fe36a843beeac3-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2009-04-28  1:31             ` Greg Banks
  0 siblings, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-04-28  1:31 UTC (permalink / raw)
  To: Linux NFS ML

G'day,

Whoops, this bounced from the list because I accidentally had HTML
formatting on.  Resending.

On Tue, Apr 28, 2009 at 11:27 AM, Greg Banks <gnb-xTcybq6BZ68@public.gmane.org> wrote:
>
>
> On Tue, Apr 28, 2009 at 2:06 AM, Chuck Lever <chuck.lever@oracle.com> wrote:
>>
>> On Apr 25, 2009, at 6:03 PM, J. Bruce Fields wrote:
>>>
>>> Pfft, did it again.
>>>
>>> --b.
>>>
>>> On Sat, Apr 25, 2009 at 05:57:45PM -0400, bfields wrote:
>>>>
>>>> On Wed, Apr 01, 2009 at 07:28:03AM +1100, Greg Banks wrote:
>>>>>
>>>>> +
>>>>> +/**
>>>>> + * write_stats_prune_period - Set or report the period for pruning
>>>>> + *                           old per-client/per-export stats entries,
>>>>> + *                           in seconds.
>>>>> + *
>>>>> + * Input:
>>>>> + *                     buf:            ignored
>>>>> + *                     size:           zero
>>>>> + *
>>>>> + * OR
>>>>> + *
>>>>> + * Input:
>>>>> + *                     buf:            C string containing an unsigned
>>>>> + *                                     integer value representing the
>>>>> new value
>>>>> + *                     size:           non-zero length of C string in
>>>>> @buf
>>>>> + * Output:
>>>>> + *     On success:     passed-in buffer filled with '\n'-terminated C
>>>>> string
>>>>> + *                     containing numeric value of the current setting
>>>>> + *                     return code is the size in bytes of the string
>>>>> + *     On error:       return code is zero or a negative errno value
>>>>> + */
>>>>
>>>> Just an idle remark, don't worry about this for now, but: we might want
>>>> to rein in this write_*() comment format a little some day.  A lot of
>>>> the content seems duplicated.
>>
>> I disagree.
>>
>> The present nfsctl user space API is entirely ad hoc.
>
> Agreed.
>
>>
>>
>> Although sometimes the behavior is the same, each function/file can behave
>> slightly differently than the others.  We have to be very specific about
>> this here because the comments serve as both "user" documentation and as
>> code/API specification.
>
> Sure, but the comments aren't organised in such a way that the subtle
> differences between each pseudofile are obvious.  Instead there are large
> swathes of text above each write_*() function which describe behaviours
> common to all the write_*() functions, and you have to hunt carefully for
> the differences.
>
> For example, we don't need 10 copies of this text:
>
>  *     On success:     passed-in buffer filled with '\n'-terminated C string
>  *                     containing numeric value of the current setting
>  *                     return code is the size in bytes of the string
>
> Bruce rightly points out that my patch was continuing this poor trend.
>
>>
>> Because this code wasn't adequately documented, features have been added
>> over time without a close examination of the operation of other parts of
>> this code.
>>
>> If you really want to simplify the comments, we should consider
>> simplifying the API first, imo.
>
> Meh.  Most of the stuff in the nfsd filesystem could be *simplified* by
> using the actual /proc filesystem mechanisms instead.
>
>
>
> --
> Greg.
>



-- 
Greg.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-04-27 23:22         ` J. Bruce Fields
@ 2009-04-28 15:37           ` Chuck Lever
  2009-04-28 15:57             ` J. Bruce Fields
  0 siblings, 1 reply; 63+ messages in thread
From: Chuck Lever @ 2009-04-28 15:37 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Greg Banks, Linux NFS ML


On Apr 27, 2009, at 7:22 PM, J. Bruce Fields wrote:

> On Mon, Apr 27, 2009 at 12:06:18PM -0400, Chuck Lever wrote:
>> On Apr 25, 2009, at 6:03 PM, J. Bruce Fields wrote:
>>> Pfft, did it again.
>>>
>>> --b.
>>>
>>> On Sat, Apr 25, 2009 at 05:57:45PM -0400, bfields wrote:
>>>> On Wed, Apr 01, 2009 at 07:28:03AM +1100, Greg Banks wrote:
>>>>> Add two control files to /proc/fs/nfsd:
>>>>>
>>>>> * "stats_enabled" can be used to disable or enable the gathering
>>>>>  of per-client and per-export statistics in the server.
>>>>>
>>>>> * "stats_prune_period" can be used to set the period at
>>>>>  which the pruning timer runs, in seconds.  Unused stats
>>>>>  entries will survive at most twice that time.
>>>>>
>>>>> Signed-off-by: Greg Banks <gnb@sgi.com>
>>>>> ---
>>>>>
>>>>> fs/nfsd/nfsctl.c |   99 +++++++++++++++++++++++++++++++++++++++++ 
>>>>> +++
>>>>> 1 file changed, 99 insertions(+)
>>>>>
>>>>> Index: bfields/fs/nfsd/nfsctl.c
>>>>> = 
>>>>> ==================================================================
>>>>> --- bfields.orig/fs/nfsd/nfsctl.c
>>>>> +++ bfields/fs/nfsd/nfsctl.c
>>>>> @@ -64,6 +64,8 @@ enum {
>>>>> 	NFSD_Versions,
>>>>> 	NFSD_Ports,
>>>>> 	NFSD_MaxBlkSize,
>>>>> +	NFSD_Stats_Enabled,
>>>>> +	NFSD_Stats_Prune_Period,
>>>>> 	/*
>>>>> 	 * The below MUST come last.  Otherwise we leave a hole in
>>>>> nfsd_files[]
>>>>> 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
>>>>> @@ -92,6 +94,8 @@ static ssize_t write_pool_threads(struct
>>>>> static ssize_t write_versions(struct file *file, char *buf, size_t
>>>>> size);
>>>>> static ssize_t write_ports(struct file *file, char *buf, size_t
>>>>> size);
>>>>> static ssize_t write_maxblksize(struct file *file, char *buf,
>>>>> size_t size);
>>>>> +static ssize_t write_stats_enabled(struct file *file, char *buf,
>>>>> size_t size);
>>>>> +static ssize_t write_stats_prune_period(struct file *file, char
>>>>> *buf, size_t size);
>>>>> #ifdef CONFIG_NFSD_V4
>>>>> static ssize_t write_leasetime(struct file *file, char *buf,
>>>>> size_t size);
>>>>> static ssize_t write_recoverydir(struct file *file, char *buf,
>>>>> size_t size);
>>>>> @@ -113,6 +117,8 @@ static ssize_t (*write_op[])(struct file
>>>>> 	[NFSD_Versions] = write_versions,
>>>>> 	[NFSD_Ports] = write_ports,
>>>>> 	[NFSD_MaxBlkSize] = write_maxblksize,
>>>>> +	[NFSD_Stats_Enabled] = write_stats_enabled,
>>>>> +	[NFSD_Stats_Prune_Period] = write_stats_prune_period,
>>>>> #ifdef CONFIG_NFSD_V4
>>>>> 	[NFSD_Leasetime] = write_leasetime,
>>>>> 	[NFSD_RecoveryDir] = write_recoverydir,
>>>>> @@ -1121,6 +1127,97 @@ static ssize_t write_maxblksize(struct f
>>>>> 	return sprintf(buf, "%d\n", nfsd_max_blksize);
>>>>> }
>>>>>
>>>>> +extern int nfsd_stats_enabled;
>>>>> +
>>>>> +/**
>>>>> + * write_stats_enabled - Set or report whether per-client/
>>>>> + *			 per-export stats are enabled.
>>>>> + *
>>>>> + * Input:
>>>>> + *			buf:		ignored
>>>>> + *			size:		zero
>>>>> + *
>>>>> + * OR
>>>>> + *
>>>>> + * Input:
>>>>> + * 			buf:		C string containing an unsigned
>>>>> + * 					integer value representing the new value
>>>>> + *			size:		non-zero length of C string in @buf
>>>>> + * Output:
>>>>> + *	On success:	passed-in buffer filled with '\n'-terminated C
>>>>> string
>>>>> + *			containing numeric value of the current setting
>>>>> + *			return code is the size in bytes of the string
>>>>> + *	On error:	return code is zero or a negative errno value
>>>>> + */
>>>>> +static ssize_t write_stats_enabled(struct file *file, char *buf,
>>>>> size_t size)
>>>>> +{
>>>>> +	char *mesg = buf;
>>>>> +	if (size > 0) {
>>>>> +		int enabled;
>>>>> +		int rv = get_int(&mesg, &enabled);
>>>>> +		if (rv)
>>>>> +			return rv;
>>>>> +		/* check `enabled' against allowed range */
>>>>> +		if (enabled < 0 || enabled > 1)
>>>>> +			return -EINVAL;
>>>>> +		/*
>>>>> +		 * We can change the enabled flag at any time without
>>>>> +		 * locking.  All it controls is whether stats are
>>>>> +		 * gathered for new incoming NFS calls.  Old gathered
>>>>> +		 * stats still sit around in the hash tables until
>>>>> +		 * naturally pruned.
>>>>> +		 */
>>>>> +		nfsd_stats_enabled = enabled;
>>>>> +	}
>>>>> +	return sprintf(buf, "%d\n", nfsd_stats_enabled);
>>>>> +}
>>>>> +
>>>>> +extern int nfsd_stats_prune_period;
>>>>> +
>>>>> +/**
>>>>> + * write_stats_prune_period - Set or report the period for  
>>>>> pruning
>>>>> + *			      old per-client/per-export stats entries,
>>>>> + *			      in seconds.
>>>>> + *
>>>>> + * Input:
>>>>> + *			buf:		ignored
>>>>> + *			size:		zero
>>>>> + *
>>>>> + * OR
>>>>> + *
>>>>> + * Input:
>>>>> + * 			buf:		C string containing an unsigned
>>>>> + * 					integer value representing the new value
>>>>> + *			size:		non-zero length of C string in @buf
>>>>> + * Output:
>>>>> + *	On success:	passed-in buffer filled with '\n'-terminated C
>>>>> string
>>>>> + *			containing numeric value of the current setting
>>>>> + *			return code is the size in bytes of the string
>>>>> + *	On error:	return code is zero or a negative errno value
>>>>> + */
>>>>
>>>> Just an idle remark, don't worry about this for now, but: we might
>>>> want
>>>> to rein in this write_*() comment format a little some day.  A  
>>>> lot of
>>>> the content seems duplicated.
>>
>> I disagree.
>
> How?  The below seems to be an arguing against *removing* the  
> comments,
> or removing information from them, neither of which I'd be in favor  
> of.

Then I misunderstood what you meant by "rein in".

The apparent content duplication is because these functions are all  
slightly different.  What we had before was a single description of  
the return values at the top of the files that more or less fit each  
proc file, but didn't precisely fit any but the oldest.

(Responding a bit to Greg) IMO highlighting the differences instead  
means a person trying to understand this interface has to read the  
whole damn nfsctl.c file instead of looking at the one piece s/he is  
interested in.  This is documentation, not code, so I think a little  
text duplication is OK or even actually preferred.

> --b.
>
>> The present nfsctl user space API is entirely ad hoc.
>>
>> Although sometimes the behavior is the same, each function/file can
>> behave slightly differently than the others.  We have to be very
>> specific about this here because the comments serve as both "user"
>> documentation and as code/API specification.
>>
>> Because this code wasn't adequately documented, features have been  
>> added
>> over time without a close examination of the operation of other  
>> parts of
>> this code.
>>
>> If you really want to simplify the comments, we should consider
>> simplifying the API first, imo.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs"  
> in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Chuck Lever
chuck[dot]lever[at]oracle[dot]com





^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-04-28 15:37           ` Chuck Lever
@ 2009-04-28 15:57             ` J. Bruce Fields
  2009-04-28 16:03               ` Chuck Lever
  2009-04-29  1:45               ` Greg Banks
  0 siblings, 2 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-28 15:57 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Greg Banks, Linux NFS ML

On Tue, Apr 28, 2009 at 11:37:09AM -0400, Chuck Lever wrote:
>
> On Apr 27, 2009, at 7:22 PM, J. Bruce Fields wrote:
>
>> On Mon, Apr 27, 2009 at 12:06:18PM -0400, Chuck Lever wrote:
>>> On Apr 25, 2009, at 6:03 PM, J. Bruce Fields wrote:
>>>> Pfft, did it again.
>>>>
>>>> --b.
>>>>
>>>> On Sat, Apr 25, 2009 at 05:57:45PM -0400, bfields wrote:
>>>>> On Wed, Apr 01, 2009 at 07:28:03AM +1100, Greg Banks wrote:
>>>>>> Add two control files to /proc/fs/nfsd:
>>>>>>
>>>>>> * "stats_enabled" can be used to disable or enable the gathering
>>>>>>  of per-client and per-export statistics in the server.
>>>>>>
>>>>>> * "stats_prune_period" can be used to set the period at
>>>>>>  which the pruning timer runs, in seconds.  Unused stats
>>>>>>  entries will survive at most twice that time.
>>>>>>
>>>>>> Signed-off-by: Greg Banks <gnb@sgi.com>
>>>>>> ---
>>>>>>
>>>>>> fs/nfsd/nfsctl.c |   99 
>>>>>> ++++++++++++++++++++++++++++++++++++++++++++
>>>>>> 1 file changed, 99 insertions(+)
>>>>>>
>>>>>> Index: bfields/fs/nfsd/nfsctl.c
>>>>>> = 
>>>>>> ==================================================================
>>>>>> --- bfields.orig/fs/nfsd/nfsctl.c
>>>>>> +++ bfields/fs/nfsd/nfsctl.c
>>>>>> @@ -64,6 +64,8 @@ enum {
>>>>>> 	NFSD_Versions,
>>>>>> 	NFSD_Ports,
>>>>>> 	NFSD_MaxBlkSize,
>>>>>> +	NFSD_Stats_Enabled,
>>>>>> +	NFSD_Stats_Prune_Period,
>>>>>> 	/*
>>>>>> 	 * The below MUST come last.  Otherwise we leave a hole in
>>>>>> nfsd_files[]
>>>>>> 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
>>>>>> @@ -92,6 +94,8 @@ static ssize_t write_pool_threads(struct
>>>>>> static ssize_t write_versions(struct file *file, char *buf, size_t
>>>>>> size);
>>>>>> static ssize_t write_ports(struct file *file, char *buf, size_t
>>>>>> size);
>>>>>> static ssize_t write_maxblksize(struct file *file, char *buf,
>>>>>> size_t size);
>>>>>> +static ssize_t write_stats_enabled(struct file *file, char *buf,
>>>>>> size_t size);
>>>>>> +static ssize_t write_stats_prune_period(struct file *file, char
>>>>>> *buf, size_t size);
>>>>>> #ifdef CONFIG_NFSD_V4
>>>>>> static ssize_t write_leasetime(struct file *file, char *buf,
>>>>>> size_t size);
>>>>>> static ssize_t write_recoverydir(struct file *file, char *buf,
>>>>>> size_t size);
>>>>>> @@ -113,6 +117,8 @@ static ssize_t (*write_op[])(struct file
>>>>>> 	[NFSD_Versions] = write_versions,
>>>>>> 	[NFSD_Ports] = write_ports,
>>>>>> 	[NFSD_MaxBlkSize] = write_maxblksize,
>>>>>> +	[NFSD_Stats_Enabled] = write_stats_enabled,
>>>>>> +	[NFSD_Stats_Prune_Period] = write_stats_prune_period,
>>>>>> #ifdef CONFIG_NFSD_V4
>>>>>> 	[NFSD_Leasetime] = write_leasetime,
>>>>>> 	[NFSD_RecoveryDir] = write_recoverydir,
>>>>>> @@ -1121,6 +1127,97 @@ static ssize_t write_maxblksize(struct f
>>>>>> 	return sprintf(buf, "%d\n", nfsd_max_blksize);
>>>>>> }
>>>>>>
>>>>>> +extern int nfsd_stats_enabled;
>>>>>> +
>>>>>> +/**
>>>>>> + * write_stats_enabled - Set or report whether per-client/
>>>>>> + *			 per-export stats are enabled.
>>>>>> + *
>>>>>> + * Input:
>>>>>> + *			buf:		ignored
>>>>>> + *			size:		zero
>>>>>> + *
>>>>>> + * OR
>>>>>> + *
>>>>>> + * Input:
>>>>>> + * 			buf:		C string containing an unsigned
>>>>>> + * 					integer value representing the new value
>>>>>> + *			size:		non-zero length of C string in @buf
>>>>>> + * Output:
>>>>>> + *	On success:	passed-in buffer filled with '\n'-terminated C
>>>>>> string
>>>>>> + *			containing numeric value of the current setting
>>>>>> + *			return code is the size in bytes of the string
>>>>>> + *	On error:	return code is zero or a negative errno value
>>>>>> + */
>>>>>> +static ssize_t write_stats_enabled(struct file *file, char *buf,
>>>>>> size_t size)
>>>>>> +{
>>>>>> +	char *mesg = buf;
>>>>>> +	if (size > 0) {
>>>>>> +		int enabled;
>>>>>> +		int rv = get_int(&mesg, &enabled);
>>>>>> +		if (rv)
>>>>>> +			return rv;
>>>>>> +		/* check `enabled' against allowed range */
>>>>>> +		if (enabled < 0 || enabled > 1)
>>>>>> +			return -EINVAL;
>>>>>> +		/*
>>>>>> +		 * We can change the enabled flag at any time without
>>>>>> +		 * locking.  All it controls is whether stats are
>>>>>> +		 * gathered for new incoming NFS calls.  Old gathered
>>>>>> +		 * stats still sit around in the hash tables until
>>>>>> +		 * naturally pruned.
>>>>>> +		 */
>>>>>> +		nfsd_stats_enabled = enabled;
>>>>>> +	}
>>>>>> +	return sprintf(buf, "%d\n", nfsd_stats_enabled);
>>>>>> +}
>>>>>> +
>>>>>> +extern int nfsd_stats_prune_period;
>>>>>> +
>>>>>> +/**
>>>>>> + * write_stats_prune_period - Set or report the period for  
>>>>>> pruning
>>>>>> + *			      old per-client/per-export stats entries,
>>>>>> + *			      in seconds.
>>>>>> + *
>>>>>> + * Input:
>>>>>> + *			buf:		ignored
>>>>>> + *			size:		zero
>>>>>> + *
>>>>>> + * OR
>>>>>> + *
>>>>>> + * Input:
>>>>>> + * 			buf:		C string containing an unsigned
>>>>>> + * 					integer value representing the new value
>>>>>> + *			size:		non-zero length of C string in @buf
>>>>>> + * Output:
>>>>>> + *	On success:	passed-in buffer filled with '\n'-terminated C
>>>>>> string
>>>>>> + *			containing numeric value of the current setting
>>>>>> + *			return code is the size in bytes of the string
>>>>>> + *	On error:	return code is zero or a negative errno value
>>>>>> + */
>>>>>
>>>>> Just an idle remark, don't worry about this for now, but: we might
>>>>> want
>>>>> to rein in this write_*() comment format a little some day.  A  
>>>>> lot of
>>>>> the content seems duplicated.
>>>
>>> I disagree.
>>
>> How?  The below seems to be an argument against *removing* the  
>> comments,
>> or removing information from them, neither of which I'd be in favor  
>> of.
>
> Then I misunderstood what you meant by "rein in".
>
> The apparent content duplication is because these functions are all  
> slightly different.  What we had before was a single description of the 
> return values at the top of the files that more or less fit each proc 
> file, but didn't precisely fit any but the oldest.
>
> (Responding a bit to Greg) IMO highlighting the differences instead  
> means a person trying to understand this interface has to read the whole 
> damn nfsctl.c file instead of looking at the one piece s/he is  
> interested in.  This is documentation, not code, so I think a little  
> text duplication is OK or even actually preferred.

Agreed, and I agree that nobody should have to read the whole file.  But
appropriate cross-references ("foo() behaves like bar() except...")
could prevent that.  As long as we don't require following too many such
references, I think the burden of tracking and following references would be
outweighed by the benefits of more concise descriptions.

--b.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-04-28 15:57             ` J. Bruce Fields
@ 2009-04-28 16:03               ` Chuck Lever
  2009-04-28 16:26                 ` J. Bruce Fields
  2009-04-29  1:45               ` Greg Banks
  1 sibling, 1 reply; 63+ messages in thread
From: Chuck Lever @ 2009-04-28 16:03 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Greg Banks, Linux NFS ML


On Apr 28, 2009, at 11:57 AM, J. Bruce Fields wrote:

> On Tue, Apr 28, 2009 at 11:37:09AM -0400, Chuck Lever wrote:
>>
>> On Apr 27, 2009, at 7:22 PM, J. Bruce Fields wrote:
>>
>>> On Mon, Apr 27, 2009 at 12:06:18PM -0400, Chuck Lever wrote:
>>>> On Apr 25, 2009, at 6:03 PM, J. Bruce Fields wrote:
>>>>> Pfft, did it again.
>>>>>
>>>>> --b.
>>>>>
>>>>> On Sat, Apr 25, 2009 at 05:57:45PM -0400, bfields wrote:
>>>>>> On Wed, Apr 01, 2009 at 07:28:03AM +1100, Greg Banks wrote:
>>>>>>> Add two control files to /proc/fs/nfsd:
>>>>>>>
>>>>>>> * "stats_enabled" can be used to disable or enable the gathering
>>>>>>> of per-client and per-export statistics in the server.
>>>>>>>
>>>>>>> * "stats_prune_period" can be used to set the period at
>>>>>>> which the pruning timer runs, in seconds.  Unused stats
>>>>>>> entries will survive at most twice that time.
>>>>>>>
>>>>>>> Signed-off-by: Greg Banks <gnb@sgi.com>
>>>>>>> ---
>>>>>>>
>>>>>>> fs/nfsd/nfsctl.c |   99
>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++
>>>>>>> 1 file changed, 99 insertions(+)
>>>>>>>
>>>>>>> Index: bfields/fs/nfsd/nfsctl.c
>>>>>>> =
>>>>>>> =
>>>>>>> =
>>>>>>> ================================================================
>>>>>>> --- bfields.orig/fs/nfsd/nfsctl.c
>>>>>>> +++ bfields/fs/nfsd/nfsctl.c
>>>>>>> @@ -64,6 +64,8 @@ enum {
>>>>>>> 	NFSD_Versions,
>>>>>>> 	NFSD_Ports,
>>>>>>> 	NFSD_MaxBlkSize,
>>>>>>> +	NFSD_Stats_Enabled,
>>>>>>> +	NFSD_Stats_Prune_Period,
>>>>>>> 	/*
>>>>>>> 	 * The below MUST come last.  Otherwise we leave a hole in
>>>>>>> nfsd_files[]
>>>>>>> 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
>>>>>>> @@ -92,6 +94,8 @@ static ssize_t write_pool_threads(struct
>>>>>>> static ssize_t write_versions(struct file *file, char *buf,  
>>>>>>> size_t
>>>>>>> size);
>>>>>>> static ssize_t write_ports(struct file *file, char *buf, size_t
>>>>>>> size);
>>>>>>> static ssize_t write_maxblksize(struct file *file, char *buf,
>>>>>>> size_t size);
>>>>>>> +static ssize_t write_stats_enabled(struct file *file, char  
>>>>>>> *buf,
>>>>>>> size_t size);
>>>>>>> +static ssize_t write_stats_prune_period(struct file *file, char
>>>>>>> *buf, size_t size);
>>>>>>> #ifdef CONFIG_NFSD_V4
>>>>>>> static ssize_t write_leasetime(struct file *file, char *buf,
>>>>>>> size_t size);
>>>>>>> static ssize_t write_recoverydir(struct file *file, char *buf,
>>>>>>> size_t size);
>>>>>>> @@ -113,6 +117,8 @@ static ssize_t (*write_op[])(struct file
>>>>>>> 	[NFSD_Versions] = write_versions,
>>>>>>> 	[NFSD_Ports] = write_ports,
>>>>>>> 	[NFSD_MaxBlkSize] = write_maxblksize,
>>>>>>> +	[NFSD_Stats_Enabled] = write_stats_enabled,
>>>>>>> +	[NFSD_Stats_Prune_Period] = write_stats_prune_period,
>>>>>>> #ifdef CONFIG_NFSD_V4
>>>>>>> 	[NFSD_Leasetime] = write_leasetime,
>>>>>>> 	[NFSD_RecoveryDir] = write_recoverydir,
>>>>>>> @@ -1121,6 +1127,97 @@ static ssize_t write_maxblksize(struct f
>>>>>>> 	return sprintf(buf, "%d\n", nfsd_max_blksize);
>>>>>>> }
>>>>>>>
>>>>>>> +extern int nfsd_stats_enabled;
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * write_stats_enabled - Set or report whether per-client/
>>>>>>> + *			 per-export stats are enabled.
>>>>>>> + *
>>>>>>> + * Input:
>>>>>>> + *			buf:		ignored
>>>>>>> + *			size:		zero
>>>>>>> + *
>>>>>>> + * OR
>>>>>>> + *
>>>>>>> + * Input:
>>>>>>> + * 			buf:		C string containing an unsigned
>>>>>>> + * 					integer value representing the new value
>>>>>>> + *			size:		non-zero length of C string in @buf
>>>>>>> + * Output:
>>>>>>> + *	On success:	passed-in buffer filled with '\n'-terminated C
>>>>>>> string
>>>>>>> + *			containing numeric value of the current setting
>>>>>>> + *			return code is the size in bytes of the string
>>>>>>> + *	On error:	return code is zero or a negative errno value
>>>>>>> + */
>>>>>>> +static ssize_t write_stats_enabled(struct file *file, char  
>>>>>>> *buf,
>>>>>>> size_t size)
>>>>>>> +{
>>>>>>> +	char *mesg = buf;
>>>>>>> +	if (size > 0) {
>>>>>>> +		int enabled;
>>>>>>> +		int rv = get_int(&mesg, &enabled);
>>>>>>> +		if (rv)
>>>>>>> +			return rv;
>>>>>>> +		/* check `enabled' against allowed range */
>>>>>>> +		if (enabled < 0 || enabled > 1)
>>>>>>> +			return -EINVAL;
>>>>>>> +		/*
>>>>>>> +		 * We can change the enabled flag at any time without
>>>>>>> +		 * locking.  All it controls is whether stats are
>>>>>>> +		 * gathered for new incoming NFS calls.  Old gathered
>>>>>>> +		 * stats still sit around in the hash tables until
>>>>>>> +		 * naturally pruned.
>>>>>>> +		 */
>>>>>>> +		nfsd_stats_enabled = enabled;
>>>>>>> +	}
>>>>>>> +	return sprintf(buf, "%d\n", nfsd_stats_enabled);
>>>>>>> +}
>>>>>>> +
>>>>>>> +extern int nfsd_stats_prune_period;
>>>>>>> +
>>>>>>> +/**
>>>>>>> + * write_stats_prune_period - Set or report the period for
>>>>>>> pruning
>>>>>>> + *			      old per-client/per-export stats entries,
>>>>>>> + *			      in seconds.
>>>>>>> + *
>>>>>>> + * Input:
>>>>>>> + *			buf:		ignored
>>>>>>> + *			size:		zero
>>>>>>> + *
>>>>>>> + * OR
>>>>>>> + *
>>>>>>> + * Input:
>>>>>>> + * 			buf:		C string containing an unsigned
>>>>>>> + * 					integer value representing the new value
>>>>>>> + *			size:		non-zero length of C string in @buf
>>>>>>> + * Output:
>>>>>>> + *	On success:	passed-in buffer filled with '\n'-terminated C
>>>>>>> string
>>>>>>> + *			containing numeric value of the current setting
>>>>>>> + *			return code is the size in bytes of the string
>>>>>>> + *	On error:	return code is zero or a negative errno value
>>>>>>> + */
>>>>>>
>>>>>> Just an idle remark, don't worry about this for now, but: we  
>>>>>> might
>>>>>> want
>>>>>> to rein in this write_*() comment format a little some day.  A
>>>>>> lot of
>>>>>> the content seems duplicated.
>>>>
>>>> I disagree.
>>>
>>> How?  The below seems to be an argument against *removing* the
>>> comments,
>>> or removing information from them, neither of which I'd be in favor
>>> of.
>>
>> Then I misunderstood what you meant by "rein in".
>>
>> The apparent content duplication is because these functions are all
>> slightly different.  What we had before was a single description of  
>> the
>> return values at the top of the files that more or less fit each proc
>> file, but didn't precisely fit any but the oldest.
>>
>> (Responding a bit to Greg) IMO highlighting the differences instead
>> means a person trying to understand this interface has to read the  
>> whole
>> damn nfsctl.c file instead of looking at the one piece s/he is
>> interested in.  This is documentation, not code, so I think a little
>> text duplication is OK or even actually preferred.
>
> Agreed, and I agree that nobody should have to read the whole file.   
> But
> appropriate cross-references ("foo() behaves like bar() except...")
> could prevent that.  As long as we don't require following too many  
> such
> references, I think the burden of tracking and following references  
> would be
> outweighed by the benefits of more concise descriptions.

I'm OK with more brevity in these comments as long as we recognize  
that the devil is in the details.  We should preserve the specificity  
of the API descriptions.

I think it was you who was complaining recently about knowing whether  
a \n or a \0 is expected at the end of some of these strings.  ;-)

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-04-28 16:03               ` Chuck Lever
@ 2009-04-28 16:26                 ` J. Bruce Fields
  0 siblings, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-04-28 16:26 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Greg Banks, Linux NFS ML

On Tue, Apr 28, 2009 at 12:03:49PM -0400, Chuck Lever wrote:
> I'm OK with more brevity in these comments as long as we recognize that 
> the devil is in the details.  We should preserve the specificity of the 
> API descriptions.
>
> I think it was you who was complaining recently about knowing whether a 
> \n or a \0 is expected at the end of some of these strings.  ;-)

Oh, absolutely, I do appreciate the fact that we've got some
documentation there.

--b.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 03/29] knfsd: add userspace controls for stats tables
  2009-04-28 15:57             ` J. Bruce Fields
  2009-04-28 16:03               ` Chuck Lever
@ 2009-04-29  1:45               ` Greg Banks
  1 sibling, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-04-29  1:45 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Chuck Lever, Linux NFS ML

On Wed, Apr 29, 2009 at 1:57 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Tue, Apr 28, 2009 at 11:37:09AM -0400, Chuck Lever wrote:
>>
>> (Responding a bit to Greg) IMO highlighting the differences instead
>> means a person trying to understand this interface has to read the whole
>> damn nfsctl.c file instead of looking at the one piece s/he is
>> interested in.  This is documentation, not code, so I think a little
>> text duplication is OK or even actually preferred.
>
> Agreed, and I agree that nobody should have to read the whole file.  But
> appropriate cross-references ("foo() behaves like bar() except...")
> could prevent that.

I think that would be a definite improvement.  I'll see what I can do.

-- 
Greg.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 14/29] knfsd: better hashing in the reply cache
  2009-03-31 20:28 ` [patch 14/29] knfsd: better hashing in the reply cache Greg Banks
@ 2009-05-08 22:01   ` J. Bruce Fields
  0 siblings, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-05-08 22:01 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:14AM +1100, Greg Banks wrote:
> Improve the hash function to handle clients which increment the XID
> in the unexpected byte order, by folding down the top bits of the XID.

I'm confused; maybe my arithmetic's wrong, but:

> 
> Signed-off-by: Greg Banks <gnb@sgi.com>
> ---
> 
>  fs/nfsd/nfscache.c |    5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> Index: bfields/fs/nfsd/nfscache.c
> ===================================================================
> --- bfields.orig/fs/nfsd/nfscache.c
> +++ bfields/fs/nfsd/nfscache.c
> @@ -35,12 +35,15 @@ static struct list_head 	lru_head;
>  static int			cache_disabled = 1;
>  
>  /*
> - * Calculate the hash index from an XID.
> + * Calculate the hash index from an XID.  Note, some clients increment
> + * their XIDs in host order, which can result in all the variation being
> + * in the top bits we see here.  So we fold those bits down.
>   */
>  static inline u32 request_hash(u32 xid)
>  {
>  	u32 h = xid;
>  	h ^= (xid >> 24);

This xor's the highest 8 bits into the lowest 8 bits; so the
higest-order bits are already being "folded down".

> +	h ^= ((xid & 0xff0000) >> 8);

This just folds those same bits down into the order 16-23 bits.  Does
that help?

>  	return h & (HASHSIZE-1);

Especially when we're only taking the bottom few (6, currently) bits
anyway?

--b.

>  }
>  
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 16/29] knfsd: use client IPv4 address in reply cache hash
  2009-03-31 20:28 ` [patch 16/29] knfsd: use client IPv4 address in reply cache hash Greg Banks
@ 2009-05-11 21:48   ` J. Bruce Fields
  0 siblings, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-05-11 21:48 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:16AM +1100, Greg Banks wrote:
> Use the IPv4 address of the client in the reply cache hash function.
> This can help improve the distribution of the hash function when the
> workload includes a large number of clients which mounted their NFS
> filesystems at nearly the same time and are doing similar sequences
> of NFS calls, a pattern seen with large compute clusters.
> 
> This code predates the IPv6 support in the current NFS server but
> should be harmless with IPv6 clients.
> 
> Signed-off-by: Greg Banks <gnb@sgi.com>
> ---
> 
>  fs/nfsd/nfscache.c |   27 +++++++++++++--------------
>  1 file changed, 13 insertions(+), 14 deletions(-)
> 
> Index: bfields/fs/nfsd/nfscache.c
> ===================================================================
> --- bfields.orig/fs/nfsd/nfscache.c
> +++ bfields/fs/nfsd/nfscache.c
> @@ -38,12 +38,17 @@ static int			cache_disabled = 1;
>   * Calculate the hash index from an XID.  Note, some clients increment
>   * their XIDs in host order, which can result in all the variation being
>   * in the top bits we see here.  So we fold those bits down.
> + *
> + * Experiment shows that using the Jenkins hash improves the spectral
> + * properties of this hash, but the CPU cost of calculating it outweighs
> + * the advantages.
>   */
> -static inline u32 request_hash(u32 xid)
> +static inline u32 request_hash(u32 xid, const struct sockaddr_in *sin)
>  {
>  	u32 h = xid;
>  	h ^= (xid >> 24);
>  	h ^= ((xid & 0xff0000) >> 8);
> +	h ^= sin->sin_addr.s_addr;

Tell me if I'm confused about the endianness: the variation is typically
in the low-order (host) end of the ip address, but the s_addr is stored
in network order, so the variation is in the high-order bits on a
little-endian machine, but &(HASHSIZE-1) is throwing out those bits.

>  	return h & (HASHSIZE-1);
>  }
>  

I'd've stuck the following in a separate patch as it's not really
related.

--b.

> @@ -114,16 +119,6 @@ lru_put_end(struct svc_cacherep *rp)
>  }
>  
>  /*
> - * Move a cache entry from one hash list to another
> - */
> -static void
> -hash_refile(struct svc_cacherep *rp)
> -{
> -	hlist_del_init(&rp->c_hash);
> -	hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
> -}
> -
> -/*
>   * Try to find an entry matching the current call in the cache. When none
>   * is found, we grab the oldest unlocked entry off the LRU list.
>   * Note that no operation within the loop may sleep.
> @@ -137,7 +132,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
>  	__be32			xid = rqstp->rq_xid;
>  	u32			proto =  rqstp->rq_prot,
>  				vers = rqstp->rq_vers,
> -				proc = rqstp->rq_proc;
> +				proc = rqstp->rq_proc,
> +				h;
>  	unsigned long		age;
>  	int rtn;
>  
> @@ -146,11 +142,12 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
>  		nfsdstats.rcnocache++;
>  		return RC_DOIT;
>  	}
> +	h = request_hash(xid, svc_addr_in(rqstp));
>  
>  	spin_lock(&cache_lock);
>  	rtn = RC_DOIT;
>  
> -	rh = &cache_hash[request_hash(xid)];
> +	rh = &cache_hash[h];
>  	hlist_for_each_entry(rp, hn, rh, c_hash) {
>  		if (rp->c_state != RC_UNUSED &&
>  		    xid == rp->c_xid && proc == rp->c_proc &&
> @@ -198,7 +195,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
>  	rp->c_vers = vers;
>  	rp->c_timestamp = jiffies;
>  
> -	hash_refile(rp);
> +	/* Move the cache entry from one hash list to another */
> +	hlist_del_init(&rp->c_hash);
> +	hlist_add_head(&rp->c_hash, cache_hash + h);
>  
>  	/* release any buffer */
>  	if (rp->c_type == RC_REPLBUFF) {
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 13/29] knfsd: reply cache cleanups
  2009-03-31 20:28 ` [patch 13/29] knfsd: reply cache cleanups Greg Banks
@ 2009-05-12 19:54   ` J. Bruce Fields
  0 siblings, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-05-12 19:54 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:13AM +1100, Greg Banks wrote:
> Make REQHASH() an inline function.  Rename hash_list to cache_hash.
> Fix an obsolete comment.

OK--applying for 2.6.31.--b.

> 
> Signed-off-by: Greg Banks <gnb@sgi.com>
> ---
> 
>  fs/nfsd/nfscache.c         |   29 +++++++++++++++++++----------
>  include/linux/nfsd/cache.h |    3 +--
>  2 files changed, 20 insertions(+), 12 deletions(-)
> 
> Index: bfields/fs/nfsd/nfscache.c
> ===================================================================
> --- bfields.orig/fs/nfsd/nfscache.c
> +++ bfields/fs/nfsd/nfscache.c
> @@ -29,15 +29,24 @@
>   */
>  #define CACHESIZE		1024
>  #define HASHSIZE		64
> -#define REQHASH(xid)		(((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
>  
> -static struct hlist_head *	hash_list;
> +static struct hlist_head *	cache_hash;
>  static struct list_head 	lru_head;
>  static int			cache_disabled = 1;
>  
> +/*
> + * Calculate the hash index from an XID.
> + */
> +static inline u32 request_hash(u32 xid)
> +{
> +	u32 h = xid;
> +	h ^= (xid >> 24);
> +	return h & (HASHSIZE-1);
> +}
> +
>  static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
>  
> -/* 
> +/*
>   * locking for the reply cache:
>   * A cache entry is "single use" if c_state == RC_INPROG
>   * Otherwise, it when accessing _prev or _next, the lock must be held.
> @@ -62,8 +71,8 @@ int nfsd_reply_cache_init(void)
>  		i--;
>  	}
>  
> -	hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
> -	if (!hash_list)
> +	cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
> +	if (!cache_hash)
>  		goto out_nomem;
>  
>  	cache_disabled = 0;
> @@ -88,8 +97,8 @@ void nfsd_reply_cache_shutdown(void)
>  
>  	cache_disabled = 1;
>  
> -	kfree (hash_list);
> -	hash_list = NULL;
> +	kfree (cache_hash);
> +	cache_hash = NULL;
>  }
>  
>  /*
> @@ -108,7 +117,7 @@ static void
>  hash_refile(struct svc_cacherep *rp)
>  {
>  	hlist_del_init(&rp->c_hash);
> -	hlist_add_head(&rp->c_hash, hash_list + REQHASH(rp->c_xid));
> +	hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
>  }
>  
>  /*
> @@ -138,7 +147,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
>  	spin_lock(&cache_lock);
>  	rtn = RC_DOIT;
>  
> -	rh = &hash_list[REQHASH(xid)];
> +	rh = &cache_hash[request_hash(xid)];
>  	hlist_for_each_entry(rp, hn, rh, c_hash) {
>  		if (rp->c_state != RC_UNUSED &&
>  		    xid == rp->c_xid && proc == rp->c_proc &&
> @@ -264,7 +273,7 @@ nfsd_cache_update(struct svc_rqst *rqstp
>  
>  	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
>  	len >>= 2;
> -	
> +
>  	/* Don't cache excessive amounts of data and XDR failures */
>  	if (!statp || len > (256 >> 2)) {
>  		rp->c_state = RC_UNUSED;
> Index: bfields/include/linux/nfsd/cache.h
> ===================================================================
> --- bfields.orig/include/linux/nfsd/cache.h
> +++ bfields/include/linux/nfsd/cache.h
> @@ -14,8 +14,7 @@
>  #include <linux/uio.h>
>  
>  /*
> - * Representation of a reply cache entry. The first two members *must*
> - * be hash_next and hash_prev.
> + * Representation of a reply cache entry.
>   */
>  struct svc_cacherep {
>  	struct hlist_node	c_hash;
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 15/29] knfsd: fix reply cache memory corruption
  2009-03-31 20:28 ` [patch 15/29] knfsd: fix reply cache memory corruption Greg Banks
@ 2009-05-12 19:55   ` J. Bruce Fields
  0 siblings, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-05-12 19:55 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:15AM +1100, Greg Banks wrote:
> Fix a regression in the reply cache introduced when the code was
> converted to use proper Linux lists.  When a new entry needs to be
> inserted, the case where all the entries are currently being used
> by threads is not correctly detected.  This can result in memory
> corruption and a crash.  In the current code this is an extremely
> unlikely corner case; it would require the machine to have 1024
> nfsd threads and all of them to be busy at the same time.  However,
> upcoming reply cache changes make this more likely; a crash due to
> this problem was actually observed in the field.

OK, that does indeed sound hard to reproduce as is, but may as well
apply it for 2.6.31 now.--b.

> 
> Signed-off-by: Greg Banks <gnb@sgi.com>
> ---
> 
>  fs/nfsd/nfscache.c |    4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> Index: bfields/fs/nfsd/nfscache.c
> ===================================================================
> --- bfields.orig/fs/nfsd/nfscache.c
> +++ bfields/fs/nfsd/nfscache.c
> @@ -177,8 +177,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
>  	}
>  	}
>  
> -	/* This should not happen */
> -	if (rp == NULL) {
> +	/* All entries on the LRU are in-progress. This should not happen */
> +	if (&rp->c_lru == &lru_head) {
>  		static int	complaints;
>  
>  		printk(KERN_WARNING "nfsd: all repcache entries locked!\n");
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 21/29] knfsd: remove unreported filehandle stats counters
  2009-03-31 20:28 ` [patch 21/29] knfsd: remove unreported filehandle stats counters Greg Banks
@ 2009-05-12 20:00   ` J. Bruce Fields
  0 siblings, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-05-12 20:00 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:21AM +1100, Greg Banks wrote:
> The file nfsfh.c contains two static variables nfsd_nr_verified and
> nfsd_nr_put.  These are counters which are incremented as a side
> effect of the fh_verify() fh_compose() and fh_put() operations,
> i.e. at least twice per NFS call for any non-trivial workload.
> Needless to say this makes the cacheline that contains them (and any
> other innocent victims) a very hot contention point indeed under high
> call-rate workloads on multiprocessor NFS server.  It also turns out
> that these counters are not used anywhere.  They're not reported to
> userspace, they're not used in logic, they're not even exported from
> the object file (let alone the module).  All they do is waste CPU time.
> 
> So this patch removes them.

Clearly right, thanks!  Applied for 2.6.31.

> 
> Tests on a 16 CPU Altix A4700 with 2 10gige Myricom cards, configured
> separately (no bonding).  Workload is 640 client threads doing directory
> traverals with random small reads, from server RAM.
> 
> Before
> ======
> 
> Kernel profile:
> 
>   %   cumulative   self              self     total
>  time   samples   samples    calls   1/call   1/call  name
>   6.05   2716.00  2716.00    30406     0.09     1.02  svc_process
>   4.44   4706.00  1990.00     1975     1.01     1.01  spin_unlock_irqrestore
>   3.72   6376.00  1670.00     1666     1.00     1.00  svc_export_put
>   3.41   7907.00  1531.00     1786     0.86     1.02  nfsd_ofcache_lookup
>   3.25   9363.00  1456.00    10965     0.13     1.01  nfsd_dispatch
>   3.10  10752.00  1389.00     1376     1.01     1.01  nfsd_cache_lookup
>   2.57  11907.00  1155.00     4517     0.26     1.03  svc_tcp_recvfrom
>   ...
>   2.21  15352.00  1003.00     1081     0.93     1.00  nfsd_choose_ofc  <----
>   ^^^^ 
> 
> Here the function nfsd_choose_ofc() reads a global variable
> which by accident happened to be located in the same cacheline as
> nfsd_nr_verified.

nfsd_choose_ofc is something not in mainline?  But, OK, it's interesting
in any case to see that this small a change is measurable.--b.

> 
> Call rate:
> 
> nullarbor:~ # pmdumptext nfs3.server.calls
> ...
> Thu Dec 13 00:15:27     184780.663
> Thu Dec 13 00:15:28     184885.881
> Thu Dec 13 00:15:29     184449.215
> Thu Dec 13 00:15:30     184971.058
> Thu Dec 13 00:15:31     185036.052
> Thu Dec 13 00:15:32     185250.475
> Thu Dec 13 00:15:33     184481.319
> Thu Dec 13 00:15:34     185225.737
> Thu Dec 13 00:15:35     185408.018
> Thu Dec 13 00:15:36     185335.764
> 
>  
> After
> =====
> 
> kernel profile:
> 
>   %   cumulative   self              self     total
>  time   samples   samples    calls   1/call   1/call  name
>   6.33   2813.00  2813.00    29979     0.09     1.01  svc_process
>   4.66   4883.00  2070.00     2065     1.00     1.00  spin_unlock_irqrestore
>   4.06   6687.00  1804.00     2182     0.83     1.00  nfsd_ofcache_lookup
>   3.20   8110.00  1423.00    10932     0.13     1.00  nfsd_dispatch
>   3.03   9456.00  1346.00     1343     1.00     1.00  nfsd_cache_lookup
>   2.62  10622.00  1166.00     4645     0.25     1.01  svc_tcp_recvfrom
> [...]
>   0.10  42586.00    44.00       74     0.59     1.00  nfsd_choose_ofc  <--- HA!!
>   ^^^^
> 
> Call rate:
> 
> nullarbor:~ # pmdumptext nfs3.server.calls
> ...
> Thu Dec 13 01:45:28     194677.118
> Thu Dec 13 01:45:29     193932.692
> Thu Dec 13 01:45:30     194294.364
> Thu Dec 13 01:45:31     194971.276
> Thu Dec 13 01:45:32     194111.207
> Thu Dec 13 01:45:33     194999.635
> Thu Dec 13 01:45:34     195312.594
> Thu Dec 13 01:45:35     195707.293
> Thu Dec 13 01:45:36     194610.353
> Thu Dec 13 01:45:37     195913.662
> Thu Dec 13 01:45:38     194808.675
> 
> i.e. about a 5.3% improvement in call rate.
> 
> Signed-off-by: Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
> Reviewed-by: David Chinner <dgc@sgi.com>
> ---
> 
>  fs/nfsd/nfsfh.c |    6 ------
>  1 file changed, 6 deletions(-)
> 
> Index: bfields/fs/nfsd/nfsfh.c
> ===================================================================
> --- bfields.orig/fs/nfsd/nfsfh.c
> +++ bfields/fs/nfsd/nfsfh.c
> @@ -27,9 +27,6 @@
>  #define NFSDDBG_FACILITY		NFSDDBG_FH
>  
>  
> -static int nfsd_nr_verified;
> -static int nfsd_nr_put;
> -
>  /*
>   * our acceptability function.
>   * if NOSUBTREECHECK, accept anything
> @@ -251,7 +248,6 @@ static __be32 nfsd_set_fh_dentry(struct 
>  
>  	fhp->fh_dentry = dentry;
>  	fhp->fh_export = exp;
> -	nfsd_nr_verified++;
>  	return 0;
>  out:
>  	exp_put(exp);
> @@ -552,7 +548,6 @@ fh_compose(struct svc_fh *fhp, struct sv
>  			return nfserr_opnotsupp;
>  	}
>  
> -	nfsd_nr_verified++;
>  	return 0;
>  }
>  
> @@ -609,7 +604,6 @@ fh_put(struct svc_fh *fhp)
>  		fhp->fh_pre_saved = 0;
>  		fhp->fh_post_saved = 0;
>  #endif
> -		nfsd_nr_put++;
>  	}
>  	if (exp) {
>  		cache_put(&exp->h, &svc_export_cache);
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 22/29] knfsd: make svc_authenticate() scale
  2009-03-31 20:28 ` [patch 22/29] knfsd: make svc_authenticate() scale Greg Banks
@ 2009-05-12 21:24   ` J. Bruce Fields
  0 siblings, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-05-12 21:24 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML

On Wed, Apr 01, 2009 at 07:28:22AM +1100, Greg Banks wrote:
> Replace the global spinlock which protects the table of registered
> RPC authentication flavours, with an RCU scheme.  The spinlock was
> taken by nfsd on every CPU for every NFS call, resulting in lots
> of spinlock contention and one very hot and bouncy cacheline.
> 
> Tests on a 16 CPU Altix A4700 with 2 10gige Myricom cards, configured
> separately (no bonding).  Workload is 640 client threads doing directory
> traversals with random small reads, from server RAM.
> 
> Before: 242 KIOPS, with an oprofile like:
>   %   cumulative   self              self     total
>  time   samples   samples    calls   1/call   1/call  name
>   5.01   2276.00  2276.00     2666     0.85     1.00  nfsd_ofcache_lookup
>   4.61   4370.00  2094.00     2092     1.00     1.00  ia64_spinlock_contention	<----
>   4.20   6279.00  1909.00     3141     0.61     0.78  svc_sock_enqueue
>   4.03   8108.00  1829.00     1824     1.00     1.00  spin_unlock_irqrestore
>   3.32   9618.00  1510.00     3588     0.42     1.00  spin_lock
> 
>              2090.00    0.00    2088/2092        spin_lock [22]
> [40]     4.6 2094.00    0.00    2092         ia64_spinlock_contention [40]
> 
>              1473.39 2039.32    3501/3588        svc_authenticate [21]
> [22]     7.9 1510.00 2090.00    3588         spin_lock [22]
> 
> After: 253 KIOPS, with a oprofile like:
>   %   cumulative   self              self     total
>  time   samples   samples    calls   1/call   1/call  name
>   5.20   2250.00  2250.00     2638     0.85     1.00  nfsd_ofcache_lookup
>   4.31   4117.00  1867.00     1863     1.00     1.00  spin_unlock_irqrestore
>   3.13   5470.00  1353.00     1447     0.94     1.01  svcauth_unix_set_client
>   2.79   6677.00  1207.00     1203     1.00     1.00  exp_readunlock
>   2.77   7875.00  1198.00     1186     1.01     1.01  svc_export_put
>   ...
>   0.03  43095.00    13.00       13     1.00     1.00  ia64_spinlock_contention	<----
> 
> Before anyone asks, going to a rwlock_t kept similar performance and
> just turned the time spent spinning on the lock to time spent waiting
> for the cacheline to bounce.
> 
> Signed-off-by: Greg Banks <gnb-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
> Reviewed-by: David Chinner <dgc@sgi.com>
> ---
> 
>  net/sunrpc/svcauth.c |   26 +++++++++++++++++---------
>  1 file changed, 17 insertions(+), 9 deletions(-)
> 
> Index: bfields/net/sunrpc/svcauth.c
> ===================================================================
> --- bfields.orig/net/sunrpc/svcauth.c
> +++ bfields/net/sunrpc/svcauth.c
> @@ -42,17 +42,19 @@ svc_authenticate(struct svc_rqst *rqstp,
>  	*authp = rpc_auth_ok;
>  
>  	flavor = svc_getnl(&rqstp->rq_arg.head[0]);
> +	if (flavor >= RPC_AUTH_MAXFLAVOR)
> +		return SVC_DENIED;
>  
>  	dprintk("svc: svc_authenticate (%d)\n", flavor);
>  
> -	spin_lock(&authtab_lock);
> -	if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor])
> -			|| !try_module_get(aops->owner)) {
> -		spin_unlock(&authtab_lock);
> +	rcu_read_lock();
> +	aops = rcu_dereference(authtab[flavor]);
> +	if (!aops || !try_module_get(aops->owner)) {
> +		rcu_read_unlock();
>  		*authp = rpc_autherr_badcred;
>  		return SVC_DENIED;
>  	}
> -	spin_unlock(&authtab_lock);
> +	rcu_read_unlock();
>  
>  	rqstp->rq_authop = aops;
>  	return aops->accept(rqstp, authp);
> @@ -87,9 +89,13 @@ int
>  svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
>  {
>  	int rv = -EINVAL;
> +
> +	if (flavor >= RPC_AUTH_MAXFLAVOR)
> +		return -EINVAL;
> +
>  	spin_lock(&authtab_lock);
> -	if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) {
> -		authtab[flavor] = aops;
> +	if (authtab[flavor] == NULL) {
> +		rcu_assign_pointer(authtab[flavor], aops);
>  		rv = 0;
>  	}
>  	spin_unlock(&authtab_lock);
> @@ -100,9 +106,11 @@ EXPORT_SYMBOL_GPL(svc_auth_register);
>  void
>  svc_auth_unregister(rpc_authflavor_t flavor)
>  {
> +	if (flavor >= RPC_AUTH_MAXFLAVOR)
> +		return;
> +
>  	spin_lock(&authtab_lock);
> -	if (flavor < RPC_AUTH_MAXFLAVOR)
> -		authtab[flavor] = NULL;
> +	rcu_assign_pointer(authtab[flavor], NULL);

Despite having seen Paul McKenney explain rcu quite well at least a
couple times, I still have to go look at the documentation for these
functions....  Fortunately the documentation is good.

Looks like rcu_assign_pointer() is just a memory barrier, and doesn't
ensure, e.g, that this assignment won't happen during a read-side
critical section.  Don't we need more than that?  Maybe something like:

	rcu_assign_pointer(authtab[flavor], NULL);
	synchronize_rcu();

to ensure the aops doesn't go away before someone's even had a chance to
call try_module_get() on it?

--b.

>  	spin_unlock(&authtab_lock);
>  }
>  EXPORT_SYMBOL_GPL(svc_auth_unregister);
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 18/29] knfsd: dynamically expand the reply cache
  2009-03-31 20:28 ` [patch 18/29] knfsd: dynamically expand the reply cache Greg Banks
@ 2009-05-26 18:57   ` J. Bruce Fields
  2009-05-26 19:04     ` J. Bruce Fields
  2009-05-26 21:24     ` Rob Gardner
  0 siblings, 2 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-05-26 18:57 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML, Robert Gardner

On Wed, Apr 01, 2009 at 07:28:18AM +1100, Greg Banks wrote:
> Allow the reply cache to expand under nonidempotent NFS call load.
> The current fixed limit on reply cache entries is actually so small
> as to make the reply cache utterly ineffectual (see the comment in
> nfscache.c for details).
> 
> This is a simpler version of an older more complicated patch which
> dynamically expanded the hash index using lazy rehashing.  Here we
> allocate a hash index which is too large for the initial size of the
> reply cache, and don't ever resize it.

Hm, just on the subject; looks like someone (cc'd) is presenting at the
Linux Symposium on reply cache improvements:

	http://www.linuxsymposium.org/2009/view_abstract.php?content_key=89

I assume they're talking about the linux server?  Will there be patches?

--b.

> 
> Signed-off-by: Greg Banks <gnb@sgi.com>
> ---
> 
>  fs/nfsd/nfscache.c |   76 ++++++++++++++++++++++++++++++++++++------
>  1 file changed, 66 insertions(+), 10 deletions(-)
> 
> Index: bfields/fs/nfsd/nfscache.c
> ===================================================================
> --- bfields.orig/fs/nfsd/nfscache.c
> +++ bfields/fs/nfsd/nfscache.c
> @@ -9,7 +9,7 @@
>   *
>   * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
>   *
> - * SMP lock splitting by Greg Banks <gnb@sgi.com>
> + * Dynamic expansion and SMP lock splitting by Greg Banks <gnb@sgi.com>
>   *     Copyright (c) 2005-2009 Silicon Graphics, Inc.
>   */
>  
> @@ -24,11 +24,21 @@
>  #include <linux/nfsd/nfsd.h>
>  #include <linux/nfsd/cache.h>
>  
> -/* Size of reply cache. Common values are:
> +/* Initial size of reply cache. Common values are:
>   * 4.3BSD:	128
>   * 4.4BSD:	256
>   * Solaris2:	1024
>   * DEC Unix:	512-4096
> + *
> + * All these values reflect network packet rates and NFS loads common
> + * somewhen around 1990, and are utterly inadequate for modern NFS
> + * servers.  To be at all effective the reply cache needs to hold all
> + * NFS calls seen by the server for at least a client RPC timeout period
> + * (typically 1.1 seconds), and to handle weird IP routing issues should
> + * really hold 120 seconds of traffic.  A modern NFS server can be
> + * fielding upwards of 10,000 calls per second, which means the default
> + * cache size of 1024 holds about 102 milliseconds' traffic, i.e. the
> + * default size is three orders of magnitude too small.
>   */
>  /* number of buckets used to manage LRU lists and cache locks (power of 2) */
>  #ifdef CONFIG_SMP
> @@ -36,14 +46,22 @@
>  #else
>  #define CACHE_NUM_BUCKETS	1
>  #endif
> -/* number of entries in all LRU lists (power of 2) */
> +/* initial number of entries in all LRU lists (power of 2) */
>  #define CACHE_SIZE		(1024)
> +/* largest possible number of entries in all LRU lists (power of 2) */
> +#define CACHE_MAX_SIZE		(16*1024*CACHE_NUM_BUCKETS)
>  /* largest possible number of entries in LRU per bucket */
> -#define CACHE_BUCKET_MAX_SIZE	(CACHE_SIZE/CACHE_NUM_BUCKETS)
> +#define CACHE_BUCKET_MAX_SIZE	(CACHE_MAX_SIZE/CACHE_NUM_BUCKETS)
> +/* number of entries each bucket will expand by */
> +#define CACHE_BUCKET_INCREMENT	(1024/CACHE_NUM_BUCKETS)
>  /* log2 of largest desired hash chain length */
>  #define MAX_CHAIN_ORDER		2
>  /* initial and maximum size of the per-bucket hash table */
> -#define HASHSIZE		((CACHE_SIZE>>MAX_CHAIN_ORDER)/CACHE_NUM_BUCKETS)
> +#define HASHSIZE		((CACHE_MAX_SIZE>>MAX_CHAIN_ORDER)/CACHE_NUM_BUCKETS)
> +/* the cache attempts to expand if an entry younger than this is evicted */
> +#define CACHE_THRESH_AGE	(11 * HZ / 10)  /* in jiffies */
> +/* parameters for rate limiting cache expansion */
> +#define CACHE_RATE_JIFFIES	(HZ/2)
>  
>  /*
>   * locking for the reply cache:
> @@ -63,6 +81,9 @@ struct svc_cache_bucket
>  	struct list_head lru;
>  	unsigned int size;
>  	struct hlist_head *hash;
> +	/* parameters for expand rate limiting */
> +	unsigned long last;
> +	unsigned long nhits;
>  } ____cacheline_aligned_in_smp;
>  
>  static struct svc_cache_bucket	cache_buckets[CACHE_NUM_BUCKETS];
> @@ -90,18 +111,18 @@ static inline u32 request_hash(u32 xid, 
>  static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
>  
>  /*
> - * Initialise the reply cache data structures.
> + * Expand (or initialise) the reply cache data structures.
>   * Called without cache_lock, uses it internally.  Returns
>   * 0 on success, an error otherwise.
>   */
> -static int nfsd_cache_bucket_init(struct svc_cache_bucket *b, unsigned int num)
> +static int nfsd_cache_bucket_expand(struct svc_cache_bucket *b, unsigned int increment)
>  {
>  	struct svc_cacherep *rp;
>  	unsigned int i;
>  	LIST_HEAD(lru);
>  
>  	/* allocate new entries without the lock, keep them on their own list */
> -	i = num;
> +	i = increment;
>  	while (i) {
>  		rp = kmalloc(sizeof(*rp), GFP_KERNEL);
>  		if (!rp)
> @@ -116,7 +137,7 @@ static int nfsd_cache_bucket_init(struct
>  	/* add the new entries */
>  	spin_lock(&b->lock);
>  
> -	b->size = num;
> +	b->size += increment;
>  	list_splice(&lru, &b->lru);
>  
>  	spin_unlock(&b->lock);
> @@ -142,7 +163,7 @@ int nfsd_reply_cache_init(void)
>  
>  		INIT_LIST_HEAD(&b->lru);
>  		spin_lock_init(&b->lock);
> -		if (nfsd_cache_bucket_init(b, CACHE_SIZE/CACHE_NUM_BUCKETS))
> +		if (nfsd_cache_bucket_expand(b, CACHE_SIZE/CACHE_NUM_BUCKETS))
>  			goto out_nomem;
>  		b->hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
>  		if (!b->hash)
> @@ -189,6 +210,26 @@ static inline void lru_put_end(struct sv
>  }
>  
>  /*
> + * Decide whether it is time to expand the cache.  Returns 1 iff
> + * the cache is to be expanded.  Called with bucket lock held.
> + */
> +static int nfsd_cache_expand_ratelimit(struct svc_cache_bucket *b)
> +{
> +	unsigned long now = jiffies;
> +
> +	b->nhits++;
> +	if (b->last == 0) {
> +		b->last = now;
> +	} else if ((now - b->last) > CACHE_RATE_JIFFIES &&
> +		   b->nhits > (b->size >> 4)) {
> +		b->nhits = 0;
> +		b->last = now;
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +/*
>   * Try to find an entry matching the current call in the cache. When none
>   * is found, we grab the oldest unlocked entry off the LRU list.
>   * Note that no operation within the loop may sleep.
> @@ -207,6 +248,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
>  	struct svc_cache_bucket *b;
>  	unsigned long		age;
>  	int			rtn;
> +	int			expand = 0;
>  
>  	rqstp->rq_cacherep = NULL;
>  	if (cache_disabled || type == RC_NOCACHE) {
> @@ -259,6 +301,18 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
>  		goto out;
>  	}
>  
> +	if (rp->c_state != RC_UNUSED) {
> +		/* reusing an existing cache entry */
> +		age = jiffies - rp->c_timestamp;
> +		if (age < CACHE_THRESH_AGE &&
> +		    b->size < CACHE_BUCKET_MAX_SIZE &&
> +		    nfsd_cache_expand_ratelimit(b)) {
> +			expand = CACHE_BUCKET_INCREMENT;
> +			if (b->size + expand > CACHE_BUCKET_MAX_SIZE)
> +				expand = CACHE_BUCKET_MAX_SIZE - b->size;
> +		}
> +	}
> +
>  	rqstp->rq_cacherep = rp;
>  	rp->c_state = RC_INPROG;
>  	rp->c_xid = xid;
> @@ -280,6 +334,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
>  	rp->c_type = RC_NOCACHE;
>   out:
>  	spin_unlock(&b->lock);
> +	if (expand)
> +		nfsd_cache_bucket_expand(b, expand);
>  	return rtn;
>  
>  found_entry:
> 
> --
> Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 18/29] knfsd: dynamically expand the reply cache
  2009-05-26 18:57   ` J. Bruce Fields
@ 2009-05-26 19:04     ` J. Bruce Fields
  2009-05-26 21:24     ` Rob Gardner
  1 sibling, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-05-26 19:04 UTC (permalink / raw)
  To: Greg Banks; +Cc: Linux NFS ML, Robert Gardner

(With Greg's address corrected.)

On Tue, May 26, 2009 at 02:57:01PM -0400, bfields wrote:
> On Wed, Apr 01, 2009 at 07:28:18AM +1100, Greg Banks wrote:
> > Allow the reply cache to expand under nonidempotent NFS call load.
> > The current fixed limit on reply cache entries is actually so small
> > as to make the reply cache utterly ineffectual (see the comment in
> > nfscache.c for details).
> > 
> > This is a simpler version of an older more complicated patch which
> > dynamically expanded the hash index using lazy rehashing.  Here we
> > allocate a hash index which is too large for the initial size of the
> > reply cache, and don't ever resize it.
> 
> Hm, just on the subject; looks like someone (cc'd) is presenting at the
> Linux Symposium on reply cache improvements:
> 
> 	http://www.linuxsymposium.org/2009/view_abstract.php?content_key=89
> 
> I assume they're talking about the linux server?  Will there be patches?
> 
> --b.
> 
> > 
> > Signed-off-by: Greg Banks <gnb@sgi.com>
> > ---
> > 
> >  fs/nfsd/nfscache.c |   76 ++++++++++++++++++++++++++++++++++++------
> >  1 file changed, 66 insertions(+), 10 deletions(-)
> > 
> > Index: bfields/fs/nfsd/nfscache.c
> > ===================================================================
> > --- bfields.orig/fs/nfsd/nfscache.c
> > +++ bfields/fs/nfsd/nfscache.c
> > @@ -9,7 +9,7 @@
> >   *
> >   * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
> >   *
> > - * SMP lock splitting by Greg Banks <gnb@sgi.com>
> > + * Dynamic expansion and SMP lock splitting by Greg Banks <gnb@sgi.com>
> >   *     Copyright (c) 2005-2009 Silicon Graphics, Inc.
> >   */
> >  
> > @@ -24,11 +24,21 @@
> >  #include <linux/nfsd/nfsd.h>
> >  #include <linux/nfsd/cache.h>
> >  
> > -/* Size of reply cache. Common values are:
> > +/* Initial size of reply cache. Common values are:
> >   * 4.3BSD:	128
> >   * 4.4BSD:	256
> >   * Solaris2:	1024
> >   * DEC Unix:	512-4096
> > + *
> > + * All these values reflect network packet rates and NFS loads common
> > + * somewhen around 1990, and are utterly inadequate for modern NFS
> > + * servers.  To be at all effective the reply cache needs to hold all
> > + * NFS calls seen by the server for at least a client RPC timeout period
> > + * (typically 1.1 seconds), and to handle weird IP routing issues should
> > + * really hold 120 seconds of traffic.  A modern NFS server can be
> > + * fielding upwards of 10,000 calls per second, which means the default
> > + * cache size of 1024 holds about 102 milliseconds' traffic, i.e. the
> > + * default size is three orders of magnitude too small.
> >   */
> >  /* number of buckets used to manage LRU lists and cache locks (power of 2) */
> >  #ifdef CONFIG_SMP
> > @@ -36,14 +46,22 @@
> >  #else
> >  #define CACHE_NUM_BUCKETS	1
> >  #endif
> > -/* number of entries in all LRU lists (power of 2) */
> > +/* initial number of entries in all LRU lists (power of 2) */
> >  #define CACHE_SIZE		(1024)
> > +/* largest possible number of entries in all LRU lists (power of 2) */
> > +#define CACHE_MAX_SIZE		(16*1024*CACHE_NUM_BUCKETS)
> >  /* largest possible number of entries in LRU per bucket */
> > -#define CACHE_BUCKET_MAX_SIZE	(CACHE_SIZE/CACHE_NUM_BUCKETS)
> > +#define CACHE_BUCKET_MAX_SIZE	(CACHE_MAX_SIZE/CACHE_NUM_BUCKETS)
> > +/* number of entries each bucket will expand by */
> > +#define CACHE_BUCKET_INCREMENT	(1024/CACHE_NUM_BUCKETS)
> >  /* log2 of largest desired hash chain length */
> >  #define MAX_CHAIN_ORDER		2
> >  /* initial and maximum size of the per-bucket hash table */
> > -#define HASHSIZE		((CACHE_SIZE>>MAX_CHAIN_ORDER)/CACHE_NUM_BUCKETS)
> > +#define HASHSIZE		((CACHE_MAX_SIZE>>MAX_CHAIN_ORDER)/CACHE_NUM_BUCKETS)
> > +/* the cache attempts to expand if an entry younger than this is evicted */
> > +#define CACHE_THRESH_AGE	(11 * HZ / 10)  /* in jiffies */
> > +/* parameters for rate limiting cache expansion */
> > +#define CACHE_RATE_JIFFIES	(HZ/2)
> >  
> >  /*
> >   * locking for the reply cache:
> > @@ -63,6 +81,9 @@ struct svc_cache_bucket
> >  	struct list_head lru;
> >  	unsigned int size;
> >  	struct hlist_head *hash;
> > +	/* parameters for expand rate limiting */
> > +	unsigned long last;
> > +	unsigned long nhits;
> >  } ____cacheline_aligned_in_smp;
> >  
> >  static struct svc_cache_bucket	cache_buckets[CACHE_NUM_BUCKETS];
> > @@ -90,18 +111,18 @@ static inline u32 request_hash(u32 xid, 
> >  static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
> >  
> >  /*
> > - * Initialise the reply cache data structures.
> > + * Expand (or initialise) the reply cache data structures.
> >   * Called without cache_lock, uses it internally.  Returns
> >   * 0 on success, an error otherwise.
> >   */
> > -static int nfsd_cache_bucket_init(struct svc_cache_bucket *b, unsigned int num)
> > +static int nfsd_cache_bucket_expand(struct svc_cache_bucket *b, unsigned int increment)
> >  {
> >  	struct svc_cacherep *rp;
> >  	unsigned int i;
> >  	LIST_HEAD(lru);
> >  
> >  	/* allocate new entries without the lock, keep them on their own list */
> > -	i = num;
> > +	i = increment;
> >  	while (i) {
> >  		rp = kmalloc(sizeof(*rp), GFP_KERNEL);
> >  		if (!rp)
> > @@ -116,7 +137,7 @@ static int nfsd_cache_bucket_init(struct
> >  	/* add the new entries */
> >  	spin_lock(&b->lock);
> >  
> > -	b->size = num;
> > +	b->size += increment;
> >  	list_splice(&lru, &b->lru);
> >  
> >  	spin_unlock(&b->lock);
> > @@ -142,7 +163,7 @@ int nfsd_reply_cache_init(void)
> >  
> >  		INIT_LIST_HEAD(&b->lru);
> >  		spin_lock_init(&b->lock);
> > -		if (nfsd_cache_bucket_init(b, CACHE_SIZE/CACHE_NUM_BUCKETS))
> > +		if (nfsd_cache_bucket_expand(b, CACHE_SIZE/CACHE_NUM_BUCKETS))
> >  			goto out_nomem;
> >  		b->hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
> >  		if (!b->hash)
> > @@ -189,6 +210,26 @@ static inline void lru_put_end(struct sv
> >  }
> >  
> >  /*
> > + * Decide whether it is time to expand the cache.  Returns 1 iff
> > + * the cache is to be expanded.  Called with bucket lock held.
> > + */
> > +static int nfsd_cache_expand_ratelimit(struct svc_cache_bucket *b)
> > +{
> > +	unsigned long now = jiffies;
> > +
> > +	b->nhits++;
> > +	if (b->last == 0) {
> > +		b->last = now;
> > +	} else if ((now - b->last) > CACHE_RATE_JIFFIES &&
> > +		   b->nhits > (b->size >> 4)) {
> > +		b->nhits = 0;
> > +		b->last = now;
> > +		return 1;
> > +	}
> > +	return 0;
> > +}
> > +
> > +/*
> >   * Try to find an entry matching the current call in the cache. When none
> >   * is found, we grab the oldest unlocked entry off the LRU list.
> >   * Note that no operation within the loop may sleep.
> > @@ -207,6 +248,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
> >  	struct svc_cache_bucket *b;
> >  	unsigned long		age;
> >  	int			rtn;
> > +	int			expand = 0;
> >  
> >  	rqstp->rq_cacherep = NULL;
> >  	if (cache_disabled || type == RC_NOCACHE) {
> > @@ -259,6 +301,18 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
> >  		goto out;
> >  	}
> >  
> > +	if (rp->c_state != RC_UNUSED) {
> > +		/* reusing an existing cache entry */
> > +		age = jiffies - rp->c_timestamp;
> > +		if (age < CACHE_THRESH_AGE &&
> > +		    b->size < CACHE_BUCKET_MAX_SIZE &&
> > +		    nfsd_cache_expand_ratelimit(b)) {
> > +			expand = CACHE_BUCKET_INCREMENT;
> > +			if (b->size + expand > CACHE_BUCKET_MAX_SIZE)
> > +				expand = CACHE_BUCKET_MAX_SIZE - b->size;
> > +		}
> > +	}
> > +
> >  	rqstp->rq_cacherep = rp;
> >  	rp->c_state = RC_INPROG;
> >  	rp->c_xid = xid;
> > @@ -280,6 +334,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp
> >  	rp->c_type = RC_NOCACHE;
> >   out:
> >  	spin_unlock(&b->lock);
> > +	if (expand)
> > +		nfsd_cache_bucket_expand(b, expand);
> >  	return rtn;
> >  
> >  found_entry:
> > 
> > --
> > Greg

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 18/29] knfsd: dynamically expand the reply cache
  2009-05-26 18:57   ` J. Bruce Fields
  2009-05-26 19:04     ` J. Bruce Fields
@ 2009-05-26 21:24     ` Rob Gardner
  2009-05-26 21:52       ` J. Bruce Fields
  2009-05-27  0:28       ` Greg Banks
  1 sibling, 2 replies; 63+ messages in thread
From: Rob Gardner @ 2009-05-26 21:24 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Linux NFS ML

J. Bruce Fields wrote:
>
> Hm, just on the subject; looks like someone (cc'd) is presenting at the
> Linux Symposium on reply cache improvements:
>
> 	http://www.linuxsymposium.org/2009/view_abstract.php?content_key=89
>
> I assume they're talking about the linux server?  Will there be patches?

Yes, the paper is referring to the linux server. I'd like to submit 
patches someday, but right now we're still working with an ancient 
kernel and it'll be a while until we get more current. In the meantime, 
the final paper is due at the end of this week and I'd love for a couple 
of 'experts' to review it. Anyone interested?

Rob Gardner


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 18/29] knfsd: dynamically expand the reply cache
  2009-05-26 21:24     ` Rob Gardner
@ 2009-05-26 21:52       ` J. Bruce Fields
  2009-05-27  0:28       ` Greg Banks
  1 sibling, 0 replies; 63+ messages in thread
From: J. Bruce Fields @ 2009-05-26 21:52 UTC (permalink / raw)
  To: Rob Gardner; +Cc: Linux NFS ML

On Tue, May 26, 2009 at 03:24:18PM -0600, Rob Gardner wrote:
> J. Bruce Fields wrote:
>>
>> Hm, just on the subject; looks like someone (cc'd) is presenting at the
>> Linux Symposium on reply cache improvements:
>>
>> 	http://www.linuxsymposium.org/2009/view_abstract.php?content_key=89
>>
>> I assume they're talking about the linux server?  Will there be patches?
>
> Yes, the paper is referring to the linux server. I'd like to submit  
> patches someday, but right now we're still working with an ancient  
> kernel and it'll be a while until we get more current. In the meantime,  
> the final paper is due at the end of this week and I'd love for a couple  
> of 'experts' to review it. Anyone interested?

I'd certainly be interested, though I can't promise I'll have time for a
conscientious review.

--b.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [patch 18/29] knfsd: dynamically expand the reply cache
  2009-05-26 21:24     ` Rob Gardner
  2009-05-26 21:52       ` J. Bruce Fields
@ 2009-05-27  0:28       ` Greg Banks
  1 sibling, 0 replies; 63+ messages in thread
From: Greg Banks @ 2009-05-27  0:28 UTC (permalink / raw)
  To: Rob Gardner; +Cc: J. Bruce Fields, Linux NFS ML

On Wed, May 27, 2009 at 7:24 AM, Rob Gardner <rob.gardner@hp.com> wrote:
> J. Bruce Fields wrote:
>>
>> Hm, just on the subject; looks like someone (cc'd) is presenting at the
>> Linux Symposium on reply cache improvements:
>>
>[...] In the meantime, the final paper is
> due at the end of this week and I'd love for a couple of 'experts' to review
> it. Anyone interested?
>

Yes please.  The abstract sounds very interesting.

-- 
Greg.

^ permalink raw reply	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2009-05-27  0:28 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-31 20:28 [patch 00/29] SGI enhancedNFS patches Greg Banks
2009-03-31 20:28 ` [patch 01/29] knfsd: Add infrastructure for measuring RPC service times Greg Banks
2009-04-25  2:13   ` J. Bruce Fields
2009-04-25  2:14     ` J. Bruce Fields
2009-04-25  2:52     ` Greg Banks
2009-03-31 20:28 ` [patch 02/29] knfsd: Add stats table infrastructure Greg Banks
2009-04-25  3:56   ` J. Bruce Fields
2009-04-26  4:12     ` Greg Banks
2009-03-31 20:28 ` [patch 03/29] knfsd: add userspace controls for stats tables Greg Banks
2009-04-25 21:57   ` J. Bruce Fields
2009-04-25 22:03     ` J. Bruce Fields
2009-04-27 16:06       ` Chuck Lever
2009-04-27 23:22         ` J. Bruce Fields
2009-04-28 15:37           ` Chuck Lever
2009-04-28 15:57             ` J. Bruce Fields
2009-04-28 16:03               ` Chuck Lever
2009-04-28 16:26                 ` J. Bruce Fields
2009-04-29  1:45               ` Greg Banks
     [not found]         ` <ac442c870904271827w6041a67ew82fe36a843beeac3@mail.gmail.com>
     [not found]           ` <ac442c870904271827w6041a67ew82fe36a843beeac3-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-04-28  1:31             ` Greg Banks
2009-04-26  4:14     ` Greg Banks
2009-03-31 20:28 ` [patch 04/29] knfsd: Add stats updating API Greg Banks
2009-03-31 20:28 ` [patch 05/29] knfsd: Infrastructure for providing stats to userspace Greg Banks
2009-04-01  0:28   ` J. Bruce Fields
2009-04-01  3:43     ` Greg Banks
2009-03-31 20:28 ` [patch 06/29] knfsd: Gather per-export stats Greg Banks
2009-03-31 20:28 ` [patch 07/29] knfsd: Prefetch the per-export stats entry Greg Banks
2009-03-31 20:28 ` [patch 08/29] knfsd: Gather per-client stats Greg Banks
2009-03-31 20:28 ` [patch 09/29] knfsd: Cache per-client stats entry on TCP transports Greg Banks
2009-03-31 20:28 ` [patch 10/29] knfsd: Update per-client & per-export stats from NFSv3 Greg Banks
2009-03-31 20:28 ` [patch 11/29] knfsd: Update per-client & per-export stats from NFSv2 Greg Banks
2009-03-31 20:28 ` [patch 12/29] knfsd: Update per-client & per-export stats from NFSv4 Greg Banks
2009-03-31 20:28 ` [patch 13/29] knfsd: reply cache cleanups Greg Banks
2009-05-12 19:54   ` J. Bruce Fields
2009-03-31 20:28 ` [patch 14/29] knfsd: better hashing in the reply cache Greg Banks
2009-05-08 22:01   ` J. Bruce Fields
2009-03-31 20:28 ` [patch 15/29] knfsd: fix reply cache memory corruption Greg Banks
2009-05-12 19:55   ` J. Bruce Fields
2009-03-31 20:28 ` [patch 16/29] knfsd: use client IPv4 address in reply cache hash Greg Banks
2009-05-11 21:48   ` J. Bruce Fields
2009-03-31 20:28 ` [patch 17/29] knfsd: make the reply cache SMP-friendly Greg Banks
2009-03-31 20:28 ` [patch 18/29] knfsd: dynamically expand the reply cache Greg Banks
2009-05-26 18:57   ` J. Bruce Fields
2009-05-26 19:04     ` J. Bruce Fields
2009-05-26 21:24     ` Rob Gardner
2009-05-26 21:52       ` J. Bruce Fields
2009-05-27  0:28       ` Greg Banks
2009-03-31 20:28 ` [patch 19/29] knfsd: faster probing in " Greg Banks
2009-03-31 20:28 ` [patch 20/29] knfsd: add extended reply cache stats Greg Banks
2009-03-31 20:28 ` [patch 21/29] knfsd: remove unreported filehandle stats counters Greg Banks
2009-05-12 20:00   ` J. Bruce Fields
2009-03-31 20:28 ` [patch 22/29] knfsd: make svc_authenticate() scale Greg Banks
2009-05-12 21:24   ` J. Bruce Fields
2009-03-31 20:28 ` [patch 23/29] knfsd: introduce SVC_INC_STAT Greg Banks
2009-03-31 20:28 ` [patch 24/29] knfsd: remove the program field from struct svc_stat Greg Banks
2009-03-31 20:28 ` [patch 25/29] knfsd: allocate svc_serv.sv_stats dynamically Greg Banks
2009-03-31 20:28 ` [patch 26/29] knfsd: make svc_serv.sv_stats per-CPU Greg Banks
2009-03-31 20:28 ` [patch 27/29] knfsd: move hot procedure count field out of svc_procedure Greg Banks
2009-03-31 20:28 ` [patch 28/29] knfsd: introduce NFSD_INC_STAT() Greg Banks
2009-03-31 20:28 ` [patch 29/29] knfsd: make nfsdstats per-CPU Greg Banks
2009-04-01  0:23 ` [patch 00/29] SGI enhancedNFS patches J. Bruce Fields
2009-04-01  3:32   ` Greg Banks
     [not found]     ` <ac442c870903312032t34630c6dvdbb644cb510f8079-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-04-01  6:34       ` Jeff Garzik
2009-04-01  6:41         ` Greg Banks

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.