* [PATCH][RFC] NFS: Improving the access cache
@ 2006-04-26  1:14 Steve Dickson
  2006-04-26  1:31 ` Matthew Wilcox
                   ` (3 more replies)
  0 siblings, 4 replies; 36+ messages in thread
From: Steve Dickson @ 2006-04-26  1:14 UTC (permalink / raw)
  To: nfs; +Cc: linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 743 bytes --]

Currently the NFS client caches ACCESS information on a per-uid basis,
which falls apart when different processes with different uids
consistently access the same directory. The end result is a storm of
needless ACCESS calls...

The attached patch uses a hash table to store the nfs_access_entry
entries, which causes an ACCESS request to be sent only when the
attributes time out. The table is indexed by the sum of the
nfs_inode pointer and the cr_uid in the cred structure, which should
spread things out nicely for some decent scalability (although the
locking scheme may need to be reworked a bit). The table has 256 entries
of struct list_head, giving it a total size of 2k.

The patch is based on Trond's GIT tree...

Comments?

steved.

[-- Attachment #2: linux-2.6.17.rc2-nfs-accesscache-hash.patch --]
[-- Type: text/x-patch, Size: 6231 bytes --]

This patch improves the caching of ACCESS information by storing
the information in a hash table. The patch greatly decreases the
number of ACCESS calls when processes with different uids access
the same directory.

Signed-off-by: Steve Dickson <steved@redhat.com>
----------------------------------------------------

--- mynfs-2.6/fs/nfs/inode.c.acc	2006-04-21 15:02:07.000000000 -0400
+++ mynfs-2.6/fs/nfs/inode.c	2006-04-25 19:12:05.000000000 -0400
@@ -74,6 +74,9 @@ static void nfs_zap_acl_cache(struct ino
 
 static struct rpc_program	nfs_program;
 
+extern void nfs_zap_access_cache(struct inode *);
+extern void nfs_init_access_cache(void);
+
 static struct super_operations nfs_sops = { 
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
@@ -170,14 +173,11 @@ static void
 nfs_clear_inode(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct rpc_cred *cred;
 
 	nfs_wb_all(inode);
 	BUG_ON (!list_empty(&nfsi->open_files));
 	nfs_zap_acl_cache(inode);
-	cred = nfsi->cache_access.cred;
-	if (cred)
-		put_rpccred(cred);
+	nfs_zap_access_cache(inode);
 	BUG_ON(atomic_read(&nfsi->data_updates) != 0);
 }
 
@@ -940,7 +940,6 @@ nfs_fhget(struct super_block *sb, struct
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = jiffies;
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-		nfsi->cache_access.cred = NULL;
 
 		unlock_new_inode(inode);
 	} else
@@ -2892,6 +2891,8 @@ static int __init init_nfs_fs(void)
 {
 	int err;
 
+	nfs_init_access_cache();
+
 	err = nfs_init_nfspagecache();
 	if (err)
 		goto out4;
--- mynfs-2.6/fs/nfs/dir.c.acc	2006-04-21 15:02:07.000000000 -0400
+++ mynfs-2.6/fs/nfs/dir.c	2006-04-25 19:13:01.000000000 -0400
@@ -54,6 +54,16 @@ static int nfs_rename(struct inode *, st
 static int nfs_fsync_dir(struct file *, struct dentry *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 
+/*
+ * access cache
+ */
+#define ACCESS_HASH_BITS             8
+#define ACCESS_HASH_SIZE             (1 << ACCESS_HASH_BITS)
+#define ACCESS_HASH_MASK             (ACCESS_HASH_SIZE - 1)
+#define access_hashval(iptr, id) ((((uid_t)iptr) + (id)) & ACCESS_HASH_MASK)
+static struct list_head	access_hashtbl[ACCESS_HASH_SIZE];
+static spinlock_t access_hashlock;
+
 const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
 	.read		= generic_read_dir,
@@ -1635,36 +1645,102 @@ out:
 	unlock_kernel();
 	return error;
 }
+void nfs_init_access_cache(void)
+{
+	int i;
 
-int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+	for (i=0; i < ACCESS_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&access_hashtbl[i]);
+
+	spin_lock_init(&access_hashlock);
+
+	return;
+}
+void nfs_zap_access_cache(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_access_entry *cache = &nfsi->cache_access;
+	struct nfs_access_entry *cache;
+	struct list_head *head, *pos, *next;
+	int i;
+
+	spin_lock(&access_hashlock);
+	for (i=0; i < ACCESS_HASH_SIZE; i++) {
+		head = &access_hashtbl[access_hashval(nfsi, i)];
+		if (list_empty(head))
+			continue;
 
-	if (cache->cred != cred
-			|| time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
-			|| (nfsi->cache_validity & NFS_INO_INVALID_ACCESS))
-		return -ENOENT;
-	memcpy(res, cache, sizeof(*res));
-	return 0;
+		list_for_each_safe(pos, next, head) {
+			cache = list_entry(pos, struct nfs_access_entry, acc_list);
+			if (cache->id != (void *)nfsi)
+				continue;
+
+			list_del(&cache->acc_list);
+			put_rpccred(cache->cred);
+			kfree(cache);
+		}
+	}
+	spin_unlock(&access_hashlock);
+	return;
 }
 
-void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_access_entry *cache = &nfsi->cache_access;
+	int invalid = (nfsi->cache_validity & NFS_INO_INVALID_ACCESS);
+	struct nfs_access_entry *cache = NULL;
+	struct list_head *head;
+	int expired;
+
+	spin_lock(&access_hashlock);
+	head = &access_hashtbl[access_hashval(nfsi, cred->cr_uid)];
+	list_for_each_entry(cache, head, acc_list) {
+		if (cache->id != nfsi)
+			continue;
+		if (cache->cred != cred)
+			continue;
 
-	if (cache->cred != set->cred) {
-		if (cache->cred)
+		expired = time_after(jiffies, 
+			cache->jiffies + NFS_ATTRTIMEO(inode));
+		if (expired || invalid) {
+			list_del(&cache->acc_list);
+			spin_unlock(&access_hashlock);
 			put_rpccred(cache->cred);
-		cache->cred = get_rpccred(set->cred);
+			kfree(cache);
+			goto nolock;
+		}
+		memcpy(res, cache, sizeof(*res));
+		spin_unlock(&access_hashlock);
+		return 0;
 	}
-	/* FIXME: replace current access_cache BKL reliance with inode->i_lock */
-	spin_lock(&inode->i_lock);
+	spin_unlock(&access_hashlock);
+
+nolock:
+	return -ENOENT;
+}
+
+void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_access_entry *cache = NULL;
+	struct list_head *head;
+
+	cache = (struct nfs_access_entry *)kmalloc(sizeof(*cache), GFP_KERNEL);
+	if (!cache)
+		return;
+
+	spin_lock(&access_hashlock);
+	head = &access_hashtbl[access_hashval(nfsi, set->cred->cr_uid)];
 	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
-	spin_unlock(&inode->i_lock);
+	INIT_LIST_HEAD(&cache->acc_list);
+	list_add(&cache->acc_list, head);
+	spin_unlock(&access_hashlock);
+
+	cache->cred = get_rpccred(set->cred);
 	cache->jiffies = set->jiffies;
 	cache->mask = set->mask;
+	cache->id = (void *)nfsi;
+
+	return;
 }
 
 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
--- mynfs-2.6/include/linux/nfs_fs.h.acc	2006-04-21 15:02:08.000000000 -0400
+++ mynfs-2.6/include/linux/nfs_fs.h	2006-04-25 19:12:05.000000000 -0400
@@ -72,6 +72,8 @@ struct nfs_access_entry {
 	unsigned long		jiffies;
 	struct rpc_cred *	cred;
 	int			mask;
+	void   *id;
+	struct list_head acc_list;
 };
 
 struct nfs4_state;
@@ -145,7 +147,6 @@ struct nfs_inode {
 	 */
 	atomic_t		data_updates;
 
-	struct nfs_access_entry	cache_access;
 #ifdef CONFIG_NFS_V3_ACL
 	struct posix_acl	*acl_access;
 	struct posix_acl	*acl_default;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26  1:14 [PATCH][RFC] NFS: Improving the access cache Steve Dickson
@ 2006-04-26  1:31 ` Matthew Wilcox
  2006-04-26  4:55 ` Neil Brown
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 36+ messages in thread
From: Matthew Wilcox @ 2006-04-26  1:31 UTC (permalink / raw)
  To: Steve Dickson; +Cc: nfs, linux-fsdevel

On Tue, Apr 25, 2006 at 09:14:19PM -0400, Steve Dickson wrote:
> The attached patch used a hash table to store the nfs_access_entry
> entires which cause the ACCESS request to only happen when the
> attributes timeout.. The table is indexed by the addition of the
> nfs_inode pointer and the cr_uid in the cred structure which should
> spread things out nicely for some decent scalability (although the
> locking scheme may need to be reworked a bit). The table has 256 entries
> of struct list_head giving it a total size of 2k.

Seems to me like you could use an hlist instead, saving you 1k.
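
For reference, struct list_head holds two pointers per bucket while struct
hlist_head holds one, which is where the 1k saving comes from on a 32-bit
build (256 * 4 bytes instead of 256 * 8).  A minimal, illustrative sketch of
the conversion, reusing the names from the patch:

	static struct hlist_head access_hashtbl[ACCESS_HASH_SIZE];

	/* nfs_access_entry would then embed a struct hlist_node instead of a
	 * struct list_head, and use hlist_add_head()/hlist_del() in place of
	 * list_add()/list_del(). */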


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26  1:14 [PATCH][RFC] NFS: Improving the access cache Steve Dickson
  2006-04-26  1:31 ` Matthew Wilcox
@ 2006-04-26  4:55 ` Neil Brown
  2006-04-26 14:51   ` Steve Dickson
  2006-04-26 13:03   ` Trond Myklebust
  2006-04-26 13:17 ` [NFS] " Chuck Lever
  3 siblings, 1 reply; 36+ messages in thread
From: Neil Brown @ 2006-04-26  4:55 UTC (permalink / raw)
  To: Steve Dickson; +Cc: nfs, linux-fsdevel

On Tuesday April 25, SteveD@redhat.com wrote:
> Currently the NFS client caches ACCESS information on a per uid basis
> which fall apart when different process with different uid consistently
> access the same directory. The end result being a storm of needless
> ACCESS calls...
> 
> The attached patch used a hash table to store the nfs_access_entry
> entires which cause the ACCESS request to only happen when the
> attributes timeout.. The table is indexed by the addition of the
> nfs_inode pointer and the cr_uid in the cred structure which should
> spread things out nicely for some decent scalability (although the
> locking scheme may need to be reworked a bit). The table has 256 entries
> of struct list_head giving it a total size of 2k.
> 
> The patch is based on Trond's GIT tree...
> 
> Comments?

- There is no upper bound imposed on the size of the cache, and no way
  for memory pressure to shrink the cache except indirectly by
  discarding inodes.
  I cannot see this being exploitable as getting access to lots of
  credentials would be hard for any given user.  However I feel that
  some cleaning might be in order.
- The nfs_zap_access_cache call isn't cheap.  If that could be
  amortised somehow it would be good.  Maybe use some version tagging
  so that when an inode is reused the access entry will no longer
  match in some way.  Then just clean the table by periodic scans that
  discard based on timeout information ?

It occurs to me that the large majority of inodes will only be
accessed by a single user and so need not reside in the cache.

So how about having a single "struct nfs_access_entry" pointer in the
inode.
When we do a lookup we look there first.
When we want to add an entry, we try to add it there first.
When we end up with two current access entries for the same inode,
only then do we insert them into the hash.
We would need to be able to tell from the inode whether anything is
hashed or not.  This could simply be if the nfs_access_entry pointer is
non-null and its hashlist is non-empty.  Or we could just use a bit
flag somewhere.
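
A rough sketch of the lookup side of that idea (the field, flag and helper
names here are made up purely for illustration):

	static struct nfs_access_entry *nfs_access_lookup(struct inode *inode,
							  struct rpc_cred *cred)
	{
		struct nfs_inode *nfsi = NFS_I(inode);

		/* common case: only one credential has ever touched this inode */
		if (nfsi->access_entry != NULL && nfsi->access_entry->cred == cred)
			return nfsi->access_entry;

		/* only inodes seen by two or more credentials get hashed */
		if (!test_bit(NFS_INO_ACCESS_HASHED, &nfsi->flags))
			return NULL;

		return nfs_access_hash_lookup(nfsi, cred);	/* global table */
	}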

I suspect this would keep the cache much smaller, and some of my other
concerns might then become irrelevant.

NeilBrown

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26  1:14 [PATCH][RFC] NFS: Improving the access cache Steve Dickson
@ 2006-04-26 13:03   ` Trond Myklebust
  2006-04-26  4:55 ` Neil Brown
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 36+ messages in thread
From: Trond Myklebust @ 2006-04-26 13:03 UTC (permalink / raw)
  To: Steve Dickson; +Cc: nfs, linux-fsdevel

On Tue, 2006-04-25 at 21:14 -0400, Steve Dickson wrote:
> Currently the NFS client caches ACCESS information on a per uid basis
> which fall apart when different process with different uid consistently
> access the same directory. The end result being a storm of needless
> ACCESS calls...
> 
> The attached patch used a hash table to store the nfs_access_entry
> entires which cause the ACCESS request to only happen when the
> attributes timeout.. The table is indexed by the addition of the
> nfs_inode pointer and the cr_uid in the cred structure which should
> spread things out nicely for some decent scalability (although the
> locking scheme may need to be reworked a bit). The table has 256 entries
> of struct list_head giving it a total size of 2k.

Instead of having the field 'id', why don't you let the nfs_inode keep a
small (hashed?) list of all the nfs_access_entry objects that refer to
it? That would speed up searches for cached entries.

I agree with Neil's assessment that we need a bound on the size of the
cache. In fact, enforcing a bound is pretty much the raison d'être for a
global table (by which I mean that if we don't need a bound, then we
might as well cache everything in the nfs_inode).
How about rather changing that hash table into an LRU list, then adding
a shrinker callback (using set_shrinker()) to allow the VM to free up
entries when memory pressure dictates that it must?
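
A rough sketch of what such a shrinker callback could look like with the
set_shrinker() interface (the list, lock and field names are illustrative,
and a real version would also have to unhash each entry):

	static LIST_HEAD(nfs_access_lru);	/* least recently used at the tail */
	static DEFINE_SPINLOCK(nfs_access_lock);
	static int nfs_access_nr_entries;

	static int nfs_access_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
	{
		struct nfs_access_entry *cache;
		LIST_HEAD(doomed);

		spin_lock(&nfs_access_lock);
		while (nr_to_scan-- > 0 && !list_empty(&nfs_access_lru)) {
			cache = list_entry(nfs_access_lru.prev,
					   struct nfs_access_entry, lru);
			list_move(&cache->lru, &doomed);
			nfs_access_nr_entries--;
		}
		spin_unlock(&nfs_access_lock);

		/* drop creds and memory outside the lock, since put_rpccred()
		 * takes other locks */
		while (!list_empty(&doomed)) {
			cache = list_entry(doomed.next,
					   struct nfs_access_entry, lru);
			list_del(&cache->lru);
			put_rpccred(cache->cred);
			kfree(cache);
		}
		return nfs_access_nr_entries;
	}

	/* registered once at init time: */
	set_shrinker(DEFAULT_SEEKS, nfs_access_cache_shrink);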

Cheers,
  Trond


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26 13:03   ` Trond Myklebust
@ 2006-04-26 13:14     ` Peter Staubach
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Staubach @ 2006-04-26 13:14 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Steve Dickson, nfs, linux-fsdevel

Trond Myklebust wrote:

>On Tue, 2006-04-25 at 21:14 -0400, Steve Dickson wrote:
>  
>
>>Currently the NFS client caches ACCESS information on a per uid basis
>>which fall apart when different process with different uid consistently
>>access the same directory. The end result being a storm of needless
>>ACCESS calls...
>>
>>The attached patch used a hash table to store the nfs_access_entry
>>entires which cause the ACCESS request to only happen when the
>>attributes timeout.. The table is indexed by the addition of the
>>nfs_inode pointer and the cr_uid in the cred structure which should
>>spread things out nicely for some decent scalability (although the
>>locking scheme may need to be reworked a bit). The table has 256 entries
>>of struct list_head giving it a total size of 2k.
>>    
>>
>
>Instead of having the field 'id', why don't you let the nfs_inode keep a
>small (hashed?) list of all the nfs_access_entry objects that refer to
>it? That would speed up searches for cached entries.
>
>I agree with Neil's assessment that we need a bound on the size of the
>cache. In fact, enforcing a bound is pretty much the raison d'être for a
>global table (by which I mean that if we don't need a bound, then we
>might as well cache everything in the nfs_inode).
>How about rather changing that hash table into an LRU list, then adding
>a shrinker callback (using set_shrinker()) to allow the VM to free up
>entries when memory pressure dictates that it must?
>

Previous implementations have shown that a single per inode linear linked
list ends up not being scalable enough in certain situations.  There would end up
being too many entries in the list and searching the list would become
a bottleneck.  Adding a set of hash buckets per inode also proved to be
inefficient because in order to have enough hash buckets to make the hashing
efficient, much space was wasted.  Having a single set of hash buckets,
adequately sized, ended up being the best solution.

Why have a limit?  A better solution is to have the cache grow dynamically
as need requires and then have the system reclaim the memory when it starts
to run low on memory.

       ps

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] [PATCH][RFC] NFS: Improving the access cache
  2006-04-26  1:14 [PATCH][RFC] NFS: Improving the access cache Steve Dickson
                   ` (2 preceding siblings ...)
  2006-04-26 13:03   ` Trond Myklebust
@ 2006-04-26 13:17 ` Chuck Lever
  2006-04-26 14:19   ` Steve Dickson
  3 siblings, 1 reply; 36+ messages in thread
From: Chuck Lever @ 2006-04-26 13:17 UTC (permalink / raw)
  To: Steve Dickson; +Cc: nfs, linux-fsdevel

Steve Dickson wrote:
> Currently the NFS client caches ACCESS information on a per uid basis
> which fall apart when different process with different uid consistently
> access the same directory. The end result being a storm of needless
> ACCESS calls...
> 
> The attached patch used a hash table to store the nfs_access_entry
> entires which cause the ACCESS request to only happen when the
> attributes timeout.. The table is indexed by the addition of the
> nfs_inode pointer and the cr_uid in the cred structure which should
> spread things out nicely for some decent scalability (although the
> locking scheme may need to be reworked a bit). The table has 256 entries
> of struct list_head giving it a total size of 2k.
> 
> The patch is based on Trond's GIT tree...
> 
> Comments?
> 
> steved.

Hi Steve-

Thanks for digging into this.

I can't tell, but have you addressed the problem of racing processes 
generating multiple similar ACCESS requests?  Seems like some kind of 
serialization through your caching mechanism would resolve that nicely.

-- 
corporate:	<cel at netapp dot com>
personal:	<chucklever at bigfoot dot com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26 13:14     ` Peter Staubach
@ 2006-04-26 14:01       ` Trond Myklebust
  -1 siblings, 0 replies; 36+ messages in thread
From: Trond Myklebust @ 2006-04-26 14:01 UTC (permalink / raw)
  To: Peter Staubach; +Cc: Steve Dickson, nfs, linux-fsdevel

On Wed, 2006-04-26 at 09:14 -0400, Peter Staubach wrote:
> Trond Myklebust wrote:
> 
> >On Tue, 2006-04-25 at 21:14 -0400, Steve Dickson wrote:
> >  
> >
> >>Currently the NFS client caches ACCESS information on a per uid basis
> >>which fall apart when different process with different uid consistently
> >>access the same directory. The end result being a storm of needless
> >>ACCESS calls...
> >>
> >>The attached patch used a hash table to store the nfs_access_entry
> >>entires which cause the ACCESS request to only happen when the
> >>attributes timeout.. The table is indexed by the addition of the
> >>nfs_inode pointer and the cr_uid in the cred structure which should
> >>spread things out nicely for some decent scalability (although the
> >>locking scheme may need to be reworked a bit). The table has 256 entries
> >>of struct list_head giving it a total size of 2k.
> >>    
> >>
> >
> >Instead of having the field 'id', why don't you let the nfs_inode keep a
> >small (hashed?) list of all the nfs_access_entry objects that refer to
> >it? That would speed up searches for cached entries.
> >
> >I agree with Neil's assessment that we need a bound on the size of the
> >cache. In fact, enforcing a bound is pretty much the raison d'être for a
> >global table (by which I mean that if we don't need a bound, then we
> >might as well cache everything in the nfs_inode).
> >How about rather changing that hash table into an LRU list, then adding
> >a shrinker callback (using set_shrinker()) to allow the VM to free up
> >entries when memory pressure dictates that it must?
> >
> 
> Previous implementations have shown that a single per inode linear 
> linked list
> ends up not being scalable enough in certain situations.  There would end up
> being too many entries in the list and searching the list would become
> a bottleneck.  Adding a set of hash buckets per inode also proved to be
> inefficient because in order to have enough hash buckets to make the hashing
> efficient, much space was wasted.  Having a single set of hash buckets,
> adequately sized, ended up being the best solution.

What situations? AFAIA the number of processes in a typical setup is
almost always far smaller than the number of cached inodes.

For instance on my laptop, I'm currently running 146 processes, but
according to /proc/slabinfo I'm caching 330000 XFS inodes + 141500 ext3
inodes.
If I were to assume that a typical nfsroot system will show roughly the
same behaviour, then it would mean that a typical bucket in Steve's 256
hash entry table will contain at least 2000 entries that I need to
search through every time I want to do an access call.
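
(Spelling that estimate out: (330000 + 141500) cached inodes spread over 256
buckets is roughly 1840 entries per bucket, before counting multiple
credentials per inode.)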

> Why have a limit?  A better solution is to have the cache grow dynamically
> as need requires and then have the system reclaim the memory when it starts
> to run low on memory.

See my previous mail: that is exactly what I suggested in the 3rd
paragraph.

Cheers,
  Trond


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26 14:01       ` Trond Myklebust
@ 2006-04-26 14:15         ` Peter Staubach
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Staubach @ 2006-04-26 14:15 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Steve Dickson, nfs, linux-fsdevel

Trond Myklebust wrote:

>On Wed, 2006-04-26 at 09:14 -0400, Peter Staubach wrote:
>  
>
>>Trond Myklebust wrote:
>>
>>    
>>
>>>On Tue, 2006-04-25 at 21:14 -0400, Steve Dickson wrote:
>>> 
>>>
>>>      
>>>
>>>>Currently the NFS client caches ACCESS information on a per uid basis
>>>>which fall apart when different process with different uid consistently
>>>>access the same directory. The end result being a storm of needless
>>>>ACCESS calls...
>>>>
>>>>The attached patch used a hash table to store the nfs_access_entry
>>>>entires which cause the ACCESS request to only happen when the
>>>>attributes timeout.. The table is indexed by the addition of the
>>>>nfs_inode pointer and the cr_uid in the cred structure which should
>>>>spread things out nicely for some decent scalability (although the
>>>>locking scheme may need to be reworked a bit). The table has 256 entries
>>>>of struct list_head giving it a total size of 2k.
>>>>   
>>>>
>>>>        
>>>>
>>>Instead of having the field 'id', why don't you let the nfs_inode keep a
>>>small (hashed?) list of all the nfs_access_entry objects that refer to
>>>it? That would speed up searches for cached entries.
>>>
>>>I agree with Neil's assessment that we need a bound on the size of the
>>>cache. In fact, enforcing a bound is pretty much the raison d'être for a
>>>global table (by which I mean that if we don't need a bound, then we
>>>might as well cache everything in the nfs_inode).
>>>How about rather changing that hash table into an LRU list, then adding
>>>a shrinker callback (using set_shrinker()) to allow the VM to free up
>>>entries when memory pressure dictates that it must?
>>>
>>>      
>>>
>>Previous implementations have shown that a single per inode linear 
>>linked list
>>ends up not being scalable enough in certain situations.  There would end up
>>being too many entries in the list and searching the list would become
>>a bottleneck.  Adding a set of hash buckets per inode also proved to be
>>inefficient because in order to have enough hash buckets to make the hashing
>>efficient, much space was wasted.  Having a single set of hash buckets,
>>adequately sized, ended up being the best solution.
>>    
>>
>
>What situations? AFAIA the number of processes in a typical setup are
>almost always far smaller than the number of cached inodes.
>
>  
>

The situation that doesn't scale is one where there are many different
users on the system.  It is the situation where there are more then just
a few users per file.  This can happen on compute servers or systems
used for timesharing sorts of purposes.

>For instance on my laptop, I'm currently running 146 processes, but
>according to /proc/slabinfo I'm caching 330000 XFS inodes + 141500 ext3
>inodes.
>If I were to assume that a typical nfsroot system will show roughly the
>same behaviour, then it would mean that a typical bucket in Steve's 256
>hash entry table will contain at least 2000 entries that I need to
>search through every time I want to do an access call.
>

For such a system, there needs to be more than 256 hash buckets.  The number
of the access cache hash buckets needs to be on the same scale as the number
of hash buckets used for similarly sized caches and tables.

       ps

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] [PATCH][RFC] NFS: Improving the access cache
  2006-04-26 13:17 ` [NFS] " Chuck Lever
@ 2006-04-26 14:19   ` Steve Dickson
  0 siblings, 0 replies; 36+ messages in thread
From: Steve Dickson @ 2006-04-26 14:19 UTC (permalink / raw)
  To: cel; +Cc: nfs, linux-fsdevel

Chuck Lever wrote:
> 
> Hi Steve-
> 
> Thanks for digging into this.
> 
> I can't tell, but have you addressed the problem of racing processes 
> generating multiple similar ACCESS requests?  Seems like some kind of 
> serialization through your caching mechanism would resolve that nicely.
No... I was just trying to duplicate the logic that was there...
but that is a good point... It turns out that I messed up the
locking anyway... the NFS_INO_INVALID_ACCESS bit is no longer
being protected by the inode->i_lock, which is not good... so I
need to rework that part...


steved.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26  4:55 ` Neil Brown
@ 2006-04-26 14:51   ` Steve Dickson
  2006-04-26 22:32     ` Neil Brown
  0 siblings, 1 reply; 36+ messages in thread
From: Steve Dickson @ 2006-04-26 14:51 UTC (permalink / raw)
  To: Neil Brown; +Cc: nfs, linux-fsdevel



Neil Brown wrote:
> - There is no upper bound imposed on the size of the cache, and no way
>   for memory pressure to shrink the cache except indirectly by
>   discarding inodes.
>   I cannot see this being exploitable as getting access to lots of
>   credentials would be hard for any given user.  However I feel that
>   some cleaning might be in order.
I guess there is no upper bound checking because I didn't see any type
of boundary checking in the server hashing code I stole 8-) Or maybe
I just missed it... Is there an example, and what would trigger
this cleanup?

> - The nfs_zap_access_cache call isn't cheap.  If that could be
>   amortised somehow it would be good.  Maybe use some version tagging
>   so that when an inode is reused the access entry will no longer
>   match in some way.  Then just clean the table by periodic scans that
>   discard based on timeout information ?
Yes... I did realize that nfs_zap_access_cache would be expensive...
and I think there might be an issue with holding the access_hashlock
spin lock while calling put_rpccred(), since it can also take out
another spin lock... Maybe I should just spin through the table marking
entries for deletion and then let somebody else clean them up?
Is there already a cleanup process that I could use? I don't
recall one...

> 
> It occurs to me that the large majority of inodes will only be
> accessed by a single user and so need not reside in the cache.
> 
> So how about having a single "struct nfs_access_entry" pointer in the
> inode.
> When we do a lookup we look there first.
> When we want to add an entry, we try to add it there first.
> When we end up with two current access entries for the same inode,
> only then do we insert them into the hash.
To rephrase to make sure I understand....
1) P1(uid=1) creates an access pointer in the nfs_inode
2) P2(uid=2) sees the access pointer is not null so it adds them both
    to the table, right?

> We would need to be able to tell from the inode whether anything is
> hashed or not.  This could simply be if the nfs_access_entry point is
> non-null, and its hashlist it non-empty.  Or we could just use a bit
> flag somewhere.
So I guess it would be something like:
if (nfs_inode->access == NULL)
     set nfs_inode->access
if (nfs_inode->access != NULL && nfs_inode->access_hash == empty)
     move both pointers into the hash table.
if (nfs_inode->access == NULL && nfs_inode->access_hash != empty)
     use the hash table.

But now the question is how would I know when there is only one
entry in the table? Or do we just let the hash table "drain"
naturally, and when it becomes empty we start with the nfs_inode->access
pointer again... Is this close to what you're thinking?
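
A rough C rendering of that decision on the add side (field and helper names
are illustrative only):

	/* in nfs_access_add_cache(), under the appropriate lock */
	if (nfsi->access == NULL && !nfsi->access_hashed) {
		nfsi->access = new;			/* single-user fast path */
	} else if (nfsi->access != NULL) {
		/* a second credential showed up: spill both into the table */
		nfs_access_hash_insert(nfsi->access);
		nfs_access_hash_insert(new);
		nfsi->access = NULL;
		nfsi->access_hashed = 1;
	} else {
		nfs_access_hash_insert(new);		/* already spilled */
	}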

steved.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26 13:03   ` Trond Myklebust
@ 2006-04-26 15:03     ` Steve Dickson
  -1 siblings, 0 replies; 36+ messages in thread
From: Steve Dickson @ 2006-04-26 15:03 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: nfs, linux-fsdevel

Trond Myklebust wrote:
> 
> 
> Instead of having the field 'id', why don't you let the nfs_inode keep a
> small (hashed?) list of all the nfs_access_entry objects that refer to
> it? That would speed up searches for cached entries.
Actually I did look into having a pointer in the nfs_inode... but
what do you do when a second hashed list is needed? Meaning
P2(uid2) comes along and hashes to a different queue. I guess
I thought it was a bit messy to keep overwriting the pointer in the
nfs_inode, so I just kept everything in the hash table...

> I agree with Neil's assessment that we need a bound on the size of the
> cache. In fact, enforcing a bound is pretty much the raison d'être for a
> global table (by which I mean that if we don't need a bound, then we
> might as well cache everything in the nfs_inode).
Ok..

> How about rather changing that hash table into an LRU list, then adding
> a shrinker callback (using set_shrinker()) to allow the VM to free up
> entries when memory pressure dictates that it must?
Sounds interesting... Just to be clear, by LRU list you mean use an hlist,
correct?

steved.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26 14:15         ` Peter Staubach
  (?)
@ 2006-04-26 15:44         ` Trond Myklebust
  2006-04-26 17:01           ` Peter Staubach
  -1 siblings, 1 reply; 36+ messages in thread
From: Trond Myklebust @ 2006-04-26 15:44 UTC (permalink / raw)
  To: Peter Staubach; +Cc: Steve Dickson, nfs, linux-fsdevel

On Wed, 2006-04-26 at 10:15 -0400, Peter Staubach wrote:
> >
> >What situations? AFAIA the number of processes in a typical setup are
> >almost always far smaller than the number of cached inodes.
> >
> >  
> >
> 
> The situation that doesn't scale is one where there are many different
> users on the system.  It is the situation where there are more then just
> a few users per file.  This can happen on compute servers or systems
> used for timesharing sorts of purposes.

Yes, but the number of users <= number of processes which even on those
systems is almost always much, much less than the number of cached
inodes.

> >For instance on my laptop, I'm currently running 146 processes, but
> >according to /proc/slabinfo I'm caching 330000 XFS inodes + 141500 ext3
> >inodes.
> >If I were to assume that a typical nfsroot system will show roughly the
> >same behaviour, then it would mean that a typical bucket in Steve's 256
> >hash entry table will contain at least 2000 entries that I need to
> >search through every time I want to do an access call.
> >
> 
> For such a system, there needs to be more than 256 hash buckets.  The number
> of the access cache hash buckets needs to be on scale with the number of 
> hash
> buckets used for similarly sized caches and tables.

The inode cache is the only similarly sized cache I can think of.

That is set either by the user, or it takes a default value of (total
memory size) / 2^14 buckets (see alloc_large_system_hash). On a 1Gb
system, that makes the default hash table size ~ 65536 entries. I can't
see people wanting to put up with a 256K static hash table for access
caching too.
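
(Spelling that out: 2^30 bytes / 2^14 = 65536 buckets, and at one 4-byte
pointer per bucket that is already a 256K static table.)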

Furthermore, note that the inode cache is only searched when
initialising a dentry. It is not searched on _every_ traversal of a path
element.

Cheers,
  Trond


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26 15:44         ` Trond Myklebust
@ 2006-04-26 17:01           ` Peter Staubach
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Staubach @ 2006-04-26 17:01 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Steve Dickson, nfs, linux-fsdevel

Trond Myklebust wrote:

>On Wed, 2006-04-26 at 10:15 -0400, Peter Staubach wrote:
>  
>
>>>What situations? AFAIA the number of processes in a typical setup are
>>>almost always far smaller than the number of cached inodes.
>>>
>>> 
>>>
>>>      
>>>
>>The situation that doesn't scale is one where there are many different
>>users on the system.  It is the situation where there are more then just
>>a few users per file.  This can happen on compute servers or systems
>>used for timesharing sorts of purposes.
>>    
>>
>
>Yes, but the number of users <= number of processes which even on those
>systems is almost always much, much less than the number of cached
>inodes.
>
>  
>

There isn't a 1-to-1 correspondence between processes and files.  A single
process accesses many different files and many of the processes will be
accessing the same files.  Shared libraries are easy examples of files
which are accessed by multiple processes and processes themselves access
multiple shared libraries.

>>>For instance on my laptop, I'm currently running 146 processes, but
>>>according to /proc/slabinfo I'm caching 330000 XFS inodes + 141500 ext3
>>>inodes.
>>>If I were to assume that a typical nfsroot system will show roughly the
>>>same behaviour, then it would mean that a typical bucket in Steve's 256
>>>hash entry table will contain at least 2000 entries that I need to
>>>search through every time I want to do an access call.
>>>
>>>      
>>>
>>For such a system, there needs to be more than 256 hash buckets.  The number
>>of the access cache hash buckets needs to be on scale with the number of 
>>hash
>>buckets used for similarly sized caches and tables.
>>    
>>
>
>The inode cache is the only similarly sized cache I can think of.
>
>That is set either by the user, or it takes a default value of (total
>memory size) / 2^14 buckets (see alloc_large_system_hash). On a 1Gb
>system, that makes the default hash table size ~ 65536 entries. I can't
>see people wanting to put up with a 256K static hash table for access
>caching too.
>
>  
>

I think that if the performance benefits warrant such a cache, then it is
worth it.  It is a very small percentage of the real memory on the system.

Previous, informal, studies showed that caching access privileges like
this was good at short circuiting 90%+ of access calls.

However, we could always divide this further when sizing the access
cache.  If we assume that 1/2 or 1/4 or some percentage of the files
accessed will be on NFS mounted file systems, then the access cache just
needs to be based on the number of NFS inodes, not the total number of
inodes.

>Furthermore, note that the inode cache is only searched when
>initialising a dentry. It is not searched on _every_ traversal of a path
>element.
>

Very true, which points out the importance of getting access to the access
cache correct and fast.  The number of entries in the access cache will be at
least the number of NFS inodes in the system and could be much higher
depending upon whether the system is a single-user, desktop-style system or a
multi-user shared system.  The key to making this cache cheap is to make the
hash algorithm cheap and to keep the hash chains short.
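
One cheap way to get that is the stock hash helper (purely illustrative):

	#include <linux/hash.h>

	/* mix the inode pointer and the uid, fold down to the table size */
	static inline unsigned int nfs_access_hashval(struct nfs_inode *nfsi,
						      uid_t uid)
	{
		return hash_long((unsigned long)nfsi ^ uid, ACCESS_HASH_BITS);
	}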

       ps

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26 14:51   ` Steve Dickson
@ 2006-04-26 22:32     ` Neil Brown
  2006-05-02  9:49       ` Steve Dickson
  0 siblings, 1 reply; 36+ messages in thread
From: Neil Brown @ 2006-04-26 22:32 UTC (permalink / raw)
  To: Steve Dickson; +Cc: nfs, linux-fsdevel

On Wednesday April 26, SteveD@redhat.com wrote:
> 
> 
> Neil Brown wrote:
> > - There is no upper bound imposed on the size of the cache, and no way
> >   for memory pressure to shrink the cache except indirectly by
> >   discarding inodes.
> >   I cannot see this being exploitable as getting access to lots of
> >   credentials would be hard for any given user.  However I feel that
> >   some cleaning might be in order.
> I guess there is no upper bound checking because I didn't see any type
> of boundary checking the server hashing code I stoled 8-) Or maybe
> I just missed it... Is there an example and what would trigger
> this cleanup?

You can register a 'shrinker' which gets called when the VM wants to
reclaim memory.  See mm/vmscan.c.
It gets passed the number of items you should try to discard, and
returns the number of items that are left.

And if you arrange to do all the freeing in batches using the
shrinker, you could use RCU to make all the searches and additions
lockless (well, you still need rcu_read_lock, but that is super
lightweight).  That would be fun :-)
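
A rough sketch of the lockless lookup that would allow (list and field names
are illustrative):

	static int nfs_access_lookup_rcu(struct nfs_inode *nfsi,
					 struct rpc_cred *cred,
					 struct nfs_access_entry *res)
	{
		struct nfs_access_entry *cache;
		struct list_head *head;
		int err = -ENOENT;

		head = &access_hashtbl[access_hashval(nfsi, cred->cr_uid)];

		rcu_read_lock();
		list_for_each_entry_rcu(cache, head, acc_list) {
			if (cache->id != nfsi || cache->cred != cred)
				continue;
			memcpy(res, cache, sizeof(*res));
			err = 0;
			break;
		}
		rcu_read_unlock();
		return err;
	}

	/* writers still serialize on a spinlock and use list_add_rcu() and
	 * list_del_rcu(); the actual kfree() is deferred via call_rcu() so a
	 * concurrent reader never touches freed memory. */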

> > 
> > It occurs to me that the large majority of inodes will only be
> > accessed by a single user and so need not reside in the cache.
> > 
> > So how about having a single "struct nfs_access_entry" pointer in the
> > inode.
> > When we do a lookup we look there first.
> > When we want to add an entry, we try to add it there first.
> > When we end up with two current access entries for the same inode,
> > only then do we insert them into the hash.
> To rephrase to make sure I understand....
> 1) P1(uid=1) creates an access pointer in the nfs_inode
> 2) P2(uid=2) sees the access pointer is not null so it adds them both
>     to the table, right?
> 

Exactly.

> > We would need to be able to tell from the inode whether anything is
> > hashed or not.  This could simply be if the nfs_access_entry point is
> > non-null, and its hashlist it non-empty.  Or we could just use a bit
> > flag somewhere.
> So I guess it would be something like:
> if (nfs_inode->access == null)
>      set nfs_inode->access
> if (nfs_inode->access =! NULL && nfs_inode->access_hash == empty)
>      move both pointer into hast able.
> if (nfs_inode->access == null && nfs_inode->access_hash != empty)
>      use hastable.
> 
> But now the question is how would I know when there is only one
> entry in the table? Or do we just let the hash table "drain"
> naturally and when it become empty we start with the nfs_inode->access
> pointer again... Is this close to what your thinking??

Yes.  Spot on.  Once some inode has 'spilled' into the hash table
there isn't a lot to gain by "unspilling" it.

NeilBrown

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH][RFC] NFS: Improving the access cache
  2006-04-26 22:32     ` Neil Brown
@ 2006-05-02  9:49       ` Steve Dickson
  2006-05-02 13:51         ` [NFS] " Peter Staubach
  2006-05-03  4:42         ` Chuck Lever
  0 siblings, 2 replies; 36+ messages in thread
From: Steve Dickson @ 2006-05-02  9:49 UTC (permalink / raw)
  To: nfs; +Cc: linux-fsdevel

Neil Brown wrote:
>>To rephrase to make sure I understand....
>>1) P1(uid=1) creates an access pointer in the nfs_inode
>>2) P2(uid=2) sees the access pointer is not null so it adds them both
>>    to the table, right?
>>
> 
> 
> Exactly.
> 
> 
>>>We would need to be able to tell from the inode whether anything is
>>>hashed or not.  This could simply be if the nfs_access_entry point is
>>>non-null, and its hashlist it non-empty.  Or we could just use a bit
>>>flag somewhere.
>>
>>So I guess it would be something like:
>>if (nfs_inode->access == null)
>>     set nfs_inode->access
>>if (nfs_inode->access =! NULL && nfs_inode->access_hash == empty)
>>     move both pointer into hast able.
>>if (nfs_inode->access == null && nfs_inode->access_hash != empty)
>>     use hastable.
>>
>>But now the question is how would I know when there is only one
>>entry in the table? Or do we just let the hash table "drain"
>>naturally and when it become empty we start with the nfs_inode->access
>>pointer again... Is this close to what your thinking??
> 
> 
> Yes.  Spot on.  Once some inode has 'spilled' into the hash table
> there isn't a lot to gain by "unspilling" it.
Talking with Trond, he would like to do something slightly different
which I'll outline here to make sure we are all on the same page....

Basically we would maintain one global hlist (i.e. linked list) that
would contain all of the cached entries; then each nfs_inode would
have its own LRU hlist that would contain entries that are associated
with that nfs_inode. So each entry would be on two lists, the
global hlist and the hlist in the nfs_inode.

We would govern memory consumption by only allowing 30 entries
on any one hlist in the nfs_inode and by registering the global
hlist with the VFS shrinker, which will cause the list to be pruned
when memory is needed. So this means that when the 31st entry is added
to the hlist in the nfs_inode, the least recently used entry would
be removed.

Locking might be a bit tricky, but doable... To make this scalable,
I would think we would need a global read/write spin lock. The read_lock()
would be taken when the hlist in the inode was searched, and the
write_lock() would be taken when the hlist in the inode was changed
and when the global list was pruned.
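
A rough sketch of the shape being described (the names and the way the lists
hang together are just a restatement of the above, not tested code):

	struct nfs_access_entry {
		struct list_head	lru;		/* global list, pruned by the shrinker */
		struct list_head	inode_list;	/* per-nfs_inode LRU, capped at 30 */
		void			*id;
		unsigned long		jiffies;
		struct rpc_cred		*cred;
		int			mask;
	};

	static LIST_HEAD(nfs_access_global_lru);
	static DEFINE_RWLOCK(nfs_access_lock);

	/* struct nfs_inode grows a "struct list_head access_cache;" head.
	 * On insert, the entry goes on both lists; if the per-inode list now
	 * holds 31 entries, the least recently used one is unlinked from both
	 * lists and freed. */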

Comments?

steved.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-02  9:49       ` Steve Dickson
@ 2006-05-02 13:51         ` Peter Staubach
  2006-05-02 14:38           ` Steve Dickson
  2006-05-03  4:42         ` Chuck Lever
  1 sibling, 1 reply; 36+ messages in thread
From: Peter Staubach @ 2006-05-02 13:51 UTC (permalink / raw)
  To: Steve Dickson; +Cc: nfs, linux-fsdevel

Steve Dickson wrote:

> Neil Brown wrote:
>
>>> To rephrase to make sure I understand....
>>> 1) P1(uid=1) creates an access pointer in the nfs_inode
>>> 2) P2(uid=2) sees the access pointer is not null so it adds them both
>>>    to the table, right?
>>>
>>
>>
>> Exactly.
>>
>>
>>>> We would need to be able to tell from the inode whether anything is
>>>> hashed or not.  This could simply be if the nfs_access_entry point is
>>>> non-null, and its hashlist it non-empty.  Or we could just use a bit
>>>> flag somewhere.
>>>
>>>
>>> So I guess it would be something like:
>>> if (nfs_inode->access == NULL)
>>>     set nfs_inode->access
>>> if (nfs_inode->access != NULL && nfs_inode->access_hash == empty)
>>>     move both pointers into the hash table.
>>> if (nfs_inode->access == NULL && nfs_inode->access_hash != empty)
>>>     use the hash table.
>>>
>>> But now the question is how would I know when there is only one
>>> entry in the table? Or do we just let the hash table "drain"
>>> naturally and when it becomes empty we start with the nfs_inode->access
>>> pointer again... Is this close to what you're thinking??
>>
>>
>>
>> Yes.  Spot on.  Once some inode has 'spilled' into the hash table
>> there isn't a lot to gain by "unspilling" it.
>
> Talking with Trond, he would like to do something slightly different
> which I'll outline here to make sure we are all on the same page....
>
> Basically we would maintain one global hlist (i.e. link list) that
> would contain all of the cached entries; then each nfs_inode would
> have its own LRU hlist that would contain entries that are associated
> with that nfs_inode. So each entry would be on two lists, the
> global hlist and hlist in the nfs_inode.
>

How are these lists used?

I would suggest that a global set of hash queues would work better than
a linked list and that these hash queues be used to find the cache entry
for any particular user.  Finding the entry for a particular (user,inode)
needs to be fast and linearly searching a linked list is slow.  Linear
searching needs to be avoided.  Comparing the fewest number of entries
possible will result in the best performance because the comparisons
need to take into account the entire user identification, including
the groups list.

The list in the inode seems useful, but only for purges.  Searching via
this list will be very slow once the list grows beyond a few entries.
Purging needs to be fast because purging the access cache entries for a
particular file will need to happen whenever the ctime on the file changes.
This list can be used to make it easy to find the correct entries in the
global access cache.
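
For illustration, a sketch of the sort of lookup being suggested (all
names below are invented, and the per-inode purge list is left out).
The bucket is picked by (inode, uid), but the match is against the
full credential, since a distinct rpc_cred normally exists per
uid/gid/groups combination:

#define NFS_ACC_HASH_BITS       8
#define NFS_ACC_HASH_SIZE       (1 << NFS_ACC_HASH_BITS)

struct nfs_acc_entry {                          /* invented */
        struct list_head        hash_link;      /* global hash chain */
        struct inode            *inode;
        struct rpc_cred         *cred;          /* full user identity */
        unsigned long           jiffies;
        int                     mask;
};

static struct list_head nfs_acc_hash[NFS_ACC_HASH_SIZE];
static spinlock_t nfs_acc_lock = SPIN_LOCK_UNLOCKED;

static inline unsigned int nfs_acc_hashval(struct inode *inode, uid_t uid)
{
        return ((unsigned long)inode / L1_CACHE_BYTES + uid) &
                (NFS_ACC_HASH_SIZE - 1);
}

static struct nfs_acc_entry *nfs_acc_find(struct inode *inode,
                                          struct rpc_cred *cred)
{
        struct list_head *bucket;
        struct nfs_acc_entry *p, *found = NULL;

        bucket = &nfs_acc_hash[nfs_acc_hashval(inode, cred->cr_uid)];
        spin_lock(&nfs_acc_lock);
        list_for_each_entry(p, bucket, hash_link) {
                /* comparing the cred compares uid, gid and groups */
                if (p->inode == inode && p->cred == cred) {
                        found = p;
                        break;
                }
        }
        spin_unlock(&nfs_acc_lock);
        return found;
}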

> We would govern memory consumption by only allowing 30 entries
> on any one hlist in the nfs_inode and by registering the global
> hlist with the VFS shrinker, which will cause the list to be pruned
> when memory is needed. So this means, when the 31st entry was added
> to the hlist in the nfs_inode, the least recently used entry would
> be removed.
>

Why is there a limit at all and why is 30 the right number?  This
seems small and rather arbitrary.  If there is some way to trigger
memory reclaiming, then letting the list grow as appropriate seems
like a good thing to do.

Making sure that you are one of the original 30 users accessing the
file in order to get reasonable performance seems tricky to me.  :-)

> Locking might be a bit tricky, but doable... To make this scalable,
> I would think we would need a global read/write spin_lock. The read_lock()
> would be taken when the hlist in the inode was searched and the
> write_lock() would be taken when the hlist in the inode was changed
> and when the global list was pruned.
>

Sorry, read/write spin lock?  I thought that spin locks were exclusive,
either the lock was held or the process spins waiting to acquire it.

A global reader/writer lock can make a lot of sense though.  The reader
lock can allow concurrent lookups in the cache and the writer lock can
serialize updates to the cache.

A global spin lock on the cache can work well.  As long as the spin lock
is not held for very long, ie. short search times, then the lack of
concurrency should not be noticeable.  One global spin lock can also
make the implementation much simpler by simplifying the locking
tremendously.  Grab the lock, search for an entry, release it.  Grab
the lock, insert a new entry, release it.  Simple and fast, not prone
to deadlock or starvation issues.

    Thanx...

       ps

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-02 13:51         ` [NFS] " Peter Staubach
@ 2006-05-02 14:38           ` Steve Dickson
  2006-05-02 14:51             ` Peter Staubach
  0 siblings, 1 reply; 36+ messages in thread
From: Steve Dickson @ 2006-05-02 14:38 UTC (permalink / raw)
  To: Peter Staubach; +Cc: nfs, linux-fsdevel

Peter Staubach wrote:
>> Basically we would maintain one global hlist (i.e. link list) that
>> would contain all of the cached entries; then each nfs_inode would
>> have its own LRU hlist that would contain entries that are associated
>> with that nfs_inode. So each entry would be on two lists, the
>> global hlist and hlist in the nfs_inode.
>>
> 
> How are these lists used?
The inode hlist will be used to search and purge...

> 
> I would suggest that a global set of hash queues would work better than
> a linked list and that these hash queues be used to find the cache entry
> for any particular user.  Finding the entry for a particular (user,inode)
> needs to be fast and linearly searching a linked list is slow.  Linear
> searching needs to be avoided.  Comparing the fewest number of entries
> possible will result in the best performance because the comparisons
> need to take into account the entire user identification, including
> the groups list.
I guess we could have the VFS shrinker purge a hash table just
as well as a linked list... although a hash table will have a
small memory cost...

> The list in the inode seems useful, but only for purges.  Searching via
> this list will be very slow once the list grows beyond a few entries.
> Purging needs to be fast because purging the access cache entries for a
> particular file will need to happen whenever the ctime on the file changes.
> This list can be used to make it easy to find the correct entries in the
> global access cache.
Seems reasonable assuming we use a hash table...

> 
>> We would govern memory consumption by only allowing 30 entries
>> on any one hlist in the nfs_inode and by registering the global
>> hlist with the VFS shrinker, which will cause the list to be pruned
>> when memory is needed. So this means, when the 31st entry was added
>> to the hlist in the nfs_inode, the least recently used entry would
>> be removed.
>>
> 
> Why is there a limit at all and why is 30 the right number?  This
> seems small and rather arbitrary.  If there is some way to trigger
> memory reclaiming, then letting the list grow as appropriate seems
> like a good thing to do.
Well the vfs mechanism will be the trigger... so you're saying we
should just let the purge hlists in the nfs_inode grow
untethered? How about read-only filesystems where the ctime
will not change... I would think we might want some type of
high water mark for that case, true?

> 
> Making sure that you are one of the original 30 users accessing the
> file in order to get reasonable performance seems tricky to me.  :-)
> 
>> Locking might be a bit tricky, but doable... To make this scalable,
>> I would think we would need a global read/write spin_lock. The read_lock()
>> would be taken when the hlist in the inode was searched and the
>> write_lock() would be taken when the hlist in the inode was changed
>> and when the global list was pruned.
>>
> 
> Sorry, read/write spin lock?  I thought that spin locks were exclusive,
> either the lock was held or the process spins waiting to acquire it.
See the rwlock_t lock type in asm/spinlock.h.. That's the one
I was planning on using...
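
For anyone else following along, a minimal sketch of how that type is
used (readers can hold it concurrently, a writer excludes everybody,
and all of them spin rather than sleep):

#include <linux/spinlock.h>

static rwlock_t access_lock = RW_LOCK_UNLOCKED;

static int example_search(void)
{
        int found = 0;

        read_lock(&access_lock);        /* many readers at once */
        /* ... walk the hlist in the nfs_inode here ... */
        read_unlock(&access_lock);
        return found;
}

static void example_update(void)
{
        write_lock(&access_lock);       /* exclusive, spins until free */
        /* ... modify the hlist / prune the global list here ... */
        write_unlock(&access_lock);
}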


steved.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-02 14:38           ` Steve Dickson
@ 2006-05-02 14:51             ` Peter Staubach
  2006-05-02 15:26               ` [NFS] " Ian Kent
  0 siblings, 1 reply; 36+ messages in thread
From: Peter Staubach @ 2006-05-02 14:51 UTC (permalink / raw)
  To: Steve Dickson; +Cc: nfs, linux-fsdevel

Steve Dickson wrote:

> Peter Staubach wrote:
>
>>> Basically we would maintain one global hlist (i.e. link list) that
>>> would contain all of the cached entries; then each nfs_inode would
>>> have its own LRU hlist that would contain entries that are associated
>>> with that nfs_inode. So each entry would be on two lists, the
>>> global hlist and hlist in the nfs_inode.
>>>
>>
>> How are these lists used?
>
> The inode hlist will be used to search and purge...
>

Eeee!  A linear search?  That gets expensive as the list grows.  A hashed
list would keep the search times down.


>>
>> I would suggest that a global set of hash queues would work better than
>> a linked list and that these hash queues be used to find the cache entry
>> for any particular user.  Finding the entry for a particular 
>> (user,inode)
>> needs to be fast and linearly searching a linked list is slow.  Linear
>> searching needs to be avoided.  Comparing the fewest number of entries
>> possible will result in the best performance because the comparisons
>> need to take into account the entire user identification, including
>> the groups list.
>
> I guess we could have the VFS shrinker purge a hash table just
> as well as a linked list... although a hash table will have a
> small memory cost...
>

Yes, but small.  There are always space/time tradeoffs.

>> The list in the inode seems useful, but only for purges.  Searching via
>> this list will be very slow once the list grows beyond a few entries.
>> Purging needs to be fast because purging the access cache entries for a
>> particular file will need to happen whenever the ctime on the file 
>> changes.
>> This list can be used to make it easy to find the correct entries in the
>> global access cache.
>
> Seems reasonable assuming we use a hash table...
>
>>
>>> We would govern memory consumption by only allowing 30 entries
>>> on any one hlist in the nfs_inode and by registering the global
>>> hlist with the VFS shrinker, which will cause the list to be pruned
>>> when memory is needed. So this means, when the 31st entry was added
>>> to the hlist in the nfs_inode, the least recently used entry would
>>> be removed.
>>>
>>
>> Why is there a limit at all and why is 30 the right number?  This
>> seems small and rather arbitrary.  If there is some way to trigger
>> memory reclaiming, then letting the list grow as appropriate seems
>> like a good thing to do.
>
> Well the vfs mechanism will be the trigger... so you're saying we
> should just let the purge hlists in the nfs_inode grow
> untethered? How about read-only filesystems where the ctime
> will not change... I would think we might want some type of
> high water mark for that case, true?
>

Not true.  Why have a limit at all?  As long as there is memory to store
the information, why place arbitrary limits on the amount of information
stored?

As long as the memory can be reclaimed when the system needs it, then
I don't see any reason to place limits.  Whatever number that is chosen
is always the wrong number and requiring users to guess at the sizes or
take steps to tune the system, when the system could have just done the
right thing in the first place, is just wrong.

>>
>> Making sure that you are one of the original 30 users accessing the
>> file in order to get reasonable performance seems tricky to me.  :-)
>>
>>> Locking might be a bit tricky, but doable... To make this scalable,
>>> I would think we would need a global read/write spin_lock. The
>>> read_lock()
>>> would be taken when the hlist in the inode was searched and the
>>> write_lock() would be taken when the hlist in the inode was changed
>>> and when the global list was pruned.
>>>
>>
>> Sorry, read/write spin lock?  I thought that spin locks were exclusive,
>> either the lock was held or the process spins waiting to acquire it.
>
> See the rwlock_t lock type in asm/spinlock.h.. That's the one
> I was planning on using...


Hmmm.  A reader/writer lock which is busy waited for.  Is there any idea
of the costs of such locks?  This does seem like it would fit the bill
though.  It would be interesting to see what the access patterns for the
cache end up looking like, whether it is useful to separate readers from
writers in this fashion.

    Thanx...

       ps



^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-02 14:51             ` Peter Staubach
@ 2006-05-02 15:26               ` Ian Kent
  0 siblings, 0 replies; 36+ messages in thread
From: Ian Kent @ 2006-05-02 15:26 UTC (permalink / raw)
  To: Peter Staubach; +Cc: Steve Dickson, nfs, linux-fsdevel

On Tue, 2 May 2006, Peter Staubach wrote:

> Steve Dickson wrote:
> 
> > Peter Staubach wrote:
> > 
> > > > Basically we would maintain one global hlist (i.e. link list) that
> > > > would contain all of the cached entries; then each nfs_inode would
> > > > have its own LRU hlist that would contain entries that are associated
> > > > with that nfs_inode. So each entry would be on two lists, the
> > > > global hlist and hlist in the nfs_inode.
> > > > 
> > > 
> > > How are these lists used?
> > 
> > The inode hlist will be used to search and purge...
> > 
> 
> Eeee!  A linear search?  That gets expensive as the list grows.  A hashed
> list would keep the search times down.

I thought hlist was meant to be used for hash tables with chaining?

Ian


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-02  9:49       ` Steve Dickson
  2006-05-02 13:51         ` [NFS] " Peter Staubach
@ 2006-05-03  4:42         ` Chuck Lever
  2006-05-05 14:07           ` Steve Dickson
  2006-05-08  2:44           ` [NFS] " Neil Brown
  1 sibling, 2 replies; 36+ messages in thread
From: Chuck Lever @ 2006-05-03  4:42 UTC (permalink / raw)
  To: Steve Dickson; +Cc: nfs, linux-fsdevel

Steve Dickson wrote:
> Talking with Trond, he would like to do something slightly different
> which I'll outline here to make sure we are all on the same page....
> 
> Basically we would maintain one global hlist (i.e. link list) that
> would contain all of the cached entries; then each nfs_inode would
> have its own LRU hlist that would contain entries that are associated
> with that nfs_inode. So each entry would be on two lists, the
> global hlist and hlist in the nfs_inode.
> 
> We would govern memory consumption by only allowing 30 entries
> on any one hlist in the nfs_inode and by registering the global
> hlist with the VFS shrinker, which will cause the list to be pruned
> when memory is needed. So this means, when the 31st entry was added
> to the hlist in the nfs_inode, the least recently used entry would
> be removed.
> 
> Locking might be a bit tricky, but doable... To make this scalable,
> I would think we would need a global read/write spin_lock. The read_lock()
> would be taken when the hlist in the inode was searched and the
> write_lock() would be taken when the hlist in the inode was changed
> and when the global list was pruned.

For the sake of discussion, let me propose some design alternatives.

1.  We already have cache shrinkage built in: when an inode is purged 
due to cache shrinkage, the access cache for that inode is purged as 
well.  In other words, there is already a mechanism for external memory 
pressure to shrink this cache.  I don't see a strong need to complicate 
matters by adding more cache shrinkage than already exists with normal 
inode and dentry cache shrinkage.

Now you won't need to hold a global lock to serialize normal accesses 
with purging and cache garbage collection.  Eliminating global 
serialization is a Good Thing (tm).

2.  Use a radix tree per inode.  The radix tree key is a uid or gid, and 
each node in a tree stores the access mask for that {inode, uid} tuple. 
  This seems a lot simpler to implement than a dual hlist, and will 
scale automatically with a large number of uids accessing the same 
inode.  The nodes are small, and you don't need to allocate a big chunk 
of contiguous memory for a hash table.
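
A rough sketch of that scheme, just to make it concrete (the node and
helper names are invented; locking and entry expiry are omitted):

struct access_node {            /* one small node per {inode, uid} */
        struct rpc_cred *cred;
        unsigned long   jiffies;
        int             mask;
};

/* Assumes a radix_tree_root (here called access_cache) added to
 * struct nfs_inode and initialized with INIT_RADIX_TREE(). */
static int access_tree_add(struct nfs_inode *nfsi, struct rpc_cred *cred,
                           int mask)
{
        struct access_node *node = kmalloc(sizeof(*node), GFP_KERNEL);
        int error;

        if (node == NULL)
                return -ENOMEM;
        node->cred = get_rpccred(cred);
        node->jiffies = jiffies;
        node->mask = mask;
        error = radix_tree_insert(&nfsi->access_cache, cred->cr_uid, node);
        if (error) {
                put_rpccred(node->cred);
                kfree(node);
        }
        return error;
}

static struct access_node *access_tree_find(struct nfs_inode *nfsi,
                                            struct rpc_cred *cred)
{
        return radix_tree_lookup(&nfsi->access_cache, cred->cr_uid);
}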

3.  Instead of serializing by spinning, you should use a semaphore.  The 
reason for this is that when multiple processes owned by the same uid 
access the same inode concurrently, only the first process should be 
allowed to generate a real ACCESS request; otherwise they will race and 
potentially all of them could generate the same ACCESS request concurrently.

You will need to serialize on-the-wire requests with accesses to the 
cache, and such wire requests will need the waiting processes to sleep, 
not spin.
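
A sketch of what that serialization might look like with a per-inode
semaphore (the access_sem field is hypothetical; the helpers are the
existing nfs_access_get_cached()/nfs_access_add_cache()):

static int nfs_do_access_serialized(struct inode *inode,
                                    struct rpc_cred *cred, int mask)
{
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_access_entry cache;
        int status;

        down(&nfsi->access_sem);        /* sleeps; one wire call at a time */
        status = nfs_access_get_cached(inode, cred, &cache);
        if (status != 0) {
                /* cache miss: this caller does the over-the-wire ACCESS,
                 * everyone else waits on the semaphore and then hits
                 * the freshly filled cache */
                cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
                cache.cred = cred;
                cache.jiffies = jiffies;
                status = NFS_PROTO(inode)->access(inode, &cache);
                if (status == 0)
                        nfs_access_add_cache(inode, &cache);
        }
        up(&nfsi->access_sem);
        if (status != 0)
                return status;
        return (cache.mask & mask) == mask ? 0 : -EACCES;
}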

4.  You will need some mechanism for ensuring that the contents of the 
access cache are "up to date".  You will need some way of deciding when 
to revalidate each {inode, uid} tuple.  Based on what Peter said, I 
think you are going to check the inode's ctime, and purge the whole 
access cache for an inode if its ctime changes.  But you may need 
something like an nfs_revalidate_inode() before you proceed to examine 
an inode's access cache.  It might be more efficient to generate just an 
ACCESS request instead of a GETATTR followed by an ACCESS, but I don't 
see an easy way to do this given the current inode revalidation 
architecture of the client.

5.  You need to handle ESTALE.  Often, ->permission is the first thing 
the VFS will do before a lookup or open, and that is when the NFS client 
first notices that a cached file handle is stale.  Should ESTALE 
returned on an ACCESS request mean always return permission denied, or 
should it mean purge the access cache and grant access, so that the next 
VFS step sees the ESTALE and can recover appropriately?

-- 
corporate:	<cel at netapp dot com>
personal:	<chucklever at bigfoot dot com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-03  4:42         ` Chuck Lever
@ 2006-05-05 14:07           ` Steve Dickson
  2006-05-05 14:53             ` Peter Staubach
  2006-05-08  2:44           ` [NFS] " Neil Brown
  1 sibling, 1 reply; 36+ messages in thread
From: Steve Dickson @ 2006-05-05 14:07 UTC (permalink / raw)
  To: cel; +Cc: nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 1779 bytes --]

Here is the second attempt at improving the access cache. I believe
all of the concerns from the first patch have been addressed.

Chuck Lever wrote:
> 
> 1.  We already have cache shrinkage built in: when an inode is purged 
> due to cache shrinkage, the access cache for that inode is purged as 
> well.  In other words, there is already a mechanism for external memory 
> pressure to shrink this cache.  I don't see a strong need to complicate 
> matters by adding more cache shrinkage than already exists with normal 
> inode and dentry cache shrinkage.
I agree... so there is no extra code to shrink this cache.

> 
> 2.  Use a radix tree per inode.  
Using radix trees actually makes things much simpler... Good idea, imo.

> 
> 3.  Instead of serializing by spinning, you should use a semaphore. 
It turns out that spin locks are probably better for protecting this cache
because while looking up a cache entry I might have to discard the
entry, so I would have to upgrade a read semaphore to a write one, which
is a bit messy and not needed imho... But I do agree with you about
global locks, so I added the spin lock to the nfs_inode.

> 
> You will need to serialize on-the-wire requests with accesses to the 
> cache, and such wire requests will need the waiting processes to sleep, 
> not spin.
True... so I added a bit to the nfs_inode flags that will
serialize over-the-wire requests, similar to how getattrs are serialized.

> 
> 4.  You will need some mechanism for ensuring that the contents of the 
> access cache are "up to date".  
This cache is tied to the attribute cache, and when that times out
or is invalidated, the cache entry is discarded... In my testing,
this seems to work just fine.

> 
> 5.  You need to handle ESTALE. 
True... and this patch does.

Comments?

steved.


[-- Attachment #2: mynfs-2.6-accesscache-radixtree.patch --]
[-- Type: text/x-patch, Size: 9098 bytes --]

This patch improves the caching of ACCESS information by storing
the information in a radix tree. It also serializes over-the-wire
ACCESS calls. The patch greatly decreases the number of ACCESS 
calls when processes with different uids access the same directory.

Signed-off-by: Steve Dickson <steved@redhat.com>
----------------------------------------------------
--- mynfs-2.6/fs/nfs/nfs4proc.c.orig	2006-05-03 11:25:58.000000000 -0400
+++ mynfs-2.6/fs/nfs/nfs4proc.c	2006-05-04 15:17:11.000000000 -0400
@@ -785,6 +785,10 @@ static int _nfs4_do_access(struct inode 
 	status = _nfs4_proc_access(inode, &cache);
 	if (status != 0)
 		return status;
+
+	/* Zero masks are turned into a negative entry */
+	if (!cache.mask) 
+		cache.mask |= ~mask;
 	nfs_access_add_cache(inode, &cache);
 out:
 	if ((cache.mask & mask) == mask)
--- mynfs-2.6/fs/nfs/inode.c.orig	2006-05-03 11:25:58.000000000 -0400
+++ mynfs-2.6/fs/nfs/inode.c	2006-05-05 09:35:20.000000000 -0400
@@ -170,14 +170,11 @@ static void
 nfs_clear_inode(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct rpc_cred *cred;
 
 	nfs_wb_all(inode);
 	BUG_ON (!list_empty(&nfsi->open_files));
 	nfs_zap_acl_cache(inode);
-	cred = nfsi->cache_access.cred;
-	if (cred)
-		put_rpccred(cred);
+	nfs_zap_access_cache(inode);
 	BUG_ON(atomic_read(&nfsi->data_updates) != 0);
 }
 
@@ -940,7 +937,6 @@ nfs_fhget(struct super_block *sb, struct
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = jiffies;
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-		nfsi->cache_access.cred = NULL;
 
 		unlock_new_inode(inode);
 	} else
@@ -1039,7 +1035,7 @@ static int nfs_wait_schedule(void *word)
 /*
  * Wait for the inode to get unlocked.
  */
-static int nfs_wait_on_inode(struct inode *inode)
+int nfs_wait_on_inode(struct inode *inode, int bit)
 {
 	struct rpc_clnt	*clnt = NFS_CLIENT(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -1047,20 +1043,20 @@ static int nfs_wait_on_inode(struct inod
 	int error;
 
 	rpc_clnt_sigmask(clnt, &oldmask);
-	error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
+	error = wait_on_bit_lock(&nfsi->flags, bit,
 					nfs_wait_schedule, TASK_INTERRUPTIBLE);
 	rpc_clnt_sigunmask(clnt, &oldmask);
 
 	return error;
 }
 
-static void nfs_wake_up_inode(struct inode *inode)
+void nfs_wake_up_inode(struct inode *inode, int bit)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	clear_bit(NFS_INO_REVALIDATING, &nfsi->flags);
+	clear_bit(bit, &nfsi->flags);
 	smp_mb__after_clear_bit();
-	wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING);
+	wake_up_bit(&nfsi->flags, bit);
 }
 
 int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -1235,7 +1231,7 @@ __nfs_revalidate_inode(struct nfs_server
 	if (NFS_STALE(inode))
  		goto out_nowait;
 
-	status = nfs_wait_on_inode(inode);
+	status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING);
 	if (status < 0)
 		goto out;
 	if (NFS_STALE(inode)) {
@@ -1283,7 +1279,7 @@ __nfs_revalidate_inode(struct nfs_server
 		(long long)NFS_FILEID(inode));
 
  out:
-	nfs_wake_up_inode(inode);
+	nfs_wake_up_inode(inode, NFS_INO_REVALIDATING);
 
  out_nowait:
 	unlock_kernel();
@@ -2874,6 +2870,8 @@ static void init_once(void * foo, kmem_c
 		INIT_LIST_HEAD(&nfsi->commit);
 		INIT_LIST_HEAD(&nfsi->open_files);
 		INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
+		INIT_RADIX_TREE(&nfsi->access_cache, GFP_ATOMIC);
+		spin_lock_init(&nfsi->access_lock);
 		atomic_set(&nfsi->data_updates, 0);
 		nfsi->ndirty = 0;
 		nfsi->ncommit = 0;
--- mynfs-2.6/fs/nfs/dir.c.orig	2006-05-03 11:25:58.000000000 -0400
+++ mynfs-2.6/fs/nfs/dir.c	2006-05-05 09:34:47.000000000 -0400
@@ -1636,35 +1636,67 @@ out:
 	return error;
 }
 
-int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, 
+	struct nfs_access_entry *res)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_access_entry *cache = &nfsi->cache_access;
+	int invalid = (nfsi->cache_validity & NFS_INO_INVALID_ACCESS);
+	struct nfs_access_entry *cache;
+	int expired;
+
+	spin_lock(&nfsi->access_lock);
+	cache = (struct nfs_access_entry *)
+		radix_tree_lookup(&nfsi->access_cache, cred->cr_uid);
+
+	if (cache) {
+		expired = time_after(jiffies, 
+				cache->jiffies + NFS_ATTRTIMEO(inode));
+		if (expired || invalid) {
+			(void)radix_tree_delete(&nfsi->access_cache, cred->cr_uid);
+			spin_unlock(&nfsi->access_lock);
+			put_rpccred(cache->cred);
+			kfree(cache);
+			goto nolock;
+		}
+		memcpy(res, cache, sizeof(*res));
+		spin_unlock(&nfsi->access_lock);
+		return 0;
+	}
+	spin_unlock(&nfsi->access_lock);
 
-	if (cache->cred != cred
-			|| time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
-			|| (nfsi->cache_validity & NFS_INO_INVALID_ACCESS))
-		return -ENOENT;
-	memcpy(res, cache, sizeof(*res));
-	return 0;
+nolock:
+	return -ENOENT;
 }
 
 void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_access_entry *cache = &nfsi->cache_access;
-
-	if (cache->cred != set->cred) {
-		if (cache->cred)
-			put_rpccred(cache->cred);
-		cache->cred = get_rpccred(set->cred);
+	struct nfs_access_entry *cache = NULL;
+	uid_t cr_uid = set->cred->cr_uid;
+	int error;
+
+	cache = (struct nfs_access_entry *)kmalloc(sizeof(*cache), GFP_KERNEL);
+	if (!cache)
+		return;
+
+	spin_lock(&nfsi->access_lock);
+	error = radix_tree_insert(&nfsi->access_cache, cr_uid, cache);
+	if (error) {
+		spin_unlock(&nfsi->access_lock);
+		kfree(cache);
+		return;
 	}
-	/* FIXME: replace current access_cache BKL reliance with inode->i_lock */
+
+	cache->cred = get_rpccred(set->cred);
+	cache->jiffies = jiffies;
+	cache->mask = set->mask;
+	spin_unlock(&nfsi->access_lock);
+
 	spin_lock(&inode->i_lock);
 	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
 	spin_unlock(&inode->i_lock);
-	cache->jiffies = set->jiffies;
-	cache->mask = set->mask;
+
+	return;
 }
 
 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
@@ -1674,19 +1706,42 @@ static int nfs_do_access(struct inode *i
 
 	status = nfs_access_get_cached(inode, cred, &cache);
 	if (status == 0)
-		goto out;
+		goto out_nowait;
+
+	if (test_and_set_bit(NFS_INO_ACCESS, &NFS_FLAGS(inode))) {
+		status = nfs_wait_on_inode(inode, NFS_INO_ACCESS);
+		if (status < 0)
+			return status;
+
+		nfs_access_get_cached(inode, cred, &cache);
+		goto out_nowait;
+	}
 
 	/* Be clever: ask server to check for all possible rights */
 	cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
 	cache.cred = cred;
-	cache.jiffies = jiffies;
 	status = NFS_PROTO(inode)->access(inode, &cache);
-	if (status != 0)
+	if (status != 0) {
+		if (status == -ESTALE) {
+			nfs_zap_caches(inode);
+			if (!S_ISDIR(inode->i_mode))
+				set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
+		}
+		nfs_wake_up_inode(inode, NFS_INO_ACCESS);
 		return status;
+	}
+
+	/* Zero masks are turned into a negative entry */
+	if (!cache.mask) 
+		cache.mask |= ~mask;
 	nfs_access_add_cache(inode, &cache);
-out:
+
+	nfs_wake_up_inode(inode, NFS_INO_ACCESS);
+
+out_nowait:
 	if ((cache.mask & mask) == mask)
 		return 0;
+
 	return -EACCES;
 }
 
--- mynfs-2.6/include/linux/nfs_fs.h.orig	2006-05-03 11:26:19.000000000 -0400
+++ mynfs-2.6/include/linux/nfs_fs.h	2006-05-05 09:34:08.000000000 -0400
@@ -145,7 +145,8 @@ struct nfs_inode {
 	 */
 	atomic_t		data_updates;
 
-	struct nfs_access_entry	cache_access;
+	struct radix_tree_root	access_cache;
+	spinlock_t access_lock;
 #ifdef CONFIG_NFS_V3_ACL
 	struct posix_acl	*acl_access;
 	struct posix_acl	*acl_default;
@@ -199,6 +200,7 @@ struct nfs_inode {
 #define NFS_INO_REVALIDATING	(0)		/* revalidating attrs */
 #define NFS_INO_ADVISE_RDPLUS	(1)		/* advise readdirplus */
 #define NFS_INO_STALE		(2)		/* possible stale inode */
+#define NFS_INO_ACCESS		(3)		/* checking access bits */
 
 static inline struct nfs_inode *NFS_I(struct inode *inode)
 {
@@ -256,6 +258,17 @@ static inline int NFS_USE_READDIRPLUS(st
 	return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 }
 
+static inline void nfs_zap_access_cache(struct inode *inode)
+{
+	struct nfs_access_entry *cache;
+
+	cache = radix_tree_delete(&NFS_I(inode)->access_cache, inode->i_uid);
+	if (cache) {
+		put_rpccred(cache->cred);
+		kfree(cache);
+	}
+}
+
 /**
  * nfs_save_change_attribute - Returns the inode attribute change cookie
  * @inode - pointer to inode
@@ -299,6 +312,8 @@ extern int nfs_attribute_timeout(struct 
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
+extern int nfs_wait_on_inode(struct inode *inode, int bit);
+extern void nfs_wake_up_inode(struct inode *inode, int bit);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern void nfs_begin_attr_update(struct inode *);

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-05 14:07           ` Steve Dickson
@ 2006-05-05 14:53             ` Peter Staubach
  2006-05-05 14:59               ` Peter Staubach
  2006-05-06 14:35               ` [NFS] " Steve Dickson
  0 siblings, 2 replies; 36+ messages in thread
From: Peter Staubach @ 2006-05-05 14:53 UTC (permalink / raw)
  To: Steve Dickson; +Cc: cel, nfs, linux-fsdevel

Steve Dickson wrote:

> Here is the second attempt at improving the access cache. I believe
> all of the concerns from the first patch have been addressed.
>
> Chuck Lever wrote:
>
>>
>> 1.  We already have cache shrinkage built in: when an inode is purged 
>> due to cache shrinkage, the access cache for that inode is purged as 
>> well.  In other words, there is already a mechanism for external 
>> memory pressure to shrink this cache.  I don't see a strong need to 
>> complicate matters by adding more cache shrinkage than already exists 
>> with normal inode and dentry cache shrinkage.
>
> I agree... so there is no extra code to shrink this cache.
>

I think that this _may_ be okay, but since this cache is not required for
correctness, it would be nice to be able to shrink the entries from active
inodes off of it when system memory goes low enough.

>>
>> 2.  Use a radix tree per inode.  
>
> Using radix trees actually makes things much simpler... Good idea, imo.
>

It does seem like a good idea, although just using the uid is not sufficient
to identify the correct access cache entry.  The group and groups list must
also be used.  Can the radix tree implementation support this?
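
It probably could, with a little indirection. Purely as a sketch (none
of this is in the posted patch): keep the uid as the radix-tree index,
but make each slot a short list whose entries are matched on the full
credential, so two creds that share a uid (a setgid process, say)
stay distinct.

struct access_slot {                    /* invented: the radix-tree item */
        struct list_head        entries;
};

struct access_slot_entry {              /* invented: one per credential */
        struct list_head        link;
        struct rpc_cred         *cred;
        unsigned long           jiffies;
        int                     mask;
};

static struct access_slot_entry *
access_slot_find(struct access_slot *slot, struct rpc_cred *cred)
{
        struct access_slot_entry *p;

        list_for_each_entry(p, &slot->entries, link)
                if (p->cred == cred)    /* full uid/gid/groups identity */
                        return p;
        return NULL;
}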

>------------------------------------------------------------------------
>
>This patch improves the caching of ACCESS information by storing
>the information in a radix tree. It also serializes over-the-wire
>ACCESS calls. The patch greatly decreases the number of ACCESS 
>calls when processes with different uids access the same directory.
>
>  
>

Some comments intermixed below...

>Signed-off-by: Steve Dickson <steved@redhat.com>
>----------------------------------------------------
>--- mynfs-2.6/fs/nfs/nfs4proc.c.orig	2006-05-03 11:25:58.000000000 -0400
>+++ mynfs-2.6/fs/nfs/nfs4proc.c	2006-05-04 15:17:11.000000000 -0400
>@@ -785,6 +785,10 @@ static int _nfs4_do_access(struct inode 
> 	status = _nfs4_proc_access(inode, &cache);
> 	if (status != 0)
> 		return status;
>+
>+	/* Zero masks are turned into a negative entry */
>+	if (!cache.mask) 
>+		cache.mask |= ~mask;
> 	nfs_access_add_cache(inode, &cache);
> out:
> 	if ((cache.mask & mask) == mask)
>--- mynfs-2.6/fs/nfs/inode.c.orig	2006-05-03 11:25:58.000000000 -0400
>+++ mynfs-2.6/fs/nfs/inode.c	2006-05-05 09:35:20.000000000 -0400
>@@ -170,14 +170,11 @@ static void
> nfs_clear_inode(struct inode *inode)
> {
> 	struct nfs_inode *nfsi = NFS_I(inode);
>-	struct rpc_cred *cred;
> 
> 	nfs_wb_all(inode);
> 	BUG_ON (!list_empty(&nfsi->open_files));
> 	nfs_zap_acl_cache(inode);
>-	cred = nfsi->cache_access.cred;
>-	if (cred)
>-		put_rpccred(cred);
>+	nfs_zap_access_cache(inode);
> 	BUG_ON(atomic_read(&nfsi->data_updates) != 0);
> }
> 
>@@ -940,7 +937,6 @@ nfs_fhget(struct super_block *sb, struct
> 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
> 		nfsi->attrtimeo_timestamp = jiffies;
> 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
>-		nfsi->cache_access.cred = NULL;
> 
> 		unlock_new_inode(inode);
> 	} else
>@@ -1039,7 +1035,7 @@ static int nfs_wait_schedule(void *word)
> /*
>  * Wait for the inode to get unlocked.
>  */
>-static int nfs_wait_on_inode(struct inode *inode)
>+int nfs_wait_on_inode(struct inode *inode, int bit)
> {
> 	struct rpc_clnt	*clnt = NFS_CLIENT(inode);
> 	struct nfs_inode *nfsi = NFS_I(inode);
>@@ -1047,20 +1043,20 @@ static int nfs_wait_on_inode(struct inod
> 	int error;
> 
> 	rpc_clnt_sigmask(clnt, &oldmask);
>-	error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
>+	error = wait_on_bit_lock(&nfsi->flags, bit,
> 					nfs_wait_schedule, TASK_INTERRUPTIBLE);
> 	rpc_clnt_sigunmask(clnt, &oldmask);
> 
> 	return error;
> }
> 
>-static void nfs_wake_up_inode(struct inode *inode)
>+void nfs_wake_up_inode(struct inode *inode, int bit)
> {
> 	struct nfs_inode *nfsi = NFS_I(inode);
> 
>-	clear_bit(NFS_INO_REVALIDATING, &nfsi->flags);
>+	clear_bit(bit, &nfsi->flags);
> 	smp_mb__after_clear_bit();
>-	wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING);
>+	wake_up_bit(&nfsi->flags, bit);
> }
> 
> int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
>@@ -1235,7 +1231,7 @@ __nfs_revalidate_inode(struct nfs_server
> 	if (NFS_STALE(inode))
>  		goto out_nowait;
> 
>-	status = nfs_wait_on_inode(inode);
>+	status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING);
> 	if (status < 0)
> 		goto out;
> 	if (NFS_STALE(inode)) {
>@@ -1283,7 +1279,7 @@ __nfs_revalidate_inode(struct nfs_server
> 		(long long)NFS_FILEID(inode));
> 
>  out:
>-	nfs_wake_up_inode(inode);
>+	nfs_wake_up_inode(inode, NFS_INO_REVALIDATING);
> 
>  out_nowait:
> 	unlock_kernel();
>@@ -2874,6 +2870,8 @@ static void init_once(void * foo, kmem_c
> 		INIT_LIST_HEAD(&nfsi->commit);
> 		INIT_LIST_HEAD(&nfsi->open_files);
> 		INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
>+		INIT_RADIX_TREE(&nfsi->access_cache, GFP_ATOMIC);
>+		spin_lock_init(&nfsi->access_lock);
> 		atomic_set(&nfsi->data_updates, 0);
> 		nfsi->ndirty = 0;
> 		nfsi->ncommit = 0;
>--- mynfs-2.6/fs/nfs/dir.c.orig	2006-05-03 11:25:58.000000000 -0400
>+++ mynfs-2.6/fs/nfs/dir.c	2006-05-05 09:34:47.000000000 -0400
>@@ -1636,35 +1636,67 @@ out:
> 	return error;
> }
> 
>-int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
>+int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, 
>+	struct nfs_access_entry *res)
> {
> 	struct nfs_inode *nfsi = NFS_I(inode);
>-	struct nfs_access_entry *cache = &nfsi->cache_access;
>+	int invalid = (nfsi->cache_validity & NFS_INO_INVALID_ACCESS);
>+	struct nfs_access_entry *cache;
>+	int expired;
>+
>+	spin_lock(&nfsi->access_lock);
>+	cache = (struct nfs_access_entry *)
>+		radix_tree_lookup(&nfsi->access_cache, cred->cr_uid);
>+
>+	if (cache) {
>+		expired = time_after(jiffies, 
>+				cache->jiffies + NFS_ATTRTIMEO(inode));
>+		if (expired || invalid) {
>+			(void)radix_tree_delete(&nfsi->access_cache, cred->cr_uid);
>+			spin_unlock(&nfsi->access_lock);
>+			put_rpccred(cache->cred);
>+			kfree(cache);
>+			goto nolock;
>+		}
>+		memcpy(res, cache, sizeof(*res));
>+		spin_unlock(&nfsi->access_lock);
>+		return 0;
>+	}
>+	spin_unlock(&nfsi->access_lock);
> 
>-	if (cache->cred != cred
>-			|| time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
>-			|| (nfsi->cache_validity & NFS_INO_INVALID_ACCESS))
>-		return -ENOENT;
>-	memcpy(res, cache, sizeof(*res));
>-	return 0;
>+nolock:
>+	return -ENOENT;
> }
> 
> void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
> {
> 	struct nfs_inode *nfsi = NFS_I(inode);
>-	struct nfs_access_entry *cache = &nfsi->cache_access;
>-
>-	if (cache->cred != set->cred) {
>-		if (cache->cred)
>-			put_rpccred(cache->cred);
>-		cache->cred = get_rpccred(set->cred);
>+	struct nfs_access_entry *cache = NULL;
>+	uid_t cr_uid = set->cred->cr_uid;
>+	int error;
>+
>+	cache = (struct nfs_access_entry *)kmalloc(sizeof(*cache), GFP_KERNEL);
>+	if (!cache)
>+		return;
>+
>+	spin_lock(&nfsi->access_lock);
>+	error = radix_tree_insert(&nfsi->access_cache, cr_uid, cache);
>+	if (error) {
>+		spin_unlock(&nfsi->access_lock);
>+		kfree(cache);
>+		return;
> 	}
>-	/* FIXME: replace current access_cache BKL reliance with inode->i_lock */
>+
>+	cache->cred = get_rpccred(set->cred);
>+	cache->jiffies = jiffies;
>+	cache->mask = set->mask;
>+	spin_unlock(&nfsi->access_lock);
>+
> 	spin_lock(&inode->i_lock);
> 	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
> 	spin_unlock(&inode->i_lock);
>-	cache->jiffies = set->jiffies;
>-	cache->mask = set->mask;
>+
>+	return;
> }
> 
> static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
>@@ -1674,19 +1706,42 @@ static int nfs_do_access(struct inode *i
> 
> 	status = nfs_access_get_cached(inode, cred, &cache);
> 	if (status == 0)
>-		goto out;
>+		goto out_nowait;
>+
>+	if (test_and_set_bit(NFS_INO_ACCESS, &NFS_FLAGS(inode))) {
>+		status = nfs_wait_on_inode(inode, NFS_INO_ACCESS);
>+		if (status < 0)
>+			return status;
>+
>+		nfs_access_get_cached(inode, cred, &cache);
>+		goto out_nowait;
>  
>

You can't just go to out_nowait here, can you?  What happens if something
happened to the file while you were asleep?  Or an entry could be added to
the cache for some reason such as kmalloc failure?  I would suggest going
back to the top and starting the process all over again.

>+	}
> 
> 	/* Be clever: ask server to check for all possible rights */
> 	cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
> 	cache.cred = cred;
>-	cache.jiffies = jiffies;
> 	status = NFS_PROTO(inode)->access(inode, &cache);
>-	if (status != 0)
>+	if (status != 0) {
>+		if (status == -ESTALE) {
>+			nfs_zap_caches(inode);
>+			if (!S_ISDIR(inode->i_mode))
>+				set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
>+		}
>+		nfs_wake_up_inode(inode, NFS_INO_ACCESS);
> 		return status;
>+	}
>+
>+	/* Zero masks are turned into a negative entry */
>+	if (!cache.mask) 
>+		cache.mask |= ~mask;
> 	nfs_access_add_cache(inode, &cache);
>-out:
>+
>+	nfs_wake_up_inode(inode, NFS_INO_ACCESS);
>+
>+out_nowait:
> 	if ((cache.mask & mask) == mask)
> 		return 0;
>+
> 	return -EACCES;
> }
> 
>--- mynfs-2.6/include/linux/nfs_fs.h.orig	2006-05-03 11:26:19.000000000 -0400
>+++ mynfs-2.6/include/linux/nfs_fs.h	2006-05-05 09:34:08.000000000 -0400
>@@ -145,7 +145,8 @@ struct nfs_inode {
> 	 */
> 	atomic_t		data_updates;
> 
>-	struct nfs_access_entry	cache_access;
>+	struct radix_tree_root	access_cache;
>+	spinlock_t access_lock;
> #ifdef CONFIG_NFS_V3_ACL
> 	struct posix_acl	*acl_access;
> 	struct posix_acl	*acl_default;
>@@ -199,6 +200,7 @@ struct nfs_inode {
> #define NFS_INO_REVALIDATING	(0)		/* revalidating attrs */
> #define NFS_INO_ADVISE_RDPLUS	(1)		/* advise readdirplus */
> #define NFS_INO_STALE		(2)		/* possible stale inode */
>+#define NFS_INO_ACCESS		(3)		/* checking access bits */
> 
> static inline struct nfs_inode *NFS_I(struct inode *inode)
> {
>@@ -256,6 +258,17 @@ static inline int NFS_USE_READDIRPLUS(st
> 	return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
> }
> 
>+static inline void nfs_zap_access_cache(struct inode *inode)
>+{
>+	struct nfs_access_entry *cache;
>+
>+	cache = radix_tree_delete(&NFS_I(inode)->access_cache, inode->i_uid);
>  
>

Do you need to be hold access_lock here?

>+	if (cache) {
>+		put_rpccred(cache->cred);
>+		kfree(cache);
>+	}
>  
>

What happens if there was more than 1 entry in the cache?

>+}
>+
> /**
>  * nfs_save_change_attribute - Returns the inode attribute change cookie
>  * @inode - pointer to inode
>@@ -299,6 +312,8 @@ extern int nfs_attribute_timeout(struct 
> extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
> extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
> extern void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
>+extern int nfs_wait_on_inode(struct inode *inode, int bit);
>+extern void nfs_wake_up_inode(struct inode *inode, int bit);
> extern int nfs_setattr(struct dentry *, struct iattr *);
> extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
> extern void nfs_begin_attr_update(struct inode *);
>  
>

    Thanx...

       ps



^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-05 14:53             ` Peter Staubach
@ 2006-05-05 14:59               ` Peter Staubach
  2006-05-06 14:35               ` [NFS] " Steve Dickson
  1 sibling, 0 replies; 36+ messages in thread
From: Peter Staubach @ 2006-05-05 14:59 UTC (permalink / raw)
  To: Peter Staubach; +Cc: Steve Dickson, cel, nfs, linux-fsdevel

Peter Staubach wrote:

>
> You can't just go to out_nowait here, can you?  What happens if something
> happened to the file while you were asleep?  Or an entry could be 
> added to


Oops, that would be "could not be added"...

> the cache for some reason such as kmalloc failure?  I would suggest going
> back to the top and starting the process all over again.


    Sorry...

       ps



^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-05 14:53             ` Peter Staubach
  2006-05-05 14:59               ` Peter Staubach
@ 2006-05-06 14:35               ` Steve Dickson
  2006-05-08 14:07                 ` Peter Staubach
  1 sibling, 1 reply; 36+ messages in thread
From: Steve Dickson @ 2006-05-06 14:35 UTC (permalink / raw)
  To: nfs; +Cc: linux-fsdevel

Peter Staubach wrote:
>>>
>>> 2.  Use a radix tree per inode.  
>>
>>
>> Using radix trees actually makes things much simpler... Good idea, imo.
>>
> 
> It does seem like a good idea, although just using the uid is not sufficient
> to identify the correct access cache entry.  The group and groups list must
> also be used.  Can the radix tree implementation support this?
We could use (nfsi + uid) as the index... but it's not clear what that
would buy us... And the group and groups list were never part of the
cache in the first place.... is this something you feel needs to be
added or am I just not understanding....

>> static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, 
>> int mask)
>> @@ -1674,19 +1706,42 @@ static int nfs_do_access(struct inode *i
>>
>>     status = nfs_access_get_cached(inode, cred, &cache);
>>     if (status == 0)
>> -        goto out;
>> +        goto out_nowait;
>> +
>> +    if (test_and_set_bit(NFS_INO_ACCESS, &NFS_FLAGS(inode))) {
>> +        status = nfs_wait_on_inode(inode, NFS_INO_ACCESS);
>> +        if (status < 0)
>> +            return status;
>> +
>> +        nfs_access_get_cached(inode, cred, &cache);
>> +        goto out_nowait;
>>  
>>
> 
> You can't just go to out_nowait here, can you?  What happens if something
> happened to the file while you were asleep?  
Point taken... after the nfs_wait_on_inode() call, nfs_access_get_cached()
will be retried, like so:

static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
{
     struct nfs_access_entry cache;
     int status;

retry:
     status = nfs_access_get_cached(inode, cred, &cache);
     if (status == 0)
         goto out_nowait;

     if (test_and_set_bit(NFS_INO_ACCESS, &NFS_FLAGS(inode))) {
         status = nfs_wait_on_inode(inode, NFS_INO_ACCESS);
         if (status < 0)
             return status;

         goto retry;
     }



>> +static inline void nfs_zap_access_cache(struct inode *inode)
>> +{
>> +    struct nfs_access_entry *cache;
>> +
>> +    cache = radix_tree_delete(&NFS_I(inode)->access_cache, 
>> inode->i_uid);
>>  
>>
> 
> Do you need to be hold access_lock here?
Yes...

> 
>> +    if (cache) {
>> +        put_rpccred(cache->cred);
>> +        kfree(cache);
>> +    }
>>  
>>
> 
> What happens if there was more than 1 entry in the cache?
I thought that since nfs_clear_inode() is called for
each and every inode that cache would drain "naturally"
but that turns out to be a bit brain dead... :-\

The problem is there does not seem to be a radix tree interface
that will simply destroy the tree... It appears you need to know
at least the beginning index, and since the indexes in this tree are
not contiguous I'm not sure that would even help...

Anybody.... Is there a way to destroy a radix tree without
knowing any of the indexes?
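
(One way that might work, sketched on the assumption that every entry
still records the cred whose cr_uid was used as its index, is to loop
over radix_tree_gang_lookup() starting from index 0 until nothing is
left:)

static void nfs_access_cache_drain(struct nfs_inode *nfsi)
{
        struct nfs_access_entry *gang[16];
        unsigned int i, n;

        do {
                n = radix_tree_gang_lookup(&nfsi->access_cache,
                                           (void **)gang, 0, 16);
                for (i = 0; i < n; i++) {
                        radix_tree_delete(&nfsi->access_cache,
                                          gang[i]->cred->cr_uid);
                        put_rpccred(gang[i]->cred);
                        kfree(gang[i]);
                }
        } while (n > 0);
}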

steved.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-03  4:42         ` Chuck Lever
  2006-05-05 14:07           ` Steve Dickson
@ 2006-05-08  2:44           ` Neil Brown
  2006-05-08  3:23             ` Chuck Lever
  1 sibling, 1 reply; 36+ messages in thread
From: Neil Brown @ 2006-05-08  2:44 UTC (permalink / raw)
  To: cel; +Cc: Steve Dickson, nfs, linux-fsdevel

On Wednesday May 3, cel@citi.umich.edu wrote:
> 
> For the sake of discussion, let me propose some design alternatives.
> 
> 1.  We already have cache shrinkage built in: when an inode is purged 
> due to cache shrinkage, the access cache for that inode is purged as 
> well.  In other words, there is already a mechanism for external memory 
> pressure to shrink this cache.  I don't see a strong need to complicate 
> matters by adding more cache shrinkage than already exists with normal 
> inode and dentry cache shrinkage.
> 

If you have one particular file that is used regularly - and so never
falls out of cache - and is used occasionally by every single user in
your system, then that one inode could contribute to thousands of
access cache items that will never be purged.

This is why I thought that some sort of cleaning of the access cache
was important.

NeilBrown

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-08  2:44           ` [NFS] " Neil Brown
@ 2006-05-08  3:23             ` Chuck Lever
  2006-05-08  3:28               ` Neil Brown
  0 siblings, 1 reply; 36+ messages in thread
From: Chuck Lever @ 2006-05-08  3:23 UTC (permalink / raw)
  To: Neil Brown; +Cc: Steve Dickson, nfs, linux-fsdevel

Neil Brown wrote:
> On Wednesday May 3, cel@citi.umich.edu wrote:
>> For the sake of discussion, let me propose some design alternatives.
>>
>> 1.  We already have cache shrinkage built in: when an inode is purged 
>> due to cache shrinkage, the access cache for that inode is purged as 
>> well.  In other words, there is already a mechanism for external memory 
>> pressure to shrink this cache.  I don't see a strong need to complicate 
>> matters by adding more cache shrinkage than already exists with normal 
>> inode and dentry cache shrinkage.
> 
> If you have one particular file that is used regularly - and so never
> falls out of cache - and is used occasionally by every single user in
> your system, then that one inode could contribute to thousands of
> access cache items that will never be purged.

I speculate that this would not be a problem.

First, each entry in the cache is not going to be very large.  For the 
sake of easy math, let's say each one is 32 bytes.  One hundred thousand 
of these is a little more than 3MB.  Or, say we have ten thousand files 
with 10 different access cache entries: again, that's just about 3MB. 
(Naturally I'm ignoring the slab accounting overhead).

Even a single-user system these days is going to have a gigabyte or more 
of RAM, so we're talking less than a percent of memory tied up in this 
case.  Three megabytes is probably less memory than a single Gnome 
application uses for its working set.

I'm always told to start with a design that is as simple as possible 
(and no simpler) and build on it only when I find a problem; this avoids 
overdesign.  I don't see a compelling reason to start with a complicated 
design here, and there are good reasons to keep it simple.

If allowing the access cache to grow potentially without bounds still 
makes you nervous, I maintain that we still want to avoid a global LRU. 
  Having an LRU _per-inode_ might be a simple way to limit the amount of 
memory that is consumed without the locking and management overhead of a 
global LRU.  If entries haven't been accessed in more than actimeo 
seconds, then purge them; we'll have to go back to the server to 
revalidate such entries anyway, so there's no reason to keep them around.

It is pretty cheap to release, say, the two oldest entries every time 
you try an access, provided they have not been touched in actimeo seconds.
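
As a sketch (the per-inode list and its field names are hypothetical;
the timeout test mirrors the existing NFS_ATTRTIMEO() check):

static void nfs_access_prune_two(struct inode *inode)
{
        struct nfs_inode *nfsi = NFS_I(inode);
        int i;

        for (i = 0; i < 2 && !list_empty(&nfsi->access_lru); i++) {
                struct nfs_access_entry *old =
                        list_entry(nfsi->access_lru.prev,
                                   struct nfs_access_entry, lru);
                /* stop at the first entry that is still fresh */
                if (!time_after(jiffies, old->jiffies + NFS_ATTRTIMEO(inode)))
                        break;
                list_del(&old->lru);
                put_rpccred(old->cred);
                kfree(old);
        }
}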

-- 
corporate:	<cel at netapp dot com>
personal:	<chucklever at bigfoot dot com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-08  3:23             ` Chuck Lever
@ 2006-05-08  3:28               ` Neil Brown
  0 siblings, 0 replies; 36+ messages in thread
From: Neil Brown @ 2006-05-08  3:28 UTC (permalink / raw)
  To: cel; +Cc: Steve Dickson, nfs, linux-fsdevel

On Sunday May 7, cel@citi.umich.edu wrote:
> > If you have one particular file that is used regularly - and so never
> > falls out of cache - and is used occasionally by every single user in
> > your system, then that one inode could contribute to thousands of
> > access cache items that will never be purged.
> 
> I speculate that this would not be a problem.
> 
> First, each entry in the cache is not going to be very large.  For the 
> sake of easy math, let's say each one is 32 bytes.  One hundred thousand 
> of these is a little more than 3MB.  Or, say we have ten thousand files 
> with 10 different access cache entries: again, that's just about 3MB. 
> (Naturally I'm ignoring the slab accounting overhead).
> 
> Even a single-user system these days is going to have a gigabyte or more 
> of RAM, so we're talking less than a percent of memory tied up in this 
> case.  Three megabytes is probably less memory than a single Gnome 
> application uses for its working set.

:-)

> 
> I'm always told to start with a design that is as simple as possible 
> (and no simpler) and build on it only when I find a problem; this avoids 
> overdesign.  I don't see a compelling reason to start with a complicated 
> design here, and there are good reasons to keep it simple.

You are probably right.

> 
> If allowing the access cache to grow potentially without bounds still 
> makes you nervous, I maintain that we still want to avoid a global LRU. 
>   Having an LRU _per-inode_ might be a simple way to limit the amount of 
> memory that is consumed without the locking and management overhead of a 
> global LRU.  If entries haven't been accessed in more than actimeo 
> seconds, then purge them; we'll have to go back to the server to 
> revalidate such entries anyway, so there's no reason to keep them around.
> 
> It is pretty cheap to release, say, the two oldest entries every time 
> you try an access, provided they have not been touched in actimeo seconds.

If you want simple, a per-inode LRU which is searched linearly and
discards old entries as you suggest would probably be just right.
Leave the option of a more sophisticated data structure for later, if it
really seems necessary.

NeilBrown

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-06 14:35               ` [NFS] " Steve Dickson
@ 2006-05-08 14:07                 ` Peter Staubach
  2006-05-08 17:09                   ` Trond Myklebust
  0 siblings, 1 reply; 36+ messages in thread
From: Peter Staubach @ 2006-05-08 14:07 UTC (permalink / raw)
  To: Steve Dickson; +Cc: nfs, linux-fsdevel

Steve Dickson wrote:

> Peter Staubach wrote:
>
>>>>
>>>> 2.  Use a radix tree per inode.  
>>>
>>>
>>>
>>> Using radix trees actually makes things much simpler... Good idea, imo.
>>>
>>
>> It does seem like a good idea, although just using the uid is not 
>> sufficient
>> to identify the correct access cache entry.  The group and groups 
>> list must
>> also be used.  Can the radix tree implementation support this?
>
> We could use (nfsi + uid) as the index... but it's not clear what that
> would buy us... And the group and groups list were never part of the
> cache in the first place.... is this something you feel needs to be
> added or am I just not understanding.... 


Yes, I believe that the entire user identification, uid, gid, groups list,
needs to be used to identify the correct cache entry.  An easy example of
differing access rights, but with the same uid, is an application which is
setgid.

I believe that the "key" to the cache entry should be the entire user
identification that the NFS server sees and uses when calculating access
rights.  Using the uid as part of a hash key is okay and probably a good
idea, but I don't think that the uid is sufficient to completely identify
a specific cache entry.

Given this, then I suspect that the current access cache is broken...

    Thanx...

       ps

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-08 14:07                 ` Peter Staubach
@ 2006-05-08 17:09                   ` Trond Myklebust
  2006-05-08 17:20                     ` Peter Staubach
  2006-05-08 17:37                     ` Steve Dickson
  0 siblings, 2 replies; 36+ messages in thread
From: Trond Myklebust @ 2006-05-08 17:09 UTC (permalink / raw)
  To: Peter Staubach; +Cc: Steve Dickson, nfs, linux-fsdevel

On Mon, 2006-05-08 at 10:07 -0400, Peter Staubach wrote:
> Steve Dickson wrote:
> 
> > Peter Staubach wrote:
> >
> >>>>
> >>>> 2.  Use a radix tree per inode.  
> >>>
> >>>
> >>>
> >>> Using radix trees actually makes things much simpler... Good idea, imo.
> >>>
> >>
> >> It does seem like a good idea, although just using the uid is not 
> >> sufficient
> >> to identify the correct access cache entry.  The group and groups 
> >> list must
> >> also be used.  Can the radix tree implementation support this?
> >
> > We could use (nfsi + uid) as the index... but it's not clear what that
> > would buy us... And the group and groups list were never part of the
> > cache in the first place.... is this something you feel needs to be
> > added or am I just not understanding.... 
> 
> 
> Yes, I believe that the entire user identification, uid, gid, groups list,
> needs to be used to identify the correct cache entry.  An easy example of
> differing access rights, but with the same uid, is an application which is
> setgid.
> 
> I believe that the "key" to the cache entry should be the entire user
> identification that the NFS server sees and uses when calculating access
> rights.  Using the uid as part of a hash key is okay and probably a good
> idea, but I don't think that the uid is sufficient to completely identify
> a specific cache entry.
> 
> Given this, then I suspect that the current access cache is broken...

No it is not: it uses the full RPC cred as the index.

Cheers,
  Trond
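
A minimal sketch of the scheme Trond describes, and one the posted hash-table
patch can follow: the uid only selects a bucket, while a hit requires the same
inode and the same rpc_cred, which already encodes uid, gid, and groups. The
names here (acc_entry, acc_lookup) are hypothetical, and locking is omitted:

#include <linux/fs.h>
#include <linux/list.h>
#include <linux/sunrpc/auth.h>	/* struct rpc_cred */

#define ACC_HASH_BITS	8
#define ACC_HASH_SIZE	(1 << ACC_HASH_BITS)
#define ACC_HASH_MASK	(ACC_HASH_SIZE - 1)

struct acc_entry {
	struct list_head	hash_link;
	struct inode		*inode;
	struct rpc_cred		*cred;	/* full identity: uid, gid, groups */
	int			mask;	/* cached ACCESS bits */
};

static struct list_head acc_hashtbl[ACC_HASH_SIZE];

/* The uid is only used to pick a bucket cheaply... */
static unsigned int acc_hash(struct inode *inode, uid_t uid)
{
	return ((unsigned long)inode + uid) & ACC_HASH_MASK;
}

/*
 * ...while a hit requires the same inode and the same rpc_cred.  Creds
 * are shared, refcounted objects that already distinguish uid, gid and
 * groups, so comparing the pointer compares the full identity.
 */
static struct acc_entry *acc_lookup(struct inode *inode, struct rpc_cred *cred)
{
	struct acc_entry *p;

	list_for_each_entry(p, &acc_hashtbl[acc_hash(inode, cred->cr_uid)], hash_link)
		if (p->inode == inode && p->cred == cred)
			return p;
	return NULL;
}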


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [NFS] Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-08 17:09                   ` Trond Myklebust
@ 2006-05-08 17:20                     ` Peter Staubach
  2006-05-08 17:37                     ` Steve Dickson
  1 sibling, 0 replies; 36+ messages in thread
From: Peter Staubach @ 2006-05-08 17:20 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Steve Dickson, nfs, linux-fsdevel

Trond Myklebust wrote:

>>
>>I believe that the "key" to the cache entry should be the entire user
>>identification that the NFS server sees and uses when calculating access
>>rights.  Using the uid as part of a hash key is okay and probably a good
>>idea, but I don't think that the uid is sufficient to completely identify
>>a specific cache entry.
>>
>>Given this, I suspect that the current access cache is broken...
>>    
>>
>
>No it is not: it uses the full RPC cred as the index.
>

Cool!

       ps

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: Re: [PATCH][RFC] NFS: Improving the access cache
  2006-05-08 17:09                   ` Trond Myklebust
  2006-05-08 17:20                     ` Peter Staubach
@ 2006-05-08 17:37                     ` Steve Dickson
  1 sibling, 0 replies; 36+ messages in thread
From: Steve Dickson @ 2006-05-08 17:37 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Peter Staubach, nfs



Trond Myklebust wrote:
> No it is not: it uses the full RPC cred as the index.
I concur... I just wrote a test program and things
work as expected...

steved.
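
Not Steve's actual program, but a hypothetical userspace check along the same
lines: with a file on the NFS mount that only one group may read, run the check
twice as the same uid but under different gids (needs root so the ids can be
switched). If the cache distinguished only the uid, the second result could
wrongly repeat the first; keyed on the full credential, the two results differ
as expected:

#include <grp.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* Run one access(2) check as the given uid/gid in a child process. */
static int check_as(uid_t uid, gid_t gid, const char *path)
{
	pid_t pid = fork();
	int status;

	if (pid < 0)
		return -1;
	if (pid == 0) {
		/* same uid each time; only the group identity changes */
		if (setgroups(0, NULL) < 0 || setgid(gid) < 0 || setuid(uid) < 0)
			_exit(2);
		_exit(access(path, R_OK) == 0 ? 0 : 1);
	}
	if (waitpid(pid, &status, 0) < 0 || !WIFEXITED(status))
		return -1;
	return WEXITSTATUS(status);
}

int main(int argc, char **argv)
{
	if (argc != 5) {
		fprintf(stderr, "usage: %s <nfs-file> <uid> <gid-with-access> <gid-without>\n",
			argv[0]);
		return 1;
	}

	printf("gid %s: %s\n", argv[3],
	       check_as(atoi(argv[2]), atoi(argv[3]), argv[1]) == 0 ?
	       "access granted" : "access denied");
	printf("gid %s: %s\n", argv[4],
	       check_as(atoi(argv[2]), atoi(argv[4]), argv[1]) == 0 ?
	       "access granted" : "access denied");
	return 0;
}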



^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2006-05-08 23:15 UTC | newest]

Thread overview: 36+ messages
2006-04-26  1:14 [PATCH][RFC] NFS: Improving the access cache Steve Dickson
2006-04-26  1:31 ` Matthew Wilcox
2006-04-26  4:55 ` Neil Brown
2006-04-26 14:51   ` Steve Dickson
2006-04-26 22:32     ` Neil Brown
2006-05-02  9:49       ` Steve Dickson
2006-05-02 13:51         ` [NFS] " Peter Staubach
2006-05-02 14:38           ` Steve Dickson
2006-05-02 14:51             ` Peter Staubach
2006-05-02 15:26               ` [NFS] " Ian Kent
2006-05-03  4:42         ` Chuck Lever
2006-05-05 14:07           ` Steve Dickson
2006-05-05 14:53             ` Peter Staubach
2006-05-05 14:59               ` Peter Staubach
2006-05-06 14:35               ` [NFS] " Steve Dickson
2006-05-08 14:07                 ` Peter Staubach
2006-05-08 17:09                   ` Trond Myklebust
2006-05-08 17:20                     ` Peter Staubach
2006-05-08 17:37                     ` Steve Dickson
2006-05-08  2:44           ` [NFS] " Neil Brown
2006-05-08  3:23             ` Chuck Lever
2006-05-08  3:28               ` Neil Brown
2006-04-26 13:03 ` Trond Myklebust
2006-04-26 13:03   ` Trond Myklebust
2006-04-26 13:14   ` Peter Staubach
2006-04-26 13:14     ` Peter Staubach
2006-04-26 14:01     ` Trond Myklebust
2006-04-26 14:01       ` Trond Myklebust
2006-04-26 14:15       ` Peter Staubach
2006-04-26 14:15         ` Peter Staubach
2006-04-26 15:44         ` Trond Myklebust
2006-04-26 17:01           ` Peter Staubach
2006-04-26 15:03   ` Steve Dickson
2006-04-26 15:03     ` Steve Dickson
2006-04-26 13:17 ` [NFS] " Chuck Lever
2006-04-26 14:19   ` Steve Dickson
