All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] dcache scalability patch (2.4.17)
@ 2002-07-12 14:09 Maneesh Soni
  2002-07-12 14:13 ` [Lse-tech] " Christoph Hellwig
  2002-07-12 14:29 ` Alexander Viro
  0 siblings, 2 replies; 18+ messages in thread
From: Maneesh Soni @ 2002-07-12 14:09 UTC (permalink / raw)
  To: LKML; +Cc: lse-tech, Al Viro

Here is the dcache scalability patch (cleaned up) as discussed in 
the previous post to lkml by Dipankar. The patch uses RCU for doing fast
dcache lookup. It also does lazy updates to lru list of dentries to
avoid doing write operations while doing lookup.

Following changes were done in this version

o Removed the d_next_hash hack. Dentries are unhashed using list_del 
  instead of list_del_init, and that too using d_drop interface  
o Used d_drop_locked and d_unhashed instead of directly manipulating hash list 
  using list macros 
o Changed DCACHE_UNLINKED to DCACHE_UNHASHED 

Regards,
Maneesh

diff -urN linux-2.4.17-base/fs/autofs4/root.c linux-2.4.17-dc8/fs/autofs4/root.c
--- linux-2.4.17-base/fs/autofs4/root.c	Tue Oct 24 10:27:38 2000
+++ linux-2.4.17-dc8/fs/autofs4/root.c	Fri Jul 12 10:59:38 2002
@@ -403,7 +403,7 @@
 		spin_unlock(&dcache_lock);
 		return -ENOTEMPTY;
 	}
-	list_del_init(&dentry->d_hash);
+	d_drop_locked(dentry);
 	spin_unlock(&dcache_lock);
 
 	dput(ino->dentry);
diff -urN linux-2.4.17-base/fs/dcache.c linux-2.4.17-dc8/fs/dcache.c
--- linux-2.4.17-base/fs/dcache.c	Fri Dec 21 23:11:55 2001
+++ linux-2.4.17-dc8/fs/dcache.c	Fri Jul 12 16:18:39 2002
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 
 #include <asm/uaccess.h>
+#include <linux/rcupdate.h>
 
 #define DCACHE_PARANOIA 1
 /* #define DCACHE_DEBUG 1 */
@@ -55,14 +56,21 @@
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {0, 0, 45, 0,};
 
+static void d_callback(void *arg)
+{
+	struct dentry * dentry = (struct dentry *)arg;
+
+	if (dname_external(dentry)) 
+		kfree((void *) dentry->d_name.name);
+	kmem_cache_free(dentry_cache, dentry); 
+}
+
 /* no dcache_lock, please */
 static inline void d_free(struct dentry *dentry)
 {
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
-	if (dname_external(dentry)) 
-		kfree(dentry->d_name.name);
-	kmem_cache_free(dentry_cache, dentry); 
+	call_rcu(&dentry->d_rcu, d_callback, dentry);
 	dentry_stat.nr_dentry--;
 }
 
@@ -124,9 +132,13 @@
 	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
 		return;
 
-	/* dput on a free dentry? */
-	if (!list_empty(&dentry->d_lru))
-		BUG();
+	spin_lock(&dentry->d_lock);
+        if (atomic_read(&dentry->d_count)) {
+                spin_unlock(&dentry->d_lock);
+                spin_unlock(&dcache_lock);
+                return;
+        }
+
 	/*
 	 * AV: ->d_delete() is _NOT_ allowed to block now.
 	 */
@@ -135,18 +147,28 @@
 			goto unhash_it;
 	}
 	/* Unreachable? Get rid of it */
-	if (list_empty(&dentry->d_hash))
+	if (d_unhashed(dentry))
 		goto kill_it;
-	list_add(&dentry->d_lru, &dentry_unused);
-	dentry_stat.nr_unused++;
+
+	if (list_empty(&dentry->d_lru)) {
+		dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
+		list_add(&dentry->d_lru, &dentry_unused);
+		dentry_stat.nr_unused++;
+	}
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	return;
 
 unhash_it:
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
 
 kill_it: {
 		struct dentry *parent;
+		spin_unlock(&dentry->d_lock);
+		if (!list_empty(&dentry->d_lru)) {
+			list_del(&dentry->d_lru);
+			dentry_stat.nr_unused--;
+		}
 		list_del(&dentry->d_child);
 		/* drops the lock, at that point nobody can reach this dentry */
 		dentry_iput(dentry);
@@ -177,7 +199,7 @@
 	 * If it's already been dropped, return OK.
 	 */
 	spin_lock(&dcache_lock);
-	if (list_empty(&dentry->d_hash)) {
+	if (d_unhashed(dentry)) {
 		spin_unlock(&dcache_lock);
 		return 0;
 	}
@@ -201,15 +223,18 @@
 	 * we might still populate it if it was a
 	 * working directory or similar).
 	 */
+	spin_lock(&dentry->d_lock);
 	if (atomic_read(&dentry->d_count) > 1) {
 		if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
 			return -EBUSY;
 		}
 	}
-
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
+
 	return 0;
 }
 
@@ -217,11 +242,14 @@
 
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
+	spin_lock(&dentry->d_lock);
 	atomic_inc(&dentry->d_count);
+	dentry->d_vfs_flags |= DCACHE_REFERENCED;
 	if (atomic_read(&dentry->d_count) == 1) {
 		dentry_stat.nr_unused--;
 		list_del_init(&dentry->d_lru);
 	}
+	spin_unlock(&dentry->d_lock);
 	return dentry;
 }
 
@@ -252,8 +280,8 @@
 		tmp = next;
 		next = tmp->next;
 		alias = list_entry(tmp, struct dentry, d_alias);
-		if (!list_empty(&alias->d_hash)) {
-			__dget_locked(alias);
+		if (!d_unhashed(alias)) {
+			dget(alias);
 			spin_unlock(&dcache_lock);
 			return alias;
 		}
@@ -263,7 +291,7 @@
 }
 
 /*
- *	Try to kill dentries associated with this inode.
+ *     Try to kill dentries associated with this inode.
  * WARNING: you must own a reference to inode.
  */
 void d_prune_aliases(struct inode *inode)
@@ -274,13 +302,16 @@
 	tmp = head;
 	while ((tmp = tmp->next) != head) {
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_alias);
+		spin_lock(&dentry->d_lock);
 		if (!atomic_read(&dentry->d_count)) {
-			__dget_locked(dentry);
+			__dget(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
-			d_drop(dentry);
 			dput(dentry);
 			goto restart;
 		}
+		spin_unlock(&dentry->d_lock);
 	}
 	spin_unlock(&dcache_lock);
 }
@@ -295,7 +326,8 @@
 {
 	struct dentry * parent;
 
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	list_del(&dentry->d_child);
 	dentry_iput(dentry);
 	parent = dentry->d_parent;
@@ -330,19 +362,20 @@
 		if (tmp == &dentry_unused)
 			break;
 		list_del_init(tmp);
+		dentry_stat.nr_unused--;
 		dentry = list_entry(tmp, struct dentry, d_lru);
 
+		spin_lock(&dentry->d_lock);
 		/* If the dentry was recently referenced, don't free it. */
 		if (dentry->d_vfs_flags & DCACHE_REFERENCED) {
 			dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
-			list_add(&dentry->d_lru, &dentry_unused);
+			if (!atomic_read(&dentry->d_count)) {
+				list_add(&dentry->d_lru, &dentry_unused);
+				dentry_stat.nr_unused++;
+			}
+			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		dentry_stat.nr_unused--;
-
-		/* Unused dentry with a count? */
-		if (atomic_read(&dentry->d_count))
-			BUG();
 
 		prune_one_dentry(dentry);
 		if (!--count)
@@ -405,10 +438,13 @@
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
-		if (atomic_read(&dentry->d_count))
-			continue;
+		spin_lock(&dentry->d_lock);
 		dentry_stat.nr_unused--;
 		list_del_init(tmp);
+		if (atomic_read(&dentry->d_count)) {
+			spin_unlock(&dentry->d_lock);
+			continue;
+		}
 		prune_one_dentry(dentry);
 		goto repeat;
 	}
@@ -488,11 +524,13 @@
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
 		next = tmp->next;
+		list_del_init(&dentry->d_lru);
+		spin_lock(&dentry->d_lock);
 		if (!atomic_read(&dentry->d_count)) {
-			list_del(&dentry->d_lru);
 			list_add(&dentry->d_lru, dentry_unused.prev);
 			found++;
 		}
+		spin_unlock(&dentry->d_lock);
 		/*
 		 * Descend a level if the d_subdirs list is non-empty.
 		 */
@@ -606,8 +644,9 @@
 	str[name->len] = 0;
 
 	atomic_set(&dentry->d_count, 1);
-	dentry->d_vfs_flags = 0;
+	dentry->d_vfs_flags = DCACHE_UNHASHED;
 	dentry->d_flags = 0;
+	dentry->d_lock = SPIN_LOCK_UNLOCKED;
 	dentry->d_inode = NULL;
 	dentry->d_parent = NULL;
 	dentry->d_sb = NULL;
@@ -708,8 +747,9 @@
 	const unsigned char *str = name->name;
 	struct list_head *head = d_hash(parent,hash);
 	struct list_head *tmp;
+	struct dentry * found = NULL;
 
-	spin_lock(&dcache_lock);
+	/* rcu_read_lock(); for pre-emptible kernel */
 	tmp = head->next;
 	for (;;) {
 		struct dentry * dentry = list_entry(tmp, struct dentry, d_hash);
@@ -729,13 +769,16 @@
 			if (memcmp(dentry->d_name.name, str, len))
 				continue;
 		}
-		__dget_locked(dentry);
-		dentry->d_vfs_flags |= DCACHE_REFERENCED;
-		spin_unlock(&dcache_lock);
-		return dentry;
+		spin_lock(&dentry->d_lock);
+		if (!(dentry->d_vfs_flags & DCACHE_UNHASHED)) {
+			found = __dget(dentry);
+		}
+		spin_unlock(&dentry->d_lock);
+		/* rcu_read_unlock(); for pre-emptible kernel */
+		return found;
 	}
-	spin_unlock(&dcache_lock);
-	return NULL;
+	/* rcu_read_unlock(); for pre-emptible kernel */
+	return found;
 }
 
 /**
@@ -774,7 +817,7 @@
 	lhp = base = d_hash(dparent, dentry->d_name.hash);
 	while ((lhp = lhp->next) != base) {
 		if (dentry == list_entry(lhp, struct dentry, d_hash)) {
-			__dget_locked(dentry);
+			dget(dentry);
 			spin_unlock(&dcache_lock);
 			return 1;
 		}
@@ -834,9 +877,12 @@
 void d_rehash(struct dentry * entry)
 {
 	struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
-	if (!list_empty(&entry->d_hash)) BUG();
 	spin_lock(&dcache_lock);
+	spin_lock(&entry->d_lock);
+	if (!list_empty(&entry->d_hash) && !d_unhashed(entry)) BUG();
 	list_add(&entry->d_hash, list);
+	entry->d_vfs_flags &= ~DCACHE_UNHASHED;
+	spin_unlock(&entry->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
@@ -909,7 +955,7 @@
 	list_add(&dentry->d_hash, &target->d_hash);
 
 	/* Unhash the target: dput() will then get rid of it */
-	list_del_init(&target->d_hash);
+	d_drop_locked(target);
 
 	list_del(&dentry->d_child);
 	list_del(&target->d_child);
@@ -951,7 +997,7 @@
 
 	*--end = '\0';
 	buflen--;
-	if (!IS_ROOT(dentry) && list_empty(&dentry->d_hash)) {
+	if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
 		buflen -= 10;
 		end -= 10;
 		memcpy(end, " (deleted)", 10);
@@ -1034,7 +1080,7 @@
 	error = -ENOENT;
 	/* Has the current directory has been unlinked? */
 	spin_lock(&dcache_lock);
-	if (pwd->d_parent == pwd || !list_empty(&pwd->d_hash)) {
+	if (pwd->d_parent == pwd || !d_unhashed(pwd)) {
 		unsigned long len;
 		char * cwd;
 
diff -urN linux-2.4.17-base/fs/intermezzo/journal.c linux-2.4.17-dc8/fs/intermezzo/journal.c
--- linux-2.4.17-base/fs/intermezzo/journal.c	Fri Dec 21 23:11:55 2001
+++ linux-2.4.17-dc8/fs/intermezzo/journal.c	Mon Jul  8 16:18:43 2002
@@ -186,7 +186,7 @@
 
         *--end = '\0';
         buflen--;
-        if (dentry->d_parent != dentry && list_empty(&dentry->d_hash)) {
+        if (dentry->d_parent != dentry && d_unhashed(dentry)) {
                 buflen -= 10;
                 end -= 10;
                 memcpy(end, " (deleted)", 10);
diff -urN linux-2.4.17-base/fs/nfsd/nfsfh.c linux-2.4.17-dc8/fs/nfsd/nfsfh.c
--- linux-2.4.17-base/fs/nfsd/nfsfh.c	Thu Oct  4 11:29:22 2001
+++ linux-2.4.17-dc8/fs/nfsd/nfsfh.c	Mon Jul  8 16:18:43 2002
@@ -348,7 +348,7 @@
 	spin_lock(&dcache_lock);
 	list_for_each(lp, &child->d_inode->i_dentry) {
 		struct dentry *tmp = list_entry(lp,struct dentry, d_alias);
-		if (!list_empty(&tmp->d_hash) &&
+		if (!d_unhashed(tmp) &&
 		    tmp->d_parent == parent) {
 			child = dget_locked(tmp);
 			spin_unlock(&dcache_lock);
diff -urN linux-2.4.17-base/fs/readdir.c linux-2.4.17-dc8/fs/readdir.c
--- linux-2.4.17-base/fs/readdir.c	Mon Aug 13 03:29:08 2001
+++ linux-2.4.17-dc8/fs/readdir.c	Mon Jul  8 16:18:43 2002
@@ -79,7 +79,7 @@
 			while(1) {
 				struct dentry *de = list_entry(list, struct dentry, d_child);
 
-				if (!list_empty(&de->d_hash) && de->d_inode) {
+  				if (!d_unhashed(de) && de->d_inode) {
 					spin_unlock(&dcache_lock);
 					if (filldir(dirent, de->d_name.name, de->d_name.len, filp->f_pos, de->d_inode->i_ino, DT_UNKNOWN) < 0)
 						break;
diff -urN linux-2.4.17-base/include/linux/dcache.h linux-2.4.17-dc8/include/linux/dcache.h
--- linux-2.4.17-base/include/linux/dcache.h	Fri Nov 23 01:16:18 2001
+++ linux-2.4.17-dc8/include/linux/dcache.h	Fri Jul 12 10:59:09 2002
@@ -5,6 +5,8 @@
 
 #include <asm/atomic.h>
 #include <linux/mount.h>
+#include <linux/rcupdate.h>
+#include <asm/system.h>
 
 /*
  * linux/include/linux/dcache.h
@@ -65,11 +67,13 @@
 
 struct dentry {
 	atomic_t d_count;
+	spinlock_t d_lock;		/* lock for d_count & d_vfs_flags */
+	unsigned long d_vfs_flags;	/* moved here to be on same cacheline */
 	unsigned int d_flags;
 	struct inode  * d_inode;	/* Where the name belongs to - NULL is negative */
 	struct dentry * d_parent;	/* parent directory */
 	struct list_head d_hash;	/* lookup hash list */
-	struct list_head d_lru;		/* d_count = 0 LRU list */
+	struct list_head d_lru;		/* LRU list */
 	struct list_head d_child;	/* child of parent list */
 	struct list_head d_subdirs;	/* our children */
 	struct list_head d_alias;	/* inode alias list */
@@ -78,8 +82,8 @@
 	unsigned long d_time;		/* used by d_revalidate */
 	struct dentry_operations  *d_op;
 	struct super_block * d_sb;	/* The root of the dentry tree */
-	unsigned long d_vfs_flags;
 	void * d_fsdata;		/* fs-specific data */
+	struct rcu_head d_rcu;
 	unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
 };
 
@@ -123,10 +127,23 @@
 					 * s_nfsd_free_path semaphore will be down
 					 */
 #define DCACHE_REFERENCED	0x0008  /* Recently used, don't discard. */
+#define DCACHE_UNHASHED		0x0010	
 
 extern spinlock_t dcache_lock;
 
 /**
+ *	d_unhashed -	is dentry hashed
+ *	@dentry: entry to check
+ *
+ *	Returns true if the dentry passed is not currently hashed.
+ */
+ 
+static __inline__ int d_unhashed(struct dentry *dentry)
+{
+	return (dentry->d_vfs_flags & DCACHE_UNHASHED);
+}
+
+/**
  * d_drop - drop a dentry
  * @dentry: dentry to drop
  *
@@ -143,14 +160,32 @@
  * timeouts or autofs deletes).
  */
 
+static __inline__ void __d_drop(struct dentry * dentry)
+{
+	if (!d_unhashed(dentry)) {
+		dentry->d_vfs_flags |= DCACHE_UNHASHED;
+		wmb();
+		list_del(&dentry->d_hash);
+		wmb();
+	}
+}
+
 static __inline__ void d_drop(struct dentry * dentry)
 {
 	spin_lock(&dcache_lock);
-	list_del(&dentry->d_hash);
-	INIT_LIST_HEAD(&dentry->d_hash);
+	spin_lock(&dentry->d_lock);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
+static __inline__ void d_drop_locked(struct dentry * dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
 static __inline__ int dname_external(struct dentry *d)
 {
 	return d->d_name.name != d->d_iname; 
@@ -243,27 +278,25 @@
 static __inline__ struct dentry * dget(struct dentry *dentry)
 {
 	if (dentry) {
-		if (!atomic_read(&dentry->d_count))
-			BUG();
+		spin_lock(&dentry->d_lock);
 		atomic_inc(&dentry->d_count);
+		dentry->d_vfs_flags |= DCACHE_REFERENCED;
+		spin_unlock(&dentry->d_lock);
 	}
 	return dentry;
 }
 
-extern struct dentry * dget_locked(struct dentry *);
-
-/**
- *	d_unhashed -	is dentry hashed
- *	@dentry: entry to check
- *
- *	Returns true if the dentry passed is not currently hashed.
- */
- 
-static __inline__ int d_unhashed(struct dentry *dentry)
+static __inline__ struct dentry * __dget(struct dentry *dentry)
 {
-	return list_empty(&dentry->d_hash);
+	if (dentry) {
+		atomic_inc(&dentry->d_count);
+		dentry->d_vfs_flags |= DCACHE_REFERENCED;
+	}
+	return dentry;
 }
 
+extern struct dentry * dget_locked(struct dentry *);
+
 extern void dput(struct dentry *);
 
 static __inline__ int d_mountpoint(struct dentry *dentry)

-- 
Maneesh Soni
IBM Linux Technology Center, 
IBM India Software Lab, Bangalore.
Phone: +91-80-5044999 email: maneesh@in.ibm.com
http://lse.sourceforge.net/locking/rcupdate.html

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] [RFC] dcache scalability patch (2.4.17)
  2002-07-12 14:09 [RFC] dcache scalability patch (2.4.17) Maneesh Soni
@ 2002-07-12 14:13 ` Christoph Hellwig
  2002-07-12 16:20   ` Dipankar Sarma
  2002-07-15  8:10   ` Maneesh Soni
  2002-07-12 14:29 ` Alexander Viro
  1 sibling, 2 replies; 18+ messages in thread
From: Christoph Hellwig @ 2002-07-12 14:13 UTC (permalink / raw)
  To: Maneesh Soni; +Cc: LKML, lse-tech, Al Viro

> diff -urN linux-2.4.17-base/fs/dcache.c linux-2.4.17-dc8/fs/dcache.c
> --- linux-2.4.17-base/fs/dcache.c	Fri Dec 21 23:11:55 2001
> +++ linux-2.4.17-dc8/fs/dcache.c	Fri Jul 12 16:18:39 2002
> @@ -25,6 +25,7 @@
>  #include <linux/module.h>
>  
>  #include <asm/uaccess.h>
> +#include <linux/rcupdate.h>

Please try to include <linux/*.h> before <asm/*.h> headers.


> +static void d_callback(void *arg)
> +{
> +	struct dentry * dentry = (struct dentry *)arg;
> +
> +	if (dname_external(dentry)) 
> +		kfree((void *) dentry->d_name.name);
> +	kmem_cache_free(dentry_cache, dentry); 
> +}

why do you cast to void * before calling kfree?

> -	/* dput on a free dentry? */
> -	if (!list_empty(&dentry->d_lru))
> -		BUG();
> +	spin_lock(&dentry->d_lock);
> +        if (atomic_read(&dentry->d_count)) {
> +                spin_unlock(&dentry->d_lock);
> +                spin_unlock(&dcache_lock);
> +                return;
> +        }
> +

Please use tabs instead of eight spaces in kernel code.

Another implementation detail is whether we shouldn't spin on a bit of
->d_vfs_flags instead of increasing struct dentry further.  Maybe the
spin_lock_bit interface that wli prototypes might be a good choice.

Else the patch looks fine to me, although I'm wondering why you target 2.4.17

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-12 14:09 [RFC] dcache scalability patch (2.4.17) Maneesh Soni
  2002-07-12 14:13 ` [Lse-tech] " Christoph Hellwig
@ 2002-07-12 14:29 ` Alexander Viro
  2002-07-12 16:10   ` [Lse-tech] " Dipankar Sarma
                     ` (3 more replies)
  1 sibling, 4 replies; 18+ messages in thread
From: Alexander Viro @ 2002-07-12 14:29 UTC (permalink / raw)
  To: Maneesh Soni; +Cc: LKML, lse-tech



On Fri, 12 Jul 2002, Maneesh Soni wrote:

> Here is the dcache scalability patch (cleaned up) as disscussed in 
> the previous post to lkml by Dipankar. The patch uses RCU for doing fast
> dcache lookup. It also does lazy updates to lru list of dentries to
> avoid doing write operations while doing lookup.

Where is
	* version for 2.5.<current>
	* analysis of benefits in real-world situations for 2.5 version?

Patch adds complexity and unless you can show that it gives significant
benefits outside of pathological situations, it's not going in.

Note: measurements on 2.4 do not make sense; reduction of cacheline
bouncing between 2.4 and 2.5 will change the results anyway and
if any of these patches are going to be applied to 2.4, reduction of
cacheline bouncing on ->d_count is going to go in before that one.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-12 14:29 ` Alexander Viro
@ 2002-07-12 16:10   ` Dipankar Sarma
  2002-07-12 18:02     ` Dipankar Sarma
  2002-07-12 17:35   ` Paul Menage
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 18+ messages in thread
From: Dipankar Sarma @ 2002-07-12 16:10 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Maneesh Soni, LKML, lse-tech

On Fri, Jul 12, 2002 at 10:29:53AM -0400, Alexander Viro wrote:
> On Fri, 12 Jul 2002, Maneesh Soni wrote:
> 
> > Here is the dcache scalability patch (cleaned up) as disscussed in 
> > the previous post to lkml by Dipankar. The patch uses RCU for doing fast
> > dcache lookup. It also does lazy updates to lru list of dentries to
> > avoid doing write operations while doing lookup.
> 
> Where is
> 	* version for 2.5.<current>
> 	* analysis of benefits in real-world situations for 2.5 version?

I know that 2.5 patches are available, but Maneesh will probably
respond to this on Monday.

I am working on getting 2.5 measurements done. BTW, would you consider
specweb99 reasonably real-world ? If not, do you have any suggestions 
for benchmarks ? I suspect that dbench wouldn't cut it ;-).

> 
> Patch adds complexity and unless you can show that it gives significant
> benefits outside of pathological situations, it's not going in.

Fair enough.

> 
> Note: measurements on 2.4 do not make sense; reduction of cacheline
> bouncing between 2.4 and 2.5 will change the results anyway and

Quite possible. Our performance measurements have been far
behind and we are catching up now. You may expect 2.5 numbers soon.

> if any of these patches are going to be applied to 2.4, reduction of
> cacheline bouncing on ->d_count is going to go in before that one.

That is an issue we need to work on. We can do some cache event
profiling to understand the extent of the d_count cacheline bouncing.
At the same time, it seems that the dcache_lock cacheline is also
bouncing around and it is probably more shared than the dentries 
for / or /usr. One thing for sure - RCU based lookup of dcache
makes it difficult to optimize on dget()s. We will have to figure
out a way to do this.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] [RFC] dcache scalability patch (2.4.17)
  2002-07-12 14:13 ` [Lse-tech] " Christoph Hellwig
@ 2002-07-12 16:20   ` Dipankar Sarma
  2002-07-15  8:10   ` Maneesh Soni
  1 sibling, 0 replies; 18+ messages in thread
From: Dipankar Sarma @ 2002-07-12 16:20 UTC (permalink / raw)
  To: Christoph Hellwig, Maneesh Soni, LKML, lse-tech, Al Viro

On Fri, Jul 12, 2002 at 03:13:22PM +0100, Christoph Hellwig wrote:
> 
> Else the patch looks fine to me, although I'm wondering why you target 2.4.17

Just to clarify from the project standpoint - we are *not* targeting
2.4.X. 2.4.X is what was used for ongoing performance measurement
work and we had to hop on to that bandwagon. It was just a proof
of concept.

We will publish the 2.5 stuff soon.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-12 14:29 ` Alexander Viro
  2002-07-12 16:10   ` [Lse-tech] " Dipankar Sarma
@ 2002-07-12 17:35   ` Paul Menage
  2002-07-13  8:52     ` Alexander Viro
  2002-07-12 22:50   ` Hanna Linder
  2002-07-15  7:49   ` Maneesh Soni
  3 siblings, 1 reply; 18+ messages in thread
From: Paul Menage @ 2002-07-12 17:35 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Maneesh Soni, LKML, lse-tech, pmenage

>
>Note: measurements on 2.4 do not make sense; reduction of cacheline
>bouncing between 2.4 and 2.5 will change the results anyway and
>if any of these patches are going to be applied to 2.4, reduction of
>cacheline bouncing on ->d_count is going to go in before that one.

I think there are some other possibilities for cache-bounce removal in
struct dentry. The most obvious one is d_vfs_flags - not only does it
get written to on every d_lookup (to set DCACHE_REFERENCED) but it also
shares a cache line (on Pentium IV) with d_op, d_iname and part of
d_name (along with d_sb and d_fsdata, but these don't seem to be so
crucial).

Some quick stats gathering suggested that DCACHE_REFERENCED is already
set 95%-98% of the time, so this cache bounce is not even doing anything
useful. I submitted this patch a while ago making the DCACHE_REFERENCED
bit setting be conditional on it not being already set, which didn't
generate any interest. One problem with this patch would be the
additional branch prediction misses (on some architectures?) that would
work against the benefits of not dirtying a cache line. 

Maybe we should have a function definition something like the following:

static __inline__ void __ensure_bit_set(int nr, volatile unsigned long * addr) 
{
#if defined (CONFIG_BIG_SMP) || defined(ARCH_HAVE_PREDICATED_WRITE)
	if(!test_bit(nr, addr))
#endif
		set_bit(nr, addr);
}	

so that architectures that support conditional writes (arm and ia64?) and 
for SMP systems with enough processors that cache-bouncing is an issue, 
the test can be performed, and for others where the branch prediction 
miss would hurt us more than the cache dirtying it would just do it 
unconditionally.
 
--- linux-2.5.13/fs/dcache.c	Thu May  2 17:22:42 2002
+++ linux-2.5.13-nodref/fs/dcache.c	Sat May  4 16:36:38 2002
@@ -883,7 +883,8 @@
 			if (memcmp(dentry->d_name.name, str, len))
 				continue;
 		}
-		dentry->d_vfs_flags |= DCACHE_REFERENCED;
+		if(!(dentry->d_vfs_flags & DCACHE_REFERENCED))
+			dentry->d_vfs_flags |= DCACHE_REFERENCED;
 		return dentry;
 	}
 	return NULL;


Perhaps another solution is to rearrange struct dentry - put the
volatile stuff in one cache line (or set of lines), and the constant
stuff in another. So probably d_count, d_lru and d_vfs_flags
would be in the volatile line, and most other stuff in the the other.

It would probably make sense to ensure that d_name and d_iname share a
cache line if possible, even on smaller cache-line architectures, and
maybe also d_hash and d_parent on that same line.

Paul




^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-12 16:10   ` [Lse-tech] " Dipankar Sarma
@ 2002-07-12 18:02     ` Dipankar Sarma
  2002-07-12 18:10       ` Hanna Linder
  0 siblings, 1 reply; 18+ messages in thread
From: Dipankar Sarma @ 2002-07-12 18:02 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Maneesh Soni, LKML, lse-tech

On Fri, Jul 12, 2002 at 09:40:08PM +0530, Dipankar Sarma wrote:
> On Fri, Jul 12, 2002 at 10:29:53AM -0400, Alexander Viro wrote:
> > Where is
> > 	* version for 2.5.<current>
> > 	* analysis of benefits in real-world situations for 2.5 version?
> 
> I know that 2.5 patches are available, but Maneesh will probably
> respond to this on Monday.
> 
> I am working on getting 2.5 measurements done. BTW, would you consider
> specweb99 reasonably real-world ? If not, do you have any suggestions 
> for benchmarks ? I suspect that dbench wouldn't cut it ;-).

Hi Al,

Mark Hahn made a good point over private email that real-worldness
also includes hardware. I will dig around and see if we can work out
a setup with different dual/4CPU hardware to do webserver benchmarks 
and analyze results for 2.5 patches.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-12 18:02     ` Dipankar Sarma
@ 2002-07-12 18:10       ` Hanna Linder
  0 siblings, 0 replies; 18+ messages in thread
From: Hanna Linder @ 2002-07-12 18:10 UTC (permalink / raw)
  To: dipankar, Alexander Viro; +Cc: Maneesh Soni, LKML, lse-tech

--On Friday, July 12, 2002 23:32:05 +0530 Dipankar Sarma <dipankar@in.ibm.com> wrote:

> 
> Hi Al,
> 
> Mark Hahn made a good point over private email that real-worldness
> also includes hardware. I will dig around and see if we can work out
> a setup with different dual/4CPU hardware to do webserver benchmarks 
> and analyze results for 2.5 patches.
> 

Dipankar,

	I just loaded a 2-way PII 400Mhz with 256Meg of ram which
is a lot more real world than most the other systems we test. It
is on the net (behind the ibm firewall of course) so you can use
it if you want. 

Hanna


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-12 14:29 ` Alexander Viro
  2002-07-12 16:10   ` [Lse-tech] " Dipankar Sarma
  2002-07-12 17:35   ` Paul Menage
@ 2002-07-12 22:50   ` Hanna Linder
  2002-07-15  7:49   ` Maneesh Soni
  3 siblings, 0 replies; 18+ messages in thread
From: Hanna Linder @ 2002-07-12 22:50 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Maneesh Soni, LKML, lse-tech

--On Friday, July 12, 2002 10:29:53 -0400 Alexander Viro <viro@math.psu.edu> wrote:
> 
> 
> On Fri, 12 Jul 2002, Maneesh Soni wrote:
> 
>> Here is the dcache scalability patch (cleaned up) as disscussed in 
>> the previous post to lkml by Dipankar. The patch uses RCU for doing fast
>> dcache lookup. It also does lazy updates to lru list of dentries to
>> avoid doing write operations while doing lookup.
> 
> Where is
> 	* version for 2.5.<current>
> 	* analysis of benefits in real-world situations for 2.5 version?
> 
> Patch adds complexity and unless you can show that it gives significant
> benefits outside of pathological situations, it's not going in.


Here are the slides where I presented, among other things, some 
performance results of fastwalk compared to using rcu with lazy 
updating of the d_lru list. The results are similar to what Dipankar 
just published but there are a few more data points.

http://lse.sf.net/locking

Thanks.

Hanna



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-12 17:35   ` Paul Menage
@ 2002-07-13  8:52     ` Alexander Viro
  2002-07-13 17:25       ` Paul Menage
  2002-07-15  0:10       ` Linus Torvalds
  0 siblings, 2 replies; 18+ messages in thread
From: Alexander Viro @ 2002-07-13  8:52 UTC (permalink / raw)
  To: Paul Menage; +Cc: Linus Torvalds, Maneesh Soni, LKML, lse-tech



On Fri, 12 Jul 2002, Paul Menage wrote:

> >Note: measurements on 2.4 do not make sense; reduction of cacheline
> >bouncing between 2.4 and 2.5 will change the results anyway and
> >if any of these patches are going to be applied to 2.4, reduction of
> >cacheline bouncing on ->d_count is going to go in before that one.
> 
> I think there are some other possibilities for cache-bounce removal in
> struct dentry. The most obvious one is d_vfs_flags - not only does it
> get written to on every d_lookup (to set DCACHE_REFERENCED) but it also
> shares a cache line (on Pentium IV) with d_op, d_iname and part of
> d_name (along with d_sb and d_fsdata, but these don't seem to be so
> crucial).
> 
> Some quick stats gathering suggested that DCACHE_REFERENCED is already
> set 95%-98% of the time, so this cache bounce is not even doing anything
> useful. I submitted this patch a while ago making the DCACHE_REFERENCED
> bit setting be conditional on it not being already set, which didn't
> generate any interest. One problem with this patch would be the
> additional branch prediction misses (on some architectures?) that would
> work against the benefits of not dirtying a cache line. 

Frankly, I'd rather moved setting DCACHE_REFERENCED to dput().  We don't
care for that bit for dentries with positive ->d_count.

So I'd just do

vi fs/dcache.c -c '/|= DCACHE_R/d|/nr_un/pu|<|x'

and be done with that.  Linus?


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-13  8:52     ` Alexander Viro
@ 2002-07-13 17:25       ` Paul Menage
  2002-07-13 17:33         ` Alexander Viro
  2002-07-15  0:10       ` Linus Torvalds
  1 sibling, 1 reply; 18+ messages in thread
From: Paul Menage @ 2002-07-13 17:25 UTC (permalink / raw)
  To: Alexander Viro; +Cc: LKML, lse-tech, pmenage

>
>Frankly, I'd rather moved setting DCACHE_REFERENCED to dput().  We don't
>care for that bit for dentries with positive ->d_count.
>
>So I'd just do
>
>vi fs/dcache.c -c '/|= DCACHE_R/d|/nr_un/pu|<|x'
>
>and be done with that.  Linus?
>

Some possibly minor issues with that: 

- accessing foo/../bar, won't mark foo as referenced, even though it
might be being referenced frequently. Probably not a common case for foo
to be accessed exclusively in this way, but it could be fixed by marking
a dentry referenced when following ".."

- currently, negative dentries start off unreferenced and get marked
referenced the second and subsequent time that they're used. This would
change to starting off referenced (by the ref count set in lock_nd()
after the ->lookup()) but then not being marked referenced ever again,
as they're always looked at under dcache_lock, and no count is taken on
them. So used-once negative dentries would hang around longer, and
frequently-used negative dentries would be cleaned up sooner.

- referenced bit will be set possibly long after the reference is 
actually taken/used, which might make dentry pruning a little less 
accurate.

I was considering suggesting moving the reference bit setting to
unlock_nd(), since that's another place where we're already changing
d_count, but that still has the first two problems that I mentioned.
Either way, moving d_vfsflags to the same cacheline as d_count would
probably be a good idea.

Paul


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-13 17:25       ` Paul Menage
@ 2002-07-13 17:33         ` Alexander Viro
  2002-07-13 17:51           ` Paul Menage
  2002-07-13 17:54           ` Alexander Viro
  0 siblings, 2 replies; 18+ messages in thread
From: Alexander Viro @ 2002-07-13 17:33 UTC (permalink / raw)
  To: Paul Menage; +Cc: LKML, lse-tech



On Sat, 13 Jul 2002, Paul Menage wrote:

> - accessing foo/../bar, won't mark foo as referenced, even though it
> might be being referenced frequently. Probably not a common case for foo
> to be accessed exclusively in this way, but it could be fixed by marking
> a dentry referenced when following ".."

It certainly will.  Look - until ->d_count hits zero referenced bit is
not touched or looked at.  At all.

Look at the code.  There is _no_ aging for dentries with positive ->d_count.
They start aging only when after they enter unused_list...


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-13 17:33         ` Alexander Viro
@ 2002-07-13 17:51           ` Paul Menage
  2002-07-13 17:54           ` Alexander Viro
  1 sibling, 0 replies; 18+ messages in thread
From: Paul Menage @ 2002-07-13 17:51 UTC (permalink / raw)
  To: Alexander Viro; +Cc: LKML, lse-tech, pmenage

>
>
>On Sat, 13 Jul 2002, Paul Menage wrote:
>
>> - accessing foo/../bar, won't mark foo as referenced, even though it
>> might be being referenced frequently. Probably not a common case for foo
>> to be accessed exclusively in this way, but it could be fixed by marking
>> a dentry referenced when following ".."
>
>It certainly will.  Look - until ->d_count hits zero referenced bit is
>not touched or looked at.  At all.
>
>Look at the code.  There is _no_ aging for dentries with positive ->d_count.
>They start aging only when after they enter unused_list...
>

Yes, but with the fastwalk lookup, accessing foo/../bar won't do a
dget() for foo (assuming it was cached), and hence will never do a
dput() on it either. So if the only references to foo are as foo/../bar
then its d_count will stay zero and it will never be marked referenced.
(Or am I missing something?)

Paul


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-13 17:33         ` Alexander Viro
  2002-07-13 17:51           ` Paul Menage
@ 2002-07-13 17:54           ` Alexander Viro
  1 sibling, 0 replies; 18+ messages in thread
From: Alexander Viro @ 2002-07-13 17:54 UTC (permalink / raw)
  To: Paul Menage; +Cc: LKML, lse-tech



On Sat, 13 Jul 2002, Alexander Viro wrote:

> 
> 
> On Sat, 13 Jul 2002, Paul Menage wrote:
> 
> > - accessing foo/../bar, won't mark foo as referenced, even though it
> > might be being referenced frequently. Probably not a common case for foo
> > to be accessed exclusively in this way, but it could be fixed by marking
> > a dentry referenced when following ".."
> 
> It certainly will.  Look - until ->d_count hits zero referenced bit is
> not touched or looked at.  At all.
> 
> Look at the code.  There is _no_ aging for dentries with positive ->d_count.
> They start aging only when after they enter unused_list...

	On the second thought, I should apply that advice myself.  It's true
that they start aging only after they hit the unused list, but they are born
old.  Hrm...

	OK, it kills that variant and it really looks like there's no way
around dirtying dentry first time we find it on d_lookup().  Which means
that we probably want to put that |= under if - without any ifdefs...


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-13  8:52     ` Alexander Viro
  2002-07-13 17:25       ` Paul Menage
@ 2002-07-15  0:10       ` Linus Torvalds
  1 sibling, 0 replies; 18+ messages in thread
From: Linus Torvalds @ 2002-07-15  0:10 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Paul Menage, Maneesh Soni, LKML, lse-tech



On Sat, 13 Jul 2002, Alexander Viro wrote:
>
> So I'd just do
>
> vi fs/dcache.c -c '/|= DCACHE_R/d|/nr_un/pu|<|x'
>
> and be done with that.  Linus?

Done.

For future reference - don't anybody else try to send patches as vi
scripts, please. Yes, it's manly, but let's face it, so is bungee-jumping
with the cord tied to your testicles.

		Linus


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] dcache scalability patch (2.4.17)
  2002-07-12 14:29 ` Alexander Viro
                     ` (2 preceding siblings ...)
  2002-07-12 22:50   ` Hanna Linder
@ 2002-07-15  7:49   ` Maneesh Soni
  3 siblings, 0 replies; 18+ messages in thread
From: Maneesh Soni @ 2002-07-15  7:49 UTC (permalink / raw)
  To: Alexander Viro; +Cc: LKML, lse-tech

On Fri, Jul 12, 2002 at 10:29:53AM -0400, Alexander Viro wrote:
> 
> 
> On Fri, 12 Jul 2002, Maneesh Soni wrote:
> 
> > Here is the dcache scalability patch (cleaned up) as disscussed in 
> > the previous post to lkml by Dipankar. The patch uses RCU for doing fast
> > dcache lookup. It also does lazy updates to lru list of dentries to
> > avoid doing write operations while doing lookup.
> 
> Where is
> 	* version for 2.5.<current>
> 	* analysis of benefits in real-world situations for 2.5 version?
> 
> Patch adds complexity and unless you can show that it gives significant
> benefits outside of pathological situations, it's not going in.
> 
> Note: measurements on 2.4 do not make sense; reduction of cacheline
> bouncing between 2.4 and 2.5 will change the results anyway and
> if any of these patches are going to be applied to 2.4, reduction of
> cacheline bouncing on ->d_count is going to go in before that one.

Hi Viro,

The 2.4 tests we did, also had the fastwalk patch ported from 2.5. Though
fastwalk has performed better (throughput improved by 1%) than base 2.4,
I think because of increased hold and wait times on dcache_lock, the results
are not as good as using only dcache_rcu.
	http://marc.theaimsgroup.com/?l=linux-kernel&m=102645767914212&w=2

For 2.5 I tried to merge fastwalk and dcache_rcu but the two do not seem
to complement each other. fastwalk takes dcache_lock much earlier than
d_lookup. 

On 2.5 we get better results from dcache_rcu when we remove
fastwalk and put the dcache code back to 2.5.10ish level of code. Probably
this is not the correct way as it involves lots of code changes and we need
to workout some other way. 

There are some numbers from 2.5.20-base vs dcache_rcu(no fastwalk) done by 
Anton Blanchard with dbench.
	http://samba.org/~anton/linux/2.5.20/

Regards,
Maneesh

-- 
Maneesh Soni
IBM Linux Technology Center, 
IBM India Software Lab, Bangalore.
Phone: +91-80-5044999 email: maneesh@in.ibm.com
http://lse.sourceforge.net/locking/rcupdate.html

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] [RFC] dcache scalability patch (2.4.17)
  2002-07-12 14:13 ` [Lse-tech] " Christoph Hellwig
  2002-07-12 16:20   ` Dipankar Sarma
@ 2002-07-15  8:10   ` Maneesh Soni
  2002-07-15  8:25     ` Maneesh Soni
  1 sibling, 1 reply; 18+ messages in thread
From: Maneesh Soni @ 2002-07-15  8:10 UTC (permalink / raw)
  To: Christoph Hellwig, LKML, lse-tech, Al Viro

On Fri, Jul 12, 2002 at 03:13:22PM +0100, Christoph Hellwig wrote:
> > diff -urN linux-2.4.17-base/fs/dcache.c linux-2.4.17-dc8/fs/dcache.c
> > --- linux-2.4.17-base/fs/dcache.c	Fri Dec 21 23:11:55 2001
> > +++ linux-2.4.17-dc8/fs/dcache.c	Fri Jul 12 16:18:39 2002
> > @@ -25,6 +25,7 @@
> >  #include <linux/module.h>
> >  
> >  #include <asm/uaccess.h>
> > +#include <linux/rcupdate.h>
> 
> Please try to include <linux/*.h> before <asm/*.h> headers.
Ok.. no problem.

> 
> > +static void d_callback(void *arg)
> > +{
> > +	struct dentry * dentry = (struct dentry *)arg;
> > +
> > +	if (dname_external(dentry)) 
> > +		kfree((void *) dentry->d_name.name);
> > +	kmem_cache_free(dentry_cache, dentry); 
> > +}
> 
> why do you cast to void * before calling kfree?
I think this got left over because of some intermediate trials.. I have
removed this now.

> 
> > -	/* dput on a free dentry? */
> > -	if (!list_empty(&dentry->d_lru))
> > -		BUG();
> > +	spin_lock(&dentry->d_lock);
> > +        if (atomic_read(&dentry->d_count)) {
> > +                spin_unlock(&dentry->d_lock);
> > +                spin_unlock(&dcache_lock);
> > +                return;
> > +        }
> > +
> 
> Please use tabs instead of eight spaces in kernel code.
Sure..
 
> Another implementation details is whether we shouldn't spin on a bit of
> ->d_vfs_flags instead of increasing struct dentry further.  Maybe the
> spin_lock_bit interface that wli prototypes might be a godd choise.
I have not seen this but it should not be impossible.

> Else the patch looks fine to me, although I'm wondering why you target 2.4.17

Thanks for the review. I corrected the above two problems.. new patch is as
below. As Dipankar said the 2.4 version can be taken just as proof of concept and 
for discussion.

Regards,
Maneesh


diff -urN linux-2.4.17-base/fs/autofs4/root.c linux-2.4.17-dc8/fs/autofs4/root.c
--- linux-2.4.17-base/fs/autofs4/root.c	Tue Oct 24 10:27:38 2000
+++ linux-2.4.17-dc8/fs/autofs4/root.c	Fri Jul 12 10:59:38 2002
@@ -403,7 +403,7 @@
 		spin_unlock(&dcache_lock);
 		return -ENOTEMPTY;
 	}
-	list_del_init(&dentry->d_hash);
+	d_drop_locked(dentry);
 	spin_unlock(&dcache_lock);
 
 	dput(ino->dentry);
diff -urN linux-2.4.17-base/fs/dcache.c linux-2.4.17-dc8/fs/dcache.c
--- linux-2.4.17-base/fs/dcache.c	Fri Dec 21 23:11:55 2001
+++ linux-2.4.17-dc8/fs/dcache.c	Mon Jul 15 13:20:29 2002
@@ -23,6 +23,7 @@
 #include <linux/smp_lock.h>
 #include <linux/cache.h>
 #include <linux/module.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 
@@ -55,14 +56,21 @@
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {0, 0, 45, 0,};
 
+static void d_callback(void *arg)
+{
+	struct dentry * dentry = (struct dentry *)arg;
+
+	if (dname_external(dentry)) 
+		kfree(dentry->d_name.name);
+	kmem_cache_free(dentry_cache, dentry); 
+}
+
 /* no dcache_lock, please */
 static inline void d_free(struct dentry *dentry)
 {
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
-	if (dname_external(dentry)) 
-		kfree(dentry->d_name.name);
-	kmem_cache_free(dentry_cache, dentry); 
+	call_rcu(&dentry->d_rcu, d_callback, dentry);
 	dentry_stat.nr_dentry--;
 }
 
@@ -124,9 +132,13 @@
 	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
 		return;
 
-	/* dput on a free dentry? */
-	if (!list_empty(&dentry->d_lru))
-		BUG();
+	spin_lock(&dentry->d_lock);
+	if (atomic_read(&dentry->d_count)) {
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&dcache_lock);
+		return;
+	}
+
 	/*
 	 * AV: ->d_delete() is _NOT_ allowed to block now.
 	 */
@@ -135,18 +147,28 @@
 			goto unhash_it;
 	}
 	/* Unreachable? Get rid of it */
-	if (list_empty(&dentry->d_hash))
+	if (d_unhashed(dentry))
 		goto kill_it;
-	list_add(&dentry->d_lru, &dentry_unused);
-	dentry_stat.nr_unused++;
+
+	if (list_empty(&dentry->d_lru)) {
+		dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
+		list_add(&dentry->d_lru, &dentry_unused);
+		dentry_stat.nr_unused++;
+	}
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	return;
 
 unhash_it:
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
 
 kill_it: {
 		struct dentry *parent;
+		spin_unlock(&dentry->d_lock);
+		if (!list_empty(&dentry->d_lru)) {
+			list_del(&dentry->d_lru);
+			dentry_stat.nr_unused--;
+		}
 		list_del(&dentry->d_child);
 		/* drops the lock, at that point nobody can reach this dentry */
 		dentry_iput(dentry);
@@ -177,7 +199,7 @@
 	 * If it's already been dropped, return OK.
 	 */
 	spin_lock(&dcache_lock);
-	if (list_empty(&dentry->d_hash)) {
+	if (d_unhashed(dentry)) {
 		spin_unlock(&dcache_lock);
 		return 0;
 	}
@@ -201,15 +223,18 @@
 	 * we might still populate it if it was a
 	 * working directory or similar).
 	 */
+	spin_lock(&dentry->d_lock);
 	if (atomic_read(&dentry->d_count) > 1) {
 		if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
 			return -EBUSY;
 		}
 	}
-
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
+
 	return 0;
 }
 
@@ -217,11 +242,14 @@
 
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
+	spin_lock(&dentry->d_lock);
 	atomic_inc(&dentry->d_count);
+	dentry->d_vfs_flags |= DCACHE_REFERENCED;
 	if (atomic_read(&dentry->d_count) == 1) {
 		dentry_stat.nr_unused--;
 		list_del_init(&dentry->d_lru);
 	}
+	spin_unlock(&dentry->d_lock);
 	return dentry;
 }
 
@@ -252,8 +280,8 @@
 		tmp = next;
 		next = tmp->next;
 		alias = list_entry(tmp, struct dentry, d_alias);
-		if (!list_empty(&alias->d_hash)) {
-			__dget_locked(alias);
+		if (!d_unhashed(alias)) {
+			dget(alias);
 			spin_unlock(&dcache_lock);
 			return alias;
 		}
@@ -263,7 +291,7 @@
 }
 
 /*
- *	Try to kill dentries associated with this inode.
+ *     Try to kill dentries associated with this inode.
  * WARNING: you must own a reference to inode.
  */
 void d_prune_aliases(struct inode *inode)
@@ -274,13 +302,16 @@
 	tmp = head;
 	while ((tmp = tmp->next) != head) {
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_alias);
+		spin_lock(&dentry->d_lock);
 		if (!atomic_read(&dentry->d_count)) {
-			__dget_locked(dentry);
+			__dget(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
-			d_drop(dentry);
 			dput(dentry);
 			goto restart;
 		}
+		spin_unlock(&dentry->d_lock);
 	}
 	spin_unlock(&dcache_lock);
 }
@@ -295,7 +326,8 @@
 {
 	struct dentry * parent;
 
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	list_del(&dentry->d_child);
 	dentry_iput(dentry);
 	parent = dentry->d_parent;
@@ -330,19 +362,20 @@
 		if (tmp == &dentry_unused)
 			break;
 		list_del_init(tmp);
+		dentry_stat.nr_unused--;
 		dentry = list_entry(tmp, struct dentry, d_lru);
 
+		spin_lock(&dentry->d_lock);
 		/* If the dentry was recently referenced, don't free it. */
 		if (dentry->d_vfs_flags & DCACHE_REFERENCED) {
 			dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
-			list_add(&dentry->d_lru, &dentry_unused);
+			if (!atomic_read(&dentry->d_count)) {
+				list_add(&dentry->d_lru, &dentry_unused);
+				dentry_stat.nr_unused++;
+			}
+			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		dentry_stat.nr_unused--;
-
-		/* Unused dentry with a count? */
-		if (atomic_read(&dentry->d_count))
-			BUG();
 
 		prune_one_dentry(dentry);
 		if (!--count)
@@ -405,10 +438,13 @@
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
-		if (atomic_read(&dentry->d_count))
-			continue;
+		spin_lock(&dentry->d_lock);
 		dentry_stat.nr_unused--;
 		list_del_init(tmp);
+		if (atomic_read(&dentry->d_count)) {
+			spin_unlock(&dentry->d_lock);
+			continue;
+		}
 		prune_one_dentry(dentry);
 		goto repeat;
 	}
@@ -488,11 +524,13 @@
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
 		next = tmp->next;
+		list_del_init(&dentry->d_lru);
+		spin_lock(&dentry->d_lock);
 		if (!atomic_read(&dentry->d_count)) {
-			list_del(&dentry->d_lru);
 			list_add(&dentry->d_lru, dentry_unused.prev);
 			found++;
 		}
+		spin_unlock(&dentry->d_lock);
 		/*
 		 * Descend a level if the d_subdirs list is non-empty.
 		 */
@@ -606,8 +644,9 @@
 	str[name->len] = 0;
 
 	atomic_set(&dentry->d_count, 1);
-	dentry->d_vfs_flags = 0;
+	dentry->d_vfs_flags = DCACHE_UNHASHED;
 	dentry->d_flags = 0;
+	dentry->d_lock = SPIN_LOCK_UNLOCKED;
 	dentry->d_inode = NULL;
 	dentry->d_parent = NULL;
 	dentry->d_sb = NULL;
@@ -709,7 +748,7 @@
 	struct list_head *head = d_hash(parent,hash);
 	struct list_head *tmp;
 
-	spin_lock(&dcache_lock);
+	/* rcu_read_lock(); for pre-emptible kernel */
 	tmp = head->next;
 	for (;;) {
 		struct dentry * dentry = list_entry(tmp, struct dentry, d_hash);
@@ -729,12 +768,15 @@
 			if (memcmp(dentry->d_name.name, str, len))
 				continue;
 		}
-		__dget_locked(dentry);
-		dentry->d_vfs_flags |= DCACHE_REFERENCED;
-		spin_unlock(&dcache_lock);
+		spin_lock(&dentry->d_lock);
+		if (!(dentry->d_vfs_flags & DCACHE_UNHASHED)) {
+			dentry = __dget(dentry);
+		}
+		spin_unlock(&dentry->d_lock);
+		/* rcu_read_unlock(); for pre-emptible kernel */
 		return dentry;
 	}
-	spin_unlock(&dcache_lock);
+	/* rcu_read_unlock(); for pre-emptible kernel */
 	return NULL;
 }
 
@@ -774,7 +816,7 @@
 	lhp = base = d_hash(dparent, dentry->d_name.hash);
 	while ((lhp = lhp->next) != base) {
 		if (dentry == list_entry(lhp, struct dentry, d_hash)) {
-			__dget_locked(dentry);
+			dget(dentry);
 			spin_unlock(&dcache_lock);
 			return 1;
 		}
@@ -834,9 +876,12 @@
 void d_rehash(struct dentry * entry)
 {
 	struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
-	if (!list_empty(&entry->d_hash)) BUG();
 	spin_lock(&dcache_lock);
+	spin_lock(&entry->d_lock);
+	if (!list_empty(&entry->d_hash) && !d_unhashed(entry)) BUG();
 	list_add(&entry->d_hash, list);
+	entry->d_vfs_flags &= ~DCACHE_UNHASHED;
+	spin_unlock(&entry->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
@@ -909,7 +954,7 @@
 	list_add(&dentry->d_hash, &target->d_hash);
 
 	/* Unhash the target: dput() will then get rid of it */
-	list_del_init(&target->d_hash);
+	d_drop_locked(target);
 
 	list_del(&dentry->d_child);
 	list_del(&target->d_child);
@@ -951,7 +996,7 @@
 
 	*--end = '\0';
 	buflen--;
-	if (!IS_ROOT(dentry) && list_empty(&dentry->d_hash)) {
+	if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
 		buflen -= 10;
 		end -= 10;
 		memcpy(end, " (deleted)", 10);
@@ -1034,7 +1079,7 @@
 	error = -ENOENT;
 	/* Has the current directory has been unlinked? */
 	spin_lock(&dcache_lock);
-	if (pwd->d_parent == pwd || !list_empty(&pwd->d_hash)) {
+	if (pwd->d_parent == pwd || !d_unhashed(pwd)) {
 		unsigned long len;
 		char * cwd;
 
diff -urN linux-2.4.17-base/fs/intermezzo/journal.c linux-2.4.17-dc8/fs/intermezzo/journal.c
--- linux-2.4.17-base/fs/intermezzo/journal.c	Fri Dec 21 23:11:55 2001
+++ linux-2.4.17-dc8/fs/intermezzo/journal.c	Mon Jul  8 16:18:43 2002
@@ -186,7 +186,7 @@
 
         *--end = '\0';
         buflen--;
-        if (dentry->d_parent != dentry && list_empty(&dentry->d_hash)) {
+        if (dentry->d_parent != dentry && d_unhashed(dentry)) {
                 buflen -= 10;
                 end -= 10;
                 memcpy(end, " (deleted)", 10);
diff -urN linux-2.4.17-base/fs/nfsd/nfsfh.c linux-2.4.17-dc8/fs/nfsd/nfsfh.c
--- linux-2.4.17-base/fs/nfsd/nfsfh.c	Thu Oct  4 11:29:22 2001
+++ linux-2.4.17-dc8/fs/nfsd/nfsfh.c	Mon Jul  8 16:18:43 2002
@@ -348,7 +348,7 @@
 	spin_lock(&dcache_lock);
 	list_for_each(lp, &child->d_inode->i_dentry) {
 		struct dentry *tmp = list_entry(lp,struct dentry, d_alias);
-		if (!list_empty(&tmp->d_hash) &&
+		if (!d_unhashed(tmp) &&
 		    tmp->d_parent == parent) {
 			child = dget_locked(tmp);
 			spin_unlock(&dcache_lock);
diff -urN linux-2.4.17-base/fs/readdir.c linux-2.4.17-dc8/fs/readdir.c
--- linux-2.4.17-base/fs/readdir.c	Mon Aug 13 03:29:08 2001
+++ linux-2.4.17-dc8/fs/readdir.c	Mon Jul  8 16:18:43 2002
@@ -79,7 +79,7 @@
 			while(1) {
 				struct dentry *de = list_entry(list, struct dentry, d_child);
 
-				if (!list_empty(&de->d_hash) && de->d_inode) {
+  				if (!d_unhashed(de) && de->d_inode) {
 					spin_unlock(&dcache_lock);
 					if (filldir(dirent, de->d_name.name, de->d_name.len, filp->f_pos, de->d_inode->i_ino, DT_UNKNOWN) < 0)
 						break;
diff -urN linux-2.4.17-base/include/linux/dcache.h linux-2.4.17-dc8/include/linux/dcache.h
--- linux-2.4.17-base/include/linux/dcache.h	Fri Nov 23 01:16:18 2001
+++ linux-2.4.17-dc8/include/linux/dcache.h	Fri Jul 12 10:59:09 2002
@@ -5,6 +5,8 @@
 
 #include <asm/atomic.h>
 #include <linux/mount.h>
+#include <linux/rcupdate.h>
+#include <asm/system.h>
 
 /*
  * linux/include/linux/dcache.h
@@ -65,11 +67,13 @@
 
 struct dentry {
 	atomic_t d_count;
+	spinlock_t d_lock;		/* lock for d_count & d_vfs_flags */
+	unsigned long d_vfs_flags;	/* moved here to be on same cacheline */
 	unsigned int d_flags;
 	struct inode  * d_inode;	/* Where the name belongs to - NULL is negative */
 	struct dentry * d_parent;	/* parent directory */
 	struct list_head d_hash;	/* lookup hash list */
-	struct list_head d_lru;		/* d_count = 0 LRU list */
+	struct list_head d_lru;		/* LRU list */
 	struct list_head d_child;	/* child of parent list */
 	struct list_head d_subdirs;	/* our children */
 	struct list_head d_alias;	/* inode alias list */
@@ -78,8 +82,8 @@
 	unsigned long d_time;		/* used by d_revalidate */
 	struct dentry_operations  *d_op;
 	struct super_block * d_sb;	/* The root of the dentry tree */
-	unsigned long d_vfs_flags;
 	void * d_fsdata;		/* fs-specific data */
+	struct rcu_head d_rcu;
 	unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
 };
 
@@ -123,10 +127,23 @@
 					 * s_nfsd_free_path semaphore will be down
 					 */
 #define DCACHE_REFERENCED	0x0008  /* Recently used, don't discard. */
+#define DCACHE_UNHASHED		0x0010	
 
 extern spinlock_t dcache_lock;
 
 /**
+ *	d_unhashed -	is dentry hashed
+ *	@dentry: entry to check
+ *
+ *	Returns true if the dentry passed is not currently hashed.
+ */
+ 
+static __inline__ int d_unhashed(struct dentry *dentry)
+{
+	return (dentry->d_vfs_flags & DCACHE_UNHASHED);
+}
+
+/**
  * d_drop - drop a dentry
  * @dentry: dentry to drop
  *
@@ -143,14 +160,32 @@
  * timeouts or autofs deletes).
  */
 
+static __inline__ void __d_drop(struct dentry * dentry)
+{
+	if (!d_unhashed(dentry)) {
+		dentry->d_vfs_flags |= DCACHE_UNHASHED;
+		wmb();
+		list_del(&dentry->d_hash);
+		wmb();
+	}
+}
+
 static __inline__ void d_drop(struct dentry * dentry)
 {
 	spin_lock(&dcache_lock);
-	list_del(&dentry->d_hash);
-	INIT_LIST_HEAD(&dentry->d_hash);
+	spin_lock(&dentry->d_lock);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
+static __inline__ void d_drop_locked(struct dentry * dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
 static __inline__ int dname_external(struct dentry *d)
 {
 	return d->d_name.name != d->d_iname; 
@@ -243,27 +278,25 @@
 static __inline__ struct dentry * dget(struct dentry *dentry)
 {
 	if (dentry) {
-		if (!atomic_read(&dentry->d_count))
-			BUG();
+		spin_lock(&dentry->d_lock);
 		atomic_inc(&dentry->d_count);
+		dentry->d_vfs_flags |= DCACHE_REFERENCED;
+		spin_unlock(&dentry->d_lock);
 	}
 	return dentry;
 }
 
-extern struct dentry * dget_locked(struct dentry *);
-
-/**
- *	d_unhashed -	is dentry hashed
- *	@dentry: entry to check
- *
- *	Returns true if the dentry passed is not currently hashed.
- */
- 
-static __inline__ int d_unhashed(struct dentry *dentry)
+static __inline__ struct dentry * __dget(struct dentry *dentry)
 {
-	return list_empty(&dentry->d_hash);
+	if (dentry) {
+		atomic_inc(&dentry->d_count);
+		dentry->d_vfs_flags |= DCACHE_REFERENCED;
+	}
+	return dentry;
 }
 
+extern struct dentry * dget_locked(struct dentry *);
+
 extern void dput(struct dentry *);
 
 static __inline__ int d_mountpoint(struct dentry *dentry)
-- 
Maneesh Soni
IBM Linux Technology Center, 
IBM India Software Lab, Bangalore.
Phone: +91-80-5044999 email: maneesh@in.ibm.com
http://lse.sourceforge.net/locking/rcupdate.html

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Lse-tech] [RFC] dcache scalability patch (2.4.17)
  2002-07-15  8:10   ` Maneesh Soni
@ 2002-07-15  8:25     ` Maneesh Soni
  0 siblings, 0 replies; 18+ messages in thread
From: Maneesh Soni @ 2002-07-15  8:25 UTC (permalink / raw)
  To: Christoph Hellwig, LKML, lse-tech, Al Viro

oops.. sorry.

This is the correct patch..


diff -urN linux-2.4.17-base/fs/autofs4/root.c linux-2.4.17-dc8/fs/autofs4/root.c
--- linux-2.4.17-base/fs/autofs4/root.c	Tue Oct 24 10:27:38 2000
+++ linux-2.4.17-dc8/fs/autofs4/root.c	Fri Jul 12 10:59:38 2002
@@ -403,7 +403,7 @@
 		spin_unlock(&dcache_lock);
 		return -ENOTEMPTY;
 	}
-	list_del_init(&dentry->d_hash);
+	d_drop_locked(dentry);
 	spin_unlock(&dcache_lock);
 
 	dput(ino->dentry);
diff -urN linux-2.4.17-base/fs/dcache.c linux-2.4.17-dc8/fs/dcache.c
--- linux-2.4.17-base/fs/dcache.c	Fri Dec 21 23:11:55 2001
+++ linux-2.4.17-dc8/fs/dcache.c	Mon Jul 15 13:47:35 2002
@@ -23,6 +23,7 @@
 #include <linux/smp_lock.h>
 #include <linux/cache.h>
 #include <linux/module.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 
@@ -55,14 +56,21 @@
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {0, 0, 45, 0,};
 
+static void d_callback(void *arg)
+{
+	struct dentry * dentry = (struct dentry *)arg;
+
+	if (dname_external(dentry)) 
+		kfree(dentry->d_name.name);
+	kmem_cache_free(dentry_cache, dentry); 
+}
+
 /* no dcache_lock, please */
 static inline void d_free(struct dentry *dentry)
 {
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
-	if (dname_external(dentry)) 
-		kfree(dentry->d_name.name);
-	kmem_cache_free(dentry_cache, dentry); 
+	call_rcu(&dentry->d_rcu, d_callback, dentry);
 	dentry_stat.nr_dentry--;
 }
 
@@ -124,9 +132,13 @@
 	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
 		return;
 
-	/* dput on a free dentry? */
-	if (!list_empty(&dentry->d_lru))
-		BUG();
+	spin_lock(&dentry->d_lock);
+	if (atomic_read(&dentry->d_count)) {
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&dcache_lock);
+		return;
+	}
+
 	/*
 	 * AV: ->d_delete() is _NOT_ allowed to block now.
 	 */
@@ -135,18 +147,28 @@
 			goto unhash_it;
 	}
 	/* Unreachable? Get rid of it */
-	if (list_empty(&dentry->d_hash))
+	if (d_unhashed(dentry))
 		goto kill_it;
-	list_add(&dentry->d_lru, &dentry_unused);
-	dentry_stat.nr_unused++;
+
+	if (list_empty(&dentry->d_lru)) {
+		dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
+		list_add(&dentry->d_lru, &dentry_unused);
+		dentry_stat.nr_unused++;
+	}
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	return;
 
 unhash_it:
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
 
 kill_it: {
 		struct dentry *parent;
+		spin_unlock(&dentry->d_lock);
+		if (!list_empty(&dentry->d_lru)) {
+			list_del(&dentry->d_lru);
+			dentry_stat.nr_unused--;
+		}
 		list_del(&dentry->d_child);
 		/* drops the lock, at that point nobody can reach this dentry */
 		dentry_iput(dentry);
@@ -177,7 +199,7 @@
 	 * If it's already been dropped, return OK.
 	 */
 	spin_lock(&dcache_lock);
-	if (list_empty(&dentry->d_hash)) {
+	if (d_unhashed(dentry)) {
 		spin_unlock(&dcache_lock);
 		return 0;
 	}
@@ -201,15 +223,18 @@
 	 * we might still populate it if it was a
 	 * working directory or similar).
 	 */
+	spin_lock(&dentry->d_lock);
 	if (atomic_read(&dentry->d_count) > 1) {
 		if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
 			return -EBUSY;
 		}
 	}
-
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
+
 	return 0;
 }
 
@@ -217,11 +242,14 @@
 
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
+	spin_lock(&dentry->d_lock);
 	atomic_inc(&dentry->d_count);
+	dentry->d_vfs_flags |= DCACHE_REFERENCED;
 	if (atomic_read(&dentry->d_count) == 1) {
 		dentry_stat.nr_unused--;
 		list_del_init(&dentry->d_lru);
 	}
+	spin_unlock(&dentry->d_lock);
 	return dentry;
 }
 
@@ -252,8 +280,8 @@
 		tmp = next;
 		next = tmp->next;
 		alias = list_entry(tmp, struct dentry, d_alias);
-		if (!list_empty(&alias->d_hash)) {
-			__dget_locked(alias);
+		if (!d_unhashed(alias)) {
+			dget(alias);
 			spin_unlock(&dcache_lock);
 			return alias;
 		}
@@ -263,7 +291,7 @@
 }
 
 /*
- *	Try to kill dentries associated with this inode.
+ *     Try to kill dentries associated with this inode.
  * WARNING: you must own a reference to inode.
  */
 void d_prune_aliases(struct inode *inode)
@@ -274,13 +302,16 @@
 	tmp = head;
 	while ((tmp = tmp->next) != head) {
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_alias);
+		spin_lock(&dentry->d_lock);
 		if (!atomic_read(&dentry->d_count)) {
-			__dget_locked(dentry);
+			__dget(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
-			d_drop(dentry);
 			dput(dentry);
 			goto restart;
 		}
+		spin_unlock(&dentry->d_lock);
 	}
 	spin_unlock(&dcache_lock);
 }
@@ -295,7 +326,8 @@
 {
 	struct dentry * parent;
 
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	list_del(&dentry->d_child);
 	dentry_iput(dentry);
 	parent = dentry->d_parent;
@@ -330,19 +362,20 @@
 		if (tmp == &dentry_unused)
 			break;
 		list_del_init(tmp);
+		dentry_stat.nr_unused--;
 		dentry = list_entry(tmp, struct dentry, d_lru);
 
+		spin_lock(&dentry->d_lock);
 		/* If the dentry was recently referenced, don't free it. */
 		if (dentry->d_vfs_flags & DCACHE_REFERENCED) {
 			dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
-			list_add(&dentry->d_lru, &dentry_unused);
+			if (!atomic_read(&dentry->d_count)) {
+				list_add(&dentry->d_lru, &dentry_unused);
+				dentry_stat.nr_unused++;
+			}
+			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		dentry_stat.nr_unused--;
-
-		/* Unused dentry with a count? */
-		if (atomic_read(&dentry->d_count))
-			BUG();
 
 		prune_one_dentry(dentry);
 		if (!--count)
@@ -405,10 +438,13 @@
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
-		if (atomic_read(&dentry->d_count))
-			continue;
+		spin_lock(&dentry->d_lock);
 		dentry_stat.nr_unused--;
 		list_del_init(tmp);
+		if (atomic_read(&dentry->d_count)) {
+			spin_unlock(&dentry->d_lock);
+			continue;
+		}
 		prune_one_dentry(dentry);
 		goto repeat;
 	}
@@ -488,11 +524,13 @@
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
 		next = tmp->next;
+		list_del_init(&dentry->d_lru);
+		spin_lock(&dentry->d_lock);
 		if (!atomic_read(&dentry->d_count)) {
-			list_del(&dentry->d_lru);
 			list_add(&dentry->d_lru, dentry_unused.prev);
 			found++;
 		}
+		spin_unlock(&dentry->d_lock);
 		/*
 		 * Descend a level if the d_subdirs list is non-empty.
 		 */
@@ -606,8 +644,9 @@
 	str[name->len] = 0;
 
 	atomic_set(&dentry->d_count, 1);
-	dentry->d_vfs_flags = 0;
+	dentry->d_vfs_flags = DCACHE_UNHASHED;
 	dentry->d_flags = 0;
+	dentry->d_lock = SPIN_LOCK_UNLOCKED;
 	dentry->d_inode = NULL;
 	dentry->d_parent = NULL;
 	dentry->d_sb = NULL;
@@ -708,8 +747,9 @@
 	const unsigned char *str = name->name;
 	struct list_head *head = d_hash(parent,hash);
 	struct list_head *tmp;
+	struct dentry * found = NULL;
 
-	spin_lock(&dcache_lock);
+	/* rcu_read_lock(); for pre-emptible kernel */
 	tmp = head->next;
 	for (;;) {
 		struct dentry * dentry = list_entry(tmp, struct dentry, d_hash);
@@ -729,12 +769,15 @@
 			if (memcmp(dentry->d_name.name, str, len))
 				continue;
 		}
-		__dget_locked(dentry);
-		dentry->d_vfs_flags |= DCACHE_REFERENCED;
-		spin_unlock(&dcache_lock);
-		return dentry;
+		spin_lock(&dentry->d_lock);
+		if (!(dentry->d_vfs_flags & DCACHE_UNHASHED)) {
+			found = __dget(dentry);
+		}
+		spin_unlock(&dentry->d_lock);
+		/* rcu_read_unlock(); for pre-emptible kernel */
+		return found;
 	}
-	spin_unlock(&dcache_lock);
+	/* rcu_read_unlock(); for pre-emptible kernel */
 	return NULL;
 }
 
@@ -774,7 +817,7 @@
 	lhp = base = d_hash(dparent, dentry->d_name.hash);
 	while ((lhp = lhp->next) != base) {
 		if (dentry == list_entry(lhp, struct dentry, d_hash)) {
-			__dget_locked(dentry);
+			dget(dentry);
 			spin_unlock(&dcache_lock);
 			return 1;
 		}
@@ -834,9 +877,12 @@
 void d_rehash(struct dentry * entry)
 {
 	struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
-	if (!list_empty(&entry->d_hash)) BUG();
 	spin_lock(&dcache_lock);
+	spin_lock(&entry->d_lock);
+	if (!list_empty(&entry->d_hash) && !d_unhashed(entry)) BUG();
 	list_add(&entry->d_hash, list);
+	entry->d_vfs_flags &= ~DCACHE_UNHASHED;
+	spin_unlock(&entry->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
@@ -909,7 +955,7 @@
 	list_add(&dentry->d_hash, &target->d_hash);
 
 	/* Unhash the target: dput() will then get rid of it */
-	list_del_init(&target->d_hash);
+	d_drop_locked(target);
 
 	list_del(&dentry->d_child);
 	list_del(&target->d_child);
@@ -951,7 +997,7 @@
 
 	*--end = '\0';
 	buflen--;
-	if (!IS_ROOT(dentry) && list_empty(&dentry->d_hash)) {
+	if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
 		buflen -= 10;
 		end -= 10;
 		memcpy(end, " (deleted)", 10);
@@ -1034,7 +1080,7 @@
 	error = -ENOENT;
 	/* Has the current directory has been unlinked? */
 	spin_lock(&dcache_lock);
-	if (pwd->d_parent == pwd || !list_empty(&pwd->d_hash)) {
+	if (pwd->d_parent == pwd || !d_unhashed(pwd)) {
 		unsigned long len;
 		char * cwd;
 
diff -urN linux-2.4.17-base/fs/intermezzo/journal.c linux-2.4.17-dc8/fs/intermezzo/journal.c
--- linux-2.4.17-base/fs/intermezzo/journal.c	Fri Dec 21 23:11:55 2001
+++ linux-2.4.17-dc8/fs/intermezzo/journal.c	Mon Jul  8 16:18:43 2002
@@ -186,7 +186,7 @@
 
         *--end = '\0';
         buflen--;
-        if (dentry->d_parent != dentry && list_empty(&dentry->d_hash)) {
+        if (dentry->d_parent != dentry && d_unhashed(dentry)) {
                 buflen -= 10;
                 end -= 10;
                 memcpy(end, " (deleted)", 10);
diff -urN linux-2.4.17-base/fs/nfsd/nfsfh.c linux-2.4.17-dc8/fs/nfsd/nfsfh.c
--- linux-2.4.17-base/fs/nfsd/nfsfh.c	Thu Oct  4 11:29:22 2001
+++ linux-2.4.17-dc8/fs/nfsd/nfsfh.c	Mon Jul  8 16:18:43 2002
@@ -348,7 +348,7 @@
 	spin_lock(&dcache_lock);
 	list_for_each(lp, &child->d_inode->i_dentry) {
 		struct dentry *tmp = list_entry(lp,struct dentry, d_alias);
-		if (!list_empty(&tmp->d_hash) &&
+		if (!d_unhashed(tmp) &&
 		    tmp->d_parent == parent) {
 			child = dget_locked(tmp);
 			spin_unlock(&dcache_lock);
diff -urN linux-2.4.17-base/fs/readdir.c linux-2.4.17-dc8/fs/readdir.c
--- linux-2.4.17-base/fs/readdir.c	Mon Aug 13 03:29:08 2001
+++ linux-2.4.17-dc8/fs/readdir.c	Mon Jul  8 16:18:43 2002
@@ -79,7 +79,7 @@
 			while(1) {
 				struct dentry *de = list_entry(list, struct dentry, d_child);
 
-				if (!list_empty(&de->d_hash) && de->d_inode) {
+				if (!d_unhashed(de) && de->d_inode) {
 					spin_unlock(&dcache_lock);
 					if (filldir(dirent, de->d_name.name, de->d_name.len, filp->f_pos, de->d_inode->i_ino, DT_UNKNOWN) < 0)
 						break;
diff -urN linux-2.4.17-base/include/linux/dcache.h linux-2.4.17-dc8/include/linux/dcache.h
--- linux-2.4.17-base/include/linux/dcache.h	Fri Nov 23 01:16:18 2001
+++ linux-2.4.17-dc8/include/linux/dcache.h	Fri Jul 12 10:59:09 2002
@@ -5,6 +5,8 @@
 
 #include <asm/atomic.h>
 #include <linux/mount.h>
+#include <linux/rcupdate.h>
+#include <asm/system.h>
 
 /*
  * linux/include/linux/dcache.h
@@ -65,11 +67,13 @@
 
 struct dentry {
 	atomic_t d_count;
+	spinlock_t d_lock;		/* lock for d_count & d_vfs_flags */
+	unsigned long d_vfs_flags;	/* moved here to be on same cacheline */
 	unsigned int d_flags;
 	struct inode  * d_inode;	/* Where the name belongs to - NULL is negative */
 	struct dentry * d_parent;	/* parent directory */
 	struct list_head d_hash;	/* lookup hash list */
-	struct list_head d_lru;		/* d_count = 0 LRU list */
+	struct list_head d_lru;		/* LRU list */
 	struct list_head d_child;	/* child of parent list */
 	struct list_head d_subdirs;	/* our children */
 	struct list_head d_alias;	/* inode alias list */
@@ -78,8 +82,8 @@
 	unsigned long d_time;		/* used by d_revalidate */
 	struct dentry_operations  *d_op;
 	struct super_block * d_sb;	/* The root of the dentry tree */
-	unsigned long d_vfs_flags;
 	void * d_fsdata;		/* fs-specific data */
+	struct rcu_head d_rcu;
 	unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
 };
 
@@ -123,10 +127,23 @@
 					 * s_nfsd_free_path semaphore will be down
 					 */
 #define DCACHE_REFERENCED	0x0008  /* Recently used, don't discard. */
+#define DCACHE_UNHASHED		0x0010	
 
 extern spinlock_t dcache_lock;
 
 /**
+ *	d_unhashed -	is dentry hashed
+ *	@dentry: entry to check
+ *
+ *	Returns true if the dentry passed is not currently hashed.
+ */
+ 
+static __inline__ int d_unhashed(struct dentry *dentry)
+{
+	return (dentry->d_vfs_flags & DCACHE_UNHASHED);
+}
+
+/**
  * d_drop - drop a dentry
  * @dentry: dentry to drop
  *
@@ -143,14 +160,32 @@
  * timeouts or autofs deletes).
  */
 
+static __inline__ void __d_drop(struct dentry * dentry)
+{
+	if (!d_unhashed(dentry)) {
+		dentry->d_vfs_flags |= DCACHE_UNHASHED;
+		wmb();
+		list_del(&dentry->d_hash);
+		wmb();
+	}
+}
+
 static __inline__ void d_drop(struct dentry * dentry)
 {
 	spin_lock(&dcache_lock);
-	list_del(&dentry->d_hash);
-	INIT_LIST_HEAD(&dentry->d_hash);
+	spin_lock(&dentry->d_lock);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
+static __inline__ void d_drop_locked(struct dentry * dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
 static __inline__ int dname_external(struct dentry *d)
 {
 	return d->d_name.name != d->d_iname; 
@@ -243,27 +278,25 @@
 static __inline__ struct dentry * dget(struct dentry *dentry)
 {
 	if (dentry) {
-		if (!atomic_read(&dentry->d_count))
-			BUG();
+		spin_lock(&dentry->d_lock);
 		atomic_inc(&dentry->d_count);
+		dentry->d_vfs_flags |= DCACHE_REFERENCED;
+		spin_unlock(&dentry->d_lock);
 	}
 	return dentry;
 }
 
-extern struct dentry * dget_locked(struct dentry *);
-
-/**
- *	d_unhashed -	is dentry hashed
- *	@dentry: entry to check
- *
- *	Returns true if the dentry passed is not currently hashed.
- */
- 
-static __inline__ int d_unhashed(struct dentry *dentry)
+static __inline__ struct dentry * __dget(struct dentry *dentry)
 {
-	return list_empty(&dentry->d_hash);
+	if (dentry) {
+		atomic_inc(&dentry->d_count);
+		dentry->d_vfs_flags |= DCACHE_REFERENCED;
+	}
+	return dentry;
 }
 
+extern struct dentry * dget_locked(struct dentry *);
+
 extern void dput(struct dentry *);
 
 static __inline__ int d_mountpoint(struct dentry *dentry)

-- 
Maneesh Soni
IBM Linux Technology Center, 
IBM India Software Lab, Bangalore.
Phone: +91-80-5044999 email: maneesh@in.ibm.com
http://lse.sourceforge.net/locking/rcupdate.html

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2002-07-15  8:17 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-07-12 14:09 [RFC] dcache scalability patch (2.4.17) Maneesh Soni
2002-07-12 14:13 ` [Lse-tech] " Christoph Hellwig
2002-07-12 16:20   ` Dipankar Sarma
2002-07-15  8:10   ` Maneesh Soni
2002-07-15  8:25     ` Maneesh Soni
2002-07-12 14:29 ` Alexander Viro
2002-07-12 16:10   ` [Lse-tech] " Dipankar Sarma
2002-07-12 18:02     ` Dipankar Sarma
2002-07-12 18:10       ` Hanna Linder
2002-07-12 17:35   ` Paul Menage
2002-07-13  8:52     ` Alexander Viro
2002-07-13 17:25       ` Paul Menage
2002-07-13 17:33         ` Alexander Viro
2002-07-13 17:51           ` Paul Menage
2002-07-13 17:54           ` Alexander Viro
2002-07-15  0:10       ` Linus Torvalds
2002-07-12 22:50   ` Hanna Linder
2002-07-15  7:49   ` Maneesh Soni

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.