* [RFC] simplifying fast_dput(), dentry_kill() et.al.
@ 2023-10-30  0:37 Al Viro
  2023-10-30 21:53 ` Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-10-30  0:37 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

	Back in 2015 when fast_dput() got introduced, I'd been worried
about ->d_delete() being exposed to dentries with zero refcount.
To quote my reply to Linus back then,

"The only potential nastiness I can see here is that filesystem with
->d_delete() always returning 1 might be surprised by encountering
a hashed dentry with zero d_count.  I can't recall anything actually
sensitive to that, and there might very well be no such examples,
but in principle it might be a problem.  Might be a good idea to check
DCACHE_OP_DELETE before anything else..."
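
(For reference, the simplest in-tree example of such an instance,
always_delete_dentry() in fs/libfs.c, is just

	int always_delete_dentry(const struct dentry *dentry)
	{
		return 1;
	}

so the concern above applies to any filesystem wiring up that kind of helper.)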

Looking at that again, that check was not a good idea.  Sure, ->d_delete()
instances could, in theory, check d_count (as BUG_ON(d_count(dentry) != 1)
or something equally useful) or, worse, drop and regain ->d_lock.
The latter would be rather hard to pull off safely, but it is not
impossible.  The thing is, none of the in-tree instances do anything of
that sort and I don't see any valid reasons why anyone would want to.

And getting rid of that would, AFAICS, allow for much simpler rules
around __dentry_kill() and friends - we could hold rcu_read_lock
over the places where dentry_kill() drops/regains ->d_lock and
that would allow
	* fast_dput() always decrementing refcount
	* retain_dentry() never modifying it
	* __dentry_kill() always called with refcount 0 (currently
it gets 1 from dentry_kill() and 0 in all other cases)

Does anybody see any problems with something along the lines of the
(untested) patch below?  It would need to be carved up (and accompanied
by "thou shalt not play silly buggers with ->d_lockref in your
->d_delete() instances" in D/f/porting), obviously, but I would really
like to get saner rules around refcount manipulations in there - as
it is, trying to document them gets very annoying.

Comments?

diff --git a/fs/dcache.c b/fs/dcache.c
index 9f471fdb768b..af0e067f6982 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -680,7 +680,6 @@ static inline bool retain_dentry(struct dentry *dentry)
 		return false;
 
 	/* retain; LRU fodder */
-	dentry->d_lockref.count--;
 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
 		d_lru_add(dentry);
 	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
@@ -709,7 +708,7 @@ EXPORT_SYMBOL(d_mark_dontcache);
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
 static struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
+	__releases(dentry->d_lock) __releases(rcu)
 {
 	struct inode *inode = dentry->d_inode;
 	struct dentry *parent = NULL;
@@ -730,6 +729,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 			goto slow_positive;
 		}
 	}
+	rcu_read_unlock();
 	__dentry_kill(dentry);
 	return parent;
 
@@ -739,9 +739,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	spin_lock(&dentry->d_lock);
 	parent = lock_parent(dentry);
 got_locks:
-	if (unlikely(dentry->d_lockref.count != 1)) {
-		dentry->d_lockref.count--;
-	} else if (likely(!retain_dentry(dentry))) {
+	if (likely(dentry->d_lockref.count == 0 && !retain_dentry(dentry))) {
+		rcu_read_unlock();
 		__dentry_kill(dentry);
 		return parent;
 	}
@@ -751,6 +750,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	spin_unlock(&dentry->d_lock);
+	rcu_read_unlock();
 	return NULL;
 }
 
@@ -768,15 +768,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	unsigned int d_flags;
 
 	/*
-	 * If we have a d_op->d_delete() operation, we sould not
-	 * let the dentry count go to zero, so use "put_or_lock".
-	 */
-	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
-		return lockref_put_or_lock(&dentry->d_lockref);
-
-	/*
-	 * .. otherwise, we can try to just decrement the
-	 * lockref optimistically.
+	 * decrement the lockref optimistically.
 	 */
 	ret = lockref_put_return(&dentry->d_lockref);
 
@@ -830,7 +822,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	smp_rmb();
 	d_flags = READ_ONCE(dentry->d_flags);
-	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
 			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
 
 	/* Nothing to do? Dropping the reference was all we needed? */
@@ -855,12 +847,6 @@ static inline bool fast_dput(struct dentry *dentry)
 		return true;
 	}
 
-	/*
-	 * Re-get the reference we optimistically dropped. We hold the
-	 * lock, and we just tested that it was zero, so we can just
-	 * set it to 1.
-	 */
-	dentry->d_lockref.count = 1;
 	return false;
 }
 
@@ -903,10 +889,9 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		rcu_read_unlock();
-
 		if (likely(retain_dentry(dentry))) {
 			spin_unlock(&dentry->d_lock);
+			rcu_read_unlock();
 			return;
 		}
 
@@ -918,14 +903,10 @@ EXPORT_SYMBOL(dput);
 static void __dput_to_list(struct dentry *dentry, struct list_head *list)
 __must_hold(&dentry->d_lock)
 {
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		/* let the owner of the list it's on deal with it */
-		--dentry->d_lockref.count;
-	} else {
+	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!--dentry->d_lockref.count)
-			d_shrink_add(dentry, list);
+		d_shrink_add(dentry, list);
 	}
 }
 
@@ -1191,7 +1172,7 @@ void shrink_dentry_list(struct list_head *list)
 		rcu_read_unlock();
 		d_shrink_del(dentry);
 		parent = dentry->d_parent;
-		if (parent != dentry)
+		if (parent != dentry && !--parent->d_lockref.count)
 			__dput_to_list(parent, list);
 		__dentry_kill(dentry);
 	}
@@ -1638,7 +1619,8 @@ void shrink_dcache_parent(struct dentry *parent)
 			} else {
 				rcu_read_unlock();
 				parent = data.victim->d_parent;
-				if (parent != data.victim)
+				if (parent != data.victim &&
+				    !--parent->d_lockref.count)
 					__dput_to_list(parent, &data.dispose);
 				__dentry_kill(data.victim);
 			}


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-30  0:37 [RFC] simplifying fast_dput(), dentry_kill() et.al Al Viro
@ 2023-10-30 21:53 ` Al Viro
  2023-10-30 22:18   ` Linus Torvalds
  0 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-10-30 21:53 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Mon, Oct 30, 2023 at 12:37:59AM +0000, Al Viro wrote:
> 	Back in 2015 when fast_dput() got introduced, I'd been worried
> about ->d_delete() being exposed to dentries with zero refcount.
> To quote my reply to Linus back then,
> 
> "The only potential nastiness I can see here is that filesystem with
> ->d_delete() always returning 1 might be surprised by encountering
> a hashed dentry with zero d_count.  I can't recall anything actually
> sensitive to that, and there might very well be no such examples,
> but in principle it might be a problem.  Might be a good idea to check
> DCACHE_OP_DELETE before anything else..."
> 
> Looking at that again, that check was not a good idea.  Sure, ->d_delete()
> instances could, in theory, check d_count (as BUG_ON(d_count(dentry) != 1)
> or something equally useful) or, worse, drop and regain ->d_lock.
> The latter would be rather hard to pull off safely, but it is not
> impossible.  The thing is, none of the in-tree instances do anything of
> that sort and I don't see any valid reasons why anyone would want to.
> 
> And getting rid of that would, AFAICS, allow for much simpler rules
> around __dentry_kill() and friends - we could hold rcu_read_lock
> over the places where dentry_kill() drops/regains ->d_lock and
> that would allow
> 	* fast_dput() always decrementing refcount
> 	* retain_dentry() never modifying it
> 	* __dentry_kill() always called with refcount 0 (currently
> it gets 1 from dentry_kill() and 0 in all other cases)
> 
> Does anybody see any problems with something along the lines of the
> (untested) patch below?  It would need to be carved up (and accompanied
> by "thou shalt not play silly buggers with ->d_lockref in your
> ->d_delete() instances" in D/f/porting), obviously, but I would really
> like to get saner rules around refcount manipulations in there - as
> it is, trying to document them gets very annoying.
> 
> Comments?

After fixing a couple of brainos, it seems to work.  See below:

diff --git a/fs/dcache.c b/fs/dcache.c
index 9f471fdb768b..5e975a013508 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -680,7 +680,6 @@ static inline bool retain_dentry(struct dentry *dentry)
 		return false;
 
 	/* retain; LRU fodder */
-	dentry->d_lockref.count--;
 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
 		d_lru_add(dentry);
 	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
@@ -709,7 +708,7 @@ EXPORT_SYMBOL(d_mark_dontcache);
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
 static struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
+	__releases(dentry->d_lock) __releases(rcu)
 {
 	struct inode *inode = dentry->d_inode;
 	struct dentry *parent = NULL;
@@ -730,6 +729,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 			goto slow_positive;
 		}
 	}
+	rcu_read_unlock();
 	__dentry_kill(dentry);
 	return parent;
 
@@ -739,9 +739,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	spin_lock(&dentry->d_lock);
 	parent = lock_parent(dentry);
 got_locks:
-	if (unlikely(dentry->d_lockref.count != 1)) {
-		dentry->d_lockref.count--;
-	} else if (likely(!retain_dentry(dentry))) {
+	rcu_read_unlock();
+	if (likely(dentry->d_lockref.count == 0 && !retain_dentry(dentry))) {
 		__dentry_kill(dentry);
 		return parent;
 	}
@@ -768,15 +767,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	unsigned int d_flags;
 
 	/*
-	 * If we have a d_op->d_delete() operation, we sould not
-	 * let the dentry count go to zero, so use "put_or_lock".
-	 */
-	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
-		return lockref_put_or_lock(&dentry->d_lockref);
-
-	/*
-	 * .. otherwise, we can try to just decrement the
-	 * lockref optimistically.
+	 * try to decrement the lockref optimistically.
 	 */
 	ret = lockref_put_return(&dentry->d_lockref);
 
@@ -787,8 +778,12 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	if (unlikely(ret < 0)) {
 		spin_lock(&dentry->d_lock);
-		if (dentry->d_lockref.count > 1) {
-			dentry->d_lockref.count--;
+		if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
+			spin_unlock(&dentry->d_lock);
+			return true;
+		}
+		dentry->d_lockref.count--;
+		if (dentry->d_lockref.count) {
 			spin_unlock(&dentry->d_lock);
 			return true;
 		}
@@ -830,7 +825,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	smp_rmb();
 	d_flags = READ_ONCE(dentry->d_flags);
-	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
 			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
 
 	/* Nothing to do? Dropping the reference was all we needed? */
@@ -854,13 +849,6 @@ static inline bool fast_dput(struct dentry *dentry)
 		spin_unlock(&dentry->d_lock);
 		return true;
 	}
-
-	/*
-	 * Re-get the reference we optimistically dropped. We hold the
-	 * lock, and we just tested that it was zero, so we can just
-	 * set it to 1.
-	 */
-	dentry->d_lockref.count = 1;
 	return false;
 }
 
@@ -903,10 +891,9 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		rcu_read_unlock();
-
 		if (likely(retain_dentry(dentry))) {
 			spin_unlock(&dentry->d_lock);
+			rcu_read_unlock();
 			return;
 		}
 
@@ -918,14 +905,10 @@ EXPORT_SYMBOL(dput);
 static void __dput_to_list(struct dentry *dentry, struct list_head *list)
 __must_hold(&dentry->d_lock)
 {
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		/* let the owner of the list it's on deal with it */
-		--dentry->d_lockref.count;
-	} else {
+	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!--dentry->d_lockref.count)
-			d_shrink_add(dentry, list);
+		d_shrink_add(dentry, list);
 	}
 }
 
@@ -1191,7 +1174,7 @@ void shrink_dentry_list(struct list_head *list)
 		rcu_read_unlock();
 		d_shrink_del(dentry);
 		parent = dentry->d_parent;
-		if (parent != dentry)
+		if (parent != dentry && !--parent->d_lockref.count)
 			__dput_to_list(parent, list);
 		__dentry_kill(dentry);
 	}
@@ -1638,7 +1621,8 @@ void shrink_dcache_parent(struct dentry *parent)
 			} else {
 				rcu_read_unlock();
 				parent = data.victim->d_parent;
-				if (parent != data.victim)
+				if (parent != data.victim &&
+				    !--parent->d_lockref.count)
 					__dput_to_list(parent, &data.dispose);
 				__dentry_kill(data.victim);
 			}


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-30 21:53 ` Al Viro
@ 2023-10-30 22:18   ` Linus Torvalds
  2023-10-31  0:18     ` Al Viro
  2023-10-31  2:25     ` [RFC] simplifying fast_dput(), dentry_kill() et.al Gao Xiang
  0 siblings, 2 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-10-30 22:18 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel

On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> After fixing a couple of brainos, it seems to work.

This all makes me unnaturally nervous, probably because it's overly
subtle, and I have lost the context for some of the rules.

I like the patch, because honestly, our current logic for fast_dput()
is nasty, and I agree with you that the existence of d_op->d_delete()
shouldn't change the locking logic.

At the same time, I just worry. That whole lockref_put_return() thing
has horrific semantics, and this is the only case that uses it, and I
wish we didn't need it.

[ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
too, and that looks completely bogus, since it doesn't check the
return value! ]

At the same time, that whole fast_dput() is one of the more critical
places, and we definitely don't want to take the lock just because the
ref goes down to zero (and we still leave it around).

End result: I *think* that patch is an improvement, but this code just
makes me unreasonably nervous.

               Linus


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-30 22:18   ` Linus Torvalds
@ 2023-10-31  0:18     ` Al Viro
  2023-10-31  1:53       ` Al Viro
                         ` (2 more replies)
  2023-10-31  2:25     ` [RFC] simplifying fast_dput(), dentry_kill() et.al Gao Xiang
  1 sibling, 3 replies; 119+ messages in thread
From: Al Viro @ 2023-10-31  0:18 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > After fixing a couple of brainos, it seems to work.
> 
> This all makes me unnaturally nervous, probably because it;s overly
> subtle, and I have lost the context for some of the rules.

A bit of context: I started to look at the possibility of refcount overflows.
Writing the current rules for dentry refcounting and lifetime down was the
obvious first step, and that immediately turned into an awful mess.

It is overly subtle.  Even more so when you throw the shrink lists into
the mix - shrink_lock_dentry() got too smart for its own good, and that
leads to really awful correctness proofs.  The next thing in the series
is getting rid of the "it had been moved around, so somebody had clearly
been taking/dropping references and we can just evict it from the
shrink list and be done with that" crap - things get much simpler
if the rules become
	* call it under rcu_read_lock, with dentry locked
	* if returned true
		dentry, parent, inode locked, refcount is zero.
	* if returned false
		dentry locked, refcount is non-zero.
It used to be that way, but removal of trylock loops had turned that
into something much more subtle.  Restoring the old semantics without
trylocks on the slow path is doable and it makes analysis much simpler.
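
To make that concrete, a minimal sketch of a caller under those rules (using
the current shrink_lock_dentry() name; the actual series may differ in detail):

	rcu_read_lock();
	spin_lock(&dentry->d_lock);
	if (shrink_lock_dentry(dentry)) {
		/* refcount is zero; dentry, parent and inode are all locked */
		rcu_read_unlock();
		__dentry_kill(dentry);
		/* caller then deals with the reference held on the parent */
	} else {
		/* refcount is non-zero; only ->d_lock is held */
		spin_unlock(&dentry->d_lock);
		rcu_read_unlock();
	}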

BTW, just how aggressive do we want to be with d_lru_del()?

We obviously do not do that on non-final dput, even if we have
a dentry with positive refcount in LRU list.  But when we hit e.g.
shrink_dcache_parent(), all dentries in the subtree get d_lru_del(),
whether they are busy or not.  I'm not sure it's a good idea...

Sure, we want __dentry_kill() to remove the victim from LRU and
we want the same done to anything moved to a shrink list.
Having LRU scanners ({prune,shrink}_dcache_sb()) do that to
dentries with positive refcount also makes sense.  Do we really need
the other cases?


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  0:18     ` Al Viro
@ 2023-10-31  1:53       ` Al Viro
  2023-10-31  6:12         ` Al Viro
  2023-11-01  2:22       ` [RFC] simplifying fast_dput(), dentry_kill() et.al Al Viro
  2023-11-05 19:54       ` Al Viro
  2 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-10-31  1:53 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
> On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > After fixing a couple of brainos, it seems to work.
> > 
> > This all makes me unnaturally nervous, probably because it;s overly
> > subtle, and I have lost the context for some of the rules.
> 
> A bit of context: I started to look at the possibility of refcount overflows.
> Writing the current rules for dentry refcounting and lifetime down was the
> obvious first step, and that immediately turned into an awful mess.
> 
> It is overly subtle.  Even more so when you throw the shrink lists into
> the mix - shrink_lock_dentry() got too smart for its own good, and that
> leads to really awful correctness proofs.  The next thing in the series
> is getting rid of the "it had been moved around, so somebody had clearly
> been taking/dropping references and we can just evict it from the
> shrink list and be done with that" crap - the things get much simpler
> if the rules become
> 	* call it under rcu_read_lock, with dentry locked
> 	* if returned true
> 		dentry, parent, inode locked, refcount is zero.
> 	* if returned false
> 		dentry locked, refcount is non-zero.
> It used to be that way, but removal of trylock loops had turned that
> into something much more subtle.  Restoring the old semantics without
> trylocks on the slow path is doable and it makes analysis much simpler.

It's also a perfect match to what we want in dentry_kill(), actually.
And looking into that has caught another place too subtle for its own
good:
        if (!IS_ROOT(dentry)) {
                parent = dentry->d_parent;
                if (unlikely(!spin_trylock(&parent->d_lock))) {
                        parent = __lock_parent(dentry);
                        if (likely(inode || !dentry->d_inode))
                                goto got_locks;
                        /* negative that became positive */
                        if (parent)
                                spin_unlock(&parent->d_lock);
                        inode = dentry->d_inode;
                        goto slow_positive;
                }
        }
        __dentry_kill(dentry);
        return parent;

slow_positive:
        spin_unlock(&dentry->d_lock);
        spin_lock(&inode->i_lock);
        spin_lock(&dentry->d_lock);
        parent = lock_parent(dentry);
got_locks:

That code (in dentry_kill()) relies upon the assumption that
a positive dentry couldn't have become negative under us while
__lock_parent() had it unlocked.  Which is only true because
we have a positive refcount here.

IOW, the patch is broken as posted upthread.  It's really
not hard to fix, fortunately, and what we end up with in dentry_kill()
looks a lot better that way -

static struct dentry *dentry_kill(struct dentry *dentry)
        __releases(dentry->d_lock) __releases(rcu)
{
        struct dentry *parent = NULL;
	if (likely(shrink_lock_dentry(dentry))) {
		if (!IS_ROOT(dentry))
			parent = dentry->d_parent;
		rcu_read_unlock();
		__dentry_kill(dentry);
	} else {
		rcu_read_unlock();
		spin_unlock(&dentry->d_lock);
	}
	return parent;
}

Carving that series up will be interesting, though...


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-30 22:18   ` Linus Torvalds
  2023-10-31  0:18     ` Al Viro
@ 2023-10-31  2:25     ` Gao Xiang
  2023-10-31  2:29       ` Gao Xiang
  2023-10-31  3:02       ` Linus Torvalds
  1 sibling, 2 replies; 119+ messages in thread
From: Gao Xiang @ 2023-10-31  2:25 UTC (permalink / raw)
  To: Linus Torvalds, Al Viro; +Cc: linux-fsdevel

Hi Linus,

On 2023/10/31 06:18, Linus Torvalds wrote:
> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>
>> After fixing a couple of brainos, it seems to work.
> 
> This all makes me unnaturally nervous, probably because it;s overly
> subtle, and I have lost the context for some of the rules.
> 
> I like the patch, because honestly, our current logic for dput_fast()
> is nasty, andI agree with you that the existence of d_op->d_delete()
> shouldn't change the locking logic.
> 
> At the same time, I just worry. That whole lockref_put_return() thing
> has horrific semantics, and this is the only case that uses it, and I
> wish we didn't need it.
> 
> [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
> too, and that looks completely bogus, since it doesn't check the
> return value! ]

  74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
  75                                                struct erofs_workgroup *grp)
  76 {
  77         struct erofs_sb_info *const sbi = EROFS_SB(sb);
  78         struct erofs_workgroup *pre;
  79
  80         /*
  81          * Bump up before making this visible to others for the XArray in order
  82          * to avoid potential UAF without serialized by xa_lock.
  83          */
  84         lockref_get(&grp->lockref);
  85
  86 repeat:
  87         xa_lock(&sbi->managed_pslots);
  88         pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
  89                            NULL, grp, GFP_NOFS);
  90         if (pre) {
  91                 if (xa_is_err(pre)) {
  92                         pre = ERR_PTR(xa_err(pre));
  93                 } else if (!erofs_workgroup_get(pre)) {
  94                         /* try to legitimize the current in-tree one */
  95                         xa_unlock(&sbi->managed_pslots);
  96                         cond_resched();
  97                         goto repeat;
  98                 }
  99                 lockref_put_return(&grp->lockref);

This line just decreases the reference count bumped up at line 84
(and it will always succeed).

Since a previous one was found at line 88, the old one will be used
(and returned) instead of the new one, and the newly allocated one
will be freed in the caller.

Hopefully that explains the use case here.

100                 grp = pre;
101         }
102         xa_unlock(&sbi->managed_pslots);
103         return grp;
104 }

Thanks,
Gao Xiang



* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  2:25     ` [RFC] simplifying fast_dput(), dentry_kill() et.al Gao Xiang
@ 2023-10-31  2:29       ` Gao Xiang
  2023-10-31  3:02       ` Linus Torvalds
  1 sibling, 0 replies; 119+ messages in thread
From: Gao Xiang @ 2023-10-31  2:29 UTC (permalink / raw)
  To: Linus Torvalds, Al Viro; +Cc: linux-fsdevel



On 2023/10/31 10:25, Gao Xiang wrote:
> Hi Linus,
> 
> On 2023/10/31 06:18, Linus Torvalds wrote:
>> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>>
>>> After fixing a couple of brainos, it seems to work.
>>
>> This all makes me unnaturally nervous, probably because it;s overly
>> subtle, and I have lost the context for some of the rules.
>>
>> I like the patch, because honestly, our current logic for dput_fast()
>> is nasty, andI agree with you that the existence of d_op->d_delete()
>> shouldn't change the locking logic.
>>
>> At the same time, I just worry. That whole lockref_put_return() thing
>> has horrific semantics, and this is the only case that uses it, and I
>> wish we didn't need it.
>>
>> [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
>> too, and that looks completely bogus, since it doesn't check the
>> return value! ]
> 
>   74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
>   75                                                struct erofs_workgroup *grp)
>   76 {
>   77         struct erofs_sb_info *const sbi = EROFS_SB(sb);
>   78         struct erofs_workgroup *pre;
>   79
>   80         /*
>   81          * Bump up before making this visible to others for the XArray in order
>   82          * to avoid potential UAF without serialized by xa_lock.
>   83          */
>   84         lockref_get(&grp->lockref);
>   85
>   86 repeat:
>   87         xa_lock(&sbi->managed_pslots);
>   88         pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
>   89                            NULL, grp, GFP_NOFS);
>   90         if (pre) {
>   91                 if (xa_is_err(pre)) {
>   92                         pre = ERR_PTR(xa_err(pre));
>   93                 } else if (!erofs_workgroup_get(pre)) {
>   94                         /* try to legitimize the current in-tree one */
>   95                         xa_unlock(&sbi->managed_pslots);
>   96                         cond_resched();
>   97                         goto repeat;
>   98                 }
>   99                 lockref_put_return(&grp->lockref);
> 
> This line it just decreases the reference count just bumpped up at the
> line 84 (and it will always succeed).

To add some words: also, since it's a newly allocated one that hasn't been
populated yet, it won't be locked by others.

> 
> Since it finds a previous one at line 88, so the old one will be used
> (and be returned) instead of the new one and the new allocated one
> will be freed in the caller.
> 
> Hopefully it explains the use case here.
> 
> 100                 grp = pre;
> 101         }
> 102         xa_unlock(&sbi->managed_pslots);
> 103         return grp;
> 104 }
> 
> Thanks,
> Gao Xiang
> 


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  2:25     ` [RFC] simplifying fast_dput(), dentry_kill() et.al Gao Xiang
  2023-10-31  2:29       ` Gao Xiang
@ 2023-10-31  3:02       ` Linus Torvalds
  2023-10-31  3:13         ` Gao Xiang
  2023-10-31  3:26         ` Al Viro
  1 sibling, 2 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-10-31  3:02 UTC (permalink / raw)
  To: Gao Xiang; +Cc: Al Viro, linux-fsdevel

On Mon, 30 Oct 2023 at 16:25, Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
>
> >
> > [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
> > too, and that looks completely bogus, since it doesn't check the
> > return value! ]
>
>   74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
>   75                                                struct erofs_workgroup *grp)
>   76 {
>   77         struct erofs_sb_info *const sbi = EROFS_SB(sb);
>   78         struct erofs_workgroup *pre;
>   79
>   80         /*
>   81          * Bump up before making this visible to others for the XArray in order
>   82          * to avoid potential UAF without serialized by xa_lock.
>   83          */
>   84         lockref_get(&grp->lockref);
>   85
>   86 repeat:
>   87         xa_lock(&sbi->managed_pslots);
>   88         pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
>   89                            NULL, grp, GFP_NOFS);
>   90         if (pre) {
>   91                 if (xa_is_err(pre)) {
>   92                         pre = ERR_PTR(xa_err(pre));
>   93                 } else if (!erofs_workgroup_get(pre)) {
>   94                         /* try to legitimize the current in-tree one */
>   95                         xa_unlock(&sbi->managed_pslots);
>   96                         cond_resched();
>   97                         goto repeat;
>   98                 }
>   99                 lockref_put_return(&grp->lockref);
>
> This line it just decreases the reference count just bumpped up at the
> line 84 (and it will always succeed).

You have two possible scenarios:

 - it doesn't always succeed, because somebody else has the lock on
the grp->lockref right now, or because lockref doesn't do any
optimized cases at all

 - nobody else can access grp->lockref at the same time, so the lock
is pointless, so you shouldn't be using lockref in the first place,
and certainly not lockref_put_return

IOW, I don't see how lockref_put_return() could possibly *ever* be the
right thing to do.

The thing is, lockref_put_return() is fundamentally designed to be
something that can fail.

In  fact, in some situations it will *always* fail. Check this out:

#define USE_CMPXCHG_LOCKREF \
        (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \
         IS_ENABLED(CONFIG_SMP) && SPINLOCK_SIZE <= 4)
...
#if USE_CMPXCHG_LOCKREF
...
#else

#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)

#endif
...
int lockref_put_return(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 0)
                        return -1;
        ,
                return new.count;
        );
        return -1;
}

look, if USE_CMPXCHG_LOCKREF is false (on UP, or if spinlocks are big
because of spinlock debugging, or whatever), lockref_put_return() will
*always* fail, expecting the caller to deal with that failure.

So doing a lockref_put_return() without dealing with the failure case
is FUNDAMENTALLY BROKEN.

Yes, it's an odd function. It's a function that is literally designed
for that dcache use-case, where we have a fast-path and a slow path,
and the "lockref_put_return() fails" is the slow-path that needs to
take the spinlock and do it carefully.

You *cannot* use that function without failure handling. Really.
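
Purely as illustration, the shape of correct usage (with a hypothetical
'ref' pointing at some struct lockref, and ignoring the underflow handling
that fast_dput() also needs) is roughly:

	int ret = lockref_put_return(ref);
	if (ret < 0) {
		/* fast path refused: lock held by somebody, count <= 0,
		 * or no cmpxchg-based lockref on this config; redo the
		 * decrement under the spinlock */
		spin_lock(&ref->lock);
		ret = --ref->count;
		spin_unlock(&ref->lock);
	}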

                     Linus


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  3:02       ` Linus Torvalds
@ 2023-10-31  3:13         ` Gao Xiang
  2023-10-31  3:26         ` Al Viro
  1 sibling, 0 replies; 119+ messages in thread
From: Gao Xiang @ 2023-10-31  3:13 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, linux-fsdevel



On 2023/10/31 11:02, Linus Torvalds wrote:
> On Mon, 30 Oct 2023 at 16:25, Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
>>
>>>
>>> [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
>>> too, and that looks completely bogus, since it doesn't check the
>>> return value! ]
>>
>>    74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
>>    75                                                struct erofs_workgroup *grp)
>>    76 {
>>    77         struct erofs_sb_info *const sbi = EROFS_SB(sb);
>>    78         struct erofs_workgroup *pre;
>>    79
>>    80         /*
>>    81          * Bump up before making this visible to others for the XArray in order
>>    82          * to avoid potential UAF without serialized by xa_lock.
>>    83          */
>>    84         lockref_get(&grp->lockref);
>>    85
>>    86 repeat:
>>    87         xa_lock(&sbi->managed_pslots);
>>    88         pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
>>    89                            NULL, grp, GFP_NOFS);
>>    90         if (pre) {
>>    91                 if (xa_is_err(pre)) {
>>    92                         pre = ERR_PTR(xa_err(pre));
>>    93                 } else if (!erofs_workgroup_get(pre)) {
>>    94                         /* try to legitimize the current in-tree one */
>>    95                         xa_unlock(&sbi->managed_pslots);
>>    96                         cond_resched();
>>    97                         goto repeat;
>>    98                 }
>>    99                 lockref_put_return(&grp->lockref);
>>
>> This line it just decreases the reference count just bumpped up at the
>> line 84 (and it will always succeed).
> 
> You have two possible scenarios:
> 
>   - it doesn't always succeed, because somebody else has the lock on
> the grp->lockref right now, or because lockref doesn't do any
> optimized cases at all
> 
>   - nobody else can access grp->lockref at the same time, so the lock
> is pointless, so you shouldn't be using lockref in the first place,
> and certainly not lockref_put_return

Yeah, the second case is the real use case here.

> 
> IOW, I don't see how lockref_put_return() could possibly *ever* be the
> right thing to do.
> 
> The thing is, lockref_put_return() is fundamentally designed to be
> something that can fail.
> 
> In  fact, in some situations it will *always* fail. Check this out:
> 
> #define USE_CMPXCHG_LOCKREF \
>          (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \
>           IS_ENABLED(CONFIG_SMP) && SPINLOCK_SIZE <= 4)
> ...
> #if USE_CMPXCHG_LOCKREF
> ...
> #else
> 
> #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
> 
> #endif
> ...
> int lockref_put_return(struct lockref *lockref)
> {
>          CMPXCHG_LOOP(
>                  new.count--;
>                  if (old.count <= 0)
>                          return -1;
>          ,
>                  return new.count;
>          );
>          return -1;
> }
> 
> look, if USE_CMPXCHG_LOCKREF is false (on UP, or if spinlock are big
> because of spinlock debugging, or whatever), lockref_put_return() will
> *always* fail, expecting the caller to deal with that failure.
> 
> So doing a lockref_put_return() without dealing with the failure case
> is FUNDAMENTALLY BROKEN.

Yeah, thanks for pointing that out, I get it.  I think this really needs to
be cleaned up since we don't actually care about locking here (since, as I
said, it hasn't actually been populated into the XArray).
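
(For illustration only - since the group is not yet visible in the XArray,
nobody else can be holding its lock there, so one hypothetical shape of that
cleanup would be to do the put explicitly under the spinlock instead:

	spin_lock(&grp->lockref.lock);
	grp->lockref.count--;
	spin_unlock(&grp->lockref.lock);

or to stop using a lockref-style count on that path altogether.)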

> 
> Yes, it's an odd function. It's a function that is literally designed
> for that dcache use-case, where we have a fast-path and a slow path,
> and the "lockref_put_return() fails" is the slow-path that needs to
> take the spinlock and do it carefully.
> 
> You *cannot* use that function without failure handling. Really.

I will fix+cleanup this path later and send upstream.  Thanks for the
heads up.

Thanks,
Gao Xiang

> 
>                       Linus


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  3:02       ` Linus Torvalds
  2023-10-31  3:13         ` Gao Xiang
@ 2023-10-31  3:26         ` Al Viro
  2023-10-31  3:41           ` Linus Torvalds
  1 sibling, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-10-31  3:26 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Gao Xiang, linux-fsdevel

On Mon, Oct 30, 2023 at 05:02:32PM -1000, Linus Torvalds wrote:

> look, if USE_CMPXCHG_LOCKREF is false (on UP, or if spinlock are big
> because of spinlock debugging, or whatever), lockref_put_return() will
> *always* fail, expecting the caller to deal with that failure.
> 
> So doing a lockref_put_return() without dealing with the failure case
> is FUNDAMENTALLY BROKEN.
> 
> Yes, it's an odd function. It's a function that is literally designed
> for that dcache use-case, where we have a fast-path and a slow path,
> and the "lockref_put_return() fails" is the slow-path that needs to
> take the spinlock and do it carefully.
> 
> You *cannot* use that function without failure handling. Really.

Put another way, it's a fastpath-only thing.  Not sure how much use
it would be to slap __must_check on it; a warning along the lines
of "DON'T USE UNLESS YOU HAVE READ <archive link>" in lockref.h
might be useful.

BTW, is there any reason for -128 for marking them dead?  Looks like
-1 wouldn't be any worse...


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  3:26         ` Al Viro
@ 2023-10-31  3:41           ` Linus Torvalds
  0 siblings, 0 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-10-31  3:41 UTC (permalink / raw)
  To: Al Viro; +Cc: Gao Xiang, linux-fsdevel

On Mon, 30 Oct 2023 at 17:26, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> BTW, is there any reason for -128 for marking them dead?  Looks like
> -1 wouldn't be any worse...

It's *much* too long ago, but I have this dim memory of simply wanting
to make sure that "dead" was clearly separate from "underflow".

                  Linus


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  1:53       ` Al Viro
@ 2023-10-31  6:12         ` Al Viro
  2023-11-01  6:18           ` Al Viro
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
  0 siblings, 2 replies; 119+ messages in thread
From: Al Viro @ 2023-10-31  6:12 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Tue, Oct 31, 2023 at 01:53:51AM +0000, Al Viro wrote:

> Carving that series up will be interesting, though...

I think I have a sane carve-up; will post if it survives testing.

Cumulative diff follows:

diff --git a/fs/dcache.c b/fs/dcache.c
index 9f471fdb768b..213026d5b033 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -625,8 +625,6 @@ static void __dentry_kill(struct dentry *dentry)
 static struct dentry *__lock_parent(struct dentry *dentry)
 {
 	struct dentry *parent;
-	rcu_read_lock();
-	spin_unlock(&dentry->d_lock);
 again:
 	parent = READ_ONCE(dentry->d_parent);
 	spin_lock(&parent->d_lock);
@@ -642,7 +640,6 @@ static struct dentry *__lock_parent(struct dentry *dentry)
 		spin_unlock(&parent->d_lock);
 		goto again;
 	}
-	rcu_read_unlock();
 	if (parent != dentry)
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	else
@@ -657,7 +654,64 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
 		return NULL;
 	if (likely(spin_trylock(&parent->d_lock)))
 		return parent;
-	return __lock_parent(dentry);
+	rcu_read_lock();
+	spin_unlock(&dentry->d_lock);
+	parent = __lock_parent(dentry);
+	rcu_read_unlock();
+	return parent;
+}
+
+/*
+ * Lock a dentry for feeding it to __dentry_kill().
+ * Called under rcu_read_lock() and dentry->d_lock; the former
+ * guarantees that nothing we access will be freed under us.
+ * Note that dentry is *not* protected from concurrent dentry_kill(),
+ * d_delete(), etc.
+ *
+ * Return false if dentry is busy.  Otherwise, return true and have
+ * that dentry's inode and parent both locked.
+ */
+
+static bool lock_for_kill(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = dentry->d_parent;
+
+	if (unlikely(dentry->d_lockref.count))
+		return false;
+
+	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+		goto slow;
+	if (dentry == parent)
+		return true;
+	if (likely(spin_trylock(&parent->d_lock)))
+		return true;
+
+	if (inode)
+		spin_unlock(&inode->i_lock);
+slow:
+	spin_unlock(&dentry->d_lock);
+
+	for (;;) {
+		if (inode)
+			spin_lock(&inode->i_lock);
+		parent = __lock_parent(dentry);
+		if (likely(inode == dentry->d_inode))
+			break;
+		if (inode)
+			spin_unlock(&inode->i_lock);
+		inode = dentry->d_inode;
+		spin_unlock(&dentry->d_lock);
+		if (parent)
+			spin_unlock(&parent->d_lock);
+	}
+	if (likely(!dentry->d_lockref.count))
+		return true;
+	if (inode)
+		spin_unlock(&inode->i_lock);
+	if (parent)
+		spin_unlock(&parent->d_lock);
+	return false;
 }
 
 static inline bool retain_dentry(struct dentry *dentry)
@@ -680,7 +734,6 @@ static inline bool retain_dentry(struct dentry *dentry)
 		return false;
 
 	/* retain; LRU fodder */
-	dentry->d_lockref.count--;
 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
 		d_lru_add(dentry);
 	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
@@ -703,61 +756,12 @@ void d_mark_dontcache(struct inode *inode)
 }
 EXPORT_SYMBOL(d_mark_dontcache);
 
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
-{
-	struct inode *inode = dentry->d_inode;
-	struct dentry *parent = NULL;
-
-	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
-		goto slow_positive;
-
-	if (!IS_ROOT(dentry)) {
-		parent = dentry->d_parent;
-		if (unlikely(!spin_trylock(&parent->d_lock))) {
-			parent = __lock_parent(dentry);
-			if (likely(inode || !dentry->d_inode))
-				goto got_locks;
-			/* negative that became positive */
-			if (parent)
-				spin_unlock(&parent->d_lock);
-			inode = dentry->d_inode;
-			goto slow_positive;
-		}
-	}
-	__dentry_kill(dentry);
-	return parent;
-
-slow_positive:
-	spin_unlock(&dentry->d_lock);
-	spin_lock(&inode->i_lock);
-	spin_lock(&dentry->d_lock);
-	parent = lock_parent(dentry);
-got_locks:
-	if (unlikely(dentry->d_lockref.count != 1)) {
-		dentry->d_lockref.count--;
-	} else if (likely(!retain_dentry(dentry))) {
-		__dentry_kill(dentry);
-		return parent;
-	}
-	/* we are keeping it, after all */
-	if (inode)
-		spin_unlock(&inode->i_lock);
-	if (parent)
-		spin_unlock(&parent->d_lock);
-	spin_unlock(&dentry->d_lock);
-	return NULL;
-}
-
 /*
  * Try to do a lockless dput(), and return whether that was successful.
  *
  * If unsuccessful, we return false, having already taken the dentry lock.
+ * In that case refcount is guaranteed to be zero and we have already
+ * decided that it's not worth keeping around.
  *
  * The caller needs to hold the RCU read lock, so that the dentry is
  * guaranteed to stay around even if the refcount goes down to zero!
@@ -768,15 +772,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	unsigned int d_flags;
 
 	/*
-	 * If we have a d_op->d_delete() operation, we sould not
-	 * let the dentry count go to zero, so use "put_or_lock".
-	 */
-	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
-		return lockref_put_or_lock(&dentry->d_lockref);
-
-	/*
-	 * .. otherwise, we can try to just decrement the
-	 * lockref optimistically.
+	 * try to decrement the lockref optimistically.
 	 */
 	ret = lockref_put_return(&dentry->d_lockref);
 
@@ -787,12 +783,12 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	if (unlikely(ret < 0)) {
 		spin_lock(&dentry->d_lock);
-		if (dentry->d_lockref.count > 1) {
-			dentry->d_lockref.count--;
+		if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
 			spin_unlock(&dentry->d_lock);
 			return true;
 		}
-		return false;
+		dentry->d_lockref.count--;
+		goto locked;
 	}
 
 	/*
@@ -830,7 +826,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	smp_rmb();
 	d_flags = READ_ONCE(dentry->d_flags);
-	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
 			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
 
 	/* Nothing to do? Dropping the reference was all we needed? */
@@ -850,17 +846,11 @@ static inline bool fast_dput(struct dentry *dentry)
 	 * else could have killed it and marked it dead. Either way, we
 	 * don't need to do anything else.
 	 */
-	if (dentry->d_lockref.count) {
+locked:
+	if (dentry->d_lockref.count || retain_dentry(dentry)) {
 		spin_unlock(&dentry->d_lock);
 		return true;
 	}
-
-	/*
-	 * Re-get the reference we optimistically dropped. We hold the
-	 * lock, and we just tested that it was zero, so we can just
-	 * set it to 1.
-	 */
-	dentry->d_lockref.count = 1;
 	return false;
 }
 
@@ -903,29 +893,29 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		rcu_read_unlock();
-
-		if (likely(retain_dentry(dentry))) {
+		if (likely(lock_for_kill(dentry))) {
+			struct dentry *parent = dentry->d_parent;
+			rcu_read_unlock();
+			__dentry_kill(dentry);
+			if (dentry == parent)
+				return;
+			dentry = parent;
+		} else {
+			rcu_read_unlock();
 			spin_unlock(&dentry->d_lock);
 			return;
 		}
-
-		dentry = dentry_kill(dentry);
 	}
 }
 EXPORT_SYMBOL(dput);
 
-static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+static void to_shrink_list(struct dentry *dentry, struct list_head *list)
 __must_hold(&dentry->d_lock)
 {
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		/* let the owner of the list it's on deal with it */
-		--dentry->d_lockref.count;
-	} else {
+	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!--dentry->d_lockref.count)
-			d_shrink_add(dentry, list);
+		d_shrink_add(dentry, list);
 	}
 }
 
@@ -937,8 +927,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 		return;
 	}
 	rcu_read_unlock();
-	if (!retain_dentry(dentry))
-		__dput_to_list(dentry, list);
+	to_shrink_list(dentry, list);
 	spin_unlock(&dentry->d_lock);
 }
 
@@ -1117,58 +1106,6 @@ void d_prune_aliases(struct inode *inode)
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
-/*
- * Lock a dentry from shrink list.
- * Called under rcu_read_lock() and dentry->d_lock; the former
- * guarantees that nothing we access will be freed under us.
- * Note that dentry is *not* protected from concurrent dentry_kill(),
- * d_delete(), etc.
- *
- * Return false if dentry has been disrupted or grabbed, leaving
- * the caller to kick it off-list.  Otherwise, return true and have
- * that dentry's inode and parent both locked.
- */
-static bool shrink_lock_dentry(struct dentry *dentry)
-{
-	struct inode *inode;
-	struct dentry *parent;
-
-	if (dentry->d_lockref.count)
-		return false;
-
-	inode = dentry->d_inode;
-	if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
-		spin_unlock(&dentry->d_lock);
-		spin_lock(&inode->i_lock);
-		spin_lock(&dentry->d_lock);
-		if (unlikely(dentry->d_lockref.count))
-			goto out;
-		/* changed inode means that somebody had grabbed it */
-		if (unlikely(inode != dentry->d_inode))
-			goto out;
-	}
-
-	parent = dentry->d_parent;
-	if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
-		return true;
-
-	spin_unlock(&dentry->d_lock);
-	spin_lock(&parent->d_lock);
-	if (unlikely(parent != dentry->d_parent)) {
-		spin_unlock(&parent->d_lock);
-		spin_lock(&dentry->d_lock);
-		goto out;
-	}
-	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	if (likely(!dentry->d_lockref.count))
-		return true;
-	spin_unlock(&parent->d_lock);
-out:
-	if (inode)
-		spin_unlock(&inode->i_lock);
-	return false;
-}
-
 void shrink_dentry_list(struct list_head *list)
 {
 	while (!list_empty(list)) {
@@ -1177,12 +1114,11 @@ void shrink_dentry_list(struct list_head *list)
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
 		rcu_read_lock();
-		if (!shrink_lock_dentry(dentry)) {
+		if (!lock_for_kill(dentry)) {
 			bool can_free = false;
 			rcu_read_unlock();
 			d_shrink_del(dentry);
-			if (dentry->d_lockref.count < 0)
-				can_free = dentry->d_flags & DCACHE_MAY_FREE;
+			can_free = dentry->d_flags & DCACHE_MAY_FREE;
 			spin_unlock(&dentry->d_lock);
 			if (can_free)
 				dentry_free(dentry);
@@ -1191,8 +1127,8 @@ void shrink_dentry_list(struct list_head *list)
 		rcu_read_unlock();
 		d_shrink_del(dentry);
 		parent = dentry->d_parent;
-		if (parent != dentry)
-			__dput_to_list(parent, list);
+		if (parent != dentry && !--parent->d_lockref.count)
+			to_shrink_list(parent, list);
 		__dentry_kill(dentry);
 	}
 }
@@ -1632,14 +1568,15 @@ void shrink_dcache_parent(struct dentry *parent)
 		if (data.victim) {
 			struct dentry *parent;
 			spin_lock(&data.victim->d_lock);
-			if (!shrink_lock_dentry(data.victim)) {
+			if (!lock_for_kill(data.victim)) {
 				spin_unlock(&data.victim->d_lock);
 				rcu_read_unlock();
 			} else {
 				rcu_read_unlock();
 				parent = data.victim->d_parent;
-				if (parent != data.victim)
-					__dput_to_list(parent, &data.dispose);
+				if (parent != data.victim &&
+				    !--parent->d_lockref.count)
+					to_shrink_list(parent, &data.dispose);
 				__dentry_kill(data.victim);
 			}
 		}


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  0:18     ` Al Viro
  2023-10-31  1:53       ` Al Viro
@ 2023-11-01  2:22       ` Al Viro
  2023-11-01 14:29         ` Benjamin Coddington
  2023-11-05 19:54       ` Al Viro
  2 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-01  2:22 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, linux-nfs, Olga Kornievskaia

[NFS folks Cc'd]

On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
> On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > After fixing a couple of brainos, it seems to work.
> > 
> > This all makes me unnaturally nervous, probably because it;s overly
> > subtle, and I have lost the context for some of the rules.
> 
> A bit of context: I started to look at the possibility of refcount overflows.
> Writing the current rules for dentry refcounting and lifetime down was the
> obvious first step, and that immediately turned into an awful mess.
> 
> It is overly subtle.  Even more so when you throw the shrink lists into
> the mix - shrink_lock_dentry() got too smart for its own good, and that
> leads to really awful correctness proofs.

... and for another example of subtle shit, consider DCACHE_NORCU.  Recall
c0eb027e5aef "vfs: don't do RCU lookup of empty pathnames" and note that
it relies upon never getting results of alloc_file_pseudo() with a directory
inode anywhere near descriptor tables.

Back then I basically went "fine, nobody would ever use alloc_file_pseudo()
for that anyway", but... there's a call in __nfs42_ssc_open() that doesn't
have any obvious protection against ending up with a directory inode.
That does not end up anywhere near descriptor tables, as far as I can tell,
fortunately.

Unfortunately, it is quite capable of fucking things up in different
ways, even if it's promptly closed.  d_instantiate() on a directory inode
is a really bad thing; plenty of places expect to have only one alias
for those, and would be very unhappy with that kind of crap without any
RCU considerations.

I'm pretty sure that this NFS code really does not want to use that for
directories; the simplest solution would be to refuse alloc_file_pseudo()
for directory inodes.  NFS folks - do you have a problem with the
following patch?

======
Make sure we never feed a directory to alloc_file_pseudo()

That would be broken in a lot of ways, from UAF in pathwalk if
that thing ever gets into descriptor tables, to royally screwing
every place that relies upon the lack of aliases for directory
inodes (i.e. quite a bit of VFS).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..5331a696896e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -326,6 +326,9 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
 	struct path path;
 	struct file *file;
 
+	if (WARN_ON_ONCE(S_ISDIR(inode->i_mode)))
+		return ERR_PTR(-EISDIR);
+
 	path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
 	if (!path.dentry)
 		return ERR_PTR(-ENOMEM);


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  6:12         ` Al Viro
@ 2023-11-01  6:18           ` Al Viro
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
  1 sibling, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:18 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Tue, Oct 31, 2023 at 06:12:26AM +0000, Al Viro wrote:
> On Tue, Oct 31, 2023 at 01:53:51AM +0000, Al Viro wrote:
> 
> > Carving that series up will be interesting, though...
> 
> I think I have a sane carve-up; will post if it survives testing.

OK, current variant survives local testing.  Lives in
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #work.dcache

Shortlog:
      fast_dput(): having ->d_delete() is not reason to delay refcount decrement
      fast_dput(): handle underflows gracefully
      fast_dput(): new rules for refcount
      __dput_to_list(): do decrement of refcount in the caller
      retain_dentry(): lift decrement of ->d_count into callers
      __dentry_kill(): get consistent rules for ->d_count
      dentry_kill(): don't bother with retain_dentry() on slow path
      Call retain_dentry() with refcount 0
      fold the call of retain_dentry() into fast_dput()
      don't try to cut corners in shrink_lock_dentry()
      fold dentry_kill() into dput()
      get rid of __dget()
      shrink_dentry_list(): no need to check that dentry refcount is marked dead
      to_shrink_list(): call only if refcount is 0
      switch select_collect{,2}() to use of to_shrink_list()

Diffstat:
 fs/dcache.c | 268 ++++++++++++++++++++++--------------------------------------
 1 file changed, 96 insertions(+), 172 deletions(-)

Individual patches in followups.  Review and testing would be welcome,
and it's obviously next cycle fodder.  Massage of refcounting is in the
first 11 commits, last 4 are followups.


* [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement
  2023-10-31  6:12         ` Al Viro
  2023-11-01  6:18           ` Al Viro
@ 2023-11-01  6:20           ` Al Viro
  2023-11-01  6:20             ` [PATCH 02/15] fast_dput(): handle underflows gracefully Al Viro
                               ` (13 more replies)
  1 sibling, 14 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

->d_delete() is a way for a filesystem to tell that a dentry is not worth
keeping cached.  It is not guaranteed to be called every time a dentry's
refcount drops down to zero; it is not guaranteed to be called before
a dentry gets evicted.  In other words, it is not suitable for any kind
of keeping track of dentry state.

None of the in-tree filesystems attempt to use it that way, fortunately.

So the contortions done by fast_dput() (as well as dentry_kill()) are
not warranted.  fast_dput() certainly should treat having a ->d_delete()
instance as "can't assume we'll be keeping it", but that's not different
from the way we treat e.g. DCACHE_DONTCACHE (which is rather similar
to making ->d_delete() return true when called).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 25ac74d30bff..5ec14df04f11 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -768,15 +768,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	unsigned int d_flags;
 
 	/*
-	 * If we have a d_op->d_delete() operation, we sould not
-	 * let the dentry count go to zero, so use "put_or_lock".
-	 */
-	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
-		return lockref_put_or_lock(&dentry->d_lockref);
-
-	/*
-	 * .. otherwise, we can try to just decrement the
-	 * lockref optimistically.
+	 * try to decrement the lockref optimistically.
 	 */
 	ret = lockref_put_return(&dentry->d_lockref);
 
@@ -830,7 +822,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	smp_rmb();
 	d_flags = READ_ONCE(dentry->d_flags);
-	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
 			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
 
 	/* Nothing to do? Dropping the reference was all we needed? */
-- 
2.39.2



* [PATCH 02/15] fast_dput(): handle underflows gracefully
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
@ 2023-11-01  6:20             ` Al Viro
  2023-11-01  6:20             ` [PATCH 03/15] fast_dput(): new rules for refcount Al Viro
                               ` (12 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

If the refcount is less than 1, we should just warn, unlock the dentry and
return true, so that the caller doesn't try to do anything else.

Taking care of that leaves the rest of the "lockref_put_return() has
failed" case equivalent to "decrement the refcount and rejoin the
normal slow path after the point where we grab ->d_lock".

NOTE: lockref_put_return() is strictly a fastpath thing - unlike
the rest of the lockref primitives, it does not contain a fallback.
The caller (and it looks like fast_dput() is the only legitimate one
in the entire kernel) has to do that itself.  Reasons for
lockref_put_return() failures:
	* ->d_lock held by somebody
	* refcount <= 0
	* ... or an architecture not supporting lockref use of
cmpxchg - sparc, anything non-SMP, config with spinlock debugging...

We could add a fallback, but it would be a clumsy API - we'd have
to distinguish between:
	(1) refcount > 1 - decremented, lock not held on return
	(2) refcount < 1 - left alone, probably no sense to hold the lock
	(3) refcount is 1, no cmpxchg - decremented, lock held on return
	(4) refcount is 1, cmpxchg supported - decremented, lock *NOT* held
	    on return.
We want to return with no lock held in case (4); that's the whole point of that
thing.  We very much do not want to have the fallback in case (3) return without
a lock, since the caller might have to retake it in that case.
So it wouldn't be more convenient than doing the fallback in the caller and
it would be very easy to screw up, especially since the test coverage would
suck - no way to test (3) and (4) on the same kernel build.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 5ec14df04f11..ddc534b39c22 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -779,12 +779,12 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	if (unlikely(ret < 0)) {
 		spin_lock(&dentry->d_lock);
-		if (dentry->d_lockref.count > 1) {
-			dentry->d_lockref.count--;
+		if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
 			spin_unlock(&dentry->d_lock);
 			return true;
 		}
-		return false;
+		dentry->d_lockref.count--;
+		goto locked;
 	}
 
 	/*
@@ -842,6 +842,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 * else could have killed it and marked it dead. Either way, we
 	 * don't need to do anything else.
 	 */
+locked:
 	if (dentry->d_lockref.count) {
 		spin_unlock(&dentry->d_lock);
 		return true;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread
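
The caller-side fallback being described boils down to the pattern below - a
bare sketch with the dcache-specific details stripped out (fast_dput() in the
patch above is the real thing); it returns true when the last reference went
away:

static bool put_ref(struct lockref *ref)
{
	int ret = lockref_put_return(ref);	/* lockless fastpath; may fail */

	if (ret >= 0)
		return ret == 0;		/* did we drop the last reference? */

	/* no fallback inside lockref_put_return() - do it here, under the lock */
	spin_lock(&ref->lock);
	if (WARN_ON_ONCE(ref->count <= 0)) {	/* underflow - whine and bail */
		spin_unlock(&ref->lock);
		return false;
	}
	ret = --ref->count;
	spin_unlock(&ref->lock);
	return ret == 0;
}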

* [PATCH 03/15] fast_dput(): new rules for refcount
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
  2023-11-01  6:20             ` [PATCH 02/15] fast_dput(): handle underflows gracefully Al Viro
@ 2023-11-01  6:20             ` Al Viro
  2023-11-01  6:20             ` [PATCH 04/15] __dput_to_list(): do decrement of refcount in the caller Al Viro
                               ` (11 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

Currently the "need caller to do more work" path in fast_dput()
has refcount decremented, then, with ->d_lock held and
refcount verified to have reached 0 fast_dput() forcibly resets
the refcount to 1.

Move that resetting refcount to 1 into the callers; later in
the series it will be massaged out of existence.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index ddc534b39c22..4108312f2426 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -847,13 +847,6 @@ static inline bool fast_dput(struct dentry *dentry)
 		spin_unlock(&dentry->d_lock);
 		return true;
 	}
-
-	/*
-	 * Re-get the reference we optimistically dropped. We hold the
-	 * lock, and we just tested that it was zero, so we can just
-	 * set it to 1.
-	 */
-	dentry->d_lockref.count = 1;
 	return false;
 }
 
@@ -896,6 +889,7 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
+		dentry->d_lockref.count = 1;
 		rcu_read_unlock();
 
 		if (likely(retain_dentry(dentry))) {
@@ -930,6 +924,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 		return;
 	}
 	rcu_read_unlock();
+	dentry->d_lockref.count = 1;
 	if (!retain_dentry(dentry))
 		__dput_to_list(dentry, list);
 	spin_unlock(&dentry->d_lock);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 04/15] __dput_to_list(): do decrement of refcount in the caller
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
  2023-11-01  6:20             ` [PATCH 02/15] fast_dput(): handle underflows gracefully Al Viro
  2023-11-01  6:20             ` [PATCH 03/15] fast_dput(): new rules for refcount Al Viro
@ 2023-11-01  6:20             ` Al Viro
  2023-11-01  6:20             ` [PATCH 05/15] retain_dentry(): lift decrement of ->d_count into callers Al Viro
                               ` (10 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

... and rename it to to_shrink_list(), seeing that it no longer
drops any references.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 4108312f2426..3a160717620b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -902,16 +902,13 @@ void dput(struct dentry *dentry)
 }
 EXPORT_SYMBOL(dput);
 
-static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+static void to_shrink_list(struct dentry *dentry, struct list_head *list)
 __must_hold(&dentry->d_lock)
 {
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		/* let the owner of the list it's on deal with it */
-		--dentry->d_lockref.count;
-	} else {
+	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!--dentry->d_lockref.count)
+		if (!dentry->d_lockref.count)
 			d_shrink_add(dentry, list);
 	}
 }
@@ -925,8 +922,10 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 	}
 	rcu_read_unlock();
 	dentry->d_lockref.count = 1;
-	if (!retain_dentry(dentry))
-		__dput_to_list(dentry, list);
+	if (!retain_dentry(dentry)) {
+		--dentry->d_lockref.count;
+		to_shrink_list(dentry, list);
+	}
 	spin_unlock(&dentry->d_lock);
 }
 
@@ -1184,8 +1183,10 @@ void shrink_dentry_list(struct list_head *list)
 		rcu_read_unlock();
 		d_shrink_del(dentry);
 		parent = dentry->d_parent;
-		if (parent != dentry)
-			__dput_to_list(parent, list);
+		if (parent != dentry) {
+			--parent->d_lockref.count;
+			to_shrink_list(parent, list);
+		}
 		__dentry_kill(dentry);
 	}
 }
@@ -1631,8 +1632,10 @@ void shrink_dcache_parent(struct dentry *parent)
 			} else {
 				rcu_read_unlock();
 				parent = data.victim->d_parent;
-				if (parent != data.victim)
-					__dput_to_list(parent, &data.dispose);
+				if (parent != data.victim) {
+					--parent->d_lockref.count;
+					to_shrink_list(parent, &data.dispose);
+				}
 				__dentry_kill(data.victim);
 			}
 		}
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 05/15] retain_dentry(): lift decrement of ->d_count into callers
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (2 preceding siblings ...)
  2023-11-01  6:20             ` [PATCH 04/15] __dput_to_list(): do decrement of refcount in the caller Al Viro
@ 2023-11-01  6:20             ` Al Viro
  2023-11-01  6:20             ` [PATCH 06/15] __dentry_kill(): get consistent rules for ->d_count Al Viro
                               ` (9 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 3a160717620b..0114b5195535 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -680,7 +680,6 @@ static inline bool retain_dentry(struct dentry *dentry)
 		return false;
 
 	/* retain; LRU fodder */
-	dentry->d_lockref.count--;
 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
 		d_lru_add(dentry);
 	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
@@ -744,6 +743,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	} else if (likely(!retain_dentry(dentry))) {
 		__dentry_kill(dentry);
 		return parent;
+	} else {
+		dentry->d_lockref.count--;
 	}
 	/* we are keeping it, after all */
 	if (inode)
@@ -893,6 +894,7 @@ void dput(struct dentry *dentry)
 		rcu_read_unlock();
 
 		if (likely(retain_dentry(dentry))) {
+			dentry->d_lockref.count--;
 			spin_unlock(&dentry->d_lock);
 			return;
 		}
@@ -925,6 +927,8 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 	if (!retain_dentry(dentry)) {
 		--dentry->d_lockref.count;
 		to_shrink_list(dentry, list);
+	} else {
+		--dentry->d_lockref.count;
 	}
 	spin_unlock(&dentry->d_lock);
 }
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 06/15] __dentry_kill(): get consistent rules for ->d_count
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (3 preceding siblings ...)
  2023-11-01  6:20             ` [PATCH 05/15] retain_dentry(): lift decrement of ->d_count into callers Al Viro
@ 2023-11-01  6:20             ` Al Viro
  2023-11-01  6:20             ` [PATCH 07/15] dentry_kill(): don't bother with retain_dentry() on slow path Al Viro
                               ` (8 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

Currently we call it with ->d_count equal to 1 when called from
dentry_kill(); all other callers have ->d_count equal to 0.

Make it always be called with zero ->d_count; at this step we
just decrement it before the calls in dentry_kill().  That is
safe, since all places that care about the value of ->d_count
either do that under ->d_lock or hold a reference to the dentry
in question.  Either is sufficient to prevent observing a
dentry immediately prior to the __dentry_kill() call from dentry_kill().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 0114b5195535..c89337ae30ce 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -729,6 +729,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 			goto slow_positive;
 		}
 	}
+	dentry->d_lockref.count--;
 	__dentry_kill(dentry);
 	return parent;
 
@@ -741,6 +742,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	if (unlikely(dentry->d_lockref.count != 1)) {
 		dentry->d_lockref.count--;
 	} else if (likely(!retain_dentry(dentry))) {
+		dentry->d_lockref.count--;
 		__dentry_kill(dentry);
 		return parent;
 	} else {
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread
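
The two patterns referred to above ("under ->d_lock or hold a reference") look
roughly like this - a sketch, not code from the series:

/* (a) look at the count with ->d_lock held - it cannot change under us */
static bool busy_locked(struct dentry *dentry)
{
	bool busy;

	spin_lock(&dentry->d_lock);
	busy = dentry->d_lockref.count > 0;
	spin_unlock(&dentry->d_lock);
	return busy;
}

/* (b) hold a reference of our own - the count cannot hit zero, so the
 * dentry cannot reach __dentry_kill() while we are looking at it */
static bool shared_by_others(struct dentry *dentry)
{
	return d_count(dentry) > 1;	/* caller owns one of those references */
}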

* [PATCH 07/15] dentry_kill(): don't bother with retain_dentry() on slow path
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (4 preceding siblings ...)
  2023-11-01  6:20             ` [PATCH 06/15] __dentry_kill(): get consistent rules for ->d_count Al Viro
@ 2023-11-01  6:20             ` Al Viro
  2023-11-01  6:20             ` [PATCH 08/15] Call retain_dentry() with refcount 0 Al Viro
                               ` (7 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

We have already checked it and the dentry did not look worth
keeping at the time.  The only hard obstacle to evicting a dentry is
a non-zero refcount; everything else is advisory - e.g. memory
pressure could evict any dentry found with refcount zero.
On the slow path in dentry_kill() we had dropped and regained
->d_lock; we must recheck the refcount, but everything else
is not worth bothering with.

Note that a filesystem can not count upon ->d_delete() being
called for a dentry - not even once.  Again, memory pressure
(as well as d_prune_aliases(), or an attempted rmdir() of an ancestor,
or...) will not call ->d_delete() at all.

So from the correctness point of view we are fine doing the
check only once.  And it makes things simpler down the road.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index c89337ae30ce..7931f5108581 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -739,14 +739,10 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	spin_lock(&dentry->d_lock);
 	parent = lock_parent(dentry);
 got_locks:
-	if (unlikely(dentry->d_lockref.count != 1)) {
-		dentry->d_lockref.count--;
-	} else if (likely(!retain_dentry(dentry))) {
-		dentry->d_lockref.count--;
+	dentry->d_lockref.count--;
+	if (likely(dentry->d_lockref.count == 0)) {
 		__dentry_kill(dentry);
 		return parent;
-	} else {
-		dentry->d_lockref.count--;
 	}
 	/* we are keeping it, after all */
 	if (inode)
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 08/15] Call retain_dentry() with refcount 0
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (5 preceding siblings ...)
  2023-11-01  6:20             ` [PATCH 07/15] dentry_kill(): don't bother with retain_dentry() on slow path Al Viro
@ 2023-11-01  6:20             ` Al Viro
  2023-11-01  6:20             ` [PATCH 09/15] fold the call of retain_dentry() into fast_dput() Al Viro
                               ` (6 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

Instead of bumping it from 0 to 1, calling retain_dentry(), then
decrementing it back to 0 (with ->d_lock held all the way through),
just leave refcount at 0 through all of that.

It will have a visible effect for ->d_delete() - now it can be
called with refcount 0 instead of 1 and it can no longer play
silly buggers with dropping/regaining ->d_lock.  Not that any
in-tree instances tried to (it's pretty hard to get right).

Any out-of-tree ones will have to adjust (assuming they need any
changes).

Note that we do not need to extend the rcu-critical area here - we have
verified that the refcount is non-negative after having grabbed ->d_lock,
so nobody will be able to free the dentry until they get into __dentry_kill(),
which won't happen until they manage to grab ->d_lock.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 7931f5108581..30bebec591db 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -888,15 +888,14 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		dentry->d_lockref.count = 1;
 		rcu_read_unlock();
 
 		if (likely(retain_dentry(dentry))) {
-			dentry->d_lockref.count--;
 			spin_unlock(&dentry->d_lock);
 			return;
 		}
 
+		dentry->d_lockref.count = 1;
 		dentry = dentry_kill(dentry);
 	}
 }
@@ -921,13 +920,8 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 		return;
 	}
 	rcu_read_unlock();
-	dentry->d_lockref.count = 1;
-	if (!retain_dentry(dentry)) {
-		--dentry->d_lockref.count;
+	if (!retain_dentry(dentry))
 		to_shrink_list(dentry, list);
-	} else {
-		--dentry->d_lockref.count;
-	}
 	spin_unlock(&dentry->d_lock);
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread
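
For an out-of-tree instance, the adjustment amounts to not assuming the old
refcount value - e.g. something along these lines (names invented purely for
illustration):

static int foofs_d_delete(const struct dentry *dentry)
{
	/* used to be able to assume d_count(dentry) == 1 here; not anymore */
	WARN_ON_ONCE(d_count(dentry) != 0);
	return 1;	/* still purely advisory */
}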

* [PATCH 09/15] fold the call of retain_dentry() into fast_dput()
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (6 preceding siblings ...)
  2023-11-01  6:20             ` [PATCH 08/15] Call retain_dentry() with refcount 0 Al Viro
@ 2023-11-01  6:20             ` Al Viro
  2023-11-01  8:45               ` Al Viro
  2023-11-01  6:20             ` [PATCH 10/15] don't try to cut corners in shrink_lock_dentry() Al Viro
                               ` (5 subsequent siblings)
  13 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

Calls of retain_dentry() happen immediately after getting false
from fast_dput() and getting true from retain_dentry() is
treated the same way as non-zero refcount would be treated by
fast_dput() - unlock dentry and bugger off.

Doing that in fast_dput() itself is simpler.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 30bebec591db..6f79d452af81 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -757,6 +757,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
  * Try to do a lockless dput(), and return whether that was successful.
  *
  * If unsuccessful, we return false, having already taken the dentry lock.
+ * In that case refcount is guaranteed to be zero and we have already
+ * decided that it's not worth keeping around.
  *
  * The caller needs to hold the RCU read lock, so that the dentry is
  * guaranteed to stay around even if the refcount goes down to zero!
@@ -842,7 +844,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 * don't need to do anything else.
 	 */
 locked:
-	if (dentry->d_lockref.count) {
+	if (dentry->d_lockref.count || retain_dentry(dentry)) {
 		spin_unlock(&dentry->d_lock);
 		return true;
 	}
@@ -889,12 +891,6 @@ void dput(struct dentry *dentry)
 
 		/* Slow case: now with the dentry lock held */
 		rcu_read_unlock();
-
-		if (likely(retain_dentry(dentry))) {
-			spin_unlock(&dentry->d_lock);
-			return;
-		}
-
 		dentry->d_lockref.count = 1;
 		dentry = dentry_kill(dentry);
 	}
@@ -920,8 +916,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 		return;
 	}
 	rcu_read_unlock();
-	if (!retain_dentry(dentry))
-		to_shrink_list(dentry, list);
+	to_shrink_list(dentry, list);
 	spin_unlock(&dentry->d_lock);
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 10/15] don't try to cut corners in shrink_lock_dentry()
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (7 preceding siblings ...)
  2023-11-01  6:20             ` [PATCH 09/15] fold the call of retain_dentry() into fast_dput() Al Viro
@ 2023-11-01  6:20             ` Al Viro
  2023-11-01  6:21             ` [PATCH 11/15] fold dentry_kill() into dput() Al Viro
                               ` (4 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

That is to say, do *not* treat the ->d_inode or ->d_parent changes
as "it's hard, return false; somebody must have grabbed it, so
even if it has zero refcount, we don't need to bother killing it -
the final dput() from whoever grabbed it would've done everything".

First of all, that is not guaranteed.  It might have been dropped
by dput_to_list(), which would've found it already on a shrink
list (ours) and decided that it doesn't need to put it on its own
shrink list.

What's more, dentry_kill() is doing pretty much the same thing,
cutting its own set of corners (it assumes that dentry can't
go from positive to negative, so inode can change but only once
and only in one direction).

Doing that right allows us to get rid of that not-quite-duplication
and removes the only reason to play silly buggers with re-incrementing
the refcount before dentry_kill().

The replacement is called lock_for_kill(); it is called under rcu_read_lock
and with ->d_lock held.  If it returns false, the dentry has a non-zero
refcount and the same locks are held.  If it returns true, the
dentry has zero refcount and its parent and inode (if any) are
locked.

Part of __lock_parent() had been lifted into lock_parent() to
allow its reuse.  __lock_parent() is now called with rcu_read_lock already
held and the dentry already unlocked.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 159 ++++++++++++++++++++++------------------------------
 1 file changed, 66 insertions(+), 93 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 6f79d452af81..5fd6162cd994 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -625,8 +625,6 @@ static void __dentry_kill(struct dentry *dentry)
 static struct dentry *__lock_parent(struct dentry *dentry)
 {
 	struct dentry *parent;
-	rcu_read_lock();
-	spin_unlock(&dentry->d_lock);
 again:
 	parent = READ_ONCE(dentry->d_parent);
 	spin_lock(&parent->d_lock);
@@ -642,7 +640,6 @@ static struct dentry *__lock_parent(struct dentry *dentry)
 		spin_unlock(&parent->d_lock);
 		goto again;
 	}
-	rcu_read_unlock();
 	if (parent != dentry)
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	else
@@ -657,7 +654,64 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
 		return NULL;
 	if (likely(spin_trylock(&parent->d_lock)))
 		return parent;
-	return __lock_parent(dentry);
+	rcu_read_lock();
+	spin_unlock(&dentry->d_lock);
+	parent = __lock_parent(dentry);
+	rcu_read_unlock();
+	return parent;
+}
+
+/*
+ * Lock a dentry for feeding it to __dentry_kill().
+ * Called under rcu_read_lock() and dentry->d_lock; the former
+ * guarantees that nothing we access will be freed under us.
+ * Note that dentry is *not* protected from concurrent dentry_kill(),
+ * d_delete(), etc.
+ *
+ * Return false if dentry is busy.  Otherwise, return true and have
+ * that dentry's inode and parent both locked.
+ */
+
+static bool lock_for_kill(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = dentry->d_parent;
+
+	if (unlikely(dentry->d_lockref.count))
+		return false;
+
+	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+		goto slow;
+	if (dentry == parent)
+		return true;
+	if (likely(spin_trylock(&parent->d_lock)))
+		return true;
+
+	if (inode)
+		spin_unlock(&inode->i_lock);
+slow:
+	spin_unlock(&dentry->d_lock);
+
+	for (;;) {
+		if (inode)
+			spin_lock(&inode->i_lock);
+		parent = __lock_parent(dentry);
+		if (likely(inode == dentry->d_inode))
+			break;
+		if (inode)
+			spin_unlock(&inode->i_lock);
+		inode = dentry->d_inode;
+		spin_unlock(&dentry->d_lock);
+		if (parent)
+			spin_unlock(&parent->d_lock);
+	}
+	if (likely(!dentry->d_lockref.count))
+		return true;
+	if (inode)
+		spin_unlock(&inode->i_lock);
+	if (parent)
+		spin_unlock(&parent->d_lock);
+	return false;
 }
 
 static inline bool retain_dentry(struct dentry *dentry)
@@ -710,45 +764,16 @@ EXPORT_SYMBOL(d_mark_dontcache);
 static struct dentry *dentry_kill(struct dentry *dentry)
 	__releases(dentry->d_lock)
 {
-	struct inode *inode = dentry->d_inode;
-	struct dentry *parent = NULL;
 
-	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
-		goto slow_positive;
-
-	if (!IS_ROOT(dentry)) {
-		parent = dentry->d_parent;
-		if (unlikely(!spin_trylock(&parent->d_lock))) {
-			parent = __lock_parent(dentry);
-			if (likely(inode || !dentry->d_inode))
-				goto got_locks;
-			/* negative that became positive */
-			if (parent)
-				spin_unlock(&parent->d_lock);
-			inode = dentry->d_inode;
-			goto slow_positive;
-		}
-	}
 	dentry->d_lockref.count--;
-	__dentry_kill(dentry);
-	return parent;
-
-slow_positive:
-	spin_unlock(&dentry->d_lock);
-	spin_lock(&inode->i_lock);
-	spin_lock(&dentry->d_lock);
-	parent = lock_parent(dentry);
-got_locks:
-	dentry->d_lockref.count--;
-	if (likely(dentry->d_lockref.count == 0)) {
+	rcu_read_lock();
+	if (likely(lock_for_kill(dentry))) {
+		struct dentry *parent = dentry->d_parent;
+		rcu_read_unlock();
 		__dentry_kill(dentry);
-		return parent;
+		return parent != dentry ? parent : NULL;
 	}
-	/* we are keeping it, after all */
-	if (inode)
-		spin_unlock(&inode->i_lock);
-	if (parent)
-		spin_unlock(&parent->d_lock);
+	rcu_read_unlock();
 	spin_unlock(&dentry->d_lock);
 	return NULL;
 }
@@ -1100,58 +1125,6 @@ void d_prune_aliases(struct inode *inode)
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
-/*
- * Lock a dentry from shrink list.
- * Called under rcu_read_lock() and dentry->d_lock; the former
- * guarantees that nothing we access will be freed under us.
- * Note that dentry is *not* protected from concurrent dentry_kill(),
- * d_delete(), etc.
- *
- * Return false if dentry has been disrupted or grabbed, leaving
- * the caller to kick it off-list.  Otherwise, return true and have
- * that dentry's inode and parent both locked.
- */
-static bool shrink_lock_dentry(struct dentry *dentry)
-{
-	struct inode *inode;
-	struct dentry *parent;
-
-	if (dentry->d_lockref.count)
-		return false;
-
-	inode = dentry->d_inode;
-	if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
-		spin_unlock(&dentry->d_lock);
-		spin_lock(&inode->i_lock);
-		spin_lock(&dentry->d_lock);
-		if (unlikely(dentry->d_lockref.count))
-			goto out;
-		/* changed inode means that somebody had grabbed it */
-		if (unlikely(inode != dentry->d_inode))
-			goto out;
-	}
-
-	parent = dentry->d_parent;
-	if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
-		return true;
-
-	spin_unlock(&dentry->d_lock);
-	spin_lock(&parent->d_lock);
-	if (unlikely(parent != dentry->d_parent)) {
-		spin_unlock(&parent->d_lock);
-		spin_lock(&dentry->d_lock);
-		goto out;
-	}
-	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	if (likely(!dentry->d_lockref.count))
-		return true;
-	spin_unlock(&parent->d_lock);
-out:
-	if (inode)
-		spin_unlock(&inode->i_lock);
-	return false;
-}
-
 void shrink_dentry_list(struct list_head *list)
 {
 	while (!list_empty(list)) {
@@ -1160,7 +1133,7 @@ void shrink_dentry_list(struct list_head *list)
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
 		rcu_read_lock();
-		if (!shrink_lock_dentry(dentry)) {
+		if (!lock_for_kill(dentry)) {
 			bool can_free = false;
 			rcu_read_unlock();
 			d_shrink_del(dentry);
@@ -1617,7 +1590,7 @@ void shrink_dcache_parent(struct dentry *parent)
 		if (data.victim) {
 			struct dentry *parent;
 			spin_lock(&data.victim->d_lock);
-			if (!shrink_lock_dentry(data.victim)) {
+			if (!lock_for_kill(data.victim)) {
 				spin_unlock(&data.victim->d_lock);
 				rcu_read_unlock();
 			} else {
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 11/15] fold dentry_kill() into dput()
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (8 preceding siblings ...)
  2023-11-01  6:20             ` [PATCH 10/15] don't try to cut corners in shrink_lock_dentry() Al Viro
@ 2023-11-01  6:21             ` Al Viro
  2023-11-01  6:21             ` [PATCH 12/15] get rid of __dget() Al Viro
                               ` (3 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:21 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 37 ++++++++++++-------------------------
 1 file changed, 12 insertions(+), 25 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 5fd6162cd994..5114514b13da 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -756,28 +756,6 @@ void d_mark_dontcache(struct inode *inode)
 }
 EXPORT_SYMBOL(d_mark_dontcache);
 
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
-{
-
-	dentry->d_lockref.count--;
-	rcu_read_lock();
-	if (likely(lock_for_kill(dentry))) {
-		struct dentry *parent = dentry->d_parent;
-		rcu_read_unlock();
-		__dentry_kill(dentry);
-		return parent != dentry ? parent : NULL;
-	}
-	rcu_read_unlock();
-	spin_unlock(&dentry->d_lock);
-	return NULL;
-}
-
 /*
  * Try to do a lockless dput(), and return whether that was successful.
  *
@@ -915,9 +893,18 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		rcu_read_unlock();
-		dentry->d_lockref.count = 1;
-		dentry = dentry_kill(dentry);
+		if (likely(lock_for_kill(dentry))) {
+			struct dentry *parent = dentry->d_parent;
+			rcu_read_unlock();
+			__dentry_kill(dentry);
+			if (dentry == parent)
+				return;
+			dentry = parent;
+		} else {
+			rcu_read_unlock();
+			spin_unlock(&dentry->d_lock);
+			return;
+		}
 	}
 }
 EXPORT_SYMBOL(dput);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 12/15] get rid of __dget()
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (9 preceding siblings ...)
  2023-11-01  6:21             ` [PATCH 11/15] fold dentry_kill() into dput() Al Viro
@ 2023-11-01  6:21             ` Al Viro
  2023-11-01  6:21             ` [PATCH 13/15] shrink_dentry_list(): no need to check that dentry refcount is marked dead Al Viro
                               ` (2 subsequent siblings)
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:21 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

fold into the sole remaining caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 5114514b13da..49b3fd27559f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -938,11 +938,6 @@ static inline void __dget_dlock(struct dentry *dentry)
 	dentry->d_lockref.count++;
 }
 
-static inline void __dget(struct dentry *dentry)
-{
-	lockref_get(&dentry->d_lockref);
-}
-
 struct dentry *dget_parent(struct dentry *dentry)
 {
 	int gotref;
@@ -992,7 +987,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
 	if (hlist_empty(&inode->i_dentry))
 		return NULL;
 	alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
-	__dget(alias);
+	lockref_get(&alias->d_lockref);
 	return alias;
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 13/15] shrink_dentry_list(): no need to check that dentry refcount is marked dead
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (10 preceding siblings ...)
  2023-11-01  6:21             ` [PATCH 12/15] get rid of __dget() Al Viro
@ 2023-11-01  6:21             ` Al Viro
  2023-11-01  6:21             ` [PATCH 14/15] to_shrink_list(): call only if refcount is 0 Al Viro
  2023-11-01  6:21             ` [PATCH 15/15] switch select_collect{,2}() to use of to_shrink_list() Al Viro
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:21 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

... we won't see DCACHE_MAY_FREE on anything that is *not* dead,
and checking d_flags is just as cheap as checking d_count.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 49b3fd27559f..19f6eb6f2bde 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1116,11 +1116,10 @@ void shrink_dentry_list(struct list_head *list)
 		spin_lock(&dentry->d_lock);
 		rcu_read_lock();
 		if (!lock_for_kill(dentry)) {
-			bool can_free = false;
+			bool can_free;
 			rcu_read_unlock();
 			d_shrink_del(dentry);
-			if (dentry->d_lockref.count < 0)
-				can_free = dentry->d_flags & DCACHE_MAY_FREE;
+			can_free = dentry->d_flags & DCACHE_MAY_FREE;
 			spin_unlock(&dentry->d_lock);
 			if (can_free)
 				dentry_free(dentry);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 14/15] to_shrink_list(): call only if refcount is 0
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (11 preceding siblings ...)
  2023-11-01  6:21             ` [PATCH 13/15] shrink_dentry_list(): no need to check that dentry refcount is marked dead Al Viro
@ 2023-11-01  6:21             ` Al Viro
  2023-11-01  6:21             ` [PATCH 15/15] switch select_collect{,2}() to use of to_shrink_list() Al Viro
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:21 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

The only thing it does if refcount is not zero is d_lru_del(); no
point, IMO, seeing that plain dput() does nothing of that sort...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 19f6eb6f2bde..7c763a8c916b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -915,8 +915,7 @@ __must_hold(&dentry->d_lock)
 	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!dentry->d_lockref.count)
-			d_shrink_add(dentry, list);
+		d_shrink_add(dentry, list);
 	}
 }
 
@@ -1128,10 +1127,8 @@ void shrink_dentry_list(struct list_head *list)
 		rcu_read_unlock();
 		d_shrink_del(dentry);
 		parent = dentry->d_parent;
-		if (parent != dentry) {
-			--parent->d_lockref.count;
+		if (parent != dentry && !--parent->d_lockref.count)
 			to_shrink_list(parent, list);
-		}
 		__dentry_kill(dentry);
 	}
 }
@@ -1577,10 +1574,9 @@ void shrink_dcache_parent(struct dentry *parent)
 			} else {
 				rcu_read_unlock();
 				parent = data.victim->d_parent;
-				if (parent != data.victim) {
-					--parent->d_lockref.count;
+				if (parent != data.victim &&
+				    !--parent->d_lockref.count)
 					to_shrink_list(parent, &data.dispose);
-				}
 				__dentry_kill(data.victim);
 			}
 		}
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 15/15] switch select_collect{,2}() to use of to_shrink_list()
  2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                               ` (12 preceding siblings ...)
  2023-11-01  6:21             ` [PATCH 14/15] to_shrink_list(): call only if refcount is 0 Al Viro
@ 2023-11-01  6:21             ` Al Viro
  13 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-01  6:21 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 7c763a8c916b..c47d08da390f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1491,13 +1491,9 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
 
 	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
 		data->found++;
-	} else {
-		if (dentry->d_flags & DCACHE_LRU_LIST)
-			d_lru_del(dentry);
-		if (!dentry->d_lockref.count) {
-			d_shrink_add(dentry, &data->dispose);
-			data->found++;
-		}
+	} else if (!dentry->d_lockref.count) {
+		to_shrink_list(dentry, &data->dispose);
+		data->found++;
 	}
 	/*
 	 * We can return to the caller if we have found some (this
@@ -1518,17 +1514,13 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
 	if (data->start == dentry)
 		goto out;
 
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		if (!dentry->d_lockref.count) {
+	if (!dentry->d_lockref.count) {
+		if (dentry->d_flags & DCACHE_SHRINK_LIST) {
 			rcu_read_lock();
 			data->victim = dentry;
 			return D_WALK_QUIT;
 		}
-	} else {
-		if (dentry->d_flags & DCACHE_LRU_LIST)
-			d_lru_del(dentry);
-		if (!dentry->d_lockref.count)
-			d_shrink_add(dentry, &data->dispose);
+		to_shrink_list(dentry, &data->dispose);
 	}
 	/*
 	 * We can return to the caller if we have found some (this
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput()
  2023-11-01  6:20             ` [PATCH 09/15] fold the call of retain_dentry() into fast_dput() Al Viro
@ 2023-11-01  8:45               ` Al Viro
  2023-11-01 17:30                 ` Linus Torvalds
  0 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-01  8:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Wed, Nov 01, 2023 at 06:20:58AM +0000, Al Viro wrote:
> Calls of retain_dentry() happen immediately after getting false
> from fast_dput() and getting true from retain_dentry() is
> treated the same way as non-zero refcount would be treated by
> fast_dput() - unlock dentry and bugger off.
> 
> Doing that in fast_dput() itself is simpler.

FWIW, I wonder if it would be better to reorganize it a bit -

// in some cases we can show that retain_dentry() would return true
// without having to take ->d_lock
< your comments regarding that part go here>
static inline bool lockless_retain_dentry(struct dentry *dentry)
{
        unsigned int d_flags;

        smp_rmb();
        d_flags = READ_ONCE(dentry->d_flags);
        d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
                        DCACHE_DISCONNECTED | DCACHE_DONTCACHE;

        /* Nothing to do? Dropping the reference was all we needed? */
        if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
                return true;
	return false;
}

and fast_dput() becomes

{
        int ret;
	// try to do decrement locklessly
	ret = lockref_put_return(&dentry->d_lockref);
	if (likely(ret >= 0)) {
		// could we show that full check would succeed?
		if (ret || lockless_retain_dentry(dentry))
			return true;
		// too bad, have to lock it and do full variant
		spin_lock(&dentry->d_lock);
	} else {
		// failed, no chance to avoid grabbing lock
                spin_lock(&dentry->d_lock);
		// underflow?  whine and back off - we are done
                if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
                        spin_unlock(&dentry->d_lock);
                        return true;
                }
		// decrement it under lock, then...
                dentry->d_lockref.count--;
        }
	// full check it is...
        if (dentry->d_lockref.count || retain_dentry(dentry)) {
                spin_unlock(&dentry->d_lock);
                return true;
        }
        return false;
}

Might be easier to follow that way...  Another thing: would you mind

#if USE_CMPXCHG_LOCKREF
extern int lockref_put_return(struct lockref *);
#else
static inline int lockref_put_return(struct lockref *l)
{
	return -1;
}
#endif

in include/linux/lockref.h?  Would be useful on DEBUG_SPINLOCK configs...

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-11-01  2:22       ` [RFC] simplifying fast_dput(), dentry_kill() et.al Al Viro
@ 2023-11-01 14:29         ` Benjamin Coddington
  0 siblings, 0 replies; 119+ messages in thread
From: Benjamin Coddington @ 2023-11-01 14:29 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel, linux-nfs, Olga Kornievskaia

On 31 Oct 2023, at 22:22, Al Viro wrote:

> [NFS folks Cc'd]
>
> On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
>> On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
>>> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>>>
>>>> After fixing a couple of brainos, it seems to work.
>>>
>>> This all makes me unnaturally nervous, probably because it;s overly
>>> subtle, and I have lost the context for some of the rules.
>>
>> A bit of context: I started to look at the possibility of refcount overflows.
>> Writing the current rules for dentry refcounting and lifetime down was the
>> obvious first step, and that immediately turned into an awful mess.
>>
>> It is overly subtle.  Even more so when you throw the shrink lists into
>> the mix - shrink_lock_dentry() got too smart for its own good, and that
>> leads to really awful correctness proofs.
>
> ... and for another example of subtle shit, consider DCACHE_NORCU.  Recall
> c0eb027e5aef "vfs: don't do RCU lookup of empty pathnames" and note that
> it relies upon never getting results of alloc_file_pseudo() with directory
> inode anywhere near descriptor tables.
>
> Back then I basically went "fine, nobody would ever use alloc_file_pseudo()
> for that anyway", but... there's a call in __nfs42_ssc_open() that doesn't
> have any obvious protection against ending up with directory inode.
> That does not end up anywhere near descriptor tables, as far as I can tell,
> fortunately.
>
> Unfortunately, it is quite capable of fucking the things up in different
> ways, even if it's promptly closed.  d_instantiate() on directory inode
> is a really bad thing; a plenty of places expect to have only one alias
> for those, and would be very unhappy with that kind of crap without any
> RCU considerations.
>
> I'm pretty sure that this NFS code really does not want to use that for
> directories; the simplest solution would be to refuse alloc_file_pseudo()
> for directory inodes.  NFS folks - do you have a problem with the
> following patch?

It would be a protocol violation to use COPY on a directory:

https://www.rfc-editor.org/rfc/rfc7862.html#section-15.2.3

   Both SAVED_FH and CURRENT_FH must be regular files.  If either
   SAVED_FH or CURRENT_FH is not a regular file, the operation MUST fail
   and return NFS4ERR_WRONG_TYPE.

so nfsd4_verify_copy() does S_ISREG() checks before alloc_file_pseudo().

Ben


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput()
  2023-11-01  8:45               ` Al Viro
@ 2023-11-01 17:30                 ` Linus Torvalds
  2023-11-01 18:19                   ` Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Linus Torvalds @ 2023-11-01 17:30 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel

On Tue, 31 Oct 2023 at 22:45, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Wed, Nov 01, 2023 at 06:20:58AM +0000, Al Viro wrote:
> > Calls of retain_dentry() happen immediately after getting false
> > from fast_dput() and getting true from retain_dentry() is
> > treated the same way as non-zero refcount would be treated by
> > fast_dput() - unlock dentry and bugger off.
> >
> > Doing that in fast_dput() itself is simpler.
>
> FWIW, I wonder if it would be better to reorganize it a bit -

Hmm. Yes. Except I don't love how the retaining logic is then duplicated.

Could we perhaps at least try to share the dentry flag tests between
the "real" retain_dentry() code and the lockless version?

> Another thing: would you mind
>
> #if USE_CMPXCHG_LOCKREF
> extern int lockref_put_return(struct lockref *);
> #else
> static inline int lockref_put_return(struct lockref *l)
> {
>         return -1;
> }
> #endif
>
> in include/linux/lockref.h?  Would be useful on DEBUG_SPINLOCK configs...

The above sounds like a good idea, not only for better code generation
for the debug case, but because it would have possibly made the erofs
misuse more obvious to people.

             Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput()
  2023-11-01 17:30                 ` Linus Torvalds
@ 2023-11-01 18:19                   ` Al Viro
  2023-11-10  4:20                     ` lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput()) Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-01 18:19 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Wed, Nov 01, 2023 at 07:30:34AM -1000, Linus Torvalds wrote:
> On Tue, 31 Oct 2023 at 22:45, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > On Wed, Nov 01, 2023 at 06:20:58AM +0000, Al Viro wrote:
> > > Calls of retain_dentry() happen immediately after getting false
> > > from fast_dput() and getting true from retain_dentry() is
> > > treated the same way as non-zero refcount would be treated by
> > > fast_dput() - unlock dentry and bugger off.
> > >
> > > Doing that in fast_dput() itself is simpler.
> >
> > FWIW, I wonder if it would be better to reorganize it a bit -
> 
> Hmm. Yes. Except I don't love how the retaining logic is then duplicated.

Er...  That change would be an equivalent transformation - the same duplication
is there right now...

> Could we perhaps at least try to share the dentry flag tests between
> the "real" retain_dentry() code and the lockless version?

Umm...  There are 3 groups:

DONTCACHE, DISCONNECTED - hard false
!LRU_LIST, !REFERENCED - not an obstacle to true, but need to take locks
OP_DELETE - can't tell without asking filesystem, which would need ->d_lock.

gcc-12 on x86 turns the series of ifs into
        movl    %edi, %eax
	andl    $32832, %eax
	cmpl    $32832, %eax
	jne     .L17
	andl    $168, %edi
	jne     .L17
instead of combining that into
        andl    $33000, %edi
	cmpl    $32832, %edi
	jne     .L17

OTOH, that's not much of a pessimization...  Up to you.
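
Spelled out (sketch only, not something the series adds), the three groups
amount to

enum retain_class {
	RETAIN_NO,		/* hard false */
	RETAIN_NEEDS_LOCK,	/* might be retained, but ->d_lock is needed */
	RETAIN_LOCKLESS,	/* safe to say "retain" without ->d_lock */
};

static inline enum retain_class classify(unsigned int d_flags)
{
	if (d_flags & (DCACHE_DONTCACHE | DCACHE_DISCONNECTED))
		return RETAIN_NO;
	if (d_flags & DCACHE_OP_DELETE)		/* have to ask the filesystem */
		return RETAIN_NEEDS_LOCK;
	if ((d_flags & (DCACHE_LRU_LIST | DCACHE_REFERENCED)) !=
	    (DCACHE_LRU_LIST | DCACHE_REFERENCED))
		return RETAIN_NEEDS_LOCK;	/* d_lru_add()/flag update ahead */
	return RETAIN_LOCKLESS;
}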


> > Another thing: would you mind
> >
> > #if USE_CMPXCHG_LOCKREF
> > extern int lockref_put_return(struct lockref *);
> > #else
> > static inline int lockref_put_return(struct lockref *l)
> > {
> >         return -1;
> > }
> > #endif
> >
> > in include/linux/lockref.h?  Would be useful on DEBUG_SPINLOCK configs...
> 
> The above sounds like a good idea, not only for better code generation
> for the debug case, but because it would have possibly made the erofs
> misuse more obvious to people.

To make it even more obvious, perhaps rename it as well?  I.e.

/*
 * unlike the rest of these primitives, the one below does *not* contain
 * a fallback; caller needs to take care of handling that.
 */
#if USE_CMPXCHG_LOCKREF
extern int __lockref_put_return(struct lockref *);
#else
static inline int __lockref_put_return(struct lockref *l)
{
	return -1;
}
#endif

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-10-31  0:18     ` Al Viro
  2023-10-31  1:53       ` Al Viro
  2023-11-01  2:22       ` [RFC] simplifying fast_dput(), dentry_kill() et.al Al Viro
@ 2023-11-05 19:54       ` Al Viro
  2023-11-05 21:59         ` Al Viro
  2023-11-06  5:53         ` Al Viro
  2 siblings, 2 replies; 119+ messages in thread
From: Al Viro @ 2023-11-05 19:54 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
> On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > After fixing a couple of brainos, it seems to work.
> > 
> > This all makes me unnaturally nervous, probably because it;s overly
> > subtle, and I have lost the context for some of the rules.
> 
> A bit of context: I started to look at the possibility of refcount overflows.
> Writing the current rules for dentry refcounting and lifetime down was the
> obvious first step, and that immediately turned into an awful mess.
> 
> It is overly subtle.

	Another piece of too subtle shite: ordering of ->d_iput() of a child
vs. __dentry_kill() of its parent.  As it is, in some cases it is possible for
the latter to happen before the former.  It is *not* possible in the cases
when in-tree ->d_iput() instances actually look at the parent (all of those
are due to sillyrename stuff), but the proof is convoluted and very brittle.

	The origin of that mess is in the interaction of shrink_dcache_for_umount()
with shrink_dentry_list().  What we want to avoid is a directory looking like
it's busy because shrink_dcache_for_umount() doesn't see any children to account
for the positive refcount of the parent.  The kinda-sorta solution we use is to
decrement the parent's refcount *before* __dentry_kill() of the child and put
said parent into a shrink list.  That makes shrink_dcache_for_umount() do the
right thing, but it's possible to end up with the parent freed before the child
is done with; the scenario is non-obvious, and rather hard to hit, but it's not
impossible.

	dput() does no such thing - it does not decrement the parent's
refcount until the child has been taken care of.  That's fine, as far
as shrink_dcache_for_umount() is concerned - this is not a false positive;
with slightly different timing shrink_dcache_for_umount() would've reported
the child as being busy.  IOW, there should be no overlap between dput()
in one thread and shrink_dcache_for_umount() in another.  Unfortunately,
memory eviction *can* come in the middle of shrink_dcache_for_umount().

	Life would be much simpler if shrink_dentry_list() did not have
to pull that kind of trick and used the same ordering as dput() does.
IMO there's a reasonably cheap way to achieve that:

	* have shrink_dcache_for_umount() mark the superblock (either in
->s_flags or inside the ->s_dentry_lru itself) and have the logic
in retain_dentry() that does the insertion into the LRU list check ->d_sb
for that mark, treating its presence as "do not retain".
	* after marking the superblock, shrink_dcache_for_umount() is guaranteed
that nothing new will be added to the shrink list in question.  Have it call
shrink_dcache_sb() to drain the LRU.
	* Now shrink_dentry_list() in one thread hitting a dentry on
a superblock going through shrink_dcache_for_umount() in another thread is
always a bug, and reporting busy dentries is the right thing to do.
So we can switch shrink_dentry_list() to the same "drop the reference to the
parent only after the child has been killed" ordering as we have in dput().

	IMO that removes a fairly nasty trap for ->d_iput() and ->d_delete()
instances.  As for the overhead, the relevant fragment of retain_dentry() is
	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
		d_lru_add(dentry);
	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
		dentry->d_flags |= DCACHE_REFERENCED;
	return true;
That would become
	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
		if (unlikely(dentry->d_sb is marked))
			return false;
		d_lru_add(dentry);
	} else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
		dentry->d_flags |= DCACHE_REFERENCED;
	return true;
Note that d_lru_add() will hit ->d_sb->s_dentry_lru, so we are not
adding memory traffic here; the else if part doesn't need to be
touched - we only need to prevent insertions into the LRU.

	Comments?

^ permalink raw reply	[flat|nested] 119+ messages in thread
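
The shrink_dcache_for_umount() side of that proposal would be on the order of
the following sketch; SB_NO_LRU_ADD is an invented name - whether the mark
lives in ->s_flags or inside ->s_dentry_lru is left open above:

static void mark_and_drain_lru(struct super_block *sb)
{
	sb->s_flags |= SB_NO_LRU_ADD;	/* hypothetical "do not retain" mark */
	/*
	 * From this point on retain_dentry() refuses to put anything from
	 * this superblock on the LRU, so draining it once is enough.
	 */
	shrink_dcache_sb(sb);
}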

* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-11-05 19:54       ` Al Viro
@ 2023-11-05 21:59         ` Al Viro
  2023-11-06  5:53         ` Al Viro
  1 sibling, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-05 21:59 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Sun, Nov 05, 2023 at 07:54:16PM +0000, Al Viro wrote:

> 	* have shrink_dcache_for_umount() mark the superblock (either in
> ->s_flags or inside the ->s_dentry_lru itself) and have the logics
> in retain_dentry() that does insertion into LRU list check ->d_sb for that
> mark, treating its presence as "do not retain".

	BTW, no barriers, etc. are needed for those - once we are into
shrink_dcache_for_umount() there must be no dput() calls anywhere on
that filesystem.

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-11-05 19:54       ` Al Viro
  2023-11-05 21:59         ` Al Viro
@ 2023-11-06  5:53         ` Al Viro
  2023-11-07  2:08           ` Al Viro
  1 sibling, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-06  5:53 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Tobin C. Harding

On Sun, Nov 05, 2023 at 07:54:16PM +0000, Al Viro wrote:
> On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
> > On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> > > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > > >
> > > > After fixing a couple of brainos, it seems to work.
> > > 
> > > This all makes me unnaturally nervous, probably because it;s overly
> > > subtle, and I have lost the context for some of the rules.
> > 
> > A bit of context: I started to look at the possibility of refcount overflows.
> > Writing the current rules for dentry refcounting and lifetime down was the
> > obvious first step, and that immediately turned into an awful mess.
> > 
> > It is overly subtle.
> 
> 	Another piece of too subtle shite: ordering of ->d_iput() of child
> and __dentry_kill() of parent.  As it is, in some cases it is possible for
> the latter to happen before the former.  It is *not* possible in the cases
> when in-tree ->d_iput() instances actually look at the parent (all of those
> are due to sillyrename stuff), but the proof is convoluted and very brittle.
> 
> 	The origin of that mess is in the interaction of shrink_dcache_for_umount()
> with shrink_dentry_list().  What we want to avoid is a directory looking like
> it's busy since shrink_dcache_for_umount() doesn't see any children to account
> for positive refcount of parent.  The kinda-sorta solution we use is to decrement
> the parent's refcount *before* __dentry_kill() of child and put said parent
> into a shrink list.  That makes shrink_dcache_for_umount() do the right thing,
> but it's possible to end up with parent freed before the child is done with;
> scenario is non-obvious, and rather hard to hit, but it's not impossible.

D'oh...  We actually don't need to worry about eviction on memory pressure at that
point; unregister_shrinker() is done early enough to prevent that.

So shrink_dcache_for_umount() does not need to worry about the use of
shrink lists in prune_dcache_sb().

For the use in namespace_unlock(), all dentries involved are guaranteed to
either have a matching mount in the list of mounts to be dropped (and thus be
protected from simultaneous fs shutdown) or have a matching mount pinned by
the caller.

The use in mntput_no_expire() has the same guarantee - all dentries involved
are on the superblock of the mount we are going to drop after the call of
shrink_dentry_list().

All other users also either have an active reference to the superblock, or are
done by ->kill_sb() synchronously (and thus can't race with
shrink_dcache_for_umount()), or are done async but flushed and/or waited for
by foofs_kill_sb() before they get to shrink_dcache_for_umount().

IOW, I'd been too paranoid in "Teach shrink_dcache_parent() to cope with
mixed-filesystem shrink lists" - the real requirements are milder; in-tree
users didn't need these games with the parent.  The dcache side of Tobin's
Slab Movable Objects patches did need those, though...

AFAICS, there are 3 options:
	1) leave the current weirdness with ->d_iput() on a child vs __dentry_kill()
on its parent.  Document the requirement for ->d_iput() (and ->d_release()) to cope
with that, promise that in the sillyrename case the ordering will be there, and
write down the proof of that.  No code changes, rather revolting docs to
write, trouble waiting to happen in ->d_iput().
	2) require that shrink_dentry_list() never overlap with
shrink_dcache_for_umount() on any of the filesystems represented in the
shrink list, guarantee that the parent won't get to __dentry_kill() before
the child gets through __dentry_kill() completely, and accept that resurrecting
the SMO stuff will require more work.  Smallish patch, tolerable docs, probably
the best option at the moment.
	3) bite the bullet and get shrink_dentry_list() to coexist with
shrink_dcache_for_umount(), with sane ordering of ->d_iput() vs. parent's
__dentry_kill().  Doable, but AFAICS it will take a counter of children
currently being killed in the parent dentry.  shrink_dentry_list() would
bump that on parent, __dentry_kill() the victim, then relock the parent
and decrement that counter along with the main refcount.  That would allow
the shrink_dcache_for_umount() to cope with that crap.  No requirements
for shrink_dentry_list() callers that way, sane environment for ->d_iput(),
no obstacles for SMO stuff.  OTOH, we need to get space for additional
counter in struct dentry; again, doable (->d_subdirs/->d_child can be
converted to hlist, saving us a pointer in each dentry), but... I'd
leave that option alone until something that needs it would show up
(e.g. if/when Tobin resurrects his patchset).

	My preference would be (2) for the coming cycle + prototype of
a patch doing (3) on top of that for the future.
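
	Purely as an illustration of what (3) would take (the field name is
invented, nothing of that sort exists yet), the shrink_dentry_list() side of
it would be something like

	parent->d_kills++;		// under parent->d_lock
	__dentry_kill(victim);		// drops parent->d_lock
	spin_lock(&parent->d_lock);
	parent->d_kills--;
	if (!--parent->d_lockref.count)
		to_shrink_list(parent, list);
	spin_unlock(&parent->d_lock);

with shrink_dcache_for_umount() treating non-zero ->d_kills as "pinned by
children still going through __dentry_kill()" rather than reporting the
sucker busy.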

Completely untested diff for (2) (on top of #work.dcache, sans the
documentation update) below:

diff --git a/fs/dcache.c b/fs/dcache.c
index ccf41c5ee804..c978207f3fc4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1086,10 +1086,27 @@ void d_prune_aliases(struct inode *inode)
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
+static inline void shrink_kill(struct dentry *victim, struct list_head *list)
+{
+	struct dentry *parent = victim->d_parent;
+
+	__dentry_kill(victim);
+
+	if (parent == victim || lockref_put_or_lock(&parent->d_lockref))
+		return;
+
+	if (!WARN_ON_ONCE(parent->d_lockref.count != 1)) {
+		parent->d_lockref.count = 0;
+		to_shrink_list(parent, list);
+	}
+	spin_unlock(&parent->d_lock);
+}
+
+
 void shrink_dentry_list(struct list_head *list)
 {
 	while (!list_empty(list)) {
-		struct dentry *dentry, *parent;
+		struct dentry *dentry;
 
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
@@ -1106,10 +1123,7 @@ void shrink_dentry_list(struct list_head *list)
 		}
 		rcu_read_unlock();
 		d_shrink_del(dentry);
-		parent = dentry->d_parent;
-		if (parent != dentry && !--parent->d_lockref.count)
-			to_shrink_list(parent, list);
-		__dentry_kill(dentry);
+		shrink_kill(dentry, list);
 	}
 }
 
@@ -1537,19 +1551,14 @@ void shrink_dcache_parent(struct dentry *parent)
 			break;
 		data.victim = NULL;
 		d_walk(parent, &data, select_collect2);
-		if (data.victim) {
-			struct dentry *parent;
+		if (data.victim) { // rcu_read_lock held - see select_collect2()
 			spin_lock(&data.victim->d_lock);
 			if (!lock_for_kill(data.victim)) {
 				spin_unlock(&data.victim->d_lock);
 				rcu_read_unlock();
 			} else {
 				rcu_read_unlock();
-				parent = data.victim->d_parent;
-				if (parent != data.victim &&
-				    !--parent->d_lockref.count)
-					to_shrink_list(parent, &data.dispose);
-				__dentry_kill(data.victim);
+				shrink_kill(data.victim, &data.dispose);
 			}
 		}
 		if (!list_empty(&data.dispose))


* Re: [RFC] simplifying fast_dput(), dentry_kill() et.al.
  2023-11-06  5:53         ` Al Viro
@ 2023-11-07  2:08           ` Al Viro
  2023-11-09  6:19             ` [RFC][PATCHSET v2] " Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-07  2:08 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Tobin C. Harding

On Mon, Nov 06, 2023 at 05:53:53AM +0000, Al Viro wrote:

> AFAICS, there are 3 options:
> 	1) leave the current weirdness with ->d_iput() on child vs __dentry_kill()
> on parent.  Document the requirement to ->d_iput() (and ->d_release()) to cope
> with that, promise that in case of sillyrename the ordering will be there and
> write down the proof of that.  No code changes, rather revolting docs to
> write, trouble waiting to happen in ->d_iput().
> 	2) require that shrink_dentry_list() should never overlap with
> shrink_dcache_for_umount() on any of the filesystems represented in the
> shrink list, guarantee that parent won't get to __dentry_kill() before
> the child gets through __dentry_kill() completely and accept that resurrecting
> SMO stuff will require more work.  Smallish patch, tolerable docs, probably
> the best option at the moment.
> 	3) bite the bullet and get shrink_dentry_list() to coexist with
> shrink_dcache_for_umount(), with sane ordering of ->d_iput() vs. parent's
> __dentry_kill().  Doable, but AFAICS it will take a counter of children
> currently being killed in the parent dentry.  shrink_dentry_list() would
> bump that on parent, __dentry_kill() the victim, then relock the parent
> and decrement that counter along with the main refcount.  That would allow
> the shrink_dcache_for_umount() to cope with that crap.  No requirements
> for shrink_dentry_list() callers that way, sane environment for ->d_iput(),
> no obstacles for SMO stuff.  OTOH, we need to get space for additional
> counter in struct dentry; again, doable (->d_subdirs/->d_child can be
> converted to hlist, saving us a pointer in each dentry), but... I'd
> leave that option alone until something that needs it would show up
> (e.g. if/when Tobin resurrects his patchset).

	4) instead of having __dentry_kill() called with dentry, parent
and inode locked and doing
	->d_prune
	unhash
	remove from list of children
	unlock parent
	detach from inode
	unlock dentry and inode
	drop inode
	->d_release
	relock dentry
	if on shrink list, mark as ready to free 
	unlock dentry
	if was not on shrink list, free it
go for calling it with just dentry and inode locked and do
	->d_prune
	unhash
	detach from inode
	unlock dentry and inode
	drop inode
	->d_release
	lock parent (if any, as usual)
	lock dentry
	remove from list of children
	if on shrink list, mark as ready to free
	unlock dentry
	if was on shrink list, free it
	decrement parent's refcount (again, if there was a parent)
	if refcount is still positive - unlock parent and return NULL
	otherwise return parent

What changes:
	* caller needs milder locking environment; lock_for_kill() gets simpler.
	  Note that only positive dentries can be moved, so inside __dentry_kill()
	  we need no retry loops, etc. - ->d_parent is stable by the point we decide
	  to remove from the list of children.
	* code that iterates through the list of children (not much of it)
	  needs to cope with seeing negative unhashed dentries with
	  refcount marked dead.  Most of it will need no changes at all.
	* ->d_prune() instances are called without parent's ->d_lock; just
	  the victim's one.  Might require changes to out-of-tree filesystems.
	* dput() turns into
	if (!dentry)
		return;
	rcu_read_lock()
	if (fast_dput(dentry)) {
		rcu_read_unlock();
		return;
	}
	while (lock_for_kill(dentry)) { // not bothering with the parent
		rcu_read_unlock();
		dentry = __dentry_kill(dentry);
		if (!dentry)
			return;
		if (retain_dentry(dentry)) {
			spin_unlock(&dentry->d_lock);
			return;
		}
		rcu_read_lock();
	}
	spin_unlock(&dentry->d_lock);
	rcu_read_unlock();
since there's no point trying to avoid locking the parents - we need
to grab those locks at some point anyway, just to remove a child from
the list of children, and that way we return from __dentry_kill() with
that lock held.
	* shrink_dentry_list() eviction of parents happens thus:
	do {
		rcu_read_unlock();
		victim = __dentry_kill(victim);
		rcu_read_lock();
	} while (victim && lock_for_kill(victim));
	rcu_read_unlock();
	if (victim)
		spin_unlock(&victim->d_lock);
	* sane order of ->d_iput() on child vs. __dentry_kill() on parent.
	* shrink_dcache_for_umount() does the right thing even if it
overlaps shrink_dentry_list().

	If that works, it's probably the best variant...


* [RFC][PATCHSET v2] simplifying fast_dput(), dentry_kill() et.al.
  2023-11-07  2:08           ` Al Viro
@ 2023-11-09  6:19             ` Al Viro
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:19 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

	The series below is the fallout of trying to document the dentry
refcounting and life cycle - basically, getting rid of the bits that
had been too subtle and ugly to write them up.

	Results so far:
* -136LoC (-170LoC not counting the additions in D/f/porting ;-)
* considerably simpler locking for __dentry_kill()
* fast_dput() is pretty much "dput() sans killing dentry"; returns true if
we are done, false if dentry needs killing (in which case dentry will
be left locked and refcount is known to be 0) - see the sketch right
after this list.
* retain_dentry() not messing with refcounting - called with refcount 0
and ->d_lock held, returns whether we want the dentry retained in cache.
* rules for shrink lists are much simpler now - to_shrink_list() puts
a locked dentry with zero refcount into a shrink list, no need to guarantee
that filesystem containing that dentry won't get shut down before we get
to eventual shrink_dentry_list() - it would do the right thing.
* ->d_iput() and ->d_release() no longer have weird corner cases when they
could get called with parent already killed.  That happened to be avoided
in the cases where in-kernel instances would bother to work with the parent,
but that used to be very brittle and convoluted.  Now it's "parent is kept
pinned until __dentry_kill() of child is done".
* a bunch of other subtle shit is gone (e.g. the logics in shrink_lock_dentry()
had rather subtle corner cases, with convoluted reasons why they won't break
things - except that in some cases they would, etc.)
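
	A sketch of the resulting contract on the dput() side (not literal
code from the branch; cf. the pseudocode earlier in the thread):

	rcu_read_lock();
	if (fast_dput(dentry)) {	// done: reference dropped, nothing held
		rcu_read_unlock();
		return;
	}
	// ->d_lock is held, refcount is 0, dentry is to be killed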

	This stuff lives in
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #work.dcache2
individual patches are in followups.  Help with reviewing and testing would
be very welcome - it seems to survive the local beating, but it definitely
needs more.

Shortlog:
Al Viro (22):
      struct dentry: get rid of randomize_layout idiocy
      switch nfsd_client_rmdir() to use of simple_recursive_removal()
      coda_flag_children(): cope with dentries turning negative
      dentry: switch the lists of children to hlist
      centralize killing dentry from shrink list
      get rid of __dget()
      shrink_dentry_list(): no need to check that dentry refcount is marked dead
      fast_dput(): having ->d_delete() is not reason to delay refcount decrement
      fast_dput(): handle underflows gracefully
      fast_dput(): new rules for refcount
      __dput_to_list(): do decrement of refcount in the callers
      Make retain_dentry() neutral with respect to refcounting
      __dentry_kill(): get consistent rules for victim's refcount
      dentry_kill(): don't bother with retain_dentry() on slow path
      Call retain_dentry() with refcount 0
      fold the call of retain_dentry() into fast_dput()
      don't try to cut corners in shrink_lock_dentry()
      fold dentry_kill() into dput()
      to_shrink_list(): call only if refcount is 0
      switch select_collect{,2}() to use of to_shrink_list()
      d_prune_aliases(): use a shrink list
      __dentry_kill(): new locking scheme

Diffstat:
 Documentation/filesystems/porting.rst     |  34 +++
 arch/powerpc/platforms/cell/spufs/inode.c |   5 +-
 fs/afs/dynroot.c                          |   5 +-
 fs/autofs/expire.c                        |   7 +-
 fs/ceph/dir.c                             |   2 +-
 fs/ceph/mds_client.c                      |   2 +-
 fs/coda/cache.c                           |   9 +-
 fs/dcache.c                               | 409 ++++++++++--------------------
 fs/libfs.c                                |  45 ++--
 fs/nfsd/nfsctl.c                          |  70 +----
 fs/notify/fsnotify.c                      |   2 +-
 fs/tracefs/inode.c                        |  34 +--
 include/linux/dcache.h                    |  22 +-
 13 files changed, 255 insertions(+), 391 deletions(-)

Patch description follows:

	Part 1 - preparations

01/22) struct dentry: get rid of randomize_layout idiocy.
	This is beyond ridiculous.  There is a reason why that thing
is cacheline-aligned...

02/22) nfsd_client_rmdir() and its guts open-code simple_recursive_removal();
converting to calling that cleans the things up in there *and* reduces
the amount of places where we touch the list of children, which simplifies
the work later in the series.

03/22) more fun caught while looking at the places that go through the
lists of children: coda_flag_children() assumes that ->d_lock on parent
is enough to prevent children going negative.  Ain't so...

04/22) switch the lists of children to hlist.  We never bother with
accessing the list tail and using hlist saves us a pointer per each
dentry.  Besides, it ends up more readable that way.  Fields used to hold
the lists got renamed - d_children/d_sib instead of d_subdirs/d_child.
Yes, any out-of-tree code that works with the lists of children gets
loudly broken; not hard to fix.

05/22) centralize killing dentry from shrink list
There are identical pieces of code in shrink_dentry_list() and
shrink_dcache_for_umount(); they would require identical massage through
the series below, unifying them into an inlined helper reduces the amount
of noise.

06/22) get rid of __dget()
fold into the sole remaining caller

07/22) shrink_dentry_list(): no need to check that dentry refcount is
marked dead.  We won't see DCACHE_MAY_FREE on anything that is *not*
dead and checking d_flags is just as cheap as checking refcount.

	Part 2 - massage of dput() and friends

08/22) fast_dput(): having ->d_delete() is not reason to delay refcount
decrement.
	->d_delete() is a way for filesystem to tell that dentry is not
worth keeping cached.  It is not guaranteed to be called every time a dentry
has refcount drop down to zero; it is not guaranteed to be called before
dentry gets evicted.  In other words, it is not suitable for any kind
of keeping track of dentry state.
	None of the in-tree filesystems attempt to use it that way,
fortunately.
	So the contortions done by fast_dput() (as well as dentry_kill())
are not warranted.  fast_dput() certainly should treat having ->d_delete()
instance as "can't assume we'll be keeping it", but that's not different
from the way we treat e.g. DCACHE_DONTCACHE (which is rather similar
to making ->d_delete() return true when called).
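
	What goes away is the special-casing at the top of fast_dput(),
roughly (a sketch, not the literal patch):

	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))	// this check
		return lockref_put_or_lock(&dentry->d_lockref);	// goes away

leaving fast_dput() to always try the optimistic

	ret = lockref_put_return(&dentry->d_lockref);

and to look at DCACHE_OP_DELETE only when deciding whether the dentry is
worth retaining.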

09/22) fast_dput(): handle underflows gracefully.
	If refcount is less than 1, we should just warn, unlock
dentry and return true, so that the caller doesn't try to do anything
else.
	Taking care of that leaves the rest of "lockref_put_return() has
failed" case equivalent to "decrement refcount and rejoin the normal
slow path after the point where we grab ->d_lock".
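
	The shape of it (a sketch, not the literal patch):

	ret = lockref_put_return(&dentry->d_lockref);
	if (unlikely(ret < 0)) {	// cmpxchg games didn't work out
		spin_lock(&dentry->d_lock);
		if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
			spin_unlock(&dentry->d_lock);
			return true;	// nothing else for the caller to do
		}
		dentry->d_lockref.count--;	// rejoin the normal slow path
	}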

10/22) fast_dput(): new rules for refcount.
	Currently the "need caller to do more work" path in fast_dput()
has refcount decremented, then, with ->d_lock held and refcount verified
to have reached 0 fast_dput() forcibly resets the refcount to 1.
	Move that resetting refcount to 1 into the callers; later in
the series it will be massaged out of existence.

11/22) __dput_to_list(): do decrement of refcount in the callers
... and rename it to to_shrink_list(), seeing that it no longer
drops any references.
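
	After that shrink_kill() ends up doing, roughly (a sketch, not the
literal patch):

	struct dentry *parent = victim->d_parent;

	if (parent != victim && !--parent->d_lockref.count)
		to_shrink_list(parent, list);
	__dentry_kill(victim);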

12/22) make retain_dentry() neutral with respect to refcounting.
It used to decrement refcount if and only if it returned true.
Lift those decrements into the callers.

13/22) __dentry_kill(): get consistent rules for victim's refcount
	Currently we call it with refcount equal to 1 when called from
dentry_kill(); all other callers have it equal to 0.
	Make it always be called with zero refcount; on this step we just
decrement it before the calls in dentry_kill().  That is safe, since
all places that care about the value of refcount either do that under
->d_lock or hold a reference to dentry in question.  Either is sufficient
to prevent observing a dentry immediately prior to __dentry_kill()
getting called from dentry_kill().
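
	Roughly, on this step dentry_kill() does, with ->d_lock held and
refcount known to be 1,

	dentry->d_lockref.count--;	// 1 -> 0
	__dentry_kill(dentry);		// now always entered with refcount 0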

14/22) dentry_kill(): don't bother with retain_dentry() on the slow path
	We have already checked it and dentry used to look not worthy
of keeping.  The only hard obstacle to evicting dentry is non-zero
refcount; everything else is advisory - e.g. memory pressure could evict
any dentry found with refcount zero.  On the slow path in dentry_kill()
we had dropped and regained ->d_lock; we must recheck the refcount,
but everything else is not worth bothering with.
	Note that filesystem can not count upon ->d_delete() being
called for dentry - not even once.  Again, memory pressure (as well as
d_prune_aliases(), or attempted rmdir() of ancestor, or...) will not
call ->d_delete() at all.
	So from the correctness point of view we are fine doing the
check only once.  And it makes things simpler down the road.
	The doctor said "To the morgue", so to the morgue it is!

15/22) Call retain_dentry() with refcount 0.
	Instead of bumping it from 0 to 1, calling retain_dentry(),
then decrementing it back to 0 (with ->d_lock held all the way through),
just leave refcount at 0 through all of that.
	It will have a visible effect for ->d_delete() - now it can
be called with refcount 0 instead of 1 and it can no longer play silly
buggers with dropping/regaining ->d_lock.  Not that any in-tree instances
tried to (it's pretty hard to get right).
	Any out-of-tree ones will have to adjust (assuming they need
any changes).
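
	For a ->d_delete() instance that boils down to (hypothetical example,
not an existing one):

static int foofs_d_delete(const struct dentry *dentry)
{
	// may now be called with refcount 0 (used to be 1); must not play
	// with ->d_lockref or drop/regain ->d_lock in here
	return 1;	// don't bother keeping it cached
}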

16/22) fold the call of retain_dentry() into fast_dput()
	Calls of retain_dentry() happen immediately after getting false
from fast_dput(), and getting true from retain_dentry() is treated the
same way as a non-zero refcount would be treated by fast_dput() - unlock
the dentry and bugger off.

17/22) don't try to cut corners in shrink_lock_dentry().
	That is to say, do *not* treat the ->d_inode or ->d_parent
changes as "it's hard, return false; somebody must have grabbed it,
so even if it has zero refcount, we don't need to bother killing it -
final dput() from whoever grabbed it would've done everything".
	First of all, that is not guaranteed.  It might have been dropped
by shrink_kill() handling of victim's parent, which would've found it
already on a shrink list (ours) and decided that they don't need to put
it on their shrink list.
	What's more, dentry_kill() is doing pretty much the same thing,
cutting its own set of corners (it assumes that dentry can't go from
positive to negative, so its inode can change but only once and only in
one direction).
	Doing that right allows us to get rid of that not-quite-duplication
and removes the only reason for re-incrementing refcount before the call
of dentry_kill().
	Replacement is called lock_for_kill(); called under rcu_read_lock
and with ->d_lock held.  If it returns false, dentry has non-zero refcount
and the same locks are held.  If it returns true, dentry has zero refcount
and all locks required by __dentry_kill() are taken.
	Part of __lock_parent() had been lifted into lock_parent() to
allow its reuse.  Now it's called with rcu_read_lock already held and
dentry already unlocked.
	Note that this is not the final change - locking requirements
for __dentry_kill() are going to change later in the series and the set
of locks taken by lock_for_kill() will be adjusted.
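
	A sketch of the resulting calling conventions (not the literal code):

	rcu_read_lock();
	spin_lock(&victim->d_lock);
	if (lock_for_kill(victim)) {
		// refcount is 0, every lock __dentry_kill() needs is held
		rcu_read_unlock();
		__dentry_kill(victim);
	} else {
		// refcount is non-zero, only ->d_lock is held
		spin_unlock(&victim->d_lock);
		rcu_read_unlock();
	}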

18/22) fold dentry_kill() into dput().
	Not worth keeping separate.

19/22) to_shrink_list(): call only if refcount is 0
	The only thing it does if refcount is not zero is d_lru_del();
no point, IMO, seeing that plain dput() does nothing of that sort...
Note that 2 of 3 current callers are guaranteed that refcount is 0.

20/22) switch select_collect{,2}() to use of to_shrink_list()
Same note about d_lru_del() as in (19/22).

21/22) d_prune_aliases(): use a shrink list
	Instead of dropping aliases one by one, restarting, etc., just
collect them into a shrink list and kill them off in one pass.
	We don't really need the restarts - one alias can't pin another
(directory has only one alias, and couldn't be its own ancestor anyway),
so collecting everything that is not busy and taking it out would
take care of everything evictable that had been there as we entered
the function.  And new aliases added while we'd been dropping old ones
could just as easily have appeared right as we return to caller...
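
	Rough shape (a sketch, not the literal patch; per the rules above the
aliases go on the shrink list locked and with refcount 0):

	LIST_HEAD(dispose);

	spin_lock(&inode->i_lock);
	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
		spin_lock(&dentry->d_lock);
		if (!dentry->d_lockref.count)
			to_shrink_list(dentry, &dispose);
		spin_unlock(&dentry->d_lock);
	}
	spin_unlock(&inode->i_lock);
	shrink_dentry_list(&dispose);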

22/22) __dentry_kill(): new locking scheme
	Currently we enter __dentry_kill() with parent (along with the
victim dentry and victim's inode) held locked.	Then we
	mark dentry refcount as dead
	call ->d_prune()
	remove dentry from hash
	remove it from the parent's list of children
	unlock the parent, don't need it from that point on
	detach dentry from inode,
	unlock dentry and drop the inode (via ->d_iput())
	call ->d_release()
	regain the lock on dentry
	check if it's on a shrink list (in which case freeing its empty
	  husk has to be left to shrink_dentry_list()) or not (in which
	  case we can free it ourselves).  In the former case, mark it
	  as an empty husk, so that shrink_dentry_list() would know it
	  can free the sucker.
	drop the lock on dentry
... and usually the caller proceeds to drop a reference on the parent,
possibly retaking the lock on it.
	That is painful for a bunch of reasons, starting with the need
to take locks out of order, but not limited to that - the parent of
positive dentry can change if we drop its ->d_lock, so getting these
locks has to be done with care.
	Moreover, as soon as dentry is out of the parent's list of
children, shrink_dcache_for_umount() won't see it anymore, making it
appear as if the parent is inexplicably busy.  We do work around that
by having shrink_dentry_list() decrement the parent's refcount first and
put it on shrink list to be evicted once we are done with __dentry_kill()
of child, but that may in some cases lead to ->d_iput() on child called
after the parent got killed.  That doesn't happen in cases where in-tree
->d_iput() instances might want to look at the parent, but that's brittle
as hell.
	Solution: do removal from the parent's list of children in the
very end of __dentry_kill().  As the result, the callers do not need to
lock the parent and by the time we really need the parent locked, dentry
is negative and is guaranteed not to be moved around.
	It does mean that ->d_prune() will be called with parent not
locked.  It also means that we might see dentries in process of being torn
down while going through the parent's list of children; those dentries
will be unhashed, negative and with refcount marked dead.  In practice,
that's enough for in-tree code that looks through the list of children
to do the right thing as-is.  Out-of-tree code might need to be adjusted.
	Calling conventions: __dentry_kill(dentry) is called with
dentry->d_lock held, along with ->i_lock of its inode (if any).
It either returns the parent (locked, with refcount decremented to 0)
or NULL (if there'd been no parent or if refcount decrement for parent
hadn't reached 0).
	lock_for_kill() is adjusted for new requirements - it doesn't
touch the parent's ->d_lock at all.
	Callers adjusted.  Note that for dput() we don't need to
bother with fast_dput() for the parent - we just need to check
retain_dentry() for it, since its ->d_lock is still held since the
moment when __dentry_kill() had taken it to remove the victim from the
list of children.
	The kludge with early decrement of parent's refcount in
shrink_dentry_list() is no longer needed - shrink_dcache_for_umount()
sees the half-killed dentries in the list of children for as long as
they are pinning the parent.  They are easily recognized and accounted
for by select_collect(), so we know we are not done yet.
	As the result, we always have the expected ordering
for ->d_iput()/->d_release() vs. __dentry_kill() of the parent, no
exceptions.  Moreover, the current rules for shrink lists (one must make
sure that shrink_dcache_for_umount() won't happen while any dentries
from the superblock in question are on any shrink lists) are gone -
shrink_dcache_for_umount() will do the right thing in all cases, taking
such dentries out.  Their empty husks (memory occupied by struct dentry
itself + its external name, if any) will remain on the shrink lists,
but they are no obstacles to filesystem shutdown.  And such husks will
get freed as soon as shrink_dentry_list() of the list they are on gets
to them.
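
	A sketch of the calling conventions spelled out above:

	// on entry: victim->d_lock held, plus ->i_lock of its inode (if any)
	parent = __dentry_kill(victim);
	if (parent) {
		// parent->d_lock is held here, its refcount already at 0
		if (retain_dentry(parent))
			spin_unlock(&parent->d_lock);
		// otherwise the caller goes on to kill the parent as well
	}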


* [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy
  2023-11-09  6:19             ` [RFC][PATCHSET v2] " Al Viro
@ 2023-11-09  6:20               ` Al Viro
  2023-11-09  6:20                 ` [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal() Al Viro
                                   ` (21 more replies)
  0 siblings, 22 replies; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

This is beyond ridiculous.  There is a reason why that thing is
cacheline-aligned...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6b351e009f59..8b4ad3c3bba0 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -111,7 +111,7 @@ struct dentry {
 		struct hlist_bl_node d_in_lookup_hash;	/* only for in-lookup ones */
 	 	struct rcu_head d_rcu;
 	} d_u;
-} __randomize_layout;
+};
 
 /*
  * dentry->d_lock spinlock nesting subclasses:
-- 
2.39.2



* [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal()
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 13:42                   ` Christian Brauner
  2023-11-09 14:01                   ` Chuck Lever
  2023-11-09  6:20                 ` [PATCH 03/22] coda_flag_children(): cope with dentries turning negative Al Viro
                                   ` (20 subsequent siblings)
  21 siblings, 2 replies; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfsctl.c | 70 ++++++++++--------------------------------------
 1 file changed, 14 insertions(+), 56 deletions(-)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7ed02fb88a36..035b42c1a181 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1235,63 +1235,34 @@ static inline void _nfsd_symlink(struct dentry *parent, const char *name,
 
 #endif
 
-static void clear_ncl(struct inode *inode)
+static void clear_ncl(struct dentry *dentry)
 {
+	struct inode *inode = d_inode(dentry);
 	struct nfsdfs_client *ncl = inode->i_private;
 
+	spin_lock(&inode->i_lock);
 	inode->i_private = NULL;
+	spin_unlock(&inode->i_lock);
 	kref_put(&ncl->cl_ref, ncl->cl_release);
 }
 
-static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode)
-{
-	struct nfsdfs_client *nc = inode->i_private;
-
-	if (nc)
-		kref_get(&nc->cl_ref);
-	return nc;
-}
-
 struct nfsdfs_client *get_nfsdfs_client(struct inode *inode)
 {
 	struct nfsdfs_client *nc;
 
-	inode_lock_shared(inode);
-	nc = __get_nfsdfs_client(inode);
-	inode_unlock_shared(inode);
+	spin_lock(&inode->i_lock);
+	nc = inode->i_private;
+	if (nc)
+		kref_get(&nc->cl_ref);
+	spin_unlock(&inode->i_lock);
 	return nc;
 }
-/* from __rpc_unlink */
-static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry)
-{
-	int ret;
-
-	clear_ncl(d_inode(dentry));
-	dget(dentry);
-	ret = simple_unlink(dir, dentry);
-	d_drop(dentry);
-	fsnotify_unlink(dir, dentry);
-	dput(dentry);
-	WARN_ON_ONCE(ret);
-}
-
-static void nfsdfs_remove_files(struct dentry *root)
-{
-	struct dentry *dentry, *tmp;
-
-	list_for_each_entry_safe(dentry, tmp, &root->d_subdirs, d_child) {
-		if (!simple_positive(dentry)) {
-			WARN_ON_ONCE(1); /* I think this can't happen? */
-			continue;
-		}
-		nfsdfs_remove_file(d_inode(root), dentry);
-	}
-}
 
 /* XXX: cut'n'paste from simple_fill_super; figure out if we could share
  * code instead. */
 static  int nfsdfs_create_files(struct dentry *root,
 				const struct tree_descr *files,
+				struct nfsdfs_client *ncl,
 				struct dentry **fdentries)
 {
 	struct inode *dir = d_inode(root);
@@ -1310,8 +1281,9 @@ static  int nfsdfs_create_files(struct dentry *root,
 			dput(dentry);
 			goto out;
 		}
+		kref_get(&ncl->cl_ref);
 		inode->i_fop = files->ops;
-		inode->i_private = __get_nfsdfs_client(dir);
+		inode->i_private = ncl;
 		d_add(dentry, inode);
 		fsnotify_create(dir, dentry);
 		if (fdentries)
@@ -1320,7 +1292,6 @@ static  int nfsdfs_create_files(struct dentry *root,
 	inode_unlock(dir);
 	return 0;
 out:
-	nfsdfs_remove_files(root);
 	inode_unlock(dir);
 	return -ENOMEM;
 }
@@ -1340,7 +1311,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
 	dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name);
 	if (IS_ERR(dentry)) /* XXX: tossing errors? */
 		return NULL;
-	ret = nfsdfs_create_files(dentry, files, fdentries);
+	ret = nfsdfs_create_files(dentry, files, ncl, fdentries);
 	if (ret) {
 		nfsd_client_rmdir(dentry);
 		return NULL;
@@ -1351,20 +1322,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
 /* Taken from __rpc_rmdir: */
 void nfsd_client_rmdir(struct dentry *dentry)
 {
-	struct inode *dir = d_inode(dentry->d_parent);
-	struct inode *inode = d_inode(dentry);
-	int ret;
-
-	inode_lock(dir);
-	nfsdfs_remove_files(dentry);
-	clear_ncl(inode);
-	dget(dentry);
-	ret = simple_rmdir(dir, dentry);
-	WARN_ON_ONCE(ret);
-	d_drop(dentry);
-	fsnotify_rmdir(dir, dentry);
-	dput(dentry);
-	inode_unlock(dir);
+	simple_recursive_removal(dentry, clear_ncl);
 }
 
 static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
-- 
2.39.2



* [PATCH 03/22] coda_flag_children(): cope with dentries turning negative
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
  2023-11-09  6:20                 ` [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal() Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 13:43                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 04/22] dentry: switch the lists of children to hlist Al Viro
                                   ` (19 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

->d_lock on parent does not stabilize ->d_inode of child.
We don't do much with that inode in there, but we need
at least to avoid struct inode getting freed under us...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coda/cache.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 3b8c4513118f..bfbc03c6b632 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -92,13 +92,16 @@ static void coda_flag_children(struct dentry *parent, int flag)
 {
 	struct dentry *de;
 
+	rcu_read_lock();
 	spin_lock(&parent->d_lock);
 	list_for_each_entry(de, &parent->d_subdirs, d_child) {
+		struct inode *inode = d_inode_rcu(de);
 		/* don't know what to do with negative dentries */
-		if (d_inode(de) ) 
-			coda_flag_inode(d_inode(de), flag);
+		if (inode)
+			coda_flag_inode(inode, flag);
 	}
 	spin_unlock(&parent->d_lock);
+	rcu_read_unlock();
 	return; 
 }
 
-- 
2.39.2



* [PATCH 04/22] dentry: switch the lists of children to hlist
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
  2023-11-09  6:20                 ` [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal() Al Viro
  2023-11-09  6:20                 ` [PATCH 03/22] coda_flag_children(): cope with dentries turning negative Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 13:48                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 05/22] centralize killing dentry from shrink list Al Viro
                                   ` (18 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Saves a pointer per struct dentry and actually makes the things less
clumsy.  Cleaned the d_walk() and dcache_readdir() a bit by use
of hlist_for_... iterators.

A couple of new helpers - d_first_child() and d_next_sibling(),
to make the expressions less awful.

X-fuck-kABI: gladly
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst     |  9 +++
 arch/powerpc/platforms/cell/spufs/inode.c |  5 +-
 fs/afs/dynroot.c                          |  5 +-
 fs/autofs/expire.c                        |  7 +--
 fs/ceph/dir.c                             |  2 +-
 fs/ceph/mds_client.c                      |  2 +-
 fs/coda/cache.c                           |  2 +-
 fs/dcache.c                               | 76 +++++++++++------------
 fs/libfs.c                                | 45 +++++++-------
 fs/notify/fsnotify.c                      |  2 +-
 fs/tracefs/inode.c                        | 34 +++++-----
 include/linux/dcache.h                    | 20 ++++--
 12 files changed, 108 insertions(+), 101 deletions(-)

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 4d05b9862451..58627f0baf3e 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1045,3 +1045,12 @@ filesystem type is now moved to a later point when the devices are closed:
 As this is a VFS level change it has no practical consequences for filesystems
 other than that all of them must use one of the provided kill_litter_super(),
 kill_anon_super(), or kill_block_super() helpers.
+
+---
+
+**mandatory**
+
+The list of children anchored in parent dentry got turned into hlist now.
+Field names got changed (->d_children/->d_sib instead of ->d_subdirs/->d_child
+for anchor/entries resp.), so any affected places will be immediately caught
+by compiler.
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 38c5be34c895..71019bfa0eb7 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -145,10 +145,11 @@ spufs_evict_inode(struct inode *inode)
 
 static void spufs_prune_dir(struct dentry *dir)
 {
-	struct dentry *dentry, *tmp;
+	struct dentry *dentry;
+	struct hlist_node *n;
 
 	inode_lock(d_inode(dir));
-	list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) {
+	hlist_for_each_entry_safe(dentry, n, &dir->d_children, d_sib) {
 		spin_lock(&dentry->d_lock);
 		if (simple_positive(dentry)) {
 			dget_dlock(dentry);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 95bcbd7654d1..fb510c3197e4 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -370,7 +370,7 @@ int afs_dynroot_populate(struct super_block *sb)
 void afs_dynroot_depopulate(struct super_block *sb)
 {
 	struct afs_net *net = afs_sb2net(sb);
-	struct dentry *root = sb->s_root, *subdir, *tmp;
+	struct dentry *root = sb->s_root, *subdir;
 
 	/* Prevent more subdirs from being created */
 	mutex_lock(&net->proc_cells_lock);
@@ -379,10 +379,11 @@ void afs_dynroot_depopulate(struct super_block *sb)
 	mutex_unlock(&net->proc_cells_lock);
 
 	if (root) {
+		struct hlist_node *n;
 		inode_lock(root->d_inode);
 
 		/* Remove all the pins for dirs created for manually added cells */
-		list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) {
+		hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) {
 			if (subdir->d_fsdata) {
 				subdir->d_fsdata = NULL;
 				dput(subdir);
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 038b3d2d9f57..39d8c84c16f4 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -73,12 +73,9 @@ static int autofs_mount_busy(struct vfsmount *mnt,
 /* p->d_lock held */
 static struct dentry *positive_after(struct dentry *p, struct dentry *child)
 {
-	if (child)
-		child = list_next_entry(child, d_child);
-	else
-		child = list_first_entry(&p->d_subdirs, struct dentry, d_child);
+	child = child ? d_next_sibling(child) : d_first_child(p);
 
-	list_for_each_entry_from(child, &p->d_subdirs, d_child) {
+	hlist_for_each_entry_from(child, d_sib) {
 		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
 		if (simple_positive(child)) {
 			dget_dlock(child);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 854cbdd66661..87884a578ec9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -171,7 +171,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
 /*
  * When possible, we try to satisfy a readdir by peeking at the
  * dcache.  We make this work by carefully ordering dentries on
- * d_child when we initially get results back from the MDS, and
+ * d_children when we initially get results back from the MDS, and
  * falling back to a "normal" sync readdir if any dentries in the dir
  * are dropped.
  *
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 293b93182955..a566b4b029b9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2100,7 +2100,7 @@ static bool drop_negative_children(struct dentry *dentry)
 		goto out;
 
 	spin_lock(&dentry->d_lock);
-	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+	hlist_for_each_entry(child, &dentry->d_children, d_sib) {
 		if (d_really_is_positive(child)) {
 			all_negative = false;
 			break;
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index bfbc03c6b632..f5b71a35f9db 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -94,7 +94,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
 
 	rcu_read_lock();
 	spin_lock(&parent->d_lock);
-	list_for_each_entry(de, &parent->d_subdirs, d_child) {
+	hlist_for_each_entry(de, &parent->d_children, d_sib) {
 		struct inode *inode = d_inode_rcu(de);
 		/* don't know what to do with negative dentries */
 		if (inode)
diff --git a/fs/dcache.c b/fs/dcache.c
index 25ac74d30bff..1b8ec1a9bf1c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -51,8 +51,8 @@
  *   - d_lru
  *   - d_count
  *   - d_unhashed()
- *   - d_parent and d_subdirs
- *   - childrens' d_child and d_parent
+ *   - d_parent and d_children
+ *   - childrens' d_sib and d_parent
  *   - d_u.d_alias, d_inode
  *
  * Ordering:
@@ -537,7 +537,7 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
-static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
+static inline void dentry_unlist(struct dentry *dentry)
 {
 	struct dentry *next;
 	/*
@@ -545,12 +545,12 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
 	 * attached to the dentry tree
 	 */
 	dentry->d_flags |= DCACHE_DENTRY_KILLED;
-	if (unlikely(list_empty(&dentry->d_child)))
+	if (unlikely(hlist_unhashed(&dentry->d_sib)))
 		return;
-	__list_del_entry(&dentry->d_child);
+	__hlist_del(&dentry->d_sib);
 	/*
 	 * Cursors can move around the list of children.  While we'd been
-	 * a normal list member, it didn't matter - ->d_child.next would've
+	 * a normal list member, it didn't matter - ->d_sib.next would've
 	 * been updated.  However, from now on it won't be and for the
 	 * things like d_walk() it might end up with a nasty surprise.
 	 * Normally d_walk() doesn't care about cursors moving around -
@@ -558,20 +558,20 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
 	 * of its own, we get through it without ever unlocking the parent.
 	 * There is one exception, though - if we ascend from a child that
 	 * gets killed as soon as we unlock it, the next sibling is found
-	 * using the value left in its ->d_child.next.  And if _that_
+	 * using the value left in its ->d_sib.next.  And if _that_
 	 * pointed to a cursor, and cursor got moved (e.g. by lseek())
 	 * before d_walk() regains parent->d_lock, we'll end up skipping
 	 * everything the cursor had been moved past.
 	 *
-	 * Solution: make sure that the pointer left behind in ->d_child.next
+	 * Solution: make sure that the pointer left behind in ->d_sib.next
 	 * points to something that won't be moving around.  I.e. skip the
 	 * cursors.
 	 */
-	while (dentry->d_child.next != &parent->d_subdirs) {
-		next = list_entry(dentry->d_child.next, struct dentry, d_child);
+	while (dentry->d_sib.next) {
+		next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib);
 		if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
 			break;
-		dentry->d_child.next = next->d_child.next;
+		dentry->d_sib.next = next->d_sib.next;
 	}
 }
 
@@ -600,7 +600,7 @@ static void __dentry_kill(struct dentry *dentry)
 	}
 	/* if it was on the hash then remove it */
 	__d_drop(dentry);
-	dentry_unlist(dentry, parent);
+	dentry_unlist(dentry);
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	if (dentry->d_inode)
@@ -1348,8 +1348,7 @@ enum d_walk_ret {
 static void d_walk(struct dentry *parent, void *data,
 		   enum d_walk_ret (*enter)(void *, struct dentry *))
 {
-	struct dentry *this_parent;
-	struct list_head *next;
+	struct dentry *this_parent, *dentry;
 	unsigned seq = 0;
 	enum d_walk_ret ret;
 	bool retry = true;
@@ -1371,13 +1370,9 @@ static void d_walk(struct dentry *parent, void *data,
 		break;
 	}
 repeat:
-	next = this_parent->d_subdirs.next;
+	dentry = d_first_child(this_parent);
 resume:
-	while (next != &this_parent->d_subdirs) {
-		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
-		next = tmp->next;
-
+	hlist_for_each_entry_from(dentry, d_sib) {
 		if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
 			continue;
 
@@ -1398,7 +1393,7 @@ static void d_walk(struct dentry *parent, void *data,
 			continue;
 		}
 
-		if (!list_empty(&dentry->d_subdirs)) {
+		if (!hlist_empty(&dentry->d_children)) {
 			spin_unlock(&this_parent->d_lock);
 			spin_release(&dentry->d_lock.dep_map, _RET_IP_);
 			this_parent = dentry;
@@ -1413,24 +1408,23 @@ static void d_walk(struct dentry *parent, void *data,
 	rcu_read_lock();
 ascend:
 	if (this_parent != parent) {
-		struct dentry *child = this_parent;
-		this_parent = child->d_parent;
+		dentry = this_parent;
+		this_parent = dentry->d_parent;
 
-		spin_unlock(&child->d_lock);
+		spin_unlock(&dentry->d_lock);
 		spin_lock(&this_parent->d_lock);
 
 		/* might go back up the wrong parent if we have had a rename. */
 		if (need_seqretry(&rename_lock, seq))
 			goto rename_retry;
 		/* go into the first sibling still alive */
-		do {
-			next = child->d_child.next;
-			if (next == &this_parent->d_subdirs)
-				goto ascend;
-			child = list_entry(next, struct dentry, d_child);
-		} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
-		rcu_read_unlock();
-		goto resume;
+		hlist_for_each_entry_continue(dentry, d_sib) {
+			if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
+				rcu_read_unlock();
+				goto resume;
+			}
+		}
+		goto ascend;
 	}
 	if (need_seqretry(&rename_lock, seq))
 		goto rename_retry;
@@ -1530,7 +1524,7 @@ int d_set_mounted(struct dentry *dentry)
  * Search the dentry child list of the specified parent,
  * and move any unused dentries to the end of the unused
  * list for prune_dcache(). We descend to the next level
- * whenever the d_subdirs list is non-empty and continue
+ * whenever the d_children list is non-empty and continue
  * searching.
  *
  * It returns zero iff there are no unused children,
@@ -1657,7 +1651,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
 {
 	/* it has busy descendents; complain about those instead */
-	if (!list_empty(&dentry->d_subdirs))
+	if (!hlist_empty(&dentry->d_children))
 		return D_WALK_CONTINUE;
 
 	/* root with refcount 1 is fine */
@@ -1814,9 +1808,9 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	dentry->d_fsdata = NULL;
 	INIT_HLIST_BL_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
-	INIT_LIST_HEAD(&dentry->d_subdirs);
+	INIT_HLIST_HEAD(&dentry->d_children);
 	INIT_HLIST_NODE(&dentry->d_u.d_alias);
-	INIT_LIST_HEAD(&dentry->d_child);
+	INIT_HLIST_NODE(&dentry->d_sib);
 	d_set_d_op(dentry, dentry->d_sb->s_d_op);
 
 	if (dentry->d_op && dentry->d_op->d_init) {
@@ -1855,7 +1849,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	 */
 	__dget_dlock(parent);
 	dentry->d_parent = parent;
-	list_add(&dentry->d_child, &parent->d_subdirs);
+	hlist_add_head(&dentry->d_sib, &parent->d_children);
 	spin_unlock(&parent->d_lock);
 
 	return dentry;
@@ -2993,11 +2987,15 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	} else {
 		target->d_parent = old_parent;
 		swap_names(dentry, target);
-		list_move(&target->d_child, &target->d_parent->d_subdirs);
+		if (!hlist_unhashed(&target->d_sib))
+			__hlist_del(&target->d_sib);
+		hlist_add_head(&target->d_sib, &target->d_parent->d_children);
 		__d_rehash(target);
 		fsnotify_update_flags(target);
 	}
-	list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
+	if (!hlist_unhashed(&dentry->d_sib))
+		__hlist_del(&dentry->d_sib);
+	hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children);
 	__d_rehash(dentry);
 	fsnotify_update_flags(dentry);
 	fscrypt_handle_d_move(dentry);
diff --git a/fs/libfs.c b/fs/libfs.c
index 37f2d34ee090..1a7d30f867f3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -101,15 +101,16 @@ EXPORT_SYMBOL(dcache_dir_close);
  * If no such element exists, NULL is returned.
  */
 static struct dentry *scan_positives(struct dentry *cursor,
-					struct list_head *p,
+					struct hlist_node **p,
 					loff_t count,
 					struct dentry *last)
 {
 	struct dentry *dentry = cursor->d_parent, *found = NULL;
 
 	spin_lock(&dentry->d_lock);
-	while ((p = p->next) != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(p, struct dentry, d_child);
+	while (*p) {
+		struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
+		p = &d->d_sib.next;
 		// we must at least skip cursors, to avoid livelocks
 		if (d->d_flags & DCACHE_DENTRY_CURSOR)
 			continue;
@@ -123,8 +124,10 @@ static struct dentry *scan_positives(struct dentry *cursor,
 			count = 1;
 		}
 		if (need_resched()) {
-			list_move(&cursor->d_child, p);
-			p = &cursor->d_child;
+			if (!hlist_unhashed(&cursor->d_sib))
+				__hlist_del(&cursor->d_sib);
+			hlist_add_behind(&cursor->d_sib, &d->d_sib);
+			p = &cursor->d_sib.next;
 			spin_unlock(&dentry->d_lock);
 			cond_resched();
 			spin_lock(&dentry->d_lock);
@@ -156,13 +159,12 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 		inode_lock_shared(dentry->d_inode);
 
 		if (offset > 2)
-			to = scan_positives(cursor, &dentry->d_subdirs,
+			to = scan_positives(cursor, &dentry->d_children.first,
 					    offset - 2, NULL);
 		spin_lock(&dentry->d_lock);
+		hlist_del_init(&cursor->d_sib);
 		if (to)
-			list_move(&cursor->d_child, &to->d_child);
-		else
-			list_del_init(&cursor->d_child);
+			hlist_add_behind(&cursor->d_sib, &to->d_sib);
 		spin_unlock(&dentry->d_lock);
 		dput(to);
 
@@ -184,19 +186,16 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct dentry *cursor = file->private_data;
-	struct list_head *anchor = &dentry->d_subdirs;
 	struct dentry *next = NULL;
-	struct list_head *p;
+	struct hlist_node **p;
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 
 	if (ctx->pos == 2)
-		p = anchor;
-	else if (!list_empty(&cursor->d_child))
-		p = &cursor->d_child;
+		p = &dentry->d_children.first;
 	else
-		return 0;
+		p = &cursor->d_sib.next;
 
 	while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
 		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
@@ -204,13 +203,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
 			      fs_umode_to_dtype(d_inode(next)->i_mode)))
 			break;
 		ctx->pos++;
-		p = &next->d_child;
+		p = &next->d_sib.next;
 	}
 	spin_lock(&dentry->d_lock);
+	hlist_del_init(&cursor->d_sib);
 	if (next)
-		list_move_tail(&cursor->d_child, &next->d_child);
-	else
-		list_del_init(&cursor->d_child);
+		hlist_add_before(&cursor->d_sib, &next->d_sib);
 	spin_unlock(&dentry->d_lock);
 	dput(next);
 
@@ -489,12 +487,11 @@ const struct file_operations simple_offset_dir_operations = {
 
 static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
 {
-	struct dentry *child = NULL;
-	struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;
+	struct dentry *child = NULL, *d;
 
 	spin_lock(&parent->d_lock);
-	while ((p = p->next) != &parent->d_subdirs) {
-		struct dentry *d = container_of(p, struct dentry, d_child);
+	d = prev ? d_next_sibling(prev) : d_first_child(parent);
+	hlist_for_each_entry_from(d, d_sib) {
 		if (simple_positive(d)) {
 			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
 			if (simple_positive(d))
@@ -654,7 +651,7 @@ int simple_empty(struct dentry *dentry)
 	int ret = 0;
 
 	spin_lock(&dentry->d_lock);
-	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+	hlist_for_each_entry(child, &dentry->d_children, d_sib) {
 		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
 		if (simple_positive(child)) {
 			spin_unlock(&child->d_lock);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 7974e91ffe13..8bfd690e9f10 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -124,7 +124,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 		 * d_flags to indicate parental interest (their parent is the
 		 * original inode) */
 		spin_lock(&alias->d_lock);
-		list_for_each_entry(child, &alias->d_subdirs, d_child) {
+		hlist_for_each_entry(child, &alias->d_children, d_sib) {
 			if (!child->d_inode)
 				continue;
 
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 891653ba9cf3..2aaa4c421938 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -199,26 +199,21 @@ static void change_gid(struct dentry *dentry, kgid_t gid)
  */
 static void set_gid(struct dentry *parent, kgid_t gid)
 {
-	struct dentry *this_parent;
-	struct list_head *next;
+	struct dentry *this_parent, *dentry;
 
 	this_parent = parent;
 	spin_lock(&this_parent->d_lock);
 
 	change_gid(this_parent, gid);
 repeat:
-	next = this_parent->d_subdirs.next;
+	dentry = d_first_child(this_parent);
 resume:
-	while (next != &this_parent->d_subdirs) {
-		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
-		next = tmp->next;
-
+	hlist_for_each_entry_from(dentry, d_sib) {
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 
 		change_gid(dentry, gid);
 
-		if (!list_empty(&dentry->d_subdirs)) {
+		if (!hlist_empty(&dentry->d_children)) {
 			spin_unlock(&this_parent->d_lock);
 			spin_release(&dentry->d_lock.dep_map, _RET_IP_);
 			this_parent = dentry;
@@ -233,21 +228,20 @@ static void set_gid(struct dentry *parent, kgid_t gid)
 	rcu_read_lock();
 ascend:
 	if (this_parent != parent) {
-		struct dentry *child = this_parent;
-		this_parent = child->d_parent;
+		dentry = this_parent;
+		this_parent = dentry->d_parent;
 
-		spin_unlock(&child->d_lock);
+		spin_unlock(&dentry->d_lock);
 		spin_lock(&this_parent->d_lock);
 
 		/* go into the first sibling still alive */
-		do {
-			next = child->d_child.next;
-			if (next == &this_parent->d_subdirs)
-				goto ascend;
-			child = list_entry(next, struct dentry, d_child);
-		} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
-		rcu_read_unlock();
-		goto resume;
+		hlist_for_each_entry_continue(dentry, d_sib) {
+			if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
+				rcu_read_unlock();
+				goto resume;
+			}
+		}
+		goto ascend;
 	}
 	rcu_read_unlock();
 	spin_unlock(&this_parent->d_lock);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 8b4ad3c3bba0..140662c3156d 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -68,12 +68,12 @@ extern const struct qstr dotdot_name;
  * large memory footprint increase).
  */
 #ifdef CONFIG_64BIT
-# define DNAME_INLINE_LEN 32 /* 192 bytes */
+# define DNAME_INLINE_LEN 40 /* 192 bytes */
 #else
 # ifdef CONFIG_SMP
-#  define DNAME_INLINE_LEN 36 /* 128 bytes */
-# else
 #  define DNAME_INLINE_LEN 40 /* 128 bytes */
+# else
+#  define DNAME_INLINE_LEN 44 /* 128 bytes */
 # endif
 #endif
 
@@ -101,8 +101,8 @@ struct dentry {
 		struct list_head d_lru;		/* LRU list */
 		wait_queue_head_t *d_wait;	/* in-lookup ones only */
 	};
-	struct list_head d_child;	/* child of parent list */
-	struct list_head d_subdirs;	/* our children */
+	struct hlist_node d_sib;	/* child of parent list */
+	struct hlist_head d_children;	/* our children */
 	/*
 	 * d_alias and d_rcu can share memory
 	 */
@@ -599,4 +599,14 @@ struct name_snapshot {
 void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
 void release_dentry_name_snapshot(struct name_snapshot *);
 
+static inline struct dentry *d_first_child(const struct dentry *dentry)
+{
+	return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib);
+}
+
+static inline struct dentry *d_next_sibling(const struct dentry *dentry)
+{
+	return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);
+}
+
 #endif	/* __LINUX_DCACHE_H */
-- 
2.39.2



* [PATCH 05/22] centralize killing dentry from shrink list
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (2 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 04/22] dentry: switch the lists of children to hlist Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 13:49                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 06/22] get rid of __dget() Al Viro
                                   ` (17 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

new helper unifying identical bits of shrink_dentry_list() and
shrink_dcache_for_umount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 1b8ec1a9bf1c..56af55f2b7d9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1174,10 +1174,18 @@ static bool shrink_lock_dentry(struct dentry *dentry)
 	return false;
 }
 
+static inline void shrink_kill(struct dentry *victim, struct list_head *list)
+{
+	struct dentry *parent = victim->d_parent;
+	if (parent != victim)
+		__dput_to_list(parent, list);
+	__dentry_kill(victim);
+}
+
 void shrink_dentry_list(struct list_head *list)
 {
 	while (!list_empty(list)) {
-		struct dentry *dentry, *parent;
+		struct dentry *dentry;
 
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
@@ -1195,10 +1203,7 @@ void shrink_dentry_list(struct list_head *list)
 		}
 		rcu_read_unlock();
 		d_shrink_del(dentry);
-		parent = dentry->d_parent;
-		if (parent != dentry)
-			__dput_to_list(parent, list);
-		__dentry_kill(dentry);
+		shrink_kill(dentry, list);
 	}
 }
 
@@ -1629,17 +1634,13 @@ void shrink_dcache_parent(struct dentry *parent)
 		data.victim = NULL;
 		d_walk(parent, &data, select_collect2);
 		if (data.victim) {
-			struct dentry *parent;
 			spin_lock(&data.victim->d_lock);
 			if (!shrink_lock_dentry(data.victim)) {
 				spin_unlock(&data.victim->d_lock);
 				rcu_read_unlock();
 			} else {
 				rcu_read_unlock();
-				parent = data.victim->d_parent;
-				if (parent != data.victim)
-					__dput_to_list(parent, &data.dispose);
-				__dentry_kill(data.victim);
+				shrink_kill(data.victim, &data.dispose);
 			}
 		}
 		if (!list_empty(&data.dispose))
-- 
2.39.2



* [PATCH 06/22] get rid of __dget()
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (3 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 05/22] centralize killing dentry from shrink list Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 13:50                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 07/22] shrink_dentry_list(): no need to check that dentry refcount is marked dead Al Viro
                                   ` (16 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

fold into the sole remaining caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 56af55f2b7d9..1476f2d6e9ea 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -948,11 +948,6 @@ static inline void __dget_dlock(struct dentry *dentry)
 	dentry->d_lockref.count++;
 }
 
-static inline void __dget(struct dentry *dentry)
-{
-	lockref_get(&dentry->d_lockref);
-}
-
 struct dentry *dget_parent(struct dentry *dentry)
 {
 	int gotref;
@@ -1002,7 +997,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
 	if (hlist_empty(&inode->i_dentry))
 		return NULL;
 	alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
-	__dget(alias);
+	lockref_get(&alias->d_lockref);
 	return alias;
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 07/22] shrink_dentry_list(): no need to check that dentry refcount is marked dead
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (4 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 06/22] get rid of __dget() Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 13:53                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 08/22] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
                                   ` (15 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

... we won't see DCACHE_MAY_FREE on anything that is *not* dead
and checking d_flags is just as cheap as checking refcount.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 1476f2d6e9ea..5371f32eb4bb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1186,11 +1186,10 @@ void shrink_dentry_list(struct list_head *list)
 		spin_lock(&dentry->d_lock);
 		rcu_read_lock();
 		if (!shrink_lock_dentry(dentry)) {
-			bool can_free = false;
+			bool can_free;
 			rcu_read_unlock();
 			d_shrink_del(dentry);
-			if (dentry->d_lockref.count < 0)
-				can_free = dentry->d_flags & DCACHE_MAY_FREE;
+			can_free = dentry->d_flags & DCACHE_MAY_FREE;
 			spin_unlock(&dentry->d_lock);
 			if (can_free)
 				dentry_free(dentry);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 08/22] fast_dput(): having ->d_delete() is not reason to delay refcount decrement
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (5 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 07/22] shrink_dentry_list(): no need to check that dentry refcount is marked dead Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 13:58                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 09/22] fast_dput(): handle underflows gracefully Al Viro
                                   ` (14 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

->d_delete() is a way for a filesystem to tell that a dentry is not worth
keeping cached.  It is not guaranteed to be called every time a dentry
has its refcount drop down to zero; it is not guaranteed to be called
before a dentry gets evicted.  In other words, it is not suitable for
any kind of keeping track of dentry state.

None of the in-tree filesystems attempt to use it that way, fortunately.

So the contortions done by fast_dput() (as well as dentry_kill()) are
not warranted.  fast_dput() certainly should treat having a ->d_delete()
instance as "can't assume we'll be keeping it", but that's no different
from the way we treat e.g. DCACHE_DONTCACHE (which is rather similar
to making ->d_delete() return true when called).
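
For illustration, an instance of that sort can be as trivial as the
sketch below (modelled on always_delete_dentry() in fs/libfs.c; the
example_* names are made up for this example):

	static int example_d_delete(const struct dentry *dentry)
	{
		/* never worth keeping in the cache once unused */
		return 1;
	}

	static const struct dentry_operations example_dentry_ops = {
		.d_delete	= example_d_delete,
	};

With this patch such an instance simply sends fast_dput() down the
slow path, the same way DCACHE_DONTCACHE does; the refcount decrement
itself is no longer delayed.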

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 5371f32eb4bb..0d15e8852ac1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -768,15 +768,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	unsigned int d_flags;
 
 	/*
-	 * If we have a d_op->d_delete() operation, we sould not
-	 * let the dentry count go to zero, so use "put_or_lock".
-	 */
-	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
-		return lockref_put_or_lock(&dentry->d_lockref);
-
-	/*
-	 * .. otherwise, we can try to just decrement the
-	 * lockref optimistically.
+	 * try to decrement the lockref optimistically.
 	 */
 	ret = lockref_put_return(&dentry->d_lockref);
 
@@ -830,7 +822,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	smp_rmb();
 	d_flags = READ_ONCE(dentry->d_flags);
-	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
 			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
 
 	/* Nothing to do? Dropping the reference was all we needed? */
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 09/22] fast_dput(): handle underflows gracefully
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (6 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 08/22] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 14:46                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 10/22] fast_dput(): new rules for refcount Al Viro
                                   ` (13 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

If refcount is less than 1, we should just warn, unlock dentry and
return true, so that the caller doesn't try to do anything else.

Taking care of that leaves the rest of "lockref_put_return() has
failed" case equivalent to "decrement refcount and rejoin the
normal slow path after the point where we grab ->d_lock".

NOTE: lockref_put_return() is strictly a fastpath thing - unlike
the rest of lockref primitives, it does not contain a fallback.
Caller (and it looks like fast_dput() is the only legitimate one
in the entire kernel) has to do that itself.  Reasons for
lockref_put_return() failures:
	* ->d_lock held by somebody
	* refcount <= 0
	* ... or an architecture not supporting lockref use of
cmpxchg - sparc, anything non-SMP, config with spinlock debugging...

We could add a fallback, but it would be a clumsy API - we'd have
to distinguish between:
	(1) refcount > 1 - decremented, lock not held on return
	(2) refcount < 1 - left alone, probably no sense to hold the lock
	(3) refcount is 1, no cmpxchg - decremented, lock held on return
	(4) refcount is 1, cmpxchg supported - decremented, lock *NOT* held
	    on return.
We want to return with no lock held in case (4); that's the whole point of that
thing.  We very much do not want to have the fallback in case (3) return without
a lock, since the caller might have to retake it in that case.
So it wouldn't be more convenient than doing the fallback in the caller and
it would be very easy to screw up, especially since the test coverage would
suck - no way to test (3) and (4) on the same kernel build.
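
For reference, the caller-side fallback boils down to something like
the sketch below, for a hypothetical object with an embedded struct
lockref field called ref (not a real in-tree user - fast_dput() remains
the only legitimate one):

	if (lockref_put_return(&obj->ref) < 0) {
		/* fastpath refused: lock held, count <= 0, or no cmpxchg */
		spin_lock(&obj->ref.lock);
		if (!WARN_ON_ONCE(obj->ref.count <= 0))
			obj->ref.count--;
		spin_unlock(&obj->ref.lock);
	}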

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 0d15e8852ac1..e02b3c81bc02 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -779,12 +779,12 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	if (unlikely(ret < 0)) {
 		spin_lock(&dentry->d_lock);
-		if (dentry->d_lockref.count > 1) {
-			dentry->d_lockref.count--;
+		if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
 			spin_unlock(&dentry->d_lock);
 			return true;
 		}
-		return false;
+		dentry->d_lockref.count--;
+		goto locked;
 	}
 
 	/*
@@ -842,6 +842,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 * else could have killed it and marked it dead. Either way, we
 	 * don't need to do anything else.
 	 */
+locked:
 	if (dentry->d_lockref.count) {
 		spin_unlock(&dentry->d_lock);
 		return true;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 10/22] fast_dput(): new rules for refcount
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (7 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 09/22] fast_dput(): handle underflows gracefully Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 14:54                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 11/22] __dput_to_list(): do decrement of refcount in the callers Al Viro
                                   ` (12 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Currently the "need caller to do more work" path in fast_dput()
has refcount decremented, then, with ->d_lock held and
refcount verified to have reached 0 fast_dput() forcibly resets
the refcount to 1.

Move that resetting refcount to 1 into the callers; later in
the series it will be massaged out of existence.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index e02b3c81bc02..9a3eeee02500 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -847,13 +847,6 @@ static inline bool fast_dput(struct dentry *dentry)
 		spin_unlock(&dentry->d_lock);
 		return true;
 	}
-
-	/*
-	 * Re-get the reference we optimistically dropped. We hold the
-	 * lock, and we just tested that it was zero, so we can just
-	 * set it to 1.
-	 */
-	dentry->d_lockref.count = 1;
 	return false;
 }
 
@@ -896,6 +889,7 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
+		dentry->d_lockref.count = 1;
 		rcu_read_unlock();
 
 		if (likely(retain_dentry(dentry))) {
@@ -930,6 +924,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 		return;
 	}
 	rcu_read_unlock();
+	dentry->d_lockref.count = 1;
 	if (!retain_dentry(dentry))
 		__dput_to_list(dentry, list);
 	spin_unlock(&dentry->d_lock);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 11/22] __dput_to_list(): do decrement of refcount in the callers
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (8 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 10/22] fast_dput(): new rules for refcount Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 15:21                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 12/22] Make retain_dentry() neutral with respect to refcounting Al Viro
                                   ` (11 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

... and rename it to to_shrink_list(), seeing that it no longer
drops any references

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 9a3eeee02500..1899376d0189 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -902,16 +902,13 @@ void dput(struct dentry *dentry)
 }
 EXPORT_SYMBOL(dput);
 
-static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+static void to_shrink_list(struct dentry *dentry, struct list_head *list)
 __must_hold(&dentry->d_lock)
 {
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		/* let the owner of the list it's on deal with it */
-		--dentry->d_lockref.count;
-	} else {
+	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!--dentry->d_lockref.count)
+		if (!dentry->d_lockref.count)
 			d_shrink_add(dentry, list);
 	}
 }
@@ -925,8 +922,10 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 	}
 	rcu_read_unlock();
 	dentry->d_lockref.count = 1;
-	if (!retain_dentry(dentry))
-		__dput_to_list(dentry, list);
+	if (!retain_dentry(dentry)) {
+		--dentry->d_lockref.count;
+		to_shrink_list(dentry, list);
+	}
 	spin_unlock(&dentry->d_lock);
 }
 
@@ -1160,8 +1159,10 @@ static bool shrink_lock_dentry(struct dentry *dentry)
 static inline void shrink_kill(struct dentry *victim, struct list_head *list)
 {
 	struct dentry *parent = victim->d_parent;
-	if (parent != victim)
-		__dput_to_list(parent, list);
+	if (parent != victim) {
+		--parent->d_lockref.count;
+		to_shrink_list(parent, list);
+	}
 	__dentry_kill(victim);
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 12/22] Make retain_dentry() neutral with respect to refcounting
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (9 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 11/22] __dput_to_list(): do decrement of refcount in the callers Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 15:22                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 13/22] __dentry_kill(): get consistent rules for victim's refcount Al Viro
                                   ` (10 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

retain_dentry() used to decrement refcount if and only if it returned
true.  Lift those decrements into the callers.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 1899376d0189..1f61a5d03d5b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -680,7 +680,6 @@ static inline bool retain_dentry(struct dentry *dentry)
 		return false;
 
 	/* retain; LRU fodder */
-	dentry->d_lockref.count--;
 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
 		d_lru_add(dentry);
 	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
@@ -744,6 +743,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	} else if (likely(!retain_dentry(dentry))) {
 		__dentry_kill(dentry);
 		return parent;
+	} else {
+		dentry->d_lockref.count--;
 	}
 	/* we are keeping it, after all */
 	if (inode)
@@ -893,6 +894,7 @@ void dput(struct dentry *dentry)
 		rcu_read_unlock();
 
 		if (likely(retain_dentry(dentry))) {
+			dentry->d_lockref.count--;
 			spin_unlock(&dentry->d_lock);
 			return;
 		}
@@ -925,6 +927,8 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 	if (!retain_dentry(dentry)) {
 		--dentry->d_lockref.count;
 		to_shrink_list(dentry, list);
+	} else {
+		--dentry->d_lockref.count;
 	}
 	spin_unlock(&dentry->d_lock);
 }
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 13/22] __dentry_kill(): get consistent rules for victim's refcount
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (10 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 12/22] Make retain_dentry() neutral with respect to refcounting Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 15:27                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 14/22] dentry_kill(): don't bother with retain_dentry() on slow path Al Viro
                                   ` (9 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Currently we call it with refcount equal to 1 when called from
dentry_kill(); all other callers have it equal to 0.

Make it always be called with zero refcount; on this step we
just decrement it before the calls in dentry_kill().  That is
safe, since all places that care about the value of refcount
either do that under ->d_lock or hold a reference to dentry
in question.  Either is sufficient to prevent observing a
dentry immediately prior to __dentry_kill() getting called
from dentry_kill().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 1f61a5d03d5b..d9466cab4884 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -729,6 +729,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 			goto slow_positive;
 		}
 	}
+	dentry->d_lockref.count--;
 	__dentry_kill(dentry);
 	return parent;
 
@@ -741,6 +742,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	if (unlikely(dentry->d_lockref.count != 1)) {
 		dentry->d_lockref.count--;
 	} else if (likely(!retain_dentry(dentry))) {
+		dentry->d_lockref.count--;
 		__dentry_kill(dentry);
 		return parent;
 	} else {
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 14/22] dentry_kill(): don't bother with retain_dentry() on slow path
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (11 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 13/22] __dentry_kill(): get consistent rules for victim's refcount Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 15:53                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 15/22] Call retain_dentry() with refcount 0 Al Viro
                                   ` (8 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

We have already checked it and at that point the dentry did not
look worth keeping.  The only hard obstacle to evicting a dentry
is a non-zero refcount; everything else is advisory - e.g. memory
pressure could evict any dentry found with refcount zero.
On the slow path in dentry_kill() we had dropped and regained
->d_lock; we must recheck the refcount, but everything else
is not worth bothering with.

Note that a filesystem can not count upon ->d_delete() being
called for a dentry - not even once.  Again, memory pressure
(as well as d_prune_aliases(), or attempted rmdir() of ancestor,
or...) will not call ->d_delete() at all.

So from the correctness point of view we are fine doing the
check only once.  And it makes things simpler down the road.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index d9466cab4884..916b978bfd98 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -739,14 +739,10 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	spin_lock(&dentry->d_lock);
 	parent = lock_parent(dentry);
 got_locks:
-	if (unlikely(dentry->d_lockref.count != 1)) {
-		dentry->d_lockref.count--;
-	} else if (likely(!retain_dentry(dentry))) {
-		dentry->d_lockref.count--;
+	dentry->d_lockref.count--;
+	if (likely(dentry->d_lockref.count == 0)) {
 		__dentry_kill(dentry);
 		return parent;
-	} else {
-		dentry->d_lockref.count--;
 	}
 	/* we are keeping it, after all */
 	if (inode)
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 15/22] Call retain_dentry() with refcount 0
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (12 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 14/22] dentry_kill(): don't bother with retain_dentry() on slow path Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 16:09                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 16/22] fold the call of retain_dentry() into fast_dput() Al Viro
                                   ` (7 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Instead of bumping it from 0 to 1, calling retain_dentry(), then
decrementing it back to 0 (with ->d_lock held all the way through),
just leave refcount at 0 through all of that.

It will have a visible effect for ->d_delete() - now it can be
called with refcount 0 instead of 1 and it can no longer play
silly buggers with dropping/regaining ->d_lock.  Not that any
in-tree instances tried to (it's pretty hard to get right).

Any out-of-tree ones will have to adjust (assuming they need any
changes).
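
To spell out what "adjust" means: an instance must now decide purely
from what it can see with ->d_lock held, without peeking at the
refcount or playing with the lock.  A hypothetical (not in-tree)
conforming instance might look like

	static int example_unlinked_d_delete(const struct dentry *dentry)
	{
		/*
		 * Fine: decide from dentry/inode state; ->d_lock is held
		 * by the caller and the refcount is 0 by now.
		 *
		 * Not fine any more:
		 *	BUG_ON(d_count(dentry) != 1);  - refcount is 0 here
		 *	dropping/regaining ->d_lock inside the instance
		 */
		return d_really_is_positive(dentry) &&
		       d_inode(dentry)->i_nlink == 0;
	}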

Note that we do not need to extend rcu-critical area here - we have
verified that refcount is non-negative after having grabbed ->d_lock,
so nobody will be able to free dentry until they get into __dentry_kill(),
which won't happen until they manage to grab ->d_lock.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst |  8 ++++++++
 fs/dcache.c                           | 10 ++--------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 58627f0baf3e..6b058362938c 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1054,3 +1054,11 @@ The list of children anchored in parent dentry got turned into hlist now.
 Field names got changed (->d_children/->d_sib instead of ->d_subdirs/->d_child
 for anchor/entries resp.), so any affected places will be immediately caught
 by compiler.
+
+---
+
+**mandatory**
+
+	->d_delete() instances are now called for dentries with ->d_lock held
+and refcount equal to 0.  They are not permitted to drop/regain ->d_lock.
+None of in-tree instances did anything of that sort.  Make sure yours do not...
diff --git a/fs/dcache.c b/fs/dcache.c
index 916b978bfd98..3179156e0ad9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -888,15 +888,14 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		dentry->d_lockref.count = 1;
 		rcu_read_unlock();
 
 		if (likely(retain_dentry(dentry))) {
-			dentry->d_lockref.count--;
 			spin_unlock(&dentry->d_lock);
 			return;
 		}
 
+		dentry->d_lockref.count = 1;
 		dentry = dentry_kill(dentry);
 	}
 }
@@ -921,13 +920,8 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 		return;
 	}
 	rcu_read_unlock();
-	dentry->d_lockref.count = 1;
-	if (!retain_dentry(dentry)) {
-		--dentry->d_lockref.count;
+	if (!retain_dentry(dentry))
 		to_shrink_list(dentry, list);
-	} else {
-		--dentry->d_lockref.count;
-	}
 	spin_unlock(&dentry->d_lock);
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 16/22] fold the call of retain_dentry() into fast_dput()
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (13 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 15/22] Call retain_dentry() with refcount 0 Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 16:17                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 17/22] don't try to cut corners in shrink_lock_dentry() Al Viro
                                   ` (6 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Calls of retain_dentry() happen immediately after getting false
from fast_dput(), and getting true from retain_dentry() is
treated the same way as a non-zero refcount would be treated by
fast_dput() - unlock the dentry and bugger off.

Doing that in fast_dput() itself is simpler.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 3179156e0ad9..23afcd48c1a9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -757,6 +757,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
  * Try to do a lockless dput(), and return whether that was successful.
  *
  * If unsuccessful, we return false, having already taken the dentry lock.
+ * In that case refcount is guaranteed to be zero and we have already
+ * decided that it's not worth keeping around.
  *
  * The caller needs to hold the RCU read lock, so that the dentry is
  * guaranteed to stay around even if the refcount goes down to zero!
@@ -842,7 +844,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 * don't need to do anything else.
 	 */
 locked:
-	if (dentry->d_lockref.count) {
+	if (dentry->d_lockref.count || retain_dentry(dentry)) {
 		spin_unlock(&dentry->d_lock);
 		return true;
 	}
@@ -889,12 +891,6 @@ void dput(struct dentry *dentry)
 
 		/* Slow case: now with the dentry lock held */
 		rcu_read_unlock();
-
-		if (likely(retain_dentry(dentry))) {
-			spin_unlock(&dentry->d_lock);
-			return;
-		}
-
 		dentry->d_lockref.count = 1;
 		dentry = dentry_kill(dentry);
 	}
@@ -920,8 +916,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 		return;
 	}
 	rcu_read_unlock();
-	if (!retain_dentry(dentry))
-		to_shrink_list(dentry, list);
+	to_shrink_list(dentry, list);
 	spin_unlock(&dentry->d_lock);
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 17/22] don't try to cut corners in shrink_lock_dentry()
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (14 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 16/22] fold the call of retain_dentry() into fast_dput() Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 17:20                   ` Christian Brauner
  2023-11-09 17:39                   ` Linus Torvalds
  2023-11-09  6:20                 ` [PATCH 18/22] fold dentry_kill() into dput() Al Viro
                                   ` (5 subsequent siblings)
  21 siblings, 2 replies; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

That is to say, do *not* treat the ->d_inode or ->d_parent changes
as "it's hard, return false; somebody must have grabbed it, so
even if it has zero refcount, we don't need to bother killing it -
final dput() from whoever grabbed it would've done everything".

First of all, that is not guaranteed.  It might have been dropped
by shrink_kill() handling of victim's parent, which would've found
it already on a shrink list (ours) and decided that they don't need
to put it on their shrink list.

What's more, dentry_kill() is doing pretty much the same thing,
cutting its own set of corners (it assumes that dentry can't
go from positive to negative, so its inode can change but only once
and only in one direction).

Doing that right allows us to get rid of that not-quite-duplication
and removes the only reason for re-incrementing the refcount before
the call of dentry_kill().

Replacement is called lock_for_kill(); called under rcu_read_lock
and with ->d_lock held.  If it returns false, dentry has non-zero
refcount and the same locks are held.  If it returns true,
dentry has zero refcount and all locks required by __dentry_kill()
are taken.

Part of __lock_parent() had been lifted into lock_parent() to
allow its reuse.  Now it's called with rcu_read_lock already
held and dentry already unlocked.

Note that this is not the final change - locking requirements for
__dentry_kill() are going to change later in the series and the
set of locks taken by lock_for_kill() will be adjusted.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 159 ++++++++++++++++++++++------------------------------
 1 file changed, 66 insertions(+), 93 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 23afcd48c1a9..801502871671 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -625,8 +625,6 @@ static void __dentry_kill(struct dentry *dentry)
 static struct dentry *__lock_parent(struct dentry *dentry)
 {
 	struct dentry *parent;
-	rcu_read_lock();
-	spin_unlock(&dentry->d_lock);
 again:
 	parent = READ_ONCE(dentry->d_parent);
 	spin_lock(&parent->d_lock);
@@ -642,7 +640,6 @@ static struct dentry *__lock_parent(struct dentry *dentry)
 		spin_unlock(&parent->d_lock);
 		goto again;
 	}
-	rcu_read_unlock();
 	if (parent != dentry)
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	else
@@ -657,7 +654,64 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
 		return NULL;
 	if (likely(spin_trylock(&parent->d_lock)))
 		return parent;
-	return __lock_parent(dentry);
+	rcu_read_lock();
+	spin_unlock(&dentry->d_lock);
+	parent = __lock_parent(dentry);
+	rcu_read_unlock();
+	return parent;
+}
+
+/*
+ * Lock a dentry for feeding it to __dentry_kill().
+ * Called under rcu_read_lock() and dentry->d_lock; the former
+ * guarantees that nothing we access will be freed under us.
+ * Note that dentry is *not* protected from concurrent dentry_kill(),
+ * d_delete(), etc.
+ *
+ * Return false if dentry is busy.  Otherwise, return true and have
+ * that dentry's inode and parent both locked.
+ */
+
+static bool lock_for_kill(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = dentry->d_parent;
+
+	if (unlikely(dentry->d_lockref.count))
+		return false;
+
+	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+		goto slow;
+	if (dentry == parent)
+		return true;
+	if (likely(spin_trylock(&parent->d_lock)))
+		return true;
+
+	if (inode)
+		spin_unlock(&inode->i_lock);
+slow:
+	spin_unlock(&dentry->d_lock);
+
+	for (;;) {
+		if (inode)
+			spin_lock(&inode->i_lock);
+		parent = __lock_parent(dentry);
+		if (likely(inode == dentry->d_inode))
+			break;
+		if (inode)
+			spin_unlock(&inode->i_lock);
+		inode = dentry->d_inode;
+		spin_unlock(&dentry->d_lock);
+		if (parent)
+			spin_unlock(&parent->d_lock);
+	}
+	if (likely(!dentry->d_lockref.count))
+		return true;
+	if (inode)
+		spin_unlock(&inode->i_lock);
+	if (parent)
+		spin_unlock(&parent->d_lock);
+	return false;
 }
 
 static inline bool retain_dentry(struct dentry *dentry)
@@ -710,45 +764,16 @@ EXPORT_SYMBOL(d_mark_dontcache);
 static struct dentry *dentry_kill(struct dentry *dentry)
 	__releases(dentry->d_lock)
 {
-	struct inode *inode = dentry->d_inode;
-	struct dentry *parent = NULL;
 
-	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
-		goto slow_positive;
-
-	if (!IS_ROOT(dentry)) {
-		parent = dentry->d_parent;
-		if (unlikely(!spin_trylock(&parent->d_lock))) {
-			parent = __lock_parent(dentry);
-			if (likely(inode || !dentry->d_inode))
-				goto got_locks;
-			/* negative that became positive */
-			if (parent)
-				spin_unlock(&parent->d_lock);
-			inode = dentry->d_inode;
-			goto slow_positive;
-		}
-	}
 	dentry->d_lockref.count--;
-	__dentry_kill(dentry);
-	return parent;
-
-slow_positive:
-	spin_unlock(&dentry->d_lock);
-	spin_lock(&inode->i_lock);
-	spin_lock(&dentry->d_lock);
-	parent = lock_parent(dentry);
-got_locks:
-	dentry->d_lockref.count--;
-	if (likely(dentry->d_lockref.count == 0)) {
+	rcu_read_lock();
+	if (likely(lock_for_kill(dentry))) {
+		struct dentry *parent = dentry->d_parent;
+		rcu_read_unlock();
 		__dentry_kill(dentry);
-		return parent;
+		return parent != dentry ? parent : NULL;
 	}
-	/* we are keeping it, after all */
-	if (inode)
-		spin_unlock(&inode->i_lock);
-	if (parent)
-		spin_unlock(&parent->d_lock);
+	rcu_read_unlock();
 	spin_unlock(&dentry->d_lock);
 	return NULL;
 }
@@ -1095,58 +1120,6 @@ void d_prune_aliases(struct inode *inode)
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
-/*
- * Lock a dentry from shrink list.
- * Called under rcu_read_lock() and dentry->d_lock; the former
- * guarantees that nothing we access will be freed under us.
- * Note that dentry is *not* protected from concurrent dentry_kill(),
- * d_delete(), etc.
- *
- * Return false if dentry has been disrupted or grabbed, leaving
- * the caller to kick it off-list.  Otherwise, return true and have
- * that dentry's inode and parent both locked.
- */
-static bool shrink_lock_dentry(struct dentry *dentry)
-{
-	struct inode *inode;
-	struct dentry *parent;
-
-	if (dentry->d_lockref.count)
-		return false;
-
-	inode = dentry->d_inode;
-	if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
-		spin_unlock(&dentry->d_lock);
-		spin_lock(&inode->i_lock);
-		spin_lock(&dentry->d_lock);
-		if (unlikely(dentry->d_lockref.count))
-			goto out;
-		/* changed inode means that somebody had grabbed it */
-		if (unlikely(inode != dentry->d_inode))
-			goto out;
-	}
-
-	parent = dentry->d_parent;
-	if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
-		return true;
-
-	spin_unlock(&dentry->d_lock);
-	spin_lock(&parent->d_lock);
-	if (unlikely(parent != dentry->d_parent)) {
-		spin_unlock(&parent->d_lock);
-		spin_lock(&dentry->d_lock);
-		goto out;
-	}
-	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	if (likely(!dentry->d_lockref.count))
-		return true;
-	spin_unlock(&parent->d_lock);
-out:
-	if (inode)
-		spin_unlock(&inode->i_lock);
-	return false;
-}
-
 static inline void shrink_kill(struct dentry *victim, struct list_head *list)
 {
 	struct dentry *parent = victim->d_parent;
@@ -1165,7 +1138,7 @@ void shrink_dentry_list(struct list_head *list)
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
 		rcu_read_lock();
-		if (!shrink_lock_dentry(dentry)) {
+		if (!lock_for_kill(dentry)) {
 			bool can_free;
 			rcu_read_unlock();
 			d_shrink_del(dentry);
@@ -1609,7 +1582,7 @@ void shrink_dcache_parent(struct dentry *parent)
 		d_walk(parent, &data, select_collect2);
 		if (data.victim) {
 			spin_lock(&data.victim->d_lock);
-			if (!shrink_lock_dentry(data.victim)) {
+			if (!lock_for_kill(data.victim)) {
 				spin_unlock(&data.victim->d_lock);
 				rcu_read_unlock();
 			} else {
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 18/22] fold dentry_kill() into dput()
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (15 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 17/22] don't try to cut corners in shrink_lock_dentry() Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 17:22                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 19/22] to_shrink_list(): call only if refcount is 0 Al Viro
                                   ` (4 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 37 ++++++++++++-------------------------
 1 file changed, 12 insertions(+), 25 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 801502871671..aa9f7ee7a603 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -756,28 +756,6 @@ void d_mark_dontcache(struct inode *inode)
 }
 EXPORT_SYMBOL(d_mark_dontcache);
 
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
-{
-
-	dentry->d_lockref.count--;
-	rcu_read_lock();
-	if (likely(lock_for_kill(dentry))) {
-		struct dentry *parent = dentry->d_parent;
-		rcu_read_unlock();
-		__dentry_kill(dentry);
-		return parent != dentry ? parent : NULL;
-	}
-	rcu_read_unlock();
-	spin_unlock(&dentry->d_lock);
-	return NULL;
-}
-
 /*
  * Try to do a lockless dput(), and return whether that was successful.
  *
@@ -915,9 +893,18 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		rcu_read_unlock();
-		dentry->d_lockref.count = 1;
-		dentry = dentry_kill(dentry);
+		if (likely(lock_for_kill(dentry))) {
+			struct dentry *parent = dentry->d_parent;
+			rcu_read_unlock();
+			__dentry_kill(dentry);
+			if (dentry == parent)
+				return;
+			dentry = parent;
+		} else {
+			rcu_read_unlock();
+			spin_unlock(&dentry->d_lock);
+			return;
+		}
 	}
 }
 EXPORT_SYMBOL(dput);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 19/22] to_shrink_list(): call only if refcount is 0
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (16 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 18/22] fold dentry_kill() into dput() Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 17:29                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 20/22] switch select_collect{,2}() to use of to_shrink_list() Al Viro
                                   ` (3 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

The only thing it does if refcount is not zero is d_lru_del(); no
point, IMO, seeing that plain dput() does nothing of that sort...

Note that 2 of 3 current callers are guaranteed that refcount is 0.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index aa9f7ee7a603..49585f2ad896 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -915,8 +915,7 @@ __must_hold(&dentry->d_lock)
 	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!dentry->d_lockref.count)
-			d_shrink_add(dentry, list);
+		d_shrink_add(dentry, list);
 	}
 }
 
@@ -1110,10 +1109,8 @@ EXPORT_SYMBOL(d_prune_aliases);
 static inline void shrink_kill(struct dentry *victim, struct list_head *list)
 {
 	struct dentry *parent = victim->d_parent;
-	if (parent != victim) {
-		--parent->d_lockref.count;
+	if (parent != victim && !--parent->d_lockref.count)
 		to_shrink_list(parent, list);
-	}
 	__dentry_kill(victim);
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 20/22] switch select_collect{,2}() to use of to_shrink_list()
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (17 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 19/22] to_shrink_list(): call only if refcount is 0 Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 17:31                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 21/22] d_prune_aliases(): use a shrink list Al Viro
                                   ` (2 subsequent siblings)
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 49585f2ad896..5fdb6342f659 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1490,13 +1490,9 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
 
 	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
 		data->found++;
-	} else {
-		if (dentry->d_flags & DCACHE_LRU_LIST)
-			d_lru_del(dentry);
-		if (!dentry->d_lockref.count) {
-			d_shrink_add(dentry, &data->dispose);
-			data->found++;
-		}
+	} else if (!dentry->d_lockref.count) {
+		to_shrink_list(dentry, &data->dispose);
+		data->found++;
 	}
 	/*
 	 * We can return to the caller if we have found some (this
@@ -1517,17 +1513,13 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
 	if (data->start == dentry)
 		goto out;
 
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		if (!dentry->d_lockref.count) {
+	if (!dentry->d_lockref.count) {
+		if (dentry->d_flags & DCACHE_SHRINK_LIST) {
 			rcu_read_lock();
 			data->victim = dentry;
 			return D_WALK_QUIT;
 		}
-	} else {
-		if (dentry->d_flags & DCACHE_LRU_LIST)
-			d_lru_del(dentry);
-		if (!dentry->d_lockref.count)
-			d_shrink_add(dentry, &data->dispose);
+		to_shrink_list(dentry, &data->dispose);
 	}
 	/*
 	 * We can return to the caller if we have found some (this
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 21/22] d_prune_aliases(): use a shrink list
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (18 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 20/22] switch select_collect{,2}() to use of to_shrink_list() Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-09 17:33                   ` Christian Brauner
  2023-11-09  6:20                 ` [PATCH 22/22] __dentry_kill(): new locking scheme Al Viro
  2023-11-09 13:33                 ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Christian Brauner
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Instead of dropping aliases one by one, restarting, etc., just
collect them into a shrink list and kill them off in one pass.

We don't really need the restarts - one alias can't pin another
(directory has only one alias, and couldn't be its own ancestor
anyway), so collecting everything that is not busy and taking it
out would take care of everything evictable that had been there
as we entered the function.  And new aliases added while we'd
been dropping old ones could just as easily have appeared right
as we return to caller...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 30 +++++-------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 5fdb6342f659..cea707a77e28 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -647,20 +647,6 @@ static struct dentry *__lock_parent(struct dentry *dentry)
 	return parent;
 }
 
-static inline struct dentry *lock_parent(struct dentry *dentry)
-{
-	struct dentry *parent = dentry->d_parent;
-	if (IS_ROOT(dentry))
-		return NULL;
-	if (likely(spin_trylock(&parent->d_lock)))
-		return parent;
-	rcu_read_lock();
-	spin_unlock(&dentry->d_lock);
-	parent = __lock_parent(dentry);
-	rcu_read_unlock();
-	return parent;
-}
-
 /*
  * Lock a dentry for feeding it to __dentry_kill().
  * Called under rcu_read_lock() and dentry->d_lock; the former
@@ -1085,24 +1071,18 @@ struct dentry *d_find_alias_rcu(struct inode *inode)
  */
 void d_prune_aliases(struct inode *inode)
 {
+	LIST_HEAD(dispose);
 	struct dentry *dentry;
-restart:
+
 	spin_lock(&inode->i_lock);
 	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
 		spin_lock(&dentry->d_lock);
-		if (!dentry->d_lockref.count) {
-			struct dentry *parent = lock_parent(dentry);
-			if (likely(!dentry->d_lockref.count)) {
-				__dentry_kill(dentry);
-				dput(parent);
-				goto restart;
-			}
-			if (parent)
-				spin_unlock(&parent->d_lock);
-		}
+		if (!dentry->d_lockref.count)
+			to_shrink_list(dentry, &dispose);
 		spin_unlock(&dentry->d_lock);
 	}
 	spin_unlock(&inode->i_lock);
+	shrink_dentry_list(&dispose);
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* [PATCH 22/22] __dentry_kill(): new locking scheme
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (19 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 21/22] d_prune_aliases(): use a shrink list Al Viro
@ 2023-11-09  6:20                 ` Al Viro
  2023-11-10 13:34                   ` Christian Brauner
  2023-11-09 13:33                 ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Christian Brauner
  21 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09  6:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

Currently we enter __dentry_kill() with parent (along with the victim
dentry and victim's inode) held locked.  Then we
	mark dentry refcount as dead
	call ->d_prune()
	remove dentry from hash
	remove it from the parent's list of children
	unlock the parent, don't need it from that point on
	detach dentry from inode, unlock dentry and drop the inode
(via ->d_iput())
	call ->d_release()
	regain the lock on dentry
	check if it's on a shrink list (in which case freeing its empty husk
has to be left to shrink_dentry_list()) or not (in which case we can free it
ourselves).  In the former case, mark it as an empty husk, so that
shrink_dentry_list() would know it can free the sucker.
	drop the lock on dentry
... and usually the caller proceeds to drop a reference on the parent,
possibly retaking the lock on it.

That is painful for a bunch of reasons, starting with the need to take locks
out of order, but not limited to that - the parent of a positive dentry can
change if we drop its ->d_lock, so getting these locks has to be done with
care.  Moreover, as soon as the dentry is out of the parent's list of children,
shrink_dcache_for_umount() won't see it anymore, making it appear as if
the parent is inexplicably busy.  We do work around that by having
shrink_dentry_list() decrement the parent's refcount first and put it on
shrink list to be evicted once we are done with __dentry_kill() of child,
but that may in some cases lead to ->d_iput() on child called after the
parent got killed.  That doesn't happen in cases where in-tree ->d_iput()
instances might want to look at the parent, but that's brittle as hell.

Solution: do removal from the parent's list of children at the very
end of __dentry_kill().  As a result, the callers do not need to
lock the parent and by the time we really need the parent locked,
the dentry is negative and is guaranteed not to be moved around.

It does mean that ->d_prune() will be called with parent not locked.
It also means that we might see dentries in the process of being torn
down while going through the parent's list of children; those dentries
will be unhashed, negative and with refcount marked dead.  In practice,
that's enough for in-tree code that looks through the list of children
to do the right thing as-is.  Out-of-tree code might need to be adjusted.
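
A sketch of what such a tolerant iteration over ->d_children might look
like after this change (hypothetical out-of-tree code; as said above,
the in-tree users already do the equivalent):

	struct dentry *child;

	spin_lock(&parent->d_lock);
	hlist_for_each_entry(child, &parent->d_children, d_sib) {
		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
		if (child->d_lockref.count < 0) {
			/* in the middle of __dentry_kill(), skip it */
			spin_unlock(&child->d_lock);
			continue;
		}
		/* normal handling of the child goes here */
		spin_unlock(&child->d_lock);
	}
	spin_unlock(&parent->d_lock);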

Calling conventions: __dentry_kill(dentry) is called with dentry->d_lock
held, along with ->i_lock of its inode (if any).  It either returns
the parent (locked, with refcount decremented to 0) or NULL (if there'd
been no parent or if refcount decrement for parent hadn't reached 0).

lock_for_kill() is adjusted for new requirements - it doesn't touch
the parent's ->d_lock at all.

Callers adjusted.  Note that for dput() we don't need to bother with
fast_dput() for the parent - we just need to check retain_dentry()
for it, since its ->d_lock is still held since the moment when
__dentry_kill() had taken it to remove the victim from the list of
children.

The kludge with early decrement of parent's refcount in
shrink_dentry_list() is no longer needed - shrink_dcache_for_umount()
sees the half-killed dentries in the list of children for as long
as they are pinning the parent.  They are easily recognized and
accounted for by select_collect(), so we know we are not done yet.

As a result, we always have the expected ordering for ->d_iput()/->d_release()
vs. __dentry_kill() of the parent, no exceptions.  Moreover, the current
rules for shrink lists (one must make sure that shrink_dcache_for_umount()
won't happen while any dentries from the superblock in question are on
any shrink lists) are gone - shrink_dcache_for_umount() will do the
right thing in all cases, taking such dentries out.  Their empty
husks (memory occupied by struct dentry itself + its external name,
if any) will remain on the shrink lists, but they are no obstacles
to filesystem shutdown.  And such husks will get freed as soon as
shrink_dentry_list() of the list they are on gets to them.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst |  17 ++++
 fs/dcache.c                           | 127 ++++++++++----------------
 2 files changed, 64 insertions(+), 80 deletions(-)

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 6b058362938c..8e3e31b18374 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1062,3 +1062,20 @@ by compiler.
 	->d_delete() instances are now called for dentries with ->d_lock held
 and refcount equal to 0.  They are not permitted to drop/regain ->d_lock.
 None of in-tree instances did anything of that sort.  Make sure yours do not...
+
+--
+
+**mandatory**
+
+	->d_prune() instances are now called without ->d_lock held on the parent.
+->d_lock on dentry itself is still held; if you need per-parent exclusions (none
+of the in-tree instances did), use your own spinlock.
+
+	->d_iput() and ->d_release() are called with victim dentry still in the
+list of parent's children.  It is still unhashed, marked killed, etc., just not
+removed from parent's ->d_children yet.
+
+	Anyone iterating through the list of children needs to be aware of the
+half-killed dentries that might be seen there; taking ->d_lock on those will
+see them negative, unhashed and with negative refcount, which means that most
+of the in-kernel users would've done the right thing anyway without any adjustment.
diff --git a/fs/dcache.c b/fs/dcache.c
index cea707a77e28..bd57b9a08894 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -575,12 +575,10 @@ static inline void dentry_unlist(struct dentry *dentry)
 	}
 }
 
-static void __dentry_kill(struct dentry *dentry)
+static struct dentry *__dentry_kill(struct dentry *dentry)
 {
 	struct dentry *parent = NULL;
 	bool can_free = true;
-	if (!IS_ROOT(dentry))
-		parent = dentry->d_parent;
 
 	/*
 	 * The dentry is now unrecoverably dead to the world.
@@ -600,9 +598,6 @@ static void __dentry_kill(struct dentry *dentry)
 	}
 	/* if it was on the hash then remove it */
 	__d_drop(dentry);
-	dentry_unlist(dentry);
-	if (parent)
-		spin_unlock(&parent->d_lock);
 	if (dentry->d_inode)
 		dentry_unlink_inode(dentry);
 	else
@@ -611,7 +606,14 @@ static void __dentry_kill(struct dentry *dentry)
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
 
+	cond_resched();
+	/* now that it's negative, ->d_parent is stable */
+	if (!IS_ROOT(dentry)) {
+		parent = dentry->d_parent;
+		spin_lock(&parent->d_lock);
+	}
 	spin_lock(&dentry->d_lock);
+	dentry_unlist(dentry);
 	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
 		dentry->d_flags |= DCACHE_MAY_FREE;
 		can_free = false;
@@ -619,31 +621,10 @@ static void __dentry_kill(struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 	if (likely(can_free))
 		dentry_free(dentry);
-	cond_resched();
-}
-
-static struct dentry *__lock_parent(struct dentry *dentry)
-{
-	struct dentry *parent;
-again:
-	parent = READ_ONCE(dentry->d_parent);
-	spin_lock(&parent->d_lock);
-	/*
-	 * We can't blindly lock dentry until we are sure
-	 * that we won't violate the locking order.
-	 * Any changes of dentry->d_parent must have
-	 * been done with parent->d_lock held, so
-	 * spin_lock() above is enough of a barrier
-	 * for checking if it's still our child.
-	 */
-	if (unlikely(parent != dentry->d_parent)) {
+	if (parent && --parent->d_lockref.count) {
 		spin_unlock(&parent->d_lock);
-		goto again;
+		return NULL;
 	}
-	if (parent != dentry)
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	else
-		parent = NULL;
 	return parent;
 }
 
@@ -655,48 +636,32 @@ static struct dentry *__lock_parent(struct dentry *dentry)
  * d_delete(), etc.
  *
  * Return false if dentry is busy.  Otherwise, return true and have
- * that dentry's inode and parent both locked.
+ * that dentry's inode locked.
  */
 
 static bool lock_for_kill(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	struct dentry *parent = dentry->d_parent;
 
 	if (unlikely(dentry->d_lockref.count))
 		return false;
 
-	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
-		goto slow;
-	if (dentry == parent)
-		return true;
-	if (likely(spin_trylock(&parent->d_lock)))
+	if (!inode || likely(spin_trylock(&inode->i_lock)))
 		return true;
 
-	if (inode)
-		spin_unlock(&inode->i_lock);
-slow:
-	spin_unlock(&dentry->d_lock);
-
-	for (;;) {
-		if (inode)
-			spin_lock(&inode->i_lock);
-		parent = __lock_parent(dentry);
+	do {
+		spin_unlock(&dentry->d_lock);
+		spin_lock(&inode->i_lock);
+		spin_lock(&dentry->d_lock);
 		if (likely(inode == dentry->d_inode))
 			break;
-		if (inode)
-			spin_unlock(&inode->i_lock);
+		spin_unlock(&inode->i_lock);
 		inode = dentry->d_inode;
-		spin_unlock(&dentry->d_lock);
-		if (parent)
-			spin_unlock(&parent->d_lock);
-	}
+	} while (inode);
 	if (likely(!dentry->d_lockref.count))
 		return true;
 	if (inode)
 		spin_unlock(&inode->i_lock);
-	if (parent)
-		spin_unlock(&parent->d_lock);
 	return false;
 }
 
@@ -869,29 +834,27 @@ static inline bool fast_dput(struct dentry *dentry)
  */
 void dput(struct dentry *dentry)
 {
-	while (dentry) {
-		might_sleep();
-
-		rcu_read_lock();
-		if (likely(fast_dput(dentry))) {
-			rcu_read_unlock();
+	if (!dentry)
+		return;
+	might_sleep();
+	rcu_read_lock();
+	if (likely(fast_dput(dentry))) {
+		rcu_read_unlock();
+		return;
+	}
+	while (lock_for_kill(dentry)) {
+		rcu_read_unlock();
+		dentry = __dentry_kill(dentry);
+		if (!dentry)
 			return;
-		}
-
-		/* Slow case: now with the dentry lock held */
-		if (likely(lock_for_kill(dentry))) {
-			struct dentry *parent = dentry->d_parent;
-			rcu_read_unlock();
-			__dentry_kill(dentry);
-			if (dentry == parent)
-				return;
-			dentry = parent;
-		} else {
-			rcu_read_unlock();
+		if (retain_dentry(dentry)) {
 			spin_unlock(&dentry->d_lock);
 			return;
 		}
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
+	spin_unlock(&dentry->d_lock);
 }
 EXPORT_SYMBOL(dput);
 
@@ -1086,12 +1049,16 @@ void d_prune_aliases(struct inode *inode)
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
-static inline void shrink_kill(struct dentry *victim, struct list_head *list)
+static inline void shrink_kill(struct dentry *victim)
 {
-	struct dentry *parent = victim->d_parent;
-	if (parent != victim && !--parent->d_lockref.count)
-		to_shrink_list(parent, list);
-	__dentry_kill(victim);
+	do {
+		rcu_read_unlock();
+		victim = __dentry_kill(victim);
+		rcu_read_lock();
+	} while (victim && lock_for_kill(victim));
+	rcu_read_unlock();
+	if (victim)
+		spin_unlock(&victim->d_lock);
 }
 
 void shrink_dentry_list(struct list_head *list)
@@ -1112,9 +1079,8 @@ void shrink_dentry_list(struct list_head *list)
 				dentry_free(dentry);
 			continue;
 		}
-		rcu_read_unlock();
 		d_shrink_del(dentry);
-		shrink_kill(dentry, list);
+		shrink_kill(dentry);
 	}
 }
 
@@ -1473,6 +1439,8 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
 	} else if (!dentry->d_lockref.count) {
 		to_shrink_list(dentry, &data->dispose);
 		data->found++;
+	} else if (dentry->d_lockref.count < 0) {
+		data->found++;
 	}
 	/*
 	 * We can return to the caller if we have found some (this
@@ -1542,8 +1510,7 @@ void shrink_dcache_parent(struct dentry *parent)
 				spin_unlock(&data.victim->d_lock);
 				rcu_read_unlock();
 			} else {
-				rcu_read_unlock();
-				shrink_kill(data.victim, &data.dispose);
+				shrink_kill(data.victim);
 			}
 		}
 		if (!list_empty(&data.dispose))
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy
  2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
                                   ` (20 preceding siblings ...)
  2023-11-09  6:20                 ` [PATCH 22/22] __dentry_kill(): new locking scheme Al Viro
@ 2023-11-09 13:33                 ` Christian Brauner
  21 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 13:33 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:35AM +0000, Al Viro wrote:
> This is beyond ridiculous.  There is a reason why that thing is
> cacheline-aligned...
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Yeah, lenient annotation with this stuff makes a bunch of structures
with carefully chosen layouts pretty meaningless.

The thing is that it doesn't matter for most cases as every regular
distro afaict sets CONFIG_RANDSTRUCT_NONE=y which means layout
randomization isn't applied.

In any case,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal()
  2023-11-09  6:20                 ` [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal() Al Viro
@ 2023-11-09 13:42                   ` Christian Brauner
  2023-11-09 14:01                   ` Chuck Lever
  1 sibling, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 13:42 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:36AM +0000, Al Viro wrote:
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> Tested-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Acked-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 03/22] coda_flag_children(): cope with dentries turning negative
  2023-11-09  6:20                 ` [PATCH 03/22] coda_flag_children(): cope with dentries turning negative Al Viro
@ 2023-11-09 13:43                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 13:43 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:37AM +0000, Al Viro wrote:
> ->d_lock on parent does not stabilize ->d_inode of child.
> We don't do much with that inode in there, but we need
> at least to avoid struct inode getting freed under us...
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 04/22] dentry: switch the lists of children to hlist
  2023-11-09  6:20                 ` [PATCH 04/22] dentry: switch the lists of children to hlist Al Viro
@ 2023-11-09 13:48                   ` Christian Brauner
  2023-11-09 19:32                     ` Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 13:48 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:38AM +0000, Al Viro wrote:
> Saves a pointer per struct dentry and actually makes the things less

Which you're giving back to DNAME_INLINE_LEN.

> clumsy.  Cleaned the d_walk() and dcache_readdir() a bit by use
> of hlist_for_... iterators.
> 
> A couple of new helpers - d_first_child() and d_next_sibling(),
> to make the expressions less awful.
> 
> X-fuck-kABI: gladly
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Nice, gets rid of that do-while(), while () stuff,
Reviewed-by: Christian Brauner <brauner@kernel.org>
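For a concrete picture, the two helpers mentioned in the log could look
something like this - a sketch only, with the field names (d_subdirs as
the hlist_head, d_child as the hlist_node) assumed here rather than taken
from the patch:

	/* sketch: first child / next sibling after the hlist conversion */
	static inline struct dentry *d_first_child(const struct dentry *dentry)
	{
		return hlist_entry_safe(dentry->d_subdirs.first,
					struct dentry, d_child);
	}

	static inline struct dentry *d_next_sibling(const struct dentry *dentry)
	{
		return hlist_entry_safe(dentry->d_child.next,
					struct dentry, d_child);
	}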

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 05/22] centralize killing dentry from shrink list
  2023-11-09  6:20                 ` [PATCH 05/22] centralize killing dentry from shrink list Al Viro
@ 2023-11-09 13:49                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 13:49 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:39AM +0000, Al Viro wrote:
> new helper unifying identical bits of shrink_dentry_list() and
> shrink_dcache_for_umount()
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 06/22] get rid of __dget()
  2023-11-09  6:20                 ` [PATCH 06/22] get rid of __dget() Al Viro
@ 2023-11-09 13:50                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 13:50 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:40AM +0000, Al Viro wrote:
> fold into the sole remaining caller
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 07/22] shrink_dentry_list(): no need to check that dentry refcount is marked dead
  2023-11-09  6:20                 ` [PATCH 07/22] shrink_dentry_list(): no need to check that dentry refcount is marked dead Al Viro
@ 2023-11-09 13:53                   ` Christian Brauner
  2023-11-09 20:28                     ` Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 13:53 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:41AM +0000, Al Viro wrote:
> ... we won't see DCACHE_MAY_FREE on anything that is *not* dead
> and checking d_flags is just as cheap as checking refcount.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Could also be a WARN_ON_ONCE() on d_lockref.count > 0 if DCACHE_MAY_FREE
is set but probably doesn't matter,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 08/22] fast_dput(): having ->d_delete() is not reason to delay refcount decrement
  2023-11-09  6:20                 ` [PATCH 08/22] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
@ 2023-11-09 13:58                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 13:58 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:42AM +0000, Al Viro wrote:
> ->d_delete() is a way for filesystem to tell that dentry is not worth
> keeping cached.  It is not guaranteed to be called every time a dentry
> has refcount drop down to zero; it is not guaranteed to be called before
> dentry gets evicted.  In other words, it is not suitable for any kind
> of keeping track of dentry state.
> 
> None of the in-tree filesystems attempt to use it that way, fortunately.
> 
> So the contortions done by fast_dput() (as well as dentry_kill()) are
> not warranted.  fast_dput() certainly should treat having ->d_delete()
> instance as "can't assume we'll be keeping it", but that's not different
> from the way we treat e.g. DCACHE_DONTCACHE (which is rather similar
> to making ->d_delete() return true when called).
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Reasoning seems sane to me,
Reviewed-by: Christian Brauner <brauner@kernel.org>
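For reference, the "->d_delete() returning true when called" case that the
log alludes to looks roughly like this in a filesystem - a sketch, compare
always_delete_dentry() in fs/libfs.c:

	/* never keep this filesystem's dentries cached once unused */
	static int example_d_delete(const struct dentry *dentry)
	{
		return 1;	/* drop as soon as the refcount hits zero */
	}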

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal()
  2023-11-09  6:20                 ` [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal() Al Viro
  2023-11-09 13:42                   ` Christian Brauner
@ 2023-11-09 14:01                   ` Chuck Lever
  2023-11-09 18:47                     ` Al Viro
  1 sibling, 1 reply; 119+ messages in thread
From: Chuck Lever @ 2023-11-09 14:01 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel, Christian Brauner

On Thu, Nov 09, 2023 at 06:20:36AM +0000, Al Viro wrote:
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> Tested-by: Jeff Layton <jlayton@kernel.org>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Future me is going to be mightily confused by the lack of a patch
description. I went back to the series cover letter and found some
text that would be nice to include here:

> 02/22) nfsd_client_rmdir() and its gut open-code simple_recursive_removal();
> converting to calling that cleans the things up in there *and* reduces
> the amount of places where we touch the list of children, which simplifies
> the work later in the series.


> ---
>  fs/nfsd/nfsctl.c | 70 ++++++++++--------------------------------------
>  1 file changed, 14 insertions(+), 56 deletions(-)
> 
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 7ed02fb88a36..035b42c1a181 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -1235,63 +1235,34 @@ static inline void _nfsd_symlink(struct dentry *parent, const char *name,
>  
>  #endif
>  
> -static void clear_ncl(struct inode *inode)
> +static void clear_ncl(struct dentry *dentry)
>  {
> +	struct inode *inode = d_inode(dentry);
>  	struct nfsdfs_client *ncl = inode->i_private;
>  
> +	spin_lock(&inode->i_lock);
>  	inode->i_private = NULL;
> +	spin_unlock(&inode->i_lock);
>  	kref_put(&ncl->cl_ref, ncl->cl_release);
>  }
>  
> -static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode)
> -{
> -	struct nfsdfs_client *nc = inode->i_private;
> -
> -	if (nc)
> -		kref_get(&nc->cl_ref);
> -	return nc;
> -}
> -
>  struct nfsdfs_client *get_nfsdfs_client(struct inode *inode)
>  {
>  	struct nfsdfs_client *nc;
>  
> -	inode_lock_shared(inode);
> -	nc = __get_nfsdfs_client(inode);
> -	inode_unlock_shared(inode);
> +	spin_lock(&inode->i_lock);
> +	nc = inode->i_private;
> +	if (nc)
> +		kref_get(&nc->cl_ref);
> +	spin_unlock(&inode->i_lock);
>  	return nc;
>  }
> -/* from __rpc_unlink */
> -static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry)
> -{
> -	int ret;
> -
> -	clear_ncl(d_inode(dentry));
> -	dget(dentry);
> -	ret = simple_unlink(dir, dentry);
> -	d_drop(dentry);
> -	fsnotify_unlink(dir, dentry);
> -	dput(dentry);
> -	WARN_ON_ONCE(ret);
> -}
> -
> -static void nfsdfs_remove_files(struct dentry *root)
> -{
> -	struct dentry *dentry, *tmp;
> -
> -	list_for_each_entry_safe(dentry, tmp, &root->d_subdirs, d_child) {
> -		if (!simple_positive(dentry)) {
> -			WARN_ON_ONCE(1); /* I think this can't happen? */
> -			continue;
> -		}
> -		nfsdfs_remove_file(d_inode(root), dentry);
> -	}
> -}
>  
>  /* XXX: cut'n'paste from simple_fill_super; figure out if we could share
>   * code instead. */
>  static  int nfsdfs_create_files(struct dentry *root,
>  				const struct tree_descr *files,
> +				struct nfsdfs_client *ncl,
>  				struct dentry **fdentries)
>  {
>  	struct inode *dir = d_inode(root);
> @@ -1310,8 +1281,9 @@ static  int nfsdfs_create_files(struct dentry *root,
>  			dput(dentry);
>  			goto out;
>  		}
> +		kref_get(&ncl->cl_ref);
>  		inode->i_fop = files->ops;
> -		inode->i_private = __get_nfsdfs_client(dir);
> +		inode->i_private = ncl;
>  		d_add(dentry, inode);
>  		fsnotify_create(dir, dentry);
>  		if (fdentries)
> @@ -1320,7 +1292,6 @@ static  int nfsdfs_create_files(struct dentry *root,
>  	inode_unlock(dir);
>  	return 0;
>  out:
> -	nfsdfs_remove_files(root);
>  	inode_unlock(dir);
>  	return -ENOMEM;
>  }
> @@ -1340,7 +1311,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
>  	dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name);
>  	if (IS_ERR(dentry)) /* XXX: tossing errors? */
>  		return NULL;
> -	ret = nfsdfs_create_files(dentry, files, fdentries);
> +	ret = nfsdfs_create_files(dentry, files, ncl, fdentries);
>  	if (ret) {
>  		nfsd_client_rmdir(dentry);
>  		return NULL;
> @@ -1351,20 +1322,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
>  /* Taken from __rpc_rmdir: */
>  void nfsd_client_rmdir(struct dentry *dentry)
>  {
> -	struct inode *dir = d_inode(dentry->d_parent);
> -	struct inode *inode = d_inode(dentry);
> -	int ret;
> -
> -	inode_lock(dir);
> -	nfsdfs_remove_files(dentry);
> -	clear_ncl(inode);
> -	dget(dentry);
> -	ret = simple_rmdir(dir, dentry);
> -	WARN_ON_ONCE(ret);
> -	d_drop(dentry);
> -	fsnotify_rmdir(dir, dentry);
> -	dput(dentry);
> -	inode_unlock(dir);
> +	simple_recursive_removal(dentry, clear_ncl);
>  }
>  
>  static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
> -- 
> 2.39.2
> 
> 

-- 
Chuck Lever

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 09/22] fast_dput(): handle underflows gracefully
  2023-11-09  6:20                 ` [PATCH 09/22] fast_dput(): handle underflows gracefully Al Viro
@ 2023-11-09 14:46                   ` Christian Brauner
  2023-11-09 20:39                     ` Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 14:46 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:43AM +0000, Al Viro wrote:
> If refcount is less than 1, we should just warn, unlock dentry and
> return true, so that the caller doesn't try to do anything else.

That's effectively to guard against bugs in filesystems, not in dcache
itself, right? Have we observed this frequently?

> 
> Taking care of that leaves the rest of "lockref_put_return() has
> failed" case equivalent to "decrement refcount and rejoin the
> normal slow path after the point where we grab ->d_lock".
> 
> NOTE: lockref_put_return() is strictly a fastpath thing - unlike
> the rest of lockref primitives, it does not contain a fallback.
> Caller (and it looks like fast_dput() is the only legitimate one
> in the entire kernel) has to do that itself.  Reasons for
> lockref_put_return() failures:
> 	* ->d_lock held by somebody
> 	* refcount <= 0
> 	* ... or an architecture not supporting lockref use of
> cmpxchg - sparc, anything non-SMP, config with spinlock debugging...
> 
> We could add a fallback, but it would be a clumsy API - we'd have
> to distinguish between:
> 	(1) refcount > 1 - decremented, lock not held on return
> 	(2) refcount < 1 - left alone, probably no sense to hold the lock
> 	(3) refcount is 1, no cmpxchg - decremented, lock held on return
> 	(4) refcount is 1, cmpxchg supported - decremented, lock *NOT* held
> 	    on return.
> We want to return with no lock held in case (4); that's the whole point of that
> thing.  We very much do not want to have the fallback in case (3) return without
> a lock, since the caller might have to retake it in that case.
> So it wouldn't be more convenient than doing the fallback in the caller and
> it would be very easy to screw up, especially since the test coverage would
> suck - no way to test (3) and (4) on the same kernel build.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Looks like a good idea,
Reviewed-by: Christian Brauner <brauner@kernel.org>
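For reference, a minimal sketch of the caller-side fallback described
above, assuming the general shape of fast_dput() in fs/dcache.c (heavily
simplified; the lockless retain checks are elided):

	static inline bool fast_dput(struct dentry *dentry)
	{
		int ret;

		/* lockless decrement - the one lockref primitive without a fallback */
		ret = lockref_put_return(&dentry->d_lockref);
		if (unlikely(ret < 0)) {
			/* fastpath refused: lock held, refcount <= 0, or no cmpxchg */
			spin_lock(&dentry->d_lock);
			if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
				/* underflow: warn, unlock, tell caller to do nothing else */
				spin_unlock(&dentry->d_lock);
				return true;
			}
			/* do the decrement ourselves; ->d_lock is held now */
			dentry->d_lockref.count--;
			goto locked;
		}
		if (likely(ret))
			return true;	/* other references remain - nothing to do */

		/* hit zero locklessly; retain checks elided, fall back to locked path */
		spin_lock(&dentry->d_lock);
	locked:
		/* false means: refcount is zero and ->d_lock is held */
		return false;
	}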

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 10/22] fast_dput(): new rules for refcount
  2023-11-09  6:20                 ` [PATCH 10/22] fast_dput(): new rules for refcount Al Viro
@ 2023-11-09 14:54                   ` Christian Brauner
  2023-11-09 20:52                     ` Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 14:54 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:44AM +0000, Al Viro wrote:
> Currently the "need caller to do more work" path in fast_dput()
> has refcount decremented, then, with ->d_lock held and
> refcount verified to have reached 0 fast_dput() forcibly resets
> the refcount to 1.
> 
> Move that resetting refcount to 1 into the callers; later in
> the series it will be massaged out of existence.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Ok, this is safe to do because of

[PATCH 09/22] fast_dput(): handle underflows gracefully
https://lore.kernel.org/linux-fsdevel/20231109062056.3181775-9-viro@zeniv.linux.org.uk

as return false from fast_dput() now always means refcount is zero.

Reviewed-by: Christian Brauner <brauner@kernel.org>

>  fs/dcache.c | 9 ++-------
>  1 file changed, 2 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/dcache.c b/fs/dcache.c
> index e02b3c81bc02..9a3eeee02500 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -847,13 +847,6 @@ static inline bool fast_dput(struct dentry *dentry)
>  		spin_unlock(&dentry->d_lock);
>  		return true;
>  	}
> -
> -	/*
> -	 * Re-get the reference we optimistically dropped. We hold the
> -	 * lock, and we just tested that it was zero, so we can just
> -	 * set it to 1.
> -	 */
> -	dentry->d_lockref.count = 1;
>  	return false;
>  }
>  
> @@ -896,6 +889,7 @@ void dput(struct dentry *dentry)
>  		}
>  
>  		/* Slow case: now with the dentry lock held */
> +		dentry->d_lockref.count = 1;
>  		rcu_read_unlock();
>  
>  		if (likely(retain_dentry(dentry))) {
> @@ -930,6 +924,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
>  		return;
>  	}
>  	rcu_read_unlock();
> +	dentry->d_lockref.count = 1;
>  	if (!retain_dentry(dentry))
>  		__dput_to_list(dentry, list);
>  	spin_unlock(&dentry->d_lock);
> -- 
> 2.39.2
> 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 11/22] __dput_to_list(): do decrement of refcount in the callers
  2023-11-09  6:20                 ` [PATCH 11/22] __dput_to_list(): do decrement of refcount in the callers Al Viro
@ 2023-11-09 15:21                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 15:21 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:45AM +0000, Al Viro wrote:
> ... and rename it to to_shrink_list(), seeing that it no longer
> drops any references
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 12/22] Make retain_dentry() neutral with respect to refcounting
  2023-11-09  6:20                 ` [PATCH 12/22] Make retain_dentry() neutral with respect to refcounting Al Viro
@ 2023-11-09 15:22                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 15:22 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:46AM +0000, Al Viro wrote:
> retain_dentry() used to decrement refcount if and only if it returned
> true.  Lift those decrements into the callers.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 13/22] __dentry_kill(): get consistent rules for victim's refcount
  2023-11-09  6:20                 ` [PATCH 13/22] __dentry_kill(): get consistent rules for victim's refcount Al Viro
@ 2023-11-09 15:27                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 15:27 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:47AM +0000, Al Viro wrote:
> Currently we call it with refcount equal to 1 when called from
> dentry_kill(); all other callers have it equal to 0.
> 
> Make it always be called with zero refcount; on this step we
> just decrement it before the calls in dentry_kill().  That is
> safe, since all places that care about the value of refcount
> either do that under ->d_lock or hold a reference to dentry

Also worth noting that dentry_kill() is marked with
__releases(dentry->d_lock).

I'm usually pretty liberal with lockdep_assert asserts as well because
it gives nice splats on testing kernels and makes for much faster review
because the assumptions are visible directly in the helper.
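For instance, something along these lines near the top of the helper - a
sketch, not part of the patch:

	static void __dentry_kill(struct dentry *dentry)
	{
		/* document (and, on lockdep kernels, enforce) the precondition */
		lockdep_assert_held(&dentry->d_lock);
		/* ... actual killing elided ... */
	}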

> in question.  Either is sufficient to prevent observing a
> dentry immediately prior to __dentry_kill() getting called
> from dentry_kill().
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 14/22] dentry_kill(): don't bother with retain_dentry() on slow path
  2023-11-09  6:20                 ` [PATCH 14/22] dentry_kill(): don't bother with retain_dentry() on slow path Al Viro
@ 2023-11-09 15:53                   ` Christian Brauner
  2023-11-09 21:29                     ` Al Viro
  0 siblings, 1 reply; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 15:53 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:48AM +0000, Al Viro wrote:
> We have already checked it and dentry used to look not worthy
> of keeping.  The only hard obstacle to evicting dentry is
> non-zero refcount; everything else is advisory - e.g. memory
> pressure could evict any dentry found with refcount zero.
> On the slow path in dentry_kill() we had dropped and regained
> ->d_lock; we must recheck the refcount, but everything else
> is not worth bothering with.
> 
> Note that filesystem can not count upon ->d_delete() being
> called for dentry - not even once.  Again, memory pressure
> (as well as d_prune_aliases(), or attempted rmdir() of ancestor,
> or...) will not call ->d_delete() at all.
> 
> So from the correctness point of view we are fine doing the
> check only once.  And it makes things simpler down the road.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Ok, that again relies on earlier patches that ensure that dentry_kill()
isn't called with refcount == 0 afaiu,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 15/22] Call retain_dentry() with refcount 0
  2023-11-09  6:20                 ` [PATCH 15/22] Call retain_dentry() with refcount 0 Al Viro
@ 2023-11-09 16:09                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 16:09 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:49AM +0000, Al Viro wrote:
> Instead of bumping it from 0 to 1, calling retain_dentry(), then
> decrementing it back to 0 (with ->d_lock held all the way through),
> just leave refcount at 0 through all of that.
> 
> It will have a visible effect for ->d_delete() - now it can be
> called with refcount 0 instead of 1 and it can no longer play
> silly buggers with dropping/regaining ->d_lock.  Not that any
> in-tree instances tried to (it's pretty hard to get right).
> 
> Any out-of-tree ones will have to adjust (assuming they need any
> changes).
> 
> Note that we do not need to extend rcu-critical area here - we have
> verified that refcount is non-negative after having grabbed ->d_lock,
> so nobody will be able to free dentry until they get into __dentry_kill(),
> which won't happen until they manage to grab ->d_lock.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 16/22] fold the call of retain_dentry() into fast_dput()
  2023-11-09  6:20                 ` [PATCH 16/22] fold the call of retain_dentry() into fast_dput() Al Viro
@ 2023-11-09 16:17                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 16:17 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:50AM +0000, Al Viro wrote:
> Calls of retain_dentry() happen immediately after getting false
> from fast_dput() and getting true from retain_dentry() is
> treated the same way as non-zero refcount would be treated by
> fast_dput() - unlock dentry and bugger off.
> 
> Doing that in fast_dput() itself is simpler.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Slight context change in that retain_dentry()'s now called with rcu read
lock held. Not that it should matter,

Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 17/22] don't try to cut corners in shrink_lock_dentry()
  2023-11-09  6:20                 ` [PATCH 17/22] don't try to cut corners in shrink_lock_dentry() Al Viro
@ 2023-11-09 17:20                   ` Christian Brauner
  2023-11-09 21:45                     ` Al Viro
  2023-11-09 17:39                   ` Linus Torvalds
  1 sibling, 1 reply; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 17:20 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:51AM +0000, Al Viro wrote:
> That is to say, do *not* treat the ->d_inode or ->d_parent changes
> as "it's hard, return false; somebody must have grabbed it, so
> even if has zero refcount, we don't need to bother killing it -
> final dput() from whoever grabbed it would've done everything".
> 
> First of all, that is not guaranteed.  It might have been dropped
> by shrink_kill() handling of victim's parent, which would've found
> it already on a shrink list (ours) and decided that they don't need
> to put it on their shrink list.
> 
> What's more, dentry_kill() is doing pretty much the same thing,
> cutting its own set of corners (it assumes that dentry can't
> go from positive to negative, so its inode can change but only once
> and only in one direction).
> 
> Doing that right allows us to get rid of that not-quite-duplication
> and removes the only reason for re-incrementing refcount before
> the call of dentry_kill().
> 
> Replacement is called lock_for_kill(); called under rcu_read_lock
> and with ->d_lock held.  If it returns false, dentry has non-zero
> refcount and the same locks are held.  If it returns true,
> dentry has zero refcount and all locks required by __dentry_kill()
> are taken.
> 
> Part of __lock_parent() had been lifted into lock_parent() to
> allow its reuse.  Now it's called with rcu_read_lock already
> held and dentry already unlocked.
> 
> Note that this is not the final change - locking requirements for
> __dentry_kill() are going to change later in the series and the
> set of locks taken by lock_for_kill() will be adjusted.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

It's a bit unfortunate that __lock_parent() locks the parent *and* may
lock the child, which isn't really obvious from the name. It just becomes
clear that this is assumed by how callers release the child's lock.

>  fs/dcache.c | 159 ++++++++++++++++++++++------------------------------
>  1 file changed, 66 insertions(+), 93 deletions(-)
> 
> diff --git a/fs/dcache.c b/fs/dcache.c
> index 23afcd48c1a9..801502871671 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -625,8 +625,6 @@ static void __dentry_kill(struct dentry *dentry)
>  static struct dentry *__lock_parent(struct dentry *dentry)
>  {
>  	struct dentry *parent;
> -	rcu_read_lock();
> -	spin_unlock(&dentry->d_lock);
>  again:
>  	parent = READ_ONCE(dentry->d_parent);
>  	spin_lock(&parent->d_lock);
> @@ -642,7 +640,6 @@ static struct dentry *__lock_parent(struct dentry *dentry)
>  		spin_unlock(&parent->d_lock);
>  		goto again;
>  	}
> -	rcu_read_unlock();
>  	if (parent != dentry)
>  		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
>  	else
> @@ -657,7 +654,64 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
>  		return NULL;
>  	if (likely(spin_trylock(&parent->d_lock)))
>  		return parent;
> -	return __lock_parent(dentry);
> +	rcu_read_lock();
> +	spin_unlock(&dentry->d_lock);
> +	parent = __lock_parent(dentry);
> +	rcu_read_unlock();
> +	return parent;
> +}
> +
> +/*
> + * Lock a dentry for feeding it to __dentry_kill().
> + * Called under rcu_read_lock() and dentry->d_lock; the former
> + * guarantees that nothing we access will be freed under us.
> + * Note that dentry is *not* protected from concurrent dentry_kill(),
> + * d_delete(), etc.
> + *
> + * Return false if dentry is busy.  Otherwise, return true and have
> + * that dentry's inode and parent both locked.
> + */
> +
> +static bool lock_for_kill(struct dentry *dentry)
> +{
> +	struct inode *inode = dentry->d_inode;
> +	struct dentry *parent = dentry->d_parent;
> +
> +	if (unlikely(dentry->d_lockref.count))
> +		return false;
> +
> +	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
> +		goto slow;
> +	if (dentry == parent)
> +		return true;
> +	if (likely(spin_trylock(&parent->d_lock)))
> +		return true;
> +
> +	if (inode)
> +		spin_unlock(&inode->i_lock);
> +slow:
> +	spin_unlock(&dentry->d_lock);
> +
> +	for (;;) {
> +		if (inode)
> +			spin_lock(&inode->i_lock);
> +		parent = __lock_parent(dentry);

We're under rcu here. Are we sure that this can't trigger rcu timeouts
because we're spinning? Maybe there's a reason that's not an issue here.

That spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED) in
__lock_parent() is there for the sake of lockdep to verify that the
parent lock is always acquired before the child lock?

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 18/22] fold dentry_kill() into dput()
  2023-11-09  6:20                 ` [PATCH 18/22] fold dentry_kill() into dput() Al Viro
@ 2023-11-09 17:22                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 17:22 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:52AM +0000, Al Viro wrote:
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 19/22] to_shrink_list(): call only if refcount is 0
  2023-11-09  6:20                 ` [PATCH 19/22] to_shrink_list(): call only if refcount is 0 Al Viro
@ 2023-11-09 17:29                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 17:29 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:53AM +0000, Al Viro wrote:
> The only thing it does if refcount is not zero is d_lru_del(); no
> point, IMO, seeing that plain dput() does nothing of that sort...
> 
> Note that 2 of 3 current callers are guaranteed that refcount is 0.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Ok, I don't fully understand this one but I see nothing obviously wrong
with it so,

Acked-by: Christian Brauner <brauner@kernel.org>

>  fs/dcache.c | 7 ++-----
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/dcache.c b/fs/dcache.c
> index aa9f7ee7a603..49585f2ad896 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -915,8 +915,7 @@ __must_hold(&dentry->d_lock)
>  	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
>  		if (dentry->d_flags & DCACHE_LRU_LIST)
>  			d_lru_del(dentry);
> -		if (!dentry->d_lockref.count)
> -			d_shrink_add(dentry, list);
> +		d_shrink_add(dentry, list);
>  	}
>  }
>  
> @@ -1110,10 +1109,8 @@ EXPORT_SYMBOL(d_prune_aliases);
>  static inline void shrink_kill(struct dentry *victim, struct list_head *list)
>  {
>  	struct dentry *parent = victim->d_parent;
> -	if (parent != victim) {
> -		--parent->d_lockref.count;
> +	if (parent != victim && !--parent->d_lockref.count)
>  		to_shrink_list(parent, list);
> -	}
>  	__dentry_kill(victim);
>  }
>  
> -- 
> 2.39.2
> 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 20/22] switch select_collect{,2}() to use of to_shrink_list()
  2023-11-09  6:20                 ` [PATCH 20/22] switch select_collect{,2}() to use of to_shrink_list() Al Viro
@ 2023-11-09 17:31                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 17:31 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:54AM +0000, Al Viro wrote:
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 21/22] d_prune_aliases(): use a shrink list
  2023-11-09  6:20                 ` [PATCH 21/22] d_prune_aliases(): use a shrink list Al Viro
@ 2023-11-09 17:33                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-09 17:33 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:55AM +0000, Al Viro wrote:
> Instead of dropping aliases one by one, restarting, etc., just
> collect them into a shrink list and kill them off in one pass.
> 
> We don't really need the restarts - one alias can't pin another
> (directory has only one alias, and couldn't be its own ancestor
> anyway), so collecting everything that is not busy and taking it
> out would take care of everything evictable that had been there
> as we entered the function.  And new aliases added while we'd
> been dropping old ones could just as easily have appeared right
> as we return to caller...
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Nice cleanup,
Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 17/22] don't try to cut corners in shrink_lock_dentry()
  2023-11-09  6:20                 ` [PATCH 17/22] don't try to cut corners in shrink_lock_dentry() Al Viro
  2023-11-09 17:20                   ` Christian Brauner
@ 2023-11-09 17:39                   ` Linus Torvalds
  2023-11-09 18:11                     ` Linus Torvalds
  2023-11-09 18:20                     ` Al Viro
  1 sibling, 2 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-09 17:39 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Christian Brauner

On Wed, 8 Nov 2023 at 22:23, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
>  static struct dentry *__lock_parent(struct dentry *dentry)
>  {
>         struct dentry *parent;
> -       rcu_read_lock();
> -       spin_unlock(&dentry->d_lock);
>  again:
>         parent = READ_ONCE(dentry->d_parent);
>         spin_lock(&parent->d_lock);

Can we rename this while at it?

That name *used* to make sense, in that the function was entered with
the dentry lock held, and then it returned with the dentry lock *and*
the parent lock held.

But now you've changed the rules so that the dentry lock is *not* held
at entry, so now the semantics of that function is essentially "lock
dentry and parent". Which I think means that the name should change to
reflect that.

Finally: it does look like most callers actually did hold the dentry
lock, and that you just moved the

        spin_unlock(&dentry->d_lock);

from inside that function to the caller. I don't hate that, but now
that I look at it, I get the feeling that what we *should* have done
is

  static struct dentry *__lock_parent(struct dentry *dentry)
  {
        struct dentry *parent = dentry->d_parent;
        if (try_spin_lock(&parent->d_lock))
                return parent;
        /* Uhhuh - need to get the parent lock first */
        .. old code goes here ..

but that won't work with the new world order.

So I get the feeling that maybe instead of renaming it for the new
semantics, maybe the old semantics of "called with the dentry lock
held" were simply better"

                  Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 17/22] don't try to cut corners in shrink_lock_dentry()
  2023-11-09 17:39                   ` Linus Torvalds
@ 2023-11-09 18:11                     ` Linus Torvalds
  2023-11-09 18:20                     ` Al Viro
  1 sibling, 0 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-09 18:11 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, Christian Brauner

On Thu, 9 Nov 2023 at 09:39, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Can we rename this while at it?

Never mind. I didn't notice that the thing disappears entirely in 22/22.

Just ignore my blind ass.

                Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 17/22] don't try to cut corners in shrink_lock_dentry()
  2023-11-09 17:39                   ` Linus Torvalds
  2023-11-09 18:11                     ` Linus Torvalds
@ 2023-11-09 18:20                     ` Al Viro
  1 sibling, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-09 18:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Christian Brauner

On Thu, Nov 09, 2023 at 09:39:09AM -0800, Linus Torvalds wrote:
> On Wed, 8 Nov 2023 at 22:23, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> >  static struct dentry *__lock_parent(struct dentry *dentry)
> >  {
> >         struct dentry *parent;
> > -       rcu_read_lock();
> > -       spin_unlock(&dentry->d_lock);
> >  again:
> >         parent = READ_ONCE(dentry->d_parent);
> >         spin_lock(&parent->d_lock);
> 
> Can we rename this while at it?
> 
> That name *used* to make sense, in that the function was entered with
> the dentry lock held, and then it returned with the dentry lock *and*
> the parent lock held.
> 
> But now you've changed the rules so that the dentry lock is *not* held
> at entry, so now the semantics of that function is essentially "lock
> dentry and parent". Which I think means that the name should change to
> reflect that.
> 
> Finally: it does look like most callers actually did hold the dentry
> lock, and that you just moved the
> 
>         spin_unlock(&dentry->d_lock);
> 
> from inside that function to the caller. I don't hate that, but now
> that I look at it, I get the feeling that what we *should* have done
> is
> 
>   static struct dentry *__lock_parent(struct dentry *dentry)
>   {
>         struct dentry *parent = dentry->d_parent;
>         if (try_spin_lock(&parent->d_lock))
>                 return parent;
>         /* Uhhuh - need to get the parent lock first */
>         .. old code goes here ..
> 
> but that won't work with the new world order.

Can't - currently lock_for_kill() uses it in a loop.  Can't have trylocks
in there, or realtime setups will get unhappy.  More to the point, the whole
function is gone by the end of the series.  Along with lock_parent().

The only reason why we needed that thing is that we lock the parent too
early; that's where the last commit in the series is a big win.  There
we remove from the parent's list of children in the very end, when we'd
already made the victim negative (and unlocked it); there ->d_parent
is stable and we can simply lock that, then lock dentry.

We still need a loop in lock_for_kill() to get the inode locked along
with dentry, but that's less convoluted (the ordering between two
->d_lock can change; ->i_lock is always safe to take before ->d_lock).
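For reference, that loop ends up with roughly this shape in the final
patch (a simplified extract of the hunk quoted earlier in the thread):

	/* take ->i_lock before ->d_lock, retrying if the inode changed
	 * while both locks were dropped */
	do {
		spin_unlock(&dentry->d_lock);
		spin_lock(&inode->i_lock);
		spin_lock(&dentry->d_lock);
		if (likely(inode == dentry->d_inode))
			break;
		spin_unlock(&inode->i_lock);
		inode = dentry->d_inode;
	} while (inode);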

> So I get the feeling that maybe instead of renaming it for the new
> semantics, maybe the old semantics of "called with the dentry lock
> held" were simply better"

lock_parent() goes away when d_prune_aliases() is switched to a shrink list;
after that __lock_parent() is used only in that loop in lock_for_kill()
and only until (22/22) when lock_for_kill() stops touching the parent.
After that it's simply gone.

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal()
  2023-11-09 14:01                   ` Chuck Lever
@ 2023-11-09 18:47                     ` Al Viro
  2023-11-09 18:50                       ` Chuck Lever III
  0 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09 18:47 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Linus Torvalds, linux-fsdevel, Christian Brauner

On Thu, Nov 09, 2023 at 09:01:19AM -0500, Chuck Lever wrote:
> On Thu, Nov 09, 2023 at 06:20:36AM +0000, Al Viro wrote:
> > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > Tested-by: Jeff Layton <jlayton@kernel.org>
> > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> 
> Future me is going to be mightily confused by the lack of a patch
> description. I went back to the series cover letter and found some
> text that would be nice to include here:

Does the following work for you?

switch nfsd_client_rmdir() to use of simple_recursive_removal()

nfsd_client_rmdir() open-codes a subset of simple_recursive_removal().
Conversion to calling simple_recursive_removal() allows to clean things
up quite a bit.

While we are at it, nfsdfs_create_files() doesn't need to mess with "pick    
the reference to struct nfsdfs_client from the already created parent" -
the caller already knows it (that's where the parent got it from,
after all), so we might as well just pass it as an explicit argument.
So __get_nfsdfs_client() is only needed in get_nfsdfs_client() and
can be folded in there.

Incidentally, the locking in get_nfsdfs_client() is too heavy - we don't 
need ->i_rwsem for that, ->i_lock serves just fine.

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal()
  2023-11-09 18:47                     ` Al Viro
@ 2023-11-09 18:50                       ` Chuck Lever III
  0 siblings, 0 replies; 119+ messages in thread
From: Chuck Lever III @ 2023-11-09 18:50 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel, Christian Brauner



> On Nov 9, 2023, at 1:47 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> 
> On Thu, Nov 09, 2023 at 09:01:19AM -0500, Chuck Lever wrote:
>> On Thu, Nov 09, 2023 at 06:20:36AM +0000, Al Viro wrote:
>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
>>> Tested-by: Jeff Layton <jlayton@kernel.org>
>>> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
>> 
>> Future me is going to be mightily confused by the lack of a patch
>> description. I went back to the series cover letter and found some
>> text that would be nice to include here:
> 
> Does the following work for you?
> 
> switch nfsd_client_rmdir() to use of simple_recursive_removal()
> 
> nfsd_client_rmdir() open-codes a subset of simple_recursive_removal().
> Conversion to calling simple_recursive_removal() allows to clean things
> up quite a bit.
> 
> While we are at it, nfsdfs_create_files() doesn't need to mess with "pick    
> the reference to struct nfsdfs_client from the already created parent" -
> the caller already knows it (that's where the parent got it from,
> after all), so we might as well just pass it as an explicit argument.
> So __get_nfsdfs_client() is only needed in get_nfsdfs_client() and
> can be folded in there.
> 
> Incidentally, the locking in get_nfsdfs_client() is too heavy - we don't 
> need ->i_rwsem for that, ->i_lock serves just fine.

Very nice, thanks.

Acked-by: Chuck Lever <chuck.lever@oracle.com>

--
Chuck Lever



^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 04/22] dentry: switch the lists of children to hlist
  2023-11-09 13:48                   ` Christian Brauner
@ 2023-11-09 19:32                     ` Al Viro
  0 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-09 19:32 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 02:48:19PM +0100, Christian Brauner wrote:
> On Thu, Nov 09, 2023 at 06:20:38AM +0000, Al Viro wrote:
> > Saves a pointer per struct dentry and actually makes the things less
> 
> Which you're giving back to DNAME_INLINE_LEN.

Have to - we want the size to stay a multiple of 64.  So DNAME_INLINE_LEN serves
as a sump - any space savings we get go in there, just as any additional fields
have to pull the space out of there.

FWIW, from distribution of name lengths on 3 local boxen, with reasonably diverse
contents of filesystems:

< 24: 90.6877% 89.8033% 89.3202%
< 25: 92.2120% 90.4324% 90.4652%
< 26: 93.5858% 95.0555% 92.3849%
< 27: 94.6277% 95.4424% 93.1948%
< 28: 95.4827% 95.7796% 93.9134%
< 29: 96.1926% 96.0851% 94.5449%
< 30: 96.7963% 96.3503% 95.1006%
< 32: 97.6930% 96.5792% 95.5681%
< 33: 98.0392% 96.7943% 95.9879%
< 34: 98.3134% 96.9829% 96.3353%
< 35: 98.5493% 97.1352% 96.6313%
< 36: 98.7468% 97.2757% 96.8890%
< 37: 98.9134% 97.4192% 97.1199%
< 38: 99.0515% 97.5506% 97.3372%
< 39: 99.3650% 97.6394% 97.4857%
< 40: 99.4606% 98.8016% 97.7237%

So 32 is tolerable, but going down would rapidly become unpleasant.  This series
does not introduce any new fields, but it's nice to be able to do so without
causing PITA for long names.
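For a rough feel of the arithmetic being described (illustration only -
none of these macros exist in the real header, and the numbers assume the
usual 64-bit configuration where struct dentry is kept at three 64-byte
cachelines):

	#define DENTRY_BYTES		192	/* the "multiple of 64" above */
	#define DENTRY_FIXED_OLD	160	/* fixed fields, list_head of children */
	#define DENTRY_FIXED_NEW	152	/* hlist_head of children: 8 bytes less */

	/* d_iname[] soaks up whatever is left over */
	#define DNAME_INLINE_OLD	(DENTRY_BYTES - DENTRY_FIXED_OLD)	/* 32 */
	#define DNAME_INLINE_NEW	(DENTRY_BYTES - DENTRY_FIXED_NEW)	/* 40 */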

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 07/22] shrink_dentry_list(): no need to check that dentry refcount is marked dead
  2023-11-09 13:53                   ` Christian Brauner
@ 2023-11-09 20:28                     ` Al Viro
  0 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-09 20:28 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 02:53:04PM +0100, Christian Brauner wrote:
> On Thu, Nov 09, 2023 at 06:20:41AM +0000, Al Viro wrote:
> > ... we won't see DCACHE_MAY_FREE on anything that is *not* dead
> > and checking d_flags is just as cheap as checking refcount.
> > 
> > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> > ---
> 
> Could also be a WARN_ON_ONCE() on d_lockref.count > 0 if DCACHE_MAY_FREE
> is set but probably doesn't matter,

>= 0, actually, but... TBH, in longer run I would rather represent the
empty husk state (instance just waiting for shrink_dentry_list() to remove
it from its list and free the sucker) not by a bit in ->d_flags, but
by a specific negative value in ->d_lockref.count.

After this series we have the following picture: all real instances come
from __d_alloc().  Possible states after that
  Busy <-> Retained -> Dying -> Freeing
                         |        ^
			 V        |
			 Husk ----/

Busy and Retained are live dentries, with positive and zero refcount
resp.; that's the pure refcounting land.  Eventually we get to
successful lock_for_kill(), which leads to a call of __dentry_kill().
That's where the state becomes Dying.  On the way out of __dentry_kill()
(after ->d_prune()/->d_iput()/->d_release()) we either switch to Freeing
(only RCU references remain, actual memory object freed by the end of it)
or Husk (the only non-RCU reference is that of a shrink list it's on).
Husk, in turn, switches to Freeing as soon as shrink_dentry_list() gets
around to it and takes it out of its shrink list.  If shrink_dentry_list()
picks an instance in Dying state, it quietly removes it from the shrink
list and leaves it for __dentry_kill() to deal with.

All transitions are under ->d_lock.  ->d_lockref.count for those is
positive in Busy, zero in Retained and -128 in Dying, Husk and Freeing.
Husk is distinguished by having DCACHE_MAY_FREE set.  Freeing has no
visible difference from Dying.

All refcount changes are under ->d_lock.  None of them should _ever_
change the negative values.  If the last part is easy to verify (right
now it's up to "no refcount overflows, all callers of dget_dlock() are
guaranteed to be dealing with Busy or Retained instances"), it might
make sense to use 3 specific negative values for Dying/Husk/Freeing.
What's more, it might make sense to deal with overflows by adding a
separate unsigned long __d_large_count_dont_you_ever_touch_that_directly;
and have the overflow switch to the 4th special negative number indicating
that real counter sits in there.

I'm not 100% convinced that this is the right way to handle that mess,
but it's an approach I'm at least keeping in mind.  Anyway, we need to
get the damn thing documented and understandable before dealing with
overflows becomes even remotely possible.  As it is, it's way too
subtle and reasoning about correctness is too convoluted and brittle.

PS: "documented" includes explicit description of states, their
representations and transitions between them, as well as the objects
associated with the instance in each of those, what references
are allowed in each state, etc.  And the things like in-lookup,
cursor, etc. - live dentries have sub-states as well...
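To make the encoding idea above concrete, it might look something like
this - names and exact values made up purely for illustration (the series
itself keeps -128 plus DCACHE_MAY_FREE):

	/* hypothetical special values for d_lockref.count */
	enum d_dead_refcount {
		D_REF_DYING	= -128,	/* __dentry_kill() in progress */
		D_REF_HUSK	= -127,	/* only a shrink-list reference left */
		D_REF_FREEING	= -126,	/* only RCU references remain */
		D_REF_OVERFLOW	= -125,	/* real count kept in a separate field */
	};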

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 09/22] fast_dput(): handle underflows gracefully
  2023-11-09 14:46                   ` Christian Brauner
@ 2023-11-09 20:39                     ` Al Viro
  0 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-09 20:39 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 03:46:21PM +0100, Christian Brauner wrote:
> On Thu, Nov 09, 2023 at 06:20:43AM +0000, Al Viro wrote:
> > If refcount is less than 1, we should just warn, unlock dentry and
> > return true, so that the caller doesn't try to do anything else.
> 
> That's effectively to guard against bugs in filesystems, not in dcache
> itself, right? Have we observed this frequently?

Hard to tell - it doesn't happen often, but... extra dput() somewhere
is not an impossible class of bugs.  I remember running into that
while doing work in namei.c; I certainly have seen failure exits in
random places that fucked refcounting up by dropping the wrong things.

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 10/22] fast_dput(): new rules for refcount
  2023-11-09 14:54                   ` Christian Brauner
@ 2023-11-09 20:52                     ` Al Viro
  0 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-09 20:52 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 03:54:34PM +0100, Christian Brauner wrote:
> On Thu, Nov 09, 2023 at 06:20:44AM +0000, Al Viro wrote:
> > Currently the "need caller to do more work" path in fast_dput()
> > has refcount decremented, then, with ->d_lock held and
> > refcount verified to have reached 0 fast_dput() forcibly resets
> > the refcount to 1.
> > 
> > Move that resetting refcount to 1 into the callers; later in
> > the series it will be massaged out of existence.
> > 
> > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> > ---
> 
> Ok, this is safe to do because of
> 
> [PATCH 09/22] fast_dput(): handle underflows gracefully
> https://lore.kernel.org/linux-fsdevel/20231109062056.3181775-9-viro@zeniv.linux.org.uk
> 
> as return false from fast_dput() now always means refcount is zero.

Not sure how to put it in commit message cleanly.  Perhaps something
like the following variant?

By now there is only one place in entire fast_dput() where we return
false; that happens after refcount had been decremented and found
(while holding ->d_lock) to be zero.  In that case, just prior to
returning false to caller, fast_dput() forcibly changes the refcount
from 0 to 1.

Lift that resetting refcount to 1 into the callers; later in
the series it will be massaged out of existence.

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 14/22] dentry_kill(): don't bother with retain_dentry() on slow path
  2023-11-09 15:53                   ` Christian Brauner
@ 2023-11-09 21:29                     ` Al Viro
  0 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-09 21:29 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 04:53:07PM +0100, Christian Brauner wrote:
> On Thu, Nov 09, 2023 at 06:20:48AM +0000, Al Viro wrote:
> > We have already checked it and dentry used to look not worthy
> > of keeping.  The only hard obstacle to evicting dentry is
> > non-zero refcount; everything else is advisory - e.g. memory
> > pressure could evict any dentry found with refcount zero.
> > On the slow path in dentry_kill() we had dropped and regained
> > ->d_lock; we must recheck the refcount, but everything else
> > is not worth bothering with.
> > 
> > Note that filesystem can not count upon ->d_delete() being
> > called for dentry - not even once.  Again, memory pressure
> > (as well as d_prune_aliases(), or attempted rmdir() of ancestor,
> > or...) will not call ->d_delete() at all.
> > 
> > So from the correctness point of view we are fine doing the
> > check only once.  And it makes things simpler down the road.
> > 
> > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> > ---
> 
> Ok, that again relies on earlier patches that ensure that dentry_kill()
> isn't called with refcount == 0 afaiu,

Huh?

There are two reasons to keep dentry alive - positive refcount and
a bunch of heuristics for "it might be nice to keep it around in
hash, even though its refcount is down to zero now".

Breakage on underflows aside, dentry_kill() had always been called with
refcount 1, victim locked and those heuristics saying "no point keeping
it around".  Then it grabs the rest of locks needed for actual killing;
if we are lucky and that gets done just on trylocks, that's it - we
decrement refcount (to 0 - we held ->d_lock all along) and pass the
sucker to __dentry_kill().  RIP.  If we had to drop and regain ->d_lock,
it is possible that somebody took an extra reference and it's no longer
possible to kill the damn thing.  In that case we just decrement the
refcount, drop the locks and that's it - we are done.

So far, so good, but there's an extra twist - in case we had to drop
and regain ->d_lock, dentry_kill() rechecks the "might be nice to
keep it around" heuristics and treats "it might be" same way as it
would deal with finding extra references taken by somebody while
->d_lock had not been held.  That is to say, it does refcount decrement
(to 0 - we'd just checked that it hadn't been increased from 1),
drops the locks and that's it.

The thing is, those heuristics are really "it might be nice to keep" -
there are trivial ways to force eviction of any unlocked dentry with
zero refcount.  So why bother rechecking those?  We have already
checked them just before calling dentry_kill() and got "nah, don't
bother keeping it", after all.  And we would be leaving it in the
state where it could be instantly evicted, heuristics notwithstanding,
so from correctness standpoint might as well decide not to keep
it and act as if that second call of retain_dentry() returned false.

Previous patches have very little to do with that - the only thing
that affects dentry_kill() is the (now gone) possibility of hitting
an underflow here.  If underflow happened, we were already screwed;
yes, this would've been one of the places where the breakage would
show up, but that's basically "what amusing kinds of behaviour would
that function exhibit on FUBAR data structures".


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 17/22] don't try to cut corners in shrink_lock_dentry()
  2023-11-09 17:20                   ` Christian Brauner
@ 2023-11-09 21:45                     ` Al Viro
  2023-11-10  9:07                       ` Christian Brauner
  0 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-09 21:45 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:08PM +0100, Christian Brauner wrote:

> It's a bit unfortunate that __lock_parent() locks the parent *and* may
> lock the child, which isn't really obvious from the name. It just becomes
> clear that this is assumed by how callers release the child's lock.

__lock_parent() is gone by the end of the series.

> We're under rcu here. Are we sure that this can't trigger rcu timeouts
> because we're spinning? Maybe there's a reason that's not an issue here.

Spinning happens only if somebody is busy moving that dentry from
directory to directory or back-and-forth turning it negative/positive
with different inode.  It's not a livelock situation - for each
iteration you need a successful rename() and/or unlink()/creat() pair
on the dentry in question.  Coming rapidly enough to cause you
spinning there...

Note that lock_parent() had that loop under rcu for a long time;
so did dget_parent().  I don't remember seeing rcu timeout warnings
about either...

> That spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED) in
> __lock_parent() is there for the sake of lockdep to verify that the
> parent lock is always acquired before the child lock?

Yes.
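In other words, the usual parent-then-child ordering, with the nested
annotation telling lockdep that the child's ->d_lock sits one level below
the parent's in the same lock class; schematically (a sketch, not code
from the series):

	/* parent before child; the child's ->d_lock is the nested one */
	static void lock_pair(struct dentry *parent, struct dentry *dentry)
	{
		spin_lock(&parent->d_lock);
		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	}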

^ permalink raw reply	[flat|nested] 119+ messages in thread

* lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-01 18:19                   ` Al Viro
@ 2023-11-10  4:20                     ` Al Viro
  2023-11-10  5:57                       ` Linus Torvalds
  0 siblings, 1 reply; 119+ messages in thread
From: Al Viro @ 2023-11-10  4:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel

On Wed, Nov 01, 2023 at 06:19:10PM +0000, Al Viro wrote:
 
> gcc-12 on x86 turns the series of ifs into
>         movl    %edi, %eax
> 	andl    $32832, %eax
> 	cmpl    $32832, %eax
> 	jne     .L17
> 	andl    $168, %edi
> 	jne     .L17
> instead of combining that into
>         andl    $33000, %edi
> 	cmpl    $32832, %edi
> 	jne     .L17
> 
> OTOH, that's not much of pessimization...  Up to you.

	FWIW, on top of current #work.dcache2 the following delta might be worth
looking into.  Not sure if it's less confusing that way, though - I'd been staring
at that place for too long.  Code generation is slightly suboptimal with recent
gcc, but only marginally so.

diff --git a/fs/dcache.c b/fs/dcache.c
index bd57b9a08894..9e1486db64a7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -665,30 +665,57 @@ static bool lock_for_kill(struct dentry *dentry)
 	return false;
 }
 
-static inline bool retain_dentry(struct dentry *dentry)
+/*
+ * Decide if dentry is worth retaining.  Usually this is called with dentry
+ * locked; if not locked, we are more limited and might not be able to tell
+ * without a lock.  False in this case means "punt to locked path and recheck".
+ *
+ * In case we aren't locked, these predicates are not "stable". However, it is
+ * sufficient that at some point after we dropped the reference the dentry was
+ * hashed and the flags had the proper value. Other dentry users may have
+ * re-gotten a reference to the dentry and change that, but our work is done -
+ * we can leave the dentry around with a zero refcount.
+ */
+static inline bool retain_dentry(struct dentry *dentry, bool locked)
 {
-	WARN_ON(d_in_lookup(dentry));
+	unsigned int d_flags;
 
-	/* Unreachable? Get rid of it */
+	smp_rmb();
+	d_flags = READ_ONCE(dentry->d_flags);
+
+	// Unreachable? Nobody would be able to look it up, no point retaining
 	if (unlikely(d_unhashed(dentry)))
 		return false;
 
-	if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+	// Same if it's disconnected
+	if (unlikely(d_flags & DCACHE_DISCONNECTED))
 		return false;
 
-	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
-		if (dentry->d_op->d_delete(dentry))
+	// ->d_delete() might tell us not to bother, but that requires
+	// ->d_lock; can't decide without it
+	if (unlikely(d_flags & DCACHE_OP_DELETE)) {
+		if (!locked || dentry->d_op->d_delete(dentry))
 			return false;
 	}
 
-	if (unlikely(dentry->d_flags & DCACHE_DONTCACHE))
+	// Explicitly told not to bother
+	if (unlikely(d_flags & DCACHE_DONTCACHE))
 		return false;
 
-	/* retain; LRU fodder */
-	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+	// At this point it looks like we ought to keep it.  We also might
+	// need to do something - put it on LRU if it wasn't there already
+	// and mark it referenced if it was on LRU, but not marked yet.
+	// Unfortunately, both actions require ->d_lock, so in lockless
+	// case we'd have to punt rather than doing those.
+	if (unlikely(!(d_flags & DCACHE_LRU_LIST))) {
+		if (!locked)
+			return false;
 		d_lru_add(dentry);
-	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
+	} else if (unlikely(!(d_flags & DCACHE_REFERENCED))) {
+		if (!locked)
+			return false;
 		dentry->d_flags |= DCACHE_REFERENCED;
+	}
 	return true;
 }
 
@@ -720,7 +747,6 @@ EXPORT_SYMBOL(d_mark_dontcache);
 static inline bool fast_dput(struct dentry *dentry)
 {
 	int ret;
-	unsigned int d_flags;
 
 	/*
 	 * try to decrement the lockref optimistically.
@@ -749,45 +775,18 @@ static inline bool fast_dput(struct dentry *dentry)
 		return true;
 
 	/*
-	 * Careful, careful. The reference count went down
-	 * to zero, but we don't hold the dentry lock, so
-	 * somebody else could get it again, and do another
-	 * dput(), and we need to not race with that.
-	 *
-	 * However, there is a very special and common case
-	 * where we don't care, because there is nothing to
-	 * do: the dentry is still hashed, it does not have
-	 * a 'delete' op, and it's referenced and already on
-	 * the LRU list.
-	 *
-	 * NOTE! Since we aren't locked, these values are
-	 * not "stable". However, it is sufficient that at
-	 * some point after we dropped the reference the
-	 * dentry was hashed and the flags had the proper
-	 * value. Other dentry users may have re-gotten
-	 * a reference to the dentry and change that, but
-	 * our work is done - we can leave the dentry
-	 * around with a zero refcount.
-	 *
-	 * Nevertheless, there are two cases that we should kill
-	 * the dentry anyway.
-	 * 1. free disconnected dentries as soon as their refcount
-	 *    reached zero.
-	 * 2. free dentries if they should not be cached.
+	 * Can we decide that decrement of refcount is all we needed without
+	 * taking the lock?  There's a very common case when it's all we need -
+	 * dentry looks like it ought to be retained and there's nothing else
+	 * to do.
 	 */
-	smp_rmb();
-	d_flags = READ_ONCE(dentry->d_flags);
-	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
-			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
-
-	/* Nothing to do? Dropping the reference was all we needed? */
-	if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+	if (retain_dentry(dentry, false))
 		return true;
 
 	/*
-	 * Not the fast normal case? Get the lock. We've already decremented
-	 * the refcount, but we'll need to re-check the situation after
-	 * getting the lock.
+	 * Either not worth retaining or we can't tell without the lock.
+	 * Get the lock, then.  We've already decremented the refcount to 0,
+	 * but we'll need to re-check the situation after getting the lock.
 	 */
 	spin_lock(&dentry->d_lock);
 
@@ -798,7 +797,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 * don't need to do anything else.
 	 */
 locked:
-	if (dentry->d_lockref.count || retain_dentry(dentry)) {
+	if (dentry->d_lockref.count || retain_dentry(dentry, true)) {
 		spin_unlock(&dentry->d_lock);
 		return true;
 	}
@@ -847,7 +846,7 @@ void dput(struct dentry *dentry)
 		dentry = __dentry_kill(dentry);
 		if (!dentry)
 			return;
-		if (retain_dentry(dentry)) {
+		if (retain_dentry(dentry, true)) {
 			spin_unlock(&dentry->d_lock);
 			return;
 		}

^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-10  4:20                     ` lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput()) Al Viro
@ 2023-11-10  5:57                       ` Linus Torvalds
  2023-11-10  6:22                         ` Linus Torvalds
                                           ` (2 more replies)
  0 siblings, 3 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-10  5:57 UTC (permalink / raw)
  To: Al Viro, Peter Zijlstra; +Cc: linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 4444 bytes --]

On Thu, 9 Nov 2023 at 20:20, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
>         FWIW, on top of current #work.dcache2 the following delta might be worth
> looking into.  Not sure if it's less confusing that way, though - I'd been staring
> at that place for too long.  Code generation is slightly suboptimal with recent
> gcc, but only marginally so.

I doubt the pure ALU ops and a couple of extra conditional branches
(that _probably_ predict well) matter at all.

Especially since this is all after lockref_put_return() has done that
locked cmpxchg, which *is* expensive.

My main reaction is that we use hlist_bl_unhashed() for d_unhashed(),
and we *intentionally* make it separate from the actual unhashing:

 - ___d_drop() does the __hlist_bl_del()

 - but d_unhashed() does hlist_bl_unhashed(), which checks
d_hash.pprev == NULL, and that's done by __d_drop

We even have a comment about this:

 * ___d_drop doesn't mark dentry as "unhashed"
 * (dentry->d_hash.pprev will be LIST_POISON2, not NULL).

and we depend on this in __d_move(), which will unhash things
temporarily, but not mark things unhashed, because they get re-hashed
again. Same goes for __d_add().
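
(Stripped of the locking and seqcount details, the split looks like this
- a simplified sketch of the fs/dcache.c code, not the real thing:)

  static void ___d_drop(struct dentry *dentry)
  {
        /* off the hash chain, but d_hash.pprev is left as LIST_POISON2 */
        __hlist_bl_del(&dentry->d_hash);
  }

  static void __d_drop(struct dentry *dentry)
  {
        ___d_drop(dentry);
        dentry->d_hash.pprev = NULL;    /* this is what d_unhashed() tests */
  }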

Anyway, what I'm actually getting at in a roundabout way is that maybe
we should make D_UNHASHED be another flag in d_flags, and *not* use
that d_hash.pprev field, and that would allow us to combine even more
of these tests in dput(), because now pretty much *all* of those
"retain_dentry()" checks would be about d_flags bits.

Hmm? As it is, it has that odd combination of d_flags and that
d_unhashed() test, so it's testing two different fields.
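
Something like this, in other words (D_UNHASHED is hypothetical, the bit
value picked purely for illustration, and so is the helper name):

  #define D_UNHASHED 0x08000000

  static inline bool nothing_to_do(unsigned int d_flags)
  {
        d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
                   DCACHE_DISCONNECTED | DCACHE_DONTCACHE | D_UNHASHED;
        return d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST);
  }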

Anyway, I really don't think it matters much, but since you brought up
the whole suboptimal code generation..

I tried to look at dput() code generation, and it doesn't look
horrendous as-is in your dcache2 branch.

If anything, the thing that hurts is the lockref_put_return() being
out-of-line even though this is basically the only caller, plus people
have pessimized the arch_spin_value_unlocked() implementation *again*,
so that it uses a volatile read, when the *WHOLE*POINT* of that
"VALUE" part of "value_unlocked()" is that we've already read the
value, and we should *not* re-read it.

Damn.

The bug seems to affect both the generic qspinlock code, and the
ticket-based one.

For the ticket based ones, it's PeterZ and commit 1bce11126d57
("asm-generic: ticket-lock: New generic ticket-based spinlock"), which
does

  static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
  {
        return !arch_spin_is_locked(&lock);
  }

where we've got that "lock" value, but then it takes the address of
it, and uses arch_spin_is_locked() on it, so now it will force a flush
to memory, and then an READ_ONCE() on it.

And for the qspinlock code, we had a similar

  static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
  {
        return !atomic_read(&lock.val);
  }

thing, where it does 'atomic_read()' on the value it was passed as an argument.

Stupid, stupid. It's literally forcing a re-read of a value that is
guaranteed to be on stack.
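
You can see the effect with a stand-alone toy version, away from the
kernel types - read_once() here plays the role that atomic_read() /
READ_ONCE() play in the real thing:

  struct fake_lock { int val; };

  static inline int read_once(const volatile int *p)
  {
        return *p;                      /* forced memory access */
  }

  /* uses the register copy it was handed */
  int value_unlocked_good(struct fake_lock lock)
  {
        return !lock.val;
  }

  /* spills the copy to the stack and reads it back */
  int value_unlocked_bad(struct fake_lock lock)
  {
        return !read_once(&lock.val);
  }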

I know this worked at some point, but that may have been many years
ago, since I haven't looked at this part of lockref code generation in
ages.

Anyway, as a result now all the lockref functions will do silly "store
the old lockref value to memory, in order to read it again" dances in
that CMPXCHG_LOOP() loop.

It literally makes that whole "is this an unlocked value" function
completely pointless. The *whole* and only point was "look, I already
loaded the value from memory, is this *VALUE* unlocked?"

Compared to that complete braindamage in the fast-path loop, the small
extra ALU ops in fast_dput() are nothing.

Peter - those functions are done exactly the wrong way around.
arch_spin_is_locked() should be implemented using
arch_spin_value_unlocked(), not this way around.

And the queued spinlocks should not do an atomic_read()of the argument
they get, they should just do "!lock.val.counter"

So something like this should fix lockref. ENTIRELY UNTESTED, except
now the code generation of lockref_put_return() looks much better,
without a pointless flush to the stack, and now it has no pointless
stack frame as a result.

Of course, it should probably be inlined, since it has only one user
(ok, two, since fast_dput() gets used twice), and that should make the
return value testing much better.

               Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 1730 bytes --]

 include/asm-generic/qspinlock.h |  2 +-
 include/asm-generic/spinlock.h  | 17 +++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index 995513fa2690..0655aa5b57b2 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -70,7 +70,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
  */
 static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
 {
-	return !atomic_read(&lock.val);
+	return !lock.val.counter;
 }
 
 /**
diff --git a/include/asm-generic/spinlock.h b/include/asm-generic/spinlock.h
index fdfebcb050f4..a35eda0ec2a2 100644
--- a/include/asm-generic/spinlock.h
+++ b/include/asm-generic/spinlock.h
@@ -68,11 +68,17 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 	smp_store_release(ptr, (u16)val + 1);
 }
 
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+	u32 val = lock.counter;
+	return ((val >> 16) == (val & 0xffff));
+}
+
 static __always_inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	u32 val = atomic_read(lock);
-
-	return ((val >> 16) != (val & 0xffff));
+	arch_spinlock_t val;
+	val.counter = atomic_read(lock);
+	return !arch_spin_value_unlocked(val);
 }
 
 static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
@@ -82,11 +88,6 @@ static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
 	return (s16)((val >> 16) - (val & 0xffff)) > 1;
 }
 
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
-	return !arch_spin_is_locked(&lock);
-}
-
 #include <asm/qrwlock.h>
 
 #endif /* __ASM_GENERIC_SPINLOCK_H */

^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-10  5:57                       ` Linus Torvalds
@ 2023-11-10  6:22                         ` Linus Torvalds
  2023-11-22  6:29                           ` Guo Ren
  2023-11-10  8:19                         ` Al Viro
  2023-11-22  7:19                         ` Guo Ren
  2 siblings, 1 reply; 119+ messages in thread
From: Linus Torvalds @ 2023-11-10  6:22 UTC (permalink / raw)
  To: Al Viro, Peter Zijlstra, Guo Ren, Ingo Molnar; +Cc: linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 889 bytes --]

On Thu, 9 Nov 2023 at 21:57, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> So something like this should fix lockref. ENTIRELY UNTESTED, except
> now the code generation of lockref_put_return() looks much better,
> without a pointless flush to the stack, and now it has no pointless
> stack frame as a result.

Heh. And because I was looking at Al's tree, I didn't notice that
commit c6f4a9002252 ("asm-generic: ticket-lock: Optimize
arch_spin_value_unlocked()") had solved the ticket spinlock part of
this in this merge window in the meantime.

The qspinlock implementation - which is what x86 uses - is still
broken in mainline, though.

So that part of my patch still stands. Now attached just the small
one-liner part. Adding Ingo and Guo Ren, who did the ticket lock part
(and looks to have done it very similarly to my suggested patch).

Ingo?

                     Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 532 bytes --]

 include/asm-generic/qspinlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index 995513fa2690..0655aa5b57b2 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -70,7 +70,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
  */
 static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
 {
-	return !atomic_read(&lock.val);
+	return !lock.val.counter;
 }
 
 /**

^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-10  5:57                       ` Linus Torvalds
  2023-11-10  6:22                         ` Linus Torvalds
@ 2023-11-10  8:19                         ` Al Viro
  2023-11-22  7:19                         ` Guo Ren
  2 siblings, 0 replies; 119+ messages in thread
From: Al Viro @ 2023-11-10  8:19 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Peter Zijlstra, linux-fsdevel

On Thu, Nov 09, 2023 at 09:57:39PM -0800, Linus Torvalds wrote:

> Anyway, what I'm actually getting at in a roundabout way is that maybe
> we should make D_UNHASHED be another flag in d_flags, and *not* use
> that d_hash.pprev field, and that would allow us to combine even more
> of these tests in dput(), because now pretty much *all* of those
> "retain_dentry()" checks would be about d_flags bits.
> 
> Hmm? As it is, it has that odd combination of d_flags and that
> d_unhashed() test, so it's testing two different fields.

Hmm, indeed.  The trouble is, we are getting tight on the ->d_flags bits.
Only two unassigned bits left (0x08000000 and 0x80000000).

DCACHE_COOKIE is defined (0x00002000), but unused.  Should've been
taken out when dcookie stuff went.

DCACHE_DENTRY_KILLED might be mergeable with DCACHE_MAY_FREE now;
worth looking into.  In effect, DCACHE_MAY_FREE is set iff
we have both DCACHE_DENTRY_KILLED and DCACHE_SHRINK_LIST - and
the only place that checks it is guaranteed to have had
DCACHE_SHRINK_LIST.  Actually, that's nice - in terms of dentry
states we have
refcount > 0 <=> Busy
refcount == 0 <=> Retained
refcount < 0 && !KILLED <=> Dying
refcount < 0 && KILLED && !SHRINK_LIST <=> Freeing
refcount < 0 && KILLED && SHRINK_LIST <=> Husk.
<makes a note in the docs being written>
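
IOW, the check should be expressible as something like this (a sketch,
assuming the merge really is safe; dentry_may_free() is a made-up name):

	static inline bool dentry_may_free(const struct dentry *dentry)
	{
		return (dentry->d_flags &
			(DCACHE_DENTRY_KILLED | DCACHE_SHRINK_LIST)) ==
		       (DCACHE_DENTRY_KILLED | DCACHE_SHRINK_LIST);
	}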

DCACHE_FALLTHRU is odd - it's never checked (or set, for that matter);
might be killable, might be intended for some overlayfs plans.

DCACHE_GENOCIDE might become killable, what with the selinuxfs patch I've
got (apparently OK with selinux folks, will sort it out after -rc1).

OK, it's not as awful as I thought - one more bit won't hurt.
I'll go through the unlocked callers and see if any of those is
sensitive to separating setting that flag from hash list removal.
There might be dragons...

> Anyway, I really don't think it matters much, but since you brought up
> the whole suboptimal code generation..

FWIW, it's not all that suboptimal, at least with current gcc.  The thing
I'm really not sure about is whether that patch makes the whole thing
easier to follow - probably need to let it sit around for a week or so,
then look at it again; right now I don't trust my taste regarding that
particular change, having spent too much time today mucking with it ;-/

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 17/22] don't try to cut corners in shrink_lock_dentry()
  2023-11-09 21:45                     ` Al Viro
@ 2023-11-10  9:07                       ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-10  9:07 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 09:45:37PM +0000, Al Viro wrote:
> On Thu, Nov 09, 2023 at 06:20:08PM +0100, Christian Brauner wrote:
> 
> > It's a bit unfortunate that __lock_parent() locks the parent *and* may
> > lock the child which isn't really obvious from the name. It just becomes
> > clear that this is assumed by how callers release the child's lock.
> 
> __lock_parent() is gone by the end of the series.

Yes, I saw that once I got to the end of the series. Thanks.

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: [PATCH 22/22] __dentry_kill(): new locking scheme
  2023-11-09  6:20                 ` [PATCH 22/22] __dentry_kill(): new locking scheme Al Viro
@ 2023-11-10 13:34                   ` Christian Brauner
  0 siblings, 0 replies; 119+ messages in thread
From: Christian Brauner @ 2023-11-10 13:34 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Nov 09, 2023 at 06:20:56AM +0000, Al Viro wrote:
> Currently we enter __dentry_kill() with parent (along with the victim
> dentry and victim's inode) held locked.  Then we
> 	mark dentry refcount as dead
> 	call ->d_prune()
> 	remove dentry from hash
> 	remove it from the parent's list of children
> 	unlock the parent, don't need it from that point on
> 	detach dentry from inode, unlock dentry and drop the inode
> (via ->d_iput())
> 	call ->d_release()
> 	regain the lock on dentry
> 	check if it's on a shrink list (in which case freeing its empty husk
> has to be left to shrink_dentry_list()) or not (in which case we can free it
> ourselves).  In the former case, mark it as an empty husk, so that
> shrink_dentry_list() would know it can free the sucker.
> 	drop the lock on dentry
> ... and usually the caller proceeds to drop a reference on the parent,
> possibly retaking the lock on it.
> 
> That is painful for a bunch of reasons, starting with the need to take locks
> out of order, but not limited to that - the parent of positive dentry can
> change if we drop its ->d_lock, so getting these locks has to be done with
> care.  Moreover, as soon as dentry is out of the parent's list of children,
> shrink_dcache_for_umount() won't see it anymore, making it appear as if
> the parent is inexplicably busy.  We do work around that by having
> shrink_dentry_list() decrement the parent's refcount first and put it on
> shrink list to be evicted once we are done with __dentry_kill() of child,
> but that may in some cases lead to ->d_iput() on child called after the
> parent got killed.  That doesn't happen in cases where in-tree ->d_iput()
> instances might want to look at the parent, but that's brittle as hell.
> 
> Solution: do removal from the parent's list of children in the very
> end of __dentry_kill().  As the result, the callers do not need to
> lock the parent and by the time we really need the parent locked,
> dentry is negative and is guaranteed not to be moved around.
> 
> It does mean that ->d_prune() will be called with parent not locked.
> It also means that we might see dentries in process of being torn
> down while going through the parent's list of children; those dentries
> will be unhashed, negative and with refcount marked dead.  In practice,
> that's enough for in-tree code that looks through the list of children
> to do the right thing as-is.  Out-of-tree code might need to be adjusted.
> 
> Calling conventions: __dentry_kill(dentry) is called with dentry->d_lock
> held, along with ->i_lock of its inode (if any).  It either returns
> the parent (locked, with refcount decremented to 0) or NULL (if there'd
> been no parent or if refcount decrement for parent hadn't reached 0).
> 
> lock_for_kill() is adjusted for new requirements - it doesn't touch
> the parent's ->d_lock at all.
> 
> Callers adjusted.  Note that for dput() we don't need to bother with
> fast_dput() for the parent - we just need to check retain_dentry()
> for it, since its ->d_lock is still held since the moment when
> __dentry_kill() had taken it to remove the victim from the list of
> children.
> 
> The kludge with early decrement of parent's refcount in
> shrink_dentry_list() is no longer needed - shrink_dcache_for_umount()
> sees the half-killed dentries in the list of children for as long
> as they are pinning the parent.  They are easily recognized and
> accounted for by select_collect(), so we know we are not done yet.
> 
> As the result, we always have the expected ordering for ->d_iput()/->d_release()
> vs. __dentry_kill() of the parent, no exceptions.  Moreover, the current
> rules for shrink lists (one must make sure that shrink_dcache_for_umount()
> won't happen while any dentries from the superblock in question are on
> any shrink lists) are gone - shrink_dcache_for_umount() will do the
> right thing in all cases, taking such dentries out.  Their empty
> husks (memory occupied by struct dentry itself + its external name,
> if any) will remain on the shrink lists, but they are no obstacles
> to filesystem shutdown.  And such husks will get freed as soon as
> shrink_dentry_list() of the list they are on gets to them.
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---

Reviewed-by: Christian Brauner <brauner@kernel.org>

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-10  6:22                         ` Linus Torvalds
@ 2023-11-22  6:29                           ` Guo Ren
  0 siblings, 0 replies; 119+ messages in thread
From: Guo Ren @ 2023-11-22  6:29 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, Ingo Molnar, linux-fsdevel, will

On Thu, Nov 09, 2023 at 10:22:13PM -0800, Linus Torvalds wrote:
> On Thu, 9 Nov 2023 at 21:57, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > So something like this should fix lockref. ENTIRELY UNTESTED, except
> > now the code generation of lockref_put_return() looks much better,
> > without a pointless flush to the stack, and now it has no pointless
> > stack frame as a result.
> 
> Heh. And because I was looking at Al's tree, I didn't notice that
> commit c6f4a9002252 ("asm-generic: ticket-lock: Optimize
> arch_spin_value_unlocked()") had solved the ticket spinlock part of
> this in this merge window in the meantime.
> 
> The qspinlock implementation - which is what x86 uses - is still
> broken in mainline, though.
> 
> So that part of my patch still stands. Now attached just the small
> one-liner part. Adding Ingo and Guo Ren, who did the ticket lock part
> (and looks to have done it very similarly to my suggested patch.
It's not only the generic ticket lock - I think Will Deacon recognized the
lockref problem with the arm32 ticket-lock back in 2013.

After my patch is merged, I think riscv could also select
ARCH_USE_CMPXCHG_LOCKREF in its Kconfig.

Ref:
commit 0cbad9c9dfe0c38e8ec7385b39087c005a6dee3e
Author: Will Deacon <will@kernel.org>
Date:   Wed Oct 9 17:19:22 2013 +0100

    ARM: 7854/1: lockref: add support for lockless lockrefs using cmpxchg64

    Our spinlocks are only 32-bit (2x16-bit tickets) and, on processors
    with 64-bit atomic instructions, cmpxchg64 makes use of the double-word
    exclusive accessors.

    This patch wires up the cmpxchg-based lockless lockref implementation
    for ARM.

    Signed-off-by: Will Deacon <will.deacon@arm.com>
    Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1ad6fb6c094d..fc184bcd7848 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -5,6 +5,7 @@ config ARM
        select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_HAVE_CUSTOM_GPIO_H
+       select ARCH_USE_CMPXCHG_LOCKREF
        select ARCH_WANT_IPC_PARSE_VERSION
        select BUILDTIME_EXTABLE_SORT if MMU
        select CLONE_BACKWARDS
diff --git a/arch/arm/include/asm/spinlock.h
b/arch/arm/include/asm/spinlock.h
index 4f2c28060c9a..ed6c22919e47 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -127,10 +127,14 @@ static inline void
arch_spin_unlock(arch_spinlock_t *lock)
        dsb_sev();
 }

+static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+       return lock.tickets.owner == lock.tickets.next;
+}
+
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-       struct __raw_tickets tickets = ACCESS_ONCE(lock->tickets);
-       return tickets.owner != tickets.next;
+       return !arch_spin_value_unlocked(ACCESS_ONCE(*lock));
 }

 static inline int arch_spin_is_contended(arch_spinlock_t *lock)

> 
> Ingo?
> 
>                      Linus

>  include/asm-generic/qspinlock.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
> index 995513fa2690..0655aa5b57b2 100644
> --- a/include/asm-generic/qspinlock.h
> +++ b/include/asm-generic/qspinlock.h
> @@ -70,7 +70,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
>   */
>  static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
>  {
> -	return !atomic_read(&lock.val);
> +	return !lock.val.counter;
>  }
>  
>  /**


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-10  5:57                       ` Linus Torvalds
  2023-11-10  6:22                         ` Linus Torvalds
  2023-11-10  8:19                         ` Al Viro
@ 2023-11-22  7:19                         ` Guo Ren
  2023-11-22 17:20                           ` Linus Torvalds
  2 siblings, 1 reply; 119+ messages in thread
From: Guo Ren @ 2023-11-22  7:19 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Thu, Nov 09, 2023 at 09:57:39PM -0800, Linus Torvalds wrote:
> On Thu, 9 Nov 2023 at 20:20, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> >         FWIW, on top of current #work.dcache2 the following delta might be worth
> > looking into.  Not sure if it's less confusing that way, though - I'd been staring
> > at that place for too long.  Code generation is slightly suboptimal with recent
> > gcc, but only marginally so.
> 
> I doubt the pure ALU ops and a couple of extra conditional branches
> (that _probably_ predict well) matter at all.
> 
> Especially since this is all after lockref_put_return() has done that
> locked cmpxchg, which *is* expensive.
> 
> My main reaction is that we use hlist_bl_unhashed() for d_unhashed(),
> and we *intentionally* make it separate from the actual unhasing:
> 
>  - ___d_drop() does the __hlist_bl_del()
> 
>  - but d_unhashed() does hlist_bl_unhashed(), which checks
> d_hash.pprev == NULL, and that's done by __d_drop
> 
> We even have a comment about this:
> 
>  * ___d_drop doesn't mark dentry as "unhashed"
>  * (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
> 
> and we depend on this in __d_move(), which will unhash things
> temporarily, but not mark things unhashed, because they get re-hashed
> again. Same goes for __d_add().
> 
> Anyway, what I'm actually getting at in a roundabout way is that maybe
> we should make D_UNHASHED be another flag in d_flags, and *not* use
> that d_hash.pprev field, and that would allow us to combine even more
> of these tests in dput(), because now pretty much *all* of those
> "retain_dentry()" checks would be about d_flags bits.
> 
> Hmm? As it is, it has that odd combination of d_flags and that
> d_unhashed() test, so it's testing two different fields.
> 
> Anyway, I really don't think it matters much, but since you brought up
> the whole suboptimal code generation..
> 
> I tried to look at dput() code generation, and it doesn't look
> horrendous as-is in your dcache2 branch.
> 
> If anything, the thing that hirs is the lockref_put_return() being
> out-of-line even though this is basically the only caller, plus people
> have pessimized the arch_spin_value_unlocked() implementation *again*,
> so that it uses a volatile read, when the *WHOLE*POINT* of that
> "VALUE" part of "value_unlocked()" is that we've already read the
> value, and we should *not* re-read it.
> 
> Damn.
> 
> The bug seems to affect both the generic qspinlock code, and the
> ticket-based one.
> 
> For the ticket based ones, it's PeterZ and commit 1bce11126d57
> ("asm-generic: ticket-lock: New generic ticket-based spinlock"), which
> does
> 
>   static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
>   {
>         return !arch_spin_is_locked(&lock);
>   }
> 
> where we've got that "lock" value, but then it takes the address of
> it, and uses arch_spin_is_locked() on it, so now it will force a flush
> to memory, and then an READ_ONCE() on it.
> 
> And for the qspinlock code, we had a similar

We discussed x86 qspinlock code generation. It looked not as bad as I
thought, because queued_spin_value_unlocked() is much cheaper than the
ticket-lock version. But the riscv ticket-lock code generation is
terrible because of the 16-bit shifts left & right.
https://lore.kernel.org/all/ZNG2tHFOABSXGCVi@gmail.com

> 
>   static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
>   {
>         return !atomic_read(&lock.val);
>   }
> 
> thing, where it does 'atomic_read()' on the value it was passed as an argument.
> 
> Stupid, stupid. It's literally forcing a re-read of a value that is
> guaranteed to be on stack.
> 
> I know this worked at some point, but that may have been many years
> ago, since I haven't looked at this part of lockref code generation in
> ages.
> 
> Anway, as a result now all the lockref functions will do silly "store
> the old lockref value to memory, in order to read it again" dances in
> that CMPXCHG_LOOP() loop.
> 
> It literally makes that whole "is this an unlocked value" function
> completely pointless. The *whole* and only point was "look, I already
> loaded the value from memory, is this *VALUE* unlocked.
> 
> Compared to that complete braindamage in the fast-path loop, the small
> extra ALU ops in fast_dput() are nothing.
> 
> Peter - those functions are done exactly the wrong way around.
> arch_spin_is_locked() should be implemented using
> arch_spin_value_unlocked(), not this way around.
> 
> And the queued spinlocks should not do an atomic_read()of the argument
> they get, they should just do "!lock.val.counter"
> 
> So something like this should fix lockref. ENTIRELY UNTESTED, except
> now the code generation of lockref_put_return() looks much better,
> without a pointless flush to the stack, and now it has no pointless
> stack frame as a result.
> 
> Of course, it should probably be inlined, since it has only one user
> (ok, two, since fast_dput() gets used twice), and that should make the
> return value testing much better.
> 
>                Linus

>  include/asm-generic/qspinlock.h |  2 +-
>  include/asm-generic/spinlock.h  | 17 +++++++++--------
>  2 files changed, 10 insertions(+), 9 deletions(-)
> 
> diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
> index 995513fa2690..0655aa5b57b2 100644
> --- a/include/asm-generic/qspinlock.h
> +++ b/include/asm-generic/qspinlock.h
> @@ -70,7 +70,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
>   */
>  static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
>  {
> -	return !atomic_read(&lock.val);
> +	return !lock.val.counter;
>  }
>  
>  /**
> diff --git a/include/asm-generic/spinlock.h b/include/asm-generic/spinlock.h
> index fdfebcb050f4..a35eda0ec2a2 100644
> --- a/include/asm-generic/spinlock.h
> +++ b/include/asm-generic/spinlock.h
> @@ -68,11 +68,17 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
>  	smp_store_release(ptr, (u16)val + 1);
>  }
>  
> +static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
> +{
> +	u32 val = lock.counter;
> +	return ((val >> 16) == (val & 0xffff));
> +}
> +
>  static __always_inline int arch_spin_is_locked(arch_spinlock_t *lock)
>  {
> -	u32 val = atomic_read(lock);
> -
> -	return ((val >> 16) != (val & 0xffff));
> +	arch_spinlock_t val;
> +	val.counter = atomic_read(lock);
> +	return !arch_spin_value_unlocked(val);
>  }
>  
>  static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
> @@ -82,11 +88,6 @@ static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
>  	return (s16)((val >> 16) - (val & 0xffff)) > 1;
>  }
>  
> -static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
> -{
> -	return !arch_spin_is_locked(&lock);
> -}
> -
>  #include <asm/qrwlock.h>
>  
>  #endif /* __ASM_GENERIC_SPINLOCK_H */


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-22  7:19                         ` Guo Ren
@ 2023-11-22 17:20                           ` Linus Torvalds
  2023-11-22 17:52                             ` Linus Torvalds
  2023-11-26 16:39                             ` Guo Ren
  0 siblings, 2 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-22 17:20 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 2315 bytes --]

On Tue, 21 Nov 2023 at 23:19, Guo Ren <guoren@kernel.org> wrote:
>
> We discussed x86 qspinlock code generation. It looked not too bad as I
> thought because qspinlock_spin_value_unlocked is much cheaper than the
> ticket-lock. But the riscv ticket-lock code generation is terrible
> because of the shift left & right 16-bit.
> https://lore.kernel.org/all/ZNG2tHFOABSXGCVi@gmail.com

No, it's not the 16-bit shifts in the spin_value_unlocked() check,
that just generates simple and straightforward code:

  a0:   0107569b                srlw    a3,a4,0x10
  a4:   00c77733                and     a4,a4,a2
  a8:   04e69063                bne     a3,a4,e8 <.L12>

(plus two stupid instructions for generating the immediate in a2 for
0xffff, but hey, that's the usual insane RISC-V encoding thing - you
can load a 20-bit U-immediate only shifted up by 12, if it's in the
lower bits you're kind of screwed and limited to 12-bit immediates).

The *bad* code generation is from the much simpler

        new.count++;

which sadly neither gcc nor clang is quite smart enough to understand
that "hey, I can do that in 64 bits".

It's incrementing the higher 32-bit word in a 64-bit union, and with a
smarter compiler it *should* basically become

        lock_count += 1 << 32;

but the compiler isn't that clever, so it splits the 64-bit word into
two 32-bit words, increments one of them, and then merges the two
words back into 64 bits:

  98:   4207d693                sra     a3,a5,0x20
  9c:   02079713                sll     a4,a5,0x20
  a0:   0016869b                addw    a3,a3,1
  a4:   02069693                sll     a3,a3,0x20
  a8:   02075713                srl     a4,a4,0x20
  ac:   00d76733                or      a4,a4,a3

which is pretty sad.
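
To see it in isolation, here's a stand-alone toy version (little-endian
and 64-bit assumed, layout mirroring struct lockref) that shows the two
forms whose code generation differs:

  #include <stdint.h>

  union toy_lockref {
        uint64_t lock_count;
        struct {
                uint32_t lock;          /* low 32 bits on little-endian */
                uint32_t count;         /* high 32 bits */
        };
  };

  /* what the C source says: bump the 32-bit count member */
  uint64_t inc_narrow(union toy_lockref v)
  {
        v.count++;
        return v.lock_count;
  }

  /* what we'd like the compiler to emit: one 64-bit add */
  uint64_t inc_wide(union toy_lockref v)
  {
        v.lock_count += 1ull << 32;
        return v.lock_count;
  }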

If you want to do the optimization that the compiler misses by hand,
it would be something like the attached patch.

NOTE! Very untested. But that *should* cause the compiler to just
generate a single "add" instruction (in addition to generating the
constant 0x100000000, of course).

Of course, on a LL/SC architecture like RISC-V, in an *optimal* world,
the whole sequence would actually be done with one single LL/SC,
rather than the "load,add,cmpxchg" thing.

But then you'd have to do absolutely everything by hand in assembly.

                  Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 1133 bytes --]

 lib/lockref.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/lib/lockref.c b/lib/lockref.c
index 2afe4c5d8919..481b102a6476 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -26,6 +26,17 @@
 	}									\
 } while (0)
 
+/*
+ * The compiler isn't smart enough to do the count
+ * increment in the high 32 bits of the 64-bit value,
+ * so do this optimization by hand.
+ */
+#if defined(__LITTLE_ENDIAN) && BITS_PER_LONG == 64
+ #define LOCKREF_INC(n) ((n).lock_count += 1ul<<32)
+#else
+ #define LOCKREF_INC(n) ((n).count++)
+#endif
+
 #else
 
 #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
@@ -42,7 +53,7 @@
 void lockref_get(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
-		new.count++;
+		LOCKREF_INC(new);
 	,
 		return;
 	);
@@ -63,7 +74,7 @@ int lockref_get_not_zero(struct lockref *lockref)
 	int retval;
 
 	CMPXCHG_LOOP(
-		new.count++;
+		LOCKREF_INC(new);
 		if (old.count <= 0)
 			return 0;
 	,
@@ -174,7 +185,7 @@ int lockref_get_not_dead(struct lockref *lockref)
 	int retval;
 
 	CMPXCHG_LOOP(
-		new.count++;
+		LOCKREF_INC(new);
 		if (old.count < 0)
 			return 0;
 	,

^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-22 17:20                           ` Linus Torvalds
@ 2023-11-22 17:52                             ` Linus Torvalds
  2023-11-22 18:05                               ` Linus Torvalds
  2023-11-22 19:11                               ` Linus Torvalds
  2023-11-26 16:39                             ` Guo Ren
  1 sibling, 2 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-22 17:52 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 1009 bytes --]

On Wed, 22 Nov 2023 at 09:20, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> If you want to do the optimization that the compiler misses by hand,
> it would be something like the attached patch.

Bah. Might as well do the reference decrements with the same logic,
not just the increments.

Of course, this is much more noticeable with the ticket locks, because
with the qspinlocks the "is this unlocked" test will check whether the
lock is all zeroes.

So with qspinlocks, the compiler sees that "oh, the low 32 bits are
zero", and the whole "merge the two words back to 64 bits" is much
cheaper, and doesn't generate quite the mess that it does for RISC-V
with ticket locks.
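
(Toy illustration of why, stand-alone and with the qspinlock layout
modeled as "low 32 bits are the lock, zero means unlocked" - a sketch,
not the actual lockref code:)

  #include <stdint.h>

  int try_get_toy(uint64_t *lock_count)
  {
        uint64_t old = *lock_count;

        if ((uint32_t)old != 0)         /* lock half non-zero: locked */
                return 0;
        /* bump the count half; the low half is known to be zero */
        return __atomic_compare_exchange_n(lock_count, &old,
                        old + (1ull << 32), 0,
                        __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }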

But this "treat the lockref as a 64-bit entity" thing is probably a
good thing on most 64-bit architectures, including x86 that has that
qspinlock thing.

Still not actually tested, but the code generation on x86 looks
reasonable, so it might be worth looking at whether it helps the
RISC-V case.

                 Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 1684 bytes --]

diff --git a/lib/lockref.c b/lib/lockref.c
index 2afe4c5d8919..56f4419f593d 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -26,6 +26,17 @@
 	}									\
 } while (0)
 
+/*
+ * The compiler isn't smart enough to do the count
+ * increment in the high 32 bits of the 64-bit value,
+ * so do this optimization by hand.
+ */
+#if defined(__LITTLE_ENDIAN) && BITS_PER_LONG == 64
+ #define LOCKREF_ADD(n,x) ((n).lock_count += (unsigned long)(x)<<32)
+#else
+ #define LOCKREF_ADD(n,x) ((n).count += (x))
+#endif
+
 #else
 
 #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
@@ -42,7 +53,7 @@
 void lockref_get(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
-		new.count++;
+		LOCKREF_ADD(new,1);
 	,
 		return;
 	);
@@ -63,7 +74,7 @@ int lockref_get_not_zero(struct lockref *lockref)
 	int retval;
 
 	CMPXCHG_LOOP(
-		new.count++;
+		LOCKREF_ADD(new,1);
 		if (old.count <= 0)
 			return 0;
 	,
@@ -91,7 +102,7 @@ int lockref_put_not_zero(struct lockref *lockref)
 	int retval;
 
 	CMPXCHG_LOOP(
-		new.count--;
+		LOCKREF_ADD(new,-1);
 		if (old.count <= 1)
 			return 0;
 	,
@@ -119,7 +130,7 @@ EXPORT_SYMBOL(lockref_put_not_zero);
 int lockref_put_return(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
-		new.count--;
+		LOCKREF_ADD(new,-1);
 		if (old.count <= 0)
 			return -1;
 	,
@@ -137,7 +148,7 @@ EXPORT_SYMBOL(lockref_put_return);
 int lockref_put_or_lock(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
-		new.count--;
+		LOCKREF_ADD(new,-1);
 		if (old.count <= 1)
 			break;
 	,
@@ -174,7 +185,7 @@ int lockref_get_not_dead(struct lockref *lockref)
 	int retval;
 
 	CMPXCHG_LOOP(
-		new.count++;
+		LOCKREF_ADD(new,1);
 		if (old.count < 0)
 			return 0;
 	,

^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-22 17:52                             ` Linus Torvalds
@ 2023-11-22 18:05                               ` Linus Torvalds
  2023-11-22 19:11                               ` Linus Torvalds
  1 sibling, 0 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-22 18:05 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Wed, 22 Nov 2023 at 09:52, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Bah. Might as well do the reference decrements with the same logic,
> not just the increments.

And thanks for reminding me about this issue. I just committed the
trivial one-liner fix for qspinlock code generation that apparently
never went anywhere (mostly my own damn fault for not having pushed it
enough and made a proper commit message).

               Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-22 17:52                             ` Linus Torvalds
  2023-11-22 18:05                               ` Linus Torvalds
@ 2023-11-22 19:11                               ` Linus Torvalds
  2023-11-29  7:14                                 ` Guo Ren
  2023-11-29 12:25                                 ` Guo Ren
  1 sibling, 2 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-22 19:11 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 1952 bytes --]

On Wed, 22 Nov 2023 at 09:52, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Still not actually tested, but the code generation on x86 looks
> reasonable, so it migth be worth looking at whether it helps the
> RISC-V case.

Doing some more munging, and actually looking at RISC-V code
generation too (I obviously had to enable ARCH_USE_CMPXCHG_LOCKREF for
RISC-V).

I get this:

  lockref_get:
        addi    sp,sp,-32
        sd      s0,16(sp)
        sd      s1,8(sp)
        sd      ra,24(sp)
        addi    s0,sp,32
        li      a1,65536
        ld      a5,0(a0)
        mv      s1,a0
        addi    a1,a1,-1
        li      a0,100
  .L43:
        sext.w  a3,a5
        li      a4,1
        srliw   a2,a5,16
        and     a3,a3,a1
        slli    a4,a4,32
        bne     a2,a3,.L49
        add     a4,a5,a4
  0:
        lr.d a3, 0(s1)
        bne a3, a5, 1f
        sc.d.rl a2, a4, 0(s1)
        bnez a2, 0b
        fence rw, rw
  1:
        bne     a5,a3,.L52
        ld      ra,24(sp)
        ld      s0,16(sp)
        ld      s1,8(sp)
        addi    sp,sp,32
        jr      ra
  ...

so now that single update is indeed just one single instruction:

        add     a4,a5,a4

and that is the "increment count in the high 32 bits" operation.

The "ticket lock is unlocked" checks are those

        li      a1,65536
        sext.w  a3,a5
        srliw   a2,a5,16
        and     a3,a3,a1
        bne     a2,a3,.L49

instructions if I read it right.

That actually looks fairly close to optimal, although the frame setup
is kind of sad.

(The above does not include the "loop if the cmpxchg failed" part of
the code generation)

Anyway, apart from enabling LOCKREF, the patch to get this for RISC-V
is attached.

I'm not going to play with this any more, but you might want to check
whether this actually does work on RISC-V.

Because I only looked at the code generation, I didn't actually look at
whether it *worked*.

                Linus

[-- Attachment #2: 0001-lockref-improve-code-generation-for-ref-updates.patch --]
[-- Type: text/x-patch, Size: 3125 bytes --]

From 168f35850c15468941e597907e33daacd179d54a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 22 Nov 2023 09:33:29 -0800
Subject: [PATCH] lockref: improve code generation for ref updates

Our lockref data structure is two 32-bit words laid out next to each
other, combining the spinlock and the count into one entity that can be
accessed atomically together.

In particular, the structure is laid out so that the count is the upper
32 bit word (on little-endian), so that you can do basic arithmetic on
the count in 64 bits: instead of adding one to the 32-bit word, you can
just add a value shifted by 32 to the full 64-bit word.

Sadly, neither gcc nor clang are quite clever enough to work that out on
their own, so this does that "manually".

Also, try to do any compares against zero values, which generally
improves the code generation.  So rather than check that the value was
at least 1 before a decrement, check that it's positive or zero after
the decrement.  We don't worry about the overflow point in lockrefs.

Cc: Guo Ren <guoren@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/lockref.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/lib/lockref.c b/lib/lockref.c
index 2afe4c5d8919..f3c30c538af1 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -26,6 +26,17 @@
 	}									\
 } while (0)
 
+/*
+ * The compiler isn't smart enough to do the count
+ * increment in the high 32 bits of the 64-bit value,
+ * so do this optimization by hand.
+ */
+#if defined(__LITTLE_ENDIAN) && BITS_PER_LONG == 64
+ #define LOCKREF_ADD(n,x) ((n).lock_count += (unsigned long)(x)<<32)
+#else
+ #define LOCKREF_ADD(n,x) ((n).count += (x))
+#endif
+
 #else
 
 #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
@@ -42,7 +53,7 @@
 void lockref_get(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
-		new.count++;
+		LOCKREF_ADD(new,1);
 	,
 		return;
 	);
@@ -63,9 +74,9 @@ int lockref_get_not_zero(struct lockref *lockref)
 	int retval;
 
 	CMPXCHG_LOOP(
-		new.count++;
 		if (old.count <= 0)
 			return 0;
+		LOCKREF_ADD(new,1);
 	,
 		return 1;
 	);
@@ -91,8 +102,8 @@ int lockref_put_not_zero(struct lockref *lockref)
 	int retval;
 
 	CMPXCHG_LOOP(
-		new.count--;
-		if (old.count <= 1)
+		LOCKREF_ADD(new,-1);
+		if (new.count <= 0)
 			return 0;
 	,
 		return 1;
@@ -119,8 +130,8 @@ EXPORT_SYMBOL(lockref_put_not_zero);
 int lockref_put_return(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
-		new.count--;
-		if (old.count <= 0)
+		LOCKREF_ADD(new,-1);
+		if (new.count < 0)
 			return -1;
 	,
 		return new.count;
@@ -137,8 +148,8 @@ EXPORT_SYMBOL(lockref_put_return);
 int lockref_put_or_lock(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
-		new.count--;
-		if (old.count <= 1)
+		LOCKREF_ADD(new,-1);
+		if (new.count <= 0)
 			break;
 	,
 		return 1;
@@ -174,9 +185,9 @@ int lockref_get_not_dead(struct lockref *lockref)
 	int retval;
 
 	CMPXCHG_LOOP(
-		new.count++;
 		if (old.count < 0)
 			return 0;
+		LOCKREF_ADD(new,1);
 	,
 		return 1;
 	);
-- 
2.43.0.5.g38fb137bdb


^ permalink raw reply related	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-22 17:20                           ` Linus Torvalds
  2023-11-22 17:52                             ` Linus Torvalds
@ 2023-11-26 16:39                             ` Guo Ren
  2023-11-26 16:51                               ` Linus Torvalds
                                                 ` (2 more replies)
  1 sibling, 3 replies; 119+ messages in thread
From: Guo Ren @ 2023-11-26 16:39 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Wed, Nov 22, 2023 at 09:20:53AM -0800, Linus Torvalds wrote:
> On Tue, 21 Nov 2023 at 23:19, Guo Ren <guoren@kernel.org> wrote:
> >
> > We discussed x86 qspinlock code generation. It looked not too bad as I
> > thought because qspinlock_spin_value_unlocked is much cheaper than the
> > ticket-lock. But the riscv ticket-lock code generation is terrible
> > because of the shift left & right 16-bit.
> > https://lore.kernel.org/all/ZNG2tHFOABSXGCVi@gmail.com
> 
> No, it's not the 16-bit shifts in the spin_value_unlocked() check,
> that just generates simple and straightforward code:
> 
>   a0:   0107569b                srlw    a3,a4,0x10
>   a4:   00c77733                and     a4,a4,a2
>   a8:   04e69063                bne     a3,a4,e8 <.L12>
> 
> (plus two stupid instructions for generating the immediate in a2 for
> 0xffff, but hey, that's the usual insane RISC-V encoding thing - you
> can load a 20-bit U-immediate only shifted up by 12, if it's in the
> lower bits you're kind of screwed and limited to 12-bit immediates).
> 
> The *bad* code generation is from the much simpler
> 
>         new.count++;
> 
> which sadly neither gcc not clang is quite smart enough to understand
> that "hey, I can do that in 64 bits".
> 
> It's incrementing the higher 32-bit word in a 64-bit union, and with a
> smarter compiler it *should* basically become
> 
>         lock_count += 1 << 32;
> 
> but the compiler isn't that clever, so it splits the 64-bit word into
> two 32-bit words, increments one of them, and then merges the two
> words back into 64 bits:
> 
>   98:   4207d693                sra     a3,a5,0x20
>   9c:   02079713                sll     a4,a5,0x20
>   a0:   0016869b                addw    a3,a3,1
>   a4:   02069693                sll     a3,a3,0x20
>   a8:   02075713                srl     a4,a4,0x20
>   ac:   00d76733                or      a4,a4,a3
> 
> which is pretty sad.
9c & a8 are for word zero-extension; riscv will have zext.w in the future.
Your patch may improve the above to:
	li      a4,1
	slli    a4,a4,32
	add     a4,a5,a4

v.s.
	sra     a3,a5,0x20
	zext.w	a4,a5
	addw    a3,a3,1
	or      a4,a4,a3
You win one instruction "or a4,a4,a3", which is less than one cycle.

The zext.w is important, and it could replace a lot of sll+srl pairs, so I
think it's a shortcoming of the current ISA design.

Here, what I want to improve is preventing the stack frame setup in the
fast path - that's the biggest benefit my patch could give.  Unnecessary
memory accesses are the most important performance killer in SMP.

My patch removes the stack frame setup from the fast path.
void lockref_get(struct lockref *lockref)
{
  78:   00053783                ld      a5,0(a0)
000000000000007c <.LBB212>:
  7c:   00010637                lui     a2,0x10

0000000000000080 <.LBE212>:
  80:   06400593                li      a1,100

0000000000000084 <.LBB216>:
  84:   fff60613                add     a2,a2,-1 # ffff <.LLST8+0xf4aa>

0000000000000088 <.L8>:
  88:   0007871b                sext.w  a4,a5

000000000000008c <.LBB217>:
  8c:   0107d69b                srlw    a3,a5,0x10
  90:   00c77733                and     a4,a4,a2
  94:   04e69063                bne     a3,a4,d4 <.L12> --------+
						      		|
0000000000000098 <.LBB218>:					|
  98:   4207d693                sra     a3,a5,0x20		|
  9c:   02079713                sll     a4,a5,0x20		|
  a0:   0016869b                addw    a3,a3,1			|
  a4:   02069693                sll     a3,a3,0x20		|
  a8:   02075713                srl     a4,a4,0x20		|
  ac:   00d76733                or      a4,a4,a3		|
								|
00000000000000b0 <.L0^B1>:					|
  b0:   100536af                lr.d    a3,(a0)			|
  b4:   00f69863                bne     a3,a5,c4 <.L1^B1>	|
  b8:   1ae5382f                sc.d.rl a6,a4,(a0)		|
  bc:   fe081ae3                bnez    a6,b0 <.L0^B1>		|
  c0:   0330000f                fence   rw,rw			|
								|
00000000000000c4 <.L1^B1>:					|
  c4:   04d78a63                beq     a5,a3,118 <.L18>	|
								|
00000000000000c8 <.LBE228>:					|
  c8:   fff5859b                addw    a1,a1,-1		|	
								|
00000000000000cc <.LBB229>:					|
  cc:   00068793                mv      a5,a3			|
								|
00000000000000d0 <.LBE229>:					|
  d0:   fa059ce3                bnez    a1,88 <.L8>		|
						     		|
00000000000000d4 <.L12>: <--------------------------------------+
{						      slow_path
  d4:   fe010113                add     sp,sp,-32
  d8:   00113c23                sd      ra,24(sp)
  dc:   00813823                sd      s0,16(sp)
  e0:   02010413                add     s0,sp,32


> 
> If you want to do the optimization that the compiler misses by hand,
> it would be something like the attached patch.
> 
> NOTE! Very untested. But that *should* cause the compiler to just
> generate a single "add" instruction (in addition to generating the
> constant 0x100000000, of course).
> 
> Of course, on a LL/SC architecture like RISC-V, in an *optimal* world,
> the whole sequence would actually be done with one single LL/SC,
> rather than the "load,add,cmpxchg" thing.
> 
> But then you'd have to do absolutely everything by hand in assembly.
No, it's not worth doing that.
 - There are only atomic primitives in Linux, but no ll/sc primitives in
   the real world. The world belongs to AMO, and the only use of ll/sc
   is to implement AMO and CAS.
 - Using a single ll/sc sequence instead of cmpxchg is similar to your
   patch, and you may win one cycle or not.
 - The critical work here is reducing bus transactions, preventing the
   cache-line dance, and guaranteeing forward progress.

Here is my optimization advice:

#define CMPXCHG_LOOP(CODE, SUCCESS) do {                                        \
        int retry = 100;                                                        \
        struct lockref old;                                                     \
        BUILD_BUG_ON(sizeof(old) != 8);                                         \
+       prefetchw(lockref);                                                     \
        old.lock_count = READ_ONCE(lockref->lock_count);                        \
        while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {     \
                struct lockref new = old;                                       \
                CODE                                                            \
                if (likely(try_cmpxchg64_relaxed(&lockref->lock_count,          \
                                                 &old.lock_count,               \
                                                 new.lock_count))) {            \
                        SUCCESS;                                                \
                }                                                               \

A micro-arch could give prefetchw stronger guarantees:
 - Prefetch.w must guarantee cache line exclusiveness even when it hits
   a cache line in a shareable state.
 - Hold the exclusive cache line for several cycles, until the next
   store or a timeout.
 - Mask interrupts during the holding cycles (optional).

On such a micro-architecture the lockref slow path is effectively
eliminated, which means the spinlock never gets executed.

I've written down more details in my ppt:
https://docs.google.com/presentation/d/1UudBcj4cL_cjJexMpZNF9ppRzYxeYqsdBotIvU7sO2Q/edit?usp=sharing

This type of prefetchw could help large atomic operations that fit
within one cache line. Compared to a transactional memory model,
prefetchw can give a forward-progress guarantee and is easier to land
in Linux without any new primitive.
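
For reference, the same idea written out as a self-contained helper (an
untested sketch, not the actual lockref code; the helper name is made up):

#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/prefetch.h>
#include <linux/types.h>

static inline void hi32_count_inc(u64 *lock_count)
{
	u64 old, new;

	/* Ask for the cache line in exclusive state before the plain
	 * load that feeds the cmpxchg loop. */
	prefetchw(lock_count);
	old = READ_ONCE(*lock_count);
	do {
		new = old + (1ULL << 32);	/* count is the upper 32 bits */
	} while (!try_cmpxchg64_relaxed(lock_count, &old, new));
}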

> 
>                   Linus

>  lib/lockref.c | 17 ++++++++++++++---
>  1 file changed, 14 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/lockref.c b/lib/lockref.c
> index 2afe4c5d8919..481b102a6476 100644
> --- a/lib/lockref.c
> +++ b/lib/lockref.c
> @@ -26,6 +26,17 @@
>  	}									\
>  } while (0)
>  
> +/*
> + * The compiler isn't smart enough to do the count
> + * increment in the high 32 bits of the 64-bit value,
> + * so do this optimization by hand.
> + */
> +#if defined(__LITTLE_ENDIAN) && BITS_PER_LONG == 64
> + #define LOCKREF_INC(n) ((n).lock_count += 1ul<<32)
> +#else
> + #define LOCKREF_INC(n) ((n).count++)
> +#endif
> +
>  #else
>  
>  #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
> @@ -42,7 +53,7 @@
>  void lockref_get(struct lockref *lockref)
>  {
>  	CMPXCHG_LOOP(
> -		new.count++;
> +		LOCKREF_INC(new);
>  	,
>  		return;
>  	);
> @@ -63,7 +74,7 @@ int lockref_get_not_zero(struct lockref *lockref)
>  	int retval;
>  
>  	CMPXCHG_LOOP(
> -		new.count++;
> +		LOCKREF_INC(new);
>  		if (old.count <= 0)
>  			return 0;
>  	,
> @@ -174,7 +185,7 @@ int lockref_get_not_dead(struct lockref *lockref)
>  	int retval;
>  
>  	CMPXCHG_LOOP(
> -		new.count++;
> +		LOCKREF_INC(new);
>  		if (old.count < 0)
>  			return 0;
>  	,


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-26 16:39                             ` Guo Ren
@ 2023-11-26 16:51                               ` Linus Torvalds
  2023-11-30 10:00                                 ` Guo Ren
  2023-11-26 16:51                               ` Guo Ren
  2023-11-26 17:06                               ` Linus Torvalds
  2 siblings, 1 reply; 119+ messages in thread
From: Linus Torvalds @ 2023-11-26 16:51 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Sun, 26 Nov 2023 at 08:39, Guo Ren <guoren@kernel.org> wrote:
>
> Here is my optimization advice:
>
> #define CMPXCHG_LOOP(CODE, SUCCESS) do {                                        \
>         int retry = 100;                                                        \
>         struct lockref old;                                                     \
>         BUILD_BUG_ON(sizeof(old) != 8);                                         \
> +       prefetchw(lockref);                                                     \

No.

We're not adding software prefetches to generic code. Been there, done
that. They *never* improve performance on good hardware. They end up
helping on some random (usually particularly bad) microarchitecture,
and then they hurt everybody else.

And the real optimization advice is: "don't run on crap hardware".

It really is that simple. Good hardware does OoO and sees the future write.

> Micro-arch could give prefetchw more guarantee:

Well, in practice, they never do, and in fact they are often buggy and
cause problems because they weren't actually tested very much.

                 Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-26 16:39                             ` Guo Ren
  2023-11-26 16:51                               ` Linus Torvalds
@ 2023-11-26 16:51                               ` Guo Ren
  2023-11-26 17:06                               ` Linus Torvalds
  2 siblings, 0 replies; 119+ messages in thread
From: Guo Ren @ 2023-11-26 16:51 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Sun, Nov 26, 2023 at 11:39:37AM -0500, Guo Ren wrote:
> On Wed, Nov 22, 2023 at 09:20:53AM -0800, Linus Torvalds wrote:
> > On Tue, 21 Nov 2023 at 23:19, Guo Ren <guoren@kernel.org> wrote:
> > >
> > > We discussed x86 qspinlock code generation. It looked not too bad as I
> > > thought because qspinlock_spin_value_unlocked is much cheaper than the
> > > ticket-lock. But the riscv ticket-lock code generation is terrible
> > > because of the shift left & right 16-bit.
> > > https://lore.kernel.org/all/ZNG2tHFOABSXGCVi@gmail.com
> > 
> > No, it's not the 16-bit shifts in the spin_value_unlocked() check,
> > that just generates simple and straightforward code:
> > 
> >   a0:   0107569b                srlw    a3,a4,0x10
> >   a4:   00c77733                and     a4,a4,a2
> >   a8:   04e69063                bne     a3,a4,e8 <.L12>
> > 
> > (plus two stupid instructions for generating the immediate in a2 for
> > 0xffff, but hey, that's the usual insane RISC-V encoding thing - you
> > can load a 20-bit U-immediate only shifted up by 12, if it's in the
> > lower bits you're kind of screwed and limited to 12-bit immediates).
> > 
> > The *bad* code generation is from the much simpler
> > 
> >         new.count++;
> > 
> > which sadly neither gcc nor clang is quite smart enough to understand
> > that "hey, I can do that in 64 bits".
> > 
> > It's incrementing the higher 32-bit word in a 64-bit union, and with a
> > smarter compiler it *should* basically become
> > 
> >         lock_count += 1 << 32;
> > 
> > but the compiler isn't that clever, so it splits the 64-bit word into
> > two 32-bit words, increments one of them, and then merges the two
> > words back into 64 bits:
> > 
> >   98:   4207d693                sra     a3,a5,0x20
> >   9c:   02079713                sll     a4,a5,0x20
> >   a0:   0016869b                addw    a3,a3,1
> >   a4:   02069693                sll     a3,a3,0x20
> >   a8:   02075713                srl     a4,a4,0x20
> >   ac:   00d76733                or      a4,a4,a3
> > 
> > which is pretty sad.
> 9c & a8 is for word-zero-extend; riscv would have zext.w in the future.
> Your patch may improve above with:
> 	li      a4,1
> 	slli    a4,a4,32
> 	add     a4,a5,a4
> 
> v.s.
> 	sra     a3,a5,0x20
> 	zext.w	a4,a5
> 	addw    a3,a3,1
> 	or      a4,a4,a3
> You win one instruction "or a4,a4,a3", which is less than one cycle.
Sorry, I forgot "sll     a3,a3,0x20", so it's 1.5 cycles, but it doesn't
affect my opinion here; local core operations are a lower optimization
priority than memory transactions.

> 
> The zext.w is important, and it could replace sll+srl a lot, so I think
> it's a current ISA design short.
> 
> Here, what I want to improve is to prevent stack frame setup in the fast
> path, and that's the most benefit my patch could give out. Unnecessary
> memory access is the most important performance killer in SMP.
> 
> My patch removes the stack frame setup from the fast path.
> void lockref_get(struct lockref *lockref)
> {
>   78:   00053783                ld      a5,0(a0)
> 000000000000007c <.LBB212>:
>   7c:   00010637                lui     a2,0x10
> 
> 0000000000000080 <.LBE212>:
>   80:   06400593                li      a1,100
> 
> 0000000000000084 <.LBB216>:
>   84:   fff60613                add     a2,a2,-1 # ffff <.LLST8+0xf4aa>
> 
> 0000000000000088 <.L8>:
>   88:   0007871b                sext.w  a4,a5
> 
> 000000000000008c <.LBB217>:
>   8c:   0107d69b                srlw    a3,a5,0x10
>   90:   00c77733                and     a4,a4,a2
>   94:   04e69063                bne     a3,a4,d4 <.L12> --------+
> 						      		|
> 0000000000000098 <.LBB218>:					|
>   98:   4207d693                sra     a3,a5,0x20		|
>   9c:   02079713                sll     a4,a5,0x20		|
>   a0:   0016869b                addw    a3,a3,1			|
>   a4:   02069693                sll     a3,a3,0x20		|
>   a8:   02075713                srl     a4,a4,0x20		|
>   ac:   00d76733                or      a4,a4,a3		|
> 								|
> 00000000000000b0 <.L0^B1>:					|
>   b0:   100536af                lr.d    a3,(a0)			|
>   b4:   00f69863                bne     a3,a5,c4 <.L1^B1>	|
>   b8:   1ae5382f                sc.d.rl a6,a4,(a0)		|
>   bc:   fe081ae3                bnez    a6,b0 <.L0^B1>		|
>   c0:   0330000f                fence   rw,rw			|
> 								|
> 00000000000000c4 <.L1^B1>:					|
>   c4:   04d78a63                beq     a5,a3,118 <.L18>	|
> 								|
> 00000000000000c8 <.LBE228>:					|
>   c8:   fff5859b                addw    a1,a1,-1		|	
> 								|
> 00000000000000cc <.LBB229>:					|
>   cc:   00068793                mv      a5,a3			|
> 								|
> 00000000000000d0 <.LBE229>:					|
>   d0:   fa059ce3                bnez    a1,88 <.L8>		|
> 						     		|
> 00000000000000d4 <.L12>: <--------------------------------------+
> {						      slow_path
>   d4:   fe010113                add     sp,sp,-32
>   d8:   00113c23                sd      ra,24(sp)
>   dc:   00813823                sd      s0,16(sp)
>   e0:   02010413                add     s0,sp,32
> 
> 
> > 
> > If you want to do the optimization that the compiler misses by hand,
> > it would be something like the attached patch.
> > 
> > NOTE! Very untested. But that *should* cause the compiler to just
> > generate a single "add" instruction (in addition to generating the
> > constant 0x100000000, of course).
> > 
> > Of course, on a LL/SC architecture like RISC-V, in an *optimal* world,
> > the whole sequence would actually be done with one single LL/SC,
> > rather than the "load,add,cmpxchg" thing.
> > 
> > But then you'd have to do absolutely everything by hand in assembly.
> No, it's not worth to do that.
>  - There are only atomic primitives in Linux, but no ll/sc primitive in
>    the real world. The world belongs to AMO and the only usage of ll/sc
>    is to implement AMO and CAS.
>  - Using single ll/sc primitive instead of cmpxchg is similar to your
>    patch, and you may win 1 cycle or not.
>  - The critical work here are reducing bus transactions, preventing
>    cache dance, and forward progress guarantee.
> 
> Here is my optimization advice:
> 
> #define CMPXCHG_LOOP(CODE, SUCCESS) do {                                        \
>         int retry = 100;                                                        \
>         struct lockref old;                                                     \
>         BUILD_BUG_ON(sizeof(old) != 8);                                         \
> +       prefetchw(lockref);                                                     \
>         old.lock_count = READ_ONCE(lockref->lock_count);                        \
>         while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {     \
>                 struct lockref new = old;                                       \
>                 CODE                                                            \
>                 if (likely(try_cmpxchg64_relaxed(&lockref->lock_count,          \
>                                                  &old.lock_count,               \
>                                                  new.lock_count))) {            \
>                         SUCCESS;                                                \
>                 }                                                               \
> 
> Micro-arch could give prefetchw more guarantee:
>  - Prefetch.w must guarantee cache line exclusiveness even when a
>    shareable state cache line hits.
>  - Hold the exclusive cache line for several cycles until the next
>    store or timeout
>  - Mask interrupt during the holding cycles (Optional)
> 
> The lockref slow path is killed in this micro-architecture, which
> means there is no chance to execute the spinlock.
> 
> I've written down more details in my ppt:
> https://docs.google.com/presentation/d/1UudBcj4cL_cjJexMpZNF9ppRzYxeYqsdBotIvU7sO2Q/edit?usp=sharing
> 
> This type of prefetchw could help large-size atomic operations within
> one cache line. Compared to the transaction memory model, prefetchw
> could give a forward progress guarantee and easier landing in Linux
> without any new primitive.
> 
> > 
> >                   Linus
> 
> >  lib/lockref.c | 17 ++++++++++++++---
> >  1 file changed, 14 insertions(+), 3 deletions(-)
> > 
> > diff --git a/lib/lockref.c b/lib/lockref.c
> > index 2afe4c5d8919..481b102a6476 100644
> > --- a/lib/lockref.c
> > +++ b/lib/lockref.c
> > @@ -26,6 +26,17 @@
> >  	}									\
> >  } while (0)
> >  
> > +/*
> > + * The compiler isn't smart enough to do the count
> > + * increment in the high 32 bits of the 64-bit value,
> > + * so do this optimization by hand.
> > + */
> > +#if defined(__LITTLE_ENDIAN) && BITS_PER_LONG == 64
> > + #define LOCKREF_INC(n) ((n).lock_count += 1ul<<32)
> > +#else
> > + #define LOCKREF_INC(n) ((n).count++)
> > +#endif
> > +
> >  #else
> >  
> >  #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
> > @@ -42,7 +53,7 @@
> >  void lockref_get(struct lockref *lockref)
> >  {
> >  	CMPXCHG_LOOP(
> > -		new.count++;
> > +		LOCKREF_INC(new);
> >  	,
> >  		return;
> >  	);
> > @@ -63,7 +74,7 @@ int lockref_get_not_zero(struct lockref *lockref)
> >  	int retval;
> >  
> >  	CMPXCHG_LOOP(
> > -		new.count++;
> > +		LOCKREF_INC(new);
> >  		if (old.count <= 0)
> >  			return 0;
> >  	,
> > @@ -174,7 +185,7 @@ int lockref_get_not_dead(struct lockref *lockref)
> >  	int retval;
> >  
> >  	CMPXCHG_LOOP(
> > -		new.count++;
> > +		LOCKREF_INC(new);
> >  		if (old.count < 0)
> >  			return 0;
> >  	,
> 
> 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-26 16:39                             ` Guo Ren
  2023-11-26 16:51                               ` Linus Torvalds
  2023-11-26 16:51                               ` Guo Ren
@ 2023-11-26 17:06                               ` Linus Torvalds
  2023-11-26 17:59                                 ` Linus Torvalds
  2023-11-29  9:52                                 ` Guo Ren
  2 siblings, 2 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-26 17:06 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Sun, 26 Nov 2023 at 08:39, Guo Ren <guoren@kernel.org> wrote:
>
> Here, what I want to improve is to prevent stack frame setup in the fast
> path, and that's the most benefit my patch could give out.

Side note: what patch do you have that avoids the stack frame setup?
Because I still saw the stack frame even with the
arch_spin_value_unlocked() fix and the improved code generation. The
compiler still does

        addi    sp,sp,-32
        sd      s0,16(sp)
        sd      s1,8(sp)
        sd      ra,24(sp)
        addi    s0,sp,32

at the top of the function for me - not because of the (now fixed)
lock value spilling, but just because it wants to save registers.

The reason seems to be that gcc isn't smart enough to delay the frame
setup to the slow path where it then has to do the actual spinlock, so
it has to generate a stack frame just for the return address and then
it does the whole frame setup thing.

I was using just the risc-v defconfig (with the cmpxchg lockrefs
enabled, and spinlock debugging disabled so that lockrefs actually do
something), so there might be some other config thing like "force
frame pointers" that then causes problems.

But while the current tree avoids the silly lock value spill and
reload, and my patch improved the integer instruction selection, I
really couldn't get rid of the stack frame entirely. The x86 code also
ends up looking quite nice, although part of that is that the
qspinlock test is a simple compare against zero:

  lockref_get:
        pushq   %rbx
        movq    %rdi, %rbx
        movq    (%rdi), %rax
        movl    $-100, %ecx
        movabsq $4294967296, %rdx
  .LBB0_1:
        testl   %eax, %eax
        jne     .LBB0_4
        leaq    (%rax,%rdx), %rsi
        lock
        cmpxchgq        %rsi, (%rbx)
        je      .LBB0_5
        incl    %ecx
        jne     .LBB0_1
  .LBB0_4:
        movq    %rbx, %rdi
        callq   _raw_spin_lock
        incl    4(%rbx)
        movb    $0, (%rbx)
  .LBB0_5:
        popq    %rbx
        retq

(That 'movabsq' thing is what generates the big constant that adds '1'
in the upper word - that add is then done as a 'leaq').

In this case, the 'retry' count is actually a noticeable part of the
code generation, and is probably also why it has to save/restore
'%rbx'. Oh well. We limited the cmpxchg loop because of horrible
issues with starvation on bad arm64 cores.  It turns out that SMP
cacheline bouncing is hard, and if you haven't been doing it for a
couple of decades, you'll do it wrong.

You'll find out the hard way that the same is probably true on any
early RISC-V SMP setups. You wanting to use prefetchw is a pretty
clear indication of the same kind of thing.

             Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-26 17:06                               ` Linus Torvalds
@ 2023-11-26 17:59                                 ` Linus Torvalds
  2023-11-29  9:52                                 ` Guo Ren
  1 sibling, 0 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-26 17:59 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Sun, 26 Nov 2023 at 09:06, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> In this case, the 'retry' count is actually a noticeable part of the
> code generation, and is probably also why it has to save/restore
> '%rbx'.

Nope. The reason for having to save/restore a register is the

        spin_lock(&lockref->lock);
        lockref->count++;

sequence: since spin_lock() is a function call, it will clobber all
the registers that a function can clobber, and the callee has to keep
the 'lockref' argument somewhere. So it needs a callee-saved register,
which it then itself needs to save.

Inlining the spinlock sequence entirely would fix it, but is the wrong
thing to do for the slow case.

Marking the spinlock functions with

  __attribute__((no_caller_saved_registers))

might actually be a reasonable option. It makes the spinlock itself
more expensive (since now it saves/restores all the registers it
uses), but in this case that's the right thing to do.
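
For illustration only, a rough sketch of what such an annotation could
look like (hypothetical function name, untested; the attribute is
x86-specific):

#include <linux/lockref.h>
#include <linux/spinlock.h>

/* Hypothetical sketch: an out-of-line slow path that preserves all
 * caller-clobbered registers, so a caller's fast path can keep 'lockref'
 * in a caller-saved register without setting up a stack frame. */
__attribute__((no_caller_saved_registers))
static void lockref_get_slow(struct lockref *lockref)
{
	spin_lock(&lockref->lock);
	lockref->count++;
	spin_unlock(&lockref->lock);
}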

Of course, in this case, lockref has already done the optimistic
"check the lock" version, so our spinlock code that does that

        LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);

which first tries to do the trylock, is all kinds of wrong.

In a perfect world, the lockref code actually wants only the
slow-path, since it has already done the fast-path case. And it would
have that "slow path saves all registers" thing. That might be a good
idea for spinlocks in general, who knows..

Oh well. Probably not worth worrying about. In my profiles, lockref
looks pretty good even under heavy dentry load. Even if it's not
perfect.

                 Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-22 19:11                               ` Linus Torvalds
@ 2023-11-29  7:14                                 ` Guo Ren
  2023-11-29 12:25                                 ` Guo Ren
  1 sibling, 0 replies; 119+ messages in thread
From: Guo Ren @ 2023-11-29  7:14 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Wed, Nov 22, 2023 at 11:11:38AM -0800, Linus Torvalds wrote:
> On Wed, 22 Nov 2023 at 09:52, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > Still not actually tested, but the code generation on x86 looks
> > reasonable, so it might be worth looking at whether it helps the
> > RISC-V case.
> 
> Doing some more munging, and actually looking at RISC-V code
> generation too (I obviously had to enable ARCH_USE_CMPXCHG_LOCKREF for
> RISC-V).
> 
> I get this:
> 
>   lockref_get:
>         addi    sp,sp,-32
>         sd      s0,16(sp)
>         sd      s1,8(sp)
>         sd      ra,24(sp)
>         addi    s0,sp,32
>         li      a1,65536
>         ld      a5,0(a0)
>         mv      s1,a0
>         addi    a1,a1,-1
>         li      a0,100
>   .L43:
>         sext.w  a3,a5
>         li      a4,1
>         srliw   a2,a5,16
>         and     a3,a3,a1
>         slli    a4,a4,32
>         bne     a2,a3,.L49
>         add     a4,a5,a4
>   0:
>         lr.d a3, 0(s1)
>         bne a3, a5, 1f
>         sc.d.rl a2, a4, 0(s1)
>         bnez a2, 0b
>         fence rw, rw
>   1:
>         bne     a5,a3,.L52
>         ld      ra,24(sp)
>         ld      s0,16(sp)
>         ld      s1,8(sp)
>         addi    sp,sp,32
>         jr      ra
>   ...
> 
> so now that single update is indeed just one single instruction:
> 
>         add     a4,a5,a4
> 
> is that "increment count in the high 32 bits".
> 
> The ticket lock being unlocked checks are those
> 
>         li      a1,65536
>         sext.w  a3,a5
>         srliw   a2,a5,16
>         and     a3,a3,a1
>         bne     a2,a3,.L49
> 
> instructions if I read it right.
> 
> That actually looks fairly close to optimal, although the frame setup
> is kind of sad.
> 
> (The above does not include the "loop if the cmpxchg failed" part of
> the code generation)
> 
> Anyway, apart from enabling LOCKREF, the patch to get this for RISC-V
> is attached.
> 
> I'm not going to play with this any more, but you might want to check
> whether this actually does work on RISC-V.
> 
> Because I only looked at the code generation, I didn't actually look at
> whether it *worked*.
> 
>                 Linus

> From 168f35850c15468941e597907e33daacd179d54a Mon Sep 17 00:00:00 2001
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Wed, 22 Nov 2023 09:33:29 -0800
> Subject: [PATCH] lockref: improve code generation for ref updates
> 
> Our lockref data structure is two 32-bit words laid out next to each
> other, combining the spinlock and the count into one entity that can be
> accessed atomically together.
> 
> In particular, the structure is laid out so that the count is the upper
> 32 bit word (on little-endian), so that you can do basic arithmetic on
> the count in 64 bits: instead of adding one to the 32-bit word, you can
> just add a value shifted by 32 to the full 64-bit word.
> 
> Sadly, neither gcc nor clang are quite clever enough to work that out on
> their own, so this does that "manually".
> 
> Also, try to do any compares against zero values, which generally
> improves the code generation.  So rather than check that the value was
> at least 1 before a decrement, check that it's positive or zero after
> the decrement.  We don't worry about the overflow point in lockrefs.
Tested-by: Guo Ren <guoren@kernel.org>

This patch saves 3 ALU instructions on riscv.

Before the patch:
000000000000020c <lockref_get>:
        CMPXCHG_LOOP(
 20c:   611c                    ld      a5,0(a0)

000000000000020e <.LBB492>:
 20e:   03079713                sll     a4,a5,0x30
 212:   0107d69b                srlw    a3,a5,0x10
 216:   9341                    srl     a4,a4,0x30
 218:   02e69663                bne     a3,a4,244 <.L40>

000000000000021c <.LBB494>:
 21c:   4207d693                sra     a3,a5,0x20    -------+
 220:   02079713                sll     a4,a5,0x20	     |
 224:   2685                    addw    a3,a3,1		     |
 226:   1682                    sll     a3,a3,0x20	     |
 228:   9301                    srl     a4,a4,0x20	     |
 22a:   8f55                    or      a4,a4,a3      -------+

000000000000022c <.L0^B4>:
 22c:   100536af                lr.d    a3,(a0)
 230:   00f69763                bne     a3,a5,23e <.L1^B5>
 234:   1ae5362f                sc.d.rl a2,a4,(a0)
 238:   fa75                    bnez    a2,22c <.L0^B4>
 23a:   0330000f                fence   rw,rw

After the patch:
000000000000020c <lockref_get>:
        CMPXCHG_LOOP(
 20c:   611c                    ld      a5,0(a0)

000000000000020e <.LBB526>:
 20e:   03079713                sll     a4,a5,0x30
 212:   0107d69b                srlw    a3,a5,0x10
 216:   9341                    srl     a4,a4,0x30
 218:   02e69163                bne     a3,a4,23a <.L40>

000000000000021c <.LBB528>:
 21c:   4705                    li      a4,1		------+
 21e:   1702                    sll     a4,a4,0x20	      |
 220:   973e                    add     a4,a4,a5	------+

0000000000000222 <.L0^B4>:
 222:   100536af                lr.d    a3,(a0)
 226:   00f69763                bne     a3,a5,234 <.L1^B5>
 22a:   1ae5362f                sc.d.rl a2,a4,(a0)
 22e:   fa75                    bnez    a2,222 <.L0^B4>
 230:   0330000f                fence   rw,rw

> 
> Cc: Guo Ren <guoren@kernel.org>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
> ---
>  lib/lockref.c | 29 ++++++++++++++++++++---------
>  1 file changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/lib/lockref.c b/lib/lockref.c
> index 2afe4c5d8919..f3c30c538af1 100644
> --- a/lib/lockref.c
> +++ b/lib/lockref.c
> @@ -26,6 +26,17 @@
>  	}									\
>  } while (0)
>  
> +/*
> > + * The compiler isn't smart enough to do the count
> + * increment in the high 32 bits of the 64-bit value,
> + * so do this optimization by hand.
> + */
> +#if defined(__LITTLE_ENDIAN) && BITS_PER_LONG == 64
> + #define LOCKREF_ADD(n,x) ((n).lock_count += (unsigned long)(x)<<32)
> +#else
> + #define LOCKREF_ADD(n,x) ((n).count += (unsigned long)(x)<<32)
> +#endif
> +
>  #else
>  
>  #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
> @@ -42,7 +53,7 @@
>  void lockref_get(struct lockref *lockref)
>  {
>  	CMPXCHG_LOOP(
> -		new.count++;
> +		LOCKREF_ADD(new,1);
>  	,
>  		return;
>  	);
> @@ -63,9 +74,9 @@ int lockref_get_not_zero(struct lockref *lockref)
>  	int retval;
>  
>  	CMPXCHG_LOOP(
> -		new.count++;
>  		if (old.count <= 0)
>  			return 0;
> +		LOCKREF_ADD(new,1);
>  	,
>  		return 1;
>  	);
> @@ -91,8 +102,8 @@ int lockref_put_not_zero(struct lockref *lockref)
>  	int retval;
>  
>  	CMPXCHG_LOOP(
> -		new.count--;
> -		if (old.count <= 1)
> +		LOCKREF_ADD(new,-1);
> +		if (new.count <= 0)
>  			return 0;
>  	,
>  		return 1;
> @@ -119,8 +130,8 @@ EXPORT_SYMBOL(lockref_put_not_zero);
>  int lockref_put_return(struct lockref *lockref)
>  {
>  	CMPXCHG_LOOP(
> -		new.count--;
> -		if (old.count <= 0)
> +		LOCKREF_ADD(new,-1);
> +		if (new.count < 0)
>  			return -1;
>  	,
>  		return new.count;
> @@ -137,8 +148,8 @@ EXPORT_SYMBOL(lockref_put_return);
>  int lockref_put_or_lock(struct lockref *lockref)
>  {
>  	CMPXCHG_LOOP(
> -		new.count--;
> -		if (old.count <= 1)
> +		LOCKREF_ADD(new,-1);
> +		if (new.count <= 0)
>  			break;
>  	,
>  		return 1;
> @@ -174,9 +185,9 @@ int lockref_get_not_dead(struct lockref *lockref)
>  	int retval;
>  
>  	CMPXCHG_LOOP(
> -		new.count++;
>  		if (old.count < 0)
>  			return 0;
> +		LOCKREF_ADD(new,1);
>  	,
>  		return 1;
>  	);
> -- 
> 2.43.0.5.g38fb137bdb
> 


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-26 17:06                               ` Linus Torvalds
  2023-11-26 17:59                                 ` Linus Torvalds
@ 2023-11-29  9:52                                 ` Guo Ren
  1 sibling, 0 replies; 119+ messages in thread
From: Guo Ren @ 2023-11-29  9:52 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Sun, Nov 26, 2023 at 09:06:03AM -0800, Linus Torvalds wrote:
> On Sun, 26 Nov 2023 at 08:39, Guo Ren <guoren@kernel.org> wrote:
> >
> > Here, what I want to improve is to prevent stack frame setup in the fast
> > path, and that's the most benefit my patch could give out.
> 
> Side note: what patch do you have that avoids the stack frame setup?
> Because I still saw the stack frame even with the
> arch_spin_value_unlocked() fix and the improved code generation. The
> compiler still does
> 
>         addi    sp,sp,-32
>         sd      s0,16(sp)
>         sd      s1,8(sp)
>         sd      ra,24(sp)
>         addi    s0,sp,32
I found that the following is what affects you:

 #define CMPXCHG_LOOP(CODE, SUCCESS) do {                                       \
-       int retry = 100;                                                        \
        struct lockref old;                                                     \
        BUILD_BUG_ON(sizeof(old) != 8);                                         \
        old.lock_count = READ_ONCE(lockref->lock_count);                        \
@@ -21,11 +20,21 @@
                                                 new.lock_count))) {            \
                        SUCCESS;                                                \
                }                                                               \
-               if (!--retry)                                                   \
-                       break;                                                  \

Yes, the 'retry' count patch [1] hurts us.

[1]: https://lore.kernel.org/lkml/20190607072652.GA5522@hc/T/#m091df9dca68c27c28f8f69a72cae0e1361dba4fa

> 
> at the top of the function for me - not because of the (now fixed)
> lock value spilling, but just because it wants to save registers.
> 
> The reason seems to be that gcc isn't smart enough to delay the frame
> setup to the slow path where it then has to do the actual spinlock, so
> it has to generate a stack frame just for the return address and then
> it does the whole frame setup thing.
> 
> I was using just the risc-v defconfig (with the cmpxchg lockrefs
> enabled, and spinlock debugging disabled so that lockrefs actually do
> something), so there might be some other config thing like "force
> frame pointers" that then causes problems.
> 
> But while the current tree avoids the silly lock value spill and
> reload, and my patch improved the integer instruction selection, I
> really couldn't get rid of the stack frame entirely. The x86 code also
> ends up looking quite nice, although part of that is that the
> qspinlock test is a simple compare against zero:
> 
>   lockref_get:
>         pushq   %rbx
>         movq    %rdi, %rbx
>         movq    (%rdi), %rax
>         movl    $-100, %ecx
>         movabsq $4294967296, %rdx
>   .LBB0_1:
>         testl   %eax, %eax
>         jne     .LBB0_4
>         leaq    (%rax,%rdx), %rsi
>         lock
>         cmpxchgq        %rsi, (%rbx)
>         je      .LBB0_5
>         incl    %ecx
>         jne     .LBB0_1
>   .LBB0_4:
>         movq    %rbx, %rdi
>         callq   _raw_spin_lock
>         incl    4(%rbx)
>         movb    $0, (%rbx)
>   .LBB0_5:
>         popq    %rbx
>         retq
> 
> (That 'movabsq' thing is what generates the big constant that adds '1'
> in the upper word - that add is then done as a 'leaq').
> 
> In this case, the 'retry' count is actually a noticeable part of the
> code generation, and is probably also why it has to save/restore
> '%rbx'. Oh well. We limited the cmpxchg loop because of horrible
> issues with starvation on bad arm64 cores.  It turns out that SMP
> cacheline bouncing is hard, and if you haven't been doing it for a
> couple of decades, you'll do it wrong.
> 
> You'll find out the hard way that the same is probably true on any
> early RISC-V SMP setups. You wanting to use prefetchw is a pretty
> clear indication of the same kind of thing.

The 'retry' count is a bad solution which hides the problem. ThunderX2's
problem is mainly unnecessary cpu_relax and too little cache-line
stickiness. The AMBA 5 CHI spec's "Home behavior" section says: [2]
"When a Home(CIU/LLcache) determines that an Exclusive Store transaction
has failed, the following rules must be followed: If the Requester has
lost the cache line, then the Home is expected to send SnpPreferUniqueFwd
or SnpPreferUnique to get a copy of the cache line."
SnpPreferUnique is not SnpUnique, which means it may return a shared
cacheline under serious contention, so there is no guarantee for the
next cmpxchg.

But we want a unique cache line, right? You said: [1]
"... And once one CPU gets ownership of the line, it doesn't lose it
immediately, so the next cmpxchg will *succeed*.
So at most, the *first* cmpxchg will fail (because that's the one that
was fed not by a previous cmpxchg, but by a regular load (which we'd
*like* to do as a "load-for-ownership" load, but we don't have the
interfaces to do that). But the second cmpxchg should basically always
succeed, ..."
(Sorry, I quoted you like this.)

My argument is:
Why do we need to wait for a cmpxchg failure? You already have the
"load-for-ownership" interface: "prefetchw"!

   lockref_get:
         pushq   %rbx
  +------prefetchw (%rdi)    --------> doesn't lose it immediately,
st|				so the next cmpxchg will *succeed*
ic|						- Linus
ky|      movq    %rdi, %rbx
 t|      movq    (%rdi), %rax  ------> local acquire, comfortable!
im|      movl    $-100, %ecx
e |      movabsq $4294967296, %rdx
  |.LBB0_1:
  |      testl   %eax, %eax
  |      jne     .LBB0_4
  |      leaq    (%rax,%rdx), %rsi
  |      lock
  |      cmpxchgq        %rsi, (%rbx) --> local cas, success!
  +----- je      .LBB0_5          ------> Farewell to the slowpath!

If x86 is not a crap machine, "movq+movq+movl+movabsq+testl+jne+leaq+
cmpxchg" should be fast enough to fit within the sticky time.

The prefetchw primitive has been defined in include/linux/prefetch.h
for many years.

prefetchw is already used in generic code:
➜  linux git:(master) ✗ grep prefetchw mm/ fs/ kernel/ -r
mm/slub.c:      prefetchw(object + s->offset);
mm/slab.c:      prefetchw(objp);
mm/page_alloc.c:        prefetchw(p);
mm/page_alloc.c:                prefetchw(p + 1);
mm/vmscan.c:#define prefetchw_prev_lru_folio(_folio, _base, _field)                     \
mm/vmscan.c:                    prefetchw(&prev->_field);                       \
mm/vmscan.c:#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
mm/vmscan.c:            prefetchw_prev_lru_folio(folio, src, flags);
fs/mpage.c:             prefetchw(&folio->flags);
fs/f2fs/data.c:                 prefetchw(&page->flags);
fs/ext4/readpage.c:             prefetchw(&folio->flags);
kernel/bpf/cpumap.c:                    prefetchw(page);
kernel/locking/qspinlock.c:                     prefetchw(next);
➜  linux git:(master) ✗ grep prefetchw drivers/ -r | wc -l
80

prefetchw is fine on all good hardware, unlike the 'retry' workaround.

[1]: https://lore.kernel.org/lkml/CAHk-=wiEahkwDXpoy=-SzJHNMRXKVSjPa870+eKKenufhO_Hgw@mail.gmail.com/raw
[2]: https://kolegite.com/EE_library/datasheets_and_manuals/FPGA/AMBA/IHI0050E_a_amba_5_chi_architecture_spec.pdf

> 
>              Linus
> 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-22 19:11                               ` Linus Torvalds
  2023-11-29  7:14                                 ` Guo Ren
@ 2023-11-29 12:25                                 ` Guo Ren
  2023-11-29 14:42                                   ` Linus Torvalds
  1 sibling, 1 reply; 119+ messages in thread
From: Guo Ren @ 2023-11-29 12:25 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Wed, Nov 22, 2023 at 11:11:38AM -0800, Linus Torvalds wrote:
> On Wed, 22 Nov 2023 at 09:52, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > Still not actually tested, but the code generation on x86 looks
> > reasonable, so it might be worth looking at whether it helps the
> > RISC-V case.
> 
> Doing some more munging, and actually looking at RISC-V code
> generation too (I obviously had to enable ARCH_USE_CMPXCHG_LOCKREF for
> RISC-V).
> 
> I get this:
> 
>   lockref_get:
>         addi    sp,sp,-32
>         sd      s0,16(sp)
>         sd      s1,8(sp)
>         sd      ra,24(sp)
>         addi    s0,sp,32
>         li      a1,65536
>         ld      a5,0(a0)
>         mv      s1,a0
>         addi    a1,a1,-1
>         li      a0,100
>   .L43:
>         sext.w  a3,a5
>         li      a4,1
>         srliw   a2,a5,16
>         and     a3,a3,a1
>         slli    a4,a4,32
>         bne     a2,a3,.L49
>         add     a4,a5,a4
>   0:
>         lr.d a3, 0(s1)
>         bne a3, a5, 1f
>         sc.d.rl a2, a4, 0(s1)
>         bnez a2, 0b
>         fence rw, rw
>   1:
>         bne     a5,a3,.L52
>         ld      ra,24(sp)
>         ld      s0,16(sp)
>         ld      s1,8(sp)
>         addi    sp,sp,32
>         jr      ra
>   ...
> 
> so now that single update is indeed just one single instruction:
> 
>         add     a4,a5,a4
> 
> is that "increment count in the high 32 bits".
> 
> The ticket lock being unlocked checks are those
> 
>         li      a1,65536
>         sext.w  a3,a5
>         srliw   a2,a5,16
>         and     a3,a3,a1
>         bne     a2,a3,.L49
> 
> instructions if I read it right.
> 
> That actually looks fairly close to optimal, although the frame setup
> is kind of sad.
> 
> (The above does not include the "loop if the cmpxchg failed" part of
> the code generation)
> 
> Anyway, apart from enabling LOCKREF, the patch to get this for RISC-V
> is attached.
> 
> I'm not going to play with this any more, but you might want to check
> whether this actually does work on RISC-V.
> 
> Because I only looked at the code generation, I didn't actually look at
> whether it *worked*.
> 
>                 Linus

> From 168f35850c15468941e597907e33daacd179d54a Mon Sep 17 00:00:00 2001
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Wed, 22 Nov 2023 09:33:29 -0800
> Subject: [PATCH] lockref: improve code generation for ref updates
> 
> Our lockref data structure is two 32-bit words laid out next to each
> other, combining the spinlock and the count into one entity that can be
> accessed atomically together.
> 
> In particular, the structure is laid out so that the count is the upper
> 32 bit word (on little-endian), so that you can do basic arithmetic on
> the count in 64 bits: instead of adding one to the 32-bit word, you can
> just add a value shifted by 32 to the full 64-bit word.
> 
> Sadly, neither gcc nor clang are quite clever enough to work that out on
> their own, so this does that "manually".
> 
> Also, try to do any compares against zero values, which generally
> improves the code generation.  So rather than check that the value was
> at least 1 before a decrement, check that it's positive or zero after
> the decrement.  We don't worry about the overflow point in lockrefs.
> 
> Cc: Guo Ren <guoren@kernel.org>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
> ---
>  lib/lockref.c | 29 ++++++++++++++++++++---------
>  1 file changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/lib/lockref.c b/lib/lockref.c
> index 2afe4c5d8919..f3c30c538af1 100644
> --- a/lib/lockref.c
> +++ b/lib/lockref.c
> @@ -26,6 +26,17 @@
>  	}									\
>  } while (0)
>  
> +/*
> > + * The compiler isn't smart enough to do the count
> + * increment in the high 32 bits of the 64-bit value,
> + * so do this optimization by hand.
> + */
> +#if defined(__LITTLE_ENDIAN) && BITS_PER_LONG == 64
> + #define LOCKREF_ADD(n,x) ((n).lock_count += (unsigned long)(x)<<32)
> +#else
> + #define LOCKREF_ADD(n,x) ((n).count += (unsigned long)(x)<<32)
#define LOCKREF_ADD(n,x) ((n).count += (unsigned long)(x))
?

> +#endif
> +
>  #else
>  
>  #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
> @@ -42,7 +53,7 @@
>  void lockref_get(struct lockref *lockref)
>  {
>  	CMPXCHG_LOOP(
> -		new.count++;
> +		LOCKREF_ADD(new,1);
>  	,
>  		return;
>  	);
> @@ -63,9 +74,9 @@ int lockref_get_not_zero(struct lockref *lockref)
>  	int retval;
>  
>  	CMPXCHG_LOOP(
> -		new.count++;
>  		if (old.count <= 0)
>  			return 0;
> +		LOCKREF_ADD(new,1);
>  	,
>  		return 1;
>  	);
> @@ -91,8 +102,8 @@ int lockref_put_not_zero(struct lockref *lockref)
>  	int retval;
>  
>  	CMPXCHG_LOOP(
> -		new.count--;
> -		if (old.count <= 1)
> +		LOCKREF_ADD(new,-1);
> +		if (new.count <= 0)
>  			return 0;
>  	,
>  		return 1;
> @@ -119,8 +130,8 @@ EXPORT_SYMBOL(lockref_put_not_zero);
>  int lockref_put_return(struct lockref *lockref)
>  {
>  	CMPXCHG_LOOP(
> -		new.count--;
> -		if (old.count <= 0)
> +		LOCKREF_ADD(new,-1);
> +		if (new.count < 0)
>  			return -1;
>  	,
>  		return new.count;
> @@ -137,8 +148,8 @@ EXPORT_SYMBOL(lockref_put_return);
>  int lockref_put_or_lock(struct lockref *lockref)
>  {
>  	CMPXCHG_LOOP(
> -		new.count--;
> -		if (old.count <= 1)
> +		LOCKREF_ADD(new,-1);
> +		if (new.count <= 0)
>  			break;
>  	,
>  		return 1;
> @@ -174,9 +185,9 @@ int lockref_get_not_dead(struct lockref *lockref)
>  	int retval;
>  
>  	CMPXCHG_LOOP(
> -		new.count++;
>  		if (old.count < 0)
>  			return 0;
> +		LOCKREF_ADD(new,1);
>  	,
>  		return 1;
>  	);
> -- 
> 2.43.0.5.g38fb137bdb
> 


^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-29 12:25                                 ` Guo Ren
@ 2023-11-29 14:42                                   ` Linus Torvalds
  0 siblings, 0 replies; 119+ messages in thread
From: Linus Torvalds @ 2023-11-29 14:42 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Wed, 29 Nov 2023 at 04:25, Guo Ren <guoren@kernel.org> wrote:
>
> > +#if defined(__LITTLE_ENDIAN) && BITS_PER_LONG == 64
> > + #define LOCKREF_ADD(n,x) ((n).lock_count += (unsigned long)(x)<<32)
> > +#else
> > + #define LOCKREF_ADD(n,x) ((n).count += (unsigned long)(x)<<32)
> #define LOCKREF_ADD(n,x) ((n).count += (unsigned long)(x))
> ?

Yes. I obviously only tested the little-endian case, and the BE case
was a bit too much cut-and-paste..

             Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-26 16:51                               ` Linus Torvalds
@ 2023-11-30 10:00                                 ` Guo Ren
  2023-12-01  1:09                                   ` Linus Torvalds
  0 siblings, 1 reply; 119+ messages in thread
From: Guo Ren @ 2023-11-30 10:00 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Sun, Nov 26, 2023 at 08:51:35AM -0800, Linus Torvalds wrote:
> On Sun, 26 Nov 2023 at 08:39, Guo Ren <guoren@kernel.org> wrote:
> >
> > Here is my optimization advice:
> >
> > #define CMPXCHG_LOOP(CODE, SUCCESS) do {                                        \
> >         int retry = 100;                                                        \
> >         struct lockref old;                                                     \
> >         BUILD_BUG_ON(sizeof(old) != 8);                                         \
> > +       prefetchw(lockref);                                                     \\
> 
> No.
> 
> We're not adding software prefetches to generic code. Been there, done
> that. They *never* improve performance on good hardware. They end up
> helping on some random (usually particularly bad) microarchitecture,
> and then they hurt everybody else.
> 
> And the real optimization advice is: "don't run on crap hardware".
> 
> It really is that simple. Good hardware does OoO and sees the future write.
That needs an expensive mechanism such as DynAMO [1], but some
power-efficient cores lack that capability. Yes, powerful OoO hardware
could virtually satisfy you with a minimum number of retries, but why
couldn't we explicitly tell the hardware with a "prefetchw"?

Advanced hardware would treat cmpxchg as an interconnect transaction on
a cache miss (far atomic), which means the L3 cache wouldn't return a
unique cacheline even when the cmpxchg fails. The cmpxchg loop would
continue to read data bypassing the L1/L2 cache, which means every
failed cmpxchg is a cache-miss read. Because of the "new.count++"/CODE
data dependency, each cmpxchg request must wait for the previous one to
finish. This creates a gap between cmpxchg requests, which causes most
CPUs' cmpxchgs to keep failing under serious contention.

   cas: Compare-And-Swap

   L1&L2          L3 cache
 +------+       +-----------
 | CPU1 | wait  |
 | cas2 |------>| CPU1_cas1 --+
 +------+       |             |
 +------+       |             |
 | CPU2 | wait  |             |
 | cas2 |------>| CPU2_cas1 --+--> If queued with CPU1_cas1 CPU2_cas1
 +------+       |             |    CPU3_cas1, and most of CPUs would
 +------+       |             |    fail and retry.
 | CPU3 | wait  |             |
 | cas2 |------>| CPU3_cas1---+
 +------+       +----------

The entire system moves forward inefficiently:
 - A large number of wasted read requests from the CPUs to L3
 - High power consumption
 - Poor performance

But "far atomic" is suitable for scenarios where contention is not
particularly serious, so it is reasonable to let the software give the
hardware a hint. That hint is "prefetchw":
 - prefetchw is the preparation for a "load + cmpxchg loop"
 - prefetchw is not for a single AMO or CAS or store (see the sketch
   below)

[1] https://dl.acm.org/doi/10.1145/3579371.3589065
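
As a rough illustration of that AMO vs. "load + CAS loop" distinction
(an untested sketch with made-up helper names, not kernel code):

#include <linux/atomic.h>
#include <linux/prefetch.h>

/* A single AMO: no software prefetch wanted here. */
static inline void single_amo_inc(atomic_t *v)
{
	atomic_inc(v);
}

/* A "load + cmpxchg loop": this is where prefetchw, acting as a
 * load-for-ownership, is meant to help. */
static inline void load_cas_loop_inc(atomic_t *v)
{
	int old;

	prefetchw(v);
	old = atomic_read(v);
	while (!atomic_try_cmpxchg_relaxed(v, &old, old + 1))
		;	/* 'old' was refreshed with the observed value */
}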

> 
> > Micro-arch could give prefetchw more guarantee:
> 
> Well, in practice, they never do, and in fact they are often buggy and
> cause problems because they weren't actually tested very much.
> 
>                  Linus
> 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-11-30 10:00                                 ` Guo Ren
@ 2023-12-01  1:09                                   ` Linus Torvalds
  2023-12-01  3:36                                     ` Guo Ren
  0 siblings, 1 reply; 119+ messages in thread
From: Linus Torvalds @ 2023-12-01  1:09 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Thu, 30 Nov 2023 at 19:01, Guo Ren <guoren@kernel.org> wrote:
>
> That needs the expensive mechanism DynAMO [1], but some power-efficient
> core lacks the capability. Yes, powerful OoO hardware could virtually
> satisfy you by a minimum number of retries, but why couldn't we
> explicitly tell hardware for "prefetchw"?

Because every single time we've had a prefetch in the kernel, it has
caused problems. A bit like cpu_relax() - these things get added for
random hardware where it helps, and then a few years later it turns
out that it hurts almost everywhere else.

We've had particular problems with 'prefetch' because it turns out
that (a) nobody sane uses them so (b) hardware is often buggy. And
here "buggy" may be just performance (ie "prefetch actually stalls on
TLB lookup" etc broken behavior that means that prefetch is not even
remotely like a no-op that just hints to the cache subsystem), but
sometimes even in actual semantics (ie "prefetch causes spurious
faulting behavior")

> Advanced hardware would treat cmpxchg as interconnect transactions when
> cache miss(far atomic), which means L3 cache wouldn't return a unique
> cacheline even when cmpxchg fails. The cmpxchg loop would continue to
> read data bypassing the L1/L2 cache, which means every failure cmpxchg
> is a cache-miss read.

Honestly, I wouldn't call that "advanced hardware". I would call that
ridiculous.

If the cmpxchg isn't guaranteed to make progress, then the cmpxchg is
broken. It's really that simple.

It does sound like on your hardware, maybe you just want to make the
RISC-V cmpxchg function always do a "prefetchw" if the 'sc.d' fails,
something like

                        "0:     lr.w %0, %2\n"                          \
                        "       bne  %0, %z3, 1f\n"                     \
                        "       sc.w %1, %z4, %2\n"                     \
-                       "       bnez %1, 0b\n"                          \
+                       "       beqz %1, 1f\n"                          \
+                       "       prefetchw %2\n"                         \
+                       "       j 0b\n"                                 \
                        "1:\n"                                          \

(quick entirely untested hack, you get the idea). A better
implementation might use "asm goto" and expose the different error
cases to the compiler so that it can move things around, but I'm not
convinced it's worth the effort.

But no, we're *not* adding a prefetchw to generic code just because
apparently some RISC-V code is doing bad things. You need to keep
workarounds for RISC-V behavior to RISC-V.

And yes, the current "retry count" in our lockref implementation comes
from another "some hardware does bad things for cmpxchg". But that
workaround at most causes a few extra (regular) ALU instructions, and
while not optimal, it's at least not going to cause any bigger
problems.

           Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-12-01  1:09                                   ` Linus Torvalds
@ 2023-12-01  3:36                                     ` Guo Ren
  2023-12-01  5:15                                       ` Linus Torvalds
  0 siblings, 1 reply; 119+ messages in thread
From: Guo Ren @ 2023-12-01  3:36 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Fri, Dec 01, 2023 at 10:09:01AM +0900, Linus Torvalds wrote:
> On Thu, 30 Nov 2023 at 19:01, Guo Ren <guoren@kernel.org> wrote:
> >
> > That needs the expensive mechanism DynAMO [1], but some power-efficient
> > core lacks the capability. Yes, powerful OoO hardware could virtually
> > satisfy you by a minimum number of retries, but why couldn't we
> > explicitly tell hardware for "prefetchw"?
> 
> Because every single time we've had a prefetch in the kernel, it has
> caused problems. A bit like cpu_relax() - these things get added for
> random hardware where it helps, and then a few years later it turns
> out that it hurts almost everywhere else.
> 
> We've had particular problems with 'prefetch' because it turns out
> that (a) nobody sane uses them so (b) hardware is often buggy. And
> here "buggy" may be just performance (ie "prefetch actually stalls on
> TLB lookup" etc broken behavior that means that prefetch is not even
> remotely like a no-op that just hints to the cache subsystem), but
> sometimes even in actual semantics (ie "prefetch causes spurious
> faulting behavior")
Thanks for sharing your experience; now I understand the problem with
generic prefetchw.

But what should we do with these existing uses?
➜  linux git:(master) ✗ grep prefetchw mm/ fs/ kernel/ -r
mm/slub.c:      prefetchw(object + s->offset);
mm/slab.c:      prefetchw(objp);
mm/page_alloc.c:        prefetchw(p);
mm/page_alloc.c:                prefetchw(p + 1);
mm/vmscan.c:#define prefetchw_prev_lru_folio(_folio, _base, _field)
mm/vmscan.c:                    prefetchw(&prev->_field);
mm/vmscan.c:#define prefetchw_prev_lru_folio(_folio, _base, _field) do
mm/vmscan.c:            prefetchw_prev_lru_folio(folio, src, flags);
fs/mpage.c:             prefetchw(&folio->flags);
fs/f2fs/data.c:                 prefetchw(&page->flags);
fs/ext4/readpage.c:             prefetchw(&folio->flags);
kernel/bpf/cpumap.c:                    prefetchw(page);
kernel/locking/qspinlock.c:                     prefetchw(next);
➜  linux git:(master) ✗ grep prefetchw drivers/ -r | wc -l
80
...

> 
> > Advanced hardware would treat cmpxchg as interconnect transactions when
> > cache miss(far atomic), which means L3 cache wouldn't return a unique
> > cacheline even when cmpxchg fails. The cmpxchg loop would continue to
> > read data bypassing the L1/L2 cache, which means every failure cmpxchg
> > is a cache-miss read.
> 
> Honestly, I wouldn't call that "advanced hardware". I would call that
> ridiculous.
Ridiculous Hardware:
When CAS fails, the hardware stays in "far atomic" mode.

Correct Hardware:
When CAS fails, the hardware should switch to "near atomic", which means
acquiring an exclusive cache line and making progress.

> 
> If the cmpxchg isn't guaranteed to make progress, then the cmpxchg is
> broken. It's really that simple.
I totally agree, and it's a correct guide, Thx.

> 
> It does sound like on your hardware, maybe you just want to make the
> RISC-V cmpxchg function always do a "prefetchw" if the 'sc.d' fails,
> something like
> 
>                         "0:     lr.w %0, %2\n"                          \
>                         "       bne  %0, %z3, 1f\n"                     \
>                         "       sc.w %1, %z4, %2\n"                     \
> -                       "       bnez %1, 0b\n"                          \
> +                       "       beqz %1, 1f\n"                          \
> +                       "       prefetchw %2\n"                         \
> +                       "       j 0b\n"                                 \
>                         "1:\n"                                          \

I modified your code to also guarantee progress in the comparison-failure
case:
Final version (for easier reading):
                         "0:     lr.w %0, %2\n"                          \
                         "       bne  %0, %z3, 2f\n"                     \
                         "       sc.w %1, %z4, %2\n"                     \
                         "       beqz %1, 1f\n"                          \
                         "       prefetchw %2\n"                         \
                         "       j 0b\n"                         	 \
                         "2:\n"                                          \
                         "       prefetchw %2\n"                         \
                         "1:\n"                                          \

Diff version:
                         "0:     lr.w %0, %2\n"                          \
 -                       "       bne  %0, %z3, 1f\n"                     \
 +                       "       bne  %0, %z3, 2f\n"                     \
                         "       sc.w %1, %z4, %2\n"                     \
 -                       "       bnez %1, 0b\n"                          \
 +                       "       beqz %1, 1f\n"                          \
 +                       "       prefetchw %2\n"                         \
 +                       "       j 0b\n"                         	 \
 +                       "2:\n"                                          \
 +                       "       prefetchw %2\n"                         \
                         "1:\n"                                          \

> 
> (quick entirely untested hack, you get the idea). A better
> implementation might use "asm goto" and expose the different error
> cases to the compiler so that it can move things around, but I'm not
> convinced it's worth the effort.
> 
> But no, we're *not* adding a prefetchw to generic code just because
> apparently some RISC-V code is doing bad things. You need to keep
> workarounds for RISC-V behavior to RISC-V.
> 
> And yes, the current "retry count" in our lockref implementation comes
> from another "some hardware does bad things for cmpxchg". But that
> workaround at most causes a few extra (regular) ALU instructions, and
> while not optimal, it's at least not going to cause any bigger
> problems.
> 
>            Linus
> 

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-12-01  3:36                                     ` Guo Ren
@ 2023-12-01  5:15                                       ` Linus Torvalds
  2023-12-01  7:31                                         ` Guo Ren
  0 siblings, 1 reply; 119+ messages in thread
From: Linus Torvalds @ 2023-12-01  5:15 UTC (permalink / raw)
  To: Guo Ren; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Fri, 1 Dec 2023 at 12:36, Guo Ren <guoren@kernel.org> wrote:
>
> I modified your code to also guarantee progress in the comparison-failure
> case:

Are you sure you want to prefetch when the value doesn't even match
the existing value? Aren't you better off just looping doing just
reads until you at least have a valid value to exchange?

Otherwise you might easily find that your cmpxchg loops cause
horrendous cacheline ping-pong patterns.

Of course, if your hardware is bad at releasing the written state,
that may actually be what you want, to see changes in a timely manner.

At least some of our cmpxchg uses are the "try_cmpxchg()" pattern,
which wouldn't even loop - and won't write at all - on a value
mismatch.
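
That pattern looks roughly like this (a minimal sketch, not any particular
caller; can_update() stands for whatever makes the value usable at all,
e.g. "the embedded spinlock is not held"):

  static bool bump_if_possible(atomic_long_t *v)
  {
          long old = atomic_long_read(v);

          do {
                  if (!can_update(old))
                          return false;   /* no write, no loop */
          } while (!atomic_long_try_cmpxchg(v, &old, old + 1));

          return true;                    /* *v went from old to old + 1 */
  }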

And some of those try_cmpxchg cases are a lot more important than the
lockref code. Things like spin_trylock() etc. Of course, for best
results you might want to have an actual architecture-specific helper
for the try_cmpxchg case, and use the compiler for "outputs in
condition codes" (but then you need to have fallback cases for older
compilers that don't support it).

See the code for an example of the kinds of nasty support code you need, with

  /*
   * Macros to generate condition code outputs from inline assembly,
   * The output operand must be type "bool".
   */
  #ifdef __GCC_ASM_FLAG_OUTPUTS__
  # define CC_SET(c) "\n\t/* output condition code " #c "*/\n"
  # define CC_OUT(c) "=@cc" #c
  #else
  # define CC_SET(c) "\n\tset" #c " %[_cc_" #c "]\n"
  # define CC_OUT(c) [_cc_ ## c] "=qm"
  #endif

and then a lot of "CC_SET()/CC_OUT()" use in the inline asms in
<asm/cmpxchg.h>...
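
For a flavour of how those end up being used, something like this (a
hand-waved x86-ish sketch, not the exact kernel code):

  static inline bool my_try_cmpxchg(unsigned long *ptr, unsigned long *oldp,
                                    unsigned long new)
  {
          bool ok;
          unsigned long old = *oldp;

          asm volatile("lock; cmpxchg %[new], %[ptr]"
                       CC_SET(z)          /* ZF set <=> exchange happened */
                       : CC_OUT(z) (ok),
                         [ptr] "+m" (*ptr),
                         "+a" (old)       /* cmpxchg compares against rax */
                       : [new] "r" (new)
                       : "memory");
          if (!ok)
                  *oldp = old;            /* report the value actually seen */
          return ok;
  }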

IOW, you really should time this and then add the timing information
to whatever commit message.

             Linus

^ permalink raw reply	[flat|nested] 119+ messages in thread

* Re: lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput())
  2023-12-01  5:15                                       ` Linus Torvalds
@ 2023-12-01  7:31                                         ` Guo Ren
  0 siblings, 0 replies; 119+ messages in thread
From: Guo Ren @ 2023-12-01  7:31 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Al Viro, Peter Zijlstra, linux-fsdevel

On Fri, Dec 01, 2023 at 02:15:15PM +0900, Linus Torvalds wrote:
> On Fri, 1 Dec 2023 at 12:36, Guo Ren <guoren@kernel.org> wrote:
> >
> > I modified your code to also guarantee progress in the comparison-failure
> > case:
> 
> Are you sure you want to prefetch when the value doesn't even match
> the existing value? Aren't you better off just looping doing just
> reads until you at least have a valid value to exchange?
Oops, you are right; I was wrong there. Here is what I meant to say:
+          "       prefetch %2\n"                          \
           "0:     lr.w %0, %2\n"                          \
           "       bne  %0, %z3, 1f\n"                     \
           "       sc.w %1, %z4, %2\n"                     \
           "       beqz %1, 1f\n"                          \
           "       prefetchw %2\n"                         \
           "       j 0b\n"                                 \
           "1:\n"                                          \

Just add a prefetch of the shared cache line beforehand; that could keep
the cache line in the shared state for several cycles and ensure the outer
cmpxchg loop can make progress.

None of what we wrote here is meant as actual code; it's just a reference
for the hardware people:
 - lr could imply a sticky shared cache line.
 - sc could imply a sticky unique cache line when the sc fails.

> 
> Otherwise you might easily find that your cmpxchg loops cause
> horrendous cacheline ping-pong patterns.
> 
> Of course, if your hardware is bad at releasing the written state,
> that may actually be what you want, to see changes in a timely manner.
> 
> At least some of our cmpxchg uses are the "try_cmpxchg()" pattern,
> which wouldn't even loop - and won't write at all - on a value
> mismatch.
> 
> And some of those try_cmpxchg cases are a lot more important than the
> lockref code. Things like spin_trylock() etc. Of course, for best
> results you might want to have an actual architecture-specific helper
> for the try_cmpxchg case, and use the compiler for "outputs in
> condition codes" (but then you need to have fallback cases for older
> compilers that don't support it).
> 
> See the code for an example of the kinds of nasty support code you need, with
> 
>   /*
>    * Macros to generate condition code outputs from inline assembly,
>    * The output operand must be type "bool".
>    */
>   #ifdef __GCC_ASM_FLAG_OUTPUTS__
>   # define CC_SET(c) "\n\t/* output condition code " #c "*/\n"
>   # define CC_OUT(c) "=@cc" #c
>   #else
>   # define CC_SET(c) "\n\tset" #c " %[_cc_" #c "]\n"
>   # define CC_OUT(c) [_cc_ ## c] "=qm"
>   #endif
> 
> and then a lot of "CC_SET()/CC_OUT()" use in the inline asms in
> <asm/cmpxchg.h>...
Thanks for the tip. It's helpful for the try_cmpxchg optimization.

> 
> IOW, you really should time this and then add the timing information
> to whatever commit message.
> 
>              Linus
> 

^ permalink raw reply	[flat|nested] 119+ messages in thread

end of thread, other threads:[~2023-12-01  7:31 UTC | newest]

Thread overview: 119+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-30  0:37 [RFC] simplifying fast_dput(), dentry_kill() et.al Al Viro
2023-10-30 21:53 ` Al Viro
2023-10-30 22:18   ` Linus Torvalds
2023-10-31  0:18     ` Al Viro
2023-10-31  1:53       ` Al Viro
2023-10-31  6:12         ` Al Viro
2023-11-01  6:18           ` Al Viro
2023-11-01  6:20           ` [PATCH 01/15] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
2023-11-01  6:20             ` [PATCH 02/15] fast_dput(): handle underflows gracefully Al Viro
2023-11-01  6:20             ` [PATCH 03/15] fast_dput(): new rules for refcount Al Viro
2023-11-01  6:20             ` [PATCH 04/15] __dput_to_list(): do decrement of refcount in the caller Al Viro
2023-11-01  6:20             ` [PATCH 05/15] retain_dentry(): lift decrement of ->d_count into callers Al Viro
2023-11-01  6:20             ` [PATCH 06/15] __dentry_kill(): get consistent rules for ->d_count Al Viro
2023-11-01  6:20             ` [PATCH 07/15] dentry_kill(): don't bother with retain_dentry() on slow path Al Viro
2023-11-01  6:20             ` [PATCH 08/15] Call retain_dentry() with refcount 0 Al Viro
2023-11-01  6:20             ` [PATCH 09/15] fold the call of retain_dentry() into fast_dput() Al Viro
2023-11-01  8:45               ` Al Viro
2023-11-01 17:30                 ` Linus Torvalds
2023-11-01 18:19                   ` Al Viro
2023-11-10  4:20                     ` lockless case of retain_dentry() (was Re: [PATCH 09/15] fold the call of retain_dentry() into fast_dput()) Al Viro
2023-11-10  5:57                       ` Linus Torvalds
2023-11-10  6:22                         ` Linus Torvalds
2023-11-22  6:29                           ` Guo Ren
2023-11-10  8:19                         ` Al Viro
2023-11-22  7:19                         ` Guo Ren
2023-11-22 17:20                           ` Linus Torvalds
2023-11-22 17:52                             ` Linus Torvalds
2023-11-22 18:05                               ` Linus Torvalds
2023-11-22 19:11                               ` Linus Torvalds
2023-11-29  7:14                                 ` Guo Ren
2023-11-29 12:25                                 ` Guo Ren
2023-11-29 14:42                                   ` Linus Torvalds
2023-11-26 16:39                             ` Guo Ren
2023-11-26 16:51                               ` Linus Torvalds
2023-11-30 10:00                                 ` Guo Ren
2023-12-01  1:09                                   ` Linus Torvalds
2023-12-01  3:36                                     ` Guo Ren
2023-12-01  5:15                                       ` Linus Torvalds
2023-12-01  7:31                                         ` Guo Ren
2023-11-26 16:51                               ` Guo Ren
2023-11-26 17:06                               ` Linus Torvalds
2023-11-26 17:59                                 ` Linus Torvalds
2023-11-29  9:52                                 ` Guo Ren
2023-11-01  6:20             ` [PATCH 10/15] don't try to cut corners in shrink_lock_dentry() Al Viro
2023-11-01  6:21             ` [PATCH 11/15] fold dentry_kill() into dput() Al Viro
2023-11-01  6:21             ` [PATCH 12/15] get rid of __dget() Al Viro
2023-11-01  6:21             ` [PATCH 13/15] shrink_dentry_list(): no need to check that dentry refcount is marked dead Al Viro
2023-11-01  6:21             ` [PATCH 14/15] to_shrink_list(): call only if refcount is 0 Al Viro
2023-11-01  6:21             ` [PATCH 15/15] switch select_collect{,2}() to use of to_shrink_list() Al Viro
2023-11-01  2:22       ` [RFC] simplifying fast_dput(), dentry_kill() et.al Al Viro
2023-11-01 14:29         ` Benjamin Coddington
2023-11-05 19:54       ` Al Viro
2023-11-05 21:59         ` Al Viro
2023-11-06  5:53         ` Al Viro
2023-11-07  2:08           ` Al Viro
2023-11-09  6:19             ` [RFC][PATCHSET v2] " Al Viro
2023-11-09  6:20               ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Al Viro
2023-11-09  6:20                 ` [PATCH 02/22] switch nfsd_client_rmdir() to use of simple_recursive_removal() Al Viro
2023-11-09 13:42                   ` Christian Brauner
2023-11-09 14:01                   ` Chuck Lever
2023-11-09 18:47                     ` Al Viro
2023-11-09 18:50                       ` Chuck Lever III
2023-11-09  6:20                 ` [PATCH 03/22] coda_flag_children(): cope with dentries turning negative Al Viro
2023-11-09 13:43                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 04/22] dentry: switch the lists of children to hlist Al Viro
2023-11-09 13:48                   ` Christian Brauner
2023-11-09 19:32                     ` Al Viro
2023-11-09  6:20                 ` [PATCH 05/22] centralize killing dentry from shrink list Al Viro
2023-11-09 13:49                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 06/22] get rid of __dget() Al Viro
2023-11-09 13:50                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 07/22] shrink_dentry_list(): no need to check that dentry refcount is marked dead Al Viro
2023-11-09 13:53                   ` Christian Brauner
2023-11-09 20:28                     ` Al Viro
2023-11-09  6:20                 ` [PATCH 08/22] fast_dput(): having ->d_delete() is not reason to delay refcount decrement Al Viro
2023-11-09 13:58                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 09/22] fast_dput(): handle underflows gracefully Al Viro
2023-11-09 14:46                   ` Christian Brauner
2023-11-09 20:39                     ` Al Viro
2023-11-09  6:20                 ` [PATCH 10/22] fast_dput(): new rules for refcount Al Viro
2023-11-09 14:54                   ` Christian Brauner
2023-11-09 20:52                     ` Al Viro
2023-11-09  6:20                 ` [PATCH 11/22] __dput_to_list(): do decrement of refcount in the callers Al Viro
2023-11-09 15:21                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 12/22] Make retain_dentry() neutral with respect to refcounting Al Viro
2023-11-09 15:22                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 13/22] __dentry_kill(): get consistent rules for victim's refcount Al Viro
2023-11-09 15:27                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 14/22] dentry_kill(): don't bother with retain_dentry() on slow path Al Viro
2023-11-09 15:53                   ` Christian Brauner
2023-11-09 21:29                     ` Al Viro
2023-11-09  6:20                 ` [PATCH 15/22] Call retain_dentry() with refcount 0 Al Viro
2023-11-09 16:09                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 16/22] fold the call of retain_dentry() into fast_dput() Al Viro
2023-11-09 16:17                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 17/22] don't try to cut corners in shrink_lock_dentry() Al Viro
2023-11-09 17:20                   ` Christian Brauner
2023-11-09 21:45                     ` Al Viro
2023-11-10  9:07                       ` Christian Brauner
2023-11-09 17:39                   ` Linus Torvalds
2023-11-09 18:11                     ` Linus Torvalds
2023-11-09 18:20                     ` Al Viro
2023-11-09  6:20                 ` [PATCH 18/22] fold dentry_kill() into dput() Al Viro
2023-11-09 17:22                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 19/22] to_shrink_list(): call only if refcount is 0 Al Viro
2023-11-09 17:29                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 20/22] switch select_collect{,2}() to use of to_shrink_list() Al Viro
2023-11-09 17:31                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 21/22] d_prune_aliases(): use a shrink list Al Viro
2023-11-09 17:33                   ` Christian Brauner
2023-11-09  6:20                 ` [PATCH 22/22] __dentry_kill(): new locking scheme Al Viro
2023-11-10 13:34                   ` Christian Brauner
2023-11-09 13:33                 ` [PATCH 01/22] struct dentry: get rid of randomize_layout idiocy Christian Brauner
2023-10-31  2:25     ` [RFC] simplifying fast_dput(), dentry_kill() et.al Gao Xiang
2023-10-31  2:29       ` Gao Xiang
2023-10-31  3:02       ` Linus Torvalds
2023-10-31  3:13         ` Gao Xiang
2023-10-31  3:26         ` Al Viro
2023-10-31  3:41           ` Linus Torvalds
