All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] epoll: Fix spurious lockdep warnings
@ 2011-08-09 18:11 Nelson Elhage
  2011-09-08  0:04 ` Josh Boyer
  0 siblings, 1 reply; 13+ messages in thread
From: Nelson Elhage @ 2011-08-09 18:11 UTC (permalink / raw)
  To: Alexander Viro
  Cc: Davide Libenzi, linux-kernel, Josh Boyer, linux-fsdevel,
	Paul Bolle, Nelson Elhage, stable

epoll can acquire recursively acquire ep->mtx on multiple "struct
eventpoll"s at once in the case where one epoll fd is monitoring
another epoll fd. This is perfectly OK, since we're careful about the
lock ordering, but it causes spurious lockdep warnings. Annotate the
recursion using mutex_lock_nested, and add a comment explaining the
nesting rules for good measure.

Recent versions of systemd are triggering this, and it can also be
demonstrated with the following trivial test program:

--------------------8<--------------------

int main(void) {
   int e1, e2;
   struct epoll_event evt = {
       .events = EPOLLIN
   };

   e1 = epoll_create1(0);
   e2 = epoll_create1(0);
   epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
   return 0;
}
--------------------8<--------------------

Cc: stable@kernel.org
Reported-by: Paul Bolle <pebolle@tiscali.nl>
Tested-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Nelson Elhage <nelhage@nelhage.com>
---
 fs/eventpoll.c |   25 ++++++++++++++++++-------
 1 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f9cfd16..2acaf60 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -70,6 +70,15 @@
  * simultaneous inserts (A into B and B into A) from racing and
  * constructing a cycle without either insert observing that it is
  * going to.
+ * It is necessary to acquire multiple "ep->mtx"es at once in the
+ * case when one epoll fd is added to another. In this case, we
+ * always acquire the locks in the order of nesting (i.e. after
+ * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
+ * before e2->mtx). Since we disallow cycles of epoll file
+ * descriptors, this ensures that the mutexes are well-ordered. In
+ * order to communicate this nesting to lockdep, when walking a tree
+ * of epoll file descriptors, we use the current recursion depth as
+ * the lockdep subkey.
  * It is possible to drop the "ep->mtx" and to use the global
  * mutex "epmutex" (together with "ep->lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
  * @ep: Pointer to the epoll private data structure.
  * @sproc: Pointer to the scan callback.
  * @priv: Private opaque data passed to the @sproc callback.
+ * @depth: The current depth of recursive f_op->poll calls.
  *
  * Returns: The same integer error code returned by the @sproc callback.
  */
 static int ep_scan_ready_list(struct eventpoll *ep,
 			      int (*sproc)(struct eventpoll *,
 					   struct list_head *, void *),
-			      void *priv)
+			      void *priv,
+			      int depth)
 {
 	int error, pwake = 0;
 	unsigned long flags;
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	 * We need to lock this because we could be hit by
 	 * eventpoll_release_file() and epoll_ctl().
 	 */
-	mutex_lock(&ep->mtx);
+	mutex_lock_nested(&ep->mtx, depth);
 
 	/*
 	 * Steal the ready list, and re-init the original one to the
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 
 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
 {
-	return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
+	return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
 }
 
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
 
 		ep = epi->ep;
 		list_del_init(&epi->fllink);
-		mutex_lock(&ep->mtx);
+		mutex_lock_nested(&ep->mtx, 0);
 		ep_remove(ep, epi);
 		mutex_unlock(&ep->mtx);
 	}
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
 	esed.maxevents = maxevents;
 	esed.events = events;
 
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
 }
 
 static inline struct timespec ep_set_mstimeout(long ms)
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
 	struct rb_node *rbp;
 	struct epitem *epi;
 
-	mutex_lock(&ep->mtx);
+	mutex_lock_nested(&ep->mtx, call_nests + 1);
 	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 		if (unlikely(is_file_epoll(epi->ffd.file))) {
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	}
 
 
-	mutex_lock(&ep->mtx);
+	mutex_lock_nested(&ep->mtx, 0);
 
 	/*
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
-- 
1.7.4.44.gf9e72


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings
  2011-08-09 18:11 [PATCH] epoll: Fix spurious lockdep warnings Nelson Elhage
@ 2011-09-08  0:04 ` Josh Boyer
  2011-09-13 20:22   ` Jason Baron
  0 siblings, 1 reply; 13+ messages in thread
From: Josh Boyer @ 2011-09-08  0:04 UTC (permalink / raw)
  To: Nelson Elhage
  Cc: Alexander Viro, Davide Libenzi, linux-kernel, linux-fsdevel,
	Paul Bolle, stable

On Tue, Aug 09, 2011 at 02:11:55PM -0400, Nelson Elhage wrote:
> epoll can acquire recursively acquire ep->mtx on multiple "struct
> eventpoll"s at once in the case where one epoll fd is monitoring
> another epoll fd. This is perfectly OK, since we're careful about the
> lock ordering, but it causes spurious lockdep warnings. Annotate the
> recursion using mutex_lock_nested, and add a comment explaining the
> nesting rules for good measure.
> 
> Recent versions of systemd are triggering this, and it can also be
> demonstrated with the following trivial test program:
> 
> --------------------8<--------------------
> 
> int main(void) {
>    int e1, e2;
>    struct epoll_event evt = {
>        .events = EPOLLIN
>    };
> 
>    e1 = epoll_create1(0);
>    e2 = epoll_create1(0);
>    epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
>    return 0;
> }
> --------------------8<--------------------
> 
> Cc: stable@kernel.org
> Reported-by: Paul Bolle <pebolle@tiscali.nl>
> Tested-by: Paul Bolle <pebolle@tiscali.nl>
> Signed-off-by: Nelson Elhage <nelhage@nelhage.com>

Any progress on this heading upstream?

josh

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings
  2011-09-08  0:04 ` Josh Boyer
@ 2011-09-13 20:22   ` Jason Baron
  2011-09-13 21:16     ` Andrew Morton
  0 siblings, 1 reply; 13+ messages in thread
From: Jason Baron @ 2011-09-13 20:22 UTC (permalink / raw)
  To: Josh Boyer
  Cc: Nelson Elhage, Alexander Viro, Davide Libenzi, linux-kernel,
	linux-fsdevel, Paul Bolle, stable, akpm

On Wed, Sep 07, 2011 at 08:04:29PM -0400, Josh Boyer wrote:
> On Tue, Aug 09, 2011 at 02:11:55PM -0400, Nelson Elhage wrote:
> > epoll can acquire recursively acquire ep->mtx on multiple "struct
> > eventpoll"s at once in the case where one epoll fd is monitoring
> > another epoll fd. This is perfectly OK, since we're careful about the
> > lock ordering, but it causes spurious lockdep warnings. Annotate the
> > recursion using mutex_lock_nested, and add a comment explaining the
> > nesting rules for good measure.
> > 
> > Recent versions of systemd are triggering this, and it can also be
> > demonstrated with the following trivial test program:
> > 
> > --------------------8<--------------------
> > 
> > int main(void) {
> >    int e1, e2;
> >    struct epoll_event evt = {
> >        .events = EPOLLIN
> >    };
> > 
> >    e1 = epoll_create1(0);
> >    e2 = epoll_create1(0);
> >    epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
> >    return 0;
> > }
> > --------------------8<--------------------
> > 
> > Cc: stable@kernel.org
> > Reported-by: Paul Bolle <pebolle@tiscali.nl>
> > Tested-by: Paul Bolle <pebolle@tiscali.nl>
> > Signed-off-by: Nelson Elhage <nelhage@nelhage.com>
> 
> Any progress on this heading upstream?
> 

Patch looks good to me, feel free to add:

Acked-by: Jason Baron <jbaron@redhat.com>

However, I am going to have to re-base the epoll path I recently posted:
https://lkml.org/lkml/2011/9/2/295, if this goes in first. Perhaps,
Andrew (added to the 'cc), can help us sort out the ordering...

Thanks,

-Jason

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings
  2011-09-13 20:22   ` Jason Baron
@ 2011-09-13 21:16     ` Andrew Morton
  2011-09-15 15:15       ` Jason Baron
  0 siblings, 1 reply; 13+ messages in thread
From: Andrew Morton @ 2011-09-13 21:16 UTC (permalink / raw)
  To: Jason Baron
  Cc: Josh Boyer, Nelson Elhage, Alexander Viro, Davide Libenzi,
	linux-kernel, linux-fsdevel, Paul Bolle, stable

On Tue, 13 Sep 2011 16:22:48 -0400
Jason Baron <jbaron@redhat.com> wrote:

> On Wed, Sep 07, 2011 at 08:04:29PM -0400, Josh Boyer wrote:
> > On Tue, Aug 09, 2011 at 02:11:55PM -0400, Nelson Elhage wrote:
> > > epoll can acquire recursively acquire ep->mtx on multiple "struct
> > > eventpoll"s at once in the case where one epoll fd is monitoring
> > > another epoll fd. This is perfectly OK, since we're careful about the
> > > lock ordering, but it causes spurious lockdep warnings. Annotate the
> > > recursion using mutex_lock_nested, and add a comment explaining the
> > > nesting rules for good measure.
> > > 
> > > Recent versions of systemd are triggering this, and it can also be
> > > demonstrated with the following trivial test program:
> > > 
> > > --------------------8<--------------------
> > > 
> > > int main(void) {
> > >    int e1, e2;
> > >    struct epoll_event evt = {
> > >        .events = EPOLLIN
> > >    };
> > > 
> > >    e1 = epoll_create1(0);
> > >    e2 = epoll_create1(0);
> > >    epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
> > >    return 0;
> > > }
> > > --------------------8<--------------------
> > > 
> > > Cc: stable@kernel.org
> > > Reported-by: Paul Bolle <pebolle@tiscali.nl>
> > > Tested-by: Paul Bolle <pebolle@tiscali.nl>
> > > Signed-off-by: Nelson Elhage <nelhage@nelhage.com>
> > 
> > Any progress on this heading upstream?
> > 
> 
> Patch looks good to me, feel free to add:
> 
> Acked-by: Jason Baron <jbaron@redhat.com>
> 
> However, I am going to have to re-base the epoll path I recently posted:
> https://lkml.org/lkml/2011/9/2/295, if this goes in first. Perhaps,
> Andrew (added to the 'cc), can help us sort out the ordering...

I have already fixed up epoll-limit-paths.patch.  You're planning on
sending a new version of that patch.  Please do base that on
epoll-fix-spurious-lockdep-warnings.patch.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings
  2011-09-13 21:16     ` Andrew Morton
@ 2011-09-15 15:15       ` Jason Baron
  0 siblings, 0 replies; 13+ messages in thread
From: Jason Baron @ 2011-09-15 15:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Josh Boyer, Nelson Elhage, Alexander Viro, Davide Libenzi,
	linux-kernel, linux-fsdevel, Paul Bolle, stable

On Tue, Sep 13, 2011 at 02:16:03PM -0700, Andrew Morton wrote:
> On Tue, 13 Sep 2011 16:22:48 -0400
> Jason Baron <jbaron@redhat.com> wrote:
> 
> > On Wed, Sep 07, 2011 at 08:04:29PM -0400, Josh Boyer wrote:
> > > On Tue, Aug 09, 2011 at 02:11:55PM -0400, Nelson Elhage wrote:
> > > > epoll can acquire recursively acquire ep->mtx on multiple "struct
> > > > eventpoll"s at once in the case where one epoll fd is monitoring
> > > > another epoll fd. This is perfectly OK, since we're careful about the
> > > > lock ordering, but it causes spurious lockdep warnings. Annotate the
> > > > recursion using mutex_lock_nested, and add a comment explaining the
> > > > nesting rules for good measure.
> > > > 
> > > > Recent versions of systemd are triggering this, and it can also be
> > > > demonstrated with the following trivial test program:
> > > > 
> > > > --------------------8<--------------------
> > > > 
> > > > int main(void) {
> > > >    int e1, e2;
> > > >    struct epoll_event evt = {
> > > >        .events = EPOLLIN
> > > >    };
> > > > 
> > > >    e1 = epoll_create1(0);
> > > >    e2 = epoll_create1(0);
> > > >    epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
> > > >    return 0;
> > > > }
> > > > --------------------8<--------------------
> > > > 
> > > > Cc: stable@kernel.org
> > > > Reported-by: Paul Bolle <pebolle@tiscali.nl>
> > > > Tested-by: Paul Bolle <pebolle@tiscali.nl>
> > > > Signed-off-by: Nelson Elhage <nelhage@nelhage.com>
> > > 
> > > Any progress on this heading upstream?
> > > 
> > 
> > Patch looks good to me, feel free to add:
> > 
> > Acked-by: Jason Baron <jbaron@redhat.com>
> > 
> > However, I am going to have to re-base the epoll path I recently posted:
> > https://lkml.org/lkml/2011/9/2/295, if this goes in first. Perhaps,
> > Andrew (added to the 'cc), can help us sort out the ordering...
> 
> I have already fixed up epoll-limit-paths.patch.  You're planning on
> sending a new version of that patch.  Please do base that on
> epoll-fix-spurious-lockdep-warnings.patch.

Ok, where can I get at these patches? With:
http://userweb.kernel.org/~akpm/mmotm/, still down.

Thanks,

-Jason

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings.
  2011-08-09 15:11   ` Josh Boyer
@ 2011-08-09 17:36     ` Nelson Elhage
  0 siblings, 0 replies; 13+ messages in thread
From: Nelson Elhage @ 2011-08-09 17:36 UTC (permalink / raw)
  To: Josh Boyer
  Cc: Nelson Elhage, Paul Bolle, Alexander Viro, linux-fsdevel, Davide Libenzi

I think so. I'll resend to upstream for inclusion in 3.1 (CC stable)
with Paul's comments later today.

- Nelson

On Tue, Aug 9, 2011 at 11:11 AM, Josh Boyer <jwboyer@redhat.com> wrote:
> On Sat, Jul 30, 2011 at 06:30:22PM -0400, Nelson Elhage wrote:
>> epoll can acquire multiple ep->mutex on multiple "struct eventpoll"s
>> at once in the case where one epoll fd is monitoring another epoll
>> fd. This is perfectly OK, since we're careful about the lock ordering,
>> but causes spurious lockdep warnings. Annotate the recursion using
>> mutex_lock_nested, and add a comment explaining the nesting rules for
>> good measure.
>>
>> Reported-by: Paul Bolle <pebolle@tiscali.nl>
>> Signed-off-by: Nelson Elhage <nelhage@nelhage.com>
>
> We were seeing lockdep warnings for this in rawhide until I applied the
> patch.  Would it be possible to get it into 3.1 still?
>
> josh
>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings.
  2011-07-30 22:30 ` [PATCH] epoll: Fix spurious lockdep warnings Nelson Elhage
                     ` (2 preceding siblings ...)
  2011-07-31 21:48   ` Paul Bolle
@ 2011-08-09 15:11   ` Josh Boyer
  2011-08-09 17:36     ` Nelson Elhage
  3 siblings, 1 reply; 13+ messages in thread
From: Josh Boyer @ 2011-08-09 15:11 UTC (permalink / raw)
  To: Nelson Elhage
  Cc: Paul Bolle, Alexander Viro, linux-fsdevel, Davide Libenzi, Nelson Elhage

On Sat, Jul 30, 2011 at 06:30:22PM -0400, Nelson Elhage wrote:
> epoll can acquire multiple ep->mutex on multiple "struct eventpoll"s
> at once in the case where one epoll fd is monitoring another epoll
> fd. This is perfectly OK, since we're careful about the lock ordering,
> but causes spurious lockdep warnings. Annotate the recursion using
> mutex_lock_nested, and add a comment explaining the nesting rules for
> good measure.
> 
> Reported-by: Paul Bolle <pebolle@tiscali.nl>
> Signed-off-by: Nelson Elhage <nelhage@nelhage.com>

We were seeing lockdep warnings for this in rawhide until I applied the
patch.  Would it be possible to get it into 3.1 still?

josh

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings.
  2011-07-30 22:30 ` [PATCH] epoll: Fix spurious lockdep warnings Nelson Elhage
  2011-07-31 15:06   ` Paul Bolle
  2011-07-31 21:36   ` Paul Bolle
@ 2011-07-31 21:48   ` Paul Bolle
  2011-08-09 15:11   ` Josh Boyer
  3 siblings, 0 replies; 13+ messages in thread
From: Paul Bolle @ 2011-07-31 21:48 UTC (permalink / raw)
  To: Nelson Elhage
  Cc: Alexander Viro, linux-fsdevel, Davide Libenzi, Nelson Elhage

On Sun, 2011-07-31 at 23:36 +0200, Paul Bolle wrote:
> Also works as expected with systemd-30 (which is apparently the first
> version that uses recursive epoll instances in systemd-logind).

I forgot to add that maybe (the final version of) this patch should be
sent to "stable". Not sure for which previous releases it could be
relevant, though.


Paul Bolle


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings.
  2011-07-31 15:16     ` Nelson Elhage
@ 2011-07-31 21:39       ` Paul Bolle
  0 siblings, 0 replies; 13+ messages in thread
From: Paul Bolle @ 2011-07-31 21:39 UTC (permalink / raw)
  To: Nelson Elhage; +Cc: Alexander Viro, linux-fsdevel, Davide Libenzi

On Sun, 2011-07-31 at 11:16 -0400, Nelson Elhage wrote:
> Sure -- it's quite simple if you've worked with epoll before. This
> cut-down version is even simpler than the previous one I had, and I'd
> be happy to add it to the commit message.
> 
> --------------------8<--------------------
> #include <sys/epoll.h>
> 
> int main(void) {
>     int e1, e2;
>     struct epoll_event evt = {
>         .events = EPOLLIN
>     };
> 
>     e1 = epoll_create1(0);
>     e2 = epoll_create1(0);
>     epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
>     return 0;
> }
> --------------------8<--------------------

That looks rather obvious. It's, well, interesting to see how I managed
to miss that.

Thanks,


Paul Bolle


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings.
  2011-07-30 22:30 ` [PATCH] epoll: Fix spurious lockdep warnings Nelson Elhage
  2011-07-31 15:06   ` Paul Bolle
@ 2011-07-31 21:36   ` Paul Bolle
  2011-07-31 21:48   ` Paul Bolle
  2011-08-09 15:11   ` Josh Boyer
  3 siblings, 0 replies; 13+ messages in thread
From: Paul Bolle @ 2011-07-31 21:36 UTC (permalink / raw)
  To: Nelson Elhage
  Cc: Alexander Viro, linux-fsdevel, Davide Libenzi, Nelson Elhage

On Sat, 2011-07-30 at 18:30 -0400, Nelson Elhage wrote:
0) Nit: drop the period at the end of the summary

> epoll can acquire multiple ep->mutex on multiple "struct eventpoll"s

1) Ditto: it's "ep->mtx", but perhaps better something like "[...]
recursive mutexes on [...]" 

> Paul, are you able to test systemd against this?

Also works as expected with systemd-30 (which is apparently the first
version that uses recursive epoll instances in systemd-logind).

Thanks,


Paul Bolle


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings.
  2011-07-31 15:06   ` Paul Bolle
@ 2011-07-31 15:16     ` Nelson Elhage
  2011-07-31 21:39       ` Paul Bolle
  0 siblings, 1 reply; 13+ messages in thread
From: Nelson Elhage @ 2011-07-31 15:16 UTC (permalink / raw)
  To: Paul Bolle; +Cc: Alexander Viro, linux-fsdevel, Davide Libenzi

Sure -- it's quite simple if you've worked with epoll before. This
cut-down version is even simpler than the previous one I had, and I'd
be happy to add it to the commit message.

--------------------8<--------------------
#include <sys/epoll.h>

int main(void) {
    int e1, e2;
    struct epoll_event evt = {
        .events = EPOLLIN
    };

    e1 = epoll_create1(0);
    e2 = epoll_create1(0);
    epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);
    return 0;
}
--------------------8<--------------------

- Nelson

On Sun, Jul 31, 2011 at 05:06:10PM +0200, Paul Bolle wrote:
> On Sat, 2011-07-30 at 18:30 -0400, Nelson Elhage wrote:
> >  I've tested this on a synthetic epoll test case, that just adds e1 to
> >  e2 and then does an epoll_wait(). I verified that it caused lockdep
> >  problems on 3.0 and that this patch fixed it, but I haven't done more
> >  extensive testing.
> 
> I was unable to come up with such a test case myself. Could you perhaps
> share it?
> 
> (Maybe that test case could even be added tot the commit message. I seem
> to remember an earlier commit that you were involved with which had a
> test case added. That helped me understand eventpoll's interface - at
> least enough to pinpoint the problem. Looking at a test case is much
> easier than grepping through a program like systemd. Issues in non-test
> case programs tend to increase the bug hunting challenge: one is faced
> with an issue in an interface one hasn't used before triggered by a
> program one hasn't studied before.)
> 
> > Paul, are you able to test systemd against this?
> 
> I hope to do so shortly (ie, in the next 24 hours).
> 
> 
> Paul Bolle
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] epoll: Fix spurious lockdep warnings.
  2011-07-30 22:30 ` [PATCH] epoll: Fix spurious lockdep warnings Nelson Elhage
@ 2011-07-31 15:06   ` Paul Bolle
  2011-07-31 15:16     ` Nelson Elhage
  2011-07-31 21:36   ` Paul Bolle
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Paul Bolle @ 2011-07-31 15:06 UTC (permalink / raw)
  To: Nelson Elhage
  Cc: Alexander Viro, linux-fsdevel, Davide Libenzi, Nelson Elhage

On Sat, 2011-07-30 at 18:30 -0400, Nelson Elhage wrote:
>  I've tested this on a synthetic epoll test case, that just adds e1 to
>  e2 and then does an epoll_wait(). I verified that it caused lockdep
>  problems on 3.0 and that this patch fixed it, but I haven't done more
>  extensive testing.

I was unable to come up with such a test case myself. Could you perhaps
share it?

(Maybe that test case could even be added tot the commit message. I seem
to remember an earlier commit that you were involved with which had a
test case added. That helped me understand eventpoll's interface - at
least enough to pinpoint the problem. Looking at a test case is much
easier than grepping through a program like systemd. Issues in non-test
case programs tend to increase the bug hunting challenge: one is faced
with an issue in an interface one hasn't used before triggered by a
program one hasn't studied before.)

> Paul, are you able to test systemd against this?

I hope to do so shortly (ie, in the next 24 hours).


Paul Bolle


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] epoll: Fix spurious lockdep warnings.
  2011-07-30 21:25 recursive locking: epoll Nelson Elhage
@ 2011-07-30 22:30 ` Nelson Elhage
  2011-07-31 15:06   ` Paul Bolle
                     ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Nelson Elhage @ 2011-07-30 22:30 UTC (permalink / raw)
  To: Paul Bolle; +Cc: Alexander Viro, linux-fsdevel, Davide Libenzi, Nelson Elhage

epoll can acquire multiple ep->mutex on multiple "struct eventpoll"s
at once in the case where one epoll fd is monitoring another epoll
fd. This is perfectly OK, since we're careful about the lock ordering,
but causes spurious lockdep warnings. Annotate the recursion using
mutex_lock_nested, and add a comment explaining the nesting rules for
good measure.

Reported-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Nelson Elhage <nelhage@nelhage.com>
---
 I've tested this on a synthetic epoll test case, that just adds e1 to
 e2 and then does an epoll_wait(). I verified that it caused lockdep
 problems on 3.0 and that this patch fixed it, but I haven't done more
 extensive testing. Paul, are you able to test systemd against this?

 fs/eventpoll.c |   25 ++++++++++++++++++-------
 1 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f9cfd16..0cb7bc6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -76,6 +76,15 @@
  * Events that require holding "epmutex" are very rare, while for
  * normal operations the epoll private "ep->mtx" will guarantee
  * a better scalability.
+ * It is possible to acquire multiple "ep->mtx"es at once in the case
+ * when one epoll fd is added to another. In this case, we always
+ * acquire the locks in the order of nesting (i.e. after epoll_ctl(e1,
+ * EPOLL_CTL_ADD, e2), e1->mtx will always be acquired before
+ * e2->mtx). Since we disallow cycles of epoll file descriptors, this
+ * ensures that the mutexes are well-ordered. In order to communicate
+ * this nesting to lockdep, when walking a tree of epoll file
+ * descriptors, we use the current recursion depth as the lockdep
+ * subkey.
  */
 
 /* Epoll private bits inside the event mask */
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
  * @ep: Pointer to the epoll private data structure.
  * @sproc: Pointer to the scan callback.
  * @priv: Private opaque data passed to the @sproc callback.
+ * @depth: The current depth of recursive f_op->poll calls.
  *
  * Returns: The same integer error code returned by the @sproc callback.
  */
 static int ep_scan_ready_list(struct eventpoll *ep,
 			      int (*sproc)(struct eventpoll *,
 					   struct list_head *, void *),
-			      void *priv)
+			      void *priv,
+			      int depth)
 {
 	int error, pwake = 0;
 	unsigned long flags;
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	 * We need to lock this because we could be hit by
 	 * eventpoll_release_file() and epoll_ctl().
 	 */
-	mutex_lock(&ep->mtx);
+	mutex_lock_nested(&ep->mtx, depth);
 
 	/*
 	 * Steal the ready list, and re-init the original one to the
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 
 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
 {
-	return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
+	return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
 }
 
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
 
 		ep = epi->ep;
 		list_del_init(&epi->fllink);
-		mutex_lock(&ep->mtx);
+		mutex_lock_nested(&ep->mtx, 0);
 		ep_remove(ep, epi);
 		mutex_unlock(&ep->mtx);
 	}
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
 	esed.maxevents = maxevents;
 	esed.events = events;
 
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
 }
 
 static inline struct timespec ep_set_mstimeout(long ms)
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
 	struct rb_node *rbp;
 	struct epitem *epi;
 
-	mutex_lock(&ep->mtx);
+	mutex_lock_nested(&ep->mtx, call_nests + 1);
 	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 		if (unlikely(is_file_epoll(epi->ffd.file))) {
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	}
 
 
-	mutex_lock(&ep->mtx);
+	mutex_lock_nested(&ep->mtx, 0);
 
 	/*
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2011-09-15 15:16 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-09 18:11 [PATCH] epoll: Fix spurious lockdep warnings Nelson Elhage
2011-09-08  0:04 ` Josh Boyer
2011-09-13 20:22   ` Jason Baron
2011-09-13 21:16     ` Andrew Morton
2011-09-15 15:15       ` Jason Baron
  -- strict thread matches above, loose matches on Subject: below --
2011-07-30 21:25 recursive locking: epoll Nelson Elhage
2011-07-30 22:30 ` [PATCH] epoll: Fix spurious lockdep warnings Nelson Elhage
2011-07-31 15:06   ` Paul Bolle
2011-07-31 15:16     ` Nelson Elhage
2011-07-31 21:39       ` Paul Bolle
2011-07-31 21:36   ` Paul Bolle
2011-07-31 21:48   ` Paul Bolle
2011-08-09 15:11   ` Josh Boyer
2011-08-09 17:36     ` Nelson Elhage

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.