[3/3] kernel/workqueue: Suppress a false positive lockdep complaint
diff mbox series

Message ID 20181025150540.259281-4-bvanassche@acm.org
State New, archived
Headers show
Series
  • Suppress false positives triggered by workqueue lockdep annotations
Related show

Commit Message

Bart Van Assche Oct. 25, 2018, 3:05 p.m. UTC
It can happen that the direct I/O code creates and destroys an empty
workqueue from inside a work function. Avoid having this trigger the
false positive lockdep complaint shown below.

Comments

Johannes Berg Oct. 25, 2018, 3:34 p.m. UTC | #1
On Thu, 2018-10-25 at 15:05 +0000, Bart Van Assche wrote:

> @@ -2889,7 +2893,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
>  	 * workqueues the deadlock happens when the rescuer stalls, blocking
>  	 * forward progress.
>  	 */
> -	if (!from_cancel &&
> +	if (!from_cancel && (pwq->wq->flags & __WQ_HAS_BEEN_USED) &&
>  	    (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
>  		lock_acquire_exclusive(&pwq->wq->lockdep_map, 0, 0, NULL,
>  				       _THIS_IP_);

This also doesn't seem right to me. You shouldn't really care whether or
not the workqueue has been used at this point, lockdep also doesn't do
this for locks.

Any dependency you cause at some point can - at a future time - be taken
into account when checking dependency cycles. Removing one arbitrarily
just because you haven't actually executed anything *yet* just removes
knowledge from lockdep. In the general case, this isn't right. Just
because you haven't executed anything here doesn't mean that it's
*impossible* to have executed something, right?

(Also, still trying to understand patch 2)

johannes
Tejun Heo Oct. 25, 2018, 3:36 p.m. UTC | #2
Hello, Bart.

On Thu, Oct 25, 2018 at 08:05:40AM -0700, Bart Van Assche wrote:
> diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
> index 60d673e15632..375ec764f148 100644
> --- a/include/linux/workqueue.h
> +++ b/include/linux/workqueue.h
> @@ -344,6 +344,7 @@ enum {
>  	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
>  	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
>  	__WQ_ORDERED_EXPLICIT	= 1 << 19, /* internal: alloc_ordered_workqueue() */
> +	__WQ_HAS_BEEN_USED	= 1 << 20, /* internal: work has been queued */
>  
>  	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
>  	WQ_MAX_UNBOUND_PER_CPU	= 4,	  /* 4 * #cpus for unbound wq */
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index fc9129d5909e..0ef275fe526c 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1383,6 +1383,10 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
>  	if (unlikely(wq->flags & __WQ_DRAINING) &&
>  	    WARN_ON_ONCE(!is_chained_work(wq)))
>  		return;
> +
> +	if (!(wq->flags & __WQ_HAS_BEEN_USED))
> +		wq->flags |= __WQ_HAS_BEEN_USED;
> +
>  retry:
>  	if (req_cpu == WORK_CPU_UNBOUND)
>  		cpu = wq_select_unbound_cpu(raw_smp_processor_id());
> @@ -2889,7 +2893,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
>  	 * workqueues the deadlock happens when the rescuer stalls, blocking
>  	 * forward progress.
>  	 */
> -	if (!from_cancel &&
> +	if (!from_cancel && (pwq->wq->flags & __WQ_HAS_BEEN_USED) &&
>  	    (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
>  		lock_acquire_exclusive(&pwq->wq->lockdep_map, 0, 0, NULL,
>  				       _THIS_IP_);

We likely wanna skip the whole drain instead of eliding lockdep
annotation here.  Other than that, this patch looks fine to me but for
the others, I think it'd be a better idea to listen to Johannes.  We
wanna annotate the users for the exceptions rather than weakening the
workqueue lockdep checks, especially because workqueue related
deadlocks can be pretty difficult to trigger and root cause
afterwards.

Thanks.
Tejun Heo Oct. 25, 2018, 3:37 p.m. UTC | #3
On Thu, Oct 25, 2018 at 08:36:57AM -0700, Tejun Heo wrote:
> Hello, Bart.
> 
> On Thu, Oct 25, 2018 at 08:05:40AM -0700, Bart Van Assche wrote:
> > diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
> > index 60d673e15632..375ec764f148 100644
> > --- a/include/linux/workqueue.h
> > +++ b/include/linux/workqueue.h
> > @@ -344,6 +344,7 @@ enum {
> >  	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
> >  	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
> >  	__WQ_ORDERED_EXPLICIT	= 1 << 19, /* internal: alloc_ordered_workqueue() */
> > +	__WQ_HAS_BEEN_USED	= 1 << 20, /* internal: work has been queued */
> >  
> >  	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
> >  	WQ_MAX_UNBOUND_PER_CPU	= 4,	  /* 4 * #cpus for unbound wq */
> > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > index fc9129d5909e..0ef275fe526c 100644
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -1383,6 +1383,10 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
> >  	if (unlikely(wq->flags & __WQ_DRAINING) &&
> >  	    WARN_ON_ONCE(!is_chained_work(wq)))
> >  		return;
> > +
> > +	if (!(wq->flags & __WQ_HAS_BEEN_USED))
> > +		wq->flags |= __WQ_HAS_BEEN_USED;
> > +
> >  retry:
> >  	if (req_cpu == WORK_CPU_UNBOUND)
> >  		cpu = wq_select_unbound_cpu(raw_smp_processor_id());
> > @@ -2889,7 +2893,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
> >  	 * workqueues the deadlock happens when the rescuer stalls, blocking
> >  	 * forward progress.
> >  	 */
> > -	if (!from_cancel &&
> > +	if (!from_cancel && (pwq->wq->flags & __WQ_HAS_BEEN_USED) &&
> >  	    (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
> >  		lock_acquire_exclusive(&pwq->wq->lockdep_map, 0, 0, NULL,
> >  				       _THIS_IP_);
> 
> We likely wanna skip the whole drain instead of eliding lockdep
> annotation here.  Other than that, this patch looks fine to me but for
> the others, I think it'd be a better idea to listen to Johannes.  We
> wanna annotate the users for the exceptions rather than weakening the
> workqueue lockdep checks, especially because workqueue related
> deadlocks can be pretty difficult to trigger and root cause
> afterwards.

Ooh, also, please only do the HAS_BEEN_USED marking if LOCKDEP is
enabled.

Thanks.
Theodore Y. Ts'o Oct. 25, 2018, 3:40 p.m. UTC | #4
On Thu, Oct 25, 2018 at 08:05:40AM -0700, Bart Van Assche wrote:
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index fc9129d5909e..0ef275fe526c 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1383,6 +1383,10 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
>  	if (unlikely(wq->flags & __WQ_DRAINING) &&
>  	    WARN_ON_ONCE(!is_chained_work(wq)))
>  		return;
> +
> +	if (!(wq->flags & __WQ_HAS_BEEN_USED))
> +		wq->flags |= __WQ_HAS_BEEN_USED;
> +
>  retry:
>  	if (req_cpu == WORK_CPU_UNBOUND)
>  		cpu = wq_select_unbound_cpu(raw_smp_processor_id());

I was looking to fix this problem as well, and I thought about doing
this, but I thought wq->mutex had to be taken in order to modify
wq->flags --- and that would destroy the scalability of the
add-to-work-to-queue functions.

We could switch all of wq->flags to use the {test,set,clear}_bits()
family of operations, but that seemed like a very large change.

Tejun seemed to be ok with creating a destroy_workqueue_no_drain()
function and using that instead of destroy_workqueue() for the losers
of the cmpxchg lockless initialization code in sb_init_dio_done_wq()
in fs/direct_io.c.

						- Ted
Bart Van Assche Oct. 25, 2018, 3:55 p.m. UTC | #5
On Thu, 2018-10-25 at 17:34 +0200, Johannes Berg wrote:
> On Thu, 2018-10-25 at 15:05 +0000, Bart Van Assche wrote:
> 
> > @@ -2889,7 +2893,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
> >  	 * workqueues the deadlock happens when the rescuer stalls, blocking
> >  	 * forward progress.
> >  	 */
> > -	if (!from_cancel &&
> > +	if (!from_cancel && (pwq->wq->flags & __WQ_HAS_BEEN_USED) &&
> >  	    (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
> >  		lock_acquire_exclusive(&pwq->wq->lockdep_map, 0, 0, NULL,
> >  				       _THIS_IP_);
> 
> This also doesn't seem right to me. You shouldn't really care whether or
> not the workqueue has been used at this point, lockdep also doesn't do
> this for locks.
> 
> Any dependency you cause at some point can - at a future time - be taken
> into account when checking dependency cycles. Removing one arbitrarily
> just because you haven't actually executed anything *yet* just removes
> knowledge from lockdep. In the general case, this isn't right. Just
> because you haven't executed anything here doesn't mean that it's
> *impossible* to have executed something, right?

Please have a look at the call trace in the description of this patch and also
at the direct I/O code. The lockdep complaint in the description of this patch
really is a false positive. What I think needs further discussion is on how to
address this false positive - track whether or not a work queue has been used
or follow Tejun's proposal that I became aware of after I posted this patch,
namely introduce a new function for destroying a work queue that skips draining,
e.g. destroy_workqueue_skip_drain() (https://lkml.org/lkml/2018/10/24/2).

Thanks,

Bart.
Johannes Berg Oct. 25, 2018, 5:02 p.m. UTC | #6
On Thu, 2018-10-25 at 15:05 +0000, Bart Van Assche wrote:
> It can happen that the direct I/O queue creates and destroys an empty
> workqueue from inside a work function.

So, thinking about this more, can you guarantee (somehow) that the
workqueue is empty at this point? Perhaps then we should encode that
guarantee into the API, and actually make it WARN_ON() if something is
scheduled on it at that point? And then skip lockdep in this case.

You've actually done a superset of this here - you've basically said
this workqueue was never used, not just that it's empty right now.
However, if you could guarantee that it was never used at this point, I
guess I fail to see what you need it for anyway (or alternatively, why
you wouldn't allocate it later after this point.)

It does seem strange to me to make lockdep contingent on "has this
workqueue been used" because there may be things queued on it that only
happen in certain cases (hardware, phases of the moon ;-) ) and lockdep
sort of says "once you execute all the paths that _could_ lead to a
deadlock, I'll warn you about it, even if it never happened". It seems
that we'd want to extend that to workqueues.

Also, I think in a way what this does is try to differentiate the
classes of workqueues: used vs. not used per instance, but I'm not
really sure that's what you want - after all, could it really never have
been used? Surely it has a purpose?

But anyway, I'll need to take a better look at the lockdep splat later
(i.e. when the kids are sleeping.) Do you know how to reproduce this?

johannes
Bart Van Assche Oct. 25, 2018, 5:11 p.m. UTC | #7
On Thu, 2018-10-25 at 19:02 +0200, Johannes Berg wrote:
> On Thu, 2018-10-25 at 15:05 +0000, Bart Van Assche wrote:
> > It can happen that the direct I/O queue creates and destroys an empty
> > workqueue from inside a work function.
> 
> So, thinking about this more, can you guarantee (somehow) that the
> workqueue is empty at this point?

In general, no. But for the direct I/O case this can be guaranteed. Please
have a look at the code in sb_init_dio_done_wq() if you would not yet have
done this.

> Do you know how to reproduce this?

The lockdep complaint in the patch description is easy to reproduce. The
way I reproduce it is as follows:

git clone https://github.com/osandov/blktests
(cd blktests && ./check -q nvmeof-mp)

Bart.
Johannes Berg Oct. 25, 2018, 7:51 p.m. UTC | #8
On Thu, 2018-10-25 at 10:11 -0700, Bart Van Assche wrote:
> On Thu, 2018-10-25 at 19:02 +0200, Johannes Berg wrote:
> > On Thu, 2018-10-25 at 15:05 +0000, Bart Van Assche wrote:
> > > It can happen that the direct I/O queue creates and destroys an empty
> > > workqueue from inside a work function.
> > 
> > So, thinking about this more, can you guarantee (somehow) that the
> > workqueue is empty at this point?
> 
> In general, no. But for the direct I/O case this can be guaranteed. Please
> have a look at the code in sb_init_dio_done_wq() if you would not yet have
> done this.

Indeed, obviously.

> > Do you know how to reproduce this?
> 
> The lockdep complaint in the patch description is easy to reproduce. The
> way I reproduce it is as follows:
> 
> git clone https://github.com/osandov/blktests
> (cd blktests && ./check -q nvmeof-mp)

I'm a bit scared by this, looks like that needs a lot of prerequisites?
I'll try it in a VM when it's done compiling with nvme.


Looking at the splat in more detail, I think we have the following:

__generic_file_fsync() takes i_mutex_key#14

__generic_file_fsync() this is called from dio_aio_complete_work()

dio_aio_complete_work() generally runs on the dio/%s workqueue



Lockdep also sees:

ext4_file_write_iter() takes i_mutex_key#14

depending on circumstances, it can then call do_blockdev_direct_IO(),
which needs to ensure the WQ exists for this SB, so it calls
sb_init_dio_done_wq() to allocate the dio/%s workqueue.



Since lockdep neither knows that the instance of the workqueue that was
executing dio_aio_complete_work() must be different from the instance
that's freed, it complains because this creates a circle of
dependencies.

Still, basically what I tried to say before - rather than track whether
a workqueue was ever used, which is error-prone since in other cases in
the kernel the usage might depend on whatever conditions, I think we
should either teach lockdep that this is guaranteed to be a different
workqueue, or perhaps we should just have a "free empty workqueue"
function. I tend to prefer the former as it's more general, so I'd
propose this (combined) patch:

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 093fb54cd316..9ef33d6cba56 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -629,9 +629,16 @@ int sb_init_dio_done_wq(struct super_block *sb)
 	 * This has to be atomic as more DIOs can race to create the workqueue
 	 */
 	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
-	/* Someone created workqueue before us? Free ours... */
+	/*
+	 * Someone created workqueue before us? Free ours...
+	 * Note the _nested(), that pushes down to the (in this case actually
+	 * pointless) flush_workqueue() happening inside, since this function
+	 * might be called in contexts that hold the same locks that an fs may
+	 * take while being called from dio_aio_complete_work() from another
+	 * instance of the workqueue we allocate here.
+	 */
 	if (old)
-		destroy_workqueue(wq);
+		destroy_workqueue_nested(wq, SINGLE_DEPTH_NESTING);
 	return 0;
 }
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 60d673e15632..0b36a7df61d4 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -453,7 +453,12 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
 #define create_singlethread_workqueue(name)				\
 	alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
 
-extern void destroy_workqueue(struct workqueue_struct *wq);
+extern void destroy_workqueue_nested(struct workqueue_struct *wq, int subclass);
+
+static inline void destroy_workqueue(struct workqueue_struct *wq)
+{
+	destroy_workqueue_nested(wq, 0);
+}
 
 struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask);
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
@@ -469,8 +474,18 @@ extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			struct delayed_work *dwork, unsigned long delay);
 extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork);
 
-extern void flush_workqueue(struct workqueue_struct *wq);
-extern void drain_workqueue(struct workqueue_struct *wq);
+extern void flush_workqueue_nested(struct workqueue_struct *wq, int subclass);
+extern void drain_workqueue_nested(struct workqueue_struct *wq, int subclass);
+
+static inline void flush_workqueue(struct workqueue_struct *wq)
+{
+	flush_workqueue_nested(wq, 0);
+}
+
+static inline void drain_workqueue(struct workqueue_struct *wq)
+{
+	drain_workqueue_nested(wq, 0);
+}
 
 extern int schedule_on_each_cpu(work_func_t func);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0280deac392e..6b00e062af96 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2634,13 +2634,14 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
 }
 
 /**
- * flush_workqueue - ensure that any scheduled work has run to completion.
+ * flush_workqueue_nested - ensure that any scheduled work has run to completion.
  * @wq: workqueue to flush
+ * @subclass: subclass for lockdep
  *
  * This function sleeps until all work items which were queued on entry
  * have finished execution, but it is not livelocked by new incoming ones.
  */
-void flush_workqueue(struct workqueue_struct *wq)
+void flush_workqueue_nested(struct workqueue_struct *wq, int subclass)
 {
 	struct wq_flusher this_flusher = {
 		.list = LIST_HEAD_INIT(this_flusher.list),
@@ -2652,7 +2653,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 	if (WARN_ON(!wq_online))
 		return;
 
-	lock_map_acquire(&wq->lockdep_map);
+	lock_acquire_exclusive(&wq->lockdep_map, subclass, 0, NULL, _THIS_IP_);
 	lock_map_release(&wq->lockdep_map);
 
 	mutex_lock(&wq->mutex);
@@ -2789,11 +2790,12 @@ void flush_workqueue(struct workqueue_struct *wq)
 out_unlock:
 	mutex_unlock(&wq->mutex);
 }
-EXPORT_SYMBOL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue_nested);
 
 /**
- * drain_workqueue - drain a workqueue
+ * drain_workqueue_nested - drain a workqueue
  * @wq: workqueue to drain
+ * @subclass: lockdep subclass
  *
  * Wait until the workqueue becomes empty.  While draining is in progress,
  * only chain queueing is allowed.  IOW, only currently pending or running
@@ -2802,7 +2804,7 @@ EXPORT_SYMBOL(flush_workqueue);
  * by the depth of chaining and should be relatively short.  Whine if it
  * takes too long.
  */
-void drain_workqueue(struct workqueue_struct *wq)
+void drain_workqueue_nested(struct workqueue_struct *wq, int subclass)
 {
 	unsigned int flush_cnt = 0;
 	struct pool_workqueue *pwq;
@@ -2817,7 +2819,7 @@ void drain_workqueue(struct workqueue_struct *wq)
 		wq->flags |= __WQ_DRAINING;
 	mutex_unlock(&wq->mutex);
 reflush:
-	flush_workqueue(wq);
+	flush_workqueue_nested(wq, subclass);
 
 	mutex_lock(&wq->mutex);
 
@@ -2844,7 +2846,7 @@ void drain_workqueue(struct workqueue_struct *wq)
 		wq->flags &= ~__WQ_DRAINING;
 	mutex_unlock(&wq->mutex);
 }
-EXPORT_SYMBOL_GPL(drain_workqueue);
+EXPORT_SYMBOL_GPL(drain_workqueue_nested);
 
 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 			     bool from_cancel)
@@ -4141,18 +4143,19 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
 
 /**
- * destroy_workqueue - safely terminate a workqueue
+ * destroy_workqueue_nested - safely terminate a workqueue
  * @wq: target workqueue
+ * @subclass: lockdep subclass
  *
  * Safely destroy a workqueue. All work currently pending will be done first.
  */
-void destroy_workqueue(struct workqueue_struct *wq)
+void destroy_workqueue_nested(struct workqueue_struct *wq, int subclass)
 {
 	struct pool_workqueue *pwq;
 	int node;
 
 	/* drain it before proceeding with destruction */
-	drain_workqueue(wq);
+	drain_workqueue_nested(wq, subclass);
 
 	/* sanity checks */
 	mutex_lock(&wq->mutex);
@@ -4217,7 +4220,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 		put_pwq_unlocked(pwq);
 	}
 }
-EXPORT_SYMBOL_GPL(destroy_workqueue);
+EXPORT_SYMBOL_GPL(destroy_workqueue_nested);
 
 /**
  * workqueue_set_max_active - adjust max_active of a workqueue


We could avoid the useless subclass argument in the non-lockdep case
with some macro trickery, but for now I haven't done that.

johannes
Johannes Berg Oct. 25, 2018, 7:59 p.m. UTC | #9
On Thu, 2018-10-25 at 08:55 -0700, Bart Van Assche wrote:

> Please have a look at the call trace in the description of this patch and also
> at the direct I/O code. The lockdep complaint in the description of this patch
> really is a false positive. 

I agree. I just don't agree that it's a false positive because of the
lockdep tracking of workqueues (which you're essentially saying since
you want to change it), I think it's a false positive because the user
isn't telling lockdep properly what it's doing.

> What I think needs further discussion is on how to
> address this false positive - track whether or not a work queue has been used
> or follow Tejun's proposal that I became aware of after I posted this patch,
> namely introduce a new function for destroying a work queue that skips draining,
> e.g. destroy_workqueue_skip_drain() (https://lkml.org/lkml/2018/10/24/2).

Agree. Ted's suggestion is basically what I meant in my other email when
I said:

> So, thinking about this more, can you guarantee (somehow) that the
> workqueue is empty at this point?

(I hadn't looked at the code then - obviously that's guaranteed)

> Perhaps then we should encode that
> guarantee into the API, and actually make it WARN_ON() if something is
> scheduled on it at that point? And then skip lockdep in this case.

So that API I'm talking about is what Ted suggests with
destroy_never_used_workqueue() or Tejun's suggestion of
destroy_workqueue_skip_drain(). Either is fine with me, though I'd
probably think it would make sense to actually check that it really was
never used, or at least is actually empty right now and thus doesn't
need the drain.

However, I think more generally useful would be the
destroy_workqueue_nested() API I've proposed in my other reply just
moments ago, since that could be used in other code with non-empty
workqueues, as long as you can ensure that there's a guaranteed layering
of them. Then you could even have two objects A and B of the same class,
but separate instances, and flush B's workqueue from a work executing on
A's workqueue, which may come in handy in other places to address false
positives similar to this one.

johannes
Johannes Berg Oct. 25, 2018, 8:13 p.m. UTC | #10
On Thu, 2018-10-25 at 15:36 +0000, Tejun Heo wrote:
> We
> wanna annotate the users for the exceptions rather than weakening the
> workqueue lockdep checks, especially because workqueue related
> deadlocks can be pretty difficult to trigger and root cause
> afterwards.

I think you just said more in that one sentence than I wrote in all the
emails on this ;-)

johannes
Theodore Y. Ts'o Oct. 25, 2018, 8:21 p.m. UTC | #11
On Thu, Oct 25, 2018 at 09:59:38PM +0200, Johannes Berg wrote:
> > So, thinking about this more, can you guarantee (somehow) that the
> > workqueue is empty at this point?
> 
> (I hadn't looked at the code then - obviously that's guaranteed)

We can guarantee it from someone who is looking at the code path.  In
dio_set_defer_completion:

	if (!sb->s_dio_done_wq)
		return sb_init_dio_done_wq(sb);

And then sb_init_dio_done_wq:

int sb_init_dio_done_wq(struct super_block *sb)
{
	struct workqueue_struct *old;
	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
						      WQ_MEM_RECLAIM, 0,
						      sb->s_id);
	if (!wq)
		return -ENOMEM;
	/*
	 * This has to be atomic as more DIOs can race to create the workqueue
	 */
	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
	/* Someone created workqueue before us? Free ours... */
	if (old)
		destroy_workqueue(wq);
	return 0;
}

The race found in the syzbot reproducer has multiple threads all
running DIO writes at the same time.  So we have multiple threads
calling sb_init_dio_done_wq, but all but one will lose the race, and
then call destry_workqueue on the freshly created (but never used)
workqueue.

We could replace the destroy_workqueue(wq) with a
"I_solemnly_swear_this_workqueue_has_never_been_used_please_destroy(wq)".

Or, as Tejun suggested, "destroy_workqueue_skip_drain(wq)", but there is
no way for the workqueue code to know whether the caller was using the
interface correctly.  So this basically becomes a philosophical
question about whether or not we trust the caller to be correct or
not.

I don't see an obvious way that we can test to make sure the workqueue
is never used without actually taking a performance hit.  Am I correct
that we would need to take the wq->mutex before we can mess with the
wq->flags field?

					- Ted
Johannes Berg Oct. 25, 2018, 8:26 p.m. UTC | #12
On Thu, 2018-10-25 at 16:21 -0400, Theodore Y. Ts'o wrote:

> We can guarantee it from someone who is looking at the code path.  In
> dio_set_defer_completion:

[snip]

Right, it's indeed pretty obvious. I shouldn't have tried to reply
before the kids went to bed, that made me cut some corners ;-)

> The race found in the syzbot reproducer has multiple threads all
> running DIO writes at the same time.  So we have multiple threads
> calling sb_init_dio_done_wq, but all but one will lose the race, and
> then call destroy_workqueue on the freshly created (but never used)
> workqueue.

Right.

> We could replace the destroy_workqueue(wq) with a
> "I_solemnly_swear_this_workqueue_has_never_been_used_please_destroy(wq)".

:-)

> Or, as Tejun suggested, "destroy_workqueue_skip_drain(wq)", but there is
> no way for the workqueue code to know whether the caller was using the
> interface correctly.  So this basically becomes a philosophical
> question about whether or not we trust the caller to be correct or
> not.

Right. Same with the lockdep annotation I suggested over in my other
email, of course. I think that the set of APIs I wrote there
({drain,flush,destroy}_workqueue_nested()) might be more generally
useful in other cases, not just this one, and I suspect that this code
would basically be the only user of destroy_workqueue_skip_drain().

> I don't see an obvious way that we can test to make sure the workqueue
> is never used without actually taking a performance.  Am I correct
> that we would need to take the wq->mutex before we can mess with the
> wq->flags field?

I don't really know, sorry.

johannes
Bart Van Assche Oct. 25, 2018, 8:39 p.m. UTC | #13
On Thu, 2018-10-25 at 21:51 +0200, Johannes Berg wrote:
> [ ... ]
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 093fb54cd316..9ef33d6cba56 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -629,9 +629,16 @@ int sb_init_dio_done_wq(struct super_block *sb)
>  	 * This has to be atomic as more DIOs can race to create the workqueue
>  	 */
>  	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
> -	/* Someone created workqueue before us? Free ours... */
> +	/*
> +	 * Someone created workqueue before us? Free ours...
> +	 * Note the _nested(), that pushes down to the (in this case actually
> +	 * pointless) flush_workqueue() happening inside, since this function
> +	 * might be called in contexts that hold the same locks that an fs may
> +	 * take while being called from dio_aio_complete_work() from another
> +	 * instance of the workqueue we allocate here.
> +	 */
>  	if (old)
> -		destroy_workqueue(wq);
> +		destroy_workqueue_nested(wq, SINGLE_DEPTH_NESTING);
>  	return 0;
>  }
> [ ... ]
>  /**
> - * flush_workqueue - ensure that any scheduled work has run to completion.
> + * flush_workqueue_nested - ensure that any scheduled work has run to completion.
>   * @wq: workqueue to flush
> + * @subclass: subclass for lockdep
>   *
>   * This function sleeps until all work items which were queued on entry
>   * have finished execution, but it is not livelocked by new incoming ones.
>   */
> -void flush_workqueue(struct workqueue_struct *wq)
> +void flush_workqueue_nested(struct workqueue_struct *wq, int subclass)
>  {
>  	struct wq_flusher this_flusher = {
>  		.list = LIST_HEAD_INIT(this_flusher.list),
> @@ -2652,7 +2653,7 @@ void flush_workqueue(struct workqueue_struct *wq)
>  	if (WARN_ON(!wq_online))
>  		return;
>  
> -	lock_map_acquire(&wq->lockdep_map);
> +	lock_acquire_exclusive(&wq->lockdep_map, subclass, 0, NULL, _THIS_IP_);
>  	lock_map_release(&wq->lockdep_map);
>  
>  	mutex_lock(&wq->mutex);
> [ ... ]

I don't like this approach because it doesn't match how other kernel code uses
lockdep annotations. All other kernel code I know of only annotates lock depmaps
as nested if the same kernel thread calls lock_acquire() twice for the same depmap
without intervening lock_release(). My understanding is that with your patch
applied flush_workqueue_nested(wq, 1) calls lock_acquire() only once and with the
subclass argument set to one. I think this will confuse other people who will read
the workqueue implementation and who have not followed this conversation.

I like Tejuns proposal much better than the above proposal.

Bart.
Johannes Berg Oct. 25, 2018, 8:47 p.m. UTC | #14
On Thu, 2018-10-25 at 13:39 -0700, Bart Van Assche wrote:
> 
> > +void flush_workqueue_nested(struct workqueue_struct *wq, int subclass)
> >  {
> >  	struct wq_flusher this_flusher = {
> >  		.list = LIST_HEAD_INIT(this_flusher.list),
> > @@ -2652,7 +2653,7 @@ void flush_workqueue(struct workqueue_struct *wq)
> >  	if (WARN_ON(!wq_online))
> >  		return;
> >  
> > -	lock_map_acquire(&wq->lockdep_map);
> > +	lock_acquire_exclusive(&wq->lockdep_map, subclass, 0, NULL, _THIS_IP_);
> >  	lock_map_release(&wq->lockdep_map);
> >  
> >  	mutex_lock(&wq->mutex);
> > [ ... ]
> 
> I don't like this approach because it doesn't match how other kernel code uses
> lockdep annotations. All other kernel code I know of only annotates lock depmaps
> as nested if the same kernel thread calls lock_acquire() twice for the same depmap
> without intervening lock_release(). My understanding is that with your patch
> applied flush_workqueue_nested(wq, 1) calls lock_acquire() only once and with the
> subclass argument set to one. I think this will confuse other people who will read
> the workqueue implementation and who have not followed this conversation.

Hmm, yeah, that's a reasonable complaint. My mental model is more along
the lines of
	"this is a different nested (layered) object instance"
rather than
	"I'm nesting the locks",
so this code change fits well into my model. However, code like
infiniband/core/cma.c does in fact use it with nested object instances
*and* actually (and directly in the code) nested locks.

I think the "actual nesting" could possibly come up with workqueues,
like I described earlier: you have two objects with a workqueue each,
and the objects are somehow layered (like cma.c's listen_id and
conn_id), and each contains a workqueue, and some work running on the
listen_id's workqueue wants to destroy all the conn_id's workqueue(s).
In that case, you do end up really having the nesting.

So in that sense, I still think this API may still be required, and
(perhaps with an appropriate comment) could be used in the dio case even
if there's no actual nesting, without introducing the - IMHO single-use
and more dangerous - destroy_workqueue_no_drain().

johannes

Patch
diff mbox series

======================================================
WARNING: possible circular locking dependency detected
4.19.0-dbg+ #1 Not tainted
------------------------------------------------------
fio/4129 is trying to acquire lock:
00000000a01cfe1a ((wq_completion)"dio/%s"sb->s_id){+.+.}, at: flush_workqueue+0xd0/0x970

but task is already holding lock:
00000000a0acecf9 (&sb->s_type->i_mutex_key#14){+.+.}, at: ext4_file_write_iter+0x154/0x710

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&sb->s_type->i_mutex_key#14){+.+.}:
       down_write+0x3d/0x80
       __generic_file_fsync+0x77/0xf0
       ext4_sync_file+0x3c9/0x780
       vfs_fsync_range+0x66/0x100
       dio_complete+0x2f5/0x360
       dio_aio_complete_work+0x1c/0x20
       process_one_work+0x481/0x9f0
       worker_thread+0x63/0x5a0
       kthread+0x1cf/0x1f0
       ret_from_fork+0x24/0x30

-> #1 ((work_completion)(&dio->complete_work)){+.+.}:
       process_one_work+0x447/0x9f0
       worker_thread+0x63/0x5a0
       kthread+0x1cf/0x1f0
       ret_from_fork+0x24/0x30

-> #0 ((wq_completion)"dio/%s"sb->s_id){+.+.}:
       lock_acquire+0xc5/0x200
       flush_workqueue+0xf3/0x970
       drain_workqueue+0xec/0x220
       destroy_workqueue+0x23/0x350
       sb_init_dio_done_wq+0x6a/0x80
       do_blockdev_direct_IO+0x1f33/0x4be0
       __blockdev_direct_IO+0x79/0x86
       ext4_direct_IO+0x5df/0xbb0
       generic_file_direct_write+0x119/0x220
       __generic_file_write_iter+0x131/0x2d0
       ext4_file_write_iter+0x3fa/0x710
       aio_write+0x235/0x330
       io_submit_one+0x510/0xeb0
       __x64_sys_io_submit+0x122/0x340
       do_syscall_64+0x71/0x220
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

other info that might help us debug this:

Chain exists of:
  (wq_completion)"dio/%s"sb->s_id --> (work_completion)(&dio->complete_work) --> &sb->s_type->i_mutex_key#14

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&sb->s_type->i_mutex_key#14);
                               lock((work_completion)(&dio->complete_work));
                               lock(&sb->s_type->i_mutex_key#14);
  lock((wq_completion)"dio/%s"sb->s_id);

 *** DEADLOCK ***

1 lock held by fio/4129:
 #0: 00000000a0acecf9 (&sb->s_type->i_mutex_key#14){+.+.}, at: ext4_file_write_iter+0x154/0x710

stack backtrace:
CPU: 3 PID: 4129 Comm: fio Not tainted 4.19.0-dbg+ #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Call Trace:
 dump_stack+0x86/0xc5
 print_circular_bug.isra.32+0x20a/0x218
 __lock_acquire+0x1c68/0x1cf0
 lock_acquire+0xc5/0x200
 flush_workqueue+0xf3/0x970
 drain_workqueue+0xec/0x220
 destroy_workqueue+0x23/0x350
 sb_init_dio_done_wq+0x6a/0x80
 do_blockdev_direct_IO+0x1f33/0x4be0
 __blockdev_direct_IO+0x79/0x86
 ext4_direct_IO+0x5df/0xbb0
 generic_file_direct_write+0x119/0x220
 __generic_file_write_iter+0x131/0x2d0
 ext4_file_write_iter+0x3fa/0x710
 aio_write+0x235/0x330
 io_submit_one+0x510/0xeb0
 __x64_sys_io_submit+0x122/0x340
 do_syscall_64+0x71/0x220
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 include/linux/workqueue.h | 1 +
 kernel/workqueue.c        | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 60d673e15632..375ec764f148 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -344,6 +344,7 @@  enum {
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
 	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
 	__WQ_ORDERED_EXPLICIT	= 1 << 19, /* internal: alloc_ordered_workqueue() */
+	__WQ_HAS_BEEN_USED	= 1 << 20, /* internal: work has been queued */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
 	WQ_MAX_UNBOUND_PER_CPU	= 4,	  /* 4 * #cpus for unbound wq */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fc9129d5909e..0ef275fe526c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1383,6 +1383,10 @@  static void __queue_work(int cpu, struct workqueue_struct *wq,
 	if (unlikely(wq->flags & __WQ_DRAINING) &&
 	    WARN_ON_ONCE(!is_chained_work(wq)))
 		return;
+
+	if (!(wq->flags & __WQ_HAS_BEEN_USED))
+		wq->flags |= __WQ_HAS_BEEN_USED;
+
 retry:
 	if (req_cpu == WORK_CPU_UNBOUND)
 		cpu = wq_select_unbound_cpu(raw_smp_processor_id());
@@ -2889,7 +2893,7 @@  static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 	 * workqueues the deadlock happens when the rescuer stalls, blocking
 	 * forward progress.
 	 */
-	if (!from_cancel &&
+	if (!from_cancel && (pwq->wq->flags & __WQ_HAS_BEEN_USED) &&
 	    (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
 		lock_acquire_exclusive(&pwq->wq->lockdep_map, 0, 0, NULL,
 				       _THIS_IP_);