All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat
@ 2014-06-08  8:35 Mike Galbraith
  2014-06-09  2:08 ` Lai Jiangshan
  0 siblings, 1 reply; 13+ messages in thread
From: Mike Galbraith @ 2014-06-08  8:35 UTC (permalink / raw)
  To: RT
  Cc: LKML, Sebastian Andrzej Siewior, Steven Rostedt, Thomas Gleixner,
	Paul E. McKenney


[  172.743098] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:768
[  172.743116] in_atomic(): 1, irqs_disabled(): 0, pid: 26, name: rcuos/2
[  172.743117] 2 locks held by rcuos/2/26:
[  172.743128]  #0:  (rcu_callback){.+.+..}, at: [<ffffffff810b1a12>] rcu_nocb_kthread+0x1e2/0x380
[  172.743135]  #1:  (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812acd26>] percpu_ref_kill_rcu+0xa6/0x1c0
[  172.743138] Preemption disabled at:[<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
[  172.743138]
[  172.743142] CPU: 0 PID: 26 Comm: rcuos/2 Not tainted 3.14.4-rt5 #31
[  172.743143] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
[  172.743148]  ffff8802231aa190 ffff8802231a5d08 ffffffff81582e9e 0000000000000000
[  172.743151]  ffff8802231a5d28 ffffffff81077aeb ffff880209f68140 ffff880209f681c0
[  172.743154]  ffff8802231a5d48 ffffffff81589304 ffff880209f68000 ffff880209f68000
[  172.743155] Call Trace:
[  172.743160]  [<ffffffff81582e9e>] dump_stack+0x4e/0x9c
[  172.743163]  [<ffffffff81077aeb>] __might_sleep+0xfb/0x170
[  172.743167]  [<ffffffff81589304>] rt_spin_lock+0x24/0x70
[  172.743171]  [<ffffffff811c5790>] free_ioctx_users+0x30/0x130
[  172.743174]  [<ffffffff812ace34>] percpu_ref_kill_rcu+0x1b4/0x1c0
[  172.743177]  [<ffffffff812acd26>] ? percpu_ref_kill_rcu+0xa6/0x1c0
[  172.743180]  [<ffffffff812acc80>] ? percpu_ref_kill_and_confirm+0x70/0x70
[  172.743183]  [<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
[  172.743185]  [<ffffffff810b1a12>] ? rcu_nocb_kthread+0x1e2/0x380
[  172.743189]  [<ffffffff810b1830>] ? rcu_report_exp_rnp.isra.52+0xc0/0xc0
[  172.743192]  [<ffffffff8106e046>] kthread+0xd6/0xf0
[  172.743194]  [<ffffffff8158900c>] ? _raw_spin_unlock_irq+0x2c/0x70
[  172.743197]  [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
[  172.743200]  [<ffffffff81591eec>] ret_from_fork+0x7c/0xb0
[  172.743203]  [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70

crash> gdb list *percpu_ref_kill_rcu+0x1b4
0xffffffff812ace34 is in percpu_ref_kill_rcu (include/linux/percpu-refcount.h:169).
164             pcpu_count = ACCESS_ONCE(ref->pcpu_count);
165
166             if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
167                     __this_cpu_dec(*pcpu_count);
168             else if (unlikely(atomic_dec_and_test(&ref->count)))
169                     ref->release(ref);
170
171             rcu_read_unlock_sched();
172     }
173

Ok, so ->release() can't do anything where it may meet a sleeping lock,
but in an -rt kernel, it does that.

Convert struct kioctx ctx_lock/completion_lock to raw_spinlock_t, and
defer final free to a time when we're not under rcu_read_lock_sched().

runltp -f ltp-aio-stress.part1 runs kernel/ltp gripe free.

INFO: ltp-pan reported all tests PASS
LTP Version: 20140422

       ###############################################################

            Done executing testcases.
            LTP Version:  20140422
       ###############################################################


Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
---
 fs/aio.c |   61 ++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 17 deletions(-)

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -125,7 +125,7 @@ struct kioctx {
 	} ____cacheline_aligned_in_smp;
 
 	struct {
-		spinlock_t	ctx_lock;
+		raw_spinlock_t	ctx_lock;
 		struct list_head active_reqs;	/* used for cancellation */
 	} ____cacheline_aligned_in_smp;
 
@@ -136,13 +136,16 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
-		spinlock_t	completion_lock;
+		raw_spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
 	struct page		*internal_pages[AIO_RING_PAGES];
 	struct file		*aio_ring_file;
 
 	unsigned		id;
+#ifdef CONFIG_PREEMPT_RT_BASE
+	struct rcu_head		rcu;
+#endif
 };
 
 /*------ sysctl variables----*/
@@ -334,11 +337,11 @@ static int aio_migratepage(struct addres
 	 * while the old page is copied to the new.  This prevents new
 	 * events from being lost.
 	 */
-	spin_lock_irqsave(&ctx->completion_lock, flags);
+	raw_spin_lock_irqsave(&ctx->completion_lock, flags);
 	migrate_page_copy(new, old);
 	BUG_ON(ctx->ring_pages[idx] != old);
 	ctx->ring_pages[idx] = new;
-	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	raw_spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	/* The old page is no longer accessible. */
 	put_page(old);
@@ -461,14 +464,14 @@ void kiocb_set_cancel_fn(struct kiocb *r
 	struct kioctx *ctx = req->ki_ctx;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ctx->ctx_lock, flags);
+	raw_spin_lock_irqsave(&ctx->ctx_lock, flags);
 
 	if (!req->ki_list.next)
 		list_add(&req->ki_list, &ctx->active_reqs);
 
 	req->ki_cancel = cancel;
 
-	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+	raw_spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
@@ -493,6 +496,7 @@ static int kiocb_cancel(struct kioctx *c
 	return cancel(kiocb);
 }
 
+#ifndef CONFIG_PREEMPT_RT_BASE
 static void free_ioctx(struct work_struct *work)
 {
 	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
@@ -503,13 +507,36 @@ static void free_ioctx(struct work_struc
 	free_percpu(ctx->cpu);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
+#else
+static void free_ioctx_rcu(struct rcu_head *rcu)
+{
+	struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
+
+	pr_debug("freeing %p\n", ctx);
+
+	aio_free_ring(ctx);
+	free_percpu(ctx->cpu);
+	kmem_cache_free(kioctx_cachep, ctx);
+}
+#endif
 
 static void free_ioctx_reqs(struct percpu_ref *ref)
 {
 	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
 
+#ifndef CONFIG_PREEMPT_RT_BASE
 	INIT_WORK(&ctx->free_work, free_ioctx);
 	schedule_work(&ctx->free_work);
+#else
+	/*
+	 * We're in ->release() under rcu_read_lock_sched(), and can't do
+	 * anything that requires taking a sleeping lock, so ->release()
+	 * becomes a two stage rcu process for -rt.  We've now done the
+	 * rcu work that we can under locks made raw to get us this far.
+	 * Defer the freeing bit until we're again allowed to schedule().
+	 */
+	call_rcu(&ctx->rcu, free_ioctx_rcu);
+#endif
 }
 
 /*
@@ -522,7 +549,7 @@ static void free_ioctx_users(struct perc
 	struct kioctx *ctx = container_of(ref, struct kioctx, users);
 	struct kiocb *req;
 
-	spin_lock_irq(&ctx->ctx_lock);
+	raw_spin_lock_irq(&ctx->ctx_lock);
 
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
@@ -532,7 +559,7 @@ static void free_ioctx_users(struct perc
 		kiocb_cancel(ctx, req);
 	}
 
-	spin_unlock_irq(&ctx->ctx_lock);
+	raw_spin_unlock_irq(&ctx->ctx_lock);
 
 	percpu_ref_kill(&ctx->reqs);
 	percpu_ref_put(&ctx->reqs);
@@ -645,8 +672,8 @@ static struct kioctx *ioctx_alloc(unsign
 
 	ctx->max_reqs = nr_events;
 
-	spin_lock_init(&ctx->ctx_lock);
-	spin_lock_init(&ctx->completion_lock);
+	raw_spin_lock_init(&ctx->ctx_lock);
+	raw_spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
 	/* Protect against page migration throughout kiotx setup by keeping
 	 * the ring_lock mutex held until setup is complete. */
@@ -948,9 +975,9 @@ void aio_complete(struct kiocb *iocb, lo
 	if (iocb->ki_list.next) {
 		unsigned long flags;
 
-		spin_lock_irqsave(&ctx->ctx_lock, flags);
+		raw_spin_lock_irqsave(&ctx->ctx_lock, flags);
 		list_del(&iocb->ki_list);
-		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+		raw_spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 	}
 
 	/*
@@ -958,7 +985,7 @@ void aio_complete(struct kiocb *iocb, lo
 	 * ctx->completion_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
 	 */
-	spin_lock_irqsave(&ctx->completion_lock, flags);
+	raw_spin_lock_irqsave(&ctx->completion_lock, flags);
 
 	tail = ctx->tail;
 	pos = tail + AIO_EVENTS_OFFSET;
@@ -993,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, lo
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
-	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	raw_spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
 
@@ -1515,7 +1542,7 @@ static struct kiocb *lookup_kiocb(struct
 {
 	struct list_head *pos;
 
-	assert_spin_locked(&ctx->ctx_lock);
+	assert_raw_spin_locked(&ctx->ctx_lock);
 
 	if (key != KIOCB_KEY)
 		return NULL;
@@ -1555,7 +1582,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
 	if (unlikely(!ctx))
 		return -EINVAL;
 
-	spin_lock_irq(&ctx->ctx_lock);
+	raw_spin_lock_irq(&ctx->ctx_lock);
 
 	kiocb = lookup_kiocb(ctx, iocb, key);
 	if (kiocb)
@@ -1563,7 +1590,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
 	else
 		ret = -EINVAL;
 
-	spin_unlock_irq(&ctx->ctx_lock);
+	raw_spin_unlock_irq(&ctx->ctx_lock);
 
 	if (!ret) {
 		/*



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-08  8:35 [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat Mike Galbraith
@ 2014-06-09  2:08 ` Lai Jiangshan
  2014-06-09  3:17   ` Mike Galbraith
  2014-06-10  3:47   ` [RFC PATCH V2] " Mike Galbraith
  0 siblings, 2 replies; 13+ messages in thread
From: Lai Jiangshan @ 2014-06-09  2:08 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: RT, LKML, Sebastian Andrzej Siewior, Steven Rostedt,
	Thomas Gleixner, Paul E. McKenney

Hi, rt-people

I don't think it is the correct direction.
Softirq (including local_bh_disable()) in RT kernel should be preemptible.

Fixing these problems via converting spinlock_t to raw_spinlock_t will
result that all spinlock_t used in RCU-callbacks are converted which
means almost the spinlock_t in kernel are converted.

Sudden and superficial thought.
Thanks
Lai

On 06/08/2014 04:35 PM, Mike Galbraith wrote:
> 
> [  172.743098] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:768
> [  172.743116] in_atomic(): 1, irqs_disabled(): 0, pid: 26, name: rcuos/2
> [  172.743117] 2 locks held by rcuos/2/26:
> [  172.743128]  #0:  (rcu_callback){.+.+..}, at: [<ffffffff810b1a12>] rcu_nocb_kthread+0x1e2/0x380
> [  172.743135]  #1:  (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812acd26>] percpu_ref_kill_rcu+0xa6/0x1c0
> [  172.743138] Preemption disabled at:[<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
> [  172.743138]
> [  172.743142] CPU: 0 PID: 26 Comm: rcuos/2 Not tainted 3.14.4-rt5 #31
> [  172.743143] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
> [  172.743148]  ffff8802231aa190 ffff8802231a5d08 ffffffff81582e9e 0000000000000000
> [  172.743151]  ffff8802231a5d28 ffffffff81077aeb ffff880209f68140 ffff880209f681c0
> [  172.743154]  ffff8802231a5d48 ffffffff81589304 ffff880209f68000 ffff880209f68000
> [  172.743155] Call Trace:
> [  172.743160]  [<ffffffff81582e9e>] dump_stack+0x4e/0x9c
> [  172.743163]  [<ffffffff81077aeb>] __might_sleep+0xfb/0x170
> [  172.743167]  [<ffffffff81589304>] rt_spin_lock+0x24/0x70
> [  172.743171]  [<ffffffff811c5790>] free_ioctx_users+0x30/0x130
> [  172.743174]  [<ffffffff812ace34>] percpu_ref_kill_rcu+0x1b4/0x1c0
> [  172.743177]  [<ffffffff812acd26>] ? percpu_ref_kill_rcu+0xa6/0x1c0
> [  172.743180]  [<ffffffff812acc80>] ? percpu_ref_kill_and_confirm+0x70/0x70
> [  172.743183]  [<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
> [  172.743185]  [<ffffffff810b1a12>] ? rcu_nocb_kthread+0x1e2/0x380
> [  172.743189]  [<ffffffff810b1830>] ? rcu_report_exp_rnp.isra.52+0xc0/0xc0
> [  172.743192]  [<ffffffff8106e046>] kthread+0xd6/0xf0
> [  172.743194]  [<ffffffff8158900c>] ? _raw_spin_unlock_irq+0x2c/0x70
> [  172.743197]  [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
> [  172.743200]  [<ffffffff81591eec>] ret_from_fork+0x7c/0xb0
> [  172.743203]  [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
> 
> crash> gdb list *percpu_ref_kill_rcu+0x1b4
> 0xffffffff812ace34 is in percpu_ref_kill_rcu (include/linux/percpu-refcount.h:169).
> 164             pcpu_count = ACCESS_ONCE(ref->pcpu_count);
> 165
> 166             if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
> 167                     __this_cpu_dec(*pcpu_count);
> 168             else if (unlikely(atomic_dec_and_test(&ref->count)))
> 169                     ref->release(ref);
> 170
> 171             rcu_read_unlock_sched();
> 172     }
> 173
> 
> Ok, so ->release() can't do anything where it may meet a sleeping lock,
> but in an -rt kernel, it does that.
> 
> Convert struct kioctx ctx_lock/completion_lock to raw_spinlock_t, and
> defer final free to a time when we're not under rcu_read_lock_sched().
> 
> runltp -f ltp-aio-stress.part1 runs kernel/ltp gripe free.
> 
> INFO: ltp-pan reported all tests PASS
> LTP Version: 20140422
> 
>        ###############################################################
> 
>             Done executing testcases.
>             LTP Version:  20140422
>        ###############################################################
> 
> 
> Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
> ---
>  fs/aio.c |   61 ++++++++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 44 insertions(+), 17 deletions(-)
> 
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -125,7 +125,7 @@ struct kioctx {
>  	} ____cacheline_aligned_in_smp;
>  
>  	struct {
> -		spinlock_t	ctx_lock;
> +		raw_spinlock_t	ctx_lock;
>  		struct list_head active_reqs;	/* used for cancellation */
>  	} ____cacheline_aligned_in_smp;
>  
> @@ -136,13 +136,16 @@ struct kioctx {
>  
>  	struct {
>  		unsigned	tail;
> -		spinlock_t	completion_lock;
> +		raw_spinlock_t	completion_lock;
>  	} ____cacheline_aligned_in_smp;
>  
>  	struct page		*internal_pages[AIO_RING_PAGES];
>  	struct file		*aio_ring_file;
>  
>  	unsigned		id;
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	struct rcu_head		rcu;
> +#endif
>  };
>  
>  /*------ sysctl variables----*/
> @@ -334,11 +337,11 @@ static int aio_migratepage(struct addres
>  	 * while the old page is copied to the new.  This prevents new
>  	 * events from being lost.
>  	 */
> -	spin_lock_irqsave(&ctx->completion_lock, flags);
> +	raw_spin_lock_irqsave(&ctx->completion_lock, flags);
>  	migrate_page_copy(new, old);
>  	BUG_ON(ctx->ring_pages[idx] != old);
>  	ctx->ring_pages[idx] = new;
> -	spin_unlock_irqrestore(&ctx->completion_lock, flags);
> +	raw_spin_unlock_irqrestore(&ctx->completion_lock, flags);
>  
>  	/* The old page is no longer accessible. */
>  	put_page(old);
> @@ -461,14 +464,14 @@ void kiocb_set_cancel_fn(struct kiocb *r
>  	struct kioctx *ctx = req->ki_ctx;
>  	unsigned long flags;
>  
> -	spin_lock_irqsave(&ctx->ctx_lock, flags);
> +	raw_spin_lock_irqsave(&ctx->ctx_lock, flags);
>  
>  	if (!req->ki_list.next)
>  		list_add(&req->ki_list, &ctx->active_reqs);
>  
>  	req->ki_cancel = cancel;
>  
> -	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> +	raw_spin_unlock_irqrestore(&ctx->ctx_lock, flags);
>  }
>  EXPORT_SYMBOL(kiocb_set_cancel_fn);
>  
> @@ -493,6 +496,7 @@ static int kiocb_cancel(struct kioctx *c
>  	return cancel(kiocb);
>  }
>  
> +#ifndef CONFIG_PREEMPT_RT_BASE
>  static void free_ioctx(struct work_struct *work)
>  {
>  	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
> @@ -503,13 +507,36 @@ static void free_ioctx(struct work_struc
>  	free_percpu(ctx->cpu);
>  	kmem_cache_free(kioctx_cachep, ctx);
>  }
> +#else
> +static void free_ioctx_rcu(struct rcu_head *rcu)
> +{
> +	struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
> +
> +	pr_debug("freeing %p\n", ctx);
> +
> +	aio_free_ring(ctx);
> +	free_percpu(ctx->cpu);
> +	kmem_cache_free(kioctx_cachep, ctx);
> +}
> +#endif
>  
>  static void free_ioctx_reqs(struct percpu_ref *ref)
>  {
>  	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
>  
> +#ifndef CONFIG_PREEMPT_RT_BASE
>  	INIT_WORK(&ctx->free_work, free_ioctx);
>  	schedule_work(&ctx->free_work);
> +#else
> +	/*
> +	 * We're in ->release() under rcu_read_lock_sched(), and can't do
> +	 * anything that requires taking a sleeping lock, so ->release()
> +	 * becomes a two stage rcu process for -rt.  We've now done the
> +	 * rcu work that we can under locks made raw to get us this far.
> +	 * Defer the freeing bit until we're again allowed to schedule().
> +	 */
> +	call_rcu(&ctx->rcu, free_ioctx_rcu);
> +#endif
>  }
>  
>  /*
> @@ -522,7 +549,7 @@ static void free_ioctx_users(struct perc
>  	struct kioctx *ctx = container_of(ref, struct kioctx, users);
>  	struct kiocb *req;
>  
> -	spin_lock_irq(&ctx->ctx_lock);
> +	raw_spin_lock_irq(&ctx->ctx_lock);
>  
>  	while (!list_empty(&ctx->active_reqs)) {
>  		req = list_first_entry(&ctx->active_reqs,
> @@ -532,7 +559,7 @@ static void free_ioctx_users(struct perc
>  		kiocb_cancel(ctx, req);
>  	}
>  
> -	spin_unlock_irq(&ctx->ctx_lock);
> +	raw_spin_unlock_irq(&ctx->ctx_lock);
>  
>  	percpu_ref_kill(&ctx->reqs);
>  	percpu_ref_put(&ctx->reqs);
> @@ -645,8 +672,8 @@ static struct kioctx *ioctx_alloc(unsign
>  
>  	ctx->max_reqs = nr_events;
>  
> -	spin_lock_init(&ctx->ctx_lock);
> -	spin_lock_init(&ctx->completion_lock);
> +	raw_spin_lock_init(&ctx->ctx_lock);
> +	raw_spin_lock_init(&ctx->completion_lock);
>  	mutex_init(&ctx->ring_lock);
>  	/* Protect against page migration throughout kiotx setup by keeping
>  	 * the ring_lock mutex held until setup is complete. */
> @@ -948,9 +975,9 @@ void aio_complete(struct kiocb *iocb, lo
>  	if (iocb->ki_list.next) {
>  		unsigned long flags;
>  
> -		spin_lock_irqsave(&ctx->ctx_lock, flags);
> +		raw_spin_lock_irqsave(&ctx->ctx_lock, flags);
>  		list_del(&iocb->ki_list);
> -		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> +		raw_spin_unlock_irqrestore(&ctx->ctx_lock, flags);
>  	}
>  
>  	/*
> @@ -958,7 +985,7 @@ void aio_complete(struct kiocb *iocb, lo
>  	 * ctx->completion_lock to prevent other code from messing with the tail
>  	 * pointer since we might be called from irq context.
>  	 */
> -	spin_lock_irqsave(&ctx->completion_lock, flags);
> +	raw_spin_lock_irqsave(&ctx->completion_lock, flags);
>  
>  	tail = ctx->tail;
>  	pos = tail + AIO_EVENTS_OFFSET;
> @@ -993,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, lo
>  	kunmap_atomic(ring);
>  	flush_dcache_page(ctx->ring_pages[0]);
>  
> -	spin_unlock_irqrestore(&ctx->completion_lock, flags);
> +	raw_spin_unlock_irqrestore(&ctx->completion_lock, flags);
>  
>  	pr_debug("added to ring %p at [%u]\n", iocb, tail);
>  
> @@ -1515,7 +1542,7 @@ static struct kiocb *lookup_kiocb(struct
>  {
>  	struct list_head *pos;
>  
> -	assert_spin_locked(&ctx->ctx_lock);
> +	assert_raw_spin_locked(&ctx->ctx_lock);
>  
>  	if (key != KIOCB_KEY)
>  		return NULL;
> @@ -1555,7 +1582,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
>  	if (unlikely(!ctx))
>  		return -EINVAL;
>  
> -	spin_lock_irq(&ctx->ctx_lock);
> +	raw_spin_lock_irq(&ctx->ctx_lock);
>  
>  	kiocb = lookup_kiocb(ctx, iocb, key);
>  	if (kiocb)
> @@ -1563,7 +1590,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
>  	else
>  		ret = -EINVAL;
>  
> -	spin_unlock_irq(&ctx->ctx_lock);
> +	raw_spin_unlock_irq(&ctx->ctx_lock);
>  
>  	if (!ret) {
>  		/*
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> .
> 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-09  2:08 ` Lai Jiangshan
@ 2014-06-09  3:17   ` Mike Galbraith
  2014-06-09  6:22     ` Mike Galbraith
  2014-06-10  3:47   ` [RFC PATCH V2] " Mike Galbraith
  1 sibling, 1 reply; 13+ messages in thread
From: Mike Galbraith @ 2014-06-09  3:17 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: RT, LKML, Sebastian Andrzej Siewior, Steven Rostedt,
	Thomas Gleixner, Paul E. McKenney

On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote: 
> Hi, rt-people
> 
> I don't think it is the correct direction.

Yup, it's a band-aid, ergo RFC.

-Mike


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-09  3:17   ` Mike Galbraith
@ 2014-06-09  6:22     ` Mike Galbraith
  2014-06-09  9:04       ` Pavel Vasilyev
  0 siblings, 1 reply; 13+ messages in thread
From: Mike Galbraith @ 2014-06-09  6:22 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: RT, LKML, Sebastian Andrzej Siewior, Steven Rostedt,
	Thomas Gleixner, Paul E. McKenney

On Mon, 2014-06-09 at 05:17 +0200, Mike Galbraith wrote: 
> On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote: 
> > Hi, rt-people
> > 
> > I don't think it is the correct direction.
> 
> Yup, it's a band-aid, ergo RFC.

Making aio play by the rules is safe.  Another option is to bend the
rules up a little.  With the below, ltp aio-stress testcases haven't yet
griped or exploded either.. which somehow isn't quite as comforting as
"is safe" :)

---
 fs/aio.c |    6 ++++++
 1 file changed, 6 insertions(+)

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -509,7 +509,9 @@ static void free_ioctx_reqs(struct percp
 	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
 
 	INIT_WORK(&ctx->free_work, free_ioctx);
+	preempt_enable_rt();
 	schedule_work(&ctx->free_work);
+	preempt_disable_rt();
 }
 
 /*
@@ -522,7 +524,9 @@ static void free_ioctx_users(struct perc
 	struct kioctx *ctx = container_of(ref, struct kioctx, users);
 	struct kiocb *req;
 
+	preempt_enable_rt();
 	spin_lock_irq(&ctx->ctx_lock);
+	local_irq_disable_rt();
 
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
@@ -536,6 +540,8 @@ static void free_ioctx_users(struct perc
 
 	percpu_ref_kill(&ctx->reqs);
 	percpu_ref_put(&ctx->reqs);
+	preempt_disable_rt();
+	local_irq_enable_rt();
 }
 
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-09  6:22     ` Mike Galbraith
@ 2014-06-09  9:04       ` Pavel Vasilyev
  0 siblings, 0 replies; 13+ messages in thread
From: Pavel Vasilyev @ 2014-06-09  9:04 UTC (permalink / raw)
  Cc: RT, LKML

09.06.2014 10:22, Mike Galbraith пишет:
> On Mon, 2014-06-09 at 05:17 +0200, Mike Galbraith wrote:
>> On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote:
>>> Hi, rt-people

> @@ -522,7 +524,9 @@ static void free_ioctx_users(struct perc
>   	struct kioctx *ctx = container_of(ref, struct kioctx, users);
>   	struct kiocb *req;
>
> +	preempt_enable_rt();
>   	spin_lock_irq(&ctx->ctx_lock);
> +	local_irq_disable_rt();
>
>   	while (!list_empty(&ctx->active_reqs)) {
>   		req = list_first_entry(&ctx->active_reqs,
> @@ -536,6 +540,8 @@ static void free_ioctx_users(struct perc
>
>   	percpu_ref_kill(&ctx->reqs);
>   	percpu_ref_put(&ctx->reqs);
> +	preempt_disable_rt();
> +	local_irq_enable_rt();


I think, enable_/disable_ must be as mirror reflections


     preempt_enable_rt();
         local_irq_disable_rt();

                 do_something();

         local_irq_enable_rt();
     preempt_disable_rt();



-- 

                                                          Pavel.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-09  2:08 ` Lai Jiangshan
  2014-06-09  3:17   ` Mike Galbraith
@ 2014-06-10  3:47   ` Mike Galbraith
  2014-06-10 17:50     ` Benjamin LaHaise
  1 sibling, 1 reply; 13+ messages in thread
From: Mike Galbraith @ 2014-06-10  3:47 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: RT, LKML, Sebastian Andrzej Siewior, Steven Rostedt,
	Thomas Gleixner, Paul E. McKenney, Benjamin LaHaise

On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote: 
> Hi, rt-people
> 
> I don't think it is the correct direction.
> Softirq (including local_bh_disable()) in RT kernel should be preemptible.

How about the below then?

I was sorely tempted to post a tiny variant that dropped taking ctx_lock
in free_ioctx_users() entirely, as someone diddling with no reference
didn't make sense.  Cc Ben, he would know.

[  172.743098] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:768
[  172.743116] in_atomic(): 1, irqs_disabled(): 0, pid: 26, name: rcuos/2
[  172.743117] 2 locks held by rcuos/2/26:
[  172.743128]  #0:  (rcu_callback){.+.+..}, at: [<ffffffff810b1a12>] rcu_nocb_kthread+0x1e2/0x380
[  172.743135]  #1:  (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812acd26>] percpu_ref_kill_rcu+0xa6/0x1c0
[  172.743138] Preemption disabled at:[<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
[  172.743138]
[  172.743142] CPU: 0 PID: 26 Comm: rcuos/2 Not tainted 3.14.4-rt5 #31
[  172.743143] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
[  172.743148]  ffff8802231aa190 ffff8802231a5d08 ffffffff81582e9e 0000000000000000
[  172.743151]  ffff8802231a5d28 ffffffff81077aeb ffff880209f68140 ffff880209f681c0
[  172.743154]  ffff8802231a5d48 ffffffff81589304 ffff880209f68000 ffff880209f68000
[  172.743155] Call Trace:
[  172.743160]  [<ffffffff81582e9e>] dump_stack+0x4e/0x9c
[  172.743163]  [<ffffffff81077aeb>] __might_sleep+0xfb/0x170
[  172.743167]  [<ffffffff81589304>] rt_spin_lock+0x24/0x70
[  172.743171]  [<ffffffff811c5790>] free_ioctx_users+0x30/0x130
[  172.743174]  [<ffffffff812ace34>] percpu_ref_kill_rcu+0x1b4/0x1c0
[  172.743177]  [<ffffffff812acd26>] ? percpu_ref_kill_rcu+0xa6/0x1c0
[  172.743180]  [<ffffffff812acc80>] ? percpu_ref_kill_and_confirm+0x70/0x70
[  172.743183]  [<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
[  172.743185]  [<ffffffff810b1a12>] ? rcu_nocb_kthread+0x1e2/0x380
[  172.743189]  [<ffffffff810b1830>] ? rcu_report_exp_rnp.isra.52+0xc0/0xc0
[  172.743192]  [<ffffffff8106e046>] kthread+0xd6/0xf0
[  172.743194]  [<ffffffff8158900c>] ? _raw_spin_unlock_irq+0x2c/0x70
[  172.743197]  [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
[  172.743200]  [<ffffffff81591eec>] ret_from_fork+0x7c/0xb0
[  172.743203]  [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70

crash> gdb list *percpu_ref_kill_rcu+0x1b4
0xffffffff812ace34 is in percpu_ref_kill_rcu (include/linux/percpu-refcount.h:169).
164             pcpu_count = ACCESS_ONCE(ref->pcpu_count);
165
166             if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
167                     __this_cpu_dec(*pcpu_count);
168             else if (unlikely(atomic_dec_and_test(&ref->count)))
169                     ref->release(ref);
170
171             rcu_read_unlock_sched();
172     }
173

ref->release() path can't take sleeping locks, but in an rt kernel it does.
Defer ->release() work via call_rcu() to move sleeping locks out from under
rcu_read_lock_sched().

Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
---
 fs/aio.c |   38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -110,8 +110,6 @@ struct kioctx {
 	struct page		**ring_pages;
 	long			nr_pages;
 
-	struct work_struct	free_work;
-
 	struct {
 		/*
 		 * This counts the number of available slots in the ringbuffer,
@@ -143,6 +141,7 @@ struct kioctx {
 	struct file		*aio_ring_file;
 
 	unsigned		id;
+	struct rcu_head		rcu;
 };
 
 /*------ sysctl variables----*/
@@ -493,9 +492,9 @@ static int kiocb_cancel(struct kioctx *c
 	return cancel(kiocb);
 }
 
-static void free_ioctx(struct work_struct *work)
+static void free_ioctx_rcu(struct rcu_head *rcu)
 {
-	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
+	struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
 
 	pr_debug("freeing %p\n", ctx);
 
@@ -508,18 +507,12 @@ static void free_ioctx_reqs(struct percp
 {
 	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
 
-	INIT_WORK(&ctx->free_work, free_ioctx);
-	schedule_work(&ctx->free_work);
+	call_rcu(&ctx->rcu, free_ioctx_rcu);
 }
 
-/*
- * When this function runs, the kioctx has been removed from the "hash table"
- * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
- * now it's safe to cancel any that need to be.
- */
-static void free_ioctx_users(struct percpu_ref *ref)
+static void free_ioctx_users_rcu(struct rcu_head *rcu)
 {
-	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+	struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
 	struct kiocb *req;
 
 	spin_lock_irq(&ctx->ctx_lock);
@@ -538,6 +531,25 @@ static void free_ioctx_users(struct perc
 	percpu_ref_put(&ctx->reqs);
 }
 
+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
+ *
+ * NOTE: -rt must defer this to avoid taking sleeping ctx_lock while under
+ * rcu_read_lock_sched() during ->release().
+ */
+static void free_ioctx_users(struct percpu_ref *ref)
+{
+	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+	call_rcu(&ctx->rcu, free_ioctx_users_rcu);
+#else
+	free_ioctx_users_rcu(&ctx->rcu);
+#endif
+}
+
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 {
 	unsigned i, new_nr;




^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-10  3:47   ` [RFC PATCH V2] " Mike Galbraith
@ 2014-06-10 17:50     ` Benjamin LaHaise
  2014-06-11  4:10       ` Mike Galbraith
  2014-06-12 20:26       ` Kent Overstreet
  0 siblings, 2 replies; 13+ messages in thread
From: Benjamin LaHaise @ 2014-06-10 17:50 UTC (permalink / raw)
  To: Mike Galbraith, Kent Overstreet
  Cc: Lai Jiangshan, RT, LKML, Sebastian Andrzej Siewior,
	Steven Rostedt, Thomas Gleixner, Paul E. McKenney

On Tue, Jun 10, 2014 at 05:47:28AM +0200, Mike Galbraith wrote:
> On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote: 
> > Hi, rt-people
> > 
> > I don't think it is the correct direction.
> > Softirq (including local_bh_disable()) in RT kernel should be preemptible.
> 
> How about the below then?
> 
> I was sorely tempted to post a tiny variant that dropped taking ctx_lock
> in free_ioctx_users() entirely, as someone diddling with no reference
> didn't make sense.  Cc Ben, he would know.

That should be okay...  Let's ask Kent to chime in on whether this looks 
safe to him on the percpu ref front as well, since he's the one who wrote 
this code.

		-ben


> [  172.743098] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:768
> [  172.743116] in_atomic(): 1, irqs_disabled(): 0, pid: 26, name: rcuos/2
> [  172.743117] 2 locks held by rcuos/2/26:
> [  172.743128]  #0:  (rcu_callback){.+.+..}, at: [<ffffffff810b1a12>] rcu_nocb_kthread+0x1e2/0x380
> [  172.743135]  #1:  (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812acd26>] percpu_ref_kill_rcu+0xa6/0x1c0
> [  172.743138] Preemption disabled at:[<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
> [  172.743138]
> [  172.743142] CPU: 0 PID: 26 Comm: rcuos/2 Not tainted 3.14.4-rt5 #31
> [  172.743143] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
> [  172.743148]  ffff8802231aa190 ffff8802231a5d08 ffffffff81582e9e 0000000000000000
> [  172.743151]  ffff8802231a5d28 ffffffff81077aeb ffff880209f68140 ffff880209f681c0
> [  172.743154]  ffff8802231a5d48 ffffffff81589304 ffff880209f68000 ffff880209f68000
> [  172.743155] Call Trace:
> [  172.743160]  [<ffffffff81582e9e>] dump_stack+0x4e/0x9c
> [  172.743163]  [<ffffffff81077aeb>] __might_sleep+0xfb/0x170
> [  172.743167]  [<ffffffff81589304>] rt_spin_lock+0x24/0x70
> [  172.743171]  [<ffffffff811c5790>] free_ioctx_users+0x30/0x130
> [  172.743174]  [<ffffffff812ace34>] percpu_ref_kill_rcu+0x1b4/0x1c0
> [  172.743177]  [<ffffffff812acd26>] ? percpu_ref_kill_rcu+0xa6/0x1c0
> [  172.743180]  [<ffffffff812acc80>] ? percpu_ref_kill_and_confirm+0x70/0x70
> [  172.743183]  [<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
> [  172.743185]  [<ffffffff810b1a12>] ? rcu_nocb_kthread+0x1e2/0x380
> [  172.743189]  [<ffffffff810b1830>] ? rcu_report_exp_rnp.isra.52+0xc0/0xc0
> [  172.743192]  [<ffffffff8106e046>] kthread+0xd6/0xf0
> [  172.743194]  [<ffffffff8158900c>] ? _raw_spin_unlock_irq+0x2c/0x70
> [  172.743197]  [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
> [  172.743200]  [<ffffffff81591eec>] ret_from_fork+0x7c/0xb0
> [  172.743203]  [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
> 
> crash> gdb list *percpu_ref_kill_rcu+0x1b4
> 0xffffffff812ace34 is in percpu_ref_kill_rcu (include/linux/percpu-refcount.h:169).
> 164             pcpu_count = ACCESS_ONCE(ref->pcpu_count);
> 165
> 166             if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
> 167                     __this_cpu_dec(*pcpu_count);
> 168             else if (unlikely(atomic_dec_and_test(&ref->count)))
> 169                     ref->release(ref);
> 170
> 171             rcu_read_unlock_sched();
> 172     }
> 173
> 
> ref->release() path can't take sleeping locks, but in an rt kernel it does.
> Defer ->release() work via call_rcu() to move sleeping locks out from under
> rcu_read_lock_sched().
> 
> Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
> ---
>  fs/aio.c |   38 +++++++++++++++++++++++++-------------
>  1 file changed, 25 insertions(+), 13 deletions(-)
> 
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -110,8 +110,6 @@ struct kioctx {
>  	struct page		**ring_pages;
>  	long			nr_pages;
>  
> -	struct work_struct	free_work;
> -
>  	struct {
>  		/*
>  		 * This counts the number of available slots in the ringbuffer,
> @@ -143,6 +141,7 @@ struct kioctx {
>  	struct file		*aio_ring_file;
>  
>  	unsigned		id;
> +	struct rcu_head		rcu;
>  };
>  
>  /*------ sysctl variables----*/
> @@ -493,9 +492,9 @@ static int kiocb_cancel(struct kioctx *c
>  	return cancel(kiocb);
>  }
>  
> -static void free_ioctx(struct work_struct *work)
> +static void free_ioctx_rcu(struct rcu_head *rcu)
>  {
> -	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
> +	struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
>  
>  	pr_debug("freeing %p\n", ctx);
>  
> @@ -508,18 +507,12 @@ static void free_ioctx_reqs(struct percp
>  {
>  	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
>  
> -	INIT_WORK(&ctx->free_work, free_ioctx);
> -	schedule_work(&ctx->free_work);
> +	call_rcu(&ctx->rcu, free_ioctx_rcu);
>  }
>  
> -/*
> - * When this function runs, the kioctx has been removed from the "hash table"
> - * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
> - * now it's safe to cancel any that need to be.
> - */
> -static void free_ioctx_users(struct percpu_ref *ref)
> +static void free_ioctx_users_rcu(struct rcu_head *rcu)
>  {
> -	struct kioctx *ctx = container_of(ref, struct kioctx, users);
> +	struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
>  	struct kiocb *req;
>  
>  	spin_lock_irq(&ctx->ctx_lock);
> @@ -538,6 +531,25 @@ static void free_ioctx_users(struct perc
>  	percpu_ref_put(&ctx->reqs);
>  }
>  
> +/*
> + * When this function runs, the kioctx has been removed from the "hash table"
> + * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
> + * now it's safe to cancel any that need to be.
> + *
> + * NOTE: -rt must defer this to avoid taking sleeping ctx_lock while under
> + * rcu_read_lock_sched() during ->release().
> + */
> +static void free_ioctx_users(struct percpu_ref *ref)
> +{
> +	struct kioctx *ctx = container_of(ref, struct kioctx, users);
> +
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +	call_rcu(&ctx->rcu, free_ioctx_users_rcu);
> +#else
> +	free_ioctx_users_rcu(&ctx->rcu);
> +#endif
> +}
> +
>  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
>  {
>  	unsigned i, new_nr;
> 
> 

-- 
"Thought is the essence of where you are now."

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-10 17:50     ` Benjamin LaHaise
@ 2014-06-11  4:10       ` Mike Galbraith
  2014-06-12 20:26       ` Kent Overstreet
  1 sibling, 0 replies; 13+ messages in thread
From: Mike Galbraith @ 2014-06-11  4:10 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Kent Overstreet, Lai Jiangshan, RT, LKML,
	Sebastian Andrzej Siewior, Steven Rostedt, Thomas Gleixner,
	Paul E. McKenney

On Tue, 2014-06-10 at 13:50 -0400, Benjamin LaHaise wrote: 
> On Tue, Jun 10, 2014 at 05:47:28AM +0200, Mike Galbraith wrote:
> > On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote: 
> > > Hi, rt-people
> > > 
> > > I don't think it is the correct direction.
> > > Softirq (including local_bh_disable()) in RT kernel should be preemptible.
> > 
> > How about the below then?
> > 
> > I was sorely tempted to post a tiny variant that dropped taking ctx_lock
> > in free_ioctx_users() entirely, as someone diddling with no reference
> > didn't make sense.  Cc Ben, he would know.
> 
> That should be okay...  Let's ask Kent to chime in on whether this looks 
> safe to him on the percpu ref front as well, since he's the one who wrote 
> this code.

Looking at the gizzard of our in-tree user of kiocb_set_cancel_fn()
(gadget), cancel() leads to dequeue() methods, which take other sleeping
locks, so tiniest variant is not an option, patchlet stands.

-Mike


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-10 17:50     ` Benjamin LaHaise
  2014-06-11  4:10       ` Mike Galbraith
@ 2014-06-12 20:26       ` Kent Overstreet
  2014-06-25 15:24         ` Benjamin LaHaise
  1 sibling, 1 reply; 13+ messages in thread
From: Kent Overstreet @ 2014-06-12 20:26 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Mike Galbraith, Lai Jiangshan, RT, LKML,
	Sebastian Andrzej Siewior, Steven Rostedt, Thomas Gleixner,
	Paul E. McKenney

On Tue, Jun 10, 2014 at 01:50:01PM -0400, Benjamin LaHaise wrote:
> On Tue, Jun 10, 2014 at 05:47:28AM +0200, Mike Galbraith wrote:
> > On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote: 
> > > Hi, rt-people
> > > 
> > > I don't think it is the correct direction.
> > > Softirq (including local_bh_disable()) in RT kernel should be preemptible.
> > 
> > How about the below then?
> > 
> > I was sorely tempted to post a tiny variant that dropped taking ctx_lock
> > in free_ioctx_users() entirely, as someone diddling with no reference
> > didn't make sense.  Cc Ben, he would know.
> 
> That should be okay...  Let's ask Kent to chime in on whether this looks 
> safe to him on the percpu ref front as well, since he's the one who wrote 
> this code.

Ok, finally got around to reading the whole thread - honestly, I would just punt
to workqueue to do the free_ioctx_users(). AFAICT that should be perfectly safe
and even aside from rt it would be good change so we're not cancelling an
arbitrary number of kiocbs from rcu callback context.

Kind of a related change, it should be possible to just grab the entire list of
kiocbs with ctx_lock held (remove them all at once from ctx->active_reqs), then
cancel them without ctx_lock held.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-12 20:26       ` Kent Overstreet
@ 2014-06-25 15:24         ` Benjamin LaHaise
  2014-06-26  7:37           ` Mike Galbraith
  2015-02-16 17:59           ` Sebastian Andrzej Siewior
  0 siblings, 2 replies; 13+ messages in thread
From: Benjamin LaHaise @ 2014-06-25 15:24 UTC (permalink / raw)
  To: Kent Overstreet, Mike Galbraith
  Cc: Lai Jiangshan, RT, LKML, Sebastian Andrzej Siewior,
	Steven Rostedt, Thomas Gleixner, Paul E. McKenney

On Thu, Jun 12, 2014 at 01:26:02PM -0700, Kent Overstreet wrote:
> On Tue, Jun 10, 2014 at 01:50:01PM -0400, Benjamin LaHaise wrote:
> > On Tue, Jun 10, 2014 at 05:47:28AM +0200, Mike Galbraith wrote:
> > > On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote: 
> > > > Hi, rt-people
> > > > 
> > > > I don't think it is the correct direction.
> > > > Softirq (including local_bh_disable()) in RT kernel should be preemptible.
> > > 
> > > How about the below then?
> > > 
> > > I was sorely tempted to post a tiny variant that dropped taking ctx_lock
> > > in free_ioctx_users() entirely, as someone diddling with no reference
> > > didn't make sense.  Cc Ben, he would know.
> > 
> > That should be okay...  Let's ask Kent to chime in on whether this looks 
> > safe to him on the percpu ref front as well, since he's the one who wrote 
> > this code.
> 
> Ok, finally got around to reading the whole thread - honestly, I would just punt
> to workqueue to do the free_ioctx_users(). AFAICT that should be perfectly safe
> and even aside from rt it would be good change so we're not cancelling an
> arbitrary number of kiocbs from rcu callback context.

I finally have some time to look at this patch in detail.  I'd rather do the 
below variant that does what Kent suggested.  Mike, can you confirm that 
this fixes the issue you reported?  It's on top of my current aio-next tree 
at git://git.kvack.org/~bcrl/aio-next.git .  If that's okay, I'll queue it 
up.  Does this bug fix need to end up in -stable kernels as well or would it 
end up in the -rt tree?

> Kind of a related change, it should be possible to just grab the entire list of
> kiocbs with ctx_lock held (remove them all at once from ctx->active_reqs), then
> cancel them without ctx_lock held.

No, that is not safe.  Cancellation has to be done under ctx_lock.

		-ben
-- 
"Thought is the essence of where you are now."


diff --git a/fs/aio.c b/fs/aio.c
index c1d8c48..0b038f2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -526,9 +526,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  * now it's safe to cancel any that need to be.
  */
-static void free_ioctx_users(struct percpu_ref *ref)
+static void free_ioctx_users_work(struct work_struct *work)
 {
-	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
 	struct kiocb *req;
 
 	spin_lock_irq(&ctx->ctx_lock);
@@ -547,6 +547,18 @@ static void free_ioctx_users(struct percpu_ref *ref)
 	percpu_ref_put(&ctx->reqs);
 }
 
+static void free_ioctx_users(struct percpu_ref *ref)
+{
+	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+#ifdef CONFIG_PREEMPT_RT_BASE
+	INIT_WORK(&ctx->free_work, free_ioctx_users_work);
+	schedule_work(&ctx->free_work);
+#else
+	free_ioctx_users_work(&ctx->free_work);
+#endif
+}
+
+
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 {
 	unsigned i, new_nr;

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-25 15:24         ` Benjamin LaHaise
@ 2014-06-26  7:37           ` Mike Galbraith
  2014-06-26 16:42             ` Benjamin LaHaise
  2015-02-16 17:59           ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 13+ messages in thread
From: Mike Galbraith @ 2014-06-26  7:37 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Kent Overstreet, Lai Jiangshan, RT, LKML,
	Sebastian Andrzej Siewior, Steven Rostedt, Thomas Gleixner,
	Paul E. McKenney

Hi Ben,

On Wed, 2014-06-25 at 11:24 -0400, Benjamin LaHaise wrote:

> I finally have some time to look at this patch in detail.  I'd rather do the 
> below variant that does what Kent suggested.  Mike, can you confirm that 
> this fixes the issue you reported?  It's on top of my current aio-next tree 
> at git://git.kvack.org/~bcrl/aio-next.git .  If that's okay, I'll queue it 
> up.  Does this bug fix need to end up in -stable kernels as well or would it 
> end up in the -rt tree?

It's an -rt specific problem, so presumably any fix would only go into
-rt trees until it manages to get merged.

I knew intervening change wasn't likely to fix the might_sleep() splat
up, but did the test anyway with fixed up CONFIG_PREEMPT_RT_BASE typo.
schedule_work() leads to an rtmutex, so -rt still has to ship that out
from under rcu_read_lock_sched().

marge:/usr/local/src/kernel/linux-3.14-rt # quilt applied|tail
patches/mm-memcg-make-refill_stock-use-get_cpu_light.patch
patches/printk-fix-lockdep-instrumentation-of-console_sem.patch
patches/aio-block-io_destroy-until-all-context-requests-are-completed.patch
patches/fs-aio-Remove-ctx-parameter-in-kiocb_cancel.patch
patches/aio-report-error-from-io_destroy-when-threads-race-in-io_destroy.patch
patches/aio-cleanup-flatten-kill_ioctx.patch
patches/aio-fix-aio-request-leak-when-events-are-reaped-by-userspace.patch
patches/aio-fix-kernel-memory-disclosure-in-io_getevents-introduced-in-v3.10.patch
patches/aio-change-exit_aio-to-load-mm-ioctx_table-once-and-avoid-rcu_read_lock.patch
patches/rt-aio-fix-rcu-garbage-collection-might_sleep-splat-ben.patch

[  191.057656] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:792
[  191.057672] in_atomic(): 1, irqs_disabled(): 0, pid: 22, name: rcuc/0
[  191.057674] 2 locks held by rcuc/0/22:
[  191.057684]  #0:  (rcu_callback){.+.+..}, at: [<ffffffff810ceb87>] rcu_cpu_kthread+0x2d7/0x840
[  191.057691]  #1:  (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812e52f6>] percpu_ref_kill_rcu+0xa6/0x1c0
[  191.057694] Preemption disabled at:[<ffffffff810cebca>] rcu_cpu_kthread+0x31a/0x840
[  191.057695] 
[  191.057698] CPU: 0 PID: 22 Comm: rcuc/0 Tainted: GF       W    3.14.8-rt5 #47
[  191.057699] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
[  191.057704]  ffff88007c5d8000 ffff88007c5d7c98 ffffffff815696ed 0000000000000000
[  191.057708]  ffff88007c5d7cb8 ffffffff8108c3e5 ffff88007dc0e120 000000000000e120
[  191.057711]  ffff88007c5d7cd8 ffffffff8156f404 ffff88007dc0e120 ffff88007dc0e120
[  191.057712] Call Trace:
[  191.057716]  [<ffffffff815696ed>] dump_stack+0x4e/0x9c
[  191.057720]  [<ffffffff8108c3e5>] __might_sleep+0x105/0x180
[  191.057723]  [<ffffffff8156f404>] rt_spin_lock+0x24/0x70
[  191.057727]  [<ffffffff81078897>] queue_work_on+0x67/0x1a0
[  191.057731]  [<ffffffff81216fc2>] free_ioctx_users+0x72/0x80
[  191.057734]  [<ffffffff812e5404>] percpu_ref_kill_rcu+0x1b4/0x1c0
[  191.057737]  [<ffffffff812e52f6>] ? percpu_ref_kill_rcu+0xa6/0x1c0
[  191.057740]  [<ffffffff812e5250>] ? percpu_ref_kill_and_confirm+0x70/0x70
[  191.057742]  [<ffffffff810cebca>] rcu_cpu_kthread+0x31a/0x840
[  191.057745]  [<ffffffff810ceb87>] ? rcu_cpu_kthread+0x2d7/0x840
[  191.057749]  [<ffffffff8108a76d>] smpboot_thread_fn+0x1dd/0x340
[  191.057752]  [<ffffffff8156c45a>] ? schedule+0x2a/0xa0
[  191.057755]  [<ffffffff8108a590>] ? smpboot_register_percpu_thread+0x100/0x100
[  191.057758]  [<ffffffff81081ca6>] kthread+0xd6/0xf0
[  191.057761]  [<ffffffff81081bd0>] ? __kthread_parkme+0x70/0x70
[  191.057764]  [<ffffffff815780bc>] ret_from_fork+0x7c/0xb0
[  191.057767]  [<ffffffff81081bd0>] ? __kthread_parkme+0x70/0x70



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-26  7:37           ` Mike Galbraith
@ 2014-06-26 16:42             ` Benjamin LaHaise
  0 siblings, 0 replies; 13+ messages in thread
From: Benjamin LaHaise @ 2014-06-26 16:42 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Kent Overstreet, Lai Jiangshan, RT, LKML,
	Sebastian Andrzej Siewior, Steven Rostedt, Thomas Gleixner,
	Paul E. McKenney

On Thu, Jun 26, 2014 at 09:37:14AM +0200, Mike Galbraith wrote:
> Hi Ben,
> 
> On Wed, 2014-06-25 at 11:24 -0400, Benjamin LaHaise wrote:
> 
> > I finally have some time to look at this patch in detail.  I'd rather do the 
> > below variant that does what Kent suggested.  Mike, can you confirm that 
> > this fixes the issue you reported?  It's on top of my current aio-next tree 
> > at git://git.kvack.org/~bcrl/aio-next.git .  If that's okay, I'll queue it 
> > up.  Does this bug fix need to end up in -stable kernels as well or would it 
> > end up in the -rt tree?
> 
> It's an -rt specific problem, so presumably any fix would only go into
> -rt trees until it manages to get merged.
> 
> I knew intervening change wasn't likely to fix the might_sleep() splat
> up, but did the test anyway with fixed up CONFIG_PREEMPT_RT_BASE typo.
> schedule_work() leads to an rtmutex, so -rt still has to ship that out
> from under rcu_read_lock_sched().

So that doesn't fix it.  I think you should fix schedule_work(), because 
that should be callable from any context.  Abusing RCU instead of using 
schedule_work() is not the right way to fix this.

		-ben

> marge:/usr/local/src/kernel/linux-3.14-rt # quilt applied|tail
> patches/mm-memcg-make-refill_stock-use-get_cpu_light.patch
> patches/printk-fix-lockdep-instrumentation-of-console_sem.patch
> patches/aio-block-io_destroy-until-all-context-requests-are-completed.patch
> patches/fs-aio-Remove-ctx-parameter-in-kiocb_cancel.patch
> patches/aio-report-error-from-io_destroy-when-threads-race-in-io_destroy.patch
> patches/aio-cleanup-flatten-kill_ioctx.patch
> patches/aio-fix-aio-request-leak-when-events-are-reaped-by-userspace.patch
> patches/aio-fix-kernel-memory-disclosure-in-io_getevents-introduced-in-v3.10.patch
> patches/aio-change-exit_aio-to-load-mm-ioctx_table-once-and-avoid-rcu_read_lock.patch
> patches/rt-aio-fix-rcu-garbage-collection-might_sleep-splat-ben.patch
> 
> [  191.057656] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:792
> [  191.057672] in_atomic(): 1, irqs_disabled(): 0, pid: 22, name: rcuc/0
> [  191.057674] 2 locks held by rcuc/0/22:
> [  191.057684]  #0:  (rcu_callback){.+.+..}, at: [<ffffffff810ceb87>] rcu_cpu_kthread+0x2d7/0x840
> [  191.057691]  #1:  (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812e52f6>] percpu_ref_kill_rcu+0xa6/0x1c0
> [  191.057694] Preemption disabled at:[<ffffffff810cebca>] rcu_cpu_kthread+0x31a/0x840
> [  191.057695] 
> [  191.057698] CPU: 0 PID: 22 Comm: rcuc/0 Tainted: GF       W    3.14.8-rt5 #47
> [  191.057699] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
> [  191.057704]  ffff88007c5d8000 ffff88007c5d7c98 ffffffff815696ed 0000000000000000
> [  191.057708]  ffff88007c5d7cb8 ffffffff8108c3e5 ffff88007dc0e120 000000000000e120
> [  191.057711]  ffff88007c5d7cd8 ffffffff8156f404 ffff88007dc0e120 ffff88007dc0e120
> [  191.057712] Call Trace:
> [  191.057716]  [<ffffffff815696ed>] dump_stack+0x4e/0x9c
> [  191.057720]  [<ffffffff8108c3e5>] __might_sleep+0x105/0x180
> [  191.057723]  [<ffffffff8156f404>] rt_spin_lock+0x24/0x70
> [  191.057727]  [<ffffffff81078897>] queue_work_on+0x67/0x1a0
> [  191.057731]  [<ffffffff81216fc2>] free_ioctx_users+0x72/0x80
> [  191.057734]  [<ffffffff812e5404>] percpu_ref_kill_rcu+0x1b4/0x1c0
> [  191.057737]  [<ffffffff812e52f6>] ? percpu_ref_kill_rcu+0xa6/0x1c0
> [  191.057740]  [<ffffffff812e5250>] ? percpu_ref_kill_and_confirm+0x70/0x70
> [  191.057742]  [<ffffffff810cebca>] rcu_cpu_kthread+0x31a/0x840
> [  191.057745]  [<ffffffff810ceb87>] ? rcu_cpu_kthread+0x2d7/0x840
> [  191.057749]  [<ffffffff8108a76d>] smpboot_thread_fn+0x1dd/0x340
> [  191.057752]  [<ffffffff8156c45a>] ? schedule+0x2a/0xa0
> [  191.057755]  [<ffffffff8108a590>] ? smpboot_register_percpu_thread+0x100/0x100
> [  191.057758]  [<ffffffff81081ca6>] kthread+0xd6/0xf0
> [  191.057761]  [<ffffffff81081bd0>] ? __kthread_parkme+0x70/0x70
> [  191.057764]  [<ffffffff815780bc>] ret_from_fork+0x7c/0xb0
> [  191.057767]  [<ffffffff81081bd0>] ? __kthread_parkme+0x70/0x70
> 

-- 
"Thought is the essence of where you are now."

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat
  2014-06-25 15:24         ` Benjamin LaHaise
  2014-06-26  7:37           ` Mike Galbraith
@ 2015-02-16 17:59           ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 13+ messages in thread
From: Sebastian Andrzej Siewior @ 2015-02-16 17:59 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Kent Overstreet, Mike Galbraith, Lai Jiangshan, RT, LKML,
	Steven Rostedt, Thomas Gleixner, Paul E. McKenney

* Benjamin LaHaise | 2014-06-25 11:24:45 [-0400]:

>I finally have some time to look at this patch in detail.  I'd rather do the 
>below variant that does what Kent suggested.  Mike, can you confirm that 
>this fixes the issue you reported?  It's on top of my current aio-next tree 
>at git://git.kvack.org/~bcrl/aio-next.git .  If that's okay, I'll queue it 
>up.  Does this bug fix need to end up in -stable kernels as well or would it 
>end up in the -rt tree?

This looks smaller compared to the RCU version. Since recently I have
simple-workqueue in -RT so I could replace schedule_work() with it. And
looking at the code I would have to do this change to free_ioctx_users()
and free_ioctx_reqs(). So here is what I am about to add for next -RT.

Ach. It compiles and I've never seen that splat so any feedback is
welcome :)

diff --git a/fs/aio.c b/fs/aio.c
index 14b93159ef83..551fcfe3fd58 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -40,6 +40,7 @@
 #include <linux/ramfs.h>
 #include <linux/percpu-refcount.h>
 #include <linux/mount.h>
+#include <linux/work-simple.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -110,7 +111,7 @@ struct kioctx {
 	struct page		**ring_pages;
 	long			nr_pages;
 
-	struct work_struct	free_work;
+	struct swork_event	free_work;
 
 	/*
 	 * signals when all in-flight requests are done
@@ -226,6 +227,7 @@ static int __init aio_setup(void)
 		.mount		= aio_mount,
 		.kill_sb	= kill_anon_super,
 	};
+	BUG_ON(swork_get());
 	aio_mnt = kern_mount(&aio_fs);
 	if (IS_ERR(aio_mnt))
 		panic("Failed to create aio fs mount.");
@@ -505,9 +507,9 @@ static int kiocb_cancel(struct kiocb *kiocb)
 	return cancel(kiocb);
 }
 
-static void free_ioctx(struct work_struct *work)
+static void free_ioctx(struct swork_event *sev)
 {
-	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
+	struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
 
 	pr_debug("freeing %p\n", ctx);
 
@@ -526,8 +528,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 	if (ctx->requests_done)
 		complete(ctx->requests_done);
 
-	INIT_WORK(&ctx->free_work, free_ioctx);
-	schedule_work(&ctx->free_work);
+	INIT_SWORK(&ctx->free_work, free_ioctx);
+	swork_queue(&ctx->free_work);
 }
 
 /*
@@ -535,9 +537,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  * now it's safe to cancel any that need to be.
  */
-static void free_ioctx_users(struct percpu_ref *ref)
+static void free_ioctx_users_work(struct swork_event *sev)
 {
-	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+	struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
 	struct kiocb *req;
 
 	spin_lock_irq(&ctx->ctx_lock);
@@ -556,6 +558,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
 	percpu_ref_put(&ctx->reqs);
 }
 
+static void free_ioctx_users(struct percpu_ref *ref)
+{
+	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+
+	INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
+	swork_queue(&ctx->free_work);
+}
+
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 {
 	unsigned i, new_nr;
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2015-02-16 17:59 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-06-08  8:35 [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat Mike Galbraith
2014-06-09  2:08 ` Lai Jiangshan
2014-06-09  3:17   ` Mike Galbraith
2014-06-09  6:22     ` Mike Galbraith
2014-06-09  9:04       ` Pavel Vasilyev
2014-06-10  3:47   ` [RFC PATCH V2] " Mike Galbraith
2014-06-10 17:50     ` Benjamin LaHaise
2014-06-11  4:10       ` Mike Galbraith
2014-06-12 20:26       ` Kent Overstreet
2014-06-25 15:24         ` Benjamin LaHaise
2014-06-26  7:37           ` Mike Galbraith
2014-06-26 16:42             ` Benjamin LaHaise
2015-02-16 17:59           ` Sebastian Andrzej Siewior

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.