All of lore.kernel.org
 help / color / mirror / Atom feed
From: Lai Jiangshan <laijs@cn.fujitsu.com>
To: paulmck@linux.vnet.ibm.com
Cc: Lai Jiangshan <eag0628@gmail.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Peter Zijlstra <peterz@infradead.org>,
	LKML <linux-kernel@vger.kernel.org>,
	Dipankar Sarma <dipankar@in.ibm.com>,
	Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site
Date: Mon, 26 Aug 2013 10:39:32 +0800	[thread overview]
Message-ID: <521ABFE4.3070703@cn.fujitsu.com> (raw)
In-Reply-To: <20130825174304.GL3871@linux.vnet.ibm.com>

On 08/26/2013 01:43 AM, Paul E. McKenney wrote:
> On Sun, Aug 25, 2013 at 11:19:37PM +0800, Lai Jiangshan wrote:
>> Hi, Steven
>>
>> Any comments about this patch?
> 
> For whatever it is worth, it ran without incident for two hours worth
> of rcutorture on my P5 test (boosting but no CPU hotplug).
> 
> Lai, do you have a specific test for this patch?  

Also rcutorture.
(A special module is added to ensure all paths of my code are covered.)

> Your deadlock
> scenario looks plausible, but is apparently not occurring in the
> mainline kernel.

Yes, you can leave this possible bug alone until a real problem happens,
or just disallow overlapping.
I can write some debug code for it which allows us to find out
such problems earlier.

I guess this is a useful usage pattern of RCU:

again:
	rcu_read_lock();
	obj = rcu_dereference(ptr);
	spin_lock_XX(obj->lock);
	if (obj is invalid) {
		spin_unlock_XX(obj->lock);
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();
	# use obj
	spin_unlock_XX(obj->lock);

If we encourage this pattern, we should fix all the related problems.

Thanks,
Lai

> 
> 							Thanx, Paul
> 
>> Thanks,
>> Lai
>>
>>
>> On Fri, Aug 23, 2013 at 2:26 PM, Lai Jiangshan <laijs@cn.fujitsu.com> wrote:
>>
>>> [PATCH] rcu/rt_mutex: eliminate a kind of deadlock for rcu read site
>>>
>>> Current rtmutex's lock->wait_lock doesn't disable softirqs nor irqs, so it will
>>> cause rcu read site deadlock when rcu overlaps with any
>>> softirq-context/irq-context lock.
>>>
>>> @L is a spinlock of softirq or irq context.
>>>
>>> CPU1                                    cpu2(rcu boost)
>>> rcu_read_lock()                         rt_mutex_lock()
>>> <preemption and reschedule back>          raw_spin_lock(lock->wait_lock)
>>> spin_lock_XX(L)                           <interrupt and doing softirq or
>>> irq>
>>> rcu_read_unlock()                         do_softirq()
>>>   rcu_read_unlock_special()
>>>       rt_mutex_unlock()
>>>       raw_spin_lock(lock->wait_lock)        spin_lock_XX(L)  **DEADLOCK**
>>>
>>> This patch fixes this kind of deadlock by removing rt_mutex_unlock() from
>>> rcu_read_unlock(), new rt_mutex_rcu_deboost_unlock() is called instead.
>>> Thus rtmutex's lock->wait_lock will not be called from rcu_read_unlock().
>>>
>>> This patch does not eliminate all kinds of rcu-read-site deadlock,
>>> if @L is a scheduler lock, it will be deadlock, we should apply Paul's rule
>>> in this case.(avoid overlapping or preempt_disable()).
>>>
>>> rt_mutex_rcu_deboost_unlock() requires the @waiter is queued, so we
>>> can't directly call rt_mutex_lock(&mtx) in the rcu_boost thread,
>>> we split rt_mutex_lock(&mtx) into two steps just like pi-futex.
>>> This results in an internal state in the rcu_boost thread and makes the
>>> rcu_boost thread a bit more complicated.
>>>
>>> Thanks
>>> Lai
>>>
>>> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
>>> index 5cd0f09..8830874 100644
>>> --- a/include/linux/init_task.h
>>> +++ b/include/linux/init_task.h
>>> @@ -102,7 +102,7 @@ extern struct group_info init_groups;
>>>
>>>  #ifdef CONFIG_RCU_BOOST
>>>  #define INIT_TASK_RCU_BOOST()                                          \
>>> -       .rcu_boost_mutex = NULL,
>>> +       .rcu_boost_waiter = NULL,
>>>  #else
>>>  #define INIT_TASK_RCU_BOOST()
>>>  #endif
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index e9995eb..1eca99f 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -1078,7 +1078,7 @@ struct task_struct {
>>>         struct rcu_node *rcu_blocked_node;
>>>  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
>>>  #ifdef CONFIG_RCU_BOOST
>>> -       struct rt_mutex *rcu_boost_mutex;
>>> +       struct rt_mutex_waiter *rcu_boost_waiter;
>>>  #endif /* #ifdef CONFIG_RCU_BOOST */
>>>
>>>  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
>>> @@ -1723,7 +1723,7 @@ static inline void rcu_copy_process(struct
>>> task_struct *p)
>>>         p->rcu_blocked_node = NULL;
>>>  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
>>>  #ifdef CONFIG_RCU_BOOST
>>> -       p->rcu_boost_mutex = NULL;
>>> +       p->rcu_boost_waiter = NULL;
>>>  #endif /* #ifdef CONFIG_RCU_BOOST */
>>>         INIT_LIST_HEAD(&p->rcu_node_entry);
>>>  }
>>> diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
>>> index 769e12e..d207ddd 100644
>>> --- a/kernel/rcutree_plugin.h
>>> +++ b/kernel/rcutree_plugin.h
>>> @@ -33,6 +33,7 @@
>>>  #define RCU_KTHREAD_PRIO 1
>>>
>>>  #ifdef CONFIG_RCU_BOOST
>>> +#include "rtmutex_common.h"
>>>  #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
>>>  #else
>>>  #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
>>> @@ -340,7 +341,7 @@ void rcu_read_unlock_special(struct task_struct *t)
>>>         unsigned long flags;
>>>         struct list_head *np;
>>>  #ifdef CONFIG_RCU_BOOST
>>> -       struct rt_mutex *rbmp = NULL;
>>> +       struct rt_mutex_waiter *waiter = NULL;
>>>  #endif /* #ifdef CONFIG_RCU_BOOST */
>>>         struct rcu_node *rnp;
>>>         int special;
>>> @@ -397,10 +398,10 @@ void rcu_read_unlock_special(struct task_struct *t)
>>>  #ifdef CONFIG_RCU_BOOST
>>>                 if (&t->rcu_node_entry == rnp->boost_tasks)
>>>                         rnp->boost_tasks = np;
>>> -               /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock
>>> held. */
>>> -               if (t->rcu_boost_mutex) {
>>> -                       rbmp = t->rcu_boost_mutex;
>>> -                       t->rcu_boost_mutex = NULL;
>>> +               /* Snapshot/clear ->rcu_boost_waiter with rcu_node lock
>>> held. */
>>> +               if (t->rcu_boost_waiter) {
>>> +                       waiter = t->rcu_boost_waiter;
>>> +                       t->rcu_boost_waiter = NULL;
>>>                 }
>>>  #endif /* #ifdef CONFIG_RCU_BOOST */
>>>
>>> @@ -426,8 +427,8 @@ void rcu_read_unlock_special(struct task_struct *t)
>>>
>>>  #ifdef CONFIG_RCU_BOOST
>>>                 /* Unboost if we were boosted. */
>>> -               if (rbmp)
>>> -                       rt_mutex_unlock(rbmp);
>>> +               if (waiter)
>>> +                       rt_mutex_rcu_deboost_unlock(t, waiter);
>>>  #endif /* #ifdef CONFIG_RCU_BOOST */
>>>
>>>                 /*
>>> @@ -1129,9 +1130,6 @@ void exit_rcu(void)
>>>  #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
>>>
>>>  #ifdef CONFIG_RCU_BOOST
>>> -
>>> -#include "rtmutex_common.h"
>>> -
>>>  #ifdef CONFIG_RCU_TRACE
>>>
>>>  static void rcu_initiate_boost_trace(struct rcu_node *rnp)
>>> @@ -1181,14 +1179,15 @@ static int rcu_boost(struct rcu_node *rnp)
>>>  {
>>>         unsigned long flags;
>>>         struct rt_mutex mtx;
>>> +       struct rt_mutex_waiter rcu_boost_waiter;
>>>         struct task_struct *t;
>>>         struct list_head *tb;
>>> +       int ret;
>>>
>>>         if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
>>>                 return 0;  /* Nothing left to boost. */
>>>
>>>         raw_spin_lock_irqsave(&rnp->lock, flags);
>>> -
>>>         /*
>>>          * Recheck under the lock: all tasks in need of boosting
>>>          * might exit their RCU read-side critical sections on their own.
>>> @@ -1215,7 +1214,7 @@ static int rcu_boost(struct rcu_node *rnp)
>>>
>>>         /*
>>>          * We boost task t by manufacturing an rt_mutex that appears to
>>> -        * be held by task t.  We leave a pointer to that rt_mutex where
>>> +        * be held by task t.  We leave a pointer to that rt_mutex_waiter
>>> where
>>>          * task t can find it, and task t will release the mutex when it
>>>          * exits its outermost RCU read-side critical section.  Then
>>>          * simply acquiring this artificial rt_mutex will boost task
>>> @@ -1230,11 +1229,30 @@ static int rcu_boost(struct rcu_node *rnp)
>>>          * section.
>>>          */
>>>         t = container_of(tb, struct task_struct, rcu_node_entry);
>>> +       get_task_struct(t);
>>>         rt_mutex_init_proxy_locked(&mtx, t);
>>> -       t->rcu_boost_mutex = &mtx;
>>>         raw_spin_unlock_irqrestore(&rnp->lock, flags);
>>> -       rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
>>> -       rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
>>> +
>>> +       debug_rt_mutex_init_waiter(&rcu_boost_waiter);
>>> +       /* Side effect: boosts task t's priority. */
>>> +       ret = rt_mutex_start_proxy_lock(&mtx, &rcu_boost_waiter, current,
>>> 0);
>>> +       if (WARN_ON_ONCE(ret)) {
>>> +               put_task_struct(t);
>>> +               return 0; /* temporary stop boosting */
>>> +       }
>>> +
>>> +       raw_spin_lock_irqsave(&rnp->lock, flags);
>>> +       if (&t->rcu_node_entry == rnp->exp_tasks ||
>>> +           &t->rcu_node_entry == rnp->boost_tasks) {
>>> +               t->rcu_boost_waiter = &rcu_boost_waiter;
>>> +               raw_spin_unlock_irqrestore(&rnp->lock, flags);
>>> +       } else {
>>> +               raw_spin_unlock_irqrestore(&rnp->lock, flags);
>>> +               rt_mutex_rcu_deboost_unlock(t, &rcu_boost_waiter);
>>> +       }
>>> +
>>> +       put_task_struct(t);
>>> +       rt_mutex_finish_proxy_lock(&mtx, NULL, &rcu_boost_waiter, 0);
>>>
>>>         return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
>>>                ACCESS_ONCE(rnp->boost_tasks) != NULL;
>>> diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
>>> index 0dd6aec..2f3caee 100644
>>> --- a/kernel/rtmutex.c
>>> +++ b/kernel/rtmutex.c
>>> @@ -734,6 +734,43 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
>>>         rt_mutex_adjust_prio(current);
>>>  }
>>>
>>> +#ifdef CONFIG_RCU_BOOST
>>> +/*
>>> + * rt_mutex_rcu_deboost_unlock() - unlock in irq/bh/process context
>>> + *
>>> + * please revert the patch which introduces this function when
>>> + * rt_mutex's ->wait_lock is irq-off.
>>> + */
>>> +void rt_mutex_rcu_deboost_unlock(struct task_struct *owner,
>>> +                                struct rt_mutex_waiter *waiter)
>>> +{
>>> +       unsigned long flags;
>>> +       struct rt_mutex *lock = waiter->lock;
>>> +
>>> +       /*
>>> +        * The correction of the following code is based on
>>> +        * 1) current lock is owned by @owner
>>> +        * 2) only one task(@waiter->task) is waiting on the @lock
>>> +        * 3) the @waiter has been queued and keeps been queued
>>> +        */
>>> +       if (WARN_ON_ONCE(rt_mutex_owner(lock) != owner))
>>> +               return; /* 1) */
>>> +       if (WARN_ON_ONCE(rt_mutex_top_waiter(lock) != waiter))
>>> +               return; /* 2) & 3) */
>>> +       if (WARN_ON_ONCE(plist_node_empty(&waiter->pi_list_entry)))
>>> +               return; /* 2) & 3) */
>>> +
>>> +       raw_spin_lock_irqsave(&owner->pi_lock, flags);
>>> +       plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
>>> +       lock->owner = NULL;
>>> +       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
>>> +
>>> +       wake_up_process(waiter->task);
>>> +       /* Undo pi boosting if necessary: */
>>> +       rt_mutex_adjust_prio(owner);
>>> +}
>>> +#endif /* #ifdef CONFIG_RCU_BOOST */
>>> +
>>>  /*
>>>   * debug aware fast / slowpath lock,trylock,unlock
>>>   *
>>> diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
>>> index 53a66c8..3cdbe82 100644
>>> --- a/kernel/rtmutex_common.h
>>> +++ b/kernel/rtmutex_common.h
>>> @@ -117,6 +117,11 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex
>>> *lock,
>>>                                       struct rt_mutex_waiter *waiter,
>>>                                       int detect_deadlock);
>>>
>>> +#ifdef CONFIG_RCU_BOOST
>>> +void rt_mutex_rcu_deboost_unlock(struct task_struct *owner,
>>> +                                struct rt_mutex_waiter *waiter);
>>> +#endif /* #ifdef CONFIG_RCU_BOOST */
>>> +
>>>  #ifdef CONFIG_DEBUG_RT_MUTEXES
>>>  # include "rtmutex-debug.h"
>>>  #else
>>>
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 


  reply	other threads:[~2013-08-26  2:35 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-08-07 10:24 [PATCH 0/8] rcu: Ensure rcu read site is deadlock-immunity Lai Jiangshan
2013-08-07 10:24 ` [PATCH 1/8] rcu: add a warn to rcu_preempt_note_context_switch() Lai Jiangshan
2013-10-30 11:02   ` Paul E. McKenney
2013-08-07 10:24 ` [PATCH 2/8] rcu: remove irq/softirq context check in rcu_read_unlock_special() Lai Jiangshan
2013-10-30 11:18   ` Paul E. McKenney
2013-08-07 10:24 ` [PATCH 3/8] rcu: keep irqs disabled " Lai Jiangshan
2013-08-07 10:25 ` [PATCH 4/8] rcu: delay task rcu state cleanup in exit_rcu() Lai Jiangshan
2013-08-07 10:25 ` [PATCH 5/8] rcu: eliminate deadlock for rcu read site Lai Jiangshan
2013-08-08 20:40   ` Paul E. McKenney
2013-08-09  9:31     ` Lai Jiangshan
2013-08-09 17:58       ` Paul E. McKenney
2013-08-12 13:55       ` Peter Zijlstra
2013-08-12 15:16         ` Paul E. McKenney
2013-08-12 16:21           ` Peter Zijlstra
2013-08-12 16:44             ` Paul E. McKenney
2013-08-10  3:43     ` Lai Jiangshan
2013-08-10 15:07       ` Paul E. McKenney
2013-08-10 15:08         ` Paul E. McKenney
2013-08-21  3:17         ` Paul E. McKenney
2013-08-21  3:25           ` Lai Jiangshan
2013-08-21 13:42             ` Paul E. McKenney
2013-08-12 13:53       ` Peter Zijlstra
2013-08-12 14:10         ` Steven Rostedt
2013-08-12 14:14           ` Peter Zijlstra
     [not found]           ` <CACvQF51-oAGkdxwku+orKSQ=SZd1A4saXzkrgcRGi+KnZUZYxQ@mail.gmail.com>
2013-08-22 14:34             ` Steven Rostedt
2013-08-22 14:41               ` Steven Rostedt
2013-08-23  6:26               ` Lai Jiangshan
     [not found]                 ` <CACvQF51kcLrJsa=zBKhLkJfFBh109TW+Zrepm+tRxmEp0gALbQ@mail.gmail.com>
2013-08-25 17:43                   ` Paul E. McKenney
2013-08-26  2:39                     ` Lai Jiangshan [this message]
2013-08-30  2:05                       ` Paul E. McKenney
2013-09-05 15:22                 ` Steven Rostedt
2013-08-07 10:25 ` [PATCH 6/8] rcu: call rcu_read_unlock_special() in rcu_preempt_check_callbacks() Lai Jiangshan
2013-08-07 10:25 ` [PATCH 7/8] rcu: add # of deferred _special() statistics Lai Jiangshan
2013-08-07 16:42   ` Paul E. McKenney
2013-08-07 10:25 ` [PATCH 8/8] rcu: remove irq work for rsp_wakeup() Lai Jiangshan
2013-08-07 12:38 ` [PATCH 0/8] rcu: Ensure rcu read site is deadlock-immunity Paul E. McKenney
2013-08-07 19:29   ` Carsten Emde
2013-08-07 19:52     ` Paul E. McKenney
2013-08-08  1:17     ` Lai Jiangshan
2013-08-08  0:36   ` Paul E. McKenney
2013-08-08  1:47     ` Lai Jiangshan
2013-08-08  2:12       ` Steven Rostedt
2013-08-08  2:33         ` Lai Jiangshan
2013-08-08  2:33           ` Paul E. McKenney
2013-08-08  3:10             ` Lai Jiangshan
2013-08-08  4:18               ` Paul E. McKenney
2013-08-08  5:27                 ` Lai Jiangshan
2013-08-08  7:05                   ` Paul E. McKenney
2013-08-08  2:45         ` Lai Jiangshan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=521ABFE4.3070703@cn.fujitsu.com \
    --to=laijs@cn.fujitsu.com \
    --cc=dipankar@in.ibm.com \
    --cc=eag0628@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.