* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-17 3:05 ` Joel Fernandes
@ 2020-04-17 8:47 ` Uladzislau Rezki
2020-04-17 15:04 ` Sebastian Andrzej Siewior
` (2 subsequent siblings)
3 siblings, 0 replies; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-17 8:47 UTC (permalink / raw)
To: Joel Fernandes, Sebastian Andrzej Siewior
Cc: Sebastian Andrzej Siewior, Paul E. McKenney, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith, urezki
On Thu, Apr 16, 2020 at 11:05:15PM -0400, Joel Fernandes wrote:
> On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > >
> > > We might need different calling-context restrictions for the two variants
> > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > check for "safe to use normal spinlock in -rt".
> >
> > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > This one will scream if you do
> > raw_spin_lock();
> > spin_lock();
> >
> > Sadly, as of today, there is code triggering this which needs to be
> > addressed first (but it is on my list of things to do).
> >
> > Given the thread so far, is it okay if I repost the series with
> > migrate_disable() instead of accepting a possible migration before
> > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > memory allocations in a possible atomic context) until we get there.
>
> I prefer something like the following to make it possible to invoke
> kfree_rcu() from atomic context considering call_rcu() is already callable
> from such contexts. Thoughts?
>
> (Only build tested)
> ---8<-----------------------
>
> From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
> Subject: [PATCH] rcu/tree: Avoid allocating in non-preemptible context for
> PREEMPT_RT kernels
>
> Per recent discussions, kfree_rcu() is a low-level facility which should be
> callable in atomic context (raw spinlock sections, IRQ disable sections etc).
>
> However, it depends on page allocation which acquires sleeping locks on
> PREEMPT_RT.
>
> In order to support all usecases, avoid the allocation of pages for
> PREEMPT_RT. The page allocation is just an optimization which does not
> break functionality. Further, in future patches the pages will be
> pre-allocated reducing the likelihood that page allocations will be
> needed.
>
> We also convert the spinlock_t to raw_spinlock_t so that it does not
> sleep in PREEMPT_RT's raw atomic critical sections.
>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
> kernel/rcu/tree.c | 42 +++++++++++++++++++++++++-----------------
> 1 file changed, 25 insertions(+), 17 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index f288477ee1c26..ba831712fb307 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2905,7 +2905,7 @@ struct kfree_rcu_cpu {
> struct kfree_rcu_bulk_data *bhead;
> struct kfree_rcu_bulk_data *bcached;
> struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> - spinlock_t lock;
> + raw_spinlock_t lock;
> struct delayed_work monitor_work;
> bool monitor_todo;
> bool initialized;
> @@ -2939,12 +2939,12 @@ static void kfree_rcu_work(struct work_struct *work)
> krwp = container_of(to_rcu_work(work),
> struct kfree_rcu_cpu_work, rcu_work);
> krcp = krwp->krcp;
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> head = krwp->head_free;
> krwp->head_free = NULL;
> bhead = krwp->bhead_free;
> krwp->bhead_free = NULL;
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
>
> /* "bhead" is now private, so traverse locklessly. */
> for (; bhead; bhead = bnext) {
> @@ -3047,14 +3047,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
> krcp->monitor_todo = false;
> if (queue_kfree_rcu_work(krcp)) {
> // Success! Our job is done here.
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> return;
> }
>
> // Previous RCU batch still in progress, try again later.
> krcp->monitor_todo = true;
> schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> /*
> @@ -3067,16 +3067,16 @@ static void kfree_rcu_monitor(struct work_struct *work)
> struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
> monitor_work.work);
>
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> if (krcp->monitor_todo)
> kfree_rcu_drain_unlock(krcp, flags);
> else
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> static inline bool
> kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
> - struct rcu_head *head, rcu_callback_t func)
> + struct rcu_head *head, rcu_callback_t func, bool alloc)
> {
> struct kfree_rcu_bulk_data *bnode;
>
> @@ -3092,6 +3092,10 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
> if (!bnode) {
> WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
>
> + /* If allocation is not allowed, don't do it. */
> + if (!alloc)
> + return false;
> +
> bnode = (struct kfree_rcu_bulk_data *)
> __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> }
> @@ -3138,11 +3142,15 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> {
> unsigned long flags;
> struct kfree_rcu_cpu *krcp;
> + bool alloc = true;
> +
> + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
> + alloc = false;
>
> local_irq_save(flags); // For safely calling this_cpu_ptr().
> krcp = this_cpu_ptr(&krc);
> if (krcp->initialized)
> - spin_lock(&krcp->lock);
> + raw_spin_lock(&krcp->lock);
>
> // Queue the object but don't yet schedule the batch.
> if (debug_rcu_head_queue(head)) {
> @@ -3156,7 +3164,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> * Under high memory pressure GFP_NOWAIT can fail,
> * in that case the emergency path is maintained.
> */
> - if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
> + if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func, alloc))) {
> head->func = func;
> head->next = krcp->head;
> krcp->head = head;
> @@ -3173,7 +3181,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>
> unlock_return:
> if (krcp->initialized)
> - spin_unlock(&krcp->lock);
> + raw_spin_unlock(&krcp->lock);
> local_irq_restore(flags);
> }
> EXPORT_SYMBOL_GPL(kfree_call_rcu);
> @@ -3205,11 +3213,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> count = krcp->count;
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> if (krcp->monitor_todo)
> kfree_rcu_drain_unlock(krcp, flags);
> else
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
>
> sc->nr_to_scan -= count;
> freed += count;
> @@ -3236,15 +3244,15 @@ void __init kfree_rcu_scheduler_running(void)
> for_each_online_cpu(cpu) {
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> if (!krcp->head || krcp->monitor_todo) {
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> continue;
> }
> krcp->monitor_todo = true;
> schedule_delayed_work_on(cpu, &krcp->monitor_work,
> KFREE_DRAIN_JIFFIES);
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
> }
>
> @@ -4140,7 +4148,7 @@ static void __init kfree_rcu_batch_init(void)
> for_each_possible_cpu(cpu) {
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> - spin_lock_init(&krcp->lock);
> + raw_spin_lock_init(&krcp->lock);
> for (i = 0; i < KFREE_N_BATCHES; i++) {
> INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> krcp->krw_arr[i].krcp = krcp;
I have the same view on how to handle it in PREEMPT_RT. Basically, if
we can store the pointer in the array we do it; if the current context
is not preemptible, we just build the list using rcu_head and queue it
for further processing. If the context is preemptible, we will utilize
our "array pointer" approach, so the performance optimization will be
in place on a CONFIG_PREEMPT_RT kernel.
I think this is the easiest way of making it PREEMPT_RT friendly. We
also need to add a static initializer for the "raw spin lock". Split
this patch into:
- convert to raw spinlocks;
- make it statically initialized;
- bypass the page allocator if RT and not preemptible.
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-17 3:05 ` Joel Fernandes
2020-04-17 8:47 ` Uladzislau Rezki
@ 2020-04-17 15:04 ` Sebastian Andrzej Siewior
2020-04-17 18:26 ` Joel Fernandes
2020-04-17 16:11 ` Uladzislau Rezki
2020-04-19 12:15 ` Uladzislau Rezki
3 siblings, 1 reply; 85+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-04-17 15:04 UTC (permalink / raw)
To: Joel Fernandes
Cc: Paul E. McKenney, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith, urezki
On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > >
> > > We might need different calling-context restrictions for the two variants
> > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > check for "safe to use normal spinlock in -rt".
> >
> > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > This one will scream if you do
> > raw_spin_lock();
> > spin_lock();
> >
> > Sadly, as of today, there is code triggering this which needs to be
> > addressed first (but it is on my list of things to do).
> >
> > Given the thread so far, is it okay if I repost the series with
> > migrate_disable() instead of accepting a possible migration before
> > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > memory allocations in a possible atomic context) until we get there.
>
> I prefer something like the following to make it possible to invoke
> kfree_rcu() from atomic context considering call_rcu() is already callable
> from such contexts. Thoughts?
So it looks like it would work. However, could we please delay this
until we have an actual case on RT? I just added
WARN_ON(!preemptible());
to kfree_call_rcu() on v5.6.4-rt4 and nothing triggered.
This is the list of users I had (just to figure out if this is used at
all):
- addrconf_ifdown
- cgroup_free
- cgroup_migrate_finish
- css_task_iter_end
- disk_expand_part_tbl
- drop_sysctl_table
- __hw_addr_flush
- inetdev_event
- ip6addrlbl_net_exit
- ip6addrlbl_net_exit
- ops_exit_list.isra.0
- rtnl_register_internal
- simple_set_acl
- swevent_hlist_put_cpu
- timerfd_release
- vfs_rename
- __xfs_set_acl
Sebastian
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-17 15:04 ` Sebastian Andrzej Siewior
@ 2020-04-17 18:26 ` Joel Fernandes
2020-04-17 18:54 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Joel Fernandes @ 2020-04-17 18:26 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Paul E. McKenney, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith, urezki
On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian Andrzej Siewior wrote:
> On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > > >
> > > > We might need different calling-context restrictions for the two variants
> > > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > > check for "safe to use normal spinlock in -rt".
> > >
> > > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > > This one will scream if you do
> > > raw_spin_lock();
> > > spin_lock();
> > >
> > > Sadly, as of today, there is code triggering this which needs to be
> > > addressed first (but it is on my list of things to do).
> > >
> > > Given the thread so far, is it okay if I repost the series with
> > > migrate_disable() instead of accepting a possible migration before
> > > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > > memory allocations in a possible atomic context) until we get there.
> >
> > I prefer something like the following to make it possible to invoke
> > kfree_rcu() from atomic context considering call_rcu() is already callable
> > from such contexts. Thoughts?
>
> So it looks like it would work. However, could we please delay this
> until we have an actual case on RT? I just added
> WARN_ON(!preemptible());
I am not sure if waiting for it to break in the future is a good idea. I'd
rather design it in a forward thinking way. There could be folks replacing
"call_rcu() + kfree in a callback" with kfree_rcu() for example. If they were
in !preemptible(), we'd break on page allocation.
Also as a sidenote, the additional pre-allocation of pages that Vlad is
planning on adding would further reduce the need for pages from the page
allocator.
Paul, what is your opinion on this?
thanks,
- Joel
>
> to kfree_call_rcu() on v5.6.4-rt4 and nothing triggered.
>
> This is the list of users I had (just to figure out if this is used at
> all):
> - addrconf_ifdown
> - cgroup_free
> - cgroup_migrate_finish
> - css_task_iter_end
> - disk_expand_part_tbl
> - drop_sysctl_table
> - __hw_addr_flush
> - inetdev_event
> - ip6addrlbl_net_exit
> - ip6addrlbl_net_exit
> - ops_exit_list.isra.0
> - rtnl_register_internal
> - simple_set_acl
> - swevent_hlist_put_cpu
> - timerfd_release
> - vfs_rename
> - __xfs_set_acl
>
> Sebastian
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-17 18:26 ` Joel Fernandes
@ 2020-04-17 18:54 ` Paul E. McKenney
2020-04-18 12:37 ` Uladzislau Rezki
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-17 18:54 UTC (permalink / raw)
To: Joel Fernandes
Cc: Sebastian Andrzej Siewior, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith, urezki
On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes wrote:
> On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > > > >
> > > > > We might need different calling-context restrictions for the two variants
> > > > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > > > check for "safe to use normal spinlock in -rt".
> > > >
> > > > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > This one will scream if you do
> > > > raw_spin_lock();
> > > > spin_lock();
> > > >
> > > > Sadly, as of today, there is code triggering this which needs to be
> > > > addressed first (but it is on my list of things to do).
> > > >
> > > > Given the thread so far, is it okay if I repost the series with
> > > > migrate_disable() instead of accepting a possible migration before
> > > > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > > > memory allocations in a possible atomic context) until we get there.
> > >
> > > I prefer something like the following to make it possible to invoke
> > > kfree_rcu() from atomic context considering call_rcu() is already callable
> > > from such contexts. Thoughts?
> >
> > So it looks like it would work. However, could we please delay this
> > until we have an actual case on RT? I just added
> > WARN_ON(!preemptible());
>
> I am not sure if waiting for it to break in the future is a good idea. I'd
> rather design it in a forward thinking way. There could be folks replacing
> "call_rcu() + kfree in a callback" with kfree_rcu() for example. If they were
> in !preemptible(), we'd break on page allocation.
>
> Also as a sidenote, the additional pre-allocation of pages that Vlad is
> planning on adding would further reduce the need for pages from the page
> allocator.
>
> Paul, what is your opinion on this?
My experience with call_rcu(), of which kfree_rcu() is a specialization,
is that it gets invoked with preemption disabled, with interrupts
disabled, and during early boot, as in even before rcu_init() has been
invoked. This experience does make me lean towards raw spinlocks.
But to Sebastian's point, if we are going to use raw spinlocks, we need
to keep the code paths holding those spinlocks as short as possible.
I suppose that the inability to allocate memory with raw spinlocks held
helps, but it is worth checking.
Thanx, Paul
> thanks,
>
> - Joel
>
>
> >
> > to kfree_call_rcu() on v5.6.4-rt4 and nothing triggered.
> >
> > This is the list of users I had (just to figure out if this is used at
> > all):
> > - addrconf_ifdown
> > - cgroup_free
> > - cgroup_migrate_finish
> > - css_task_iter_end
> > - disk_expand_part_tbl
> > - drop_sysctl_table
> > - __hw_addr_flush
> > - inetdev_event
> > - ip6addrlbl_net_exit
> > - ip6addrlbl_net_exit
> > - ops_exit_list.isra.0
> > - rtnl_register_internal
> > - simple_set_acl
> > - swevent_hlist_put_cpu
> > - timerfd_release
> > - vfs_rename
> > - __xfs_set_acl
> >
> > Sebastian
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-17 18:54 ` Paul E. McKenney
@ 2020-04-18 12:37 ` Uladzislau Rezki
2020-04-19 14:58 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-18 12:37 UTC (permalink / raw)
To: Paul E. McKenney, Joel Fernandes
Cc: Joel Fernandes, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith, urezki
On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney wrote:
> On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes wrote:
> > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian Andrzej Siewior wrote:
> > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > > > > >
> > > > > > We might need different calling-context restrictions for the two variants
> > > > > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > > > > check for "safe to use normal spinlock in -rt".
> > > > >
> > > > > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > This one will scream if you do
> > > > > raw_spin_lock();
> > > > > spin_lock();
> > > > >
> > > > > Sadly, as of today, there is code triggering this which needs to be
> > > > > addressed first (but it is on my list of things to do).
> > > > >
> > > > > Given the thread so far, is it okay if I repost the series with
> > > > > migrate_disable() instead of accepting a possible migration before
> > > > > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > > > > memory allocations in a possible atomic context) until we get there.
> > > >
> > > > I prefer something like the following to make it possible to invoke
> > > > kfree_rcu() from atomic context considering call_rcu() is already callable
> > > > from such contexts. Thoughts?
> > >
> > > So it looks like it would work. However, could we please delay this
> > > until we have an actual case on RT? I just added
> > > WARN_ON(!preemptible());
> >
> > I am not sure if waiting for it to break in the future is a good idea. I'd
> > rather design it in a forward thinking way. There could be folks replacing
> > "call_rcu() + kfree in a callback" with kfree_rcu() for example. If they were
> > in !preemptible(), we'd break on page allocation.
> >
> > Also as a sidenote, the additional pre-allocation of pages that Vlad is
> > planning on adding would further reduce the need for pages from the page
> > allocator.
> >
> > Paul, what is your opinion on this?
>
> My experience with call_rcu(), of which kfree_rcu() is a specialization,
> is that it gets invoked with preemption disabled, with interrupts
> disabled, and during early boot, as in even before rcu_init() has been
> invoked. This experience does make me lean towards raw spinlocks.
>
> But to Sebastian's point, if we are going to use raw spinlocks, we need
> to keep the code paths holding those spinlocks as short as possible.
> I suppose that the inability to allocate memory with raw spinlocks held
> helps, but it is worth checking.
>
How about reducing the lock contention even further?
<snip>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f288477ee1c2..fb916e065784 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3053,7 +3053,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
// Previous RCU batch still in progress, try again later.
krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ schedule_delayed_work_on(raw_smp_processor_id(),
+ &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
spin_unlock_irqrestore(&krcp->lock, flags);
}
@@ -3168,7 +3169,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!krcp->monitor_todo) {
krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+ schedule_delayed_work_on(raw_smp_processor_id(),
+ &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
}
unlock_return:
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 891ccad5f271..49fcc50469f4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1723,7 +1723,9 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
/* read the comment in __queue_work() */
local_irq_disable();
- __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
+
+ /* Just for illustration. Can have queue_rcu_work_on(). */
+ __queue_work(raw_smp_processor_id(), rwork->wq, &rwork->work);
local_irq_enable();
}
<snip>
Thoughts?
--
Vlad Rezki
^ permalink raw reply related [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-18 12:37 ` Uladzislau Rezki
@ 2020-04-19 14:58 ` Paul E. McKenney
2020-04-20 0:27 ` Joel Fernandes
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-19 14:58 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: Joel Fernandes, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki wrote:
> On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney wrote:
> > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes wrote:
> > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian Andrzej Siewior wrote:
> > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > > > > > >
> > > > > > > We might need different calling-context restrictions for the two variants
> > > > > > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > >
> > > > > > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > This one will scream if you do
> > > > > > raw_spin_lock();
> > > > > > spin_lock();
> > > > > >
> > > > > > Sadly, as of today, there is code triggering this which needs to be
> > > > > > addressed first (but it is on my list of things to do).
> > > > > >
> > > > > > Given the thread so far, is it okay if I repost the series with
> > > > > > migrate_disable() instead of accepting a possible migration before
> > > > > > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > > > > > memory allocations in a possible atomic context) until we get there.
> > > > >
> > > > > I prefer something like the following to make it possible to invoke
> > > > > kfree_rcu() from atomic context considering call_rcu() is already callable
> > > > > from such contexts. Thoughts?
> > > >
> > > > So it looks like it would work. However, could we please delay this
> > > > until we have an actual case on RT? I just added
> > > > WARN_ON(!preemptible());
> > >
> > > I am not sure if waiting for it to break in the future is a good idea. I'd
> > > rather design it in a forward thinking way. There could be folks replacing
> > > "call_rcu() + kfree in a callback" with kfree_rcu() for example. If they were
> > > in !preemptible(), we'd break on page allocation.
> > >
> > > Also as a sidenote, the additional pre-allocation of pages that Vlad is
> > > planning on adding would further reduce the need for pages from the page
> > > allocator.
> > >
> > > Paul, what is your opinion on this?
> >
> > My experience with call_rcu(), of which kfree_rcu() is a specialization,
> > is that it gets invoked with preemption disabled, with interrupts
> > disabled, and during early boot, as in even before rcu_init() has been
> > invoked. This experience does make me lean towards raw spinlocks.
> >
> > But to Sebastian's point, if we are going to use raw spinlocks, we need
> > to keep the code paths holding those spinlocks as short as possible.
> > I suppose that the inability to allocate memory with raw spinlocks held
> > helps, but it is worth checking.
> >
> How about reducing the lock contention even further?
Can we do even better by moving the work-scheduling out from under the
spinlock? This of course means that it is necessary to handle the
occasional spurious call to the work handler, but that should be rare
and should be in the noise compared to the reduction in contention.
Thoughts?
Thanx, Paul
> <snip>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index f288477ee1c2..fb916e065784 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3053,7 +3053,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
>
> // Previous RCU batch still in progress, try again later.
> krcp->monitor_todo = true;
> - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> + schedule_delayed_work_on(raw_smp_processor_id(),
> + &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> @@ -3168,7 +3169,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> !krcp->monitor_todo) {
> krcp->monitor_todo = true;
> - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> + schedule_delayed_work_on(raw_smp_processor_id(),
> + &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> }
>
> unlock_return:
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 891ccad5f271..49fcc50469f4 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1723,7 +1723,9 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
>
> /* read the comment in __queue_work() */
> local_irq_disable();
> - __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
> +
> + /* Just for illustration. Can have queue_rcu_work_on(). */
> + __queue_work(raw_smp_processor_id(), rwork->wq, &rwork->work);
> local_irq_enable();
> }
> <snip>
>
> Thoughts?
>
> --
> Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-19 14:58 ` Paul E. McKenney
@ 2020-04-20 0:27 ` Joel Fernandes
2020-04-20 1:17 ` Joel Fernandes
0 siblings, 1 reply; 85+ messages in thread
From: Joel Fernandes @ 2020-04-20 0:27 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki wrote:
> > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney wrote:
> > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes wrote:
> > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > > > > > > >
> > > > > > > > We might need different calling-context restrictions for the two variants
> > > > > > > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > >
> > > > > > > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > This one will scream if you do
> > > > > > > raw_spin_lock();
> > > > > > > spin_lock();
> > > > > > >
> > > > > > > Sadly, as of today, there is code triggering this which needs to be
> > > > > > > addressed first (but it is on my list of things to do).
> > > > > > >
> > > > > > > Given the thread so far, is it okay if I repost the series with
> > > > > > > migrate_disable() instead of accepting a possible migration before
> > > > > > > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > > > > > > memory allocations in a possible atomic context) until we get there.
> > > > > >
> > > > > > I prefer something like the following to make it possible to invoke
> > > > > > kfree_rcu() from atomic context considering call_rcu() is already callable
> > > > > > from such contexts. Thoughts?
> > > > >
> > > > > So it looks like it would work. However, could we please delay this
> > > > > until we have an actual case on RT? I just added
> > > > > WARN_ON(!preemptible());
> > > >
> > > > I am not sure if waiting for it to break in the future is a good idea. I'd
> > > > rather design it in a forward thinking way. There could be folks replacing
> > > > "call_rcu() + kfree in a callback" with kfree_rcu() for example. If they were
> > > > in !preemptible(), we'd break on page allocation.
> > > >
> > > > Also as a sidenote, the additional pre-allocation of pages that Vlad is
> > > > planning on adding would further reduce the need for pages from the page
> > > > allocator.
> > > >
> > > > Paul, what is your opinion on this?
> > >
> > > My experience with call_rcu(), of which kfree_rcu() is a specialization,
> > > is that it gets invoked with preemption disabled, with interrupts
> > > disabled, and during early boot, as in even before rcu_init() has been
> > > invoked. This experience does make me lean towards raw spinlocks.
> > >
> > > But to Sebastian's point, if we are going to use raw spinlocks, we need
> > > to keep the code paths holding those spinlocks as short as possible.
> > > I suppose that the inability to allocate memory with raw spinlocks held
> > > helps, but it is worth checking.
> > >
> > How about reducing the lock contention even further?
>
> Can we do even better by moving the work-scheduling out from under the
> spinlock? This of course means that it is necessary to handle the
> occasional spurious call to the work handler, but that should be rare
> and should be in the noise compared to the reduction in contention.
Yes I think that will be required since -rt will sleep on workqueue locks as
well :-(. I'm looking into it right now.
/*
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
*/
last_pool = get_work_pool(work);
if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
spin_lock(&last_pool->lock);
Thanks!
- Joel
>
> Thoughts?
>
> Thanx, Paul
>
> > <snip>
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index f288477ee1c2..fb916e065784 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -3053,7 +3053,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
> >
> > // Previous RCU batch still in progress, try again later.
> > krcp->monitor_todo = true;
> > - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > + schedule_delayed_work_on(raw_smp_processor_id(),
> > + &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > spin_unlock_irqrestore(&krcp->lock, flags);
> > }
> >
> > @@ -3168,7 +3169,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> > !krcp->monitor_todo) {
> > krcp->monitor_todo = true;
> > - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > + schedule_delayed_work_on(raw_smp_processor_id(),
> > + &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > }
> >
> > unlock_return:
> > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > index 891ccad5f271..49fcc50469f4 100644
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -1723,7 +1723,9 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
> >
> > /* read the comment in __queue_work() */
> > local_irq_disable();
> > - __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
> > +
> > + /* Just for illustration. Can have queue_rcu_work_on(). */
> > + __queue_work(raw_smp_processor_id(), rwork->wq, &rwork->work);
> > local_irq_enable();
> > }
> > <snip>
> >
> > Thoughts?
> >
> > --
> > Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 0:27 ` Joel Fernandes
@ 2020-04-20 1:17 ` Joel Fernandes
2020-04-20 1:44 ` Paul E. McKenney
2020-04-20 3:02 ` Mike Galbraith
0 siblings, 2 replies; 85+ messages in thread
From: Joel Fernandes @ 2020-04-20 1:17 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki wrote:
> > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney wrote:
> > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes wrote:
> > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > > > > > > > >
> > > > > > > > > We might need different calling-context restrictions for the two variants
> > > > > > > > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > >
> > > > > > > > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > > This one will scream if you do
> > > > > > > > raw_spin_lock();
> > > > > > > > spin_lock();
> > > > > > > >
> > > > > > > > Sadly, as of today, there is code triggering this which needs to be
> > addressed first (but it is on my list of things to do).
> > > > > > > >
> > > > > > > > Given the thread so far, is it okay if I repost the series with
> > > > > > > > migrate_disable() instead of accepting a possible migration before
> > > > > > > > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > > > > > > > memory allocations in a possible atomic context) until we get there.
> > > > > > >
> > > > > > > I prefer something like the following to make it possible to invoke
> > > > > > > kfree_rcu() from atomic context considering call_rcu() is already callable
> > > > > > > from such contexts. Thoughts?
> > > > > >
> > > > > > So it looks like it would work. However, could we please delay this
> > > > > > until we have an actual case on RT? I just added
> > > > > > WARN_ON(!preemptible());
> > > > >
> > > > > I am not sure if waiting for it to break in the future is a good idea. I'd
> > > > > rather design it in a forward thinking way. There could be folks replacing
> > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for example. If they were
> > > > > in !preemptible(), we'd break on page allocation.
> > > > >
> > > > > Also as a sidenote, the additional pre-allocation of pages that Vlad is
> > > > > planning on adding would further reduce the need for pages from the page
> > > > > allocator.
> > > > >
> > > > > Paul, what is your opinion on this?
> > > >
> > > > My experience with call_rcu(), of which kfree_rcu() is a specialization,
> > > > is that it gets invoked with preemption disabled, with interrupts
> > > > disabled, and during early boot, as in even before rcu_init() has been
> > > > invoked. This experience does make me lean towards raw spinlocks.
> > > >
> > > > But to Sebastian's point, if we are going to use raw spinlocks, we need
> > > > to keep the code paths holding those spinlocks as short as possible.
> > > > I suppose that the inability to allocate memory with raw spinlocks held
> > > > helps, but it is worth checking.
> > > >
> > > How about reducing the lock contention even further?
> >
> > Can we do even better by moving the work-scheduling out from under the
> > spinlock? This of course means that it is necessary to handle the
> > occasional spurious call to the work handler, but that should be rare
> > and should be in the noise compared to the reduction in contention.
>
> Yes I think that will be required since -rt will sleep on workqueue locks as
> well :-(. I'm looking into it right now.
>
> /*
> * If @work was previously on a different pool, it might still be
> * running there, in which case the work needs to be queued on that
> * pool to guarantee non-reentrancy.
> */
> last_pool = get_work_pool(work);
> if (last_pool && last_pool != pwq->pool) {
> struct worker *worker;
>
> spin_lock(&last_pool->lock);
Hmm, I think moving schedule_delayed_work() outside lock will work. Just took
a good look and that's not an issue. However calling schedule_delayed_work()
itself is an issue if the caller of kfree_rcu() is !preemptible() on
PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on pool->lock
which can sleep on PREEMPT_RT :-(. Which means we have to do either of:
1. Implement a new mechanism for scheduling delayed work that does not
acquire sleeping locks.
2. Allow kfree_rcu() only from preemptible context (That is Sebastian's
initial patch to replace local_irq_save() + spin_lock() with
spin_lock_irqsave()).
3. Queue the work through irq_work or another bottom-half mechanism.
Any other thoughts?
thanks,
- Joel
>
> Thanks!
>
> - Joel
>
>
> >
> > Thoughts?
> >
> > Thanx, Paul
> >
> > > <snip>
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index f288477ee1c2..fb916e065784 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -3053,7 +3053,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
> > >
> > > // Previous RCU batch still in progress, try again later.
> > > krcp->monitor_todo = true;
> > > - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > > + schedule_delayed_work_on(raw_smp_processor_id(),
> > > + &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > > spin_unlock_irqrestore(&krcp->lock, flags);
> > > }
> > >
> > > @@ -3168,7 +3169,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > > if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> > > !krcp->monitor_todo) {
> > > krcp->monitor_todo = true;
> > > - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > > + schedule_delayed_work_on(raw_smp_processor_id(),
> > > + &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > > }
> > >
> > > unlock_return:
> > > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > > index 891ccad5f271..49fcc50469f4 100644
> > > --- a/kernel/workqueue.c
> > > +++ b/kernel/workqueue.c
> > > @@ -1723,7 +1723,9 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
> > >
> > > /* read the comment in __queue_work() */
> > > local_irq_disable();
> > > - __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
> > > +
> > > + /* Just for illustration. Can have queue_rcu_work_on(). */
> > > + __queue_work(raw_smp_processor_id(), rwork->wq, &rwork->work);
> > > local_irq_enable();
> > > }
> > > <snip>
> > >
> > > Thoughts?
> > >
> > > --
> > > Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 1:17 ` Joel Fernandes
@ 2020-04-20 1:44 ` Paul E. McKenney
2020-04-20 12:13 ` Uladzislau Rezki
2020-04-20 3:02 ` Mike Galbraith
1 sibling, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-20 1:44 UTC (permalink / raw)
To: Joel Fernandes
Cc: Uladzislau Rezki, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki wrote:
> > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney wrote:
> > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes wrote:
> > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > > > > > > > > >
> > > > > > > > > > We might need different calling-context restrictions for the two variants
> > > > > > > > > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > > >
> > > > > > > > > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > > > This one will scream if you do
> > > > > > > > > raw_spin_lock();
> > > > > > > > > spin_lock();
> > > > > > > > >
> > > > > > > > > Sadly, as of today, there is code triggering this which needs to be
> > > > > > > > > > addressed first (but it is on my list of things to do).
> > > > > > > > >
> > > > > > > > > Given the thread so far, is it okay if I repost the series with
> > > > > > > > > migrate_disable() instead of accepting a possible migration before
> > > > > > > > > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > > > > > > > > memory allocations in a possible atomic context) until we get there.
> > > > > > > >
> > > > > > > > I prefer something like the following to make it possible to invoke
> > > > > > > > kfree_rcu() from atomic context considering call_rcu() is already callable
> > > > > > > > from such contexts. Thoughts?
> > > > > > >
> > > > > > > So it looks like it would work. However, could we please delay this
> > > > > > > until we have an actual case on RT? I just added
> > > > > > > WARN_ON(!preemptible());
> > > > > >
> > > > > > I am not sure if waiting for it to break in the future is a good idea. I'd
> > > > > > rather design it in a forward thinking way. There could be folks replacing
> > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for example. If they were
> > > > > > in !preemptible(), we'd break on page allocation.
> > > > > >
> > > > > > Also as a sidenote, the additional pre-allocation of pages that Vlad is
> > > > > > planning on adding would further reduce the need for pages from the page
> > > > > > allocator.
> > > > > >
> > > > > > Paul, what is your opinion on this?
> > > > >
> > > > > My experience with call_rcu(), of which kfree_rcu() is a specialization,
> > > > > is that it gets invoked with preemption disabled, with interrupts
> > > > > disabled, and during early boot, as in even before rcu_init() has been
> > > > > invoked. This experience does make me lean towards raw spinlocks.
> > > > >
> > > > > But to Sebastian's point, if we are going to use raw spinlocks, we need
> > > > > to keep the code paths holding those spinlocks as short as possible.
> > > > > I suppose that the inability to allocate memory with raw spinlocks held
> > > > > helps, but it is worth checking.
> > > > >
> > > > How about reducing the lock contention even further?
> > >
> > > Can we do even better by moving the work-scheduling out from under the
> > > spinlock? This of course means that it is necessary to handle the
> > > occasional spurious call to the work handler, but that should be rare
> > > and should be in the noise compared to the reduction in contention.
> >
> > Yes I think that will be required since -rt will sleep on workqueue locks as
> > well :-(. I'm looking into it right now.
> >
> > /*
> > * If @work was previously on a different pool, it might still be
> > * running there, in which case the work needs to be queued on that
> > * pool to guarantee non-reentrancy.
> > */
> > last_pool = get_work_pool(work);
> > if (last_pool && last_pool != pwq->pool) {
> > struct worker *worker;
> >
> > spin_lock(&last_pool->lock);
>
> Hmm, I think moving schedule_delayed_work() outside lock will work. Just took
> a good look and that's not an issue. However calling schedule_delayed_work()
> itself is an issue if the caller of kfree_rcu() is !preemptible() on
> PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on pool->lock
> which can sleep on PREEMPT_RT :-(. Which means we have to do either of:
>
> 1. Implement a new mechanism for scheduling delayed work that does not
> acquire sleeping locks.
>
> 2. Allow kfree_rcu() only from preemptible context (That is Sebastian's
> initial patch to replace local_irq_save() + spin_lock() with
> spin_lock_irqsave()).
>
> 3. Queue the work through irq_work or another bottom-half mechanism.
I use irq_work elsewhere in RCU, but the queue_delayed_work() might
go well with a timer. This can of course be done conditionally.
> Any other thoughts?
I did forget to ask you guys your opinions about the downsides (if any)
of moving from unbound to per-CPU workqueues. Thoughts?
Thanx, Paul
> thanks,
>
> - Joel
>
>
> >
> > Thanks!
> >
> > - Joel
> >
> >
> > >
> > > Thoughts?
> > >
> > > Thanx, Paul
> > >
> > > > <snip>
> > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > index f288477ee1c2..fb916e065784 100644
> > > > --- a/kernel/rcu/tree.c
> > > > +++ b/kernel/rcu/tree.c
> > > > @@ -3053,7 +3053,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
> > > >
> > > > // Previous RCU batch still in progress, try again later.
> > > > krcp->monitor_todo = true;
> > > > - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > > > + schedule_delayed_work_on(raw_smp_processor_id(),
> > > > + &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > > > spin_unlock_irqrestore(&krcp->lock, flags);
> > > > }
> > > >
> > > > @@ -3168,7 +3169,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > > > if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> > > > !krcp->monitor_todo) {
> > > > krcp->monitor_todo = true;
> > > > - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > > > + schedule_delayed_work_on(raw_smp_processor_id(),
> > > > + &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > > > }
> > > >
> > > > unlock_return:
> > > > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > > > index 891ccad5f271..49fcc50469f4 100644
> > > > --- a/kernel/workqueue.c
> > > > +++ b/kernel/workqueue.c
> > > > @@ -1723,7 +1723,9 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
> > > >
> > > > /* read the comment in __queue_work() */
> > > > local_irq_disable();
> > > > - __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
> > > > +
> > > > + /* Just for illustration. Can have queue_rcu_work_on(). */
> > > > + __queue_work(raw_smp_processor_id(), rwork->wq, &rwork->work);
> > > > local_irq_enable();
> > > > }
> > > > <snip>
> > > >
> > > > Thoughts?
> > > >
> > > > --
> > > > Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 1:44 ` Paul E. McKenney
@ 2020-04-20 12:13 ` Uladzislau Rezki
2020-04-20 12:36 ` joel
0 siblings, 1 reply; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-20 12:13 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Joel Fernandes, Uladzislau Rezki, Sebastian Andrzej Siewior,
Steven Rostedt, rcu, Josh Triplett, Mathieu Desnoyers,
Lai Jiangshan, Thomas Gleixner, Mike Galbraith
On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki wrote:
> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney wrote:
> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes wrote:
> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > > > > > > > > > >
> > > > > > > > > > > We might need different calling-context restrictions for the two variants
> > > > > > > > > > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > > > >
> > > > > > > > > > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > > > > This one will scream if you do
> > > > > > > > > > raw_spin_lock();
> > > > > > > > > > spin_lock();
> > > > > > > > > >
> > > > > > > > > > Sadly, as of today, there is code triggering this which needs to be
> > > > > > > > > > > addressed first (but it is on my list of things to do).
> > > > > > > > > >
> > > > > > > > > > Given the thread so far, is it okay if I repost the series with
> > > > > > > > > > migrate_disable() instead of accepting a possible migration before
> > > > > > > > > > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > > > > > > > > > memory allocations in a possible atomic context) until we get there.
> > > > > > > > >
> > > > > > > > > I prefer something like the following to make it possible to invoke
> > > > > > > > > kfree_rcu() from atomic context considering call_rcu() is already callable
> > > > > > > > > from such contexts. Thoughts?
> > > > > > > >
> > > > > > > > So it looks like it would work. However, could we please delay this
> > > > > > > > until we have an actual case on RT? I just added
> > > > > > > > WARN_ON(!preemptible());
> > > > > > >
> > > > > > > I am not sure if waiting for it to break in the future is a good idea. I'd
> > > > > > > rather design it in a forward thinking way. There could be folks replacing
> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for example. If they were
> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > > >
> > > > > > > Also as a sidenote, the additional pre-allocation of pages that Vlad is
> > > > > > > planning on adding would further reduce the need for pages from the page
> > > > > > > allocator.
> > > > > > >
> > > > > > > Paul, what is your opinion on this?
> > > > > >
> > > > > > My experience with call_rcu(), of which kfree_rcu() is a specialization,
> > > > > > is that it gets invoked with preemption disabled, with interrupts
> > > > > > disabled, and during early boot, as in even before rcu_init() has been
> > > > > > invoked. This experience does make me lean towards raw spinlocks.
> > > > > >
> > > > > > But to Sebastian's point, if we are going to use raw spinlocks, we need
> > > > > > to keep the code paths holding those spinlocks as short as possible.
> > > > > > I suppose that the inability to allocate memory with raw spinlocks held
> > > > > > helps, but it is worth checking.
> > > > > >
> > > > > How about reducing the lock contention even further?
> > > >
> > > > Can we do even better by moving the work-scheduling out from under the
> > > > spinlock? This of course means that it is necessary to handle the
> > > > occasional spurious call to the work handler, but that should be rare
> > > > and should be in the noise compared to the reduction in contention.
> > >
> > > Yes I think that will be required since -rt will sleep on workqueue locks as
> > > well :-(. I'm looking into it right now.
> > >
> > > /*
> > > * If @work was previously on a different pool, it might still be
> > > * running there, in which case the work needs to be queued on that
> > > * pool to guarantee non-reentrancy.
> > > */
> > > last_pool = get_work_pool(work);
> > > if (last_pool && last_pool != pwq->pool) {
> > > struct worker *worker;
> > >
> > > spin_lock(&last_pool->lock);
> >
> > Hmm, I think moving schedule_delayed_work() outside lock will work. Just took
> > a good look and that's not an issue. However calling schedule_delayed_work()
> > itself is an issue if the caller of kfree_rcu() is !preemptible() on
> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on pool->lock
> > which can sleep on PREEMPT_RT :-(. Which means we have to do either of:
> >
> > 1. Implement a new mechanism for scheduling delayed work that does not
> > acquire sleeping locks.
> >
> > 2. Allow kfree_rcu() only from preemptible context (That is Sebastian's
> > initial patch to replace local_irq_save() + spin_lock() with
> > spin_lock_irqsave()).
> >
> > 3. Queue the work through irq_work or another bottom-half mechanism.
>
> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> go well with a timer. This can of course be done conditionally.
>
We can schedule_delayed_work() inside and outside of the spinlock,
i.e. it is not an issue for RT kernel, because as it was noted in last
message a workqueue system uses raw spinlocks internally. I checked
the latest linux-5.6.y-rt also. If we do it inside, we will place the
work on current CPU, at least as i see it, even if it is "unbound".
If we do it outside, we will reduce a critical section, from the other
hand we can introduce a potential delay in placing the context into CPUs
run-queue. As a result we could end up on another CPU, thus placing
the work on new CPU, plus memory foot-print might be higher. It would
be good to test and have a look at it actually.
But it can be negligible :)
> > Any other thoughts?
>
> I did forget to ask you guys your opinions about the downsides (if any)
> of moving from unbound to per-CPU workqueues. Thoughts?
>
If we do it outside of spinlock, there is at least one drawback that i
see, i described it above. We can use schedule_delayed_work_on() but
we as a caller have to guarantee that a CPU we are about to place a work
on is alive :)
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 12:13 ` Uladzislau Rezki
@ 2020-04-20 12:36 ` joel
2020-04-20 13:00 ` Uladzislau Rezki
0 siblings, 1 reply; 85+ messages in thread
From: joel @ 2020-04-20 12:36 UTC (permalink / raw)
To: Uladzislau Rezki, Paul E. McKenney
Cc: Sebastian Andrzej Siewior, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
>On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
>> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
>> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
>> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
>> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
>wrote:
>> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
>wrote:
>> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
>wrote:
>> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
>Andrzej Siewior wrote:
>> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
>> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
>Andrzej Siewior wrote:
>> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
>wrote:
>> > > > > > > > > > >
>> > > > > > > > > > > We might need different calling-context
>restrictions for the two variants
>> > > > > > > > > > > of kfree_rcu(). And we might need to come up
>with some sort of lockdep
>> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
>> > > > > > > > > >
>> > > > > > > > > > Oh. We do have this already, it is called
>CONFIG_PROVE_RAW_LOCK_NESTING.
>> > > > > > > > > > This one will scream if you do
>> > > > > > > > > > raw_spin_lock();
>> > > > > > > > > > spin_lock();
>> > > > > > > > > >
>> > > > > > > > > > Sadly, as of today, there is code triggering this
>which needs to be
>> > > > > > > > > > addressed first (but it is on my list of things to
>do).
>> > > > > > > > > >
>> > > > > > > > > > Given the thread so far, is it okay if I repost the
>series with
>> > > > > > > > > > migrate_disable() instead of accepting a possible
>migration before
>> > > > > > > > > > grabbing the lock? I would prefer to avoid the
>extra RT case (avoiding
>> > > > > > > > > > memory allocations in a possible atomic context)
>until we get there.
>> > > > > > > > >
>> > > > > > > > > I prefer something like the following to make it
>possible to invoke
>> > > > > > > > > kfree_rcu() from atomic context considering
>call_rcu() is already callable
>> > > > > > > > > from such contexts. Thoughts?
>> > > > > > > >
>> > > > > > > > So it looks like it would work. However, could we
>please delay this
>> > > > > > > > until we have an actual case on RT? I just added
>> > > > > > > > WARN_ON(!preemptible());
>> > > > > > >
>> > > > > > > I am not sure if waiting for it to break in the future is
>a good idea. I'd
>> > > > > > > rather design it in a forward thinking way. There could
>be folks replacing
>> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
>example. If they were
>> > > > > > > in !preemptible(), we'd break on page allocation.
>> > > > > > >
>> > > > > > > Also as a sidenote, the additional pre-allocation of
>pages that Vlad is
>> > > > > > > planning on adding would further reduce the need for
>pages from the page
>> > > > > > > allocator.
>> > > > > > >
>> > > > > > > Paul, what is your opinion on this?
>> > > > > >
>> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
>specialization,
>> > > > > > is that it gets invoked with preemption disabled, with
>interrupts
>> > > > > > disabled, and during early boot, as in even before
>rcu_init() has been
>> > > > > > invoked. This experience does make me lean towards raw
>spinlocks.
>> > > > > >
>> > > > > > But to Sebastian's point, if we are going to use raw
>spinlocks, we need
>> > > > > > to keep the code paths holding those spinlocks as short as
>possible.
>> > > > > > I suppose that the inability to allocate memory with raw
>spinlocks held
>> > > > > > helps, but it is worth checking.
>> > > > > >
>> > > > > How about reducing the lock contention even further?
>> > > >
>> > > > Can we do even better by moving the work-scheduling out from
>under the
>> > > > spinlock? This of course means that it is necessary to handle
>the
>> > > > occasional spurious call to the work handler, but that should
>be rare
>> > > > and should be in the noise compared to the reduction in
>contention.
>> > >
>> > > Yes I think that will be required since -rt will sleep on
>workqueue locks as
>> > > well :-(. I'm looking into it right now.
>> > >
>> > > /*
>> > > * If @work was previously on a different pool, it might
>still be
>> > > * running there, in which case the work needs to be
>queued on that
>> > > * pool to guarantee non-reentrancy.
>> > > */
>> > > last_pool = get_work_pool(work);
>> > > if (last_pool && last_pool != pwq->pool) {
>> > > struct worker *worker;
>> > >
>> > > spin_lock(&last_pool->lock);
>> >
>> > Hmm, I think moving schedule_delayed_work() outside lock will work.
>Just took
>> > a good look and that's not an issue. However calling
>schedule_delayed_work()
>> > itself is an issue if the caller of kfree_rcu() is !preemptible()
>on
>> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
>pool->lock
>> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
>of:
>> >
>> > 1. Implement a new mechanism for scheduling delayed work that does
>not
>> > acquire sleeping locks.
>> >
>> > 2. Allow kfree_rcu() only from preemptible context (That is
>Sebastian's
>> > initial patch to replace local_irq_save() + spin_lock() with
>> > spin_lock_irqsave()).
>> >
>> > 3. Queue the work through irq_work or another bottom-half
>mechanism.
>>
>> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
>> go well with a timer. This can of course be done conditionally.
>>
>We can schedule_delayed_work() inside and outside of the spinlock,
>i.e. it is not an issue for RT kernel, because as it was noted in last
>message a workqueue system uses raw spinlocks internally. I checked
>the latest linux-5.6.y-rt also. If we do it inside, we will place the
>work on current CPU, at least as i see it, even if it is "unbound".
>
Thanks for confirming!!
>If we do it outside, we will reduce a critical section, from the other
>hand we can introduce a potential delay in placing the context into
>CPUs
>run-queue. As a result we could end up on another CPU, thus placing
>the work on new CPU, plus memory foot-print might be higher. It would
>be good to test and have a look at it actually.
>
>But it can be negligible :)
Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
>
>> > Any other thoughts?
>>
>> I did forget to ask you guys your opinions about the downsides (if
>any)
>> of moving from unbound to per-CPU workqueues. Thoughts?
>>
>If we do it outside of spinlock, there is at least one drawback that i
>see, i described it above. We can use schedule_delayed_work_on() but
>we as a caller have to guarantee that a CPU we are about to place a work
>on is alive :)
FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
Thanks,
- Joel
>
>--
>Vlad Rezki
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 12:36 ` joel
@ 2020-04-20 13:00 ` Uladzislau Rezki
2020-04-20 13:26 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-20 13:00 UTC (permalink / raw)
To: joel
Cc: Uladzislau Rezki, Paul E. McKenney, Sebastian Andrzej Siewior,
Steven Rostedt, rcu, Josh Triplett, Mathieu Desnoyers,
Lai Jiangshan, Thomas Gleixner, Mike Galbraith
On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
>
>
> On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> >wrote:
> >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> >wrote:
> >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> >wrote:
> >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> >Andrzej Siewior wrote:
> >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> >Andrzej Siewior wrote:
> >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> >wrote:
> >> > > > > > > > > > >
> >> > > > > > > > > > > We might need different calling-context
> >restrictions for the two variants
> >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> >with some sort of lockdep
> >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> >> > > > > > > > > >
> >> > > > > > > > > > Oh. We do have this already, it is called
> >CONFIG_PROVE_RAW_LOCK_NESTING.
> >> > > > > > > > > > This one will scream if you do
> >> > > > > > > > > > raw_spin_lock();
> >> > > > > > > > > > spin_lock();
> >> > > > > > > > > >
> >> > > > > > > > > > Sadly, as of today, there is code triggering this
> >which needs to be
> >> > > > > > > > > > addressed first (but it is one list of things to
> >do).
> >> > > > > > > > > >
> >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> >series with
> >> > > > > > > > > > migrate_disable() instead of accepting a possible
> >migration before
> >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> >extra RT case (avoiding
> >> > > > > > > > > > memory allocations in a possible atomic context)
> >until we get there.
> >> > > > > > > > >
> >> > > > > > > > > I prefer something like the following to make it
> >possible to invoke
> >> > > > > > > > > kfree_rcu() from atomic context considering
> >call_rcu() is already callable
> >> > > > > > > > > from such contexts. Thoughts?
> >> > > > > > > >
> >> > > > > > > > So it looks like it would work. However, could we
> >please delay this
> >> > > > > > > > until we have an actual case on RT? I just added
> >> > > > > > > > WARN_ON(!preemptible());
> >> > > > > > >
> >> > > > > > > I am not sure if waiting for it to break in the future is
> >a good idea. I'd
> >> > > > > > > rather design it in a forward thinking way. There could
> >be folks replacing
> >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> >example. If they were
> >> > > > > > > in !preemptible(), we'd break on page allocation.
> >> > > > > > >
> >> > > > > > > Also as a sidenote, the additional pre-allocation of
> >pages that Vlad is
> >> > > > > > > planning on adding would further reduce the need for
> >pages from the page
> >> > > > > > > allocator.
> >> > > > > > >
> >> > > > > > > Paul, what is your opinion on this?
> >> > > > > >
> >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> >specialization,
> >> > > > > > is that it gets invoked with preemption disabled, with
> >interrupts
> >> > > > > > disabled, and during early boot, as in even before
> >rcu_init() has been
> >> > > > > > invoked. This experience does make me lean towards raw
> >spinlocks.
> >> > > > > >
> >> > > > > > But to Sebastian's point, if we are going to use raw
> >spinlocks, we need
> >> > > > > > to keep the code paths holding those spinlocks as short as
> >possible.
> >> > > > > > I suppose that the inability to allocate memory with raw
> >spinlocks held
> >> > > > > > helps, but it is worth checking.
> >> > > > > >
> >> > > > > How about reducing the lock contention even further?
> >> > > >
> >> > > > Can we do even better by moving the work-scheduling out from
> >under the
> >> > > > spinlock? This of course means that it is necessary to handle
> >the
> >> > > > occasional spurious call to the work handler, but that should
> >be rare
> >> > > > and should be in the noise compared to the reduction in
> >contention.
> >> > >
> >> > > Yes I think that will be required since -rt will sleep on
> >workqueue locks as
> >> > > well :-(. I'm looking into it right now.
> >> > >
> >> > > /*
> >> > > * If @work was previously on a different pool, it might
> >still be
> >> > > * running there, in which case the work needs to be
> >queued on that
> >> > > * pool to guarantee non-reentrancy.
> >> > > */
> >> > > last_pool = get_work_pool(work);
> >> > > if (last_pool && last_pool != pwq->pool) {
> >> > > struct worker *worker;
> >> > >
> >> > > spin_lock(&last_pool->lock);
> >> >
> >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> >Just took
> >> > a good look and that's not an issue. However calling
> >schedule_delayed_work()
> >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> >on
> >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> >pool->lock
> >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> >of:
> >> >
> >> > 1. Implement a new mechanism for scheduling delayed work that does
> >not
> >> > acquire sleeping locks.
> >> >
> >> > 2. Allow kfree_rcu() only from preemptible context (That is
> >Sebastian's
> >> > initial patch to replace local_irq_save() + spin_lock() with
> >> > spin_lock_irqsave()).
> >> >
> >> > 3. Queue the work through irq_work or another bottom-half
> >mechanism.
> >>
> >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> >> go well with a timer. This can of course be done conditionally.
> >>
> >We can schedule_delayed_work() inside and outside of the spinlock,
> >i.e. it is not an issue for RT kernel, because as it was noted in last
> >message a workqueue system uses raw spinlocks internally. I checked
> >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> >work on current CPU, at least as i see it, even if it is "unbound".
> >
>
> Thanks for confirming!!
>
> >If we do it outside, we will reduce a critical section, from the other
> >hand we can introduce a potential delay in placing the context into
> >CPUs
> >run-queue. As a result we could end up on another CPU, thus placing
> >the work on new CPU, plus memory foot-print might be higher. It would
> >be good to test and have a look at it actually.
> >
> >But it can be negligible :)
>
> Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
>
It should be OK, i do not expect to get noticeable latency for any RT
workloads.
> >
> >> > Any other thoughts?
> >>
> >> I did forget to ask you guys your opinions about the downsides (if
> >any)
> >> of moving from unbound to per-CPU workqueues. Thoughts?
> >>
> >If we do it outside of spinlock, there is at least one drawback that i
> >see, i described it above. We can use schedule_delayed_work_on() but
> >we as a caller have to guarantee that a CPU we about to place a work
> >is alive :)
>
> FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
>
<snip>
/**
* queue_work_on - queue work on specific cpu
* @cpu: CPU number to execute work on
* @wq: workqueue to use
* @work: work to queue
*
* We queue the work to a specific CPU, the caller must ensure it
* can't go away.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
<snip>
It says, how i see it, we should ensure it can not go away. So, if
we drop the lock we should do like:
get_online_cpus();
check a CPU is online;
queue_work_on();
put_online_cpus();
but i suspect we do not want to do it :)
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 13:00 ` Uladzislau Rezki
@ 2020-04-20 13:26 ` Paul E. McKenney
2020-04-20 16:08 ` Uladzislau Rezki
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-20 13:26 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: joel, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> >
> >
> > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > >wrote:
> > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > >wrote:
> > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > >wrote:
> > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > >Andrzej Siewior wrote:
> > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > >Andrzej Siewior wrote:
> > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > >wrote:
> > >> > > > > > > > > > >
> > >> > > > > > > > > > > We might need different calling-context
> > >restrictions for the two variants
> > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > >with some sort of lockdep
> > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > >> > > > > > > > > >
> > >> > > > > > > > > > Oh. We do have this already, it is called
> > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > >> > > > > > > > > > This one will scream if you do
> > >> > > > > > > > > > raw_spin_lock();
> > >> > > > > > > > > > spin_lock();
> > >> > > > > > > > > >
> > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > >which needs to be
> > >> > > > > > > > > > addressed first (but it is one list of things to
> > >do).
> > >> > > > > > > > > >
> > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > >series with
> > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > >migration before
> > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > >extra RT case (avoiding
> > >> > > > > > > > > > memory allocations in a possible atomic context)
> > >until we get there.
> > >> > > > > > > > >
> > >> > > > > > > > > I prefer something like the following to make it
> > >possible to invoke
> > >> > > > > > > > > kfree_rcu() from atomic context considering
> > >call_rcu() is already callable
> > >> > > > > > > > > from such contexts. Thoughts?
> > >> > > > > > > >
> > >> > > > > > > > So it looks like it would work. However, could we
> > >please delay this
> > >> > > > > > > > until we have an actual case on RT? I just added
> > >> > > > > > > > WARN_ON(!preemptible());
> > >> > > > > > >
> > >> > > > > > > I am not sure if waiting for it to break in the future is
> > >a good idea. I'd
> > >> > > > > > > rather design it in a forward thinking way. There could
> > >be folks replacing
> > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > >example. If they were
> > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > >> > > > > > >
> > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > >pages that Vlad is
> > >> > > > > > > planning on adding would further reduce the need for
> > >pages from the page
> > >> > > > > > > allocator.
> > >> > > > > > >
> > >> > > > > > > Paul, what is your opinion on this?
> > >> > > > > >
> > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > >specialization,
> > >> > > > > > is that it gets invoked with preemption disabled, with
> > >interrupts
> > >> > > > > > disabled, and during early boot, as in even before
> > >rcu_init() has been
> > >> > > > > > invoked. This experience does make me lean towards raw
> > >spinlocks.
> > >> > > > > >
> > >> > > > > > But to Sebastian's point, if we are going to use raw
> > >spinlocks, we need
> > >> > > > > > to keep the code paths holding those spinlocks as short as
> > >possible.
> > >> > > > > > I suppose that the inability to allocate memory with raw
> > >spinlocks held
> > >> > > > > > helps, but it is worth checking.
> > >> > > > > >
> > >> > > > > How about reducing the lock contention even further?
> > >> > > >
> > >> > > > Can we do even better by moving the work-scheduling out from
> > >under the
> > >> > > > spinlock? This of course means that it is necessary to handle
> > >the
> > >> > > > occasional spurious call to the work handler, but that should
> > >be rare
> > >> > > > and should be in the noise compared to the reduction in
> > >contention.
> > >> > >
> > >> > > Yes I think that will be required since -rt will sleep on
> > >workqueue locks as
> > >> > > well :-(. I'm looking into it right now.
> > >> > >
> > >> > > /*
> > >> > > * If @work was previously on a different pool, it might
> > >still be
> > >> > > * running there, in which case the work needs to be
> > >queued on that
> > >> > > * pool to guarantee non-reentrancy.
> > >> > > */
> > >> > > last_pool = get_work_pool(work);
> > >> > > if (last_pool && last_pool != pwq->pool) {
> > >> > > struct worker *worker;
> > >> > >
> > >> > > spin_lock(&last_pool->lock);
> > >> >
> > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > >Just took
> > >> > a good look and that's not an issue. However calling
> > >schedule_delayed_work()
> > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > >on
> > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > >pool->lock
> > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > >of:
> > >> >
> > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > >not
> > >> > acquire sleeping locks.
> > >> >
> > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > >Sebastian's
> > >> > initial patch to replace local_irq_save() + spin_lock() with
> > >> > spin_lock_irqsave()).
> > >> >
> > >> > 3. Queue the work through irq_work or another bottom-half
> > >mechanism.
> > >>
> > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > >> go well with a timer. This can of course be done conditionally.
> > >>
> > >We can schedule_delayed_work() inside and outside of the spinlock,
> > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > >message a workqueue system uses raw spinlocks internally. I checked
> > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > >work on current CPU, at least as i see it, even if it is "unbound".
> > >
> >
> > Thanks for confirming!!
> >
> > >If we do it outside, we will reduce a critical section, from the other
> > >hand we can introduce a potential delay in placing the context into
> > >CPUs
> > >run-queue. As a result we could end up on another CPU, thus placing
> > >the work on new CPU, plus memory foot-print might be higher. It would
> > >be good to test and have a look at it actually.
> > >
> > >But it can be negligible :)
> >
> > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> >
> It should be OK, i do not expect to get noticeable latency for any RT
> workloads.
>
> > >
> > >> > Any other thoughts?
> > >>
> > >> I did forget to ask you guys your opinions about the downsides (if
> > >any)
> > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > >>
> > >If we do it outside of spinlock, there is at least one drawback that i
> > >see, i described it above. We can use schedule_delayed_work_on() but
> > >we as a caller have to guarantee that a CPU we about to place a work
> > >is alive :)
> >
> > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> >
> <snip>
> /**
> * queue_work_on - queue work on specific cpu
> * @cpu: CPU number to execute work on
> * @wq: workqueue to use
> * @work: work to queue
> *
> * We queue the work to a specific CPU, the caller must ensure it
> * can't go away.
> *
> * Return: %false if @work was already on a queue, %true otherwise.
> */
> <snip>
>
> It says, how i see it, we should ensure it can not go away. So, if
> we drop the lock we should do like:
>
> get_online_cpus();
> check a CPU is onlen;
> queue_work_on();
> put_online_cpus();
>
> but i suspect we do not want to do it :)
Indeed, it might impose a few restrictions and a bit of overhead that
might not be welcome at some point in the future. ;-)
On top of this there are potential load-balancing concerns. By specifying
the CPU, you are limiting workqueue's and scheduler's ability to adjust to
any sudden changes in load. Maybe not enough to matter in most cases, but
might be an issue if there is a sudden flood of kfree_rcu() invocations.
Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 13:26 ` Paul E. McKenney
@ 2020-04-20 16:08 ` Uladzislau Rezki
2020-04-20 16:25 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-20 16:08 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, joel, Sebastian Andrzej Siewior,
Steven Rostedt, rcu, Josh Triplett, Mathieu Desnoyers,
Lai Jiangshan, Thomas Gleixner, Mike Galbraith
On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > >
> > >
> > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > >wrote:
> > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > >wrote:
> > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > >wrote:
> > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > >Andrzej Siewior wrote:
> > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > >Andrzej Siewior wrote:
> > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > >wrote:
> > > >> > > > > > > > > > >
> > > >> > > > > > > > > > > We might need different calling-context
> > > >restrictions for the two variants
> > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > >with some sort of lockdep
> > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > >> > > > > > > > > >
> > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > >> > > > > > > > > > This one will scream if you do
> > > >> > > > > > > > > > raw_spin_lock();
> > > >> > > > > > > > > > spin_lock();
> > > >> > > > > > > > > >
> > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > >which needs to be
> > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > >do).
> > > >> > > > > > > > > >
> > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > >series with
> > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > >migration before
> > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > >extra RT case (avoiding
> > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > >until we get there.
> > > >> > > > > > > > >
> > > >> > > > > > > > > I prefer something like the following to make it
> > > >possible to invoke
> > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > >call_rcu() is already callable
> > > >> > > > > > > > > from such contexts. Thoughts?
> > > >> > > > > > > >
> > > >> > > > > > > > So it looks like it would work. However, could we
> > > >please delay this
> > > >> > > > > > > > until we have an actual case on RT? I just added
> > > >> > > > > > > > WARN_ON(!preemptible());
> > > >> > > > > > >
> > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > >a good idea. I'd
> > > >> > > > > > > rather design it in a forward thinking way. There could
> > > >be folks replacing
> > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > >example. If they were
> > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > >> > > > > > >
> > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > >pages that Vlad is
> > > >> > > > > > > planning on adding would further reduce the need for
> > > >pages from the page
> > > >> > > > > > > allocator.
> > > >> > > > > > >
> > > >> > > > > > > Paul, what is your opinion on this?
> > > >> > > > > >
> > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > >specialization,
> > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > >interrupts
> > > >> > > > > > disabled, and during early boot, as in even before
> > > >rcu_init() has been
> > > >> > > > > > invoked. This experience does make me lean towards raw
> > > >spinlocks.
> > > >> > > > > >
> > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > >spinlocks, we need
> > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > >possible.
> > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > >spinlocks held
> > > >> > > > > > helps, but it is worth checking.
> > > >> > > > > >
> > > >> > > > > How about reducing the lock contention even further?
> > > >> > > >
> > > >> > > > Can we do even better by moving the work-scheduling out from
> > > >under the
> > > >> > > > spinlock? This of course means that it is necessary to handle
> > > >the
> > > >> > > > occasional spurious call to the work handler, but that should
> > > >be rare
> > > >> > > > and should be in the noise compared to the reduction in
> > > >contention.
> > > >> > >
> > > >> > > Yes I think that will be required since -rt will sleep on
> > > >workqueue locks as
> > > >> > > well :-(. I'm looking into it right now.
> > > >> > >
> > > >> > > /*
> > > >> > > * If @work was previously on a different pool, it might
> > > >still be
> > > >> > > * running there, in which case the work needs to be
> > > >queued on that
> > > >> > > * pool to guarantee non-reentrancy.
> > > >> > > */
> > > >> > > last_pool = get_work_pool(work);
> > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > >> > > struct worker *worker;
> > > >> > >
> > > >> > > spin_lock(&last_pool->lock);
> > > >> >
> > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > >Just took
> > > >> > a good look and that's not an issue. However calling
> > > >schedule_delayed_work()
> > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > >on
> > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > >pool->lock
> > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > >of:
> > > >> >
> > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > >not
> > > >> > acquire sleeping locks.
> > > >> >
> > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > >Sebastian's
> > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > >> > spin_lock_irqsave()).
> > > >> >
> > > >> > 3. Queue the work through irq_work or another bottom-half
> > > >mechanism.
> > > >>
> > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > >> go well with a timer. This can of course be done conditionally.
> > > >>
> > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > >message a workqueue system uses raw spinlocks internally. I checked
> > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > >
> > >
> > > Thanks for confirming!!
> > >
> > > >If we do it outside, we will reduce a critical section, from the other
> > > >hand we can introduce a potential delay in placing the context into
> > > >CPUs
> > > >run-queue. As a result we could end up on another CPU, thus placing
> > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > >be good to test and have a look at it actually.
> > > >
> > > >But it can be negligible :)
> > >
> > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > >
> > It should be OK, i do not expect to get noticeable latency for any RT
> > workloads.
> >
> > > >
> > > >> > Any other thoughts?
> > > >>
> > > >> I did forget to ask you guys your opinions about the downsides (if
> > > >any)
> > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > >>
> > > >If we do it outside of spinlock, there is at least one drawback that i
> > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > >we as a caller have to guarantee that a CPU we about to place a work
> > > >is alive :)
> > >
> > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > >
> > <snip>
> > /**
> > * queue_work_on - queue work on specific cpu
> > * @cpu: CPU number to execute work on
> > * @wq: workqueue to use
> > * @work: work to queue
> > *
> > * We queue the work to a specific CPU, the caller must ensure it
> > * can't go away.
> > *
> > * Return: %false if @work was already on a queue, %true otherwise.
> > */
> > <snip>
> >
> > It says, how i see it, we should ensure it can not go away. So, if
> > we drop the lock we should do like:
> >
> > get_online_cpus();
> > check a CPU is online;
> > queue_work_on();
> > put_online_cpus();
> >
> > but i suspect we do not want to do it :)
>
> Indeed, it might impose a few restrictions and a bit of overhead that
> might not be welcome at some point in the future. ;-)
>
> On top of this there are potential load-balancing concerns. By specifying
> the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> any sudden changes in load. Maybe not enough to matter in most cases, but
> might be an issue if there is a sudden flood of kfree_rcu() invocations.
>
Agree. Let's keep it as it is now :)
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 16:08 ` Uladzislau Rezki
@ 2020-04-20 16:25 ` Paul E. McKenney
2020-04-20 16:29 ` Uladzislau Rezki
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-20 16:25 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: joel, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > >
> > > >
> > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > >wrote:
> > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > >wrote:
> > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > >wrote:
> > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > >Andrzej Siewior wrote:
> > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > >Andrzej Siewior wrote:
> > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > >wrote:
> > > > >> > > > > > > > > > >
> > > > >> > > > > > > > > > > We might need different calling-context
> > > > >restrictions for the two variants
> > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > >with some sort of lockdep
> > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > >> > > > > > > > > >
> > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > >> > > > > > > > > > This one will scream if you do
> > > > >> > > > > > > > > > raw_spin_lock();
> > > > >> > > > > > > > > > spin_lock();
> > > > >> > > > > > > > > >
> > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > >which needs to be
> > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > >do).
> > > > >> > > > > > > > > >
> > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > >series with
> > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > >migration before
> > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > >extra RT case (avoiding
> > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > >until we get there.
> > > > >> > > > > > > > >
> > > > >> > > > > > > > > I prefer something like the following to make it
> > > > >possible to invoke
> > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > >call_rcu() is already callable
> > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > >> > > > > > > >
> > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > >please delay this
> > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > >> > > > > > >
> > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > >a good idea. I'd
> > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > >be folks replacing
> > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > >example. If they were
> > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > >> > > > > > >
> > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > >pages that Vlad is
> > > > >> > > > > > > planning on adding would further reduce the need for
> > > > >pages from the page
> > > > >> > > > > > > allocator.
> > > > >> > > > > > >
> > > > >> > > > > > > Paul, what is your opinion on this?
> > > > >> > > > > >
> > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > >specialization,
> > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > >interrupts
> > > > >> > > > > > disabled, and during early boot, as in even before
> > > > >rcu_init() has been
> > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > >spinlocks.
> > > > >> > > > > >
> > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > >spinlocks, we need
> > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > >possible.
> > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > >spinlocks held
> > > > >> > > > > > helps, but it is worth checking.
> > > > >> > > > > >
> > > > >> > > > > How about reducing the lock contention even further?
> > > > >> > > >
> > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > >under the
> > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > >the
> > > > >> > > > occasional spurious call to the work handler, but that should
> > > > >be rare
> > > > >> > > > and should be in the noise compared to the reduction in
> > > > >contention.
> > > > >> > >
> > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > >workqueue locks as
> > > > >> > > well :-(. I'm looking into it right now.
> > > > >> > >
> > > > >> > > /*
> > > > >> > > * If @work was previously on a different pool, it might
> > > > >still be
> > > > >> > > * running there, in which case the work needs to be
> > > > >queued on that
> > > > >> > > * pool to guarantee non-reentrancy.
> > > > >> > > */
> > > > >> > > last_pool = get_work_pool(work);
> > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > >> > > struct worker *worker;
> > > > >> > >
> > > > >> > > spin_lock(&last_pool->lock);
> > > > >> >
> > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > >Just took
> > > > >> > a good look and that's not an issue. However calling
> > > > >schedule_delayed_work()
> > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > >on
> > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > >pool->lock
> > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > >of:
> > > > >> >
> > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > >not
> > > > >> > acquire sleeping locks.
> > > > >> >
> > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > >Sebastian's
> > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > >> > spin_lock_irqsave()).
> > > > >> >
> > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > >mechanism.
> > > > >>
> > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > >> go well with a timer. This can of course be done conditionally.
> > > > >>
> > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > >
> > > >
> > > > Thanks for confirming!!
> > > >
> > > > >If we do it outside, we will reduce a critical section, from the other
> > > > >hand we can introduce a potential delay in placing the context into
> > > > >CPUs
> > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > >be good to test and have a look at it actually.
> > > > >
> > > > >But it can be negligible :)
> > > >
> > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > >
> > > It should be OK, i do not expect to get noticeable latency for any RT
> > > workloads.
> > >
> > > > >
> > > > >> > Any other thoughts?
> > > > >>
> > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > >any)
> > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > >>
> > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > >we as a caller have to guarantee that a CPU we are about to place a work
> > > > > >on is alive :)
> > > >
> > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > >
> > > <snip>
> > > /**
> > > * queue_work_on - queue work on specific cpu
> > > * @cpu: CPU number to execute work on
> > > * @wq: workqueue to use
> > > * @work: work to queue
> > > *
> > > * We queue the work to a specific CPU, the caller must ensure it
> > > * can't go away.
> > > *
> > > * Return: %false if @work was already on a queue, %true otherwise.
> > > */
> > > <snip>
> > >
> > > It says, how i see it, we should ensure it can not go away. So, if
> > > we drop the lock we should do like:
> > >
> > > get_online_cpus();
> > > > check a CPU is online;
> > > queue_work_on();
> > > put_online_cpus();
> > >
> > > but i suspect we do not want to do it :)
> >
> > Indeed, it might impose a few restrictions and a bit of overhead that
> > might not be welcome at some point in the future. ;-)
> >
> > On top of this there are potential load-balancing concerns. By specifying
> > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > any sudden changes in load. Maybe not enough to matter in most cases, but
> > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> >
> Agree. Let's keep it as it is now :)
I am not sure which "as it is now" you are referring to, but I suspect
that the -rt guys prefer two short interrupts-disabled regions to one
longer interrupts-disabled region.
Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 16:25 ` Paul E. McKenney
@ 2020-04-20 16:29 ` Uladzislau Rezki
2020-04-20 16:46 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-20 16:29 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, joel, Sebastian Andrzej Siewior,
Steven Rostedt, rcu, Josh Triplett, Mathieu Desnoyers,
Lai Jiangshan, Thomas Gleixner, Mike Galbraith
On Mon, Apr 20, 2020 at 09:25:34AM -0700, Paul E. McKenney wrote:
> On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> > On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > > >
> > > > >
> > > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > > >wrote:
> > > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > > >wrote:
> > > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > > >wrote:
> > > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > > >Andrzej Siewior wrote:
> > > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > > >Andrzej Siewior wrote:
> > > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > > >wrote:
> > > > > >> > > > > > > > > > >
> > > > > >> > > > > > > > > > > We might need different calling-context
> > > > > >restrictions for the two variants
> > > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > > >with some sort of lockdep
> > > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > >> > > > > > > > > >
> > > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > >> > > > > > > > > > This one will scream if you do
> > > > > >> > > > > > > > > > raw_spin_lock();
> > > > > >> > > > > > > > > > spin_lock();
> > > > > >> > > > > > > > > >
> > > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > > >which needs to be
> > > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > > >do).
> > > > > >> > > > > > > > > >
> > > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > > >series with
> > > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > > >migration before
> > > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > > >extra RT case (avoiding
> > > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > > >until we get there.
> > > > > >> > > > > > > > >
> > > > > >> > > > > > > > > I prefer something like the following to make it
> > > > > >possible to invoke
> > > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > > >call_rcu() is already callable
> > > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > > >> > > > > > > >
> > > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > > >please delay this
> > > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > > >> > > > > > >
> > > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > > >a good idea. I'd
> > > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > > >be folks replacing
> > > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > > >example. If they were
> > > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > >> > > > > > >
> > > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > > >pages that Vlad is
> > > > > >> > > > > > > planning on adding would further reduce the need for
> > > > > >pages from the page
> > > > > >> > > > > > > allocator.
> > > > > >> > > > > > >
> > > > > >> > > > > > > Paul, what is your opinion on this?
> > > > > >> > > > > >
> > > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > > >specialization,
> > > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > > >interrupts
> > > > > >> > > > > > disabled, and during early boot, as in even before
> > > > > >rcu_init() has been
> > > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > > >spinlocks.
> > > > > >> > > > > >
> > > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > > >spinlocks, we need
> > > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > > >possible.
> > > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > > >spinlocks held
> > > > > >> > > > > > helps, but it is worth checking.
> > > > > >> > > > > >
> > > > > >> > > > > How about reducing the lock contention even further?
> > > > > >> > > >
> > > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > > >under the
> > > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > > >the
> > > > > >> > > > occasional spurious call to the work handler, but that should
> > > > > >be rare
> > > > > >> > > > and should be in the noise compared to the reduction in
> > > > > >contention.
> > > > > >> > >
> > > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > > >workqueue locks as
> > > > > >> > > well :-(. I'm looking into it right now.
> > > > > >> > >
> > > > > >> > > /*
> > > > > >> > > * If @work was previously on a different pool, it might
> > > > > >still be
> > > > > >> > > * running there, in which case the work needs to be
> > > > > >queued on that
> > > > > >> > > * pool to guarantee non-reentrancy.
> > > > > >> > > */
> > > > > >> > > last_pool = get_work_pool(work);
> > > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > > >> > > struct worker *worker;
> > > > > >> > >
> > > > > >> > > spin_lock(&last_pool->lock);
> > > > > >> >
> > > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > > >Just took
> > > > > >> > a good look and that's not an issue. However calling
> > > > > >schedule_delayed_work()
> > > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > > >on
> > > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > > >pool->lock
> > > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > > >of:
> > > > > >> >
> > > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > > >not
> > > > > >> > acquire sleeping locks.
> > > > > >> >
> > > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > > >Sebastian's
> > > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > > >> > spin_lock_irqsave()).
> > > > > >> >
> > > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > > >mechanism.
> > > > > >>
> > > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > > >> go well with a timer. This can of course be done conditionally.
> > > > > >>
> > > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > > >
> > > > >
> > > > > Thanks for confirming!!
> > > > >
> > > > > >If we do it outside, we will reduce a critical section, from the other
> > > > > >hand we can introduce a potential delay in placing the context into
> > > > > >CPUs
> > > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > > >be good to test and have a look at it actually.
> > > > > >
> > > > > >But it can be negligible :)
> > > > >
> > > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > > >
> > > > It should be OK, i do not expect to get noticeable latency for any RT
> > > > workloads.
> > > >
> > > > > >
> > > > > >> > Any other thoughts?
> > > > > >>
> > > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > > >any)
> > > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > > >>
> > > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > >we as a caller have to guarantee that a CPU we are about to place a work
> > > > > >on is alive :)
> > > > >
> > > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > > >
> > > > <snip>
> > > > /**
> > > > * queue_work_on - queue work on specific cpu
> > > > * @cpu: CPU number to execute work on
> > > > * @wq: workqueue to use
> > > > * @work: work to queue
> > > > *
> > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > * can't go away.
> > > > *
> > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > */
> > > > <snip>
> > > >
> > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > we drop the lock we should do like:
> > > >
> > > > get_online_cpus();
> > > > > check a CPU is online;
> > > > queue_work_on();
> > > > put_online_cpus();
> > > >
> > > > but i suspect we do not want to do it :)
> > >
> > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > might not be welcome at some point in the future. ;-)
> > >
> > > On top of this there are potential load-balancing concerns. By specifying
> > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > >
> > Agree. Let's keep it as it is now :)
>
> I am not sure which "as it is now" you are referring to, but I suspect
> that the -rt guys prefer two short interrupts-disabled regions to one
> longer interrupts-disabled region.
>
I mean to run schedule_delayed_work() under spinlock.
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 16:29 ` Uladzislau Rezki
@ 2020-04-20 16:46 ` Paul E. McKenney
2020-04-20 16:59 ` Uladzislau Rezki
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-20 16:46 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: joel, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Mon, Apr 20, 2020 at 06:29:00PM +0200, Uladzislau Rezki wrote:
> On Mon, Apr 20, 2020 at 09:25:34AM -0700, Paul E. McKenney wrote:
> > On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> > > On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > > > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > > > >
> > > > > >
> > > > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > > > >wrote:
> > > > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > > > >wrote:
> > > > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > > > >wrote:
> > > > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > > > >Andrzej Siewior wrote:
> > > > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > > > >Andrzej Siewior wrote:
> > > > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > > > >wrote:
> > > > > > >> > > > > > > > > > >
> > > > > > >> > > > > > > > > > > We might need different calling-context
> > > > > > >restrictions for the two variants
> > > > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > > > >with some sort of lockdep
> > > > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > >> > > > > > > > > >
> > > > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > >> > > > > > > > > > This one will scream if you do
> > > > > > >> > > > > > > > > > raw_spin_lock();
> > > > > > >> > > > > > > > > > spin_lock();
> > > > > > >> > > > > > > > > >
> > > > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > > > >which needs to be
> > > > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > > > >do).
> > > > > > >> > > > > > > > > >
> > > > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > > > >series with
> > > > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > > > >migration before
> > > > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > > > >extra RT case (avoiding
> > > > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > > > >until we get there.
> > > > > > >> > > > > > > > >
> > > > > > >> > > > > > > > > I prefer something like the following to make it
> > > > > > >possible to invoke
> > > > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > > > >call_rcu() is already callable
> > > > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > > > >> > > > > > > >
> > > > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > > > >please delay this
> > > > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > > > >> > > > > > >
> > > > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > > > >a good idea. I'd
> > > > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > > > >be folks replacing
> > > > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > > > >example. If they were
> > > > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > > >> > > > > > >
> > > > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > > > >pages that Vlad is
> > > > > > >> > > > > > > planning on adding would further reduce the need for
> > > > > > >pages from the page
> > > > > > >> > > > > > > allocator.
> > > > > > >> > > > > > >
> > > > > > >> > > > > > > Paul, what is your opinion on this?
> > > > > > >> > > > > >
> > > > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > > > >specialization,
> > > > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > > > >interrupts
> > > > > > >> > > > > > disabled, and during early boot, as in even before
> > > > > > >rcu_init() has been
> > > > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > > > >spinlocks.
> > > > > > >> > > > > >
> > > > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > > > >spinlocks, we need
> > > > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > > > >possible.
> > > > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > > > >spinlocks held
> > > > > > >> > > > > > helps, but it is worth checking.
> > > > > > >> > > > > >
> > > > > > >> > > > > How about reducing the lock contention even further?
> > > > > > >> > > >
> > > > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > > > >under the
> > > > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > > > >the
> > > > > > >> > > > occasional spurious call to the work handler, but that should
> > > > > > >be rare
> > > > > > >> > > > and should be in the noise compared to the reduction in
> > > > > > >contention.
> > > > > > >> > >
> > > > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > > > >workqueue locks as
> > > > > > >> > > well :-(. I'm looking into it right now.
> > > > > > >> > >
> > > > > > >> > > /*
> > > > > > >> > > * If @work was previously on a different pool, it might
> > > > > > >still be
> > > > > > >> > > * running there, in which case the work needs to be
> > > > > > >queued on that
> > > > > > >> > > * pool to guarantee non-reentrancy.
> > > > > > >> > > */
> > > > > > >> > > last_pool = get_work_pool(work);
> > > > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > > > >> > > struct worker *worker;
> > > > > > >> > >
> > > > > > >> > > spin_lock(&last_pool->lock);
> > > > > > >> >
> > > > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > > > >Just took
> > > > > > >> > a good look and that's not an issue. However calling
> > > > > > >schedule_delayed_work()
> > > > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > > > >on
> > > > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > > > >pool->lock
> > > > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > > > >of:
> > > > > > >> >
> > > > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > > > >not
> > > > > > >> > acquire sleeping locks.
> > > > > > >> >
> > > > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > > > >Sebastian's
> > > > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > > > >> > spin_lock_irqsave()).
> > > > > > >> >
> > > > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > > > >mechanism.
> > > > > > >>
> > > > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > > > >> go well with a timer. This can of course be done conditionally.
> > > > > > >>
> > > > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > > > >
> > > > > >
> > > > > > Thanks for confirming!!
> > > > > >
> > > > > > >If we do it outside, we will reduce a critical section, from the other
> > > > > > >hand we can introduce a potential delay in placing the context into
> > > > > > >CPUs
> > > > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > > > >be good to test and have a look at it actually.
> > > > > > >
> > > > > > >But it can be negligible :)
> > > > > >
> > > > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > > > >
> > > > > It should be OK, i do not expect to get noticeable latency for any RT
> > > > > workloads.
> > > > >
> > > > > > >
> > > > > > >> > Any other thoughts?
> > > > > > >>
> > > > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > > > >any)
> > > > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > > > >>
> > > > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > > >we as a caller have to guarantee that a CPU we are about to place a work
> > > > > > >on is alive :)
> > > > > >
> > > > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > > > >
> > > > > <snip>
> > > > > /**
> > > > > * queue_work_on - queue work on specific cpu
> > > > > * @cpu: CPU number to execute work on
> > > > > * @wq: workqueue to use
> > > > > * @work: work to queue
> > > > > *
> > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > * can't go away.
> > > > > *
> > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > */
> > > > > <snip>
> > > > >
> > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > we drop the lock we should do like:
> > > > >
> > > > > get_online_cpus();
> > > > > check a CPU is onlen;
> > > > > queue_work_on();
> > > > > put_online_cpus();
> > > > >
> > > > > but i suspect we do not want to do it :)
> > > >
> > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > might not be welcome at some point in the future. ;-)
> > > >
> > > > On top of this there are potential load-balancing concerns. By specifying
> > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > >
> > > Agree. Let's keep it as it is now :)
> >
> > I am not sure which "as it is now" you are referring to, but I suspect
> > that the -rt guys prefer two short interrupts-disabled regions to one
> > longer interrupts-disabled region.
>
> I mean to run schedule_delayed_work() under spinlock.
Which is an interrupt-disabled spinlock, correct?
Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 16:46 ` Paul E. McKenney
@ 2020-04-20 16:59 ` Uladzislau Rezki
2020-04-20 17:21 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-20 16:59 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, joel, Sebastian Andrzej Siewior,
Steven Rostedt, rcu, Josh Triplett, Mathieu Desnoyers,
Lai Jiangshan, Thomas Gleixner, Mike Galbraith
On Mon, Apr 20, 2020 at 09:46:57AM -0700, Paul E. McKenney wrote:
> On Mon, Apr 20, 2020 at 06:29:00PM +0200, Uladzislau Rezki wrote:
> > On Mon, Apr 20, 2020 at 09:25:34AM -0700, Paul E. McKenney wrote:
> > > On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> > > > On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > > > > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > > > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > > > > >
> > > > > > >
> > > > > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > > > > >wrote:
> > > > > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > > > > >wrote:
> > > > > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > > > > >wrote:
> > > > > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > > > > >Andrzej Siewior wrote:
> > > > > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > > > > >Andrzej Siewior wrote:
> > > > > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > > > > >wrote:
> > > > > > > >> > > > > > > > > > >
> > > > > > > >> > > > > > > > > > > We might need different calling-context
> > > > > > > >restrictions for the two variants
> > > > > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > > > > >with some sort of lockdep
> > > > > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > >> > > > > > > > > >
> > > > > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > >> > > > > > > > > > This one will scream if you do
> > > > > > > >> > > > > > > > > > raw_spin_lock();
> > > > > > > >> > > > > > > > > > spin_lock();
> > > > > > > >> > > > > > > > > >
> > > > > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > > > > >which needs to be
> > > > > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > > > > >do).
> > > > > > > >> > > > > > > > > >
> > > > > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > > > > >series with
> > > > > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > > > > >migration before
> > > > > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > > > > >extra RT case (avoiding
> > > > > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > > > > >until we get there.
> > > > > > > >> > > > > > > > >
> > > > > > > >> > > > > > > > > I prefer something like the following to make it
> > > > > > > >possible to invoke
> > > > > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > > > > >call_rcu() is already callable
> > > > > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > > > > >> > > > > > > >
> > > > > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > > > > >please delay this
> > > > > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > > > > >> > > > > > >
> > > > > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > > > > >a good idea. I'd
> > > > > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > > > > >be folks replacing
> > > > > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > > > > >example. If they were
> > > > > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > > > >> > > > > > >
> > > > > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > > > > >pages that Vlad is
> > > > > > > >> > > > > > > planning on adding would further reduce the need for
> > > > > > > >pages from the page
> > > > > > > >> > > > > > > allocator.
> > > > > > > >> > > > > > >
> > > > > > > >> > > > > > > Paul, what is your opinion on this?
> > > > > > > >> > > > > >
> > > > > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > > > > >specialization,
> > > > > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > > > > >interrupts
> > > > > > > >> > > > > > disabled, and during early boot, as in even before
> > > > > > > >rcu_init() has been
> > > > > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > > > > >spinlocks.
> > > > > > > >> > > > > >
> > > > > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > > > > >spinlocks, we need
> > > > > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > > > > >possible.
> > > > > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > > > > >spinlocks held
> > > > > > > >> > > > > > helps, but it is worth checking.
> > > > > > > >> > > > > >
> > > > > > > >> > > > > How about reducing the lock contention even further?
> > > > > > > >> > > >
> > > > > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > > > > >under the
> > > > > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > > > > >the
> > > > > > > >> > > > occasional spurious call to the work handler, but that should
> > > > > > > >be rare
> > > > > > > >> > > > and should be in the noise compared to the reduction in
> > > > > > > >contention.
> > > > > > > >> > >
> > > > > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > > > > >workqueue locks as
> > > > > > > >> > > well :-(. I'm looking into it right now.
> > > > > > > >> > >
> > > > > > > >> > > /*
> > > > > > > >> > > * If @work was previously on a different pool, it might
> > > > > > > >still be
> > > > > > > >> > > * running there, in which case the work needs to be
> > > > > > > >queued on that
> > > > > > > >> > > * pool to guarantee non-reentrancy.
> > > > > > > >> > > */
> > > > > > > >> > > last_pool = get_work_pool(work);
> > > > > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > > > > >> > > struct worker *worker;
> > > > > > > >> > >
> > > > > > > >> > > spin_lock(&last_pool->lock);
> > > > > > > >> >
> > > > > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > > > > >Just took
> > > > > > > >> > a good look and that's not an issue. However calling
> > > > > > > >schedule_delayed_work()
> > > > > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > > > > >on
> > > > > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > > > > >pool->lock
> > > > > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > > > > >of:
> > > > > > > >> >
> > > > > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > > > > >not
> > > > > > > >> > acquire sleeping locks.
> > > > > > > >> >
> > > > > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > > > > >Sebastian's
> > > > > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > > > > >> > spin_lock_irqsave()).
> > > > > > > >> >
> > > > > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > > > > >mechanism.
> > > > > > > >>
> > > > > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > > > > >> go well with a timer. This can of course be done conditionally.
> > > > > > > >>
> > > > > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > > > > >
> > > > > > >
> > > > > > > Thanks for confirming!!
> > > > > > >
> > > > > > > >If we do it outside, we will reduce a critical section, from the other
> > > > > > > >hand we can introduce a potential delay in placing the context into
> > > > > > > >CPUs
> > > > > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > > > > >be good to test and have a look at it actually.
> > > > > > > >
> > > > > > > >But it can be negligible :)
> > > > > > >
> > > > > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > > > > >
> > > > > > It should be OK, i do not expect to get noticeable latency for any RT
> > > > > > workloads.
> > > > > >
> > > > > > > >
> > > > > > > >> > Any other thoughts?
> > > > > > > >>
> > > > > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > > > > >any)
> > > > > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > > > > >>
> > > > > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > > > >we as a caller have to guarantee that a CPU we are about to place a work on
> > > > > > > >is alive :)
> > > > > > >
> > > > > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > > > > >
> > > > > > <snip>
> > > > > > /**
> > > > > > * queue_work_on - queue work on specific cpu
> > > > > > * @cpu: CPU number to execute work on
> > > > > > * @wq: workqueue to use
> > > > > > * @work: work to queue
> > > > > > *
> > > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > > * can't go away.
> > > > > > *
> > > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > > */
> > > > > > <snip>
> > > > > >
> > > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > > we drop the lock we should do like:
> > > > > >
> > > > > > get_online_cpus();
> > > > > > > check a CPU is online;
> > > > > > queue_work_on();
> > > > > > put_online_cpus();
> > > > > >
> > > > > > but i suspect we do not want to do it :)
> > > > >
> > > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > > might not be welcome at some point in the future. ;-)
> > > > >
> > > > > On top of this there are potential load-balancing concerns. By specifying
> > > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > > >
> > > > Agree. Let's keep it as it is now :)
> > >
> > > I am not sure which "as it is now" you are referring to, but I suspect
> > > that the -rt guys prefer two short interrupts-disabled regions to one
> > > longer interrupts-disabled region.
> >
> > I mean to run schedule_delayed_work() under spinlock.
>
> Which is an interrupt-disabled spinlock, correct?
>
To do it under holding the lock, currently it is spinlock, but it is
going to be (if you agree :)) raw ones, which keep IRQs disabled. I
saw Joel sent out patches.
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 16:59 ` Uladzislau Rezki
@ 2020-04-20 17:21 ` Paul E. McKenney
2020-04-20 17:40 ` Uladzislau Rezki
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-20 17:21 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: joel, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Mon, Apr 20, 2020 at 06:59:24PM +0200, Uladzislau Rezki wrote:
> On Mon, Apr 20, 2020 at 09:46:57AM -0700, Paul E. McKenney wrote:
> > On Mon, Apr 20, 2020 at 06:29:00PM +0200, Uladzislau Rezki wrote:
> > > On Mon, Apr 20, 2020 at 09:25:34AM -0700, Paul E. McKenney wrote:
> > > > On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> > > > > On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > > > > > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > > > > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > > > > > >
> > > > > > > >
> > > > > > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > > > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > > > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > > > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > > > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > > > > > >wrote:
> > > > > > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > > > > > >wrote:
> > > > > > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > > > > > >wrote:
> > > > > > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > > > > > >wrote:
> > > > > > > > >> > > > > > > > > > >
> > > > > > > > >> > > > > > > > > > > We might need different calling-context
> > > > > > > > >restrictions for the two variants
> > > > > > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > > > > > >with some sort of lockdep
> > > > > > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > > >> > > > > > > > > >
> > > > > > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > > > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > > >> > > > > > > > > > This one will scream if you do
> > > > > > > > >> > > > > > > > > > raw_spin_lock();
> > > > > > > > >> > > > > > > > > > spin_lock();
> > > > > > > > >> > > > > > > > > >
> > > > > > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > > > > > >which needs to be
> > > > > > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > > > > > >do).
> > > > > > > > >> > > > > > > > > >
> > > > > > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > > > > > >series with
> > > > > > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > > > > > >migration before
> > > > > > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > > > > > >extra RT case (avoiding
> > > > > > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > > > > > >until we get there.
> > > > > > > > >> > > > > > > > >
> > > > > > > > >> > > > > > > > > I prefer something like the following to make it
> > > > > > > > >possible to invoke
> > > > > > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > > > > > >call_rcu() is already callable
> > > > > > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > > > > > >> > > > > > > >
> > > > > > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > > > > > >please delay this
> > > > > > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > > > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > > > > > >> > > > > > >
> > > > > > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > > > > > >a good idea. I'd
> > > > > > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > > > > > >be folks replacing
> > > > > > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > > > > > >example. If they were
> > > > > > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > > > > >> > > > > > >
> > > > > > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > > > > > >pages that Vlad is
> > > > > > > > >> > > > > > > planning on adding would further reduce the need for
> > > > > > > > >pages from the page
> > > > > > > > >> > > > > > > allocator.
> > > > > > > > >> > > > > > >
> > > > > > > > >> > > > > > > Paul, what is your opinion on this?
> > > > > > > > >> > > > > >
> > > > > > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > > > > > >specialization,
> > > > > > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > > > > > >interrupts
> > > > > > > > >> > > > > > disabled, and during early boot, as in even before
> > > > > > > > >rcu_init() has been
> > > > > > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > > > > > >spinlocks.
> > > > > > > > >> > > > > >
> > > > > > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > > > > > >spinlocks, we need
> > > > > > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > > > > > >possible.
> > > > > > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > > > > > >spinlocks held
> > > > > > > > >> > > > > > helps, but it is worth checking.
> > > > > > > > >> > > > > >
> > > > > > > > >> > > > > How about reducing the lock contention even further?
> > > > > > > > >> > > >
> > > > > > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > > > > > >under the
> > > > > > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > > > > > >the
> > > > > > > > >> > > > occasional spurious call to the work handler, but that should
> > > > > > > > >be rare
> > > > > > > > >> > > > and should be in the noise compared to the reduction in
> > > > > > > > >contention.
> > > > > > > > >> > >
> > > > > > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > > > > > >workqueue locks as
> > > > > > > > >> > > well :-(. I'm looking into it right now.
> > > > > > > > >> > >
> > > > > > > > >> > > /*
> > > > > > > > >> > > * If @work was previously on a different pool, it might
> > > > > > > > >still be
> > > > > > > > >> > > * running there, in which case the work needs to be
> > > > > > > > >queued on that
> > > > > > > > >> > > * pool to guarantee non-reentrancy.
> > > > > > > > >> > > */
> > > > > > > > >> > > last_pool = get_work_pool(work);
> > > > > > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > > > > > >> > > struct worker *worker;
> > > > > > > > >> > >
> > > > > > > > >> > > spin_lock(&last_pool->lock);
> > > > > > > > >> >
> > > > > > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > > > > > >Just took
> > > > > > > > >> > a good look and that's not an issue. However calling
> > > > > > > > >schedule_delayed_work()
> > > > > > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > > > > > >on
> > > > > > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > > > > > >pool->lock
> > > > > > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > > > > > >of:
> > > > > > > > >> >
> > > > > > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > > > > > >not
> > > > > > > > >> > acquire sleeping locks.
> > > > > > > > >> >
> > > > > > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > > > > > >Sebastian's
> > > > > > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > > > > > >> > spin_lock_irqsave()).
> > > > > > > > >> >
> > > > > > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > > > > > >mechanism.
> > > > > > > > >>
> > > > > > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > > > > > >> go well with a timer. This can of course be done conditionally.
> > > > > > > > >>
> > > > > > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > > > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > > > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > > > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > > > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > > > > > >
> > > > > > > >
> > > > > > > > Thanks for confirming!!
> > > > > > > >
> > > > > > > > >If we do it outside, we will reduce a critical section, from the other
> > > > > > > > >hand we can introduce a potential delay in placing the context into
> > > > > > > > >CPUs
> > > > > > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > > > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > > > > > >be good to test and have a look at it actually.
> > > > > > > > >
> > > > > > > > >But it can be negligible :)
> > > > > > > >
> > > > > > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > > > > > >
> > > > > > > It should be OK, i do not expect to get noticeable latency for any RT
> > > > > > > workloads.
> > > > > > >
> > > > > > > > >
> > > > > > > > >> > Any other thoughts?
> > > > > > > > >>
> > > > > > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > > > > > >any)
> > > > > > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > > > > > >>
> > > > > > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > > > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > > > > >we as a caller have to guarantee that a CPU we are about to place a work on
> > > > > > > > >is alive :)
> > > > > > > >
> > > > > > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > > > > > >
> > > > > > > <snip>
> > > > > > > /**
> > > > > > > * queue_work_on - queue work on specific cpu
> > > > > > > * @cpu: CPU number to execute work on
> > > > > > > * @wq: workqueue to use
> > > > > > > * @work: work to queue
> > > > > > > *
> > > > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > > > * can't go away.
> > > > > > > *
> > > > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > > > */
> > > > > > > <snip>
> > > > > > >
> > > > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > > > we drop the lock we should do like:
> > > > > > >
> > > > > > > get_online_cpus();
> > > > > > > check a CPU is online;
> > > > > > > queue_work_on();
> > > > > > > put_online_cpus();
> > > > > > >
> > > > > > > but i suspect we do not want to do it :)
> > > > > >
> > > > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > > > might not be welcome at some point in the future. ;-)
> > > > > >
> > > > > > On top of this there are potential load-balancing concerns. By specifying
> > > > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > > > >
> > > > > Agree. Let's keep it as it is now :)
> > > >
> > > > I am not sure which "as it is now" you are referring to, but I suspect
> > > > that the -rt guys prefer two short interrupts-disabled regions to one
> > > > longer interrupts-disabled region.
> > >
> > > I mean to run schedule_delayed_work() under spinlock.
> >
> > Which is an interrupt-disabled spinlock, correct?
> >
> To do it under holding the lock, currently it is spinlock, but it is
> going to be (if you agree :)) raw ones, which keep IRQs disabled. I
> saw Joel sent out patches.
Then please move the schedule_delayed_work() and friends out from
under the spinlock. Unless Sebastian has some reason why extending
an interrupts-disabled critical section (and thus degrading real-time
latency) is somehow OK in this case.
Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 17:21 ` Paul E. McKenney
@ 2020-04-20 17:40 ` Uladzislau Rezki
2020-04-20 17:57 ` Joel Fernandes
` (2 more replies)
0 siblings, 3 replies; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-20 17:40 UTC (permalink / raw)
To: Paul E. McKenney, Sebastian Andrzej Siewior
Cc: Uladzislau Rezki, joel, Sebastian Andrzej Siewior,
Steven Rostedt, rcu, Josh Triplett, Mathieu Desnoyers,
Lai Jiangshan, Thomas Gleixner, Mike Galbraith
On Mon, Apr 20, 2020 at 10:21:26AM -0700, Paul E. McKenney wrote:
> On Mon, Apr 20, 2020 at 06:59:24PM +0200, Uladzislau Rezki wrote:
> > On Mon, Apr 20, 2020 at 09:46:57AM -0700, Paul E. McKenney wrote:
> > > On Mon, Apr 20, 2020 at 06:29:00PM +0200, Uladzislau Rezki wrote:
> > > > On Mon, Apr 20, 2020 at 09:25:34AM -0700, Paul E. McKenney wrote:
> > > > > On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> > > > > > On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > > > > > > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > > > > > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > > > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > > > > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > > > > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > > > > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > > > > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > > > > > > >wrote:
> > > > > > > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > > > > > > >wrote:
> > > > > > > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > > > > > > >wrote:
> > > > > > > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > > > > > > >wrote:
> > > > > > > > > >> > > > > > > > > > >
> > > > > > > > > >> > > > > > > > > > > We might need different calling-context
> > > > > > > > > >restrictions for the two variants
> > > > > > > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > > > > > > >with some sort of lockdep
> > > > > > > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > > > >> > > > > > > > > >
> > > > > > > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > > > > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > > > >> > > > > > > > > > This one will scream if you do
> > > > > > > > > >> > > > > > > > > > raw_spin_lock();
> > > > > > > > > >> > > > > > > > > > spin_lock();
> > > > > > > > > >> > > > > > > > > >
> > > > > > > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > > > > > > >which needs to be
> > > > > > > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > > > > > > >do).
> > > > > > > > > >> > > > > > > > > >
> > > > > > > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > > > > > > >series with
> > > > > > > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > > > > > > >migration before
> > > > > > > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > > > > > > >extra RT case (avoiding
> > > > > > > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > > > > > > >until we get there.
> > > > > > > > > >> > > > > > > > >
> > > > > > > > > >> > > > > > > > > I prefer something like the following to make it
> > > > > > > > > >possible to invoke
> > > > > > > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > > > > > > >call_rcu() is already callable
> > > > > > > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > > > > > > >> > > > > > > >
> > > > > > > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > > > > > > >please delay this
> > > > > > > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > > > > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > > > > > > >> > > > > > >
> > > > > > > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > > > > > > >a good idea. I'd
> > > > > > > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > > > > > > >be folks replacing
> > > > > > > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > > > > > > >example. If they were
> > > > > > > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > > > > > >> > > > > > >
> > > > > > > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > > > > > > >pages that Vlad is
> > > > > > > > > >> > > > > > > planning on adding would further reduce the need for
> > > > > > > > > >pages from the page
> > > > > > > > > >> > > > > > > allocator.
> > > > > > > > > >> > > > > > >
> > > > > > > > > >> > > > > > > Paul, what is your opinion on this?
> > > > > > > > > >> > > > > >
> > > > > > > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > > > > > > >specialization,
> > > > > > > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > > > > > > >interrupts
> > > > > > > > > >> > > > > > disabled, and during early boot, as in even before
> > > > > > > > > >rcu_init() has been
> > > > > > > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > > > > > > >spinlocks.
> > > > > > > > > >> > > > > >
> > > > > > > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > > > > > > >spinlocks, we need
> > > > > > > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > > > > > > >possible.
> > > > > > > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > > > > > > >spinlocks held
> > > > > > > > > >> > > > > > helps, but it is worth checking.
> > > > > > > > > >> > > > > >
> > > > > > > > > >> > > > > How about reducing the lock contention even further?
> > > > > > > > > >> > > >
> > > > > > > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > > > > > > >under the
> > > > > > > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > > > > > > >the
> > > > > > > > > >> > > > occasional spurious call to the work handler, but that should
> > > > > > > > > >be rare
> > > > > > > > > >> > > > and should be in the noise compared to the reduction in
> > > > > > > > > >contention.
> > > > > > > > > >> > >
> > > > > > > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > > > > > > >workqueue locks as
> > > > > > > > > >> > > well :-(. I'm looking into it right now.
> > > > > > > > > >> > >
> > > > > > > > > >> > > /*
> > > > > > > > > >> > > * If @work was previously on a different pool, it might
> > > > > > > > > >still be
> > > > > > > > > >> > > * running there, in which case the work needs to be
> > > > > > > > > >queued on that
> > > > > > > > > >> > > * pool to guarantee non-reentrancy.
> > > > > > > > > >> > > */
> > > > > > > > > >> > > last_pool = get_work_pool(work);
> > > > > > > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > > > > > > >> > > struct worker *worker;
> > > > > > > > > >> > >
> > > > > > > > > >> > > spin_lock(&last_pool->lock);
> > > > > > > > > >> >
> > > > > > > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > > > > > > >Just took
> > > > > > > > > >> > a good look and that's not an issue. However calling
> > > > > > > > > >schedule_delayed_work()
> > > > > > > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > > > > > > >on
> > > > > > > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > > > > > > >pool->lock
> > > > > > > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > > > > > > >of:
> > > > > > > > > >> >
> > > > > > > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > > > > > > >not
> > > > > > > > > >> > acquire sleeping locks.
> > > > > > > > > >> >
> > > > > > > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > > > > > > >Sebastian's
> > > > > > > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > > > > > > >> > spin_lock_irqsave()).
> > > > > > > > > >> >
> > > > > > > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > > > > > > >mechanism.
> > > > > > > > > >>
> > > > > > > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > > > > > > >> go well with a timer. This can of course be done conditionally.
> > > > > > > > > >>
> > > > > > > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > > > > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > > > > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > > > > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > > > > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks for confirming!!
> > > > > > > > >
> > > > > > > > > >If we do it outside, we will reduce a critical section, from the other
> > > > > > > > > >hand we can introduce a potential delay in placing the context into
> > > > > > > > > >CPUs
> > > > > > > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > > > > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > > > > > > >be good to test and have a look at it actually.
> > > > > > > > > >
> > > > > > > > > >But it can be negligible :)
> > > > > > > > >
> > > > > > > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > > > > > > >
> > > > > > > > It should be OK, i do not expect to get noticeable latency for any RT
> > > > > > > > workloads.
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >> > Any other thoughts?
> > > > > > > > > >>
> > > > > > > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > > > > > > >any)
> > > > > > > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > > > > > > >>
> > > > > > > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > > > > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > > > > > >we as a caller have to guarantee that a CPU we about to place a work
> > > > > > > > > >is alive :)
> > > > > > > > >
> > > > > > > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > > > > > > >
> > > > > > > > <snip>
> > > > > > > > /**
> > > > > > > > * queue_work_on - queue work on specific cpu
> > > > > > > > * @cpu: CPU number to execute work on
> > > > > > > > * @wq: workqueue to use
> > > > > > > > * @work: work to queue
> > > > > > > > *
> > > > > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > > > > * can't go away.
> > > > > > > > *
> > > > > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > > > > */
> > > > > > > > <snip>
> > > > > > > >
> > > > > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > > > > we drop the lock we should do like:
> > > > > > > >
> > > > > > > > get_online_cpus();
> > > > > > > > check a CPU is online;
> > > > > > > > queue_work_on();
> > > > > > > > put_online_cpus();
> > > > > > > >
> > > > > > > > but i suspect we do not want to do it :)
> > > > > > >
> > > > > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > > > > might not be welcome at some point in the future. ;-)
> > > > > > >
> > > > > > > On top of this there are potential load-balancing concerns. By specifying
> > > > > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > > > > >
> > > > > > Agree. Let's keep it as it is now :)
> > > > >
> > > > > I am not sure which "as it is now" you are referring to, but I suspect
> > > > > that the -rt guys prefer two short interrupts-disabled regions to one
> > > > > longer interrupts-disabled region.
> > > >
> > > > I mean to run schedule_delayed_work() under spinlock.
> > >
> > > Which is an interrupt-disabled spinlock, correct?
> > >
> > To do it under holding the lock, currently it is spinlock, but it is
> > going to be(if you agree :)) raw ones, which keeps IRQs disabled. I
> > saw Joel sent out patches.
>
> Then please move the schedule_delayed_work() and friends out from
> under the spinlock. Unless Sebastian has some reason why extending
> an interrupts-disabled critical section (and thus degrading real-time
> latency) is somehow OK in this case.
>
Paul, if move outside of the lock we may introduce unneeded migration
issues, plus it can introduce higher memory footprint(i have not tested).
I have described it in more detail earlier in this mail thread. I do not
think that waking up the work is an issue for RT from latency point of
view. But let's ask Sebastian to confirm.
Sebastian, do you think that placing a work on current CPU is an issue?
If we do it under raw spinlock?
Thank you!
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 17:40 ` Uladzislau Rezki
@ 2020-04-20 17:57 ` Joel Fernandes
2020-04-20 18:13 ` Paul E. McKenney
2020-04-20 17:59 ` Paul E. McKenney
2020-04-21 13:39 ` Sebastian Andrzej Siewior
2 siblings, 1 reply; 85+ messages in thread
From: Joel Fernandes @ 2020-04-20 17:57 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: Paul E. McKenney, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Mon, Apr 20, 2020 at 07:40:19PM +0200, Uladzislau Rezki wrote:
> On Mon, Apr 20, 2020 at 10:21:26AM -0700, Paul E. McKenney wrote:
[...]
> > > > > > > > > <snip>
> > > > > > > > > /**
> > > > > > > > > * queue_work_on - queue work on specific cpu
> > > > > > > > > * @cpu: CPU number to execute work on
> > > > > > > > > * @wq: workqueue to use
> > > > > > > > > * @work: work to queue
> > > > > > > > > *
> > > > > > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > > > > > * can't go away.
> > > > > > > > > *
> > > > > > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > > > > > */
> > > > > > > > > <snip>
> > > > > > > > >
> > > > > > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > > > > > we drop the lock we should do like:
> > > > > > > > >
> > > > > > > > > get_online_cpus();
> > > > > > > > > check a CPU is online;
> > > > > > > > > queue_work_on();
> > > > > > > > > put_online_cpus();
> > > > > > > > >
> > > > > > > > > but i suspect we do not want to do it :)
> > > > > > > >
> > > > > > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > > > > > might not be welcome at some point in the future. ;-)
> > > > > > > >
> > > > > > > > On top of this there are potential load-balancing concerns. By specifying
> > > > > > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > > > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > > > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > > > > > >
> > > > > > > Agree. Let's keep it as it is now :)
> > > > > >
> > > > > > I am not sure which "as it is now" you are referring to, but I suspect
> > > > > > that the -rt guys prefer two short interrupts-disabled regions to one
> > > > > > longer interrupts-disabled region.
> > > > >
> > > > > I mean to run schedule_delayed_work() under spinlock.
> > > >
> > > > Which is an interrupt-disabled spinlock, correct?
> > > >
> > > To do it under holding the lock, currently it is spinlock, but it is
> > > going to be(if you agree :)) raw ones, which keeps IRQs disabled. I
> > > saw Joel sent out patches.
> >
> > Then please move the schedule_delayed_work() and friends out from
> > under the spinlock. Unless Sebastian has some reason why extending
> > an interrupts-disabled critical section (and thus degrading real-time
> > latency) is somehow OK in this case.
> >
> Paul, if move outside of the lock we may introduce unneeded migration
> issues, plus it can introduce higher memory footprint(i have not tested).
> I have described it in more detail earlier in this mail thread. I do not
> think that waking up the work is an issue for RT from latency point of
> view. But let's ask Sebastian to confirm.
I was also a bit concerned about migration. If we moved it outside of lock,
then even on !PREEMPT_RT, we could be migrated before the work is
scheduled. Then we'd lose the benefit of executing the work on the same CPU
where it is queued. There's no migrate_disable() in non-PREEMPT_RT when I
recently checked as well :-\ (PeterZ mentioned that migrate_disable() is hard
to achieve on !PREEMPT_RT).
> Sebastian, do you think that placing a work on current CPU is an issue?
> If we do it under raw spinlock?
Yes, I am also curious if calling schedule_delayed_work can cause long
delays at all. Considering that workqueue code uses raw spinlocks as Mike
mentioned, I was under the impression that this code should not be causing
such issues, and the fact that it is called in many places from IRQ-disabled
sections as well.
Let us definitely double-check and discuss it more to be sure.
thanks,
- Joel
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 17:57 ` Joel Fernandes
@ 2020-04-20 18:13 ` Paul E. McKenney
0 siblings, 0 replies; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-20 18:13 UTC (permalink / raw)
To: Joel Fernandes
Cc: Uladzislau Rezki, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Mon, Apr 20, 2020 at 01:57:50PM -0400, Joel Fernandes wrote:
> On Mon, Apr 20, 2020 at 07:40:19PM +0200, Uladzislau Rezki wrote:
> > On Mon, Apr 20, 2020 at 10:21:26AM -0700, Paul E. McKenney wrote:
> [...]
> > > > > > > > > > <snip>
> > > > > > > > > > /**
> > > > > > > > > > * queue_work_on - queue work on specific cpu
> > > > > > > > > > * @cpu: CPU number to execute work on
> > > > > > > > > > * @wq: workqueue to use
> > > > > > > > > > * @work: work to queue
> > > > > > > > > > *
> > > > > > > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > > > > > > * can't go away.
> > > > > > > > > > *
> > > > > > > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > > > > > > */
> > > > > > > > > > <snip>
> > > > > > > > > >
> > > > > > > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > > > > > > we drop the lock we should do like:
> > > > > > > > > >
> > > > > > > > > > get_online_cpus();
> > > > > > > > > > check a CPU is online;
> > > > > > > > > > queue_work_on();
> > > > > > > > > > put_online_cpus();
> > > > > > > > > >
> > > > > > > > > > but i suspect we do not want to do it :)
> > > > > > > > >
> > > > > > > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > > > > > > might not be welcome at some point in the future. ;-)
> > > > > > > > >
> > > > > > > > > On top of this there are potential load-balancing concerns. By specifying
> > > > > > > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > > > > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > > > > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > > > > > > >
> > > > > > > > Agree. Let's keep it as it is now :)
> > > > > > >
> > > > > > > I am not sure which "as it is now" you are referring to, but I suspect
> > > > > > > that the -rt guys prefer two short interrupts-disabled regions to one
> > > > > > > longer interrupts-disabled region.
> > > > > >
> > > > > > I mean to run schedule_delayed_work() under spinlock.
> > > > >
> > > > > Which is an interrupt-disabled spinlock, correct?
> > > > >
> > > > To do it under holding the lock, currently it is spinlock, but it is
> > > > going to be(if you agree :)) raw ones, which keeps IRQs disabled. I
> > > > saw Joel sent out patches.
> > >
> > > Then please move the schedule_delayed_work() and friends out from
> > > under the spinlock. Unless Sebastian has some reason why extending
> > > an interrupts-disabled critical section (and thus degrading real-time
> > > latency) is somehow OK in this case.
> > >
> > Paul, if move outside of the lock we may introduce unneeded migration
> > issues, plus it can introduce higher memory footprint(i have not tested).
> > I have described it in more detail earlier in this mail thread. I do not
> > think that waking up the work is an issue for RT from latency point of
> > view. But let's ask Sebastian to confirm.
>
> I was also a bit concerned about migration. If we moved it outside of lock,
> then even on !PREEMPT_RT, we could be migrated before the work is
> scheduled. Then we'd lose the benefit of executing the work on the same CPU
> where it is queued. There's no migrate_disable() in non-PREEMPT_RT when I
> recently checked as well :-\ (PeterZ mentioned that migrate_disable() is hard
> to achieve on !PREEMPT_RT).
>
> > Sebastian, do you think that placing a work on current CPU is an issue?
> > If we do it under raw spinlock?
>
> Yes, I am also curious if calling schedule_delayed_work can cause long
> delays at all. Considering that workqueue code uses raw spinlocks as Mike
> mentioned, I was under the impression that this code should not be causing
> such issues, and the fact that it is called in many places from IRQ-disabled
> sections as well.
>
> Let us definitely double-check and discuss it more to be sure.
Just to be clear, I am not trying to NAK this approach. Yet, anyway.
Just trying to make sure that we think it through. Because it is easier
to get it right now than it ever will be in the future. ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 17:40 ` Uladzislau Rezki
2020-04-20 17:57 ` Joel Fernandes
@ 2020-04-20 17:59 ` Paul E. McKenney
2020-04-20 19:06 ` Uladzislau Rezki
2020-04-21 13:39 ` Sebastian Andrzej Siewior
2 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-20 17:59 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: Sebastian Andrzej Siewior, joel, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Mon, Apr 20, 2020 at 07:40:19PM +0200, Uladzislau Rezki wrote:
> On Mon, Apr 20, 2020 at 10:21:26AM -0700, Paul E. McKenney wrote:
> > On Mon, Apr 20, 2020 at 06:59:24PM +0200, Uladzislau Rezki wrote:
> > > On Mon, Apr 20, 2020 at 09:46:57AM -0700, Paul E. McKenney wrote:
> > > > On Mon, Apr 20, 2020 at 06:29:00PM +0200, Uladzislau Rezki wrote:
> > > > > On Mon, Apr 20, 2020 at 09:25:34AM -0700, Paul E. McKenney wrote:
> > > > > > On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> > > > > > > On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > > > > > > > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > > > > > > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > > > > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > > > > > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > > > > > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > > > > > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > > > > > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > > > > > > > >wrote:
> > > > > > > > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > > > > > > > >wrote:
> > > > > > > > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > > > > > > > >wrote:
> > > > > > > > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > > > > > > > >wrote:
> > > > > > > > > > >> > > > > > > > > > >
> > > > > > > > > > >> > > > > > > > > > > We might need different calling-context
> > > > > > > > > > >restrictions for the two variants
> > > > > > > > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > > > > > > > >with some sort of lockdep
> > > > > > > > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > > > > > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > > > > >> > > > > > > > > > This one will scream if you do
> > > > > > > > > > >> > > > > > > > > > raw_spin_lock();
> > > > > > > > > > >> > > > > > > > > > spin_lock();
> > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > > > > > > > >which needs to be
> > > > > > > > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > > > > > > > >do).
> > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > > > > > > > >series with
> > > > > > > > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > > > > > > > >migration before
> > > > > > > > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > > > > > > > >extra RT case (avoiding
> > > > > > > > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > > > > > > > >until we get there.
> > > > > > > > > > >> > > > > > > > >
> > > > > > > > > > >> > > > > > > > > I prefer something like the following to make it
> > > > > > > > > > >possible to invoke
> > > > > > > > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > > > > > > > >call_rcu() is already callable
> > > > > > > > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > > > > > > > >> > > > > > > >
> > > > > > > > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > > > > > > > >please delay this
> > > > > > > > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > > > > > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > > > > > > > >> > > > > > >
> > > > > > > > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > > > > > > > >a good idea. I'd
> > > > > > > > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > > > > > > > >be folks replacing
> > > > > > > > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > > > > > > > >example. If they were
> > > > > > > > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > > > > > > >> > > > > > >
> > > > > > > > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > > > > > > > >pages that Vlad is
> > > > > > > > > > >> > > > > > > planning on adding would further reduce the need for
> > > > > > > > > > >pages from the page
> > > > > > > > > > >> > > > > > > allocator.
> > > > > > > > > > >> > > > > > >
> > > > > > > > > > >> > > > > > > Paul, what is your opinion on this?
> > > > > > > > > > >> > > > > >
> > > > > > > > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > > > > > > > >specialization,
> > > > > > > > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > > > > > > > >interrupts
> > > > > > > > > > >> > > > > > disabled, and during early boot, as in even before
> > > > > > > > > > >rcu_init() has been
> > > > > > > > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > > > > > > > >spinlocks.
> > > > > > > > > > >> > > > > >
> > > > > > > > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > > > > > > > >spinlocks, we need
> > > > > > > > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > > > > > > > >possible.
> > > > > > > > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > > > > > > > >spinlocks held
> > > > > > > > > > >> > > > > > helps, but it is worth checking.
> > > > > > > > > > >> > > > > >
> > > > > > > > > > >> > > > > How about reducing the lock contention even further?
> > > > > > > > > > >> > > >
> > > > > > > > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > > > > > > > >under the
> > > > > > > > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > > > > > > > >the
> > > > > > > > > > >> > > > occasional spurious call to the work handler, but that should
> > > > > > > > > > >be rare
> > > > > > > > > > >> > > > and should be in the noise compared to the reduction in
> > > > > > > > > > >contention.
> > > > > > > > > > >> > >
> > > > > > > > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > > > > > > > >workqueue locks as
> > > > > > > > > > >> > > well :-(. I'm looking into it right now.
> > > > > > > > > > >> > >
> > > > > > > > > > >> > > /*
> > > > > > > > > > >> > > * If @work was previously on a different pool, it might
> > > > > > > > > > >still be
> > > > > > > > > > >> > > * running there, in which case the work needs to be
> > > > > > > > > > >queued on that
> > > > > > > > > > >> > > * pool to guarantee non-reentrancy.
> > > > > > > > > > >> > > */
> > > > > > > > > > >> > > last_pool = get_work_pool(work);
> > > > > > > > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > > > > > > > >> > > struct worker *worker;
> > > > > > > > > > >> > >
> > > > > > > > > > >> > > spin_lock(&last_pool->lock);
> > > > > > > > > > >> >
> > > > > > > > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > > > > > > > >Just took
> > > > > > > > > > >> > a good look and that's not an issue. However calling
> > > > > > > > > > >schedule_delayed_work()
> > > > > > > > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > > > > > > > >on
> > > > > > > > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > > > > > > > >pool->lock
> > > > > > > > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > > > > > > > >of:
> > > > > > > > > > >> >
> > > > > > > > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > > > > > > > >not
> > > > > > > > > > >> > acquire sleeping locks.
> > > > > > > > > > >> >
> > > > > > > > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > > > > > > > >Sebastian's
> > > > > > > > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > > > > > > > >> > spin_lock_irqsave()).
> > > > > > > > > > >> >
> > > > > > > > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > > > > > > > >mechanism.
> > > > > > > > > > >>
> > > > > > > > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > > > > > > > >> go well with a timer. This can of course be done conditionally.
> > > > > > > > > > >>
> > > > > > > > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > > > > > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > > > > > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > > > > > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > > > > > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks for confirming!!
> > > > > > > > > >
> > > > > > > > > > >If we do it outside, we will reduce a critical section, from the other
> > > > > > > > > > >hand we can introduce a potential delay in placing the context into
> > > > > > > > > > >CPUs
> > > > > > > > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > > > > > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > > > > > > > >be good to test and have a look at it actually.
> > > > > > > > > > >
> > > > > > > > > > >But it can be negligible :)
> > > > > > > > > >
> > > > > > > > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > > > > > > > >
> > > > > > > > > It should be OK, i do not expect to get noticeable latency for any RT
> > > > > > > > > workloads.
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >> > Any other thoughts?
> > > > > > > > > > >>
> > > > > > > > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > > > > > > > >any)
> > > > > > > > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > > > > > > > >>
> > > > > > > > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > > > > > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > > > > > > >we as a caller have to guarantee that a CPU we about to place a work
> > > > > > > > > > >is alive :)
> > > > > > > > > >
> > > > > > > > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > > > > > > > >
> > > > > > > > > <snip>
> > > > > > > > > /**
> > > > > > > > > * queue_work_on - queue work on specific cpu
> > > > > > > > > * @cpu: CPU number to execute work on
> > > > > > > > > * @wq: workqueue to use
> > > > > > > > > * @work: work to queue
> > > > > > > > > *
> > > > > > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > > > > > * can't go away.
> > > > > > > > > *
> > > > > > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > > > > > */
> > > > > > > > > <snip>
> > > > > > > > >
> > > > > > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > > > > > we drop the lock we should do like:
> > > > > > > > >
> > > > > > > > > get_online_cpus();
> > > > > > > > > check a CPU is online;
> > > > > > > > > queue_work_on();
> > > > > > > > > put_online_cpus();
> > > > > > > > >
> > > > > > > > > but i suspect we do not want to do it :)
> > > > > > > >
> > > > > > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > > > > > might not be welcome at some point in the future. ;-)
> > > > > > > >
> > > > > > > > On top of this there are potential load-balancing concerns. By specifying
> > > > > > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > > > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > > > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > > > > > >
> > > > > > > Agree. Let's keep it as it is now :)
> > > > > >
> > > > > > I am not sure which "as it is now" you are referring to, but I suspect
> > > > > > that the -rt guys prefer two short interrupts-disabled regions to one
> > > > > > longer interrupts-disabled region.
> > > > >
> > > > > I mean to run schedule_delayed_work() under spinlock.
> > > >
> > > > Which is an interrupt-disabled spinlock, correct?
> > > >
> > > To do it under holding the lock, currently it is spinlock, but it is
> > > going to be(if you agree :)) raw ones, which keeps IRQs disabled. I
> > > saw Joel sent out patches.
> >
> > Then please move the schedule_delayed_work() and friends out from
> > under the spinlock. Unless Sebastian has some reason why extending
> > an interrupts-disabled critical section (and thus degrading real-time
> > latency) is somehow OK in this case.
> >
> Paul, if move outside of the lock we may introduce unneeded migration
> issues, plus it can introduce higher memory footprint(i have not tested).
> I have described it in more detail earlier in this mail thread. I do not
> think that waking up the work is an issue for RT from latency point of
> view. But let's ask Sebastian to confirm.
>
> Sebastian, do you think that placing a work on current CPU is an issue?
> If we do it under raw spinlock?
We really are talking past each other, aren't we? ;-)
My concern is lengthening the duration of the critical section by having
the extra work-queuing execution within it. As in leave the workqueue
free to migrate, but invoke it after releasing the lock.
Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 17:59 ` Paul E. McKenney
@ 2020-04-20 19:06 ` Uladzislau Rezki
2020-04-20 20:17 ` Uladzislau Rezki
0 siblings, 1 reply; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-20 19:06 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, Sebastian Andrzej Siewior, joel,
Steven Rostedt, rcu, Josh Triplett, Mathieu Desnoyers,
Lai Jiangshan, Thomas Gleixner, Mike Galbraith
On Mon, Apr 20, 2020 at 10:59:15AM -0700, Paul E. McKenney wrote:
> On Mon, Apr 20, 2020 at 07:40:19PM +0200, Uladzislau Rezki wrote:
> > On Mon, Apr 20, 2020 at 10:21:26AM -0700, Paul E. McKenney wrote:
> > > On Mon, Apr 20, 2020 at 06:59:24PM +0200, Uladzislau Rezki wrote:
> > > > On Mon, Apr 20, 2020 at 09:46:57AM -0700, Paul E. McKenney wrote:
> > > > > On Mon, Apr 20, 2020 at 06:29:00PM +0200, Uladzislau Rezki wrote:
> > > > > > On Mon, Apr 20, 2020 at 09:25:34AM -0700, Paul E. McKenney wrote:
> > > > > > > On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> > > > > > > > On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > > > > > > > > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > > > > > > > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > > > > > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > > > > > > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > > > > > > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > > > > > > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > > > > > > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > > > > > > > > >wrote:
> > > > > > > > > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > > > > > > > > >wrote:
> > > > > > > > > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > > > > > > > > >wrote:
> > > > > > > > > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > > > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > > > > > > > > >wrote:
> > > > > > > > > > > >> > > > > > > > > > >
> > > > > > > > > > > >> > > > > > > > > > > We might need different calling-context
> > > > > > > > > > > >restrictions for the two variants
> > > > > > > > > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > > > > > > > > >with some sort of lockdep
> > > > > > > > > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > > > > > > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > > > > > >> > > > > > > > > > This one will scream if you do
> > > > > > > > > > > >> > > > > > > > > > raw_spin_lock();
> > > > > > > > > > > >> > > > > > > > > > spin_lock();
> > > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > > > > > > > > >which needs to be
> > > > > > > > > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > > > > > > > > >do).
> > > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > > > > > > > > >series with
> > > > > > > > > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > > > > > > > > >migration before
> > > > > > > > > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > > > > > > > > >extra RT case (avoiding
> > > > > > > > > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > > > > > > > > >until we get there.
> > > > > > > > > > > >> > > > > > > > >
> > > > > > > > > > > >> > > > > > > > > I prefer something like the following to make it
> > > > > > > > > > > >possible to invoke
> > > > > > > > > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > > > > > > > > >call_rcu() is already callable
> > > > > > > > > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > > > > > > > > >> > > > > > > >
> > > > > > > > > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > > > > > > > > >please delay this
> > > > > > > > > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > > > > > > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > > > > > > > > >> > > > > > >
> > > > > > > > > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > > > > > > > > >a good idea. I'd
> > > > > > > > > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > > > > > > > > >be folks replacing
> > > > > > > > > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > > > > > > > > >example. If they were
> > > > > > > > > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > > > > > > > >> > > > > > >
> > > > > > > > > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > > > > > > > > >pages that Vlad is
> > > > > > > > > > > >> > > > > > > planning on adding would further reduce the need for
> > > > > > > > > > > >pages from the page
> > > > > > > > > > > >> > > > > > > allocator.
> > > > > > > > > > > >> > > > > > >
> > > > > > > > > > > >> > > > > > > Paul, what is your opinion on this?
> > > > > > > > > > > >> > > > > >
> > > > > > > > > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > > > > > > > > >specialization,
> > > > > > > > > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > > > > > > > > >interrupts
> > > > > > > > > > > >> > > > > > disabled, and during early boot, as in even before
> > > > > > > > > > > >rcu_init() has been
> > > > > > > > > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > > > > > > > > >spinlocks.
> > > > > > > > > > > >> > > > > >
> > > > > > > > > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > > > > > > > > >spinlocks, we need
> > > > > > > > > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > > > > > > > > >possible.
> > > > > > > > > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > > > > > > > > >spinlocks held
> > > > > > > > > > > >> > > > > > helps, but it is worth checking.
> > > > > > > > > > > >> > > > > >
> > > > > > > > > > > >> > > > > How about reducing the lock contention even further?
> > > > > > > > > > > >> > > >
> > > > > > > > > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > > > > > > > > >under the
> > > > > > > > > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > > > > > > > > >the
> > > > > > > > > > > >> > > > occasional spurious call to the work handler, but that should
> > > > > > > > > > > >be rare
> > > > > > > > > > > >> > > > and should be in the noise compared to the reduction in
> > > > > > > > > > > >contention.
> > > > > > > > > > > >> > >
> > > > > > > > > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > > > > > > > > >workqueue locks as
> > > > > > > > > > > >> > > well :-(. I'm looking into it right now.
> > > > > > > > > > > >> > >
> > > > > > > > > > > >> > > /*
> > > > > > > > > > > >> > > * If @work was previously on a different pool, it might
> > > > > > > > > > > >still be
> > > > > > > > > > > >> > > * running there, in which case the work needs to be
> > > > > > > > > > > >queued on that
> > > > > > > > > > > >> > > * pool to guarantee non-reentrancy.
> > > > > > > > > > > >> > > */
> > > > > > > > > > > >> > > last_pool = get_work_pool(work);
> > > > > > > > > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > > > > > > > > >> > > struct worker *worker;
> > > > > > > > > > > >> > >
> > > > > > > > > > > >> > > spin_lock(&last_pool->lock);
> > > > > > > > > > > >> >
> > > > > > > > > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > > > > > > > > >Just took
> > > > > > > > > > > >> > a good look and that's not an issue. However calling
> > > > > > > > > > > >schedule_delayed_work()
> > > > > > > > > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > > > > > > > > >on
> > > > > > > > > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > > > > > > > > >pool->lock
> > > > > > > > > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > > > > > > > > >of:
> > > > > > > > > > > >> >
> > > > > > > > > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > > > > > > > > >not
> > > > > > > > > > > >> > acquire sleeping locks.
> > > > > > > > > > > >> >
> > > > > > > > > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > > > > > > > > >Sebastian's
> > > > > > > > > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > > > > > > > > >> > spin_lock_irqsave()).
> > > > > > > > > > > >> >
> > > > > > > > > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > > > > > > > > >mechanism.
> > > > > > > > > > > >>
> > > > > > > > > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > > > > > > > > >> go well with a timer. This can of course be done conditionally.
> > > > > > > > > > > >>
> > > > > > > > > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > > > > > > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > > > > > > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > > > > > > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > > > > > > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Thanks for confirming!!
> > > > > > > > > > >
> > > > > > > > > > > >If we do it outside, we will reduce a critical section, from the other
> > > > > > > > > > > >hand we can introduce a potential delay in placing the context into
> > > > > > > > > > > >CPUs
> > > > > > > > > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > > > > > > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > > > > > > > > >be good to test and have a look at it actually.
> > > > > > > > > > > >
> > > > > > > > > > > >But it can be negligible :)
> > > > > > > > > > >
> > > > > > > > > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > > > > > > > > >
> > > > > > > > > > It should be OK, i do not expect to get noticeable latency for any RT
> > > > > > > > > > workloads.
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >> > Any other thoughts?
> > > > > > > > > > > >>
> > > > > > > > > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > > > > > > > > >any)
> > > > > > > > > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > > > > > > > > >>
> > > > > > > > > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > > > > > > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > > > > > > > >we as a caller have to guarantee that a CPU we about to place a work
> > > > > > > > > > > >is alive :)
> > > > > > > > > > >
> > > > > > > > > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > > > > > > > > >
> > > > > > > > > > <snip>
> > > > > > > > > > /**
> > > > > > > > > > * queue_work_on - queue work on specific cpu
> > > > > > > > > > * @cpu: CPU number to execute work on
> > > > > > > > > > * @wq: workqueue to use
> > > > > > > > > > * @work: work to queue
> > > > > > > > > > *
> > > > > > > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > > > > > > * can't go away.
> > > > > > > > > > *
> > > > > > > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > > > > > > */
> > > > > > > > > > <snip>
> > > > > > > > > >
> > > > > > > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > > > > > > we drop the lock we should do like:
> > > > > > > > > >
> > > > > > > > > > get_online_cpus();
> > > > > > > > > > check a CPU is online;
> > > > > > > > > > queue_work_on();
> > > > > > > > > > put_online_cpus();
> > > > > > > > > >
> > > > > > > > > > but i suspect we do not want to do it :)
> > > > > > > > >
> > > > > > > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > > > > > > might not be welcome at some point in the future. ;-)
> > > > > > > > >
> > > > > > > > > On top of this there are potential load-balancing concerns. By specifying
> > > > > > > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > > > > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > > > > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > > > > > > >
> > > > > > > > Agree. Let's keep it as it is now :)
> > > > > > >
> > > > > > > I am not sure which "as it is now" you are referring to, but I suspect
> > > > > > > that the -rt guys prefer two short interrupts-disabled regions to one
> > > > > > > longer interrupts-disabled region.
> > > > > >
> > > > > > I mean to run schedule_delayed_work() under spinlock.
> > > > >
> > > > > Which is an interrupt-disabled spinlock, correct?
> > > > >
> > > > To do it under holding the lock, currently it is spinlock, but it is
> > > > going to be(if you agree :)) raw ones, which keeps IRQs disabled. I
> > > > saw Joel sent out patches.
> > >
> > > Then please move the schedule_delayed_work() and friends out from
> > > under the spinlock. Unless Sebastian has some reason why extending
> > > an interrupts-disabled critical section (and thus degrading real-time
> > > latency) is somehow OK in this case.
> > >
> > Paul, if move outside of the lock we may introduce unneeded migration
> > issues, plus it can introduce higher memory footprint(i have not tested).
> > I have described it in more detail earlier in this mail thread. I do not
> > think that waking up the work is an issue for RT from latency point of
> > view. But let's ask Sebastian to confirm.
> >
> > Sebastian, do you think that placing a work on current CPU is an issue?
> > If we do it under raw spinlock?
>
> We really are talking past each other, aren't we? ;-)
>
Let's hear each other better then :)
>
> My concern is lengthening the duration of the critical section by having
> the extra work-queuing execution within it. As in leave the workqueue
> free to migrate, but invoke it after releasing the lock.
>
I totally understand your concern and "understood it before" when you proposed
to do it outside of the lock. That is for sure will reduce the critical section.
It is up to you anyway.
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 19:06 ` Uladzislau Rezki
@ 2020-04-20 20:17 ` Uladzislau Rezki
2020-04-20 22:16 ` Paul E. McKenney
2020-04-21 1:22 ` Steven Rostedt
0 siblings, 2 replies; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-20 20:17 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Paul E. McKenney, Sebastian Andrzej Siewior, joel,
Steven Rostedt, rcu, Josh Triplett, Mathieu Desnoyers,
Lai Jiangshan, Thomas Gleixner, Mike Galbraith
On Mon, Apr 20, 2020 at 09:06:50PM +0200, Uladzislau Rezki wrote:
> On Mon, Apr 20, 2020 at 10:59:15AM -0700, Paul E. McKenney wrote:
> > On Mon, Apr 20, 2020 at 07:40:19PM +0200, Uladzislau Rezki wrote:
> > > On Mon, Apr 20, 2020 at 10:21:26AM -0700, Paul E. McKenney wrote:
> > > > On Mon, Apr 20, 2020 at 06:59:24PM +0200, Uladzislau Rezki wrote:
> > > > > On Mon, Apr 20, 2020 at 09:46:57AM -0700, Paul E. McKenney wrote:
> > > > > > On Mon, Apr 20, 2020 at 06:29:00PM +0200, Uladzislau Rezki wrote:
> > > > > > > On Mon, Apr 20, 2020 at 09:25:34AM -0700, Paul E. McKenney wrote:
> > > > > > > > On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> > > > > > > > > On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > > > > > > > > > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > > > > > > > > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > > > > > > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > > > > > > > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > > > > > > > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > > > > > > > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > > > > > > > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > > > > > > > > > >wrote:
> > > > > > > > > > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > > > > > > > > > >wrote:
> > > > > > > > > > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > > > > > > > > > >wrote:
> > > > > > > > > > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > > > > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > > > > > > > > > >wrote:
> > > > > > > > > > > > >> > > > > > > > > > >
> > > > > > > > > > > > >> > > > > > > > > > > We might need different calling-context
> > > > > > > > > > > > >restrictions for the two variants
> > > > > > > > > > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > > > > > > > > > >with some sort of lockdep
> > > > > > > > > > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > > > > > > > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > > > > > > >> > > > > > > > > > This one will scream if you do
> > > > > > > > > > > > >> > > > > > > > > > raw_spin_lock();
> > > > > > > > > > > > >> > > > > > > > > > spin_lock();
> > > > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > > > > > > > > > >which needs to be
> > > > > > > > > > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > > > > > > > > > >do).
> > > > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > > > > > > > > > >series with
> > > > > > > > > > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > > > > > > > > > >migration before
> > > > > > > > > > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > > > > > > > > > >extra RT case (avoiding
> > > > > > > > > > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > > > > > > > > > >until we get there.
> > > > > > > > > > > > >> > > > > > > > >
> > > > > > > > > > > > >> > > > > > > > > I prefer something like the following to make it
> > > > > > > > > > > > >possible to invoke
> > > > > > > > > > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > > > > > > > > > >call_rcu() is already callable
> > > > > > > > > > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > > > > > > > > > >> > > > > > > >
> > > > > > > > > > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > > > > > > > > > >please delay this
> > > > > > > > > > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > > > > > > > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > > > > > > > > > >> > > > > > >
> > > > > > > > > > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > > > > > > > > > >a good idea. I'd
> > > > > > > > > > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > > > > > > > > > >be folks replacing
> > > > > > > > > > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > > > > > > > > > >example. If they were
> > > > > > > > > > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > > > > > > > > >> > > > > > >
> > > > > > > > > > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > > > > > > > > > >pages that Vlad is
> > > > > > > > > > > > >> > > > > > > planning on adding would further reduce the need for
> > > > > > > > > > > > >pages from the page
> > > > > > > > > > > > >> > > > > > > allocator.
> > > > > > > > > > > > >> > > > > > >
> > > > > > > > > > > > >> > > > > > > Paul, what is your opinion on this?
> > > > > > > > > > > > >> > > > > >
> > > > > > > > > > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > > > > > > > > > >specialization,
> > > > > > > > > > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > > > > > > > > > >interrupts
> > > > > > > > > > > > >> > > > > > disabled, and during early boot, as in even before
> > > > > > > > > > > > >rcu_init() has been
> > > > > > > > > > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > > > > > > > > > >spinlocks.
> > > > > > > > > > > > >> > > > > >
> > > > > > > > > > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > > > > > > > > > >spinlocks, we need
> > > > > > > > > > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > > > > > > > > > >possible.
> > > > > > > > > > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > > > > > > > > > >spinlocks held
> > > > > > > > > > > > >> > > > > > helps, but it is worth checking.
> > > > > > > > > > > > >> > > > > >
> > > > > > > > > > > > >> > > > > How about reducing the lock contention even further?
> > > > > > > > > > > > >> > > >
> > > > > > > > > > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > > > > > > > > > >under the
> > > > > > > > > > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > > > > > > > > > >the
> > > > > > > > > > > > >> > > > occasional spurious call to the work handler, but that should
> > > > > > > > > > > > >be rare
> > > > > > > > > > > > >> > > > and should be in the noise compared to the reduction in
> > > > > > > > > > > > >contention.
> > > > > > > > > > > > >> > >
> > > > > > > > > > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > > > > > > > > > >workqueue locks as
> > > > > > > > > > > > >> > > well :-(. I'm looking into it right now.
> > > > > > > > > > > > >> > >
> > > > > > > > > > > > >> > > /*
> > > > > > > > > > > > >> > > * If @work was previously on a different pool, it might
> > > > > > > > > > > > >still be
> > > > > > > > > > > > >> > > * running there, in which case the work needs to be
> > > > > > > > > > > > >queued on that
> > > > > > > > > > > > >> > > * pool to guarantee non-reentrancy.
> > > > > > > > > > > > >> > > */
> > > > > > > > > > > > >> > > last_pool = get_work_pool(work);
> > > > > > > > > > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > > > > > > > > > >> > > struct worker *worker;
> > > > > > > > > > > > >> > >
> > > > > > > > > > > > >> > > spin_lock(&last_pool->lock);
> > > > > > > > > > > > >> >
> > > > > > > > > > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > > > > > > > > > >Just took
> > > > > > > > > > > > >> > a good look and that's not an issue. However calling
> > > > > > > > > > > > >schedule_delayed_work()
> > > > > > > > > > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > > > > > > > > > >on
> > > > > > > > > > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > > > > > > > > > >pool->lock
> > > > > > > > > > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > > > > > > > > > >of:
> > > > > > > > > > > > >> >
> > > > > > > > > > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > > > > > > > > > >not
> > > > > > > > > > > > >> > acquire sleeping locks.
> > > > > > > > > > > > >> >
> > > > > > > > > > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > > > > > > > > > >Sebastian's
> > > > > > > > > > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > > > > > > > > > >> > spin_lock_irqsave()).
> > > > > > > > > > > > >> >
> > > > > > > > > > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > > > > > > > > > >mechanism.
> > > > > > > > > > > > >>
> > > > > > > > > > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > > > > > > > > > >> go well with a timer. This can of course be done conditionally.
> > > > > > > > > > > > >>
> > > > > > > > > > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > > > > > > > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > > > > > > > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > > > > > > > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > > > > > > > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks for confirming!!
> > > > > > > > > > > >
> > > > > > > > > > > > >If we do it outside, we will reduce a critical section, from the other
> > > > > > > > > > > > >hand we can introduce a potential delay in placing the context into
> > > > > > > > > > > > >CPUs
> > > > > > > > > > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > > > > > > > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > > > > > > > > > >be good to test and have a look at it actually.
> > > > > > > > > > > > >
> > > > > > > > > > > > >But it can be negligible :)
> > > > > > > > > > > >
> > > > > > > > > > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > > > > > > > > > >
> > > > > > > > > > > It should be OK, i do not expect to get noticeable latency for any RT
> > > > > > > > > > > workloads.
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >> > Any other thoughts?
> > > > > > > > > > > > >>
> > > > > > > > > > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > > > > > > > > > >any)
> > > > > > > > > > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > > > > > > > > > >>
> > > > > > > > > > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > > > > > > > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > > > > > > > > >we as a caller have to guarantee that a CPU we are about to place a work
> > > > > > > > > > > > >is alive :)
> > > > > > > > > > > >
> > > > > > > > > > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > > > > > > > > > >
> > > > > > > > > > > <snip>
> > > > > > > > > > > /**
> > > > > > > > > > > * queue_work_on - queue work on specific cpu
> > > > > > > > > > > * @cpu: CPU number to execute work on
> > > > > > > > > > > * @wq: workqueue to use
> > > > > > > > > > > * @work: work to queue
> > > > > > > > > > > *
> > > > > > > > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > > > > > > > * can't go away.
> > > > > > > > > > > *
> > > > > > > > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > > > > > > > */
> > > > > > > > > > > <snip>
> > > > > > > > > > >
> > > > > > > > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > > > > > > > we drop the lock we should do like:
> > > > > > > > > > >
> > > > > > > > > > > get_online_cpus();
> > > > > > > > > > > check a CPU is online;
> > > > > > > > > > > queue_work_on();
> > > > > > > > > > > put_online_cpus();
> > > > > > > > > > >
> > > > > > > > > > > but i suspect we do not want to do it :)
> > > > > > > > > >
> > > > > > > > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > > > > > > > might not be welcome at some point in the future. ;-)
> > > > > > > > > >
> > > > > > > > > > On top of this there are potential load-balancing concerns. By specifying
> > > > > > > > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > > > > > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > > > > > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > > > > > > > >
> > > > > > > > > Agree. Let's keep it as it is now :)
> > > > > > > >
> > > > > > > > I am not sure which "as it is now" you are referring to, but I suspect
> > > > > > > > that the -rt guys prefer two short interrupts-disabled regions to one
> > > > > > > > longer interrupts-disabled region.
> > > > > > >
> > > > > > > I mean to run schedule_delayed_work() under spinlock.
> > > > > >
> > > > > > Which is an interrupt-disabled spinlock, correct?
> > > > > >
> > > > > To do it under holding the lock, currently it is spinlock, but it is
> > > > > going to be(if you agree :)) raw ones, which keeps IRQs disabled. I
> > > > > saw Joel sent out patches.
> > > >
> > > > Then please move the schedule_delayed_work() and friends out from
> > > > under the spinlock. Unless Sebastian has some reason why extending
> > > > an interrupts-disabled critical section (and thus degrading real-time
> > > > latency) is somehow OK in this case.
> > > >
> > > Paul, if move outside of the lock we may introduce unneeded migration
> > > issues, plus it can introduce higher memory footprint(i have not tested).
> > > I have described it in more detail earlier in this mail thread. I do not
> > > think that waking up the work is an issue for RT from latency point of
> > > view. But let's ask Sebastian to confirm.
> > >
> > > Sebastian, do you think that placing a work on current CPU is an issue?
> > > If we do it under raw spinlock?
> >
> > We really are talking past each other, aren't we? ;-)
> >
> Let's hear each other better then :)
>
> >
> > My concern is lengthening the duration of the critical section by having
> > the extra work-queuing execution within it. As in leave the workqueue
> > free to migrate, but invoke it after releasing the lock.
> >
Paul, i have just measured the time duration of the schedule_delayed_work().
To do that i used below patch:
<snip>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 02f73f7bbd40..f74ae0f3556e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3232,6 +3232,12 @@ static inline struct rcu_head *attach_rcu_head_to_object(void *obj)
return ((struct rcu_head *) ++ptr);
}
+static void noinline
+measure_schedule_delayed_work(struct kfree_rcu_cpu *krcp)
+{
+ schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+}
+
/*
* Queue a request for lazy invocation of appropriate free routine after a
* grace period. Please note there are three paths are maintained, two are the
@@ -3327,8 +3333,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!krcp->monitor_todo) {
krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work,
- expedited_drain ? 0 : KFREE_DRAIN_JIFFIES);
+ measure_schedule_delayed_work(krcp);
}
<snip>
i have done it for not CONFIG_PREEMPT_RT kernel, i do not have any RT configuration.
I run rcuperf to apply the load to see the time taken by the actual placing of the work,
i.e. the time taken by schedule_delayed_work():
<snip>
root@pc636:/sys/kernel/debug/tracing# cat trace
# tracer: function_graph
#
# function_graph latency trace v1.1.5 on 5.6.0-rc6+
# --------------------------------------------------------------------
# latency: 0 us, #16/16, CPU#0 | (M:server VP:0, KP:0, SP:0 HP:0 #P:4)
# -----------------
# | task: -0 (uid:0 nice:0 policy:0 rt_prio:0)
# -----------------
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| /
# TIME CPU TASK/PID |||| DURATION FUNCTION CALLS
# | | | | |||| | | | | | |
682.384653 | 1) <idle>-0 | d.s. | 5.329 us | } /* measure_schedule_delayed_work.constprop.86 */
685.374654 | 2) <idle>-0 | d.s. | 5.392 us | } /* measure_schedule_delayed_work.constprop.86 */
700.304647 | 2) <idle>-0 | d.s. | 5.650 us | } /* measure_schedule_delayed_work.constprop.86 */
710.331280 | 3) <idle>-0 | d.s. | 5.145 us | } /* measure_schedule_delayed_work.constprop.86 */
714.387943 | 1) <idle>-0 | d.s. | 9.986 us | } /* measure_schedule_delayed_work.constprop.86 */
720.251229 | 0) <idle>-0 | d.s. | 5.292 us | } /* measure_schedule_delayed_work.constprop.86 */
725.211208 | 2) <idle>-0 | d.s. | 5.295 us | } /* measure_schedule_delayed_work.constprop.86 */
731.847845 | 1) <idle>-0 | d.s. | 5.048 us | } /* measure_schedule_delayed_work.constprop.86 */
736.357802 | 2) <idle>-0 | d.s. | 5.134 us | } /* measure_schedule_delayed_work.constprop.86 */
738.287785 | 1) <idle>-0 | d.s. | 5.863 us | } /* measure_schedule_delayed_work.constprop.86 */
742.214431 | 1) <idle>-0 | d.s. | 5.202 us | } /* measure_schedule_delayed_work.constprop.86 */
759.844264 | 2) <idle>-0 | d.s. | 5.375 us | } /* measure_schedule_delayed_work.constprop.86 */
764.304218 | 1) <idle>-0 | d.s. | 5.650 us | } /* measure_schedule_delayed_work.constprop.86 */
766.224204 | 3) <idle>-0 | d.s. | 5.015 us | } /* measure_schedule_delayed_work.constprop.86 */
772.410794 | 1) <idle>-0 | d.s. | 5.061 us | } /* measure_schedule_delayed_work.constprop.86 */
781.370691 | 1) <idle>-0 | d.s. | 5.165 us | } /* measure_schedule_delayed_work.constprop.86 */
root@pc636:/sys/kernel/debug/tracing# cat tracing_thresh
5
root@pc636:/sys/kernel/debug/tracing#
<snip>
--
Vlad Rezki
^ permalink raw reply related [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 20:17 ` Uladzislau Rezki
@ 2020-04-20 22:16 ` Paul E. McKenney
2020-04-21 1:22 ` Steven Rostedt
1 sibling, 0 replies; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-20 22:16 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: Sebastian Andrzej Siewior, joel, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Mon, Apr 20, 2020 at 10:17:23PM +0200, Uladzislau Rezki wrote:
> On Mon, Apr 20, 2020 at 09:06:50PM +0200, Uladzislau Rezki wrote:
> > On Mon, Apr 20, 2020 at 10:59:15AM -0700, Paul E. McKenney wrote:
> > > On Mon, Apr 20, 2020 at 07:40:19PM +0200, Uladzislau Rezki wrote:
> > > > On Mon, Apr 20, 2020 at 10:21:26AM -0700, Paul E. McKenney wrote:
> > > > > On Mon, Apr 20, 2020 at 06:59:24PM +0200, Uladzislau Rezki wrote:
> > > > > > On Mon, Apr 20, 2020 at 09:46:57AM -0700, Paul E. McKenney wrote:
> > > > > > > On Mon, Apr 20, 2020 at 06:29:00PM +0200, Uladzislau Rezki wrote:
> > > > > > > > On Mon, Apr 20, 2020 at 09:25:34AM -0700, Paul E. McKenney wrote:
> > > > > > > > > On Mon, Apr 20, 2020 at 06:08:47PM +0200, Uladzislau Rezki wrote:
> > > > > > > > > > On Mon, Apr 20, 2020 at 06:26:01AM -0700, Paul E. McKenney wrote:
> > > > > > > > > > > On Mon, Apr 20, 2020 at 03:00:03PM +0200, Uladzislau Rezki wrote:
> > > > > > > > > > > > On Mon, Apr 20, 2020 at 08:36:31AM -0400, joel@joelfernandes.org wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > On April 20, 2020 8:13:16 AM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > > > > > > > > > > >On Sun, Apr 19, 2020 at 06:44:50PM -0700, Paul E. McKenney wrote:
> > > > > > > > > > > > > >> On Sun, Apr 19, 2020 at 09:17:49PM -0400, Joel Fernandes wrote:
> > > > > > > > > > > > > >> > On Sun, Apr 19, 2020 at 08:27:13PM -0400, Joel Fernandes wrote:
> > > > > > > > > > > > > >> > > On Sun, Apr 19, 2020 at 07:58:36AM -0700, Paul E. McKenney wrote:
> > > > > > > > > > > > > >> > > > On Sat, Apr 18, 2020 at 02:37:48PM +0200, Uladzislau Rezki
> > > > > > > > > > > > > >wrote:
> > > > > > > > > > > > > >> > > > > On Fri, Apr 17, 2020 at 11:54:49AM -0700, Paul E. McKenney
> > > > > > > > > > > > > >wrote:
> > > > > > > > > > > > > >> > > > > > On Fri, Apr 17, 2020 at 02:26:41PM -0400, Joel Fernandes
> > > > > > > > > > > > > >wrote:
> > > > > > > > > > > > > >> > > > > > > On Fri, Apr 17, 2020 at 05:04:42PM +0200, Sebastian
> > > > > > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > > > > > >> > > > > > > > On 2020-04-16 23:05:15 [-0400], Joel Fernandes wrote:
> > > > > > > > > > > > > >> > > > > > > > > On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian
> > > > > > > > > > > > > >Andrzej Siewior wrote:
> > > > > > > > > > > > > >> > > > > > > > > > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney
> > > > > > > > > > > > > >wrote:
> > > > > > > > > > > > > >> > > > > > > > > > >
> > > > > > > > > > > > > >> > > > > > > > > > > We might need different calling-context
> > > > > > > > > > > > > >restrictions for the two variants
> > > > > > > > > > > > > >> > > > > > > > > > > of kfree_rcu(). And we might need to come up
> > > > > > > > > > > > > >with some sort of lockdep
> > > > > > > > > > > > > >> > > > > > > > > > > check for "safe to use normal spinlock in -rt".
> > > > > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > > > > >> > > > > > > > > > Oh. We do have this already, it is called
> > > > > > > > > > > > > >CONFIG_PROVE_RAW_LOCK_NESTING.
> > > > > > > > > > > > > >> > > > > > > > > > This one will scream if you do
> > > > > > > > > > > > > >> > > > > > > > > > raw_spin_lock();
> > > > > > > > > > > > > >> > > > > > > > > > spin_lock();
> > > > > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > > > > >> > > > > > > > > > Sadly, as of today, there is code triggering this
> > > > > > > > > > > > > >which needs to be
> > > > > > > > > > > > > >> > > > > > > > > > addressed first (but it is one list of things to
> > > > > > > > > > > > > >do).
> > > > > > > > > > > > > >> > > > > > > > > >
> > > > > > > > > > > > > >> > > > > > > > > > Given the thread so far, is it okay if I repost the
> > > > > > > > > > > > > >series with
> > > > > > > > > > > > > >> > > > > > > > > > migrate_disable() instead of accepting a possible
> > > > > > > > > > > > > >migration before
> > > > > > > > > > > > > >> > > > > > > > > > grabbing the lock? I would prefer to avoid the
> > > > > > > > > > > > > >extra RT case (avoiding
> > > > > > > > > > > > > >> > > > > > > > > > memory allocations in a possible atomic context)
> > > > > > > > > > > > > >until we get there.
> > > > > > > > > > > > > >> > > > > > > > >
> > > > > > > > > > > > > >> > > > > > > > > I prefer something like the following to make it
> > > > > > > > > > > > > >possible to invoke
> > > > > > > > > > > > > >> > > > > > > > > kfree_rcu() from atomic context considering
> > > > > > > > > > > > > >call_rcu() is already callable
> > > > > > > > > > > > > >> > > > > > > > > from such contexts. Thoughts?
> > > > > > > > > > > > > >> > > > > > > >
> > > > > > > > > > > > > >> > > > > > > > So it looks like it would work. However, could we
> > > > > > > > > > > > > >please delay this
> > > > > > > > > > > > > >> > > > > > > > until we have an actual case on RT? I just added
> > > > > > > > > > > > > >> > > > > > > > WARN_ON(!preemptible());
> > > > > > > > > > > > > >> > > > > > >
> > > > > > > > > > > > > >> > > > > > > I am not sure if waiting for it to break in the future is
> > > > > > > > > > > > > >a good idea. I'd
> > > > > > > > > > > > > >> > > > > > > rather design it in a forward thinking way. There could
> > > > > > > > > > > > > >be folks replacing
> > > > > > > > > > > > > >> > > > > > > "call_rcu() + kfree in a callback" with kfree_rcu() for
> > > > > > > > > > > > > >example. If they were
> > > > > > > > > > > > > >> > > > > > > in !preemptible(), we'd break on page allocation.
> > > > > > > > > > > > > >> > > > > > >
> > > > > > > > > > > > > >> > > > > > > Also as a sidenote, the additional pre-allocation of
> > > > > > > > > > > > > >pages that Vlad is
> > > > > > > > > > > > > >> > > > > > > planning on adding would further reduce the need for
> > > > > > > > > > > > > >pages from the page
> > > > > > > > > > > > > >> > > > > > > allocator.
> > > > > > > > > > > > > >> > > > > > >
> > > > > > > > > > > > > >> > > > > > > Paul, what is your opinion on this?
> > > > > > > > > > > > > >> > > > > >
> > > > > > > > > > > > > >> > > > > > My experience with call_rcu(), of which kfree_rcu() is a
> > > > > > > > > > > > > >specialization,
> > > > > > > > > > > > > >> > > > > > is that it gets invoked with preemption disabled, with
> > > > > > > > > > > > > >interrupts
> > > > > > > > > > > > > >> > > > > > disabled, and during early boot, as in even before
> > > > > > > > > > > > > >rcu_init() has been
> > > > > > > > > > > > > >> > > > > > invoked. This experience does make me lean towards raw
> > > > > > > > > > > > > >spinlocks.
> > > > > > > > > > > > > >> > > > > >
> > > > > > > > > > > > > >> > > > > > But to Sebastian's point, if we are going to use raw
> > > > > > > > > > > > > >spinlocks, we need
> > > > > > > > > > > > > >> > > > > > to keep the code paths holding those spinlocks as short as
> > > > > > > > > > > > > >possible.
> > > > > > > > > > > > > >> > > > > > I suppose that the inability to allocate memory with raw
> > > > > > > > > > > > > >spinlocks held
> > > > > > > > > > > > > >> > > > > > helps, but it is worth checking.
> > > > > > > > > > > > > >> > > > > >
> > > > > > > > > > > > > >> > > > > How about reducing the lock contention even further?
> > > > > > > > > > > > > >> > > >
> > > > > > > > > > > > > >> > > > Can we do even better by moving the work-scheduling out from
> > > > > > > > > > > > > >under the
> > > > > > > > > > > > > >> > > > spinlock? This of course means that it is necessary to handle
> > > > > > > > > > > > > >the
> > > > > > > > > > > > > >> > > > occasional spurious call to the work handler, but that should
> > > > > > > > > > > > > >be rare
> > > > > > > > > > > > > >> > > > and should be in the noise compared to the reduction in
> > > > > > > > > > > > > >contention.
> > > > > > > > > > > > > >> > >
> > > > > > > > > > > > > >> > > Yes I think that will be required since -rt will sleep on
> > > > > > > > > > > > > >workqueue locks as
> > > > > > > > > > > > > >> > > well :-(. I'm looking into it right now.
> > > > > > > > > > > > > >> > >
> > > > > > > > > > > > > >> > > /*
> > > > > > > > > > > > > >> > > * If @work was previously on a different pool, it might
> > > > > > > > > > > > > >still be
> > > > > > > > > > > > > >> > > * running there, in which case the work needs to be
> > > > > > > > > > > > > >queued on that
> > > > > > > > > > > > > >> > > * pool to guarantee non-reentrancy.
> > > > > > > > > > > > > >> > > */
> > > > > > > > > > > > > >> > > last_pool = get_work_pool(work);
> > > > > > > > > > > > > >> > > if (last_pool && last_pool != pwq->pool) {
> > > > > > > > > > > > > >> > > struct worker *worker;
> > > > > > > > > > > > > >> > >
> > > > > > > > > > > > > >> > > spin_lock(&last_pool->lock);
> > > > > > > > > > > > > >> >
> > > > > > > > > > > > > >> > Hmm, I think moving schedule_delayed_work() outside lock will work.
> > > > > > > > > > > > > >Just took
> > > > > > > > > > > > > >> > a good look and that's not an issue. However calling
> > > > > > > > > > > > > >schedule_delayed_work()
> > > > > > > > > > > > > >> > itself is an issue if the caller of kfree_rcu() is !preemptible()
> > > > > > > > > > > > > >on
> > > > > > > > > > > > > >> > PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
> > > > > > > > > > > > > >pool->lock
> > > > > > > > > > > > > >> > which can sleep on PREEMPT_RT :-(. Which means we have to do either
> > > > > > > > > > > > > >of:
> > > > > > > > > > > > > >> >
> > > > > > > > > > > > > >> > 1. Implement a new mechanism for scheduling delayed work that does
> > > > > > > > > > > > > >not
> > > > > > > > > > > > > >> > acquire sleeping locks.
> > > > > > > > > > > > > >> >
> > > > > > > > > > > > > >> > 2. Allow kfree_rcu() only from preemptible context (That is
> > > > > > > > > > > > > >Sebastian's
> > > > > > > > > > > > > >> > initial patch to replace local_irq_save() + spin_lock() with
> > > > > > > > > > > > > >> > spin_lock_irqsave()).
> > > > > > > > > > > > > >> >
> > > > > > > > > > > > > >> > 3. Queue the work through irq_work or another bottom-half
> > > > > > > > > > > > > >mechanism.
> > > > > > > > > > > > > >>
> > > > > > > > > > > > > >> I use irq_work elsewhere in RCU, but the queue_delayed_work() might
> > > > > > > > > > > > > >> go well with a timer. This can of course be done conditionally.
> > > > > > > > > > > > > >>
> > > > > > > > > > > > > >We can schedule_delayed_work() inside and outside of the spinlock,
> > > > > > > > > > > > > >i.e. it is not an issue for RT kernel, because as it was noted in last
> > > > > > > > > > > > > >message a workqueue system uses raw spinlocks internally. I checked
> > > > > > > > > > > > > >the latest linux-5.6.y-rt also. If we do it inside, we will place the
> > > > > > > > > > > > > >work on current CPU, at least as i see it, even if it is "unbound".
> > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Thanks for confirming!!
> > > > > > > > > > > > >
> > > > > > > > > > > > > >If we do it outside, we will reduce a critical section, from the other
> > > > > > > > > > > > > >hand we can introduce a potential delay in placing the context into
> > > > > > > > > > > > > >CPUs
> > > > > > > > > > > > > >run-queue. As a result we could end up on another CPU, thus placing
> > > > > > > > > > > > > >the work on new CPU, plus memory foot-print might be higher. It would
> > > > > > > > > > > > > >be good to test and have a look at it actually.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >But it can be negligible :)
> > > > > > > > > > > > >
> > > > > > > > > > > > > Since the wq locking is raw spinlock on rt as Mike and you mentioned, if wq holds lock for too long that itself will spawn a lengthy non preemptible critical section, so from that standpoint doing it under our lock should be ok I think.
> > > > > > > > > > > > >
> > > > > > > > > > > > It should be OK, i do not expect to get noticeable latency for any RT
> > > > > > > > > > > > workloads.
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >> > Any other thoughts?
> > > > > > > > > > > > > >>
> > > > > > > > > > > > > >> I did forget to ask you guys your opinions about the downsides (if
> > > > > > > > > > > > > >any)
> > > > > > > > > > > > > >> of moving from unbound to per-CPU workqueues. Thoughts?
> > > > > > > > > > > > > >>
> > > > > > > > > > > > > >If we do it outside of spinlock, there is at least one drawback that i
> > > > > > > > > > > > > >see, i described it above. We can use schedule_delayed_work_on() but
> > > > > > > > > > > > > >we as a caller have to guarantee that a CPU we are about to place a work
> > > > > > > > > > > > > >is alive :)
> > > > > > > > > > > > >
> > > > > > > > > > > > > FWIW, some time back I did a simple manual test calling queue_work_on on an offline CPU to see what happens and it appears to be working fine. On a 4 CPU system, I offline CPU 3 and queue the work on it which ends up executing on CPU 0 instead.
> > > > > > > > > > > > >
> > > > > > > > > > > > <snip>
> > > > > > > > > > > > /**
> > > > > > > > > > > > * queue_work_on - queue work on specific cpu
> > > > > > > > > > > > * @cpu: CPU number to execute work on
> > > > > > > > > > > > * @wq: workqueue to use
> > > > > > > > > > > > * @work: work to queue
> > > > > > > > > > > > *
> > > > > > > > > > > > * We queue the work to a specific CPU, the caller must ensure it
> > > > > > > > > > > > * can't go away.
> > > > > > > > > > > > *
> > > > > > > > > > > > * Return: %false if @work was already on a queue, %true otherwise.
> > > > > > > > > > > > */
> > > > > > > > > > > > <snip>
> > > > > > > > > > > >
> > > > > > > > > > > > It says, how i see it, we should ensure it can not go away. So, if
> > > > > > > > > > > > we drop the lock we should do like:
> > > > > > > > > > > >
> > > > > > > > > > > > get_online_cpus();
> > > > > > > > > > > > check a CPU is online;
> > > > > > > > > > > > queue_work_on();
> > > > > > > > > > > > put_online_cpus();
> > > > > > > > > > > >
> > > > > > > > > > > > but i suspect we do not want to do it :)
> > > > > > > > > > >
> > > > > > > > > > > Indeed, it might impose a few restrictions and a bit of overhead that
> > > > > > > > > > > might not be welcome at some point in the future. ;-)
> > > > > > > > > > >
> > > > > > > > > > > On top of this there are potential load-balancing concerns. By specifying
> > > > > > > > > > > the CPU, you are limiting workqueue's and scheduler's ability to adjust to
> > > > > > > > > > > any sudden changes in load. Maybe not enough to matter in most cases, but
> > > > > > > > > > > might be an issue if there is a sudden flood of kfree_rcu() invocations.
> > > > > > > > > > >
> > > > > > > > > > Agree. Let's keep it as it is now :)
> > > > > > > > >
> > > > > > > > > I am not sure which "as it is now" you are referring to, but I suspect
> > > > > > > > > that the -rt guys prefer two short interrupts-disabled regions to one
> > > > > > > > > longer interrupts-disabled region.
> > > > > > > >
> > > > > > > > I mean to run schedule_delayed_work() under spinlock.
> > > > > > >
> > > > > > > Which is an interrupt-disabled spinlock, correct?
> > > > > > >
> > > > > > To do it under holding the lock, currently it is spinlock, but it is
> > > > > > going to be(if you agree :)) raw ones, which keeps IRQs disabled. I
> > > > > > saw Joel sent out patches.
> > > > >
> > > > > Then please move the schedule_delayed_work() and friends out from
> > > > > under the spinlock. Unless Sebastian has some reason why extending
> > > > > an interrupts-disabled critical section (and thus degrading real-time
> > > > > latency) is somehow OK in this case.
> > > > >
> > > > Paul, if move outside of the lock we may introduce unneeded migration
> > > > issues, plus it can introduce higher memory footprint(i have not tested).
> > > > I have described it in more detail earlier in this mail thread. I do not
> > > > think that waking up the work is an issue for RT from latency point of
> > > > view. But let's ask Sebastian to confirm.
> > > >
> > > > Sebastian, do you think that placing a work on current CPU is an issue?
> > > > If we do it under raw spinlock?
> > >
> > > We really are talking past each other, aren't we? ;-)
> > >
> > Let's hear each other better then :)
> >
> > >
> > > My concern is lengthening the duration of the critical section by having
> > > the extra work-queuing execution within it. As in leave the workqueue
> > > free to migrate, but invoke it after releasing the lock.
> > >
> Paul, i have just measured the time duration of the schedule_delayed_work().
> To do that i used below patch:
>
> <snip>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 02f73f7bbd40..f74ae0f3556e 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3232,6 +3232,12 @@ static inline struct rcu_head *attach_rcu_head_to_object(void *obj)
> return ((struct rcu_head *) ++ptr);
> }
>
> +static void noinline
> +measure_schedule_delayed_work(struct kfree_rcu_cpu *krcp)
> +{
> + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> +}
> +
> /*
> * Queue a request for lazy invocation of appropriate free routine after a
> * grace period. Please note there are three paths are maintained, two are the
> @@ -3327,8 +3333,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> !krcp->monitor_todo) {
> krcp->monitor_todo = true;
> - schedule_delayed_work(&krcp->monitor_work,
> - expedited_drain ? 0 : KFREE_DRAIN_JIFFIES);
> + measure_schedule_delayed_work(krcp);
> }
> <snip>
>
> i have done it for not CONFIG_PREEMPT_RT kernel, i do not have any RT configuration.
> I run rcuperf to apply the load to see the time taken by the actual placing of the work,
> i.e. the time taken by schedule_delayed_work():
Thank you for measuring this!
We do need Sebastian's eyes on this. I am not sure what the latency
goals are now, but in the past they have been in the range of a few tens
of microseconds total latency, in which case a five-microsecond change
can make a difference. To say nothing of the ten-microsecond latency
in one of the measurements.
Thanx, Paul
> <snip>
> root@pc636:/sys/kernel/debug/tracing# cat trace
> # tracer: function_graph
> #
> # function_graph latency trace v1.1.5 on 5.6.0-rc6+
> # --------------------------------------------------------------------
> # latency: 0 us, #16/16, CPU#0 | (M:server VP:0, KP:0, SP:0 HP:0 #P:4)
> # -----------------
> # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0)
> # -----------------
> #
> # _-----=> irqs-off
> # / _----=> need-resched
> # | / _---=> hardirq/softirq
> # || / _--=> preempt-depth
> # ||| /
> # TIME CPU TASK/PID |||| DURATION FUNCTION CALLS
> # | | | | |||| | | | | | |
> 682.384653 | 1) <idle>-0 | d.s. | 5.329 us | } /* measure_schedule_delayed_work.constprop.86 */
> 685.374654 | 2) <idle>-0 | d.s. | 5.392 us | } /* measure_schedule_delayed_work.constprop.86 */
> 700.304647 | 2) <idle>-0 | d.s. | 5.650 us | } /* measure_schedule_delayed_work.constprop.86 */
> 710.331280 | 3) <idle>-0 | d.s. | 5.145 us | } /* measure_schedule_delayed_work.constprop.86 */
> 714.387943 | 1) <idle>-0 | d.s. | 9.986 us | } /* measure_schedule_delayed_work.constprop.86 */
> 720.251229 | 0) <idle>-0 | d.s. | 5.292 us | } /* measure_schedule_delayed_work.constprop.86 */
> 725.211208 | 2) <idle>-0 | d.s. | 5.295 us | } /* measure_schedule_delayed_work.constprop.86 */
> 731.847845 | 1) <idle>-0 | d.s. | 5.048 us | } /* measure_schedule_delayed_work.constprop.86 */
> 736.357802 | 2) <idle>-0 | d.s. | 5.134 us | } /* measure_schedule_delayed_work.constprop.86 */
> 738.287785 | 1) <idle>-0 | d.s. | 5.863 us | } /* measure_schedule_delayed_work.constprop.86 */
> 742.214431 | 1) <idle>-0 | d.s. | 5.202 us | } /* measure_schedule_delayed_work.constprop.86 */
> 759.844264 | 2) <idle>-0 | d.s. | 5.375 us | } /* measure_schedule_delayed_work.constprop.86 */
> 764.304218 | 1) <idle>-0 | d.s. | 5.650 us | } /* measure_schedule_delayed_work.constprop.86 */
> 766.224204 | 3) <idle>-0 | d.s. | 5.015 us | } /* measure_schedule_delayed_work.constprop.86 */
> 772.410794 | 1) <idle>-0 | d.s. | 5.061 us | } /* measure_schedule_delayed_work.constprop.86 */
> 781.370691 | 1) <idle>-0 | d.s. | 5.165 us | } /* measure_schedule_delayed_work.constprop.86 */
> root@pc636:/sys/kernel/debug/tracing# cat tracing_thresh
> 5
> root@pc636:/sys/kernel/debug/tracing#
> <snip>
>
> --
> Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 20:17 ` Uladzislau Rezki
2020-04-20 22:16 ` Paul E. McKenney
@ 2020-04-21 1:22 ` Steven Rostedt
2020-04-21 5:18 ` Uladzislau Rezki
1 sibling, 1 reply; 85+ messages in thread
From: Steven Rostedt @ 2020-04-21 1:22 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: Paul E. McKenney, Sebastian Andrzej Siewior, joel, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Mon, 20 Apr 2020 22:17:23 +0200
Uladzislau Rezki <urezki@gmail.com> wrote:
<rant>
I really wish you would crop your email. If I scroll down three pages
without seeing any reply, I usually stop reading there.
</rant>
> > >
> Paul, i have just measured the time duration of the schedule_delayed_work().
> To do that i used below patch:
>
> <snip>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 02f73f7bbd40..f74ae0f3556e 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3232,6 +3232,12 @@ static inline struct rcu_head *attach_rcu_head_to_object(void *obj)
> return ((struct rcu_head *) ++ptr);
> }
>
> +static void noinline
> +measure_schedule_delayed_work(struct kfree_rcu_cpu *krcp)
> +{
> + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> +}
> +
> /*
> * Queue a request for lazy invocation of appropriate free routine after a
> * grace period. Please note there are three paths are maintained, two are the
> @@ -3327,8 +3333,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> !krcp->monitor_todo) {
> krcp->monitor_todo = true;
> - schedule_delayed_work(&krcp->monitor_work,
> - expedited_drain ? 0 : KFREE_DRAIN_JIFFIES);
> + measure_schedule_delayed_work(krcp);
> }
> <snip>
>
> i have done it for not CONFIG_PREEMPT_RT kernel, i do not have any RT configuration.
> I run rcuperf to apply the load to see the time taken by the actual placing of the work,
> i.e. the time taken by schedule_delayed_work():
>
> <snip>
> root@pc636:/sys/kernel/debug/tracing# cat trace
> # tracer: function_graph
> #
> # function_graph latency trace v1.1.5 on 5.6.0-rc6+
> # --------------------------------------------------------------------
> # latency: 0 us, #16/16, CPU#0 | (M:server VP:0, KP:0, SP:0 HP:0 #P:4)
> # -----------------
> # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0)
> # -----------------
> #
> # _-----=> irqs-off
> # / _----=> need-resched
> # | / _---=> hardirq/softirq
> # || / _--=> preempt-depth
> # ||| /
> # TIME CPU TASK/PID |||| DURATION FUNCTION CALLS
> # | | | | |||| | | | | | |
> 682.384653 | 1) <idle>-0 | d.s. | 5.329 us | } /* measure_schedule_delayed_work.constprop.86 */
Strange output. Do you have all functions being traced? That could
cause overhead.
Try this:
# echo measure_schedule_delayed_work > set_ftrace_filter
# echo function_graph > current_tracer
# cat trace
That will give you much better timings of the overhead of a single
function.
-- Steve
> 685.374654 | 2) <idle>-0 | d.s. | 5.392 us | } /* measure_schedule_delayed_work.constprop.86 */
> 700.304647 | 2) <idle>-0 | d.s. | 5.650 us | } /* measure_schedule_delayed_work.constprop.86 */
> 710.331280 | 3) <idle>-0 | d.s. | 5.145 us | } /* measure_schedule_delayed_work.constprop.86 */
> 714.387943 | 1) <idle>-0 | d.s. | 9.986 us | } /* measure_schedule_delayed_work.constprop.86 */
> 720.251229 | 0) <idle>-0 | d.s. | 5.292 us | } /* measure_schedule_delayed_work.constprop.86 */
> 725.211208 | 2) <idle>-0 | d.s. | 5.295 us | } /* measure_schedule_delayed_work.constprop.86 */
> 731.847845 | 1) <idle>-0 | d.s. | 5.048 us | } /* measure_schedule_delayed_work.constprop.86 */
> 736.357802 | 2) <idle>-0 | d.s. | 5.134 us | } /* measure_schedule_delayed_work.constprop.86 */
> 738.287785 | 1) <idle>-0 | d.s. | 5.863 us | } /* measure_schedule_delayed_work.constprop.86 */
> 742.214431 | 1) <idle>-0 | d.s. | 5.202 us | } /* measure_schedule_delayed_work.constprop.86 */
> 759.844264 | 2) <idle>-0 | d.s. | 5.375 us | } /* measure_schedule_delayed_work.constprop.86 */
> 764.304218 | 1) <idle>-0 | d.s. | 5.650 us | } /* measure_schedule_delayed_work.constprop.86 */
> 766.224204 | 3) <idle>-0 | d.s. | 5.015 us | } /* measure_schedule_delayed_work.constprop.86 */
> 772.410794 | 1) <idle>-0 | d.s. | 5.061 us | } /* measure_schedule_delayed_work.constprop.86 */
> 781.370691 | 1) <idle>-0 | d.s. | 5.165 us | } /* measure_schedule_delayed_work.constprop.86 */
> root@pc636:/sys/kernel/debug/tracing# cat tracing_thresh
> 5
> root@pc636:/sys/kernel/debug/tracing#
> <snip>
>
> --
> Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-21 1:22 ` Steven Rostedt
@ 2020-04-21 5:18 ` Uladzislau Rezki
2020-04-21 13:30 ` Steven Rostedt
0 siblings, 1 reply; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-21 5:18 UTC (permalink / raw)
To: Steven Rostedt
Cc: Uladzislau Rezki, Paul E. McKenney, Sebastian Andrzej Siewior,
joel, rcu, Josh Triplett, Mathieu Desnoyers, Lai Jiangshan,
Thomas Gleixner, Mike Galbraith
> <rant>
> I really wish you would crop your email. If I scroll down three pages
> without seeing any reply, I usually stop reading there.
> </rant>
>
Agree. i will do it in a better manner next time.
> > > >
> > Paul, i have just measured the time duration of the schedule_delayed_work().
> > To do that i used below patch:
> >
> > <snip>
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 02f73f7bbd40..f74ae0f3556e 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -3232,6 +3232,12 @@ static inline struct rcu_head *attach_rcu_head_to_object(void *obj)
> > return ((struct rcu_head *) ++ptr);
> > }
> >
> > +static void noinline
> > +measure_schedule_delayed_work(struct kfree_rcu_cpu *krcp)
> > +{
> > + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> > +}
> > +
> > /*
> > * Queue a request for lazy invocation of appropriate free routine after a
> > * grace period. Please note there are three paths are maintained, two are the
> > @@ -3327,8 +3333,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> > !krcp->monitor_todo) {
> > krcp->monitor_todo = true;
> > - schedule_delayed_work(&krcp->monitor_work,
> > - expedited_drain ? 0 : KFREE_DRAIN_JIFFIES);
> > + measure_schedule_delayed_work(krcp);
> > }
> > <snip>
> >
> > i have done it for not CONFIG_PREEMPT_RT kernel, i do not have any RT configuration.
> > I run rcuperf to apply the load to see the time taken by the actual placing of the work,
> > i.e. the time taken by schedule_delayed_work():
> >
> > <snip>
> > root@pc636:/sys/kernel/debug/tracing# cat trace
> > # tracer: function_graph
> > #
> > # function_graph latency trace v1.1.5 on 5.6.0-rc6+
> > # --------------------------------------------------------------------
> > # latency: 0 us, #16/16, CPU#0 | (M:server VP:0, KP:0, SP:0 HP:0 #P:4)
> > # -----------------
> > # | task: -0 (uid:0 nice:0 policy:0 rt_prio:0)
> > # -----------------
> > #
> > # _-----=> irqs-off
> > # / _----=> need-resched
> > # | / _---=> hardirq/softirq
> > # || / _--=> preempt-depth
> > # ||| /
> > # TIME CPU TASK/PID |||| DURATION FUNCTION CALLS
> > # | | | | |||| | | | | | |
> > 682.384653 | 1) <idle>-0 | d.s. | 5.329 us | } /* measure_schedule_delayed_work.constprop.86 */
>
> Strange output. Do you have all functions being traced? That could
> cause overhead.
>
> Try this:
>
> # echo measure_schedule_delayed_work > set_ftrace_filter
> # echo function_graph > current_tracer
> # cat trace
>
> That will give you much better timings of the overhead of a single
> function.
>
I did exactly how are your steps. I do not filter all available
functions, there is only one set:
<snip>
root@pc636:/sys/kernel/debug/tracing# cat set_ftrace_filter
measure_schedule_delayed_work.constprop.86
root@pc636:/sys/kernel/debug/tracing# cat tracing_thresh
5
root@pc636:/sys/kernel/debug/tracing# cat current_tracer
function_graph
root@pc636:/sys/kernel/debug/tracing#
<snip>
Also i set 5 microseconds threshold to filter out what is less
and added the latency-format trace option.
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-21 5:18 ` Uladzislau Rezki
@ 2020-04-21 13:30 ` Steven Rostedt
2020-04-21 13:45 ` Uladzislau Rezki
0 siblings, 1 reply; 85+ messages in thread
From: Steven Rostedt @ 2020-04-21 13:30 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: Paul E. McKenney, Sebastian Andrzej Siewior, joel, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Tue, 21 Apr 2020 07:18:05 +0200
Uladzislau Rezki <urezki@gmail.com> wrote:
> <snip>
> root@pc636:/sys/kernel/debug/tracing# cat set_ftrace_filter
> measure_schedule_delayed_work.constprop.86
> root@pc636:/sys/kernel/debug/tracing# cat tracing_thresh
> 5
> root@pc636:/sys/kernel/debug/tracing# cat current_tracer
> function_graph
> root@pc636:/sys/kernel/debug/tracing#
> <snip>
>
> Also i set 5 microseconds threshold to filter out what is less
> and added the latency-format trace option.
Ah, I forgot that tracing_thresh affects function graph tracing.
The tracing infrastructure has so many features, I can't even keep up ;-)
-- Steve
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-21 13:30 ` Steven Rostedt
@ 2020-04-21 13:45 ` Uladzislau Rezki
0 siblings, 0 replies; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-21 13:45 UTC (permalink / raw)
To: Steven Rostedt, Paul E. McKenney
Cc: Uladzislau Rezki, Paul E. McKenney, Sebastian Andrzej Siewior,
joel, rcu, Josh Triplett, Mathieu Desnoyers, Lai Jiangshan,
Thomas Gleixner, Mike Galbraith
> Uladzislau Rezki <urezki@gmail.com> wrote:
>
> > <snip>
> > root@pc636:/sys/kernel/debug/tracing# cat set_ftrace_filter
> > measure_schedule_delayed_work.constprop.86
> > root@pc636:/sys/kernel/debug/tracing# cat tracing_thresh
> > 5
> > root@pc636:/sys/kernel/debug/tracing# cat current_tracer
> > function_graph
> > root@pc636:/sys/kernel/debug/tracing#
> > <snip>
> >
> > Also i set 5 microseconds threshold to filter out what is less
> > and added the latency-format trace option.
>
> Ah, I forgot that tracing_thresh affects function graph tracing.
>
> The tracing infrastructure has so many features, I can't even keep up ;-)
>
No problem :)
BTW, i decided to perform manual measurement of the schedule_delayed_work()
duration, just in case:
test PC: laptop with Intel(R) Core(TM) i5-3320M CPU @ 2.60GHz
<snip>
static void noinline
measure_schedule_delayed_work(struct kfree_rcu_cpu *krcp)
{
u64 delta;
delta = sched_clock();
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
delta = sched_clock() - delta;
if (delta > 100)
trace_printk("-> took %llu nanoseconds.\n", delta);
}
<snip>
I get the following figures of taken time:
MEDIAN: 831 nanoseconds
MAX: 4423 nanoseconds
MIN: 123 nanoseconds
The data you can find here: ftp://vps418301.ovh.net/incoming/measure_schedule_delayed_work.txt
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 17:40 ` Uladzislau Rezki
2020-04-20 17:57 ` Joel Fernandes
2020-04-20 17:59 ` Paul E. McKenney
@ 2020-04-21 13:39 ` Sebastian Andrzej Siewior
2020-04-21 15:41 ` Paul E. McKenney
2 siblings, 1 reply; 85+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-04-21 13:39 UTC (permalink / raw)
To: Uladzislau Rezki
Cc: Paul E. McKenney, joel, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On 2020-04-20 19:40:19 [+0200], Uladzislau Rezki wrote:
> Paul, if move outside of the lock we may introduce unneeded migration
> issues, plus it can introduce higher memory footprint(i have not tested).
> I have described it in more detail earlier in this mail thread. I do not
> think that waking up the work is an issue for RT from latency point of
> view. But let's ask Sebastian to confirm.
>
> Sebastian, do you think that placing a work on current CPU is an issue?
> If we do it under raw spinlock?
I sent one simple patch to address the issues I see now. It was raised
that the patch may introduce a performance regression (`may' since we
had no numbers to back it up) in case the context would be
migrated from one CPU to another between "this" and "spin_lock". It was
suggested to use migrate_disable() to avoid it.
It was then decided to address possible future problems and make the
lock raw which brings us here.
Invoking queue_work() is not a problem locking wise. It does however
extend the length of the critical section (which is not yet needed).
> Thank you!
>
Sebastian
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-21 13:39 ` Sebastian Andrzej Siewior
@ 2020-04-21 15:41 ` Paul E. McKenney
2020-04-21 17:05 ` Sebastian Andrzej Siewior
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-21 15:41 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Uladzislau Rezki, joel, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Tue, Apr 21, 2020 at 03:39:47PM +0200, Sebastian Andrzej Siewior wrote:
> On 2020-04-20 19:40:19 [+0200], Uladzislau Rezki wrote:
> > Paul, if move outside of the lock we may introduce unneeded migration
> > issues, plus it can introduce higher memory footprint(i have not tested).
> > I have described it in more detail earlier in this mail thread. I do not
> > think that waking up the work is an issue for RT from latency point of
> > view. But let's ask Sebastian to confirm.
> >
> > Sebastian, do you think that placing a work on current CPU is an issue?
> > If we do it under raw spinlock?
>
> I sent one simple patch to address the issues I see now. It was raised
> that the patch may introduce a performance regression (`may' since we
> had no numbers back it up) in case where we the context would be
> migrated from one CPU to another between "this" and "spin_lock". It was
> suggested to use migrate_disable() to avoid it.
>
> It was then decided to address possible future problems and make the
> lock raw which brings us here.
>
> Invoking queue_work() is not a problem locking wise. It does however
> extend the length of the critical section (which is not yet needed).
I am guessing that by "which is not yet needed" you were meaning that
it is not absolutely necessary to extend the length of the critical
section. Please correct me if my guess is wrong.
In the meantime, plunging ahead...
One approach might be to move queue_work() and friends out of the critical
section, but only enable interrupts between the two critical sections
under CONFIG_PREEMPT_RT_FULL. Would that be a reasonable approach?
Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-21 15:41 ` Paul E. McKenney
@ 2020-04-21 17:05 ` Sebastian Andrzej Siewior
2020-04-21 18:09 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-04-21 17:05 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, joel, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On 2020-04-21 08:41:01 [-0700], Paul E. McKenney wrote:
> > Invoking queue_work() is not a problem locking wise. It does however
> > extend the length of the critical section (which is not yet needed).
>
> I am guessing that by "which is not yet needed" you were meaning that
> it is not absolutely necessary to extend the length of the critical
> section. Please correct me if my guess is wrong.
By changing the lock type you extend the atomic section from "only"
queue_work() (if invoked) to everything kfree_rcu() does plus
queue_work().
> In the meantime, plunging ahead...
>
> One approach might be to move queue_work() and friends out of the critical
> section, but only enable interrupts between the two critical sections
> under CONFIG_PREEMPT_RT_FULL. Would that be a reasonable approach?
Yes but why do we do this raw_spinlock_t here? It is not yet needed on
v5.6-RT as I *did* check. It also complicates the code for !RT but
nobody responded to that part but…
That said: the current memory allocation is the problem here. The
remaining part is fine. The part under the lock is small enough so it
should not cause the trouble if it invokes queue_work() which will
"only" enqueue the timer.
Side question: Is there any real-life workloads that benefits from this?
I'm asking because rcuperf allocates the kfree_rcu() the pointer right
away. The chances are high that the pointer are fed from the same page.
SLUB's build_detached_freelist() scans the page of RCU's pointers to
ensure that they are in the same page and then slab_free() them in one
go. There is lookahead = 3 so it finds three different pages it stops
further scanning and does slab_free() with what it found so far.
Which means if your kfree_rcu() collects random pointer from the system,
they may belong to different pages (especially if they are part of
different "types").
> Thanx, Paul
Sebastian
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-21 17:05 ` Sebastian Andrzej Siewior
@ 2020-04-21 18:09 ` Paul E. McKenney
2020-04-22 11:13 ` Sebastian Andrzej Siewior
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-21 18:09 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Uladzislau Rezki, joel, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Tue, Apr 21, 2020 at 07:05:56PM +0200, Sebastian Andrzej Siewior wrote:
> On 2020-04-21 08:41:01 [-0700], Paul E. McKenney wrote:
> > > Invoking queue_work() is not a problem locking wise. It does however
> > > extend the length of the critical section (which is not yet needed).
> >
> > I am guessing that by "which is not yet needed" you were meaning that
> > it is not absolutely necessary to extend the length of the critical
> > section. Please correct me if my guess is wrong.
>
> By changing the lock type you extend the atomic section from "only"
> queue_work() (if invoked) to everything kfree_rcu() does plus
> queue_work().
Got it, thank you!
> > In the meantime, plunging ahead...
> >
> > One approach might be to move queue_work() and friends out of the critical
> > section, but only enable interrupts between the two critical sections
> > under CONFIG_PREEMPT_RT_FULL. Would that be a reasonable approach?
>
> Yes but why do we do this raw_spinlock_t here? It is not yet needed on
> v5.6-RT as I *did* check. It also complicates the code for !RT but
> nobody responded to that part but…
I did respond by pointing out that the essentially similar call_rcu()
function ends up being invoked pretty much everywhere, including early
boot before rcu_init() has been invoked. It is therefore only reasonable
to assume that there will be a need for kfree_rcu() to tolerate a similar
range of calling contexts.
> That said: the current memory allocation is the problem here. The
> remaining part is fine. The part under the lock is small enough so it
> should not cause the trouble if it invokes queue_work() which will
> "only" enqueue the timer.
To your point, the small memory allocation will be going away. The
memory allocations will pull in 4K pages of pointers.
On the timer, are you thinking of the queue_work() calls or instead of
the queue_delayed_work() calls?
> Side question: Is there any real-life workloads that benefits from this?
> I'm asking because rcuperf allocates the kfree_rcu() the pointer right
> away. The chances are high that the pointer are fed from the same page.
> SLUB's build_detached_freelist() scans the page of RCU's pointers to
> ensure that they are in the same page and then slab_free() them in one
> go. There is lookahead = 3 so it finds three different pages it stops
> further scanning and does slab_free() with what it found so far.
>
> Which means if your kfree_rcu() collects random pointer from the system,
> they may belong to different pages (especially if they are part of
> different "types").
It gets significantly better performance as it currently is due to
the reduced cache-miss rate scanning pointers in a page as opposed to
pointer-chasing through a series of rcu_head pointers.
Yes, it might be even better if kfree_rcu() further sorted the freed
objects per slab or maybe even per page within a slab, but one step at
a time! For one thing, it is not hard to imagine situations where this
further sorting actually slowed things down, especially if the system
was under any sort of memory pressure or if the kfree_rcu() calls were
scattered across so many slabs and pages that it essentially reverted
back to pointer chasing.
Make sense, or am I missing your point?
Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-21 18:09 ` Paul E. McKenney
@ 2020-04-22 11:13 ` Sebastian Andrzej Siewior
2020-04-22 13:33 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-04-22 11:13 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, joel, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On 2020-04-21 11:09:14 [-0700], Paul E. McKenney wrote:
> > Yes but why do we do this raw_spinlock_t here? It is not yet needed on
> > v5.6-RT as I *did* check. It also complicates the code for !RT but
> > nobody responded to that part but…
>
> I did respond by pointing out that the essentially similar call_rcu()
> function ends up being invoked pretty much everywhere, including early
> boot before rcu_init() has been invoked. It is therefore only reasonable
> to assume that there will be a need for kfree_rcu() to tolerate a similar
> range of calling contexts.
Early in the boot we have IRQs disabled but also one CPU and no
scheduling. That means that not a single lock is contended.
> > That said: the current memory allocation is the problem here. The
> > remaining part is fine. The part under the lock is small enough so it
> > should not cause the trouble if it invokes queue_work() which will
> > "only" enqueue the timer.
>
> To your point, the small memory allocation will be going away. The
> memory allocations will pull in 4K pages of pointers.
Oki.
> On the timer, are you thinking of the queue_work() calls or instead of
> the queue_delayed_work() calls?
As of now, on the "first" invocation of kfree_rcu() it invokes
queue_delayed_work(). The work is not active, the timer is not pending
so it always enqueues a new timer.
> > Side question: Is there any real-life workloads that benefits from this?
> > I'm asking because rcuperf allocates the kfree_rcu() the pointer right
> > away. The chances are high that the pointer are fed from the same page.
> > SLUB's build_detached_freelist() scans the page of RCU's pointers to
> > ensure that they are in the same page and then slab_free() them in one
> > go. There is lookahead = 3 so it finds three different pages it stops
> > further scanning and does slab_free() with what it found so far.
> >
> > Which means if your kfree_rcu() collects random pointer from the system,
> > they may belong to different pages (especially if they are part of
> > different "types").
>
> It gets significantly better performance as it currently is due to
> the reduced cache-miss rate scanning pointers in a page as opposed to
> pointer-chasing through a series of rcu_head pointers.
So the performance boost is not due to kfree_bulk() but due to the
pointers which are "in order".
> Yes, it might be even better if kfree_rcu() further sorted the freed
> objects per slab or maybe even per page within a slab, but one step at
> a time! For one thing, it is not hard to imagine situations where this
> further sorting actually slowed things down, especially if the system
> was under any sort of memory pressure or if the kfree_rcu() calls were
> scattered across so many slabs and pages that it essentially reverted
> back to pointer chasing.
Okay.
> Make sense, or am I missing your point?
No, you got it.
> Thanx, Paul
Sebastian
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-22 11:13 ` Sebastian Andrzej Siewior
@ 2020-04-22 13:33 ` Paul E. McKenney
2020-04-22 15:46 ` Sebastian Andrzej Siewior
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-22 13:33 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Uladzislau Rezki, joel, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Wed, Apr 22, 2020 at 01:13:24PM +0200, Sebastian Andrzej Siewior wrote:
> On 2020-04-21 11:09:14 [-0700], Paul E. McKenney wrote:
> > > Yes but why do we do this raw_spinlock_t here? It is not yet needed on
> > > v5.6-RT as I *did* check. It also complicates the code for !RT but
> > > nobody responded to that part but…
> >
> > I did respond by pointing out that the essentially similar call_rcu()
> > function ends up being invoked pretty much everywhere, including early
> > boot before rcu_init() has been invoked. It is therefore only reasonable
> > to assume that there will be a need for kfree_rcu() to tolerate a similar
> > range of calling contexts.
>
> Early in the boot we have IRQs disabled but also one CPU and no
> scheduling. That means that not a single lock is contained.
You are saying that call_rcu() is never invoked while holding a raw
spinlock?
> > > That said: the current memory allocation is the problem here. The
> > > remaining part is fine. The part under the lock is small enough so it
> > > should not cause the trouble if it invokes queue_work() which will
> > > "only" enqueue the timer.
> >
> > To your point, the small memory allocation will be going away. The
> > memory allocations will pull in 4K pages of pointers.
>
> Oki.
>
> > On the timer, are you thinking of the queue_work() calls or instead of
> > the queue_delayed_work() calls?
>
> As of now, on the "first" invocation of kfree_rcu() it invokes
> queue_delayed_work(). The work is not active, the timer is not pending
> so it always enqueues a new timer.
Fair enough, and thank you for checking!
The other work-queuing operations are also OK as they are, then?
> > > Side question: Is there any real-life workloads that benefits from this?
> > > I'm asking because rcuperf allocates the kfree_rcu() the pointer right
> > > away. The chances are high that the pointer are fed from the same page.
> > > SLUB's build_detached_freelist() scans the page of RCU's pointers to
> > > ensure that they are in the same page and then slab_free() them in one
> > > go. There is lookahead = 3 so it finds three different pages it stops
> > > further scanning and does slab_free() with what it found so far.
> > >
> > > Which means if your kfree_rcu() collects random pointer from the system,
> > > they may belong to different pages (especially if they are part of
> > > different "types").
> >
> > It gets significantly better performance as it currently is due to
> > the reduced cache-miss rate scanning pointers in a page as opposed to
> > pointer-chasing through a series of rcu_head pointers.
>
> So the performance boost is not due to kfree_bulk() but due to the
> pointers which are "in ordered".
Almost. Rather that the performance boost due to the cache locality
of the pointers justifies the change. The performance boost due to
kfree_bulk() can sometimes help further, for example, when a large number
of objects from the same slab are freed.
Again, thank you for digging into this!
Thanx, Paul
> > Yes, it might be even better if kfree_rcu() further sorted the freed
> > objects per slab or maybe even per page within a slab, but one step at
> > a time! For one thing, it is not hard to imagine situations where this
> > further sorting actually slowed things down, especially if the system
> > was under any sort of memory pressure or if the kfree_rcu() calls were
> > scattered across so many slabs and pages that it essentially reverted
> > back to pointer chasing.
>
> Okay.
>
> > Make sense, or am I missing your point?
>
> No, you got it.
>
> > Thanx, Paul
>
> Sebastian
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-22 13:33 ` Paul E. McKenney
@ 2020-04-22 15:46 ` Sebastian Andrzej Siewior
2020-04-22 16:19 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-04-22 15:46 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Uladzislau Rezki, joel, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On 2020-04-22 06:33:46 [-0700], Paul E. McKenney wrote:
> On Wed, Apr 22, 2020 at 01:13:24PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2020-04-21 11:09:14 [-0700], Paul E. McKenney wrote:
> > > > Yes but why do we do this raw_spinlock_t here? It is not yet needed on
> > > > v5.6-RT as I *did* check. It also complicates the code for !RT but
> > > > nobody responded to that part but…
> > >
> > > I did respond by pointing out that the essentially similar call_rcu()
> > > function ends up being invoked pretty much everywhere, including early
> > > boot before rcu_init() has been invoked. It is therefore only reasonable
> > > to assume that there will be a need for kfree_rcu() to tolerate a similar
> > > range of calling contexts.
> >
> > Early in the boot we have IRQs disabled but also one CPU and no
> > scheduling. That means that not a single lock is contained.
>
> You are saying that call_rcu() is never invoked while holding a raw
> spinlock?
I didn't say that. I said if you use spin_lock() with interrupts
disabled *but* early in the boot process (without the scheduler active)
then it is okay.
This was a response to your "including early boot before …".
But since you ask: On top of my head I know that
task_struct is released via RCU by the scheduler in a preempt-disabled
section. We have a similar workaround for the mm struct. So yes, we have
at the very least those two.
> Fair enough, and thank you for checking!
>
> The other work-queuing operations are also OK as they are, then?
yes.
Sebastian
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-22 15:46 ` Sebastian Andrzej Siewior
@ 2020-04-22 16:19 ` Paul E. McKenney
2020-04-22 16:35 ` Paul E. McKenney
0 siblings, 1 reply; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-22 16:19 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Uladzislau Rezki, joel, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Wed, Apr 22, 2020 at 05:46:25PM +0200, Sebastian Andrzej Siewior wrote:
> On 2020-04-22 06:33:46 [-0700], Paul E. McKenney wrote:
> > On Wed, Apr 22, 2020 at 01:13:24PM +0200, Sebastian Andrzej Siewior wrote:
> > > On 2020-04-21 11:09:14 [-0700], Paul E. McKenney wrote:
> > > > > Yes but why do we do this raw_spinlock_t here? It is not yet needed on
> > > > > v5.6-RT as I *did* check. It also complicates the code for !RT but
> > > > > nobody responded to that part but…
> > > >
> > > > I did respond by pointing out that the essentially similar call_rcu()
> > > > function ends up being invoked pretty much everywhere, including early
> > > > boot before rcu_init() has been invoked. It is therefore only reasonable
> > > > to assume that there will be a need for kfree_rcu() to tolerate a similar
> > > > range of calling contexts.
> > >
> > > Early in the boot we have IRQs disabled but also one CPU and no
> > > scheduling. That means that not a single lock is contained.
> >
> > You are saying that call_rcu() is never invoked while holding a raw
> > spinlock?
>
> I didn't say that. I said if you use spin_lock() with interrupts
> disabled *but* early in the boot process (without the scheduler active)
> then it is okay.
> This was a response to your "including early boot before …".
> But since you ask: On top of my head I know that
> task_struct is released via RCU by the scheduler in a preempt-disabled
> section. We have a similar workaround for the mm struct. So yes, we have
> at the very least those two.
>
> > Fair enough, and thank you for checking!
> >
> > The other work-queuing operations are also OK as they are, then?
>
> yes.
Very good, and again, thank you!
Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-22 16:19 ` Paul E. McKenney
@ 2020-04-22 16:35 ` Paul E. McKenney
0 siblings, 0 replies; 85+ messages in thread
From: Paul E. McKenney @ 2020-04-22 16:35 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Uladzislau Rezki, joel, Steven Rostedt, rcu, Josh Triplett,
Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith
On Wed, Apr 22, 2020 at 09:19:55AM -0700, Paul E. McKenney wrote:
> On Wed, Apr 22, 2020 at 05:46:25PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2020-04-22 06:33:46 [-0700], Paul E. McKenney wrote:
> > > On Wed, Apr 22, 2020 at 01:13:24PM +0200, Sebastian Andrzej Siewior wrote:
> > > > On 2020-04-21 11:09:14 [-0700], Paul E. McKenney wrote:
> > > > > > Yes but why do we do this raw_spinlock_t here? It is not yet needed on
> > > > > > v5.6-RT as I *did* check. It also complicates the code for !RT but
> > > > > > nobody responded to that part but…
> > > > >
> > > > > I did respond by pointing out that the essentially similar call_rcu()
> > > > > function ends up being invoked pretty much everywhere, including early
> > > > > boot before rcu_init() has been invoked. It is therefore only reasonable
> > > > > to assume that there will be a need for kfree_rcu() to tolerate a similar
> > > > > range of calling contexts.
> > > >
> > > > Early in the boot we have IRQs disabled but also one CPU and no
> > > > scheduling. That means that not a single lock is contended.
> > >
> > > You are saying that call_rcu() is never invoked while holding a raw
> > > spinlock?
> >
> > I didn't say that. I said if you use spin_lock() with interrupts
> > disabled *but* early in the boot process (without the scheduler active)
> > then it is okay.
> > This was a response to your "including early boot before …".
> > But since you ask: On top of my head I know that
> > task_struct is released via RCU by the scheduler in a preempt-disabled
> > section. We have a similar workaround for the mm struct. So yes, we have
> > at the very least those two.
In case your question is "Why can call_srcu() be restricted but not
kfree_rcu()", the answer is that call_srcu() has all of five call sites
outside of rcutorture and friends. So this restriction is much more
likely to stick for call_srcu() than for kfree_rcu().
Thanx, Paul
> > > Fair enough, and thank you for checking!
> > >
> > > The other work-queuing operations are also OK as they are, then?
> >
> > yes.
>
> Very good, and again, thank you!
>
> Thanx, Paul
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 1:17 ` Joel Fernandes
2020-04-20 1:44 ` Paul E. McKenney
@ 2020-04-20 3:02 ` Mike Galbraith
2020-04-20 12:30 ` joel
1 sibling, 1 reply; 85+ messages in thread
From: Mike Galbraith @ 2020-04-20 3:02 UTC (permalink / raw)
To: Joel Fernandes, Paul E. McKenney
Cc: Uladzislau Rezki, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner
On Sun, 2020-04-19 at 21:17 -0400, Joel Fernandes wrote:
>
> > > Can we do even better by moving the work-scheduling out from under the
> > > spinlock? This of course means that it is necessary to handle the
> > > occasional spurious call to the work handler, but that should be rare
> > > and should be in the noise compared to the reduction in contention.
> >
> > Yes I think that will be required since -rt will sleep on workqueue locks as
> > well :-(. I'm looking into it right now.
> >
> > /*
> > * If @work was previously on a different pool, it might still be
> > * running there, in which case the work needs to be queued on that
> > * pool to guarantee non-reentrancy.
> > */
> > last_pool = get_work_pool(work);
> > if (last_pool && last_pool != pwq->pool) {
> > struct worker *worker;
> >
> > spin_lock(&last_pool->lock);
>
> Hmm, I think moving schedule_delayed_work() outside lock will work. Just took
> a good look and that's not an issue. However calling schedule_delayed_work()
> itself is an issue if the caller of kfree_rcu() is !preemptible() on
> PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on pool->lock
> which can sleep on PREEMPT_RT :-(.
As of 4.19-rt, workqueue locks are converted to raw_spinlock_t.
-Mike
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-20 3:02 ` Mike Galbraith
@ 2020-04-20 12:30 ` joel
0 siblings, 0 replies; 85+ messages in thread
From: joel @ 2020-04-20 12:30 UTC (permalink / raw)
To: Mike Galbraith, Paul E. McKenney
Cc: Uladzislau Rezki, Sebastian Andrzej Siewior, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner
On April 19, 2020 11:02:41 PM EDT, Mike Galbraith <efault@gmx.de> wrote:
>On Sun, 2020-04-19 at 21:17 -0400, Joel Fernandes wrote:
>>
>> > > Can we do even better by moving the work-scheduling out from
>under the
>> > > spinlock? This of course means that it is necessary to handle
>the
>> > > occasional spurious call to the work handler, but that should be
>rare
>> > > and should be in the noise compared to the reduction in
>contention.
>> >
>> > Yes I think that will be required since -rt will sleep on workqueue
>locks as
>> > well :-(. I'm looking into it right now.
>> >
>> > /*
>> > * If @work was previously on a different pool, it might
>still be
>> > * running there, in which case the work needs to be queued
>on that
>> > * pool to guarantee non-reentrancy.
>> > */
>> > last_pool = get_work_pool(work);
>> > if (last_pool && last_pool != pwq->pool) {
>> > struct worker *worker;
>> >
>> > spin_lock(&last_pool->lock);
>>
>> Hmm, I think moving schedule_delayed_work() outside lock will work.
>Just took
>> a good look and that's not an issue. However calling
>schedule_delayed_work()
>> itself is an issue if the caller of kfree_rcu() is !preemptible() on
>> PREEMPT_RT. Because the schedule_delayed_work() calls spin_lock on
>pool->lock
>> which can sleep on PREEMPT_RT :-(.
>
>As of 4.19-rt, workqueue locks are converted to raw_spinlock_t.
>
Thanks for the clarification on that!!
- Joel
> -Mike
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-17 3:05 ` Joel Fernandes
2020-04-17 8:47 ` Uladzislau Rezki
2020-04-17 15:04 ` Sebastian Andrzej Siewior
@ 2020-04-17 16:11 ` Uladzislau Rezki
2020-04-19 12:15 ` Uladzislau Rezki
3 siblings, 0 replies; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-17 16:11 UTC (permalink / raw)
To: Joel Fernandes, Paul E. McKenney
Cc: Sebastian Andrzej Siewior, Paul E. McKenney, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith, urezki
On Thu, Apr 16, 2020 at 11:05:15PM -0400, Joel Fernandes wrote:
> On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > >
> > > We might need different calling-context restrictions for the two variants
> > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > check for "safe to use normal spinlock in -rt".
> >
> > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > This one will scream if you do
> > raw_spin_lock();
> > spin_lock();
> >
> > Sadly, as of today, there is code triggering this which needs to be
> > addressed first (but it is on the list of things to do).
> >
> > Given the thread so far, is it okay if I repost the series with
> > migrate_disable() instead of accepting a possible migration before
> > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > memory allocations in a possible atomic context) until we get there.
>
> I prefer something like the following to make it possible to invoke
> kfree_rcu() from atomic context considering call_rcu() is already callable
> from such contexts. Thoughts?
>
> (Only build tested)
> ---8<-----------------------
>
> From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
> Subject: [PATCH] rcu/tree: Avoid allocating in non-preemptible context for
> PREEMPT_RT kernels
>
> Per recent discussions, kfree_rcu() is a low-level facility which should be
> callable in atomic context (raw spinlock sections, IRQ disable sections etc).
>
> However, it depends on page allocation which acquires sleeping locks on
> PREEMPT_RT.
>
> In order to support all usecases, avoid the allocation of pages for
> PREEMPT_RT. The page allocation is just an optimization which does not
> break functionality. Further, in future patches the pages will be
> pre-allocated reducing the likelihood that page allocations will be
> needed.
>
> We also convert the spinlock_t to raw_spinlock_t so that it does not sleep
> in PREEMPT_RT's raw atomic critical sections.
>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
> kernel/rcu/tree.c | 42 +++++++++++++++++++++++++-----------------
> 1 file changed, 25 insertions(+), 17 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index f288477ee1c26..ba831712fb307 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2905,7 +2905,7 @@ struct kfree_rcu_cpu {
> struct kfree_rcu_bulk_data *bhead;
> struct kfree_rcu_bulk_data *bcached;
> struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> - spinlock_t lock;
> + raw_spinlock_t lock;
> struct delayed_work monitor_work;
> bool monitor_todo;
> bool initialized;
> @@ -2939,12 +2939,12 @@ static void kfree_rcu_work(struct work_struct *work)
> krwp = container_of(to_rcu_work(work),
> struct kfree_rcu_cpu_work, rcu_work);
> krcp = krwp->krcp;
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> head = krwp->head_free;
> krwp->head_free = NULL;
> bhead = krwp->bhead_free;
> krwp->bhead_free = NULL;
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
>
> /* "bhead" is now private, so traverse locklessly. */
> for (; bhead; bhead = bnext) {
> @@ -3047,14 +3047,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
> krcp->monitor_todo = false;
> if (queue_kfree_rcu_work(krcp)) {
> // Success! Our job is done here.
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> return;
> }
>
> // Previous RCU batch still in progress, try again later.
> krcp->monitor_todo = true;
> schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> /*
> @@ -3067,16 +3067,16 @@ static void kfree_rcu_monitor(struct work_struct *work)
> struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
> monitor_work.work);
>
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> if (krcp->monitor_todo)
> kfree_rcu_drain_unlock(krcp, flags);
> else
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> static inline bool
> kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
> - struct rcu_head *head, rcu_callback_t func)
> + struct rcu_head *head, rcu_callback_t func, bool alloc)
> {
> struct kfree_rcu_bulk_data *bnode;
>
> @@ -3092,6 +3092,10 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
> if (!bnode) {
> WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
>
> + /* If allocation is not allowed, don't do it. */
> + if (!alloc)
> + return false;
> +
> bnode = (struct kfree_rcu_bulk_data *)
> __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> }
> @@ -3138,11 +3142,15 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> {
> unsigned long flags;
> struct kfree_rcu_cpu *krcp;
> + bool alloc = true;
> +
> + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
> + alloc = false;
>
> local_irq_save(flags); // For safely calling this_cpu_ptr().
> krcp = this_cpu_ptr(&krc);
> if (krcp->initialized)
> - spin_lock(&krcp->lock);
> + raw_spin_lock(&krcp->lock);
>
> // Queue the object but don't yet schedule the batch.
> if (debug_rcu_head_queue(head)) {
> @@ -3156,7 +3164,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> * Under high memory pressure GFP_NOWAIT can fail,
> * in that case the emergency path is maintained.
> */
> - if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
> + if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func, alloc))) {
> head->func = func;
> head->next = krcp->head;
> krcp->head = head;
> @@ -3173,7 +3181,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>
> unlock_return:
> if (krcp->initialized)
> - spin_unlock(&krcp->lock);
> + raw_spin_unlock(&krcp->lock);
> local_irq_restore(flags);
> }
> EXPORT_SYMBOL_GPL(kfree_call_rcu);
> @@ -3205,11 +3213,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> count = krcp->count;
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> if (krcp->monitor_todo)
> kfree_rcu_drain_unlock(krcp, flags);
> else
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
>
> sc->nr_to_scan -= count;
> freed += count;
> @@ -3236,15 +3244,15 @@ void __init kfree_rcu_scheduler_running(void)
> for_each_online_cpu(cpu) {
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> if (!krcp->head || krcp->monitor_todo) {
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> continue;
> }
> krcp->monitor_todo = true;
> schedule_delayed_work_on(cpu, &krcp->monitor_work,
> KFREE_DRAIN_JIFFIES);
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
> }
>
> @@ -4140,7 +4148,7 @@ static void __init kfree_rcu_batch_init(void)
> for_each_possible_cpu(cpu) {
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
> - spin_lock_init(&krcp->lock);
> + raw_spin_lock_init(&krcp->lock);
> for (i = 0; i < KFREE_N_BATCHES; i++) {
> INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> krcp->krw_arr[i].krcp = krcp;
> --
> 2.26.1.301.g55bc3eb7cb9-goog
>
Forgot to add:
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
--
Vlad Rezki
^ permalink raw reply [flat|nested] 85+ messages in thread
* Re: [PATCH 1/3] rcu: Use static initializer for krc.lock
2020-04-17 3:05 ` Joel Fernandes
` (2 preceding siblings ...)
2020-04-17 16:11 ` Uladzislau Rezki
@ 2020-04-19 12:15 ` Uladzislau Rezki
3 siblings, 0 replies; 85+ messages in thread
From: Uladzislau Rezki @ 2020-04-19 12:15 UTC (permalink / raw)
To: Joel Fernandes
Cc: Sebastian Andrzej Siewior, Paul E. McKenney, Steven Rostedt, rcu,
Josh Triplett, Mathieu Desnoyers, Lai Jiangshan, Thomas Gleixner,
Mike Galbraith, urezki
On Thu, Apr 16, 2020 at 11:05:15PM -0400, Joel Fernandes wrote:
> On Thu, Apr 16, 2020 at 11:34:44PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2020-04-16 14:00:57 [-0700], Paul E. McKenney wrote:
> > >
> > > We might need different calling-context restrictions for the two variants
> > > of kfree_rcu(). And we might need to come up with some sort of lockdep
> > > check for "safe to use normal spinlock in -rt".
> >
> > Oh. We do have this already, it is called CONFIG_PROVE_RAW_LOCK_NESTING.
> > This one will scream if you do
> > raw_spin_lock();
> > spin_lock();
> >
> > Sadly, as of today, there is code triggering this which needs to be
> > addressed first (but it is on the list of things to do).
> >
> > Given the thread so far, is it okay if I repost the series with
> > migrate_disable() instead of accepting a possible migration before
> > grabbing the lock? I would prefer to avoid the extra RT case (avoiding
> > memory allocations in a possible atomic context) until we get there.
>
> I prefer something like the following to make it possible to invoke
> kfree_rcu() from atomic context considering call_rcu() is already callable
> from such contexts. Thoughts?
>
> (Only build tested)
> ---8<-----------------------
>
> From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
> Subject: [PATCH] rcu/tree: Avoid allocating in non-preemptible context for
> PREEMPT_RT kernels
>
> Per recent discussions, kfree_rcu() is a low-level facility which should be
> callable in atomic context (raw spinlock sections, IRQ disable sections etc).
>
> However, it depends on page allocation which acquires sleeping locks on
> PREEMPT_RT.
>
> In order to support all usecases, avoid the allocation of pages for
> PREEMPT_RT. The page allocation is just an optimization which does not
> break functionality. Further, in future patches the pages will be
> pre-allocated reducing the likelihood that page allocations will be
> needed.
>
> We also convert the spinlock_t to raw_spinlock_t so that it does not sleep
> in PREEMPT_RT's raw atomic critical sections.
>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
> kernel/rcu/tree.c | 42 +++++++++++++++++++++++++-----------------
> 1 file changed, 25 insertions(+), 17 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index f288477ee1c26..ba831712fb307 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2905,7 +2905,7 @@ struct kfree_rcu_cpu {
> struct kfree_rcu_bulk_data *bhead;
> struct kfree_rcu_bulk_data *bcached;
> struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> - spinlock_t lock;
> + raw_spinlock_t lock;
> struct delayed_work monitor_work;
> bool monitor_todo;
> bool initialized;
> @@ -2939,12 +2939,12 @@ static void kfree_rcu_work(struct work_struct *work)
> krwp = container_of(to_rcu_work(work),
> struct kfree_rcu_cpu_work, rcu_work);
> krcp = krwp->krcp;
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> head = krwp->head_free;
> krwp->head_free = NULL;
> bhead = krwp->bhead_free;
> krwp->bhead_free = NULL;
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
>
> /* "bhead" is now private, so traverse locklessly. */
> for (; bhead; bhead = bnext) {
> @@ -3047,14 +3047,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
> krcp->monitor_todo = false;
> if (queue_kfree_rcu_work(krcp)) {
> // Success! Our job is done here.
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> return;
> }
>
> // Previous RCU batch still in progress, try again later.
> krcp->monitor_todo = true;
> schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> /*
> @@ -3067,16 +3067,16 @@ static void kfree_rcu_monitor(struct work_struct *work)
> struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
> monitor_work.work);
>
> - spin_lock_irqsave(&krcp->lock, flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> if (krcp->monitor_todo)
> kfree_rcu_drain_unlock(krcp, flags);
> else
> - spin_unlock_irqrestore(&krcp->lock, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> static inline bool
> kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
> - struct rcu_head *head, rcu_callback_t func)
> + struct rcu_head *head, rcu_callback_t func, bool alloc)
> {
> struct kfree_rcu_bulk_data *bnode;
>
> @@ -3092,6 +3092,10 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
> if (!bnode) {
> WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
>
> + /* If allocation is not allowed, don't do it. */
> + if (!alloc)
> + return false;
> +
> bnode = (struct kfree_rcu_bulk_data *)
> __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> }
Joel, we also need to drop the lock before alloc_pages(). We are not
allowed to allocate with a raw_spin_lock held, because it uses
sleepable spinlocks. If i miss something, please fix me :)
<snip>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ba831712fb30..9a334e3c7f96 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3076,7 +3076,8 @@ static void kfree_rcu_monitor(struct work_struct *work)
static inline bool
kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
- struct rcu_head *head, rcu_callback_t func, bool alloc)
+ struct rcu_head *head, rcu_callback_t func, bool alloc,
+ unsigned long *flags)
{
struct kfree_rcu_bulk_data *bnode;
@@ -3096,8 +3097,22 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
if (!alloc)
return false;
+ /* Drop the lock. */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ migrate_disable();
+ raw_spin_unlock(&krcp->lock);
+ local_irq_restore(*flags);
+ }
+
bnode = (struct kfree_rcu_bulk_data *)
__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+ /* Grab the lock back. */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ local_irq_save(*flags);
+ raw_spin_lock(&krcp->lock);
+ migrate_enable();
+ }
}
/* Switch to emergency path. */
@@ -3164,7 +3179,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
* Under high memory pressure GFP_NOWAIT can fail,
* in that case the emergency path is maintained.
*/
- if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func, alloc))) {
+ if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func, alloc, &flags))) {
head->func = func;
head->next = krcp->head;
krcp->head = head;
<snip>
If everyone agrees to go with such solution, let's split this work into
at least two patches, one is converting to raw spinlocks and second one
is an allocation fix for RT case.
?
--
Vlad Rezki
^ permalink raw reply related [flat|nested] 85+ messages in thread