signal: Allow RT tasks to cache one sigqueue struct
diff mbox series

Message ID 20210303142025.wbbt2nnr6dtgwjfi@linutronix.de
State New, archived
Headers show
Series
  • signal: Allow RT tasks to cache one sigqueue struct
Related show

Commit Message

Sebastian Andrzej Siewior March 3, 2021, 2:20 p.m. UTC
From: Thomas Gleixner <tglx@linutronix.de>

Allow realtime tasks to cache one sigqueue in task struct. This avoids an
allocation which can increase the latency or fail.
Ideally the sigqueue is cached after first successful delivery and will be
available for the next signal delivery. This works under the assumption that the RT
task never has an unprocessed signal while a new one is about to be queued.

The caching is not used for SIGQUEUE_PREALLOC because this kind of sigqueue is
handled differently (and not used for regular signal delivery).

[bigeasy: With a fix from Matt Fleming <matt@codeblueprint.co.uk>]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/sched.h  |  1 +
 include/linux/signal.h |  1 +
 kernel/exit.c          |  2 +-
 kernel/fork.c          |  1 +
 kernel/signal.c        | 65 +++++++++++++++++++++++++++++++++++++++---
 5 files changed, 65 insertions(+), 5 deletions(-)

Comments

Oleg Nesterov March 3, 2021, 3:37 p.m. UTC | #1
On 03/03, Sebastian Andrzej Siewior wrote:
>
> +static struct sigqueue *sigqueue_from_cache(struct task_struct *t)
> +{
> +	struct sigqueue *q = t->sigqueue_cache;
> +
> +	if (q && cmpxchg(&t->sigqueue_cache, q, NULL) == q)
> +		return q;
> +	return NULL;
> +}
> +
> +static bool sigqueue_add_cache(struct task_struct *t, struct sigqueue *q)
> +{
> +	if (!t->sigqueue_cache && cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
> +		return true;
> +	return false;
> +}

Do we really need cmpxchg? It seems they are always called with spinlock held.

>  static struct sigqueue *
> -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
> +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
> +		    int override_rlimit, bool fromslab)
>  {
>  	struct sigqueue *q = NULL;
>  	struct user_struct *user;
> @@ -432,7 +450,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
>  	rcu_read_unlock();
>
>  	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
> -		q = kmem_cache_alloc(sigqueue_cachep, flags);
> +		if (!fromslab)
> +			q = sigqueue_from_cache(t);
> +		if (!q)
> +			q = kmem_cache_alloc(sigqueue_cachep, flags);

I won't insist but afaics you can avoid the new arg/function and simplify this
patch. __sigqueue_alloc() can simply check "sig > 0" or valid_signal(sig) rather
than "!fromslab".

> +static void __sigqueue_cache_or_free(struct sigqueue *q)
> +{
> +	struct user_struct *up;
> +
> +	if (q->flags & SIGQUEUE_PREALLOC)
> +		return;
> +
> +	up = q->user;
> +	if (atomic_dec_and_test(&up->sigpending))
> +		free_uid(up);
> +	if (!task_is_realtime(current) || !sigqueue_add_cache(current, q))
> +		kmem_cache_free(sigqueue_cachep, q);
> +}

Well, this duplicates __sigqueue_free... Do we really need the new helper?
What if we simply change __sigqueue_free() to do sigqueue_add_cache() if
task_is_realtime() && !PF_EXITING ? This too can simplify the patch...

Oleg.
Eric W. Biederman March 3, 2021, 10:09 p.m. UTC | #2
Sebastian Andrzej Siewior <bigeasy@linutronix.de> writes:

> From: Thomas Gleixner <tglx@linutronix.de>
>
> Allow realtime tasks to cache one sigqueue in task struct. This avoids an
> allocation which can increase the latency or fail.
> Ideally the sigqueue is cached after first successful delivery and will be
> available for next signal delivery. This works under the assumption that the RT
> task has never an unprocessed signal while a one is about to be queued.
>
> The caching is not used for SIGQUEUE_PREALLOC because this kind of sigqueue is
> handled differently (and not used for regular signal delivery).

What part of this is about real time tasks?  This allows any task
to cache a sigqueue entry.

Either the patch is buggy or the description is.  Overall caching one
sigqueue entry doesn't look insane. But it would help to have a clear
description of what is going on.

Eric


> [bigeasy: With a fix from Matt Fleming <matt@codeblueprint.co.uk>]
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  include/linux/sched.h  |  1 +
>  include/linux/signal.h |  1 +
>  kernel/exit.c          |  2 +-
>  kernel/fork.c          |  1 +
>  kernel/signal.c        | 65 +++++++++++++++++++++++++++++++++++++++---
>  5 files changed, 65 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ef00bb22164cd..7009b25f48160 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -985,6 +985,7 @@ struct task_struct {
>  	/* Signal handlers: */
>  	struct signal_struct		*signal;
>  	struct sighand_struct __rcu		*sighand;
> +	struct sigqueue			*sigqueue_cache;
>  	sigset_t			blocked;
>  	sigset_t			real_blocked;
>  	/* Restored if set_restore_sigmask() was used: */
> diff --git a/include/linux/signal.h b/include/linux/signal.h
> index 205526c4003aa..d47a86790edc8 100644
> --- a/include/linux/signal.h
> +++ b/include/linux/signal.h
> @@ -265,6 +265,7 @@ static inline void init_sigpending(struct sigpending *sig)
>  }
>  
>  extern void flush_sigqueue(struct sigpending *queue);
> +extern void flush_task_sigqueue(struct task_struct *tsk);
>  
>  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
>  static inline int valid_signal(unsigned long sig)
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 04029e35e69af..346f7b76cecaa 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk)
>  	 * Do this under ->siglock, we can race with another thread
>  	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
>  	 */
> -	flush_sigqueue(&tsk->pending);
> +	flush_task_sigqueue(tsk);
>  	tsk->sighand = NULL;
>  	spin_unlock(&sighand->siglock);
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d66cd1014211b..a767e4e49a692 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1993,6 +1993,7 @@ static __latent_entropy struct task_struct *copy_process(
>  	spin_lock_init(&p->alloc_lock);
>  
>  	init_sigpending(&p->pending);
> +	p->sigqueue_cache = NULL;
>  
>  	p->utime = p->stime = p->gtime = 0;
>  #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
> diff --git a/kernel/signal.c b/kernel/signal.c
> index ba4d1ef39a9ea..d99273b798085 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -20,6 +20,7 @@
>  #include <linux/sched/task.h>
>  #include <linux/sched/task_stack.h>
>  #include <linux/sched/cputime.h>
> +#include <linux/sched/rt.h>
>  #include <linux/file.h>
>  #include <linux/fs.h>
>  #include <linux/proc_fs.h>
> @@ -404,13 +405,30 @@ void task_join_group_stop(struct task_struct *task)
>  	task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
>  }
>  
> +static struct sigqueue *sigqueue_from_cache(struct task_struct *t)
> +{
> +	struct sigqueue *q = t->sigqueue_cache;
> +
> +	if (q && cmpxchg(&t->sigqueue_cache, q, NULL) == q)
> +		return q;
> +	return NULL;
> +}
> +
> +static bool sigqueue_add_cache(struct task_struct *t, struct sigqueue *q)
> +{
> +	if (!t->sigqueue_cache && cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
> +		return true;
> +	return false;
> +}
> +
>  /*
>   * allocate a new signal queue record
>   * - this may be called without locks if and only if t == current, otherwise an
>   *   appropriate lock must be held to stop the target task from exiting
>   */
>  static struct sigqueue *
> -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
> +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
> +		    int override_rlimit, bool fromslab)
>  {
>  	struct sigqueue *q = NULL;
>  	struct user_struct *user;
> @@ -432,7 +450,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
>  	rcu_read_unlock();
>  
>  	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
> -		q = kmem_cache_alloc(sigqueue_cachep, flags);
> +		if (!fromslab)
> +			q = sigqueue_from_cache(t);
> +		if (!q)
> +			q = kmem_cache_alloc(sigqueue_cachep, flags);
>  	} else {
>  		print_dropped_signal(sig);
>  	}
> @@ -449,6 +470,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
>  	return q;
>  }
>  
> +static struct sigqueue *
> +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
> +		 int override_rlimit)
> +{
> +	return __sigqueue_do_alloc(sig, t, flags, override_rlimit, false);
> +}
> +
>  static void __sigqueue_free(struct sigqueue *q)
>  {
>  	if (q->flags & SIGQUEUE_PREALLOC)
> @@ -458,6 +486,20 @@ static void __sigqueue_free(struct sigqueue *q)
>  	kmem_cache_free(sigqueue_cachep, q);
>  }
>  
> +static void __sigqueue_cache_or_free(struct sigqueue *q)
> +{
> +	struct user_struct *up;
> +
> +	if (q->flags & SIGQUEUE_PREALLOC)
> +		return;
> +
> +	up = q->user;
> +	if (atomic_dec_and_test(&up->sigpending))
> +		free_uid(up);
> +	if (!task_is_realtime(current) || !sigqueue_add_cache(current, q))
> +		kmem_cache_free(sigqueue_cachep, q);
> +}
> +
>  void flush_sigqueue(struct sigpending *queue)
>  {
>  	struct sigqueue *q;
> @@ -470,6 +512,21 @@ void flush_sigqueue(struct sigpending *queue)
>  	}
>  }
>  
> +/*
> + * Called from __exit_signal. Flush tsk->pending and
> + * tsk->sigqueue_cache
> + */
> +void flush_task_sigqueue(struct task_struct *tsk)
> +{
> +	struct sigqueue *q;
> +
> +	flush_sigqueue(&tsk->pending);
> +
> +	q = sigqueue_from_cache(tsk);
> +	if (q)
> +		kmem_cache_free(sigqueue_cachep, q);
> +}
> +
>  /*
>   * Flush all pending signals for this kthread.
>   */
> @@ -594,7 +651,7 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i
>  			(info->si_code == SI_TIMER) &&
>  			(info->si_sys_private);
>  
> -		__sigqueue_free(first);
> +		__sigqueue_cache_or_free(first);
>  	} else {
>  		/*
>  		 * Ok, it wasn't in the queue.  This must be
> @@ -1807,7 +1864,7 @@ EXPORT_SYMBOL(kill_pid);
>   */
>  struct sigqueue *sigqueue_alloc(void)
>  {
> -	struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
> +	struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, true);
>  
>  	if (q)
>  		q->flags |= SIGQUEUE_PREALLOC;
Sebastian Andrzej Siewior March 4, 2021, 8:11 a.m. UTC | #3
On 2021-03-03 16:09:05 [-0600], Eric W. Biederman wrote:
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> writes:
> 
> > From: Thomas Gleixner <tglx@linutronix.de>
> >
> > Allow realtime tasks to cache one sigqueue in task struct. This avoids an
> > allocation which can increase the latency or fail.
> > Ideally the sigqueue is cached after first successful delivery and will be
> > available for next signal delivery. This works under the assumption that the RT
> > task has never an unprocessed signal while a one is about to be queued.
> >
> > The caching is not used for SIGQUEUE_PREALLOC because this kind of sigqueue is
> > handled differently (and not used for regular signal delivery).
> 
> What part of this is about real time tasks?  This allows any task
> to cache a sigqueue entry.

It is limited to realtime tasks (SCHED_FIFO/RR/DL):

+static void __sigqueue_cache_or_free(struct sigqueue *q)
+{
…
+	if (!task_is_realtime(current) || !sigqueue_add_cache(current, q))
+		kmem_cache_free(sigqueue_cachep, q);
+}

> Either the patch is buggy or the description is.  Overall caching one
> sigqueue entry doesn't look insane. But it would help to have a clear
> description of what is going on.

Does this clear things up or is my logic somehow broken here?

> Eric

Sebastian
Thomas Gleixner March 4, 2021, 3:02 p.m. UTC | #4
On Thu, Mar 04 2021 at 09:11, Sebastian Andrzej Siewior wrote:
> On 2021-03-03 16:09:05 [-0600], Eric W. Biederman wrote:
>> Sebastian Andrzej Siewior <bigeasy@linutronix.de> writes:
>> 
>> > From: Thomas Gleixner <tglx@linutronix.de>
>> >
>> > Allow realtime tasks to cache one sigqueue in task struct. This avoids an
>> > allocation which can increase the latency or fail.
>> > Ideally the sigqueue is cached after first successful delivery and will be
>> > available for next signal delivery. This works under the assumption that the RT
>> > task has never an unprocessed signal while a one is about to be queued.
>> >
>> > The caching is not used for SIGQUEUE_PREALLOC because this kind of sigqueue is
>> > handled differently (and not used for regular signal delivery).
>> 
>> What part of this is about real time tasks?  This allows any task
>> to cache a sigqueue entry.
>
> It is limited to realtime tasks (SCHED_FIFO/RR/DL):
>
> +static void __sigqueue_cache_or_free(struct sigqueue *q)
> +{
> …
> +	if (!task_is_realtime(current) || !sigqueue_add_cache(current, q))
> +		kmem_cache_free(sigqueue_cachep, q);
> +}

We could of course do the caching unconditionally for all tasks.

Thanks,

        tglx
Eric W. Biederman March 4, 2021, 7:01 p.m. UTC | #5
Sebastian Andrzej Siewior <bigeasy@linutronix.de> writes:

> On 2021-03-03 16:09:05 [-0600], Eric W. Biederman wrote:
>> Sebastian Andrzej Siewior <bigeasy@linutronix.de> writes:
>> 
>> > From: Thomas Gleixner <tglx@linutronix.de>
>> >
>> > Allow realtime tasks to cache one sigqueue in task struct. This avoids an
>> > allocation which can increase the latency or fail.
>> > Ideally the sigqueue is cached after first successful delivery and will be
>> > available for next signal delivery. This works under the assumption that the RT
>> > task has never an unprocessed signal while a one is about to be queued.
>> >
>> > The caching is not used for SIGQUEUE_PREALLOC because this kind of sigqueue is
>> > handled differently (and not used for regular signal delivery).
>> 
>> What part of this is about real time tasks?  This allows any task
>> to cache a sigqueue entry.
>
> It is limited to realtime tasks (SCHED_FIFO/RR/DL):
>
> +static void __sigqueue_cache_or_free(struct sigqueue *q)
> +{
> …
> +	if (!task_is_realtime(current) || !sigqueue_add_cache(current, q))
> +		kmem_cache_free(sigqueue_cachep, q);
> +}

I see now.  I was looking for it somewhere in the allocation side.
Oleg's suggestion of simply adding a few additional lines to
__sigqueue_free would have made this stand out more.

A __sigqueue_free that takes the relevant task_struct instead of always
assuming current would be nice here.


>> Either the patch is buggy or the description is.  Overall caching one
>> sigqueue entry doesn't look insane. But it would help to have a clear
>> description of what is going on.
>
> Does this clear things up or is my logic somehow broken here?

No I just missed the task_is_realtime limitation.

Eric
Eric W. Biederman March 4, 2021, 7:04 p.m. UTC | #6
Thomas Gleixner <tglx@linutronix.de> writes:

> On Thu, Mar 04 2021 at 09:11, Sebastian Andrzej Siewior wrote:
>> On 2021-03-03 16:09:05 [-0600], Eric W. Biederman wrote:
>>> Sebastian Andrzej Siewior <bigeasy@linutronix.de> writes:
>>> 
>>> > From: Thomas Gleixner <tglx@linutronix.de>
>>> >
>>> > Allow realtime tasks to cache one sigqueue in task struct. This avoids an
>>> > allocation which can increase the latency or fail.
>>> > Ideally the sigqueue is cached after first successful delivery and will be
>>> > available for next signal delivery. This works under the assumption that the RT
>>> > task has never an unprocessed signal while a one is about to be queued.
>>> >
>>> > The caching is not used for SIGQUEUE_PREALLOC because this kind of sigqueue is
>>> > handled differently (and not used for regular signal delivery).
>>> 
>>> What part of this is about real time tasks?  This allows any task
>>> to cache a sigqueue entry.
>>
>> It is limited to realtime tasks (SCHED_FIFO/RR/DL):
>>
>> +static void __sigqueue_cache_or_free(struct sigqueue *q)
>> +{
>> …
>> +	if (!task_is_realtime(current) || !sigqueue_add_cache(current, q))
>> +		kmem_cache_free(sigqueue_cachep, q);
>> +}
>
> We could of course do the caching unconditionally for all tasks.

Is there any advantage to only doing this for realtime tasks?

If not it probably makes sense to do the caching for all tasks.

I am wondering if we want to count the cached sigqueue structure to the
users rt signal rlimit?

Eric
Thomas Gleixner March 4, 2021, 8:58 p.m. UTC | #7
On Thu, Mar 04 2021 at 13:04, Eric W. Biederman wrote:
> Thomas Gleixner <tglx@linutronix.de> writes:
>>
>> We could of course do the caching unconditionally for all tasks.
>
> Is there any advantage to only doing this for realtime tasks?

It was mostly to avoid tons of cached entries hanging around all over
the place. So I limited it to the case which the RT users deeply cared
about. Also related to the accounting question below.

> If not it probably makes sense to do the caching for all tasks.
>
> I am wondering if we want to count the cached sigqueue structure to the
> users rt signal rlimit?

That makes some sense, but that's a user visible change as a single
signal will up the count for a tasks lifetime while today it is removed
from accounting again once the signal is delivered. So that needs some
thought.

Thanks,

        tglx
Thomas Gleixner March 4, 2021, 9:10 p.m. UTC | #8
On Wed, Mar 03 2021 at 16:37, Oleg Nesterov wrote:
> On 03/03, Sebastian Andrzej Siewior wrote:
>>
>> +static struct sigqueue *sigqueue_from_cache(struct task_struct *t)
>> +{
>> +	struct sigqueue *q = t->sigqueue_cache;
>> +
>> +	if (q && cmpxchg(&t->sigqueue_cache, q, NULL) == q)
>> +		return q;
>> +	return NULL;
>> +}
>> +
>> +static bool sigqueue_add_cache(struct task_struct *t, struct sigqueue *q)
>> +{
>> +	if (!t->sigqueue_cache && cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
>> +		return true;
>> +	return false;
>> +}
>
> Do we really need cmpxchg? It seems they are always called with
> spinlock held.

With which spinlock held?

__send_signal()         <- sighand::siglock held
  __sigqueue_alloc()

alloc_posix_timer()
  sigqueue_alloc()      <- No lock held
    __sigqueue_alloc()

and on the free side we have a bunch of callers which do not hold
sighand::siglock either. So the cmpxchg() is required.

Thanks,

        tglx
Thomas Gleixner March 4, 2021, 9:14 p.m. UTC | #9
On Wed, Mar 03 2021 at 16:37, Oleg Nesterov wrote:
> On 03/03, Sebastian Andrzej Siewior wrote:
>> +static void __sigqueue_cache_or_free(struct sigqueue *q)
>> +{
>> +	struct user_struct *up;
>> +
>> +	if (q->flags & SIGQUEUE_PREALLOC)
>> +		return;
>> +
>> +	up = q->user;
>> +	if (atomic_dec_and_test(&up->sigpending))
>> +		free_uid(up);
>> +	if (!task_is_realtime(current) || !sigqueue_add_cache(current, q))
>> +		kmem_cache_free(sigqueue_cachep, q);
>> +}
>
> Well, this duplicates __sigqueue_free... Do we really need the new helper?
> What if we simply change __sigqueue_free() to do sigqueue_add_cache() if
> task_is_realtime() && !PF_EXITING ? This too can simplify the patch...

Need to stare at all callers of __sigqueue_free() whether they are
really happy about this. Even if not, this surely can be deduplicated.

Thanks,

        tglx
Oleg Nesterov March 5, 2021, 10:57 a.m. UTC | #10
On 03/04, Thomas Gleixner wrote:
>
> On Wed, Mar 03 2021 at 16:37, Oleg Nesterov wrote:
> > On 03/03, Sebastian Andrzej Siewior wrote:
> >>
> >> +static struct sigqueue *sigqueue_from_cache(struct task_struct *t)
> >> +{
> >> +	struct sigqueue *q = t->sigqueue_cache;
> >> +
> >> +	if (q && cmpxchg(&t->sigqueue_cache, q, NULL) == q)
> >> +		return q;
> >> +	return NULL;
> >> +}
> >> +
> >> +static bool sigqueue_add_cache(struct task_struct *t, struct sigqueue *q)
> >> +{
> >> +	if (!t->sigqueue_cache && cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
> >> +		return true;
> >> +	return false;
> >> +}
> >
> > Do we really need cmpxchg? It seems they are always called with
> > spinlock held.
>
> With which spinlock held?
>
> __send_signal()         <- sighand::siglock held
>   __sigqueue_alloc()
>
> alloc_posix_timer()
>   sigqueue_alloc()      <- No lock held
>     __sigqueue_alloc()

In the last case "fromslab" is true, sigqueue_from_cache() won't be called.

> and on the free side we have a bunch of callers which do not hold
> sighand::siglock either.

Where?

Oleg.
Thomas Gleixner March 10, 2021, 6:54 p.m. UTC | #11
On Thu, Mar 04 2021 at 21:58, Thomas Gleixner wrote:
> On Thu, Mar 04 2021 at 13:04, Eric W. Biederman wrote:
>> Thomas Gleixner <tglx@linutronix.de> writes:
>>>
>>> We could of course do the caching unconditionally for all tasks.
>>
>> Is there any advantage to only doing this for realtime tasks?
>
> It was mostly to avoid tons of cached entries hanging around all over
> the place. So I limited it to the case which the RT users deeply cared
> about. Also related to the accounting question below.
>
>> If not it probably makes sense to do the caching for all tasks.
>>
>> I am wondering if we want to count the cached sigqueue structure to the
>> users rt signal rlimit?
>
> That makes some sense, but that's a user visible change as a single
> signal will up the count for a tasks lifetime while today it is removed
> from accounting again once the signal is delivered. So that needs some
> thought.

Thought more about it. To make this accounting useful we'd need:

  - a separate user::sigqueue_cached counter
  - a separate RLIMIT_SIGQUEUE_CACHED

Then you need to think about the defaults. Any signal heavy application
will want this enabled and obviously automagically.

Also there is an argument not to have this due to possible pointless
memory consumption.

But what are we talking about? 80 bytes worth of memory per task in the
worst case. Which is compared to the rest of a task's memory consumption
just noise.

Looking at some statistics from a devel system there are less than 10
items cached when the machine is fully idle after boot. During a kernel
compile the cache utilization goes up to ~150 at max (make -j128 and 64
CPUs). What's interesting is the allocation statistics after boot and
full kernel compile:

  from slab:            23996
  from task cache:	52223

A typical pattern there is:

    <ls>-58490   [010] d..2  7765.664198: __sigqueue_alloc: 58488 from slab ffff8881132df460 10
    <ls>-58488   [002] d..1  7765.664294: __sigqueue_free.part.35: cache ffff8881132df460 10
    <ls>-58488   [002] d..2  7765.665146: __sigqueue_alloc: 1149 from cache ffff8881103dc550 10
     bash-1149   [000] d..2  7765.665220: exit_task_sighand: free ffff8881132df460 8 9
     bash-1149   [000] d..1  7765.665662: __sigqueue_free.part.35: cache ffff8881103dc550 9

58488 grabs the sigqueue from bash's task cache and bash sticks it back
in. Lather, rinse and repeat. 

IMO, not bothering with an extra counter and rlimit plus the required
atomic operations is just fine and having this for all tasks
unconditionally looks like a clear win.

I'll post an updated version of this soonish.

Thanks,

        tglx
Eric W. Biederman March 10, 2021, 9:57 p.m. UTC | #12
Thomas Gleixner <tglx@linutronix.de> writes:

> On Thu, Mar 04 2021 at 21:58, Thomas Gleixner wrote:
>> On Thu, Mar 04 2021 at 13:04, Eric W. Biederman wrote:
>>> Thomas Gleixner <tglx@linutronix.de> writes:
>>>>
>>>> We could of course do the caching unconditionally for all tasks.
>>>
>>> Is there any advantage to only doing this for realtime tasks?
>>
>> It was mostly to avoid tons of cached entries hanging around all over
>> the place. So I limited it to the case which the RT users deeply cared
>> about. Also related to the accounting question below.
>>
>>> If not it probably makes sense to do the caching for all tasks.
>>>
>>> I am wondering if we want to count the cached sigqueue structure to the
>>> users rt signal rlimit?
>>
>> That makes some sense, but that's a user visible change as a single
>> signal will up the count for a tasks lifetime while today it is removed
>> from accounting again once the signal is delivered. So that needs some
>> thought.
>
> Thought more about it. To make this accounting useful we'd need:
>
>   - a seperate user::sigqueue_cached counter
>   - a seperate RLIMIT_SIGQUEUE_CACHED
>
> Then you need to think about the defaults. Any signal heavy application
> will want this enabled and obviously automagically.
>
> Also there is an argument not to have this due to possible pointless
> memory consumption.
>
> But what are we talking about? 80 bytes worth of memory per task in the
> worst case. Which is compared to the rest of a task's memory consumption
> just noise.
>
> Looking at some statistics from a devel system there are less than 10
> items cached when the machine is fully idle after boot. During a kernel
> compile the cache utilization goes up to ~150 at max (make -j128 and 64
> CPUs). What's interesting is the allocation statistics after boot and
> full kernel compile:
>
>   from slab:            23996
>   from task cache:	52223
>
> A typical pattern there is:
>
>     <ls>-58490   [010] d..2  7765.664198: __sigqueue_alloc: 58488 from slab ffff8881132df460 10
>     <ls>-58488   [002] d..1  7765.664294: __sigqueue_free.part.35: cache ffff8881132df460 10
>     <ls>-58488   [002] d..2  7765.665146: __sigqueue_alloc: 1149 from cache ffff8881103dc550 10
>      bash-1149   [000] d..2  7765.665220: exit_task_sighand: free ffff8881132df460 8 9
>      bash-1149   [000] d..1  7765.665662: __sigqueue_free.part.35: cache ffff8881103dc550 9
>
> 58488 grabs the sigqueue from bash's task cache and bash sticks it back
> in. Lather, rinse and repeat. 
>
> IMO, not bothering with an extra counter and rlimit plus the required
> atomic operations is just fine and having this for all tasks
> unconditionally looks like a clear win.
>
> I'll post an updated version of this soonish.

That looks like a good analysis.

I see that there is a sigqueue_cachep.  As I recall there are per cpu
caches and all kinds of other good stuff when using kmem_cache_alloc.

Are those goodies falling down?

I am just a little unclear on why a slab allocation is sufficiently
problematic that we want to avoid it.

Eric
Thomas Gleixner March 10, 2021, 11:56 p.m. UTC | #13
On Wed, Mar 10 2021 at 15:57, Eric W. Biederman wrote:
> Thomas Gleixner <tglx@linutronix.de> writes:
>> IMO, not bothering with an extra counter and rlimit plus the required
>> atomic operations is just fine and having this for all tasks
>> unconditionally looks like a clear win.
>>
>> I'll post an updated version of this soonish.
>
> That looks like a good analysis.
>
> I see that there is a sigqueue_cachep.  As I recall there are per cpu
> caches and all kinds of other good stuff when using kmem_cache_alloc.
>
> Are those goodies falling down?
>
> I am just a little unclear on why a slab allocation is sufficiently
> problematic that we want to avoid it.

In the normal case it's not problematic at all. i.e. when the per cpu
cache can directly fullfil the allocation in the fast path. Once that
fails you're off into latency land...

For the usual setup probably not an issue at all, but for real time
processing it matters.

Vs. the dedicated kmem cache for sigqueue. That's a red herring. By
default kmem caches are shared/merged as I learned today and if you want
dedicated ones you need to boot with 'slab_nomerge' on the command line.

So without that option (which is of course not backwards compatible
because the original behaviour was the other way around) your signal
kmem cache might end up in a shared/merged kmem cache. Just do:

  cat /proc/slabinfo | grep sig

and the default will find:

signal_cache        6440   6440   1152   28    8 : tunables    0    0    0 : slabdata    230    230      0
sighand_cache       3952   4035   2112   15    8 : tunables    0    0    0 : slabdata    269    269      0

But of course there is no way to figure out where your cache actually
landed and then with 'slab_nomerge' you'll get:

sigqueue            3264   3264     80   51    1 : tunables    0    0    0 : slabdata     64     64      0
signal_cache        6440   6440   1152   28    8 : tunables    0    0    0 : slabdata    230    230      0
sighand_cache       3952   4035   2112   15    8 : tunables    0    0    0 : slabdata    269    269      0

Don't worry about the 'active objects' field. That's just bonkers
because SLUB has no proper accounting for active objects. That number is
useless ...

Not even CONFIG_SLUB_STATS=y will give you anything useful. I had to
hack my own statistics into the signal code to gather these numbers
!$@**!^?#!

But why I'm not surprised? This stuff is optimized for High Frequency
Trading which is useless by definition. Oh well...

Rant aside, there is no massive benefit of doing that caching in
general, but there is not much of a downside either and for particular
use cases it's useful even outside of PREEMPT_RT.

IMO, having it there unconditionally is better than yet another special
cased hackery.

Thanks,

        tglx
Thomas Gleixner March 11, 2021, 12:45 p.m. UTC | #14
On Thu, Mar 11 2021 at 00:56, Thomas Gleixner wrote:
> Rant aside, there is no massive benefit of doing that caching in
> general, but there is not much of a downside either and for particular
> use cases it's useful even outside of PREEMPT_RT.
>
> IMO, having it there unconditionally is better than yet another special
> cased hackery.

Just did some micro instrumentation to measure the time spent in
__sigqueue_alloc/free() with and without the caching.

Unsurprisingly that results in a time reduction of ~67% saving about 3us
per alloc/free pair. Not hugely relevant for a kernel build but for
anything which is signal heavy it'll make a difference.

Thanks,

        tglx
Thomas Gleixner March 11, 2021, 2:20 p.m. UTC | #15
On Thu, Mar 11 2021 at 13:45, Thomas Gleixner wrote:

> On Thu, Mar 11 2021 at 00:56, Thomas Gleixner wrote:
>> Rant aside, there is no massive benefit of doing that caching in
>> general, but there is not much of a downside either and for particular
>> use cases it's useful even outside of PREEMPT_RT.
>>
>> IMO, having it there unconditionally is better than yet another special
>> cased hackery.
>
> Just did some micro instrumentation to measure the time spent in
> __sigqueue_alloc/free() with and without the caching.
>
> Unsurprisingly that results in a time reduction of ~67% saving about 3us
> per alloc/free pair. Not hugely relevant for a kernel build but for
> anything which is signal heavy it'll make an difference.

That's all fastpath allocations and nothing which hit the slow path,
which would be way worse.

Thanks,

        tglx
Eric W. Biederman March 11, 2021, 4:32 p.m. UTC | #16
Thomas Gleixner <tglx@linutronix.de> writes:

> On Wed, Mar 10 2021 at 15:57, Eric W. Biederman wrote:
>> Thomas Gleixner <tglx@linutronix.de> writes:
>>> IMO, not bothering with an extra counter and rlimit plus the required
>>> atomic operations is just fine and having this for all tasks
>>> unconditionally looks like a clear win.
>>>
>>> I'll post an updated version of this soonish.
>>
>> That looks like a good analysis.
>>
>> I see that there is a sigqueue_cachep.  As I recall there are per cpu
>> caches and all kinds of other good stuff when using kmem_cache_alloc.
>>
>> Are those goodies falling down?
>>
>> I am just a little unclear on why a slab allocation is sufficiently
>> problematic that we want to avoid it.
>
> In the normal case it's not problematic at all. i.e. when the per cpu
> cache can directly fullfil the allocation in the fast path. Once that
> fails you're off into latency land...
>
> For the usual setup probably not an issue at all, but for real time
> processing it matters.
>
> Vs. the dedicated kmem cache for sigqueue. That's a red herring. By
> default kmem caches are shared/merged as I learned today and if you want
> dedicated ones you need to boot with 'slab_nomerge' on the command line.
>
> So without that option (which is of course not backwards compatible
> because the original behaviour was the other way around) your signal
> kmem cache might end up in a shared/merged kmem cache. Just do:
>
>   cat /proc/slabinfo | grep sig
>
> and the default will find:
>
> signal_cache        6440   6440   1152   28    8 : tunables    0    0    0 : slabdata    230    230      0
> sighand_cache       3952   4035   2112   15    8 : tunables    0    0    0 : slabdata    269    269      0
>
> But of course there is no way to figure out where your cache actually
> landed and then with with 'slab_nomerge' you'll get:
>
> sigqueue            3264   3264     80   51    1 : tunables    0    0    0 : slabdata     64     64      0
> signal_cache        6440   6440   1152   28    8 : tunables    0    0    0 : slabdata    230    230      0
> sighand_cache       3952   4035   2112   15    8 : tunables    0    0    0 : slabdata    269    269      0
>
> Don't worry about the 'active objects' field. That's just bonkers
> because SLUB has no proper accounting for active objects. That number is
> useless ...
>
> Not even CONFIG_SLUB_STATS=y will give you anything useful. I had to
> hack my own statistics into the signal code to gather these numbers
> !$@**!^?#!
>
> But why I'm not surprised? This stuff is optimized for High Frequency
> Trading which is useless by definition. Oh well...
>
> Rant aside, there is no massive benefit of doing that caching in
> general, but there is not much of a downside either and for particular
> use cases it's useful even outside of PREEMPT_RT.
>
> IMO, having it there unconditionally is better than yet another special
> cased hackery.

Sounds reasonable, and thank you for actually looking into it.  I think
a comment saying this gives a strong guarantee that as long as userspace
plays by the rules (aka max one outstanding signal per process)
userspace gets a low latency guarantee.

Eric

Patch
diff mbox series

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ef00bb22164cd..7009b25f48160 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -985,6 +985,7 @@  struct task_struct {
 	/* Signal handlers: */
 	struct signal_struct		*signal;
 	struct sighand_struct __rcu		*sighand;
+	struct sigqueue			*sigqueue_cache;
 	sigset_t			blocked;
 	sigset_t			real_blocked;
 	/* Restored if set_restore_sigmask() was used: */
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 205526c4003aa..d47a86790edc8 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -265,6 +265,7 @@  static inline void init_sigpending(struct sigpending *sig)
 }
 
 extern void flush_sigqueue(struct sigpending *queue);
+extern void flush_task_sigqueue(struct task_struct *tsk);
 
 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
 static inline int valid_signal(unsigned long sig)
diff --git a/kernel/exit.c b/kernel/exit.c
index 04029e35e69af..346f7b76cecaa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -152,7 +152,7 @@  static void __exit_signal(struct task_struct *tsk)
 	 * Do this under ->siglock, we can race with another thread
 	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 	 */
-	flush_sigqueue(&tsk->pending);
+	flush_task_sigqueue(tsk);
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index d66cd1014211b..a767e4e49a692 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1993,6 +1993,7 @@  static __latent_entropy struct task_struct *copy_process(
 	spin_lock_init(&p->alloc_lock);
 
 	init_sigpending(&p->pending);
+	p->sigqueue_cache = NULL;
 
 	p->utime = p->stime = p->gtime = 0;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
diff --git a/kernel/signal.c b/kernel/signal.c
index ba4d1ef39a9ea..d99273b798085 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -20,6 +20,7 @@ 
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/rt.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
@@ -404,13 +405,30 @@  void task_join_group_stop(struct task_struct *task)
 	task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
 }
 
+static struct sigqueue *sigqueue_from_cache(struct task_struct *t)
+{
+	struct sigqueue *q = t->sigqueue_cache;
+
+	if (q && cmpxchg(&t->sigqueue_cache, q, NULL) == q)
+		return q;
+	return NULL;
+}
+
+static bool sigqueue_add_cache(struct task_struct *t, struct sigqueue *q)
+{
+	if (!t->sigqueue_cache && cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
+		return true;
+	return false;
+}
+
 /*
  * allocate a new signal queue record
  * - this may be called without locks if and only if t == current, otherwise an
  *   appropriate lock must be held to stop the target task from exiting
  */
 static struct sigqueue *
-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
+__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
+		    int override_rlimit, bool fromslab)
 {
 	struct sigqueue *q = NULL;
 	struct user_struct *user;
@@ -432,7 +450,10 @@  __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 	rcu_read_unlock();
 
 	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
-		q = kmem_cache_alloc(sigqueue_cachep, flags);
+		if (!fromslab)
+			q = sigqueue_from_cache(t);
+		if (!q)
+			q = kmem_cache_alloc(sigqueue_cachep, flags);
 	} else {
 		print_dropped_signal(sig);
 	}
@@ -449,6 +470,13 @@  __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 	return q;
 }
 
+static struct sigqueue *
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
+		 int override_rlimit)
+{
+	return __sigqueue_do_alloc(sig, t, flags, override_rlimit, false);
+}
+
 static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
@@ -458,6 +486,20 @@  static void __sigqueue_free(struct sigqueue *q)
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
+static void __sigqueue_cache_or_free(struct sigqueue *q)
+{
+	struct user_struct *up;
+
+	if (q->flags & SIGQUEUE_PREALLOC)
+		return;
+
+	up = q->user;
+	if (atomic_dec_and_test(&up->sigpending))
+		free_uid(up);
+	if (!task_is_realtime(current) || !sigqueue_add_cache(current, q))
+		kmem_cache_free(sigqueue_cachep, q);
+}
+
 void flush_sigqueue(struct sigpending *queue)
 {
 	struct sigqueue *q;
@@ -470,6 +512,21 @@  void flush_sigqueue(struct sigpending *queue)
 	}
 }
 
+/*
+ * Called from __exit_signal. Flush tsk->pending and
+ * tsk->sigqueue_cache
+ */
+void flush_task_sigqueue(struct task_struct *tsk)
+{
+	struct sigqueue *q;
+
+	flush_sigqueue(&tsk->pending);
+
+	q = sigqueue_from_cache(tsk);
+	if (q)
+		kmem_cache_free(sigqueue_cachep, q);
+}
+
 /*
  * Flush all pending signals for this kthread.
  */
@@ -594,7 +651,7 @@  static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i
 			(info->si_code == SI_TIMER) &&
 			(info->si_sys_private);
 
-		__sigqueue_free(first);
+		__sigqueue_cache_or_free(first);
 	} else {
 		/*
 		 * Ok, it wasn't in the queue.  This must be
@@ -1807,7 +1864,7 @@  EXPORT_SYMBOL(kill_pid);
  */
 struct sigqueue *sigqueue_alloc(void)
 {
-	struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
+	struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, true);
 
 	if (q)
 		q->flags |= SIGQUEUE_PREALLOC;