Re: [CFT][PATCH] ucounts: Fix signal ucount refcounting

From: ebiederm@xmission.com (Eric W. Biederman)
To: Hillf Danton <hdanton@sina.com>
Cc: Rune Kleveland <rune.kleveland@infomedia.dk>,
	Yu Zhao <yuzhao@google.com>, Alexey Gladkov <legion@kernel.org>,
	Jordan Glover <Golden_Miller83@protonmail.ch>,
	LKML <linux-kernel@vger.kernel.org>,
	linux-mm@kvack.org, containers@lists.linux-foundation.org
Subject: Re: [CFT][PATCH] ucounts: Fix signal ucount refcounting
Date: Sat, 16 Oct 2021 13:00:49 -0500	[thread overview]
Message-ID: <87czo4voha.fsf@disp2133> (raw)
In-Reply-To: <20211016020833.1538-1-hdanton@sina.com> (Hillf Danton's message of "Sat, 16 Oct 2021 10:08:33 +0800")

Hillf Danton <hdanton@sina.com> writes:

> On Fri, 15 Oct 2021 17:10:58 -0500 Eric W. Biederman wrote:
>> 
>> In commit fda31c50292a ("signal: avoid double atomic counter
>> increments for user accounting") Linus made a clever optimization to
>> how rlimits and the struct user_struct are handled.  Unfortunately that
>> optimization does not work in the obvious way when moved to nested
>> rlimits.  The problem is that the last decrement of the per user
>> namespace per user sigpending counter might also be the last decrement
>> of the sigpending counter in the parent user namespace as well.  Which
>> means that simply freeing the leaf ucount in __free_sigqueue is not
>> enough.
>> 
>> Maintain the optimization and handle the tricky cases by introducing
>> inc_rlimit_get_ucounts and dec_rlimit_put_ucounts.
>> 
>> By moving the entire optimization into functions that perform all of
>> the work it becomes possible to ensure that every level is handled
>> properly.
>> 
>> I wish we had a single user across all of the threads whose rlimit
>> could be charged so we did not need this complexity.
>> 
>> Cc: stable@vger.kernel.org
>> Fixes: d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts")
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>> 
>> With a lot of help from Alex, who found a way I could reproduce this,
>> I believe I have found the issue.
>> 
>> Could people who are seeing this issue test and verify this solves the
>> problem for them?
>> 
>>  include/linux/user_namespace.h |  2 ++
>>  kernel/signal.c                | 25 +++++----------------
>>  kernel/ucount.c                | 41 ++++++++++++++++++++++++++++++++++
>>  3 files changed, 49 insertions(+), 19 deletions(-)
>> 
>> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
>> index eb70cabe6e7f..33a4240e6a6f 100644
>> --- a/include/linux/user_namespace.h
>> +++ b/include/linux/user_namespace.h
>> @@ -127,6 +127,8 @@ static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type t
>>  
>>  long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
>>  bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
>> +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type);
>> +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type);
>>  bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
>>  
>>  static inline void set_rlimit_ucount_max(struct user_namespace *ns,
>> diff --git a/kernel/signal.c b/kernel/signal.c
>> index a3229add4455..762de58c6e76 100644
>> --- a/kernel/signal.c
>> +++ b/kernel/signal.c
>> @@ -425,22 +425,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
>>  	 */
>>  	rcu_read_lock();
>>  	ucounts = task_ucounts(t);
>> -	sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
>> -	switch (sigpending) {
>> -	case 1:
>> -		if (likely(get_ucounts(ucounts)))
>> -			break;
>> -		fallthrough;
>> -	case LONG_MAX:
>> -		/*
>> -		 * we need to decrease the ucount in the userns tree on any
>> -		 * failure to avoid counts leaking.
>> -		 */
>> -		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
>> -		rcu_read_unlock();
>> -		return NULL;
>> -	}
>> +	sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
>>  	rcu_read_unlock();
>> +	if (sigpending == LONG_MAX)
>> +		return NULL;
>>  
>>  	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
>>  		q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
>> @@ -449,8 +437,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
>>  	}
>>  
>>  	if (unlikely(q == NULL)) {
>> -		if (dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1))
>> -			put_ucounts(ucounts);
>> +		dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
>>  	} else {
>>  		INIT_LIST_HEAD(&q->list);
>>  		q->flags = sigqueue_flags;
>> @@ -463,8 +450,8 @@ static void __sigqueue_free(struct sigqueue *q)
>>  {
>>  	if (q->flags & SIGQUEUE_PREALLOC)
>>  		return;
>> -	if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) {
>> -		put_ucounts(q->ucounts);
>> +	if (q->ucounts) {
>> +		dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING);
>>  		q->ucounts = NULL;
>>  	}
>>  	kmem_cache_free(sigqueue_cachep, q);
>> diff --git a/kernel/ucount.c b/kernel/ucount.c
>> index 3b7e176cf7a2..687d77aa66bb 100644
>> --- a/kernel/ucount.c
>> +++ b/kernel/ucount.c
>> @@ -285,6 +285,47 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
>>  	return (new == 0);
>>  }
>>  
>> +static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
>> +				struct ucounts *last, enum ucount_type type)
>> +{
>> +	struct ucounts *iter;
>> +	for (iter = ucounts; iter != last; iter = iter->ns->ucounts) {
>> +		long dec = atomic_long_add_return(-1, &iter->ucount[type]);
>> +		WARN_ON_ONCE(dec < 0);
>> +		if (dec == 0)
>> +			put_ucounts(iter);
>> +	}
>
> Given kfree in put_ucounts(), this has difficulty surviving tests like
> kasan if the put pairs with the get in the below
> inc_rlimit_get_ucounts().

I don't know if this is what you are thinking about but there is indeed
a bug in that loop caused by kfree.

The problem is that iter->ns->ucounts is read after put_ucounts(iter),
which may already have freed iter.  It just needs to be read beforehand.

>> +}
>> +
>> +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type)
>> +{
>> +	do_dec_rlimit_put_ucounts(ucounts, NULL, type);
>> +}
>> +
>> +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
>> +{
>> +	struct ucounts *iter;
>> +	long dec, ret = 0;
>> +
>> +	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
>> +		long max = READ_ONCE(iter->ns->ucount_max[type]);
>> +		long new = atomic_long_add_return(1, &iter->ucount[type]);
>> +		if (new < 0 || new > max)
>> +			goto unwind;
>> +		else if (iter == ucounts)
>> +			ret = new;
>> +		if ((new == 1) && (get_ucounts(iter) != iter))
>> +			goto dec_unwind;
>
> Add a line of comment for get to ease readers.

/* you are not expected to understand this */

I think that is the classic comment from the Unix source.  Seriously, I
can't think of any comment that will make the situation more comprehensible.

> Hillf
>
>> +	}
>> +	return ret;
>> +dec_unwind:
>> +	dec = atomic_long_add_return(1, &iter->ucount[type]);
>> +	WARN_ON_ONCE(dec < 0);
>> +unwind:
>> +	do_dec_rlimit_put_ucounts(ucounts, iter, type);
>> +	return LONG_MAX;
>> +}
>> +
>>  bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
>>  {
>>  	struct ucounts *iter;
>> -- 
>> 2.20.1

Eric