linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v1] fs/epoll: use a per-cpu counter for user's watches count
@ 2021-08-02  3:20 Nicholas Piggin
  2021-08-04 15:22 ` Guenter Roeck
  0 siblings, 1 reply; 3+ messages in thread
From: Nicholas Piggin @ 2021-08-02  3:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nicholas Piggin, Alexander Viro, linux-fsdevel, linux-kernel,
	Anton Blanchard

This counter tracks the number of watches a user has, to compare against
the 'max_user_watches' limit. This causes a scalability bottleneck on
SPECjbb2015 on large systems as there is only one user. Changing to a
per-cpu counter increases throughput of the benchmark by about 30% on a
16-socket, > 1000 thread system.

Reported-by: Anton Blanchard <anton@ozlabs.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 fs/eventpoll.c             | 18 ++++++++++--------
 include/linux/sched/user.h |  3 ++-
 kernel/user.c              |  9 +++++++++
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1e596e1d0bba..648ed77f4164 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	 */
 	call_rcu(&epi->rcu, epi_rcu_free);
 
-	atomic_long_dec(&ep->user->epoll_watches);
+	percpu_counter_dec(&ep->user->epoll_watches);
 
 	return 0;
 }
@@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 {
 	int error, pwake = 0;
 	__poll_t revents;
-	long user_watches;
 	struct epitem *epi;
 	struct ep_pqueue epq;
 	struct eventpoll *tep = NULL;
@@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 
 	lockdep_assert_irqs_enabled();
 
-	user_watches = atomic_long_read(&ep->user->epoll_watches);
-	if (unlikely(user_watches >= max_user_watches))
+	if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
+					    max_user_watches) >= 0))
 		return -ENOSPC;
-	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
+	percpu_counter_inc(&ep->user->epoll_watches);
+
+	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
+		percpu_counter_dec(&ep->user->epoll_watches);
 		return -ENOMEM;
+	}
 
 	/* Item initialization follow here ... */
 	INIT_LIST_HEAD(&epi->rdllink);
@@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 		mutex_lock_nested(&tep->mtx, 1);
 	/* Add the current item to the list of active epoll hook for this file */
 	if (unlikely(attach_epitem(tfile, epi) < 0)) {
-		kmem_cache_free(epi_cache, epi);
 		if (tep)
 			mutex_unlock(&tep->mtx);
+		kmem_cache_free(epi_cache, epi);
+		percpu_counter_dec(&ep->user->epoll_watches);
 		return -ENOMEM;
 	}
 
 	if (full_check && !tep)
 		list_file(tfile);
 
-	atomic_long_inc(&ep->user->epoll_watches);
-
 	/*
 	 * Add the current item to the RB tree. All RB tree operations are
 	 * protected by "mtx", and ep_insert() is called with "mtx" held.
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 2462f7d07695..00ed419dd464 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -4,6 +4,7 @@
 
 #include <linux/uidgid.h>
 #include <linux/atomic.h>
+#include <linux/percpu_counter.h>
 #include <linux/refcount.h>
 #include <linux/ratelimit.h>
 
@@ -13,7 +14,7 @@
 struct user_struct {
 	refcount_t __count;	/* reference count */
 #ifdef CONFIG_EPOLL
-	atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
+	struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
 #endif
 	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
 	atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */
diff --git a/kernel/user.c b/kernel/user.c
index c82399c1618a..a2673f940506 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -138,6 +138,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
+	percpu_counter_destroy(&up->epoll_watches);
 	kmem_cache_free(uid_cachep, up);
 }
 
@@ -185,6 +186,10 @@ struct user_struct *alloc_uid(kuid_t uid)
 
 		new->uid = uid;
 		refcount_set(&new->__count, 1);
+		if (percpu_counter_init(&new->epoll_watches, 0, GFP_KERNEL)) {
+			kmem_cache_free(uid_cachep, new);
+			return NULL;
+		}
 		ratelimit_state_init(&new->ratelimit, HZ, 100);
 		ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
 
@@ -195,6 +200,7 @@ struct user_struct *alloc_uid(kuid_t uid)
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
+			percpu_counter_destroy(&new->epoll_watches);
 			kmem_cache_free(uid_cachep, new);
 		} else {
 			uid_hash_insert(new, hashent);
@@ -216,6 +222,9 @@ static int __init uid_cache_init(void)
 	for(n = 0; n < UIDHASH_SZ; ++n)
 		INIT_HLIST_HEAD(uidhash_table + n);
 
+	if (percpu_counter_init(&root_user.epoll_watches, 0, GFP_KERNEL))
+		panic("percpu cpunter alloc failed");
+
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
 	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
-- 
2.23.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v1] fs/epoll: use a per-cpu counter for user's watches count
  2021-08-02  3:20 [PATCH v1] fs/epoll: use a per-cpu counter for user's watches count Nicholas Piggin
@ 2021-08-04 15:22 ` Guenter Roeck
  2021-08-04 19:06   ` Randy Dunlap
  0 siblings, 1 reply; 3+ messages in thread
From: Guenter Roeck @ 2021-08-04 15:22 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Andrew Morton, Alexander Viro, linux-fsdevel, linux-kernel,
	Anton Blanchard

On Mon, Aug 02, 2021 at 01:20:13PM +1000, Nicholas Piggin wrote:
> This counter tracks the number of watches a user has, to compare against
> the 'max_user_watches' limit. This causes a scalability bottleneck on
> SPECjbb2015 on large systems as there is only one user. Changing to a
> per-cpu counter increases throughput of the benchmark by about 30% on a
> 16-socket, > 1000 thread system.
> 
> Reported-by: Anton Blanchard <anton@ozlabs.org>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

With all tinyconfig builds (and all other builds with CONFIG_EPOLL=n),
this patch results in:

kernel/user.c: In function 'free_user':
kernel/user.c:141:35: error: 'struct user_struct' has no member named 'epoll_watches'
  141 |         percpu_counter_destroy(&up->epoll_watches);
      |                                   ^~
kernel/user.c: In function 'alloc_uid':
kernel/user.c:189:45: error: 'struct user_struct' has no member named 'epoll_watches'
  189 |                 if (percpu_counter_init(&new->epoll_watches, 0, GFP_KERNEL)) {
      |                                             ^~
kernel/user.c:203:52: error: 'struct user_struct' has no member named 'epoll_watches'
  203 |                         percpu_counter_destroy(&new->epoll_watches);
      |                                                    ^~
kernel/user.c: In function 'uid_cache_init':
kernel/user.c:225:43: error: 'struct user_struct' has no member named 'epoll_watches'
  225 |         if (percpu_counter_init(&root_user.epoll_watches, 0, GFP_KERNEL))
      |                                           ^

Guenter

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v1] fs/epoll: use a per-cpu counter for user's watches count
  2021-08-04 15:22 ` Guenter Roeck
@ 2021-08-04 19:06   ` Randy Dunlap
  0 siblings, 0 replies; 3+ messages in thread
From: Randy Dunlap @ 2021-08-04 19:06 UTC (permalink / raw)
  To: Guenter Roeck, Nicholas Piggin
  Cc: Andrew Morton, Alexander Viro, linux-fsdevel, linux-kernel,
	Anton Blanchard

On 8/4/21 8:22 AM, Guenter Roeck wrote:
> On Mon, Aug 02, 2021 at 01:20:13PM +1000, Nicholas Piggin wrote:
>> This counter tracks the number of watches a user has, to compare against
>> the 'max_user_watches' limit. This causes a scalability bottleneck on
>> SPECjbb2015 on large systems as there is only one user. Changing to a
>> per-cpu counter increases throughput of the benchmark by about 30% on a
>> 16-socket, > 1000 thread system.
>>
>> Reported-by: Anton Blanchard <anton@ozlabs.org>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> 
> With all tinyconfig builds (and all other builds with CONFIG_EPOLL=n),
> this patch results in:
> 
> kernel/user.c: In function 'free_user':
> kernel/user.c:141:35: error: 'struct user_struct' has no member named 'epoll_watches'
>    141 |         percpu_counter_destroy(&up->epoll_watches);
>        |                                   ^~
> kernel/user.c: In function 'alloc_uid':
> kernel/user.c:189:45: error: 'struct user_struct' has no member named 'epoll_watches'
>    189 |                 if (percpu_counter_init(&new->epoll_watches, 0, GFP_KERNEL)) {
>        |                                             ^~
> kernel/user.c:203:52: error: 'struct user_struct' has no member named 'epoll_watches'
>    203 |                         percpu_counter_destroy(&new->epoll_watches);
>        |                                                    ^~
> kernel/user.c: In function 'uid_cache_init':
> kernel/user.c:225:43: error: 'struct user_struct' has no member named 'epoll_watches'
>    225 |         if (percpu_counter_init(&root_user.epoll_watches, 0, GFP_KERNEL))
>        |                                           ^
> 
> Guenter
> 

Hi,
Nick and I have also posted patches for this.

https://lore.kernel.org/lkml/1628051945.fens3r99ox.astroid@bobo.none/

thanks.--
~Randy


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-08-04 19:07 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-02  3:20 [PATCH v1] fs/epoll: use a per-cpu counter for user's watches count Nicholas Piggin
2021-08-04 15:22 ` Guenter Roeck
2021-08-04 19:06   ` Randy Dunlap

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).