linux-fsdevel.vger.kernel.org archive mirror
* [PATCH 1/2] kselftests: introduce new epoll60 testcase for catching lost wakeups
@ 2020-04-30 13:03 Roman Penyaev
  2020-04-30 13:03 ` [PATCH 2/2] epoll: atomically remove wait entry on wake up Roman Penyaev
  0 siblings, 1 reply; 4+ messages in thread
From: Roman Penyaev @ 2020-04-30 13:03 UTC (permalink / raw)
  Cc: Roman Penyaev, Andrew Morton, Khazhismel Kumykov, Alexander Viro,
	Heiher, Jason Baron, linux-fsdevel, linux-kernel

This test case catches the lost wakeups introduced by:
  339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll")

The test is simple: we have 10 threads and 10 event fds. Each thread
can harvest only 1 event. One producer fires all 10 events at once and
waits until all 10 events have been observed by the 10 threads.

In case of a lost wakeup epoll_wait() times out and 0 is returned.

The test case catches two sorts of problems.  The first is a forgotten
wakeup for an event which hits the ->ovflist list; this problem was
fixed by:
  5a2513239750 ("eventpoll: fix missing wakeup for ovflist in ep_poll_callback")

The second is when several sequential events hit the same waiting
thread, so the other waiters get no wakeups.  This problem is fixed in
the following patch.

Signed-off-by: Roman Penyaev <rpenyaev@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Khazhismel Kumykov <khazhy@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Heiher <r@hev.cc>
Cc: Jason Baron <jbaron@akamai.com>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 .../filesystems/epoll/epoll_wakeup_test.c     | 146 ++++++++++++++++++
 1 file changed, 146 insertions(+)

diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
index 11eee0b60040..d979ff14775a 100644
--- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
+++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
@@ -3,6 +3,7 @@
 #define _GNU_SOURCE
 #include <poll.h>
 #include <unistd.h>
+#include <assert.h>
 #include <signal.h>
 #include <pthread.h>
 #include <sys/epoll.h>
@@ -3136,4 +3137,149 @@ TEST(epoll59)
 	close(ctx.sfd[0]);
 }
 
+enum {
+	EPOLL60_EVENTS_NR = 10,
+};
+
+struct epoll60_ctx {
+	volatile int stopped;
+	int ready;
+	int waiters;
+	int epfd;
+	int evfd[EPOLL60_EVENTS_NR];
+};
+
+static void *epoll60_wait_thread(void *ctx_)
+{
+	struct epoll60_ctx *ctx = ctx_;
+	struct epoll_event e;
+	sigset_t sigmask;
+	uint64_t v;
+	int ret;
+
+	/* Block SIGUSR1 */
+	sigemptyset(&sigmask);
+	sigaddset(&sigmask, SIGUSR1);
+	sigprocmask(SIG_SETMASK, &sigmask, NULL);
+
+	/* Prepare empty mask for epoll_pwait() */
+	sigemptyset(&sigmask);
+
+	while (!ctx->stopped) {
+		/* Mark we are ready */
+		__atomic_fetch_add(&ctx->ready, 1, __ATOMIC_ACQUIRE);
+
+		/* Start when all are ready */
+		while (__atomic_load_n(&ctx->ready, __ATOMIC_ACQUIRE) &&
+		       !ctx->stopped);
+
+		/* Account this waiter */
+		__atomic_fetch_add(&ctx->waiters, 1, __ATOMIC_ACQUIRE);
+
+		ret = epoll_pwait(ctx->epfd, &e, 1, 2000, &sigmask);
+		if (ret != 1) {
+			/* We expect only signal delivery on stop */
+			assert(ret < 0 && errno == EINTR && "Lost wakeup!\n");
+			assert(ctx->stopped);
+			break;
+		}
+
+		ret = read(e.data.fd, &v, sizeof(v));
+	/* Since we are in ET mode, each thread gets its own fd. */
+		assert(ret == sizeof(v));
+
+		__atomic_fetch_sub(&ctx->waiters, 1, __ATOMIC_RELEASE);
+	}
+
+	return NULL;
+}
+
+static inline unsigned long long msecs(void)
+{
+	struct timespec ts;
+	unsigned long long msecs;
+
+	clock_gettime(CLOCK_REALTIME, &ts);
+	msecs = ts.tv_sec * 1000ull;
+	msecs += ts.tv_nsec / 1000000ull;
+
+	return msecs;
+}
+
+static inline int count_waiters(struct epoll60_ctx *ctx)
+{
+	return __atomic_load_n(&ctx->waiters, __ATOMIC_ACQUIRE);
+}
+
+TEST(epoll60)
+{
+	struct epoll60_ctx ctx = { 0 };
+	pthread_t waiters[ARRAY_SIZE(ctx.evfd)];
+	struct epoll_event e;
+	int i, n, ret;
+
+	signal(SIGUSR1, signal_handler);
+
+	ctx.epfd = epoll_create1(0);
+	ASSERT_GE(ctx.epfd, 0);
+
+	/* Create event fds */
+	for (i = 0; i < ARRAY_SIZE(ctx.evfd); i++) {
+		ctx.evfd[i] = eventfd(0, EFD_NONBLOCK);
+		ASSERT_GE(ctx.evfd[i], 0);
+
+		e.events = EPOLLIN | EPOLLET;
+		e.data.fd = ctx.evfd[i];
+		ASSERT_EQ(epoll_ctl(ctx.epfd, EPOLL_CTL_ADD, ctx.evfd[i], &e), 0);
+	}
+
+	/* Create waiter threads */
+	for (i = 0; i < ARRAY_SIZE(waiters); i++)
+		ASSERT_EQ(pthread_create(&waiters[i], NULL,
+					 epoll60_wait_thread, &ctx), 0);
+
+	for (i = 0; i < 300; i++) {
+		uint64_t v = 1, ms;
+
+		/* Wait for all to be ready */
+		while (__atomic_load_n(&ctx.ready, __ATOMIC_ACQUIRE) !=
+		       ARRAY_SIZE(ctx.evfd))
+			;
+
+		/* Steady, go */
+		__atomic_fetch_sub(&ctx.ready, ARRAY_SIZE(ctx.evfd),
+				   __ATOMIC_ACQUIRE);
+
+		/* Wait until all have gone to the kernel */
+		while (count_waiters(&ctx) != ARRAY_SIZE(ctx.evfd))
+			;
+
+		/* 1ms should be enough to schedule away */
+		usleep(1000);
+
+		/* Quickly signal all handles at once */
+		for (n = 0; n < ARRAY_SIZE(ctx.evfd); n++) {
+			ret = write(ctx.evfd[n], &v, sizeof(v));
+			ASSERT_EQ(ret, sizeof(v));
+		}
+
+		/* Busy loop for 1s and wait for all waiters to wake up */
+		ms = msecs();
+		while (count_waiters(&ctx) && msecs() < ms + 1000)
+			;
+
+		ASSERT_EQ(count_waiters(&ctx), 0);
+	}
+	ctx.stopped = 1;
+	/* Stop waiters */
+	for (i = 0; i < ARRAY_SIZE(waiters); i++)
+		ret = pthread_kill(waiters[i], SIGUSR1);
+	for (i = 0; i < ARRAY_SIZE(waiters); i++)
+		pthread_join(waiters[i], NULL);
+
+	for (i = 0; i < ARRAY_SIZE(waiters); i++)
+		close(ctx.evfd[i]);
+	close(ctx.epfd);
+}
+
 TEST_HARNESS_MAIN
-- 
2.24.1



* [PATCH 2/2] epoll: atomically remove wait entry on wake up
  2020-04-30 13:03 [PATCH 1/2] kselftests: introduce new epoll60 testcase for catching lost wakeups Roman Penyaev
@ 2020-04-30 13:03 ` Roman Penyaev
  2020-05-01  2:55   ` Sasha Levin
  2020-05-01 19:27   ` Jason Baron
  0 siblings, 2 replies; 4+ messages in thread
From: Roman Penyaev @ 2020-04-30 13:03 UTC (permalink / raw)
  Cc: Roman Penyaev, Andrew Morton, Khazhismel Kumykov, Alexander Viro,
	Heiher, Jason Baron, stable, linux-fsdevel, linux-kernel

This patch does two things:

1. fixes a lost wakeup introduced by:
  339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll")

2. improves performance of event delivery.

The description of the problem is the following: if N (>1) threads
are waiting on ep->wq for new events and M (>1) events come, it is
quite likely that more than one wakeup hits the same wait queue entry,
because there is quite a big window between the
__add_wait_queue_exclusive() call and the following
__remove_wait_queue() call in ep_poll().  This can lead to lost
wakeups, because the thread that was woken up may not handle all the
events in ->rdllist. (The problem is described in more detail here:
https://lkml.org/lkml/2019/10/7/905)
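
For reference, here is a simplified sketch of the pre-patch ep_poll()
wait path (paraphrased from the code removed below, not verbatim
kernel source):

  init_waitqueue_entry(&wait, current);   /* no auto-removal on wakeup */
  write_lock_irq(&ep->lock);
  __add_wait_queue_exclusive(&ep->wq, &wait);
  write_unlock_irq(&ep->lock);

  for (;;) {
          set_current_state(TASK_INTERRUPTIBLE);
          if (ep_events_available(ep) || signal_pending(current) || timed_out)
                  break;
          if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
                  timed_out = 1;
  }
  __set_current_state(TASK_RUNNING);

  /* ... harvest events from ->rdllist ... */

  write_lock_irq(&ep->lock);
  __remove_wait_queue(&ep->wq, &wait);    /* entry stayed on ep->wq all along */
  write_unlock_irq(&ep->lock);

Until that final __remove_wait_queue() the entry remains on ep->wq, so
several wakeups for different events can all hit this one entry while
the other exclusive waiters keep sleeping.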

The idea of the current patch is to use init_wait() instead of
init_waitqueue_entry(). Internally init_wait() sets
autoremove_wake_function as the callback, which removes the wait entry
atomically (under the wq lock) from the list, so the next incoming
wakeup hits the next wait entry in the wait queue, preventing lost
wakeups.
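
For reference, autoremove_wake_function() (kernel/sched/wait.c) looks
roughly like the following; treat it as a sketch rather than the exact
upstream source:

  int autoremove_wake_function(struct wait_queue_entry *wq_entry,
                               unsigned int mode, int sync, void *key)
  {
          int ret = default_wake_function(wq_entry, mode, sync, key);

          /* Dequeue the entry only if the task was actually woken up. */
          if (ret)
                  list_del_init(&wq_entry->entry);

          return ret;
  }

The list_del_init() runs while the waker holds the waitqueue lock,
which is what makes the removal atomic with respect to other wakeups.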

The problem is reliably reproduced by the epoll60 test case [1].

Wait entry removal on wakeup also has performance benefits, because
there is no need to take ep->lock and remove the wait entry from the
queue after a successful wakeup. Here is the timing output of the
epoll60 test case:

  With explicit wakeup from ep_scan_ready_list() (the state of the
  code prior to 339ddb53d373):

    real    0m6.970s
    user    0m49.786s
    sys     0m0.113s

 After this patch:

   real    0m5.220s
   user    0m36.879s
   sys     0m0.019s

The other test case is stress-epoll [2], where one thread consumes
all the events and the other threads produce many events:

  With explicit wakeup from ep_scan_ready_list() (the state of the
  code prior to 339ddb53d373):

    threads  events/ms  run-time ms
          8       5427         1474
         16       6163         2596
         32       6824         4689
         64       7060         9064
        128       6991        18309

 After this patch:

    threads  events/ms  run-time ms
          8       5598         1429
         16       7073         2262
         32       7502         4265
         64       7640         8376
        128       7634        16767

 (number of "events/ms" represents event bandwidth, thus higher is
  better; number of "run-time ms" represents overall time spent
  doing the benchmark, thus lower is better)
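
As an illustration only, here is a minimal sketch of the kind of
workload stress-epoll exercises (this program, its file name and its
parameters are made up for this description and are not the code
from [2]):

  /* build (hypothetical): cc -O2 stress_sketch.c -o stress_sketch -lpthread */
  #include <stdint.h>
  #include <stdio.h>
  #include <time.h>
  #include <unistd.h>
  #include <pthread.h>
  #include <sys/epoll.h>
  #include <sys/eventfd.h>

  #define PRODUCERS 8
  #define EVENTS_PER_PRODUCER 100000

  static void *producer(void *arg)
  {
          uint64_t v = 1;
          int fd = *(int *)arg;

          /* each write increments the eventfd counter == one event */
          for (int i = 0; i < EVENTS_PER_PRODUCER; i++)
                  write(fd, &v, sizeof(v));
          return NULL;
  }

  int main(void)
  {
          pthread_t thr[PRODUCERS];
          int evfd[PRODUCERS];
          struct epoll_event e;
          struct timespec t0, t1;
          uint64_t v;
          long harvested = 0, total = (long)PRODUCERS * EVENTS_PER_PRODUCER;
          int epfd = epoll_create1(0);

          for (int i = 0; i < PRODUCERS; i++) {
                  evfd[i] = eventfd(0, EFD_NONBLOCK);
                  e.events = EPOLLIN | EPOLLET;
                  e.data.fd = evfd[i];
                  epoll_ctl(epfd, EPOLL_CTL_ADD, evfd[i], &e);
          }
          for (int i = 0; i < PRODUCERS; i++)
                  pthread_create(&thr[i], NULL, producer, &evfd[i]);

          clock_gettime(CLOCK_MONOTONIC, &t0);
          /* single consumer harvests everything */
          while (harvested < total) {
                  if (epoll_wait(epfd, &e, 1, 1000) != 1)
                          break;          /* idle for 1s: give up */
                  /* the read drains the counter, i.e. all writes seen so far */
                  if (read(e.data.fd, &v, sizeof(v)) == (ssize_t)sizeof(v))
                          harvested += v;
          }
          clock_gettime(CLOCK_MONOTONIC, &t1);

          for (int i = 0; i < PRODUCERS; i++)
                  pthread_join(thr[i], NULL);

          double ms = (t1.tv_sec - t0.tv_sec) * 1000.0 +
                      (t1.tv_nsec - t0.tv_nsec) / 1e6;
          printf("%ld events in %.0f ms (%.0f events/ms)\n",
                 harvested, ms, harvested / ms);
          return 0;
  }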

[1] tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
[2] https://github.com/rouming/test-tools/blob/master/stress-epoll.c

Signed-off-by: Roman Penyaev <rpenyaev@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Khazhismel Kumykov <khazhy@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Heiher <r@hev.cc>
Cc: Jason Baron <jbaron@akamai.com>
Cc: stable@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 fs/eventpoll.c | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d6ba0e52439b..aba03ee749f8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1822,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 {
 	int res = 0, eavail, timed_out = 0;
 	u64 slack = 0;
-	bool waiter = false;
 	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
 
@@ -1867,21 +1866,23 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	 */
 	ep_reset_busy_poll_napi_id(ep);
 
-	/*
-	 * We don't have any available event to return to the caller.  We need
-	 * to sleep here, and we will be woken by ep_poll_callback() when events
-	 * become available.
-	 */
-	if (!waiter) {
-		waiter = true;
-		init_waitqueue_entry(&wait, current);
-
+	do {
+		/*
+		 * Internally init_wait() uses autoremove_wake_function(),
+		 * thus the wait entry is removed from the wait queue on
+		 * each wakeup. Why is that important? With several waiters
+		 * each new wakeup hits the next waiter, giving it a chance
+		 * to harvest a new event; otherwise a wakeup can be lost.
+		 * This is also good performance-wise, because on the normal
+		 * wakeup path there is no need to call __remove_wait_queue()
+		 * explicitly, thus ep->lock is not taken, which would
+		 * otherwise stall event delivery.
+		 */
+		init_wait(&wait);
 		write_lock_irq(&ep->lock);
 		__add_wait_queue_exclusive(&ep->wq, &wait);
 		write_unlock_irq(&ep->lock);
-	}
 
-	for (;;) {
 		/*
 		 * We don't want to sleep if the ep_poll_callback() sends us
 		 * a wakeup in between. That's why we set the task state
@@ -1911,10 +1912,20 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 			timed_out = 1;
 			break;
 		}
-	}
+
+		/* We were woken up, thus go and try to harvest some events */
+		eavail = 1;
+
+	} while (0);
 
 	__set_current_state(TASK_RUNNING);
 
+	if (!list_empty_careful(&wait.entry)) {
+		write_lock_irq(&ep->lock);
+		__remove_wait_queue(&ep->wq, &wait);
+		write_unlock_irq(&ep->lock);
+	}
+
 send_events:
 	/*
 	 * Try to transfer events to user space. In case we get 0 events and
@@ -1925,12 +1936,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
 		goto fetch_events;
 
-	if (waiter) {
-		write_lock_irq(&ep->lock);
-		__remove_wait_queue(&ep->wq, &wait);
-		write_unlock_irq(&ep->lock);
-	}
-
 	return res;
 }
 
-- 
2.24.1



* Re: [PATCH 2/2] epoll: atomically remove wait entry on wake up
  2020-04-30 13:03 ` [PATCH 2/2] epoll: atomically remove wait entry on wake up Roman Penyaev
@ 2020-05-01  2:55   ` Sasha Levin
  2020-05-01 19:27   ` Jason Baron
  1 sibling, 0 replies; 4+ messages in thread
From: Sasha Levin @ 2020-05-01  2:55 UTC (permalink / raw)
  To: Sasha Levin, Roman Penyaev
  Cc: Roman Penyaev, Andrew Morton, Khazhismel Kumykov, Alexander Viro,
	Heiher, Jason Baron, stable, linux-fsdevel, linux-kernel

Hi

[This is an automated email]

This commit has been processed because it contains a -stable tag.
The stable tag indicates that it's relevant for the following trees: all

The bot has tested the following trees: v5.6.7, v5.4.35, v4.19.118, v4.14.177, v4.9.220, v4.4.220.

v5.6.7: Build OK!
v5.4.35: Build OK!
v4.19.118: Failed to apply! Possible dependencies:
    1b53734bd0b2 ("epoll: fix possible lost wakeup on epoll_ctl() path")
    35cff1a6e023 ("fs/epoll: rename check_events label to send_events")
    86c051793b4c ("fs/epoll: deal with wait_queue only once")
    abc610e01c66 ("fs/epoll: avoid barrier after an epoll_wait(2) timeout")
    c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()")

v4.14.177: Failed to apply! Possible dependencies:
    002b343669c4 ("fs/epoll: loosen irq safety in ep_scan_ready_list()")
    35cff1a6e023 ("fs/epoll: rename check_events label to send_events")
    37b5e5212a44 ("epoll: remove ep_call_nested() from ep_eventpoll_poll()")
    679abf381a18 ("fs/eventpoll.c: loosen irq safety in ep_poll()")
    69112736e2f0 ("eventpoll: no need to mask the result of epi_item_poll() again")
    86c051793b4c ("fs/epoll: deal with wait_queue only once")
    bec1a502d34d ("eventpoll: constify struct epoll_event pointers")
    c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()")
    d85e2aa2e34d ("annotate ep_scan_ready_list()")
    ee8ef0a4b167 ("epoll: use the waitqueue lock to protect ep->wq")

v4.9.220: Failed to apply! Possible dependencies:
    86c051793b4c ("fs/epoll: deal with wait_queue only once")
    89d947561077 ("sd: Implement support for ZBC devices")
    ac6424b981bc ("sched/wait: Rename wait_queue_t => wait_queue_entry_t")
    bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers")
    cf43e6be865a ("block: add scalable completion tracking of requests")
    e806402130c9 ("block: split out request-only flags into a new namespace")

v4.4.220: Failed to apply! Possible dependencies:
    27489a3c827b ("blk-mq: turn hctx->run_work into a regular work struct")
    511cbce2ff8b ("irq_poll: make blk-iopoll available outside the block layer")
    86c051793b4c ("fs/epoll: deal with wait_queue only once")
    8d354f133e86 ("blk-mq: improve layout of blk_mq_hw_ctx")
    9467f85960a3 ("blk-mq/cpu-notif: Convert to new hotplug state machine")
    ac6424b981bc ("sched/wait: Rename wait_queue_t => wait_queue_entry_t")
    bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers")
    cf43e6be865a ("block: add scalable completion tracking of requests")
    d9d8c5c489f4 ("block: convert is_sync helpers to use REQ_OPs.")
    da8b44d5a9f8 ("timer: convert timer_slack_ns from unsigned long to u64")
    e57690fe009b ("blk-mq: don't overwrite rq->mq_ctx")
    e6a40b096e28 ("block: prepare request creation/destruction code to use REQ_OPs")
    e806402130c9 ("block: split out request-only flags into a new namespace")


NOTE: The patch will not be queued to stable trees until it is upstream.

How should we proceed with this patch?

-- 
Thanks
Sasha


* Re: [PATCH 2/2] epoll: atomically remove wait entry on wake up
  2020-04-30 13:03 ` [PATCH 2/2] epoll: atomically remove wait entry on wake up Roman Penyaev
  2020-05-01  2:55   ` Sasha Levin
@ 2020-05-01 19:27   ` Jason Baron
  1 sibling, 0 replies; 4+ messages in thread
From: Jason Baron @ 2020-05-01 19:27 UTC (permalink / raw)
  To: Roman Penyaev
  Cc: Andrew Morton, Khazhismel Kumykov, Alexander Viro, Heiher,
	stable, linux-fsdevel, linux-kernel



On 4/30/20 9:03 AM, Roman Penyaev wrote:
> This patch does two things:
> 
> 1. fixes a lost wakeup introduced by:
>   339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll")
> 
> 2. improves performance of event delivery.
> 
> [...]

Looks good to me and nice speedups.
Reviewed-by: Jason Baron <jbaron@akamai.com>

Thanks,

-Jason


