linux-kernel.vger.kernel.org archive mirror
* [PATCH] epoll: remove ep_call_nested() from ep_eventpoll_poll()
From: Jason Baron @ 2017-10-31  6:10 UTC
  To: akpm; +Cc: linux-kernel, Davidlohr Bueso, Alexander Viro, Salman Qazi, Hou Tao

ep_call_nested() is used in ep_eventpoll_poll(), the .poll routine for
an epoll fd, to prevent excessively deep epoll nesting and to prevent
circular paths. However, we already prevent both of these conditions
during EPOLL_CTL_ADD. With respect to nesting depth, we do allow the
epoll fds themselves to be nested deeper than EP_MAX_NESTS, but we do
not allow more than EP_MAX_NESTS levels once an epoll file descriptor
is actually connected to a wakeup source. Thus, ep_call_nested() is not
required here, since ep_eventpoll_poll(), which is called via
ep_scan_ready_list(), only continues nesting if there are events
available. Since ep_call_nested() is implemented using a global lock,
applications that make use of nested epoll can see large performance
improvements with this change.
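
For reference, the nesting in question looks roughly like the following
minimal userspace sketch (illustration only, not part of this patch):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int inner, outer, pfd[2];
	struct epoll_event ev = { .events = EPOLLIN };

	pipe(pfd);

	/* the inner epoll fd watches the pipe, the real wakeup source */
	inner = epoll_create1(0);
	epoll_ctl(inner, EPOLL_CTL_ADD, pfd[0], &ev);

	/* the outer epoll fd watches the inner epoll fd itself, so
	 * polling outer ends up in ep_eventpoll_poll() for inner */
	outer = epoll_create1(0);
	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

	write(pfd[1], "x", 1);

	/* readiness propagates up through the nested epoll fd */
	if (epoll_wait(outer, &ev, 1, 1000) == 1)
		printf("inner epoll fd reported ready\n");
	return 0;
}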

Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Salman Qazi <sqazi@google.com>
Cc: Hou Tao <houtao1@huawei.com>
---
 fs/eventpoll.c | 80 +++++++++++++++++++++++++---------------------------------
 1 file changed, 35 insertions(+), 45 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 69acfab..2e85951 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex);
 /* Used to check for epoll file descriptor inclusion loops */
 static struct nested_calls poll_loop_ncalls;
 
-/* Used to call file's f_op->poll() under the nested calls boundaries */
-static struct nested_calls poll_readywalk_ncalls;
-
 /* Slab cache used to allocate "struct epitem" */
 static struct kmem_cache *epi_cache __read_mostly;
 
@@ -867,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+			       void *priv);
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+				 poll_table *pt);
+
+/*
+ * Differs from ep_eventpoll_poll() in that internal callers already have
+ * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
+ * is correctly annotated.
+ */
+static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth)
 {
+	struct eventpoll *ep;
+	bool locked;
+
 	pt->_key = epi->event.events;
+	if (!is_file_epoll(epi->ffd.file))
+		return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
+		       epi->event.events;
 
-	return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+	ep = epi->ffd.file->private_data;
+	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
+	locked = pt && (pt->_qproc == ep_ptable_queue_proc);
+
+	return ep_scan_ready_list(epi->ffd.file->private_data,
+				  ep_read_events_proc, &depth, depth,
+				  locked) & epi->event.events;
 }
 
 static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
@@ -879,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 {
 	struct epitem *epi, *tmp;
 	poll_table pt;
+	int depth = *(int *)priv;
 
 	init_poll_funcptr(&pt, NULL);
+	depth++;
 
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
-		if (ep_item_poll(epi, &pt))
+		if (ep_item_poll(epi, &pt, depth)) {
 			return POLLIN | POLLRDNORM;
-		else {
+		} else {
 			/*
 			 * Item has been dropped into the ready list by the poll
 			 * callback, but it's not actually ready, as far as
@@ -899,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 	return 0;
 }
 
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
-				 poll_table *pt);
-
-struct readyevents_arg {
-	struct eventpoll *ep;
-	bool locked;
-};
-
-static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
-{
-	struct readyevents_arg *arg = priv;
-
-	return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
-				  call_nests + 1, arg->locked);
-}
-
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
-	int pollflags;
 	struct eventpoll *ep = file->private_data;
-	struct readyevents_arg arg;
-
-	/*
-	 * During ep_insert() we already hold the ep->mtx for the tfile.
-	 * Prevent re-aquisition.
-	 */
-	arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
-	arg.ep = ep;
+	int depth = 0;
 
 	/* Insert inside our poll wait queue */
 	poll_wait(file, &ep->poll_wait, wait);
 
 	/*
 	 * Proceed to find out if wanted events are really available inside
-	 * the ready list. This need to be done under ep_call_nested()
-	 * supervision, since the call to f_op->poll() done on listed files
-	 * could re-enter here.
+	 * the ready list.
 	 */
-	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, &arg, ep, current);
-
-	return pollflags != -1 ? pollflags : 0;
+	return ep_scan_ready_list(ep, ep_read_events_proc,
+				  &depth, depth, false);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1459,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	 * this operation completes, the poll callback can start hitting
 	 * the new item.
 	 */
-	revents = ep_item_poll(epi, &epq.pt);
+	revents = ep_item_poll(epi, &epq.pt, 1);
 
 	/*
 	 * We have to check if something went wrong during the poll wait queue
@@ -1593,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-	revents = ep_item_poll(epi, &pt);
+	revents = ep_item_poll(epi, &pt, 1);
 
 	/*
 	 * If the item is "hot" and it is not registered inside the ready
@@ -1661,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 
 		list_del_init(&epi->rdllink);
 
-		revents = ep_item_poll(epi, &pt);
+		revents = ep_item_poll(epi, &pt, 1);
 
 		/*
 		 * If the event mask intersect the caller-requested one,
@@ -2307,9 +2300,6 @@ static int __init eventpoll_init(void)
 	ep_nested_calls_init(&poll_safewake_ncalls);
 #endif
 
-	/* Initialize the structure used to perform file's f_op->poll() calls */
-	ep_nested_calls_init(&poll_readywalk_ncalls);
-
 	/*
 	 * We can have many thousands of epitems, so prevent this from
 	 * using an extra cache line on 64-bit (and smaller) CPUs
-- 
2.6.1


* Re: [PATCH] epoll: remove ep_call_nested() from ep_eventpoll_poll()
From: Davidlohr Bueso @ 2017-10-31 14:58 UTC
  To: Jason Baron; +Cc: akpm, linux-kernel, Alexander Viro, Salman Qazi, Hou Tao

On Tue, 31 Oct 2017, Jason Baron wrote:

>ep_call_nested() is used in ep_eventpoll_poll(), the .poll routine for
>an epoll fd, to prevent excessively deep epoll nesting and to prevent
>circular paths. However, we already prevent both of these conditions
>during EPOLL_CTL_ADD. With respect to nesting depth, we do allow the
>epoll fds themselves to be nested deeper than EP_MAX_NESTS, but we do
>not allow more than EP_MAX_NESTS levels once an epoll file descriptor
>is actually connected to a wakeup source. Thus, ep_call_nested() is not
>required here, since ep_eventpoll_poll(), which is called via
>ep_scan_ready_list(), only continues nesting if there are events
>available. Since ep_call_nested() is implemented using a global lock,
>applications that make use of nested epoll can see large performance
>improvements with this change.

Improvements are quite obscene, actually, such as for the following epoll_wait()
benchmark with 2-level nesting on an 80-core IvyBridge:

ncpus  vanilla     dirty     delta
1      2447092     3028315   +23.75%
4      231265      2986954   +1191.57%
8      121631      2898796   +2283.27%
16     59749       2902056   +4757.07%
32     26837       2326314   +8568.30%
64     12926       1341281   +10276.61%

(http://linux-scalability.org/epoll/epoll-test.c)
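
Roughly, the test has the following shape (my sketch only, not the
actual benchmark above; the thread count handling, run time and
accounting are made up):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/epoll.h>
#include <unistd.h>

#define MAX_THREADS 128

static int outer;
static volatile int stop;
static unsigned long counts[MAX_THREADS];

static void *worker(void *arg)
{
	long id = (long)arg;
	struct epoll_event ev;

	/* every thread hammers epoll_wait() on the shared nested fd;
	 * before the patch, each call serialized on the global
	 * poll_readywalk_ncalls lock inside ep_call_nested() */
	while (!stop)
		if (epoll_wait(outer, &ev, 1, 0) > 0)
			counts[id]++;
	return NULL;
}

int main(int argc, char **argv)
{
	int nthreads = argc > 1 ? atoi(argv[1]) : 4;
	int inner, pfd[2];
	struct epoll_event ev = { .events = EPOLLIN };
	pthread_t tid[MAX_THREADS];
	unsigned long total = 0;
	long i;

	if (nthreads > MAX_THREADS)
		nthreads = MAX_THREADS;

	/* 2-level nesting: outer epoll fd -> inner epoll fd -> pipe */
	pipe(pfd);
	inner = epoll_create1(0);
	epoll_ctl(inner, EPOLL_CTL_ADD, pfd[0], &ev);
	outer = epoll_create1(0);
	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);
	write(pfd[1], "x", 1);	/* keep the chain permanently ready */

	for (i = 0; i < nthreads; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);
	sleep(5);
	stop = 1;
	for (i = 0; i < nthreads; i++) {
		pthread_join(tid[i], NULL);
		total += counts[i];
	}
	printf("%d threads: %lu epoll_wait() calls in 5s\n",
	       nthreads, total);
	return 0;
}

(Build with gcc -O2 -pthread; run as ./a.out <nthreads>.)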

Thanks,
Davidlohr


* Re: [PATCH] epoll: remove ep_call_nested() from ep_eventpoll_poll()
From: Andrew Morton @ 2017-11-01 20:53 UTC
  To: Davidlohr Bueso
  Cc: Jason Baron, linux-kernel, Alexander Viro, Salman Qazi, Hou Tao

On Tue, 31 Oct 2017 07:58:21 -0700 Davidlohr Bueso <dave@stgolabs.net> wrote:

> On Tue, 31 Oct 2017, Jason Baron wrote:
> 
> >ep_call_nested() is used in ep_eventpoll_poll(), the .poll routine for
> >an epoll fd, to prevent excessively deep epoll nesting and to prevent
> >circular paths. However, we already prevent both of these conditions
> >during EPOLL_CTL_ADD. With respect to nesting depth, we do allow the
> >epoll fds themselves to be nested deeper than EP_MAX_NESTS, but we do
> >not allow more than EP_MAX_NESTS levels once an epoll file descriptor
> >is actually connected to a wakeup source. Thus, ep_call_nested() is not
> >required here, since ep_eventpoll_poll(), which is called via
> >ep_scan_ready_list(), only continues nesting if there are events
> >available. Since ep_call_nested() is implemented using a global lock,
> >applications that make use of nested epoll can see large performance
> >improvements with this change.
> 
> Improvements are quite obscene, actually, such as for the following epoll_wait()
> benchmark with 2-level nesting on an 80-core IvyBridge:
> 
> ncpus  vanilla     dirty     delta
> 1      2447092     3028315   +23.75%
> 4      231265      2986954   +1191.57%
> 8      121631      2898796   +2283.27%
> 16     59749       2902056   +4757.07%
> 32     26837       2326314   +8568.30%
> 64     12926       1341281   +10276.61%
> 
> (http://linux-scalability.org/epoll/epoll-test.c)

This is tempting, but boy it is late in the -rc cycle.

How important are these workloads?  Would the world end if we held off
on this for 4.15?


* Re: [PATCH] epoll: remove ep_call_nested() from ep_eventpoll_poll()
From: Davidlohr Bueso @ 2017-11-01 21:07 UTC
  To: Andrew Morton
  Cc: Jason Baron, linux-kernel, Alexander Viro, Salman Qazi, Hou Tao

On Wed, 01 Nov 2017, Andrew Morton wrote:

>This is tempting, but boy it is late in the -rc cycle.
>
>How important are these workloads?  Would the world end if we held off
>on this for 4.15?

While it's very important to one customer, nested epoll is certainly not
the common case, and this performance bottleneck has been there for 7+
years, so no hurry in that regard.

I'd like to target v4.16 for some of the epoll patch series out there
that (1) remove the remaining loop_ncalls list and (2) remove the global
epmutex lock. This patch would fit in fine there.

Now, could you please pick this patch up and just leave it in linux-next
for a while to get some more testing/exposure? I mainly ask because the
patch will most likely be running in production before then.

Thanks,
Davidlohr


* Re: [PATCH] epoll: remove ep_call_nested() from ep_eventpoll_poll()
From: Davidlohr Bueso @ 2017-11-01 21:11 UTC
  To: Andrew Morton
  Cc: Jason Baron, linux-kernel, Alexander Viro, Salman Qazi, Hou Tao

On Wed, 01 Nov 2017, Davidlohr Bueso wrote:

>Now, could you please pick this patch up and just leave it in linux-next
>for a while to get some more testing/exposure? I mainly ask because the
>patch will most likely be running in production before then.

(To be clear, this means not sending it to Linus until v4.16, obviously.)


* Re: [PATCH] epoll: remove ep_call_nested() from ep_eventpoll_poll()
From: Jason Baron @ 2017-11-02  1:24 UTC
  To: Andrew Morton, Davidlohr Bueso
  Cc: linux-kernel, Alexander Viro, Salman Qazi, Hou Tao



On 11/01/2017 04:53 PM, Andrew Morton wrote:
> On Tue, 31 Oct 2017 07:58:21 -0700 Davidlohr Bueso <dave@stgolabs.net> wrote:
> 
>> On Tue, 31 Oct 2017, Jason Baron wrote:
>>
>>> ep_call_nested() is used in ep_eventpoll_poll(), the .poll routine for
>>> an epoll fd, to prevent excessively deep epoll nesting and to prevent
>>> circular paths. However, we already prevent both of these conditions
>>> during EPOLL_CTL_ADD. With respect to nesting depth, we do allow the
>>> epoll fds themselves to be nested deeper than EP_MAX_NESTS, but we do
>>> not allow more than EP_MAX_NESTS levels once an epoll file descriptor
>>> is actually connected to a wakeup source. Thus, ep_call_nested() is not
>>> required here, since ep_eventpoll_poll(), which is called via
>>> ep_scan_ready_list(), only continues nesting if there are events
>>> available. Since ep_call_nested() is implemented using a global lock,
>>> applications that make use of nested epoll can see large performance
>>> improvements with this change.
>>
>> Improvements are quite obscene, actually, such as for the following epoll_wait()
>> benchmark with 2-level nesting on an 80-core IvyBridge:
>>
>> ncpus  vanilla     dirty     delta
>> 1      2447092     3028315   +23.75%
>> 4      231265      2986954   +1191.57%
>> 8      121631      2898796   +2283.27%
>> 16     59749       2902056   +4757.07%
>> 32     26837       2326314   +8568.30%
>> 64     12926       1341281   +10276.61%
>>
>> (http://linux-scalability.org/epoll/epoll-test.c)
> 
> This is tempting, but boy it is late in the -rc cycle.
> 
> How important are these workloads?  Would the world end if we held off
> on this for 4.15?
> 

Hi Andrew,

As Davidlohr pointed out, the nested epoll case is less common and these
locks have been here for a long time. I also think the patch needs to be
in linux-next for a bit and validated more before hitting mainline.

Thanks,

-Jason

