linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] epoll: optimize epmutex in ep_free and eventpoll_release_file
@ 2019-07-27 11:35 Cheng Jian
  2019-08-08 11:52 ` Xie XiuQi
  0 siblings, 1 reply; 2+ messages in thread
From: Cheng Jian @ 2019-07-27 11:35 UTC (permalink / raw)
  To: cj.chengjian, xiexiuqi, viro, houtao1; +Cc: linux-fsdevel, linux-kernel

We are optimizing the Request-Per-Second of nginx http server,
and we found that acquiring epmutex in eventpoll_release_file()
will become a bottleneck under the one-request-per-connection
scenario.

Optimize the epmutex with a smaller granularity. Introduce
an ref-counter to eventpoll and free eventpoll by rcu, using rcu
and list_first_or_null_rcu() to iterate file->f_ep_links instead
of epmutex.

The following are some details of the scenario:

HTTP server (nginx):
	* under ARM64 with 64 cores
	* 64 worker processes, each worker is binded to a specific CPU
	* keepalive_requests = 1 in nginx.conf: nginx will close the
	  connection fd after a reply is send
HTTP client[benchmark] (wrk):
	* under x86-64 with 48 cores
	* 16 threads, 64 connections per-thread

Before the patch, the RPS measured by wrk is ~220K, after applying
the patch the RPS is ~240K. We also measure the overhead of
eventpoll_release_file() and its children by perf: 29% before and
2% after.

Link : https://lkml.org/lkml/2017/10/28/81

Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 fs/eventpoll.c | 106 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 92 insertions(+), 14 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d7f1f5011fac..dc81f1c4fbaa 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
 #include <linux/compat.h>
 #include <linux/rculist.h>
 #include <net/busy_poll.h>
+#include <linux/refcount.h>
 
 /*
  * LOCKING:
@@ -225,6 +226,11 @@ struct eventpoll {
 	/* used to track busy poll napi_id */
 	unsigned int napi_id;
 #endif
+
+	/* used to ensure the validity of eventpoll when release file */
+	refcount_t ref;
+	/* used to free itself */
+	struct rcu_head rcu;
 };
 
 /* Wait structure used by the poll hooks */
@@ -809,6 +815,32 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	return 0;
 }
 
+static void ep_rcu_free(struct rcu_head *head)
+{
+	struct eventpoll *ep = container_of(head, struct eventpoll, rcu);
+
+	kfree(ep);
+}
+
+static void eventpoll_put_ep(struct eventpoll *ep)
+{
+	if (refcount_dec_and_test(&ep->ref)) {
+		mutex_destroy(&ep->mtx);
+		free_uid(ep->user);
+		wakeup_source_unregister(ep->ws);
+
+		call_rcu(&ep->rcu, ep_rcu_free);
+	}
+}
+
+static struct eventpoll *eventpoll_get_ep(struct eventpoll *ep)
+{
+	if (refcount_inc_not_zero(&ep->ref))
+		return ep;
+	else
+		return NULL;
+}
+
 static void ep_free(struct eventpoll *ep)
 {
 	struct rb_node *rbp;
@@ -826,11 +858,11 @@ static void ep_free(struct eventpoll *ep)
 	 * anymore. The only hit might come from eventpoll_release_file() but
 	 * holding "epmutex" is sufficient here.
 	 */
-	mutex_lock(&epmutex);
 
 	/*
 	 * Walks through the whole tree by unregistering poll callbacks.
 	 */
+	mutex_lock(&ep->mtx);
 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 
@@ -846,7 +878,6 @@ static void ep_free(struct eventpoll *ep)
 	 * We do not need to lock ep->mtx, either, we only do it to prevent
 	 * a lockdep warning.
 	 */
-	mutex_lock(&ep->mtx);
 	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 		ep_remove(ep, epi);
@@ -854,11 +885,19 @@ static void ep_free(struct eventpoll *ep)
 	}
 	mutex_unlock(&ep->mtx);
 
-	mutex_unlock(&epmutex);
-	mutex_destroy(&ep->mtx);
-	free_uid(ep->user);
-	wakeup_source_unregister(ep->ws);
-	kfree(ep);
+	/*
+	 * ep will not been added to visited_list, because ep_ctrl()
+	 * can not get its reference and can not reference it by the
+	 * corresponding epitem. The only possible operation is list_del_init,
+	 * so it's OK to use list_empty_careful() here.
+	 */
+	if (!list_empty_careful(&ep->visited_list_link)) {
+		mutex_lock(&epmutex);
+		list_del_init(&ep->visited_list_link);
+		mutex_unlock(&epmutex);
+	}
+
+	eventpoll_put_ep(ep);
 }
 
 static int ep_eventpoll_release(struct inode *inode, struct file *file)
@@ -985,7 +1024,7 @@ static const struct file_operations eventpoll_fops = {
 void eventpoll_release_file(struct file *file)
 {
 	struct eventpoll *ep;
-	struct epitem *epi, *next;
+	struct epitem *epi;
 
 	/*
 	 * We don't want to get "file->f_lock" because it is not
@@ -1000,14 +1039,51 @@ void eventpoll_release_file(struct file *file)
 	 *
 	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
 	 */
-	mutex_lock(&epmutex);
-	list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
-		ep = epi->ep;
+	rcu_read_lock();
+	while (true) {
+		epi = list_first_or_null_rcu(&file->f_ep_links,
+				struct epitem, fllink);
+		if (!epi)
+			break;
+
+		ep = eventpoll_get_ep(epi->ep);
+		/* Current epi had been removed by ep_free() */
+		if (!ep)
+			continue;
+		rcu_read_unlock();
+
 		mutex_lock_nested(&ep->mtx, 0);
-		ep_remove(ep, epi);
+		/*
+		 * If rb_first_cached() returns NULL, it means that
+		 * the current epi had been removed by ep_free().
+		 * To prevent epi from double-freeing, check the
+		 * condition before invoking ep_remove().
+		 * If eventpoll_release_file() frees epi firstly,
+		 * the epi will not be freed again because the epi
+		 * must have been removed from ep->rbr when ep_free()
+		 * is invoked.
+		 */
+		if (rb_first_cached(&ep->rbr))
+			ep_remove(ep, epi);
 		mutex_unlock(&ep->mtx);
+
+		eventpoll_put_ep(ep);
+
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	/*
+	 * The file can not been added to tfile_check_list again, because
+	 * (1) refcnt has been zero, ep_ctrl() can no longer get its reference
+	 * (2) related ep items have been removed, ep_loop_check_proc() can not
+	 *     get the file by ep->rbr.
+	 */
+	if (!list_empty_careful(&file->f_tfile_llink)) {
+		mutex_lock(&epmutex);
+		list_del_init(&file->f_tfile_llink);
+		mutex_unlock(&epmutex);
 	}
-	mutex_unlock(&epmutex);
 }
 
 static int ep_alloc(struct eventpoll **pep)
@@ -1030,6 +1106,8 @@ static int ep_alloc(struct eventpoll **pep)
 	ep->rbr = RB_ROOT_CACHED;
 	ep->ovflist = EP_UNACTIVE_PTR;
 	ep->user = user;
+	INIT_LIST_HEAD(&ep->visited_list_link);
+	refcount_set(&ep->ref, 1);
 
 	*pep = ep;
 
@@ -2018,7 +2096,7 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
 	list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
 							visited_list_link) {
 		ep_cur->visited = 0;
-		list_del(&ep_cur->visited_list_link);
+		list_del_init(&ep_cur->visited_list_link);
 	}
 	return ret;
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] epoll: optimize epmutex in ep_free and eventpoll_release_file
  2019-07-27 11:35 [PATCH] epoll: optimize epmutex in ep_free and eventpoll_release_file Cheng Jian
@ 2019-08-08 11:52 ` Xie XiuQi
  0 siblings, 0 replies; 2+ messages in thread
From: Xie XiuQi @ 2019-08-08 11:52 UTC (permalink / raw)
  To: Cheng Jian, viro, houtao1, zhouxiong13; +Cc: linux-fsdevel, linux-kernel

Hi,
Zhou Xiong test it on 2 sockets 128 cores ARM64 box, and get the following numbers:

seconds, smaller is better:
concurrency	5.3-rc2		patched		diff (smaller is better)
------------------------------------------------------------------------
10		6.983		4.658		-33.30%
20		14.479		6.8		-53.04%
40		47.689		15.303		-67.91%
80		103.336		22.402		-78.32%

There is a significant improvement in high concurrent and short connection scenario.
Any comments is welcome.

The test program:

#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <unistd.h>
#include <errno.h>
#include <error.h>
#include <sys/epoll.h>

#define TRUE 1
#define FALSE 0

int main(int argc, char * argv[])
{
    int sk = 0, ret = 0;
    struct epoll_event ev, events[10];
    int ep = epoll_create(10);

    int time = 1000000;

    while (time--) {
        sk = socket(PF_INET, SOCK_STREAM, 0);
        if (sk < 0)
            perror("socket failed.\n");

        ev.events = EPOLLIN;
        ev.data.u64 = 43; // Some other number
        if (epoll_ctl(ep, EPOLL_CTL_ADD, sk, &ev) < 0) {
            error(-1, errno, "epoll_ctl");
        }

        close(sk);
    }
}



On 2019/7/27 19:35, Cheng Jian wrote:
> We are optimizing the Request-Per-Second of nginx http server,
> and we found that acquiring epmutex in eventpoll_release_file()
> will become a bottleneck under the one-request-per-connection
> scenario.
> 
> Optimize the epmutex with a smaller granularity. Introduce
> an ref-counter to eventpoll and free eventpoll by rcu, using rcu
> and list_first_or_null_rcu() to iterate file->f_ep_links instead
> of epmutex.
> 
> The following are some details of the scenario:
> 
> HTTP server (nginx):
> 	* under ARM64 with 64 cores
> 	* 64 worker processes, each worker is binded to a specific CPU
> 	* keepalive_requests = 1 in nginx.conf: nginx will close the
> 	  connection fd after a reply is send
> HTTP client[benchmark] (wrk):
> 	* under x86-64 with 48 cores
> 	* 16 threads, 64 connections per-thread
> 
> Before the patch, the RPS measured by wrk is ~220K, after applying
> the patch the RPS is ~240K. We also measure the overhead of
> eventpoll_release_file() and its children by perf: 29% before and
> 2% after.
> 
> Link : https://lkml.org/lkml/2017/10/28/81
> 
> Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>  fs/eventpoll.c | 106 ++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 92 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index d7f1f5011fac..dc81f1c4fbaa 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -38,6 +38,7 @@
>  #include <linux/compat.h>
>  #include <linux/rculist.h>
>  #include <net/busy_poll.h>
> +#include <linux/refcount.h>
>  
>  /*
>   * LOCKING:
> @@ -225,6 +226,11 @@ struct eventpoll {
>  	/* used to track busy poll napi_id */
>  	unsigned int napi_id;
>  #endif
> +
> +	/* used to ensure the validity of eventpoll when release file */
> +	refcount_t ref;
> +	/* used to free itself */
> +	struct rcu_head rcu;
>  };
>  
>  /* Wait structure used by the poll hooks */
> @@ -809,6 +815,32 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
>  	return 0;
>  }
>  
> +static void ep_rcu_free(struct rcu_head *head)
> +{
> +	struct eventpoll *ep = container_of(head, struct eventpoll, rcu);
> +
> +	kfree(ep);
> +}
> +
> +static void eventpoll_put_ep(struct eventpoll *ep)
> +{
> +	if (refcount_dec_and_test(&ep->ref)) {
> +		mutex_destroy(&ep->mtx);
> +		free_uid(ep->user);
> +		wakeup_source_unregister(ep->ws);
> +
> +		call_rcu(&ep->rcu, ep_rcu_free);
> +	}
> +}
> +
> +static struct eventpoll *eventpoll_get_ep(struct eventpoll *ep)
> +{
> +	if (refcount_inc_not_zero(&ep->ref))
> +		return ep;
> +	else
> +		return NULL;
> +}
> +
>  static void ep_free(struct eventpoll *ep)
>  {
>  	struct rb_node *rbp;
> @@ -826,11 +858,11 @@ static void ep_free(struct eventpoll *ep)
>  	 * anymore. The only hit might come from eventpoll_release_file() but
>  	 * holding "epmutex" is sufficient here.
>  	 */
> -	mutex_lock(&epmutex);
>  
>  	/*
>  	 * Walks through the whole tree by unregistering poll callbacks.
>  	 */
> +	mutex_lock(&ep->mtx);
>  	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
>  		epi = rb_entry(rbp, struct epitem, rbn);
>  
> @@ -846,7 +878,6 @@ static void ep_free(struct eventpoll *ep)
>  	 * We do not need to lock ep->mtx, either, we only do it to prevent
>  	 * a lockdep warning.
>  	 */
> -	mutex_lock(&ep->mtx);
>  	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
>  		epi = rb_entry(rbp, struct epitem, rbn);
>  		ep_remove(ep, epi);
> @@ -854,11 +885,19 @@ static void ep_free(struct eventpoll *ep)
>  	}
>  	mutex_unlock(&ep->mtx);
>  
> -	mutex_unlock(&epmutex);
> -	mutex_destroy(&ep->mtx);
> -	free_uid(ep->user);
> -	wakeup_source_unregister(ep->ws);
> -	kfree(ep);
> +	/*
> +	 * ep will not been added to visited_list, because ep_ctrl()
> +	 * can not get its reference and can not reference it by the
> +	 * corresponding epitem. The only possible operation is list_del_init,
> +	 * so it's OK to use list_empty_careful() here.
> +	 */
> +	if (!list_empty_careful(&ep->visited_list_link)) {
> +		mutex_lock(&epmutex);
> +		list_del_init(&ep->visited_list_link);
> +		mutex_unlock(&epmutex);
> +	}
> +
> +	eventpoll_put_ep(ep);
>  }
>  
>  static int ep_eventpoll_release(struct inode *inode, struct file *file)
> @@ -985,7 +1024,7 @@ static const struct file_operations eventpoll_fops = {
>  void eventpoll_release_file(struct file *file)
>  {
>  	struct eventpoll *ep;
> -	struct epitem *epi, *next;
> +	struct epitem *epi;
>  
>  	/*
>  	 * We don't want to get "file->f_lock" because it is not
> @@ -1000,14 +1039,51 @@ void eventpoll_release_file(struct file *file)
>  	 *
>  	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
>  	 */
> -	mutex_lock(&epmutex);
> -	list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
> -		ep = epi->ep;
> +	rcu_read_lock();
> +	while (true) {
> +		epi = list_first_or_null_rcu(&file->f_ep_links,
> +				struct epitem, fllink);
> +		if (!epi)
> +			break;
> +
> +		ep = eventpoll_get_ep(epi->ep);
> +		/* Current epi had been removed by ep_free() */
> +		if (!ep)
> +			continue;
> +		rcu_read_unlock();
> +
>  		mutex_lock_nested(&ep->mtx, 0);
> -		ep_remove(ep, epi);
> +		/*
> +		 * If rb_first_cached() returns NULL, it means that
> +		 * the current epi had been removed by ep_free().
> +		 * To prevent epi from double-freeing, check the
> +		 * condition before invoking ep_remove().
> +		 * If eventpoll_release_file() frees epi firstly,
> +		 * the epi will not be freed again because the epi
> +		 * must have been removed from ep->rbr when ep_free()
> +		 * is invoked.
> +		 */
> +		if (rb_first_cached(&ep->rbr))
> +			ep_remove(ep, epi);
>  		mutex_unlock(&ep->mtx);
> +
> +		eventpoll_put_ep(ep);
> +
> +		rcu_read_lock();
> +	}
> +	rcu_read_unlock();
> +
> +	/*
> +	 * The file can not been added to tfile_check_list again, because
> +	 * (1) refcnt has been zero, ep_ctrl() can no longer get its reference
> +	 * (2) related ep items have been removed, ep_loop_check_proc() can not
> +	 *     get the file by ep->rbr.
> +	 */
> +	if (!list_empty_careful(&file->f_tfile_llink)) {
> +		mutex_lock(&epmutex);
> +		list_del_init(&file->f_tfile_llink);
> +		mutex_unlock(&epmutex);
>  	}
> -	mutex_unlock(&epmutex);
>  }
>  
>  static int ep_alloc(struct eventpoll **pep)
> @@ -1030,6 +1106,8 @@ static int ep_alloc(struct eventpoll **pep)
>  	ep->rbr = RB_ROOT_CACHED;
>  	ep->ovflist = EP_UNACTIVE_PTR;
>  	ep->user = user;
> +	INIT_LIST_HEAD(&ep->visited_list_link);
> +	refcount_set(&ep->ref, 1);
>  
>  	*pep = ep;
>  
> @@ -2018,7 +2096,7 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
>  	list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
>  							visited_list_link) {
>  		ep_cur->visited = 0;
> -		list_del(&ep_cur->visited_list_link);
> +		list_del_init(&ep_cur->visited_list_link);
>  	}
>  	return ret;
>  }
> 

-- 
Thanks,
Xie XiuQi


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-08-08 11:52 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-27 11:35 [PATCH] epoll: optimize epmutex in ep_free and eventpoll_release_file Cheng Jian
2019-08-08 11:52 ` Xie XiuQi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).