Re: [RFC] eventfd: add EFD_AUTORESET flag

From: Stefan Hajnoczi <stefanha@redhat.com>
To: linux-fsdevel@vger.kernel.org
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	Avi Kivity <avi@scylladb.com>,
	Davide Libenzi <davidel@xmailserver.org>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Masatake YAMATO <yamato@redhat.com>
Subject: Re: [RFC] eventfd: add EFD_AUTORESET flag
Date: Tue, 4 Feb 2020 15:40:35 +0000	[thread overview]
Message-ID: <20200204154035.GA47059@stefanha-x1.localdomain> (raw)
In-Reply-To: <20200129172010.162215-1-stefanha@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 7000 bytes --]

On Wed, Jan 29, 2020 at 05:20:10PM +0000, Stefan Hajnoczi wrote:
> Some applications simply use eventfd for inter-thread notifications
> without requiring counter or semaphore semantics.  They wait for the
> eventfd to become readable using poll(2)/select(2) and then call read(2)
> to reset the counter.
> 
> This patch adds the EFD_AUTORESET flag to reset the counter when
> f_ops->poll() finds the eventfd is readable, eliminating the need to
> call read(2) to reset the counter.
> 
> This results in a small but measurable 1% performance improvement with
> QEMU virtio-blk emulation.  Each read(2) takes 1 microsecond of execution
> time in the event loop according to perf.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
> Does this look like a reasonable thing to do?  I'm not very familiar
> with f_ops->poll() or the eventfd internals, so maybe I'm overlooking a
> design flaw.

Ping?

> I've tested this with QEMU and it works fine:
> https://github.com/stefanha/qemu/commits/eventfd-autoreset
> ---
>  fs/eventfd.c            | 99 +++++++++++++++++++++++++----------------
>  include/linux/eventfd.h |  3 +-
>  2 files changed, 62 insertions(+), 40 deletions(-)
> 
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index 8aa0ea8c55e8..208f6b9e2234 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -116,45 +116,62 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
>  
>  	poll_wait(file, &ctx->wqh, wait);
>  
> -	/*
> -	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
> -	 * can be done outside ctx->wqh.lock because we know that poll_wait
> -	 * takes that lock (through add_wait_queue) if our caller will sleep.
> -	 *
> -	 * The read _can_ therefore seep into add_wait_queue's critical
> -	 * section, but cannot move above it!  add_wait_queue's spin_lock acts
> -	 * as an acquire barrier and ensures that the read be ordered properly
> -	 * against the writes.  The following CAN happen and is safe:
> -	 *
> -	 *     poll                               write
> -	 *     -----------------                  ------------
> -	 *     lock ctx->wqh.lock (in poll_wait)
> -	 *     count = ctx->count
> -	 *     __add_wait_queue
> -	 *     unlock ctx->wqh.lock
> -	 *                                        lock ctx->qwh.lock
> -	 *                                        ctx->count += n
> -	 *                                        if (waitqueue_active)
> -	 *                                          wake_up_locked_poll
> -	 *                                        unlock ctx->qwh.lock
> -	 *     eventfd_poll returns 0
> -	 *
> -	 * but the following, which would miss a wakeup, cannot happen:
> -	 *
> -	 *     poll                               write
> -	 *     -----------------                  ------------
> -	 *     count = ctx->count (INVALID!)
> -	 *                                        lock ctx->qwh.lock
> -	 *                                        ctx->count += n
> -	 *                                        **waitqueue_active is false**
> -	 *                                        **no wake_up_locked_poll!**
> -	 *                                        unlock ctx->qwh.lock
> -	 *     lock ctx->wqh.lock (in poll_wait)
> -	 *     __add_wait_queue
> -	 *     unlock ctx->wqh.lock
> -	 *     eventfd_poll returns 0
> -	 */
> -	count = READ_ONCE(ctx->count);
> +	if (ctx->flags & EFD_AUTORESET) {
> +		unsigned long flags;
> +		__poll_t requested = poll_requested_events(wait);
> +
> +		spin_lock_irqsave(&ctx->wqh.lock, flags);
> +		count = ctx->count;
> +
> +		/* Reset counter if caller is polling for read */
> +		if (count != 0 && (requested & EPOLLIN)) {
> +			ctx->count = 0;
> +			events |= EPOLLOUT;
> +			/* TODO is an EPOLLOUT wakeup necessary here? */
> +		}
> +
> +		spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> +	} else {
> +		/*
> +		 * All writes to ctx->count occur within ctx->wqh.lock.  This read
> +		 * can be done outside ctx->wqh.lock because we know that poll_wait
> +		 * takes that lock (through add_wait_queue) if our caller will sleep.
> +		 *
> +		 * The read _can_ therefore seep into add_wait_queue's critical
> +		 * section, but cannot move above it!  add_wait_queue's spin_lock acts
> +		 * as an acquire barrier and ensures that the read be ordered properly
> +		 * against the writes.  The following CAN happen and is safe:
> +		 *
> +		 *     poll                               write
> +		 *     -----------------                  ------------
> +		 *     lock ctx->wqh.lock (in poll_wait)
> +		 *     count = ctx->count
> +		 *     __add_wait_queue
> +		 *     unlock ctx->wqh.lock
> +		 *                                        lock ctx->wqh.lock
> +		 *                                        ctx->count += n
> +		 *                                        if (waitqueue_active)
> +		 *                                          wake_up_locked_poll
> +		 *                                        unlock ctx->wqh.lock
> +		 *     eventfd_poll returns 0
> +		 *
> +		 * but the following, which would miss a wakeup, cannot happen:
> +		 *
> +		 *     poll                               write
> +		 *     -----------------                  ------------
> +		 *     count = ctx->count (INVALID!)
> +		 *                                        lock ctx->wqh.lock
> +		 *                                        ctx->count += n
> +		 *                                        **waitqueue_active is false**
> +		 *                                        **no wake_up_locked_poll!**
> +		 *                                        unlock ctx->wqh.lock
> +		 *     lock ctx->wqh.lock (in poll_wait)
> +		 *     __add_wait_queue
> +		 *     unlock ctx->wqh.lock
> +		 *     eventfd_poll returns 0
> +		 */
> +		count = READ_ONCE(ctx->count);
> +	}
>  
>  	if (count > 0)
>  		events |= EPOLLIN;
> @@ -400,6 +417,10 @@ static int do_eventfd(unsigned int count, int flags)
>  	if (flags & ~EFD_FLAGS_SET)
>  		return -EINVAL;
>  
> +	/* Semaphore semantics don't make sense when autoreset is enabled */
> +	if ((flags & EFD_SEMAPHORE) && (flags & EFD_AUTORESET))
> +		return -EINVAL;
> +
>  	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
>  	if (!ctx)
>  		return -ENOMEM;
> diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
> index ffcc7724ca21..27577fafc553 100644
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -21,11 +21,12 @@
>   * shared O_* flags.
>   */
>  #define EFD_SEMAPHORE (1 << 0)
> +#define EFD_AUTORESET (1 << 6) /* aliases O_CREAT */
>  #define EFD_CLOEXEC O_CLOEXEC
>  #define EFD_NONBLOCK O_NONBLOCK
>  
>  #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
> -#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
> +#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_AUTORESET)
>  
>  struct eventfd_ctx;
>  struct file;
> -- 
> 2.24.1
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]