All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch] eventfd - revised interface and cleanups
@ 2009-06-23 16:47 Davide Libenzi
  2009-06-23 16:59 ` Randy Dunlap
                   ` (3 more replies)
  0 siblings, 4 replies; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 16:47 UTC (permalink / raw)
  To: Linux Kernel Mailing List
  Cc: Andrew Morton, avi, kvm, Gregory Haskins, Rusty Russell,
	Benjamin LaHaise

The following patch changes the eventfd interface to de-couple the eventfd 
memory context, from the file pointer instance.
Without such change, there is no clean way to racely free handle the 
POLLHUP event sent when the last instance of the file* goes away.
Also, now the internal eventfd APIs are using the eventfd context instead 
of the file*.
Another cleanup this patch does, is making AIO select EVENTFD, instead of 
adding a bunch of empty function stubs inside eventfd.h.

Andrew, this better go via Avi and the KVM tree, since they have patches 
that will be based on the new interface.



Signed-off-by: Davide Libenzi <davidel@xmailserver.org>


- Davide


---
 drivers/lguest/lg.h          |    2 
 drivers/lguest/lguest_user.c |    4 -
 fs/aio.c                     |   24 ++--------
 fs/eventfd.c                 |  101 ++++++++++++++++++++++++++++++++++++++-----
 include/linux/aio.h          |    4 -
 include/linux/eventfd.h      |   17 ++-----
 init/Kconfig                 |    1 
 7 files changed, 108 insertions(+), 45 deletions(-)

Index: linux-2.6.mod/fs/eventfd.c
===================================================================
--- linux-2.6.mod.orig/fs/eventfd.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/fs/eventfd.c	2009-06-23 09:34:42.000000000 -0700
@@ -17,32 +17,38 @@
 #include <linux/eventfd.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
+#include <linux/kref.h>
 
 struct eventfd_ctx {
+	struct kref kref;
 	wait_queue_head_t wqh;
 	/*
 	 * Every time that a write(2) is performed on an eventfd, the
 	 * value of the __u64 being written is added to "count" and a
 	 * wakeup is performed on "wqh". A read(2) will return the "count"
 	 * value to userspace, and will reset "count" to zero. The kernel
-	 * size eventfd_signal() also, adds to the "count" counter and
+	 * side eventfd_signal() also, adds to the "count" counter and
 	 * issue a wakeup.
 	 */
 	__u64 count;
 	unsigned int flags;
 };
 
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @n to the eventfd counter. This function is
+ *                  supposed to be called by the kernel in paths that do not
+ *                  allow sleeping. In this function we allow the counter
+ *                  to reach the ULLONG_MAX value, and we signal this as
+ *                  overflow condition by returining a POLLERR to poll(2).
+ *
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *
+ * Returns: In case of success, it returns @n, otherwise (in case of overflow
+ *          of the eventfd 64bit internal counter) a value lower than @n.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	unsigned long flags;
 
 	if (n < 0)
@@ -59,9 +65,49 @@ int eventfd_signal(struct file *file, in
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
+static void eventfd_free(struct kref *kref)
+{
+	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
+
+	kfree(ctx);
+}
+
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ *
+ * @ctx: [in] Pointer to the eventfd context.
+ *
+ * Returns: In case of success, returns a pointer to the eventfd context,
+ *          otherwise a proper error code.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context
+ *                   (previously acquired either with eventfd_ctx_get() or
+ *                   eventfd_ctx_fdget()).
+ *
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * Returns: Nothing.
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+	kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
+
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-	kfree(file->private_data);
+	struct eventfd_ctx *ctx = file->private_data;
+
+	wake_up_poll(&ctx->wqh, POLLHUP);
+	eventfd_ctx_put(ctx);
 	return 0;
 }
 
@@ -185,6 +231,14 @@ static const struct file_operations even
 	.write		= eventfd_write,
 };
 
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ *
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns: A pointer to the eventfd file structure in case of success, or a
+ *          proper error pointer in case of failure.
+ */
 struct file *eventfd_fget(int fd)
 {
 	struct file *file;
@@ -201,6 +255,30 @@ struct file *eventfd_fget(int fd)
 }
 EXPORT_SYMBOL_GPL(eventfd_fget);
 
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context
+ *                     given an eventfd file descriptor.
+ *
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns: In case of success, it returns a pointer to the internal eventfd
+ *          context, otherwise a proper error code.
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	struct file *file;
+	struct eventfd_ctx *ctx;
+
+	file = eventfd_fget(fd);
+	if (IS_ERR(file))
+		return (struct eventfd_ctx *) file;
+	ctx = eventfd_ctx_get(file->private_data);
+	fput(file);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
 	int fd;
@@ -217,6 +295,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, 
 	if (!ctx)
 		return -ENOMEM;
 
+	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
Index: linux-2.6.mod/include/linux/eventfd.h
===================================================================
--- linux-2.6.mod.orig/include/linux/eventfd.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/include/linux/eventfd.h	2009-06-23 08:54:43.000000000 -0700
@@ -8,10 +8,8 @@
 #ifndef _LINUX_EVENTFD_H
 #define _LINUX_EVENTFD_H
 
-#ifdef CONFIG_EVENTFD
-
-/* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/fcntl.h>
+#include <linux/file.h>
 
 /*
  * CAREFUL: Check include/asm-generic/fcntl.h when defining
@@ -28,15 +26,10 @@
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
 struct file *eventfd_fget(int fd);
-int eventfd_signal(struct file *file, int n);
-
-#else /* CONFIG_EVENTFD */
-
-#define eventfd_fget(fd) ERR_PTR(-ENOSYS)
-static inline int eventfd_signal(struct file *file, int n)
-{ return 0; }
-
-#endif /* CONFIG_EVENTFD */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd);
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
+void eventfd_ctx_put(struct eventfd_ctx *ctx);
+int eventfd_signal(struct eventfd_ctx *ctx, int n);
 
 #endif /* _LINUX_EVENTFD_H */
 
Index: linux-2.6.mod/fs/aio.c
===================================================================
--- linux-2.6.mod.orig/fs/aio.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/fs/aio.c	2009-06-22 10:54:13.000000000 -0700
@@ -485,6 +485,8 @@ static inline void really_put_req(struct
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (req->ki_eventfd != NULL)
+		eventfd_ctx_put(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work
 		/* Complete the fput(s) */
 		if (req->ki_filp != NULL)
 			__fput(req->ki_filp);
-		if (req->ki_eventfd != NULL)
-			__fput(req->ki_eventfd);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-	int schedule_putreq = 0;
-
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
-		schedule_putreq++;
-	else
-		req->ki_filp = NULL;
-	if (req->ki_eventfd != NULL) {
-		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
-			schedule_putreq++;
-		else
-			req->ki_eventfd = NULL;
-	}
-	if (unlikely(schedule_putreq)) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
 		spin_unlock(&fput_lock);
 		queue_work(aio_wq, &fput_work);
-	} else
+	} else {
+		req->ki_filp = NULL;
 		really_put_req(ctx, req);
+	}
 	return 1;
 }
 
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *
 		 * an eventfd() fd, and will be signaled for each completed
 		 * event using the eventfd_signal() function.
 		 */
-		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			req->ki_eventfd = NULL;
Index: linux-2.6.mod/include/linux/aio.h
===================================================================
--- linux-2.6.mod.orig/include/linux/aio.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/include/linux/aio.h	2009-06-23 09:19:11.000000000 -0700
@@ -121,9 +121,9 @@ struct kiocb {
 
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
-	 * this is the underlying file* to deliver event to.
+	 * this is the underlying eventfd context to deliver events to.
 	 */
-	struct file		*ki_eventfd;
+	struct eventfd_ctx	*ki_eventfd;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
Index: linux-2.6.mod/init/Kconfig
===================================================================
--- linux-2.6.mod.orig/init/Kconfig	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/init/Kconfig	2009-06-22 10:54:13.000000000 -0700
@@ -925,6 +925,7 @@ config SHMEM
 
 config AIO
 	bool "Enable AIO support" if EMBEDDED
+	select EVENTFD
 	default y
 	help
 	  This option enables POSIX asynchronous I/O which may by used
Index: linux-2.6.mod/drivers/lguest/lg.h
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lg.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lg.h	2009-06-23 08:55:34.000000000 -0700
@@ -82,7 +82,7 @@ struct lg_cpu {
 
 struct lg_eventfd {
 	unsigned long addr;
-	struct file *event;
+	struct eventfd_ctx *event;
 };
 
 struct lg_eventfd_map {
Index: linux-2.6.mod/drivers/lguest/lguest_user.c
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lguest_user.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lguest_user.c	2009-06-22 10:54:13.000000000 -0700
@@ -50,7 +50,7 @@ static int add_eventfd(struct lguest *lg
 
 	/* Now append new entry. */
 	new->map[new->num].addr = addr;
-	new->map[new->num].event = eventfd_fget(fd);
+	new->map[new->num].event = eventfd_ctx_fdget(fd);
 	if (IS_ERR(new->map[new->num].event)) {
 		kfree(new);
 		return PTR_ERR(new->map[new->num].event);
@@ -357,7 +357,7 @@ static int close(struct inode *inode, st
 
 	/* Release any eventfds they registered. */
 	for (i = 0; i < lg->eventfds->num; i++)
-		fput(lg->eventfds->map[i].event);
+		eventfd_ctx_put(lg->eventfds->map[i].event);
 	kfree(lg->eventfds);
 
 	/* If lg->dead doesn't contain an error code it will be NULL or a

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups
  2009-06-23 16:47 [patch] eventfd - revised interface and cleanups Davide Libenzi
@ 2009-06-23 16:59 ` Randy Dunlap
  2009-06-23 17:04   ` Davide Libenzi
  2009-06-23 17:03 ` Avi Kivity
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 31+ messages in thread
From: Randy Dunlap @ 2009-06-23 16:59 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linux Kernel Mailing List, Andrew Morton, avi, kvm,
	Gregory Haskins, Rusty Russell, Benjamin LaHaise

Davide Libenzi wrote:
> The following patch changes the eventfd interface to de-couple the eventfd 
> memory context, from the file pointer instance.
> Without such change, there is no clean way to racely free handle the 
> POLLHUP event sent when the last instance of the file* goes away.
> Also, now the internal eventfd APIs are using the eventfd context instead 
> of the file*.
> Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> adding a bunch of empty function stubs inside eventfd.h.
> 
> Andrew, this better go via Avi and the KVM tree, since they have patches 
> that will be based on the new interface.
> 
> 
> 
> Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
> 
> 
> - Davide
> 
> 
> ---
>  drivers/lguest/lg.h          |    2 
>  drivers/lguest/lguest_user.c |    4 -
>  fs/aio.c                     |   24 ++--------
>  fs/eventfd.c                 |  101 ++++++++++++++++++++++++++++++++++++++-----
>  include/linux/aio.h          |    4 -
>  include/linux/eventfd.h      |   17 ++-----
>  init/Kconfig                 |    1 
>  7 files changed, 108 insertions(+), 45 deletions(-)
> 
> Index: linux-2.6.mod/fs/eventfd.c
> ===================================================================
> --- linux-2.6.mod.orig/fs/eventfd.c	2009-06-21 16:54:15.000000000 -0700
> +++ linux-2.6.mod/fs/eventfd.c	2009-06-23 09:34:42.000000000 -0700
> @@ -17,32 +17,38 @@

> -/*
> - * Adds "n" to the eventfd counter "count". Returns "n" in case of
> - * success, or a value lower then "n" in case of coutner overflow.
> - * This function is supposed to be called by the kernel in paths
> - * that do not allow sleeping. In this function we allow the counter
> - * to reach the ULLONG_MAX value, and we signal this as overflow
> - * condition by returining a POLLERR to poll(2).
> +/**
> + * eventfd_signal - Adds @n to the eventfd counter. This function is
> + *                  supposed to be called by the kernel in paths that do not
> + *                  allow sleeping. In this function we allow the counter
> + *                  to reach the ULLONG_MAX value, and we signal this as
> + *                  overflow condition by returining a POLLERR to poll(2).
> + *

kernel-doc syntax requires the function name + short description on one line,
followed by parameters.  Any longer function description then comes after
the parameters.

See Documentation/kernel-doc-nano-HOWTO.txt for more info,
or ask me.

> + * @ctx: [in] Pointer to the eventfd context.
> + * @n: [in] Value of the counter to be added to the eventfd internal counter.
> + *
> + * Returns: In case of success, it returns @n, otherwise (in case of overflow
> + *          of the eventfd 64bit internal counter) a value lower than @n.
>   */
> -int eventfd_signal(struct file *file, int n)
> +int eventfd_signal(struct eventfd_ctx *ctx, int n)
>  {
> -	struct eventfd_ctx *ctx = file->private_data;
>  	unsigned long flags;
>  
>  	if (n < 0)

> +/**
> + * eventfd_ctx_put - Releases a reference to the internal eventfd context
> + *                   (previously acquired either with eventfd_ctx_get() or
> + *                   eventfd_ctx_fdget()).
> + *
> + * @ctx: [in] Pointer to eventfd context.
> + *
> + * Returns: Nothing.
> + */
> +void eventfd_ctx_put(struct eventfd_ctx *ctx)
> +{
> +	kref_put(&ctx->kref, eventfd_free);
> +}
> +EXPORT_SYMBOL_GPL(eventfd_ctx_put);
> +
>  static int eventfd_release(struct inode *inode, struct file *file)
>  {
> -	kfree(file->private_data);
> +	struct eventfd_ctx *ctx = file->private_data;
> +
> +	wake_up_poll(&ctx->wqh, POLLHUP);
> +	eventfd_ctx_put(ctx);
>  	return 0;
>  }

>  
> +/**
> + * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context
> + *                     given an eventfd file descriptor.
> + *
> + * @fd: [in] Eventfd file descriptor.
> + *
> + * Returns: In case of success, it returns a pointer to the internal eventfd
> + *          context, otherwise a proper error code.
> + */
> +struct eventfd_ctx *eventfd_ctx_fdget(int fd)
> +{
> +	struct file *file;
> +	struct eventfd_ctx *ctx;
> +
> +	file = eventfd_fget(fd);
> +	if (IS_ERR(file))
> +		return (struct eventfd_ctx *) file;
> +	ctx = eventfd_ctx_get(file->private_data);
> +	fput(file);
> +
> +	return ctx;
> +}
> +EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);


-- 
~Randy
LPC 2009, Sept. 23-25, Portland, Oregon
http://linuxplumbersconf.org/2009/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups
  2009-06-23 16:47 [patch] eventfd - revised interface and cleanups Davide Libenzi
  2009-06-23 16:59 ` Randy Dunlap
@ 2009-06-23 17:03 ` Avi Kivity
  2009-06-23 17:04   ` Davide Libenzi
  2009-06-23 17:51 ` Gregory Haskins
  2009-06-23 19:25 ` [patch] eventfd - revised interface and cleanups (2nd rev) Davide Libenzi
  3 siblings, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2009-06-23 17:03 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linux Kernel Mailing List, Andrew Morton, kvm, Gregory Haskins,
	Rusty Russell, Benjamin LaHaise

On 06/23/2009 07:47 PM, Davide Libenzi wrote:
> The following patch changes the eventfd interface to de-couple the eventfd
> memory context, from the file pointer instance.
> Without such change, there is no clean way to racely free handle the
> POLLHUP event sent when the last instance of the file* goes away.
> Also, now the internal eventfd APIs are using the eventfd context instead
> of the file*.
> Another cleanup this patch does, is making AIO select EVENTFD, instead of
> adding a bunch of empty function stubs inside eventfd.h.
>
> Andrew, this better go via Avi and the KVM tree, since they have patches
> that will be based on the new interface.
>    

The kvm patches will only be ready for 2.6.32.  Can this go in 2.6.31 
now, and we'll meet in 10 weeks?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups
  2009-06-23 17:03 ` Avi Kivity
@ 2009-06-23 17:04   ` Davide Libenzi
  0 siblings, 0 replies; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 17:04 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Davide Libenzi, Linux Kernel Mailing List, Andrew Morton, kvm,
	Gregory Haskins, Rusty Russell, Benjamin LaHaise

On Tue, 23 Jun 2009, Avi Kivity wrote:

> On 06/23/2009 07:47 PM, Davide Libenzi wrote:
> > The following patch changes the eventfd interface to de-couple the eventfd
> > memory context, from the file pointer instance.
> > Without such change, there is no clean way to racely free handle the
> > POLLHUP event sent when the last instance of the file* goes away.
> > Also, now the internal eventfd APIs are using the eventfd context instead
> > of the file*.
> > Another cleanup this patch does, is making AIO select EVENTFD, instead of
> > adding a bunch of empty function stubs inside eventfd.h.
> > 
> > Andrew, this better go via Avi and the KVM tree, since they have patches
> > that will be based on the new interface.
> >    
> 
> The kvm patches will only be ready for 2.6.32.  Can this go in 2.6.31 now, and
> we'll meet in 10 weeks?

Either ways works for me.
Meanwhile, I'll repost with the revised documentation format.


- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups
  2009-06-23 16:59 ` Randy Dunlap
@ 2009-06-23 17:04   ` Davide Libenzi
  0 siblings, 0 replies; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 17:04 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Linux Kernel Mailing List, Andrew Morton, avi, kvm,
	Gregory Haskins, Rusty Russell, Benjamin LaHaise

On Tue, 23 Jun 2009, Randy Dunlap wrote:

> kernel-doc syntax requires the function name + short description on one line,
> followed by parameters.  Any longer function description then comes after
> the parameters.
> 
> See Documentation/kernel-doc-nano-HOWTO.txt for more info,
> or ask me.

Will fix, thx!


- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups
  2009-06-23 17:51 ` Gregory Haskins
@ 2009-06-23 17:51   ` Davide Libenzi
  0 siblings, 0 replies; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 17:51 UTC (permalink / raw)
  To: Gregory Haskins
  Cc: Linux Kernel Mailing List, Andrew Morton, avi, kvm,
	Rusty Russell, Benjamin LaHaise, Al Viro

On Tue, 23 Jun 2009, Gregory Haskins wrote:

> I think the lack of a way to got from a file* to a ctx* (or back) might
> be a problem.  I am not an expert, but if I understood the gist of what
> Al Viro (cc'd) was saying early on, an fd can change meaning out from
> under you without warning.
> 
> With the code the way it is now, I would need to call both
> eventfd_fget() and eventfd_ctx_fdget() to fully acquire state (unless I
> am missing something).  I would think that is a race and I am not
> guaranteed to get the same object.  Can we also have a way to convert
> one reference to the other?  I am thinking
> 
> struct eventfd_ctx *eventfd_ctx_fget(struct file *)
> 
> (or similar) would be sufficient.

You're right. That function was available in the previous patch, but then 
I dropped it because I thought the eventfd code could be simplified. But 
yes, it's needed.



- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups
  2009-06-23 16:47 [patch] eventfd - revised interface and cleanups Davide Libenzi
  2009-06-23 16:59 ` Randy Dunlap
  2009-06-23 17:03 ` Avi Kivity
@ 2009-06-23 17:51 ` Gregory Haskins
  2009-06-23 17:51   ` Davide Libenzi
  2009-06-23 19:25 ` [patch] eventfd - revised interface and cleanups (2nd rev) Davide Libenzi
  3 siblings, 1 reply; 31+ messages in thread
From: Gregory Haskins @ 2009-06-23 17:51 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Linux Kernel Mailing List, Andrew Morton, avi, kvm,
	Rusty Russell, Benjamin LaHaise, Al Viro

[-- Attachment #1: Type: text/plain, Size: 13491 bytes --]

Davide Libenzi wrote:
> The following patch changes the eventfd interface to de-couple the eventfd 
> memory context, from the file pointer instance.
> Without such change, there is no clean way to racely free handle the 
> POLLHUP event sent when the last instance of the file* goes away.
> Also, now the internal eventfd APIs are using the eventfd context instead 
> of the file*.
> Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> adding a bunch of empty function stubs inside eventfd.h.
>
> Andrew, this better go via Avi and the KVM tree, since they have patches 
> that will be based on the new interface.
>
>
>
> Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
>
>
> - Davide
>
>
> ---
>  drivers/lguest/lg.h          |    2 
>  drivers/lguest/lguest_user.c |    4 -
>  fs/aio.c                     |   24 ++--------
>  fs/eventfd.c                 |  101 ++++++++++++++++++++++++++++++++++++++-----
>  include/linux/aio.h          |    4 -
>  include/linux/eventfd.h      |   17 ++-----
>  init/Kconfig                 |    1 
>  7 files changed, 108 insertions(+), 45 deletions(-)
>
> Index: linux-2.6.mod/fs/eventfd.c
> ===================================================================
> --- linux-2.6.mod.orig/fs/eventfd.c	2009-06-21 16:54:15.000000000 -0700
> +++ linux-2.6.mod/fs/eventfd.c	2009-06-23 09:34:42.000000000 -0700
> @@ -17,32 +17,38 @@
>  #include <linux/eventfd.h>
>  #include <linux/syscalls.h>
>  #include <linux/module.h>
> +#include <linux/kref.h>
>  
>  struct eventfd_ctx {
> +	struct kref kref;
>  	wait_queue_head_t wqh;
>  	/*
>  	 * Every time that a write(2) is performed on an eventfd, the
>  	 * value of the __u64 being written is added to "count" and a
>  	 * wakeup is performed on "wqh". A read(2) will return the "count"
>  	 * value to userspace, and will reset "count" to zero. The kernel
> -	 * size eventfd_signal() also, adds to the "count" counter and
> +	 * side eventfd_signal() also, adds to the "count" counter and
>  	 * issue a wakeup.
>  	 */
>  	__u64 count;
>  	unsigned int flags;
>  };
>  
> -/*
> - * Adds "n" to the eventfd counter "count". Returns "n" in case of
> - * success, or a value lower then "n" in case of coutner overflow.
> - * This function is supposed to be called by the kernel in paths
> - * that do not allow sleeping. In this function we allow the counter
> - * to reach the ULLONG_MAX value, and we signal this as overflow
> - * condition by returining a POLLERR to poll(2).
> +/**
> + * eventfd_signal - Adds @n to the eventfd counter. This function is
> + *                  supposed to be called by the kernel in paths that do not
> + *                  allow sleeping. In this function we allow the counter
> + *                  to reach the ULLONG_MAX value, and we signal this as
> + *                  overflow condition by returining a POLLERR to poll(2).
> + *
> + * @ctx: [in] Pointer to the eventfd context.
> + * @n: [in] Value of the counter to be added to the eventfd internal counter.
> + *
> + * Returns: In case of success, it returns @n, otherwise (in case of overflow
> + *          of the eventfd 64bit internal counter) a value lower than @n.
>   */
> -int eventfd_signal(struct file *file, int n)
> +int eventfd_signal(struct eventfd_ctx *ctx, int n)
>  {
> -	struct eventfd_ctx *ctx = file->private_data;
>  	unsigned long flags;
>  
>  	if (n < 0)
> @@ -59,9 +65,49 @@ int eventfd_signal(struct file *file, in
>  }
>  EXPORT_SYMBOL_GPL(eventfd_signal);
>  
> +static void eventfd_free(struct kref *kref)
> +{
> +	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
> +
> +	kfree(ctx);
> +}
> +
> +/**
> + * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
> + *
> + * @ctx: [in] Pointer to the eventfd context.
> + *
> + * Returns: In case of success, returns a pointer to the eventfd context,
> + *          otherwise a proper error code.
> + */
> +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
> +{
> +	kref_get(&ctx->kref);
> +	return ctx;
> +}
> +EXPORT_SYMBOL_GPL(eventfd_ctx_get);
> +
> +/**
> + * eventfd_ctx_put - Releases a reference to the internal eventfd context
> + *                   (previously acquired either with eventfd_ctx_get() or
> + *                   eventfd_ctx_fdget()).
> + *
> + * @ctx: [in] Pointer to eventfd context.
> + *
> + * Returns: Nothing.
> + */
> +void eventfd_ctx_put(struct eventfd_ctx *ctx)
> +{
> +	kref_put(&ctx->kref, eventfd_free);
> +}
> +EXPORT_SYMBOL_GPL(eventfd_ctx_put);
> +
>  static int eventfd_release(struct inode *inode, struct file *file)
>  {
> -	kfree(file->private_data);
> +	struct eventfd_ctx *ctx = file->private_data;
> +
> +	wake_up_poll(&ctx->wqh, POLLHUP);
> +	eventfd_ctx_put(ctx);
>  	return 0;
>  }
>  
> @@ -185,6 +231,14 @@ static const struct file_operations even
>  	.write		= eventfd_write,
>  };
>  
> +/**
> + * eventfd_fget - Acquire a reference of an eventfd file descriptor.
> + *
> + * @fd: [in] Eventfd file descriptor.
> + *
> + * Returns: A pointer to the eventfd file structure in case of success, or a
> + *          proper error pointer in case of failure.
> + */
>  struct file *eventfd_fget(int fd)
>  {
>  	struct file *file;
> @@ -201,6 +255,30 @@ struct file *eventfd_fget(int fd)
>  }
>  EXPORT_SYMBOL_GPL(eventfd_fget);
>  
> +/**
> + * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context
> + *                     given an eventfd file descriptor.
> + *
> + * @fd: [in] Eventfd file descriptor.
> + *
> + * Returns: In case of success, it returns a pointer to the internal eventfd
> + *          context, otherwise a proper error code.
> + */
> +struct eventfd_ctx *eventfd_ctx_fdget(int fd)
> +{
> +	struct file *file;
> +	struct eventfd_ctx *ctx;
> +
> +	file = eventfd_fget(fd);
> +	if (IS_ERR(file))
> +		return (struct eventfd_ctx *) file;
> +	ctx = eventfd_ctx_get(file->private_data);
> +	fput(file);
> +
> +	return ctx;
> +}
> +EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
> +
>  SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
>  {
>  	int fd;
> @@ -217,6 +295,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, 
>  	if (!ctx)
>  		return -ENOMEM;
>  
> +	kref_init(&ctx->kref);
>  	init_waitqueue_head(&ctx->wqh);
>  	ctx->count = count;
>  	ctx->flags = flags;
> Index: linux-2.6.mod/include/linux/eventfd.h
> ===================================================================
> --- linux-2.6.mod.orig/include/linux/eventfd.h	2009-06-21 16:54:15.000000000 -0700
> +++ linux-2.6.mod/include/linux/eventfd.h	2009-06-23 08:54:43.000000000 -0700
> @@ -8,10 +8,8 @@
>  #ifndef _LINUX_EVENTFD_H
>  #define _LINUX_EVENTFD_H
>  
> -#ifdef CONFIG_EVENTFD
> -
> -/* For O_CLOEXEC and O_NONBLOCK */
>  #include <linux/fcntl.h>
> +#include <linux/file.h>
>  
>  /*
>   * CAREFUL: Check include/asm-generic/fcntl.h when defining
> @@ -28,15 +26,10 @@
>  #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
>  
>  struct file *eventfd_fget(int fd);
> -int eventfd_signal(struct file *file, int n);
> -
> -#else /* CONFIG_EVENTFD */
> -
> -#define eventfd_fget(fd) ERR_PTR(-ENOSYS)
> -static inline int eventfd_signal(struct file *file, int n)
> -{ return 0; }
> -
> -#endif /* CONFIG_EVENTFD */
> +struct eventfd_ctx *eventfd_ctx_fdget(int fd);
> +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
> +void eventfd_ctx_put(struct eventfd_ctx *ctx);
> +int eventfd_signal(struct eventfd_ctx *ctx, int n);
>   

I think the lack of a way to got from a file* to a ctx* (or back) might
be a problem.  I am not an expert, but if I understood the gist of what
Al Viro (cc'd) was saying early on, an fd can change meaning out from
under you without warning.

With the code the way it is now, I would need to call both
eventfd_fget() and eventfd_ctx_fdget() to fully acquire state (unless I
am missing something).  I would think that is a race and I am not
guaranteed to get the same object.  Can we also have a way to convert
one reference to the other?  I am thinking

struct eventfd_ctx *eventfd_ctx_fget(struct file *)

(or similar) would be sufficient.

-Greg

>  
>  #endif /* _LINUX_EVENTFD_H */
>  
> Index: linux-2.6.mod/fs/aio.c
> ===================================================================
> --- linux-2.6.mod.orig/fs/aio.c	2009-06-21 16:54:15.000000000 -0700
> +++ linux-2.6.mod/fs/aio.c	2009-06-22 10:54:13.000000000 -0700
> @@ -485,6 +485,8 @@ static inline void really_put_req(struct
>  {
>  	assert_spin_locked(&ctx->ctx_lock);
>  
> +	if (req->ki_eventfd != NULL)
> +		eventfd_ctx_put(req->ki_eventfd);
>  	if (req->ki_dtor)
>  		req->ki_dtor(req);
>  	if (req->ki_iovec != &req->ki_inline_vec)
> @@ -509,8 +511,6 @@ static void aio_fput_routine(struct work
>  		/* Complete the fput(s) */
>  		if (req->ki_filp != NULL)
>  			__fput(req->ki_filp);
> -		if (req->ki_eventfd != NULL)
> -			__fput(req->ki_eventfd);
>  
>  		/* Link the iocb into the context's free list */
>  		spin_lock_irq(&ctx->ctx_lock);
> @@ -528,8 +528,6 @@ static void aio_fput_routine(struct work
>   */
>  static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
>  {
> -	int schedule_putreq = 0;
> -
>  	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
>  		req, atomic_long_read(&req->ki_filp->f_count));
>  
> @@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *
>  	 * we would not be holding the last reference to the file*, so
>  	 * this function will be executed w/out any aio kthread wakeup.
>  	 */
> -	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
> -		schedule_putreq++;
> -	else
> -		req->ki_filp = NULL;
> -	if (req->ki_eventfd != NULL) {
> -		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
> -			schedule_putreq++;
> -		else
> -			req->ki_eventfd = NULL;
> -	}
> -	if (unlikely(schedule_putreq)) {
> +	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
>  		get_ioctx(ctx);
>  		spin_lock(&fput_lock);
>  		list_add(&req->ki_list, &fput_head);
>  		spin_unlock(&fput_lock);
>  		queue_work(aio_wq, &fput_work);
> -	} else
> +	} else {
> +		req->ki_filp = NULL;
>  		really_put_req(ctx, req);
> +	}
>  	return 1;
>  }
>  
> @@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *
>  		 * an eventfd() fd, and will be signaled for each completed
>  		 * event using the eventfd_signal() function.
>  		 */
> -		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
> +		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
>  		if (IS_ERR(req->ki_eventfd)) {
>  			ret = PTR_ERR(req->ki_eventfd);
>  			req->ki_eventfd = NULL;
> Index: linux-2.6.mod/include/linux/aio.h
> ===================================================================
> --- linux-2.6.mod.orig/include/linux/aio.h	2009-06-21 16:54:15.000000000 -0700
> +++ linux-2.6.mod/include/linux/aio.h	2009-06-23 09:19:11.000000000 -0700
> @@ -121,9 +121,9 @@ struct kiocb {
>  
>  	/*
>  	 * If the aio_resfd field of the userspace iocb is not zero,
> -	 * this is the underlying file* to deliver event to.
> +	 * this is the underlying eventfd context to deliver events to.
>  	 */
> -	struct file		*ki_eventfd;
> +	struct eventfd_ctx	*ki_eventfd;
>  };
>  
>  #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
> Index: linux-2.6.mod/init/Kconfig
> ===================================================================
> --- linux-2.6.mod.orig/init/Kconfig	2009-06-21 16:54:15.000000000 -0700
> +++ linux-2.6.mod/init/Kconfig	2009-06-22 10:54:13.000000000 -0700
> @@ -925,6 +925,7 @@ config SHMEM
>  
>  config AIO
>  	bool "Enable AIO support" if EMBEDDED
> +	select EVENTFD
>  	default y
>  	help
>  	  This option enables POSIX asynchronous I/O which may by used
> Index: linux-2.6.mod/drivers/lguest/lg.h
> ===================================================================
> --- linux-2.6.mod.orig/drivers/lguest/lg.h	2009-06-21 16:54:15.000000000 -0700
> +++ linux-2.6.mod/drivers/lguest/lg.h	2009-06-23 08:55:34.000000000 -0700
> @@ -82,7 +82,7 @@ struct lg_cpu {
>  
>  struct lg_eventfd {
>  	unsigned long addr;
> -	struct file *event;
> +	struct eventfd_ctx *event;
>  };
>  
>  struct lg_eventfd_map {
> Index: linux-2.6.mod/drivers/lguest/lguest_user.c
> ===================================================================
> --- linux-2.6.mod.orig/drivers/lguest/lguest_user.c	2009-06-21 16:54:15.000000000 -0700
> +++ linux-2.6.mod/drivers/lguest/lguest_user.c	2009-06-22 10:54:13.000000000 -0700
> @@ -50,7 +50,7 @@ static int add_eventfd(struct lguest *lg
>  
>  	/* Now append new entry. */
>  	new->map[new->num].addr = addr;
> -	new->map[new->num].event = eventfd_fget(fd);
> +	new->map[new->num].event = eventfd_ctx_fdget(fd);
>  	if (IS_ERR(new->map[new->num].event)) {
>  		kfree(new);
>  		return PTR_ERR(new->map[new->num].event);
> @@ -357,7 +357,7 @@ static int close(struct inode *inode, st
>  
>  	/* Release any eventfds they registered. */
>  	for (i = 0; i < lg->eventfds->num; i++)
> -		fput(lg->eventfds->map[i].event);
> +		eventfd_ctx_put(lg->eventfds->map[i].event);
>  	kfree(lg->eventfds);
>  
>  	/* If lg->dead doesn't contain an error code it will be NULL or a
>   



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 266 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 16:47 [patch] eventfd - revised interface and cleanups Davide Libenzi
                   ` (2 preceding siblings ...)
  2009-06-23 17:51 ` Gregory Haskins
@ 2009-06-23 19:25 ` Davide Libenzi
  2009-06-23 19:48   ` Andrew Morton
                     ` (3 more replies)
  3 siblings, 4 replies; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 19:25 UTC (permalink / raw)
  To: Linux Kernel Mailing List
  Cc: Andrew Morton, avi, kvm, Gregory Haskins, Rusty Russell,
	Benjamin LaHaise

The following patch changes the eventfd interface to de-couple the eventfd 
memory context, from the file pointer instance.
Without such change, there is no clean way to racely free handle the 
POLLHUP event sent when the last instance of the file* goes away.
Also, now the internal eventfd APIs are using the eventfd context instead 
of the file*.
Another cleanup this patch does, is making AIO select EVENTFD, instead of 
adding a bunch of empty function stubs inside eventfd.h in order to 
handle the (AIO && !EVENTFD) case.



Signed-off-by: Davide Libenzi <davidel@xmailserver.org>


- Davide


---
 drivers/lguest/lg.h          |    2 
 drivers/lguest/lguest_user.c |    4 -
 fs/aio.c                     |   24 ++------
 fs/eventfd.c                 |  115 ++++++++++++++++++++++++++++++++++++++-----
 include/linux/aio.h          |    4 -
 include/linux/eventfd.h      |   18 ++----
 init/Kconfig                 |    1 
 7 files changed, 122 insertions(+), 46 deletions(-)

Index: linux-2.6.mod/fs/eventfd.c
===================================================================
--- linux-2.6.mod.orig/fs/eventfd.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/fs/eventfd.c	2009-06-23 10:57:58.000000000 -0700
@@ -14,35 +14,41 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
-#include <linux/eventfd.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
 
 struct eventfd_ctx {
+	struct kref kref;
 	wait_queue_head_t wqh;
 	/*
 	 * Every time that a write(2) is performed on an eventfd, the
 	 * value of the __u64 being written is added to "count" and a
 	 * wakeup is performed on "wqh". A read(2) will return the "count"
 	 * value to userspace, and will reset "count" to zero. The kernel
-	 * size eventfd_signal() also, adds to the "count" counter and
+	 * side eventfd_signal() also, adds to the "count" counter and
 	 * issue a wakeup.
 	 */
 	__u64 count;
 	unsigned int flags;
 };
 
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returining a POLLERR
+ * to poll(2).
+ *
+ * Returns: In case of success, it returns @n, otherwise (in case of overflow
+ *          of the eventfd 64bit internal counter) a value lower than @n.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	unsigned long flags;
 
 	if (n < 0)
@@ -59,9 +65,48 @@ int eventfd_signal(struct file *file, in
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
+static void eventfd_free(struct kref *kref)
+{
+	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
+
+	kfree(ctx);
+}
+
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to the eventfd context.
+ *
+ * Returns: In case of success, returns a pointer to the eventfd context,
+ *          otherwise a proper error code.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * The eventfd context reference must have been previously acquired either
+ * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ *
+ * Returns: Nothing.
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+	kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
+
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-	kfree(file->private_data);
+	struct eventfd_ctx *ctx = file->private_data;
+
+	wake_up_poll(&ctx->wqh, POLLHUP);
+	eventfd_ctx_put(ctx);
 	return 0;
 }
 
@@ -185,6 +230,13 @@ static const struct file_operations even
 	.write		= eventfd_write,
 };
 
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns: A pointer to the eventfd file structure in case of success, or a
+ *          proper error pointer in case of failure.
+ */
 struct file *eventfd_fget(int fd)
 {
 	struct file *file;
@@ -201,6 +253,44 @@ struct file *eventfd_fget(int fd)
 }
 EXPORT_SYMBOL_GPL(eventfd_fget);
 
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns: In case of success, it returns a pointer to the internal eventfd
+ *          context, otherwise a proper error code.
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	struct file *file;
+	struct eventfd_ctx *ctx;
+
+	file = eventfd_fget(fd);
+	if (IS_ERR(file))
+		return (struct eventfd_ctx *) file;
+	ctx = eventfd_ctx_get(file->private_data);
+	fput(file);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+
+/**
+ * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
+ * @file: [in] Eventfd file pointer.
+ *
+ * Returns: In case of success, it returns a pointer to the internal eventfd
+ *          context, otherwise a proper error code.
+ */
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
+{
+	if (file->f_op != &eventfd_fops)
+		return ERR_PTR(-EINVAL);
+
+	return eventfd_ctx_get(file->private_data);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
+
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
 	int fd;
@@ -217,6 +307,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, 
 	if (!ctx)
 		return -ENOMEM;
 
+	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
Index: linux-2.6.mod/include/linux/eventfd.h
===================================================================
--- linux-2.6.mod.orig/include/linux/eventfd.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/include/linux/eventfd.h	2009-06-23 10:58:15.000000000 -0700
@@ -8,10 +8,8 @@
 #ifndef _LINUX_EVENTFD_H
 #define _LINUX_EVENTFD_H
 
-#ifdef CONFIG_EVENTFD
-
-/* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/fcntl.h>
+#include <linux/file.h>
 
 /*
  * CAREFUL: Check include/asm-generic/fcntl.h when defining
@@ -27,16 +25,12 @@
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
+void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
-int eventfd_signal(struct file *file, int n);
-
-#else /* CONFIG_EVENTFD */
-
-#define eventfd_fget(fd) ERR_PTR(-ENOSYS)
-static inline int eventfd_signal(struct file *file, int n)
-{ return 0; }
-
-#endif /* CONFIG_EVENTFD */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd);
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
+int eventfd_signal(struct eventfd_ctx *ctx, int n);
 
 #endif /* _LINUX_EVENTFD_H */
 
Index: linux-2.6.mod/fs/aio.c
===================================================================
--- linux-2.6.mod.orig/fs/aio.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/fs/aio.c	2009-06-22 10:54:13.000000000 -0700
@@ -485,6 +485,8 @@ static inline void really_put_req(struct
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (req->ki_eventfd != NULL)
+		eventfd_ctx_put(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work
 		/* Complete the fput(s) */
 		if (req->ki_filp != NULL)
 			__fput(req->ki_filp);
-		if (req->ki_eventfd != NULL)
-			__fput(req->ki_eventfd);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-	int schedule_putreq = 0;
-
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
-		schedule_putreq++;
-	else
-		req->ki_filp = NULL;
-	if (req->ki_eventfd != NULL) {
-		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
-			schedule_putreq++;
-		else
-			req->ki_eventfd = NULL;
-	}
-	if (unlikely(schedule_putreq)) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
 		spin_unlock(&fput_lock);
 		queue_work(aio_wq, &fput_work);
-	} else
+	} else {
+		req->ki_filp = NULL;
 		really_put_req(ctx, req);
+	}
 	return 1;
 }
 
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *
 		 * an eventfd() fd, and will be signaled for each completed
 		 * event using the eventfd_signal() function.
 		 */
-		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			req->ki_eventfd = NULL;
Index: linux-2.6.mod/include/linux/aio.h
===================================================================
--- linux-2.6.mod.orig/include/linux/aio.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/include/linux/aio.h	2009-06-23 09:19:11.000000000 -0700
@@ -121,9 +121,9 @@ struct kiocb {
 
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
-	 * this is the underlying file* to deliver event to.
+	 * this is the underlying eventfd context to deliver events to.
 	 */
-	struct file		*ki_eventfd;
+	struct eventfd_ctx	*ki_eventfd;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
Index: linux-2.6.mod/init/Kconfig
===================================================================
--- linux-2.6.mod.orig/init/Kconfig	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/init/Kconfig	2009-06-22 10:54:13.000000000 -0700
@@ -925,6 +925,7 @@ config SHMEM
 
 config AIO
 	bool "Enable AIO support" if EMBEDDED
+	select EVENTFD
 	default y
 	help
 	  This option enables POSIX asynchronous I/O which may by used
Index: linux-2.6.mod/drivers/lguest/lg.h
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lg.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lg.h	2009-06-23 08:55:34.000000000 -0700
@@ -82,7 +82,7 @@ struct lg_cpu {
 
 struct lg_eventfd {
 	unsigned long addr;
-	struct file *event;
+	struct eventfd_ctx *event;
 };
 
 struct lg_eventfd_map {
Index: linux-2.6.mod/drivers/lguest/lguest_user.c
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lguest_user.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lguest_user.c	2009-06-22 10:54:13.000000000 -0700
@@ -50,7 +50,7 @@ static int add_eventfd(struct lguest *lg
 
 	/* Now append new entry. */
 	new->map[new->num].addr = addr;
-	new->map[new->num].event = eventfd_fget(fd);
+	new->map[new->num].event = eventfd_ctx_fdget(fd);
 	if (IS_ERR(new->map[new->num].event)) {
 		kfree(new);
 		return PTR_ERR(new->map[new->num].event);
@@ -357,7 +357,7 @@ static int close(struct inode *inode, st
 
 	/* Release any eventfds they registered. */
 	for (i = 0; i < lg->eventfds->num; i++)
-		fput(lg->eventfds->map[i].event);
+		eventfd_ctx_put(lg->eventfds->map[i].event);
 	kfree(lg->eventfds);
 
 	/* If lg->dead doesn't contain an error code it will be NULL or a


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 19:25 ` [patch] eventfd - revised interface and cleanups (2nd rev) Davide Libenzi
@ 2009-06-23 19:48   ` Andrew Morton
  2009-06-23 19:49     ` Davide Libenzi
  2009-06-23 20:12   ` Andrew Morton
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2009-06-23 19:48 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009 12:25:36 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> The following patch changes the eventfd interface to de-couple the eventfd 
> memory context, from the file pointer instance.
> Without such change, there is no clean way to racely free handle the 
> POLLHUP event sent when the last instance of the file* goes away.
> Also, now the internal eventfd APIs are using the eventfd context instead 
> of the file*.
> Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> adding a bunch of empty function stubs inside eventfd.h in order to 
> handle the (AIO && !EVENTFD) case.
> 

If we really want to squeeze this into 2.6.31 then it would be helpful
to have justification in the changelog, please.  I see that some KVM
feature needs it, but what and why and why now, etc?

The patch conflicts with similar changes int he KVM tree, but that'll
just ruin sfr's life for a day.


Your patch has:


> ...
>  static int eventfd_release(struct inode *inode, struct file *file)
>  {
> -	kfree(file->private_data);
> +	struct eventfd_ctx *ctx = file->private_data;
> +
> +	wake_up_poll(&ctx->wqh, POLLHUP);
> +	eventfd_ctx_put(ctx);
>  	return 0;
>  }

but the patch in linux-next has

static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	/*
	 * No need to hold the lock here, since we are on the file cleanup
	 * path and the ones still attached to the wait queue will be
	 * serialized by wake_up_locked_poll().
	 */
	wake_up_locked_poll(&ctx->wqh, POLLHUP);
	kfree(ctx);
	return 0;
}

which looks quite different (and has a nice comment!)

What's going on here?

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 19:48   ` Andrew Morton
@ 2009-06-23 19:49     ` Davide Libenzi
  0 siblings, 0 replies; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 19:49 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, avi, kvm, ghaskins, Rusty Russell,
	Benjamin LaHaise

On Tue, 23 Jun 2009, Andrew Morton wrote:

> On Tue, 23 Jun 2009 12:25:36 -0700 (PDT)
> Davide Libenzi <davidel@xmailserver.org> wrote:
> 
> > The following patch changes the eventfd interface to de-couple the eventfd 
> > memory context, from the file pointer instance.
> > Without such change, there is no clean way to racely free handle the 
> > POLLHUP event sent when the last instance of the file* goes away.
> > Also, now the internal eventfd APIs are using the eventfd context instead 
> > of the file*.
> > Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> > adding a bunch of empty function stubs inside eventfd.h in order to 
> > handle the (AIO && !EVENTFD) case.
> > 
> 
> If we really want to squeeze this into 2.6.31 then it would be helpful
> to have justification in the changelog, please.  I see that some KVM
> feature needs it, but what and why and why now, etc?

The patch in -next added the ability to have waiters to be notified when 
the last instance of the file* is dropped (that is, on ->release).
But it is not possible for waiters to handle the POLLHUP event in a 
racy-free way, unless the eventfd memory context is de-coupled from the 
file*.
The next patches from gregory will use eventfd (and the POLLHUP handling) 
to implement the IRQfd code inside KVM.


> What's going on here?

The POLLHUP patch in -next is not sufficent. I based the patch over 
mainline, but if you guys want I can rebase it over -next.



- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 19:25 ` [patch] eventfd - revised interface and cleanups (2nd rev) Davide Libenzi
  2009-06-23 19:48   ` Andrew Morton
@ 2009-06-23 20:12   ` Andrew Morton
  2009-06-23 20:59     ` Davide Libenzi
  2009-06-23 20:18   ` Andrew Morton
  2009-06-23 22:46   ` [patch] eventfd - revised interface and cleanups (3rd rev) Davide Libenzi
  3 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2009-06-23 20:12 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009 12:25:36 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> adding a bunch of empty function stubs inside eventfd.h in order to 
> handle the (AIO && !EVENTFD) case.

Given that we're trying to squeak this patch into 2.6.31, it's quite
unfortunate to include unrelated changes.  Especially ones which
involve the always-problematic "select".  Although this one looks less
than usually deadly due to the simple dependencies of AIO and eventfd.

However...

Is this a good way of fixing the (AIO && !EVENTFD) case?  Surely if the
developer selected this combination, he doesn't want to carry the
overhead of including eventfd.  IOW, if AIO can work acceptably without
eventfd being compiled into the kernel then adding the stubs is a
superior solution.



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 19:25 ` [patch] eventfd - revised interface and cleanups (2nd rev) Davide Libenzi
  2009-06-23 19:48   ` Andrew Morton
  2009-06-23 20:12   ` Andrew Morton
@ 2009-06-23 20:18   ` Andrew Morton
  2009-06-23 21:03     ` Davide Libenzi
  2009-06-23 22:46   ` [patch] eventfd - revised interface and cleanups (3rd rev) Davide Libenzi
  3 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2009-06-23 20:18 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009 12:25:36 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> The following patch changes the eventfd interface to de-couple the eventfd 
> memory context, from the file pointer instance.
> Without such change, there is no clean way to racely free handle the 
> POLLHUP event sent when the last instance of the file* goes away.
> Also, now the internal eventfd APIs are using the eventfd context instead 
> of the file*.
> Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> adding a bunch of empty function stubs inside eventfd.h in order to 
> handle the (AIO && !EVENTFD) case.
> 
> 
> ...
>
> +/**
> + * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
> + * @ctx: [in] Pointer to the eventfd context.
> + *
> + * Returns: In case of success, returns a pointer to the eventfd context,
> + *          otherwise a proper error code.

The description of the return value

> + */
> +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
> +{
> +	kref_get(&ctx->kref);
> +	return ctx;
> +}
> +EXPORT_SYMBOL_GPL(eventfd_ctx_get);

doesn't match the code.


Also...

> + * Returns: A pointer to the eventfd file structure in case of success, or a
> + *          proper error pointer in case of failure.


> + * Returns: In case of success, it returns a pointer to the internal eventfd
> + *          context, otherwise a proper error code.
> + */

I'm unsure what the word "proper" means in this context.

The term "proper error pointer" is understandable enough - something
you run IS_ERR() against.  "error pointer" would suffice.

But the term "proper error code" is getting a bit remote from reality.


Unfortunately the kernel doesn't have a simple and agreed-to term for
an ERR_PTR() thingy.  Perhaps we should invent one.  "err_ptr"?


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 20:12   ` Andrew Morton
@ 2009-06-23 20:59     ` Davide Libenzi
  2009-06-23 21:25       ` Andrew Morton
  0 siblings, 1 reply; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 20:59 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, avi, kvm, ghaskins, Rusty Russell,
	Benjamin LaHaise

On Tue, 23 Jun 2009, Andrew Morton wrote:

> On Tue, 23 Jun 2009 12:25:36 -0700 (PDT)
> Davide Libenzi <davidel@xmailserver.org> wrote:
> 
> > Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> > adding a bunch of empty function stubs inside eventfd.h in order to 
> > handle the (AIO && !EVENTFD) case.
> 
> Given that we're trying to squeak this patch into 2.6.31, it's quite
> unfortunate to include unrelated changes.  Especially ones which
> involve the always-problematic "select".  Although this one looks less
> than usually deadly due to the simple dependencies of AIO and eventfd.
> 
> However...
> 
> Is this a good way of fixing the (AIO && !EVENTFD) case?  Surely if the
> developer selected this combination, he doesn't want to carry the
> overhead of including eventfd.  IOW, if AIO can work acceptably without
> eventfd being compiled into the kernel then adding the stubs is a
> superior solution.

IMO when someone says "AIO included in the kernel", should get the whole 
AIO functionality and not part of it.
People already started using KAIO+eventfd, and a (AIO && !EVENTFD) config 
will make their code fail.


- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 20:18   ` Andrew Morton
@ 2009-06-23 21:03     ` Davide Libenzi
  2009-06-23 21:29       ` Andrew Morton
  0 siblings, 1 reply; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 21:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, avi, kvm, ghaskins, Rusty Russell,
	Benjamin LaHaise

On Tue, 23 Jun 2009, Andrew Morton wrote:

> On Tue, 23 Jun 2009 12:25:36 -0700 (PDT)
> Davide Libenzi <davidel@xmailserver.org> wrote:
> 
> > The following patch changes the eventfd interface to de-couple the eventfd 
> > memory context, from the file pointer instance.
> > Without such change, there is no clean way to racely free handle the 
> > POLLHUP event sent when the last instance of the file* goes away.
> > Also, now the internal eventfd APIs are using the eventfd context instead 
> > of the file*.
> > Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> > adding a bunch of empty function stubs inside eventfd.h in order to 
> > handle the (AIO && !EVENTFD) case.
> > 
> > ...
> >
> > +/**
> > + * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
> > + * @ctx: [in] Pointer to the eventfd context.
> > + *
> > + * Returns: In case of success, returns a pointer to the eventfd context,
> > + *          otherwise a proper error code.
> 
> The description of the return value

Should functions be describing all the returned error codes, ala man pages?



> > + */
> > +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
> > +{
> > +	kref_get(&ctx->kref);
> > +	return ctx;
> > +}
> > +EXPORT_SYMBOL_GPL(eventfd_ctx_get);
> 
> doesn't match the code.
> 
> Also...
> 
> > + * Returns: A pointer to the eventfd file structure in case of success, or a
> > + *          proper error pointer in case of failure.
> 
> 
> > + * Returns: In case of success, it returns a pointer to the internal eventfd
> > + *          context, otherwise a proper error code.
> > + */
> 
> I'm unsure what the word "proper" means in this context.
> 
> The term "proper error pointer" is understandable enough - something
> you run IS_ERR() against.  "error pointer" would suffice.
> 
> But the term "proper error code" is getting a bit remote from reality.
> 
> Unfortunately the kernel doesn't have a simple and agreed-to term for
> an ERR_PTR() thingy.  Perhaps we should invent one.  "err_ptr"?

OK, but you tricked me once again :)
You posted your comments/changes while you merged the old version in -mm 
already.



- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 21:25       ` Andrew Morton
@ 2009-06-23 21:25         ` Davide Libenzi
  2009-06-23 21:44           ` Andrew Morton
  0 siblings, 1 reply; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 21:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, avi, kvm, ghaskins, Rusty Russell,
	Benjamin LaHaise

On Tue, 23 Jun 2009, Andrew Morton wrote:

> That isn't for us to decide. Entire syscalls can be disabled in config.

That is not a well defined separate syscall though. It's a member/feature 
of the aiocb.


- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 20:59     ` Davide Libenzi
@ 2009-06-23 21:25       ` Andrew Morton
  2009-06-23 21:25         ` Davide Libenzi
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2009-06-23 21:25 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009 13:59:07 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> On Tue, 23 Jun 2009, Andrew Morton wrote:
> 
> > On Tue, 23 Jun 2009 12:25:36 -0700 (PDT)
> > Davide Libenzi <davidel@xmailserver.org> wrote:
> > 
> > > Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> > > adding a bunch of empty function stubs inside eventfd.h in order to 
> > > handle the (AIO && !EVENTFD) case.
> > 
> > Given that we're trying to squeak this patch into 2.6.31, it's quite
> > unfortunate to include unrelated changes.  Especially ones which
> > involve the always-problematic "select".  Although this one looks less
> > than usually deadly due to the simple dependencies of AIO and eventfd.
> > 
> > However...
> > 
> > Is this a good way of fixing the (AIO && !EVENTFD) case?  Surely if the
> > developer selected this combination, he doesn't want to carry the
> > overhead of including eventfd.  IOW, if AIO can work acceptably without
> > eventfd being compiled into the kernel then adding the stubs is a
> > superior solution.
> 
> IMO when someone says "AIO included in the kernel", should get the whole 
> AIO functionality and not part of it.
> People already started using KAIO+eventfd, and a (AIO && !EVENTFD) config 
> will make their code fail.

That isn't for us to decide. Entire syscalls can be disabled in config.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 21:29       ` Andrew Morton
@ 2009-06-23 21:28         ` Davide Libenzi
  2009-06-23 21:34         ` Davide Libenzi
  1 sibling, 0 replies; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 21:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Davide Libenzi, linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009, Andrew Morton wrote:

> > > > +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
> > > > +{
> > > > +	kref_get(&ctx->kref);
> > > > +	return ctx;
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(eventfd_ctx_get);
> > > 
> > > doesn't match the code.
> 
> You appear to have not seen the above sentence.

I saw that and have already fixed it. Sorry I missed the ACK.



> yeah, I never trust people.  You might lose the email or jump on a
> plane and disappear for three weeks, then it all gets forgotten about
> and lost.

Jumping on a plane. Check. Disappearing for 6 weeks. Check. ;)


- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 21:03     ` Davide Libenzi
@ 2009-06-23 21:29       ` Andrew Morton
  2009-06-23 21:28         ` Davide Libenzi
  2009-06-23 21:34         ` Davide Libenzi
  0 siblings, 2 replies; 31+ messages in thread
From: Andrew Morton @ 2009-06-23 21:29 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009 14:03:22 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> On Tue, 23 Jun 2009, Andrew Morton wrote:
> 
> > On Tue, 23 Jun 2009 12:25:36 -0700 (PDT)
> > Davide Libenzi <davidel@xmailserver.org> wrote:
> > 
> > > The following patch changes the eventfd interface to de-couple the eventfd 
> > > memory context, from the file pointer instance.
> > > Without such change, there is no clean way to racely free handle the 
> > > POLLHUP event sent when the last instance of the file* goes away.
> > > Also, now the internal eventfd APIs are using the eventfd context instead 
> > > of the file*.
> > > Another cleanup this patch does, is making AIO select EVENTFD, instead of 
> > > adding a bunch of empty function stubs inside eventfd.h in order to 
> > > handle the (AIO && !EVENTFD) case.
> > > 
> > > ...
> > >
> > > +/**
> > > + * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
> > > + * @ctx: [in] Pointer to the eventfd context.
> > > + *
> > > + * Returns: In case of success, returns a pointer to the eventfd context,
> > > + *          otherwise a proper error code.
> > 
> > The description of the return value
> 
> Should functions be describing all the returned error codes, ala man pages?
> 

I think so.

> 
> > > + */
> > > +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
> > > +{
> > > +	kref_get(&ctx->kref);
> > > +	return ctx;
> > > +}
> > > +EXPORT_SYMBOL_GPL(eventfd_ctx_get);
> > 
> > doesn't match the code.

You appear to have not seen the above sentence.

> > Also...
> > 
> > > + * Returns: A pointer to the eventfd file structure in case of success, or a
> > > + *          proper error pointer in case of failure.
> > 
> > 
> > > + * Returns: In case of success, it returns a pointer to the internal eventfd
> > > + *          context, otherwise a proper error code.
> > > + */
> > 
> > I'm unsure what the word "proper" means in this context.
> > 
> > The term "proper error pointer" is understandable enough - something
> > you run IS_ERR() against.  "error pointer" would suffice.
> > 
> > But the term "proper error code" is getting a bit remote from reality.
> > 
> > Unfortunately the kernel doesn't have a simple and agreed-to term for
> > an ERR_PTR() thingy.  Perhaps we should invent one.  "err_ptr"?
> 
> OK, but you tricked me once again :)
> You posted your comments/changes while you merged the old version in -mm 
> already.

yeah, I never trust people.  You might lose the email or jump on a
plane and disappear for three weeks, then it all gets forgotten about
and lost.

If the code doesn't have any apparent showstoppers I'll often merge it
with a note that it isn't finalised.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 21:29       ` Andrew Morton
  2009-06-23 21:28         ` Davide Libenzi
@ 2009-06-23 21:34         ` Davide Libenzi
  2009-06-23 21:46           ` Andrew Morton
  1 sibling, 1 reply; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 21:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, avi, kvm, ghaskins, Rusty Russell,
	Benjamin LaHaise

On Tue, 23 Jun 2009, Andrew Morton wrote:

> > Should functions be describing all the returned error codes, ala man pages?
> > 
> 
> I think so.

This becomes pretty painful when the function calls other functions, for 
which just relays the error code.
Should we be just documenting the error codes introduced by the function 
code, and say that the function returns errors A, B, C plus all the ones 
returned by the called functions X, Y, Z?
If not, it becomes hell in maintaining the comments...



- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 21:25         ` Davide Libenzi
@ 2009-06-23 21:44           ` Andrew Morton
  2009-06-23 22:49             ` Davide Libenzi
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2009-06-23 21:44 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009 14:25:05 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> On Tue, 23 Jun 2009, Andrew Morton wrote:
> 
> > That isn't for us to decide. Entire syscalls can be disabled in config.
> 
> That is not a well defined separate syscall though. It's a member/feature 
> of the aiocb.

I don't know what this means, really.

AIO eventfd support is a relatively recently added enhancement to AIO,
is it not?  Applications which continue to use the pre-May07 AIO
features will continue to work as before (they darn well better).  So
for such applications, AIO=y/EVENTFD=n kernels are usable and useful,
and eliminating this option is a loss?

Either way, I believe that this change should be unbundled from the
unrelated KVM one so we can handle it separately.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 21:34         ` Davide Libenzi
@ 2009-06-23 21:46           ` Andrew Morton
  2009-06-23 21:48             ` Davide Libenzi
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2009-06-23 21:46 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009 14:34:50 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> On Tue, 23 Jun 2009, Andrew Morton wrote:
> 
> > > Should functions be describing all the returned error codes, ala man pages?
> > > 
> > 
> > I think so.
> 
> This becomes pretty painful when the function calls other functions, for 
> which just relays the error code.
> Should we be just documenting the error codes introduced by the function 
> code, and say that the function returns errors A, B, C plus all the ones 
> returned by the called functions X, Y, Z?
> If not, it becomes hell in maintaining the comments...

Well.  Don't worry about making rules.  Taste and common sense apply.  "Will
it be useful to readers if I explicitly document the return value".  If
"yes" then document away.  If "no" then don't.



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 21:46           ` Andrew Morton
@ 2009-06-23 21:48             ` Davide Libenzi
  2009-06-23 22:07               ` Andrew Morton
  0 siblings, 1 reply; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 21:48 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, avi, kvm, ghaskins, Rusty Russell,
	Benjamin LaHaise

On Tue, 23 Jun 2009, Andrew Morton wrote:

> On Tue, 23 Jun 2009 14:34:50 -0700 (PDT)
> Davide Libenzi <davidel@xmailserver.org> wrote:
> 
> > On Tue, 23 Jun 2009, Andrew Morton wrote:
> > 
> > > > Should functions be describing all the returned error codes, ala man pages?
> > > > 
> > > 
> > > I think so.
> > 
> > This becomes pretty painful when the function calls other functions, for 
> > which just relays the error code.
> > Should we be just documenting the error codes introduced by the function 
> > code, and say that the function returns errors A, B, C plus all the ones 
> > returned by the called functions X, Y, Z?
> > If not, it becomes hell in maintaining the comments...
> 
> Well.  Don't worry about making rules.  Taste and common sense apply.  "Will
> it be useful to readers if I explicitly document the return value".  If
> "yes" then document away.  If "no" then don't.

Are you OK with the format in the patch below?


- Davide


---
 drivers/lguest/lg.h          |    2 
 drivers/lguest/lguest_user.c |    4 -
 fs/aio.c                     |   24 ++------
 fs/eventfd.c                 |  122 ++++++++++++++++++++++++++++++++++++++-----
 include/linux/aio.h          |    4 -
 include/linux/eventfd.h      |   18 ++----
 init/Kconfig                 |    1 
 7 files changed, 129 insertions(+), 46 deletions(-)

Index: linux-2.6.mod/fs/eventfd.c
===================================================================
--- linux-2.6.mod.orig/fs/eventfd.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/fs/eventfd.c	2009-06-23 14:45:54.000000000 -0700
@@ -14,35 +14,44 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
-#include <linux/eventfd.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
 
 struct eventfd_ctx {
+	struct kref kref;
 	wait_queue_head_t wqh;
 	/*
 	 * Every time that a write(2) is performed on an eventfd, the
 	 * value of the __u64 being written is added to "count" and a
 	 * wakeup is performed on "wqh". A read(2) will return the "count"
 	 * value to userspace, and will reset "count" to zero. The kernel
-	 * size eventfd_signal() also, adds to the "count" counter and
+	 * side eventfd_signal() also, adds to the "count" counter and
 	 * issue a wakeup.
 	 */
 	__u64 count;
 	unsigned int flags;
 };
 
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *          The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returining a POLLERR
+ * to poll(2).
+ *
+ * Returns @n in case of success, a non-negative number lower than @n in case
+ * of overflow, or the following error codes:
+ *
+ * -EINVAL    : The value of @n is negative.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	unsigned long flags;
 
 	if (n < 0)
@@ -59,9 +68,45 @@ int eventfd_signal(struct file *file, in
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
+static void eventfd_free(struct kref *kref)
+{
+	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
+
+	kfree(ctx);
+}
+
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to the eventfd context.
+ *
+ * Returns: In case of success, returns a pointer to the eventfd context.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * The eventfd context reference must have been previously acquired either
+ * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+	kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
+
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-	kfree(file->private_data);
+	struct eventfd_ctx *ctx = file->private_data;
+
+	wake_up_poll(&ctx->wqh, POLLHUP);
+	eventfd_ctx_put(ctx);
 	return 0;
 }
 
@@ -185,6 +230,16 @@ static const struct file_operations even
 	.write		= eventfd_write,
 };
 
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the eventfd file structure in case of success, or the
+ * following error pointer:
+ *
+ * -EBADF    : Invalid @fd file descriptor.
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
 struct file *eventfd_fget(int fd)
 {
 	struct file *file;
@@ -201,6 +256,48 @@ struct file *eventfd_fget(int fd)
 }
 EXPORT_SYMBOL_GPL(eventfd_fget);
 
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointers returned by the following functions:
+ *
+ * eventfd_fget
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	struct file *file;
+	struct eventfd_ctx *ctx;
+
+	file = eventfd_fget(fd);
+	if (IS_ERR(file))
+		return (struct eventfd_ctx *) file;
+	ctx = eventfd_ctx_get(file->private_data);
+	fput(file);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+
+/**
+ * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
+ * @file: [in] Eventfd file pointer.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointer:
+ *
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
+{
+	if (file->f_op != &eventfd_fops)
+		return ERR_PTR(-EINVAL);
+
+	return eventfd_ctx_get(file->private_data);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
+
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
 	int fd;
@@ -217,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, 
 	if (!ctx)
 		return -ENOMEM;
 
+	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
Index: linux-2.6.mod/include/linux/eventfd.h
===================================================================
--- linux-2.6.mod.orig/include/linux/eventfd.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/include/linux/eventfd.h	2009-06-23 10:58:15.000000000 -0700
@@ -8,10 +8,8 @@
 #ifndef _LINUX_EVENTFD_H
 #define _LINUX_EVENTFD_H
 
-#ifdef CONFIG_EVENTFD
-
-/* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/fcntl.h>
+#include <linux/file.h>
 
 /*
  * CAREFUL: Check include/asm-generic/fcntl.h when defining
@@ -27,16 +25,12 @@
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
+void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
-int eventfd_signal(struct file *file, int n);
-
-#else /* CONFIG_EVENTFD */
-
-#define eventfd_fget(fd) ERR_PTR(-ENOSYS)
-static inline int eventfd_signal(struct file *file, int n)
-{ return 0; }
-
-#endif /* CONFIG_EVENTFD */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd);
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
+int eventfd_signal(struct eventfd_ctx *ctx, int n);
 
 #endif /* _LINUX_EVENTFD_H */
 
Index: linux-2.6.mod/fs/aio.c
===================================================================
--- linux-2.6.mod.orig/fs/aio.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/fs/aio.c	2009-06-22 10:54:13.000000000 -0700
@@ -485,6 +485,8 @@ static inline void really_put_req(struct
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (req->ki_eventfd != NULL)
+		eventfd_ctx_put(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work
 		/* Complete the fput(s) */
 		if (req->ki_filp != NULL)
 			__fput(req->ki_filp);
-		if (req->ki_eventfd != NULL)
-			__fput(req->ki_eventfd);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-	int schedule_putreq = 0;
-
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
-		schedule_putreq++;
-	else
-		req->ki_filp = NULL;
-	if (req->ki_eventfd != NULL) {
-		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
-			schedule_putreq++;
-		else
-			req->ki_eventfd = NULL;
-	}
-	if (unlikely(schedule_putreq)) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
 		spin_unlock(&fput_lock);
 		queue_work(aio_wq, &fput_work);
-	} else
+	} else {
+		req->ki_filp = NULL;
 		really_put_req(ctx, req);
+	}
 	return 1;
 }
 
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *
 		 * an eventfd() fd, and will be signaled for each completed
 		 * event using the eventfd_signal() function.
 		 */
-		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			req->ki_eventfd = NULL;
Index: linux-2.6.mod/include/linux/aio.h
===================================================================
--- linux-2.6.mod.orig/include/linux/aio.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/include/linux/aio.h	2009-06-23 09:19:11.000000000 -0700
@@ -121,9 +121,9 @@ struct kiocb {
 
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
-	 * this is the underlying file* to deliver event to.
+	 * this is the underlying eventfd context to deliver events to.
 	 */
-	struct file		*ki_eventfd;
+	struct eventfd_ctx	*ki_eventfd;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
Index: linux-2.6.mod/init/Kconfig
===================================================================
--- linux-2.6.mod.orig/init/Kconfig	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/init/Kconfig	2009-06-22 10:54:13.000000000 -0700
@@ -925,6 +925,7 @@ config SHMEM
 
 config AIO
 	bool "Enable AIO support" if EMBEDDED
+	select EVENTFD
 	default y
 	help
 	  This option enables POSIX asynchronous I/O which may by used
Index: linux-2.6.mod/drivers/lguest/lg.h
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lg.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lg.h	2009-06-23 08:55:34.000000000 -0700
@@ -82,7 +82,7 @@ struct lg_cpu {
 
 struct lg_eventfd {
 	unsigned long addr;
-	struct file *event;
+	struct eventfd_ctx *event;
 };
 
 struct lg_eventfd_map {
Index: linux-2.6.mod/drivers/lguest/lguest_user.c
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lguest_user.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lguest_user.c	2009-06-22 10:54:13.000000000 -0700
@@ -50,7 +50,7 @@ static int add_eventfd(struct lguest *lg
 
 	/* Now append new entry. */
 	new->map[new->num].addr = addr;
-	new->map[new->num].event = eventfd_fget(fd);
+	new->map[new->num].event = eventfd_ctx_fdget(fd);
 	if (IS_ERR(new->map[new->num].event)) {
 		kfree(new);
 		return PTR_ERR(new->map[new->num].event);
@@ -357,7 +357,7 @@ static int close(struct inode *inode, st
 
 	/* Release any eventfds they registered. */
 	for (i = 0; i < lg->eventfds->num; i++)
-		fput(lg->eventfds->map[i].event);
+		eventfd_ctx_put(lg->eventfds->map[i].event);
 	kfree(lg->eventfds);
 
 	/* If lg->dead doesn't contain an error code it will be NULL or a


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 21:48             ` Davide Libenzi
@ 2009-06-23 22:07               ` Andrew Morton
  0 siblings, 0 replies; 31+ messages in thread
From: Andrew Morton @ 2009-06-23 22:07 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009 14:48:51 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> > > This becomes pretty painful when the function calls other functions, for 
> > > which just relays the error code.
> > > Should we be just documenting the error codes introduced by the function 
> > > code, and say that the function returns errors A, B, C plus all the ones 
> > > returned by the called functions X, Y, Z?
> > > If not, it becomes hell in maintaining the comments...
> > 
> > Well.  Don't worry about making rules.  Taste and common sense apply.  "Will
> > it be useful to readers if I explicitly document the return value".  If
> > "yes" then document away.  If "no" then don't.
> 
> Are you OK with the format in the patch below?

Looks great to me.

Obviously the cost of maintaining this level of detail is fairly high,
and the chances of bitrot are also high.  So I wouldn't be expecting
people to document things at this level in general.  But if you're
prepared to maintain this then good for you.




^ permalink raw reply	[flat|nested] 31+ messages in thread

* [patch] eventfd - revised interface and cleanups (3rd rev)
  2009-06-23 19:25 ` [patch] eventfd - revised interface and cleanups (2nd rev) Davide Libenzi
                     ` (2 preceding siblings ...)
  2009-06-23 20:18   ` Andrew Morton
@ 2009-06-23 22:46   ` Davide Libenzi
  2009-06-24 23:57     ` [patch] eventfd - revised interface and cleanups (4th rev) Davide Libenzi
  3 siblings, 1 reply; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 22:46 UTC (permalink / raw)
  To: Linux Kernel Mailing List
  Cc: Andrew Morton, avi, kvm, Gregory Haskins, Rusty Russell,
	Benjamin LaHaise

The following patch changes the eventfd interface to de-couple the eventfd 
memory context, from the file pointer instance.
Without such change, there is no clean way to racely free handle the 
POLLHUP event sent when the last instance of the file* goes away.
Also, now the internal eventfd APIs are using the eventfd context instead 
of the file*.
Another cleanup this patch does, is making AIO select EVENTFD, instead of 
adding a bunch of empty function stubs inside eventfd.h in order to 
handle the (AIO && !EVENTFD) case.


Signed-off-by: Davide Libenzi <davidel@xmailserver.org>


- Davide


---
 drivers/lguest/lg.h          |    2 
 drivers/lguest/lguest_user.c |    4 -
 fs/aio.c                     |   24 ++------
 fs/eventfd.c                 |  122 ++++++++++++++++++++++++++++++++++++++-----
 include/linux/aio.h          |    4 -
 include/linux/eventfd.h      |   18 ++----
 init/Kconfig                 |    1 
 7 files changed, 129 insertions(+), 46 deletions(-)

Index: linux-2.6.mod/fs/eventfd.c
===================================================================
--- linux-2.6.mod.orig/fs/eventfd.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/fs/eventfd.c	2009-06-23 14:45:54.000000000 -0700
@@ -14,35 +14,44 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
-#include <linux/eventfd.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
 
 struct eventfd_ctx {
+	struct kref kref;
 	wait_queue_head_t wqh;
 	/*
 	 * Every time that a write(2) is performed on an eventfd, the
 	 * value of the __u64 being written is added to "count" and a
 	 * wakeup is performed on "wqh". A read(2) will return the "count"
 	 * value to userspace, and will reset "count" to zero. The kernel
-	 * size eventfd_signal() also, adds to the "count" counter and
+	 * side eventfd_signal() also, adds to the "count" counter and
 	 * issue a wakeup.
 	 */
 	__u64 count;
 	unsigned int flags;
 };
 
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *          The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returining a POLLERR
+ * to poll(2).
+ *
+ * Returns @n in case of success, a non-negative number lower than @n in case
+ * of overflow, or the following error codes:
+ *
+ * -EINVAL    : The value of @n is negative.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	unsigned long flags;
 
 	if (n < 0)
@@ -59,9 +68,45 @@ int eventfd_signal(struct file *file, in
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
+static void eventfd_free(struct kref *kref)
+{
+	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
+
+	kfree(ctx);
+}
+
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to the eventfd context.
+ *
+ * Returns: In case of success, returns a pointer to the eventfd context.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * The eventfd context reference must have been previously acquired either
+ * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+	kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
+
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-	kfree(file->private_data);
+	struct eventfd_ctx *ctx = file->private_data;
+
+	wake_up_poll(&ctx->wqh, POLLHUP);
+	eventfd_ctx_put(ctx);
 	return 0;
 }
 
@@ -185,6 +230,16 @@ static const struct file_operations even
 	.write		= eventfd_write,
 };
 
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the eventfd file structure in case of success, or the
+ * following error pointer:
+ *
+ * -EBADF    : Invalid @fd file descriptor.
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
 struct file *eventfd_fget(int fd)
 {
 	struct file *file;
@@ -201,6 +256,48 @@ struct file *eventfd_fget(int fd)
 }
 EXPORT_SYMBOL_GPL(eventfd_fget);
 
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointers returned by the following functions:
+ *
+ * eventfd_fget
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	struct file *file;
+	struct eventfd_ctx *ctx;
+
+	file = eventfd_fget(fd);
+	if (IS_ERR(file))
+		return (struct eventfd_ctx *) file;
+	ctx = eventfd_ctx_get(file->private_data);
+	fput(file);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+
+/**
+ * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
+ * @file: [in] Eventfd file pointer.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointer:
+ *
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
+{
+	if (file->f_op != &eventfd_fops)
+		return ERR_PTR(-EINVAL);
+
+	return eventfd_ctx_get(file->private_data);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
+
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
 	int fd;
@@ -217,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, 
 	if (!ctx)
 		return -ENOMEM;
 
+	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
Index: linux-2.6.mod/include/linux/eventfd.h
===================================================================
--- linux-2.6.mod.orig/include/linux/eventfd.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/include/linux/eventfd.h	2009-06-23 10:58:15.000000000 -0700
@@ -8,10 +8,8 @@
 #ifndef _LINUX_EVENTFD_H
 #define _LINUX_EVENTFD_H
 
-#ifdef CONFIG_EVENTFD
-
-/* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/fcntl.h>
+#include <linux/file.h>
 
 /*
  * CAREFUL: Check include/asm-generic/fcntl.h when defining
@@ -27,16 +25,12 @@
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
+void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
-int eventfd_signal(struct file *file, int n);
-
-#else /* CONFIG_EVENTFD */
-
-#define eventfd_fget(fd) ERR_PTR(-ENOSYS)
-static inline int eventfd_signal(struct file *file, int n)
-{ return 0; }
-
-#endif /* CONFIG_EVENTFD */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd);
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
+int eventfd_signal(struct eventfd_ctx *ctx, int n);
 
 #endif /* _LINUX_EVENTFD_H */
 
Index: linux-2.6.mod/fs/aio.c
===================================================================
--- linux-2.6.mod.orig/fs/aio.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/fs/aio.c	2009-06-22 10:54:13.000000000 -0700
@@ -485,6 +485,8 @@ static inline void really_put_req(struct
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (req->ki_eventfd != NULL)
+		eventfd_ctx_put(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work
 		/* Complete the fput(s) */
 		if (req->ki_filp != NULL)
 			__fput(req->ki_filp);
-		if (req->ki_eventfd != NULL)
-			__fput(req->ki_eventfd);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-	int schedule_putreq = 0;
-
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
-		schedule_putreq++;
-	else
-		req->ki_filp = NULL;
-	if (req->ki_eventfd != NULL) {
-		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
-			schedule_putreq++;
-		else
-			req->ki_eventfd = NULL;
-	}
-	if (unlikely(schedule_putreq)) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
 		spin_unlock(&fput_lock);
 		queue_work(aio_wq, &fput_work);
-	} else
+	} else {
+		req->ki_filp = NULL;
 		really_put_req(ctx, req);
+	}
 	return 1;
 }
 
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *
 		 * an eventfd() fd, and will be signaled for each completed
 		 * event using the eventfd_signal() function.
 		 */
-		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			req->ki_eventfd = NULL;
Index: linux-2.6.mod/include/linux/aio.h
===================================================================
--- linux-2.6.mod.orig/include/linux/aio.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/include/linux/aio.h	2009-06-23 09:19:11.000000000 -0700
@@ -121,9 +121,9 @@ struct kiocb {
 
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
-	 * this is the underlying file* to deliver event to.
+	 * this is the underlying eventfd context to deliver events to.
 	 */
-	struct file		*ki_eventfd;
+	struct eventfd_ctx	*ki_eventfd;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
Index: linux-2.6.mod/init/Kconfig
===================================================================
--- linux-2.6.mod.orig/init/Kconfig	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/init/Kconfig	2009-06-22 10:54:13.000000000 -0700
@@ -925,6 +925,7 @@ config SHMEM
 
 config AIO
 	bool "Enable AIO support" if EMBEDDED
+	select EVENTFD
 	default y
 	help
 	  This option enables POSIX asynchronous I/O which may by used
Index: linux-2.6.mod/drivers/lguest/lg.h
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lg.h	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lg.h	2009-06-23 08:55:34.000000000 -0700
@@ -82,7 +82,7 @@ struct lg_cpu {
 
 struct lg_eventfd {
 	unsigned long addr;
-	struct file *event;
+	struct eventfd_ctx *event;
 };
 
 struct lg_eventfd_map {
Index: linux-2.6.mod/drivers/lguest/lguest_user.c
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lguest_user.c	2009-06-21 16:54:15.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lguest_user.c	2009-06-22 10:54:13.000000000 -0700
@@ -50,7 +50,7 @@ static int add_eventfd(struct lguest *lg
 
 	/* Now append new entry. */
 	new->map[new->num].addr = addr;
-	new->map[new->num].event = eventfd_fget(fd);
+	new->map[new->num].event = eventfd_ctx_fdget(fd);
 	if (IS_ERR(new->map[new->num].event)) {
 		kfree(new);
 		return PTR_ERR(new->map[new->num].event);
@@ -357,7 +357,7 @@ static int close(struct inode *inode, st
 
 	/* Release any eventfds they registered. */
 	for (i = 0; i < lg->eventfds->num; i++)
-		fput(lg->eventfds->map[i].event);
+		eventfd_ctx_put(lg->eventfds->map[i].event);
 	kfree(lg->eventfds);
 
 	/* If lg->dead doesn't contain an error code it will be NULL or a


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 21:44           ` Andrew Morton
@ 2009-06-23 22:49             ` Davide Libenzi
  2009-06-23 23:18               ` Andrew Morton
  0 siblings, 1 reply; 31+ messages in thread
From: Davide Libenzi @ 2009-06-23 22:49 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, avi, kvm, ghaskins, Rusty Russell,
	Benjamin LaHaise

On Tue, 23 Jun 2009, Andrew Morton wrote:

> On Tue, 23 Jun 2009 14:25:05 -0700 (PDT)
> Davide Libenzi <davidel@xmailserver.org> wrote:
> 
> > On Tue, 23 Jun 2009, Andrew Morton wrote:
> > 
> > > That isn't for us to decide. Entire syscalls can be disabled in config.
> > 
> > That is not a well defined separate syscall though. It's a member/feature 
> > of the aiocb.
> 
> I don't know what this means, really.

This is the struct iocb:

struct iocb {
	...
	u_int32_t       aio_resfd;
};

And the only interface to access KAIO is io_submit().
IMO the end user perceives the KAIO functionality as the full deployment 
of the iocb struct and the io_submit() accessory.
Can code not using eventfd work in (AIO && !EVENTFD) mode? Sure.
It is a kinda confusing configuration from the end user POV IMO.



- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 22:49             ` Davide Libenzi
@ 2009-06-23 23:18               ` Andrew Morton
  2009-06-24 22:47                 ` Davide Libenzi
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2009-06-23 23:18 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Tue, 23 Jun 2009 15:49:53 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> On Tue, 23 Jun 2009, Andrew Morton wrote:
> 
> > On Tue, 23 Jun 2009 14:25:05 -0700 (PDT)
> > Davide Libenzi <davidel@xmailserver.org> wrote:
> > 
> > > On Tue, 23 Jun 2009, Andrew Morton wrote:
> > > 
> > > > That isn't for us to decide. Entire syscalls can be disabled in config.
> > > 
> > > That is not a well defined separate syscall though. It's a member/feature 
> > > of the aiocb.
> > 
> > I don't know what this means, really.
> 
> This is the struct iocb:
> 
> struct iocb {
> 	...
> 	u_int32_t       aio_resfd;
> };
> 
> And the only interface to access KAIO is io_submit().
> IMO the end user perceives the KAIO functionality as the full deployment 
> of the iocb struct and the io_submit() accessory.
> Can code not using eventfd work in (AIO && !EVENTFD) mode? Sure.
> It is a kinda confusing configuration from the end user POV IMO.

What's confusing about it?

Most programmers who are using AIO probably don't even know that the
eventd hookup is available.  And even if they did, they might not want
to use it, to remain compatible with older kernels.

I'm still not seeing any compelling reason for unconditionally adding
kernel text which some users don't need.

Maybe there is such a reason, and it hasn't yet been beaten into my
skull.  But I still think it should be done in a separate patch.  One
which comes with a full description of the reasons for the change, btw.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-23 23:18               ` Andrew Morton
@ 2009-06-24 22:47                 ` Davide Libenzi
  2009-06-24 23:12                   ` Andrew Morton
  0 siblings, 1 reply; 31+ messages in thread
From: Davide Libenzi @ 2009-06-24 22:47 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, avi, kvm, ghaskins, Rusty Russell,
	Benjamin LaHaise

On Tue, 23 Jun 2009, Andrew Morton wrote:

> Maybe there is such a reason, and it hasn't yet been beaten into my
> skull.  But I still think it should be done in a separate patch.  One
> which comes with a full description of the reasons for the change, btw.

Since your skull seems pretty hard to beat, would you like me to split it 
for ya? :)



- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-24 22:47                 ` Davide Libenzi
@ 2009-06-24 23:12                   ` Andrew Morton
  2009-06-24 23:52                     ` Davide Libenzi
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2009-06-24 23:12 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Wed, 24 Jun 2009 15:47:47 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> On Tue, 23 Jun 2009, Andrew Morton wrote:
> 
> > Maybe there is such a reason, and it hasn't yet been beaten into my
> > skull.  But I still think it should be done in a separate patch.  One
> > which comes with a full description of the reasons for the change, btw.
> 
> Since your skull seems pretty hard to beat, would you like me to split it 
> for ya? :)

Split what?  My skull?

umm, yes please, I believe the patches should be split.  And I'm still
not seeing the justification for forcing CONFIG_EVENTFD onto all
CONFIG_AIO users!


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-24 23:12                   ` Andrew Morton
@ 2009-06-24 23:52                     ` Davide Libenzi
  2009-06-25  0:33                       ` Andrew Morton
  0 siblings, 1 reply; 31+ messages in thread
From: Davide Libenzi @ 2009-06-24 23:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel Mailing List, avi, kvm, ghaskins, Rusty Russell,
	Benjamin LaHaise

On Wed, 24 Jun 2009, Andrew Morton wrote:

> Split what?  My skull?

Heh :)


> umm, yes please, I believe the patches should be split.  And I'm still
> not seeing the justification for forcing CONFIG_EVENTFD onto all
> CONFIG_AIO users!

Eventfd notifications became part of the AIO API (it's not even delivered 
through a new syscall, from the AIO side - same existing aiocb struct and 
io_submit syscall) once we merged it, so IMHO (AIO && !EVENTFD) would be 
similar to split AIO in AIO_READ and AIO_WRITE and have (AIO && !AIO_WRITE).
Considering that the kernel config, once you unleash the CONFIG_EMBEDDED 
pandora box, allows you to select (AIO && !EVENTFD) w/out even a warning 
about possible userspace breakages, this makes it rather a confusing 
configuration if you ask me.
It's not a biggie from the kernel side, just a few ugly errors wrappers 
around functions. For me AIO (or whatever userspace visible kernel 
subsystem) should select all the components that are part of the userspace 
API, but my argument ends here.



- Davide



^ permalink raw reply	[flat|nested] 31+ messages in thread

* [patch] eventfd - revised interface and cleanups (4th rev)
  2009-06-23 22:46   ` [patch] eventfd - revised interface and cleanups (3rd rev) Davide Libenzi
@ 2009-06-24 23:57     ` Davide Libenzi
  0 siblings, 0 replies; 31+ messages in thread
From: Davide Libenzi @ 2009-06-24 23:57 UTC (permalink / raw)
  To: Linux Kernel Mailing List
  Cc: Andrew Morton, avi, kvm, Gregory Haskins, Rusty Russell,
	Benjamin LaHaise

The following patch changes the eventfd interface to de-couple the eventfd 
memory context, from the file pointer instance.
Without such change, there is no clean way to racely free handle the 
POLLHUP event sent when the last instance of the file* goes away.
Also, now the internal eventfd APIs are using the eventfd context instead 
of the file*.

Gregory, none of the APIs changed, so your patches do not need to be 
rebased over this one. The last three revisions just updated comments.

Andrew, I'm not posting the "AIO select EVENTFD" patch at this time. Will 
eventually try another push in your skull once I come back from vacation ;)


Signed-off-by: Davide Libenzi <davidel@xmailserver.org>


- Davide


---
 drivers/lguest/lg.h          |    2 
 drivers/lguest/lguest_user.c |    4 -
 fs/aio.c                     |   24 ++------
 fs/eventfd.c                 |  122 ++++++++++++++++++++++++++++++++++++++-----
 include/linux/aio.h          |    4 -
 include/linux/eventfd.h      |   35 +++++++++---
 6 files changed, 149 insertions(+), 42 deletions(-)

Index: linux-2.6.mod/fs/eventfd.c
===================================================================
--- linux-2.6.mod.orig/fs/eventfd.c	2009-06-23 19:04:38.000000000 -0700
+++ linux-2.6.mod/fs/eventfd.c	2009-06-24 16:13:18.000000000 -0700
@@ -14,35 +14,44 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
-#include <linux/eventfd.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
+#include <linux/kref.h>
+#include <linux/eventfd.h>
 
 struct eventfd_ctx {
+	struct kref kref;
 	wait_queue_head_t wqh;
 	/*
 	 * Every time that a write(2) is performed on an eventfd, the
 	 * value of the __u64 being written is added to "count" and a
 	 * wakeup is performed on "wqh". A read(2) will return the "count"
 	 * value to userspace, and will reset "count" to zero. The kernel
-	 * size eventfd_signal() also, adds to the "count" counter and
+	 * side eventfd_signal() also, adds to the "count" counter and
 	 * issue a wakeup.
 	 */
 	__u64 count;
 	unsigned int flags;
 };
 
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ *          The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returining a POLLERR
+ * to poll(2).
+ *
+ * Returns @n in case of success, a non-negative number lower than @n in case
+ * of overflow, or the following error codes:
+ *
+ * -EINVAL    : The value of @n is negative.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct eventfd_ctx *ctx, int n)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	unsigned long flags;
 
 	if (n < 0)
@@ -59,9 +68,45 @@ int eventfd_signal(struct file *file, in
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
+static void eventfd_free(struct kref *kref)
+{
+	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
+
+	kfree(ctx);
+}
+
+/**
+ * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to the eventfd context.
+ *
+ * Returns: In case of success, returns a pointer to the eventfd context.
+ */
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
+{
+	kref_get(&ctx->kref);
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_get);
+
+/**
+ * eventfd_ctx_put - Releases a reference to the internal eventfd context.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * The eventfd context reference must have been previously acquired either
+ * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ */
+void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+	kref_put(&ctx->kref, eventfd_free);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_put);
+
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-	kfree(file->private_data);
+	struct eventfd_ctx *ctx = file->private_data;
+
+	wake_up_poll(&ctx->wqh, POLLHUP);
+	eventfd_ctx_put(ctx);
 	return 0;
 }
 
@@ -185,6 +230,16 @@ static const struct file_operations even
 	.write		= eventfd_write,
 };
 
+/**
+ * eventfd_fget - Acquire a reference of an eventfd file descriptor.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the eventfd file structure in case of success, or the
+ * following error pointer:
+ *
+ * -EBADF    : Invalid @fd file descriptor.
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
 struct file *eventfd_fget(int fd)
 {
 	struct file *file;
@@ -201,6 +256,48 @@ struct file *eventfd_fget(int fd)
 }
 EXPORT_SYMBOL_GPL(eventfd_fget);
 
+/**
+ * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
+ * @fd: [in] Eventfd file descriptor.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointers returned by the following functions:
+ *
+ * eventfd_fget
+ */
+struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	struct file *file;
+	struct eventfd_ctx *ctx;
+
+	file = eventfd_fget(fd);
+	if (IS_ERR(file))
+		return (struct eventfd_ctx *) file;
+	ctx = eventfd_ctx_get(file->private_data);
+	fput(file);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
+
+/**
+ * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
+ * @file: [in] Eventfd file pointer.
+ *
+ * Returns a pointer to the internal eventfd context, otherwise the error
+ * pointer:
+ *
+ * -EINVAL   : The @fd file descriptor is not an eventfd file.
+ */
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
+{
+	if (file->f_op != &eventfd_fops)
+		return ERR_PTR(-EINVAL);
+
+	return eventfd_ctx_get(file->private_data);
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
+
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
 	int fd;
@@ -217,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, 
 	if (!ctx)
 		return -ENOMEM;
 
+	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
Index: linux-2.6.mod/include/linux/eventfd.h
===================================================================
--- linux-2.6.mod.orig/include/linux/eventfd.h	2009-06-23 19:04:38.000000000 -0700
+++ linux-2.6.mod/include/linux/eventfd.h	2009-06-24 16:35:11.000000000 -0700
@@ -8,10 +8,8 @@
 #ifndef _LINUX_EVENTFD_H
 #define _LINUX_EVENTFD_H
 
-#ifdef CONFIG_EVENTFD
-
-/* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/fcntl.h>
+#include <linux/file.h>
 
 /*
  * CAREFUL: Check include/asm-generic/fcntl.h when defining
@@ -27,16 +25,37 @@
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
+#ifdef CONFIG_EVENTFD
+
+struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
+void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
-int eventfd_signal(struct file *file, int n);
+struct eventfd_ctx *eventfd_ctx_fdget(int fd);
+struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
+int eventfd_signal(struct eventfd_ctx *ctx, int n);
 
 #else /* CONFIG_EVENTFD */
 
-#define eventfd_fget(fd) ERR_PTR(-ENOSYS)
-static inline int eventfd_signal(struct file *file, int n)
-{ return 0; }
+/*
+ * Ugly ugly ugly error layer to support modules that uses eventfd but
+ * pretend to work in !CONFIG_EVENTFD configurations. Namely, AIO.
+ */
+static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline int eventfd_signal(struct eventfd_ctx *ctx, int n)
+{
+	return -ENOSYS;
+}
+
+static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
+{
+
+}
 
-#endif /* CONFIG_EVENTFD */
+#endif
 
 #endif /* _LINUX_EVENTFD_H */
 
Index: linux-2.6.mod/fs/aio.c
===================================================================
--- linux-2.6.mod.orig/fs/aio.c	2009-06-23 19:04:38.000000000 -0700
+++ linux-2.6.mod/fs/aio.c	2009-06-24 16:13:18.000000000 -0700
@@ -485,6 +485,8 @@ static inline void really_put_req(struct
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (req->ki_eventfd != NULL)
+		eventfd_ctx_put(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -509,8 +511,6 @@ static void aio_fput_routine(struct work
 		/* Complete the fput(s) */
 		if (req->ki_filp != NULL)
 			__fput(req->ki_filp);
-		if (req->ki_eventfd != NULL)
-			__fput(req->ki_eventfd);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -528,8 +528,6 @@ static void aio_fput_routine(struct work
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-	int schedule_putreq = 0;
-
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
@@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
-		schedule_putreq++;
-	else
-		req->ki_filp = NULL;
-	if (req->ki_eventfd != NULL) {
-		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
-			schedule_putreq++;
-		else
-			req->ki_eventfd = NULL;
-	}
-	if (unlikely(schedule_putreq)) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
 		spin_unlock(&fput_lock);
 		queue_work(aio_wq, &fput_work);
-	} else
+	} else {
+		req->ki_filp = NULL;
 		really_put_req(ctx, req);
+	}
 	return 1;
 }
 
@@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *
 		 * an eventfd() fd, and will be signaled for each completed
 		 * event using the eventfd_signal() function.
 		 */
-		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			req->ki_eventfd = NULL;
Index: linux-2.6.mod/include/linux/aio.h
===================================================================
--- linux-2.6.mod.orig/include/linux/aio.h	2009-06-23 19:04:38.000000000 -0700
+++ linux-2.6.mod/include/linux/aio.h	2009-06-24 16:13:18.000000000 -0700
@@ -121,9 +121,9 @@ struct kiocb {
 
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
-	 * this is the underlying file* to deliver event to.
+	 * this is the underlying eventfd context to deliver events to.
 	 */
-	struct file		*ki_eventfd;
+	struct eventfd_ctx	*ki_eventfd;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
Index: linux-2.6.mod/drivers/lguest/lg.h
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lg.h	2009-06-23 19:04:38.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lg.h	2009-06-24 16:13:18.000000000 -0700
@@ -82,7 +82,7 @@ struct lg_cpu {
 
 struct lg_eventfd {
 	unsigned long addr;
-	struct file *event;
+	struct eventfd_ctx *event;
 };
 
 struct lg_eventfd_map {
Index: linux-2.6.mod/drivers/lguest/lguest_user.c
===================================================================
--- linux-2.6.mod.orig/drivers/lguest/lguest_user.c	2009-06-23 19:04:38.000000000 -0700
+++ linux-2.6.mod/drivers/lguest/lguest_user.c	2009-06-24 16:13:18.000000000 -0700
@@ -50,7 +50,7 @@ static int add_eventfd(struct lguest *lg
 
 	/* Now append new entry. */
 	new->map[new->num].addr = addr;
-	new->map[new->num].event = eventfd_fget(fd);
+	new->map[new->num].event = eventfd_ctx_fdget(fd);
 	if (IS_ERR(new->map[new->num].event)) {
 		kfree(new);
 		return PTR_ERR(new->map[new->num].event);
@@ -357,7 +357,7 @@ static int close(struct inode *inode, st
 
 	/* Release any eventfds they registered. */
 	for (i = 0; i < lg->eventfds->num; i++)
-		fput(lg->eventfds->map[i].event);
+		eventfd_ctx_put(lg->eventfds->map[i].event);
 	kfree(lg->eventfds);
 
 	/* If lg->dead doesn't contain an error code it will be NULL or a


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [patch] eventfd - revised interface and cleanups (2nd rev)
  2009-06-24 23:52                     ` Davide Libenzi
@ 2009-06-25  0:33                       ` Andrew Morton
  0 siblings, 0 replies; 31+ messages in thread
From: Andrew Morton @ 2009-06-25  0:33 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, avi, kvm, ghaskins, rusty, bcrl

On Wed, 24 Jun 2009 16:52:06 -0700 (PDT)
Davide Libenzi <davidel@xmailserver.org> wrote:

> > umm, yes please, I believe the patches should be split.  And I'm still
> > not seeing the justification for forcing CONFIG_EVENTFD onto all
> > CONFIG_AIO users!
> 
> Eventfd notifications became part of the AIO API (it's not even delivered 
> through a new syscall, from the AIO side - same existing aiocb struct and 
> io_submit syscall) once we merged it,

That was a regression for existing embedded AIO users ;)

> so IMHO (AIO && !EVENTFD) would be 
> similar to split AIO in AIO_READ and AIO_WRITE and have (AIO && !AIO_WRITE).
> Considering that the kernel config, once you unleash the CONFIG_EMBEDDED 
> pandora box, allows you to select (AIO && !EVENTFD) w/out even a warning 
> about possible userspace breakages, this makes it rather a confusing 
> configuration if you ask me.

Sure.  But we do assume that one someone sets CONFIG_EMBEDDED, they
know what they're doing.  We prefer to give them maximum flexibility
and foot-shooting ability.  As long as the maintenance cost of doing so
in each case is reasonable, of course.

> It's not a biggie from the kernel side, just a few ugly errors wrappers 
> around functions. For me AIO (or whatever userspace visible kernel 
> subsystem) should select all the components that are part of the userspace 
> API, but my argument ends here.

Sure, it's not a biggie either way.  Doubtful if many tiny systems are
using AIO anyway.  Heck, lots of them are running 2.4.18...

But from the general this-is-the-way-we-do-things POV, it's preferred
that the two features be separable under CONFIG_EMBEDDED if poss.



^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2009-06-25  0:33 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-06-23 16:47 [patch] eventfd - revised interface and cleanups Davide Libenzi
2009-06-23 16:59 ` Randy Dunlap
2009-06-23 17:04   ` Davide Libenzi
2009-06-23 17:03 ` Avi Kivity
2009-06-23 17:04   ` Davide Libenzi
2009-06-23 17:51 ` Gregory Haskins
2009-06-23 17:51   ` Davide Libenzi
2009-06-23 19:25 ` [patch] eventfd - revised interface and cleanups (2nd rev) Davide Libenzi
2009-06-23 19:48   ` Andrew Morton
2009-06-23 19:49     ` Davide Libenzi
2009-06-23 20:12   ` Andrew Morton
2009-06-23 20:59     ` Davide Libenzi
2009-06-23 21:25       ` Andrew Morton
2009-06-23 21:25         ` Davide Libenzi
2009-06-23 21:44           ` Andrew Morton
2009-06-23 22:49             ` Davide Libenzi
2009-06-23 23:18               ` Andrew Morton
2009-06-24 22:47                 ` Davide Libenzi
2009-06-24 23:12                   ` Andrew Morton
2009-06-24 23:52                     ` Davide Libenzi
2009-06-25  0:33                       ` Andrew Morton
2009-06-23 20:18   ` Andrew Morton
2009-06-23 21:03     ` Davide Libenzi
2009-06-23 21:29       ` Andrew Morton
2009-06-23 21:28         ` Davide Libenzi
2009-06-23 21:34         ` Davide Libenzi
2009-06-23 21:46           ` Andrew Morton
2009-06-23 21:48             ` Davide Libenzi
2009-06-23 22:07               ` Andrew Morton
2009-06-23 22:46   ` [patch] eventfd - revised interface and cleanups (3rd rev) Davide Libenzi
2009-06-24 23:57     ` [patch] eventfd - revised interface and cleanups (4th rev) Davide Libenzi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.