All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] eventfd: implementation of EFD_MASK flag
@ 2013-02-07  6:41 Martin Sustrik
  2013-02-07 19:12 ` Andy Lutomirski
  2013-02-07 22:44 ` Andrew Morton
  0 siblings, 2 replies; 21+ messages in thread
From: Martin Sustrik @ 2013-02-07  6:41 UTC (permalink / raw)
  To: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel, linux-kernel
  Cc: Martin Sustrik

When implementing network protocols in user space, one has to implement
fake user-space file descriptors to represent the sockets for the protocol.

While all the BSD socket API functionality for such descriptors may be faked as
well (myproto_send(), myproto_recv() etc.) this approach doesn't work for
polling (select, poll, epoll). For polling, a real system-level file descriptor
is needed.

In theory, eventfd may be used for this purpose, except that it is well suited
only for signaling POLLIN. With some hacking it can be also used to signal
POLLOUT and POLLERR, however:

I.  There's no way to signal POLLPRI, POLLHUP etc.
II. There's no way to signal an arbitrary combination of POLL* flags. Most notably,
    !POLLIN & !POLLOUT, which is a perfectly valid combination for a network
    protocol (rx buffer is empty and tx buffer is full), cannot be signaled
    using current implementation of eventfd.

This patch implements new EFD_MASK flag which attempts to solve this problem.

Additionally, when implementing network protocols in user space, there's a
need to associate user-space state with each "socket". If an eventfd object is
used as a reference to the socket, it should be possible to associate an opaque
pointer to user-space data with it.

The semantics of EFD_MASK are as follows:

eventfd(2):

If eventfd is created with EFD_MASK flag set, it is initialised in such a way
as to signal no events on the file descriptor when it is polled on. 'initval'
argument is ignored.

write(2):

User is allowed to write only buffers containing the following structure:

struct efd_mask {
  short events;
  void *ptr;
};

The value of 'events' should be any combination of event flags as defined by
poll(2) function (POLLIN, POLLOUT, POLLERR, POLLHUP etc.) Specified events will
be signaled when polling (select, poll, epoll) on the eventfd is done later on.
'ptr' is an opaque pointer that is not interpreted by eventfd object.

read(2):

User is allowed to read an efd_mask structure from the eventfd marked by
EFD_MASK. Returned value shall be the last one written to the eventfd.

select(2), poll(2) and similar:

When polling on the eventfd marked by EFD_MASK flag, all the events specified
in last written 'events' field shall be signaled.

Signed-off-by: Martin Sustrik <sustrik@250bpm.com>
---
 fs/eventfd.c            |  105 ++++++++++++++++++++++++++++++++++++-----------
 include/linux/eventfd.h |    3 +-
 2 files changed, 83 insertions(+), 25 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 35470d9..9fec49f 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -2,6 +2,7 @@
  *  fs/eventfd.c
  *
  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *  Copyright (C) 2013  Martin Sustrik <sustrik@250bpm.com>
  *
  */
 
@@ -22,18 +23,26 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
+struct eventfd_mask {
+	short events;
+	void *ptr;
+};
+
 struct eventfd_ctx {
 	struct kref kref;
 	wait_queue_head_t wqh;
-	/*
-	 * Every time that a write(2) is performed on an eventfd, the
-	 * value of the __u64 being written is added to "count" and a
-	 * wakeup is performed on "wqh". A read(2) will return the "count"
-	 * value to userspace, and will reset "count" to zero. The kernel
-	 * side eventfd_signal() also, adds to the "count" counter and
-	 * issue a wakeup.
-	 */
-	__u64 count;
+	union {
+		/*
+		 * Every time that a write(2) is performed on an eventfd, the
+		 * value of the __u64 being written is added to "count" and a
+		 * wakeup is performed on "wqh". A read(2) will return the
+		 * "count" value to userspace, and will reset "count" to zero.
+		 * The kernel side eventfd_signal() also, adds to the "count"
+		 * counter and issue a wakeup.
+		 */
+		__u64 count;
+		struct eventfd_mask mask;
+	};
 	unsigned int flags;
 };
 
@@ -55,6 +64,9 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 {
 	unsigned long flags;
 
+	/* This function should never be used with eventfd in the mask mode. */
+	BUG_ON(ctx->flags & EFD_MASK);
+
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	if (ULLONG_MAX - ctx->count < n)
 		n = ULLONG_MAX - ctx->count;
@@ -123,12 +135,16 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &ctx->wqh, wait);
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	if (ctx->count > 0)
-		events |= POLLIN;
-	if (ctx->count == ULLONG_MAX)
-		events |= POLLERR;
-	if (ULLONG_MAX - 1 > ctx->count)
-		events |= POLLOUT;
+	if (ctx->flags & EFD_MASK) {
+		events = ctx->mask.events;
+	} else {
+		if (ctx->count > 0)
+			events |= POLLIN;
+		if (ctx->count == ULLONG_MAX)
+			events |= POLLERR;
+		if (ULLONG_MAX - 1 > ctx->count)
+			events |= POLLOUT;
+	}
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return events;
@@ -158,6 +174,9 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
 {
 	unsigned long flags;
 
+	/* This function should never be used with eventfd in the mask mode. */
+	BUG_ON(ctx->flags & EFD_MASK);
+
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	eventfd_ctx_do_read(ctx, cnt);
 	__remove_wait_queue(&ctx->wqh, wait);
@@ -188,6 +207,9 @@ ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
 	ssize_t res;
 	DECLARE_WAITQUEUE(wait, current);
 
+	/* This function should never be used with eventfd in the mask mode. */
+	BUG_ON(ctx->flags & EFD_MASK);
+
 	spin_lock_irq(&ctx->wqh.lock);
 	*cnt = 0;
 	res = -EAGAIN;
@@ -230,13 +252,23 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 	ssize_t res;
 	__u64 cnt;
 
-	if (count < sizeof(cnt))
-		return -EINVAL;
-	res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
-	if (res < 0)
+	if (ctx->flags & EFD_MASK) {
+		spin_lock_irq(&ctx->wqh.lock);
+		if (count < sizeof(ctx->mask))
+			return -EINVAL;
+		res = copy_to_user(buf, &ctx->mask, sizeof(ctx->mask)) ?
+			-EFAULT : sizeof(ctx->mask);
+		spin_unlock_irq(&ctx->wqh.lock);
 		return res;
-
-	return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
+	} else {
+		if (count < sizeof(cnt))
+			return -EINVAL;
+		res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
+		if (res < 0)
+			return res;
+		return put_user(cnt, (__u64 __user *) buf) ?
+			-EFAULT : sizeof(cnt);
+	}
 }
 
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
@@ -247,6 +279,21 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	__u64 ucnt;
 	DECLARE_WAITQUEUE(wait, current);
 
+	if (ctx->flags & EFD_MASK) {
+		if (count < sizeof(ctx->mask))
+			return -EINVAL;
+		spin_lock_irq(&ctx->wqh.lock);
+		if (copy_from_user(&ctx->mask, buf, sizeof(ctx->mask))) {
+			spin_unlock_irq(&ctx->wqh.lock);
+			return -EFAULT;
+		}
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked_poll(&ctx->wqh,
+				(unsigned long)ctx->mask.events);
+		spin_unlock_irq(&ctx->wqh.lock);
+		return sizeof(ctx->mask);
+	}
+
 	if (count < sizeof(ucnt))
 		return -EINVAL;
 	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
@@ -293,8 +340,13 @@ static int eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 	int ret;
 
 	spin_lock_irq(&ctx->wqh.lock);
-	ret = seq_printf(m, "eventfd-count: %16llx\n",
-			 (unsigned long long)ctx->count);
+	if (ctx->flags & EFD_MASK) {
+		ret = seq_printf(m, "eventfd-mask: %x\n",
+				 (unsigned)ctx->mask.events);
+	} else {
+		ret = seq_printf(m, "eventfd-count: %16llx\n",
+				 (unsigned long long)ctx->count);
+	}
 	spin_unlock_irq(&ctx->wqh.lock);
 
 	return ret;
@@ -412,7 +464,12 @@ struct file *eventfd_file_create(unsigned int count, int flags)
 
 	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
-	ctx->count = count;
+	if (flags & EFD_MASK) {
+		ctx->mask.events = 0;
+		ctx->mask.ptr = NULL;
+	} else {
+		ctx->count = count;
+	}
 	ctx->flags = flags;
 
 	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 3c3ef19..b806d2b 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -20,11 +20,12 @@
  * shared O_* flags.
  */
 #define EFD_SEMAPHORE (1 << 0)
+#define EFD_MASK (1 << 1)
 #define EFD_CLOEXEC O_CLOEXEC
 #define EFD_NONBLOCK O_NONBLOCK
 
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
-#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
+#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_MASK)
 
 #ifdef CONFIG_EVENTFD
 
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-07  6:41 [PATCH 1/1] eventfd: implementation of EFD_MASK flag Martin Sustrik
@ 2013-02-07 19:12 ` Andy Lutomirski
  2013-02-07 20:11   ` Martin Sustrik
  2013-02-07 22:44 ` Andrew Morton
  1 sibling, 1 reply; 21+ messages in thread
From: Andy Lutomirski @ 2013-02-07 19:12 UTC (permalink / raw)
  To: Martin Sustrik
  Cc: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel, linux-kernel

On 02/06/2013 10:41 PM, Martin Sustrik wrote:
> When implementing network protocols in user space, one has to implement
> fake user-space file descriptors to represent the sockets for the protocol.
> 
> While all the BSD socket API functionality for such descriptors may be faked as
> well (myproto_send(), myproto_recv() etc.) this approach doesn't work for
> polling  (select, poll, epoll). For polling, real system-level file descriptor
> is needed.
> 
> In theory, eventfd may be used for this purpose, except that it is well suited
> only for signaling POLLIN. With some hacking it can be also used to signal
> POLLOUT and POLLERR, however:
> 
> I.  There's no way to signal POLLPRI, POLLHUP etc.
> II. There's no way to signal arbitraty combination of POLL* flags. Most notably,
>     !POLLIN & !POLLOUT, which is a perfectly valid combination for a network
>     protocol (rx buffer is empty and tx buffer is full), cannot be signaled
>     using current implementation of eventfd.
> 
> This patch implements new EFD_MASK flag which attempts to solve this problem.
> 
> Additionally, when implementing network protocols in user space, there's a
> need to associate user-space state with the each "socket". If eventfd object is
> used as a reference to the socket, it should be possible to associate an opaque
> pointer to user-space data with it.
> 
> The semantics of EFD_MASK are as follows:
> 
> eventfd(2):
> 
> If eventfd is created with EFD_MASK flag set, it is initialised in such a way
> as to signal no events on the file descriptor when it is polled on. 'initval'
> argument is ignored.
> 
> write(2):
> 
> User is allowed to write only buffers containing the following structure:
> 
> struct efd_mask {
>   short events;
>   void *ptr;
> };

IMO that should be u64 ptr to avoid compat problems.

> 
> The value of 'events' should be any combination of event flags as defined by
> poll(2) function (POLLIN, POLLOUT, POLLERR, POLLHUP etc.) Specified events will
> be signaled when polling (select, poll, epoll) on the eventfd is done later on.
> 'ptr' is an opaque pointer that is not interpreted by eventfd object.

How does this interact with EPOLLET?

--Andy

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-07 19:12 ` Andy Lutomirski
@ 2013-02-07 20:11   ` Martin Sustrik
  2013-02-08  1:03     ` Andy Lutomirski
  0 siblings, 1 reply; 21+ messages in thread
From: Martin Sustrik @ 2013-02-07 20:11 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel, linux-kernel

On 07/02/13 20:12, Andy Lutomirski wrote:
> On 02/06/2013 10:41 PM, Martin Sustrik wrote:
>> When implementing network protocols in user space, one has to implement
>> fake user-space file descriptors to represent the sockets for the protocol.
>>
>> While all the BSD socket API functionality for such descriptors may be faked as
>> well (myproto_send(), myproto_recv() etc.) this approach doesn't work for
>> polling  (select, poll, epoll). For polling, real system-level file descriptor
>> is needed.
>>
>> In theory, eventfd may be used for this purpose, except that it is well suited
>> only for signaling POLLIN. With some hacking it can be also used to signal
>> POLLOUT and POLLERR, however:
>>
>> I.  There's no way to signal POLLPRI, POLLHUP etc.
>> II. There's no way to signal arbitraty combination of POLL* flags. Most notably,
>>      !POLLIN&  !POLLOUT, which is a perfectly valid combination for a network
>>      protocol (rx buffer is empty and tx buffer is full), cannot be signaled
>>      using current implementation of eventfd.
>>
>> This patch implements new EFD_MASK flag which attempts to solve this problem.
>>
>> Additionally, when implementing network protocols in user space, there's a
>> need to associate user-space state with the each "socket". If eventfd object is
>> used as a reference to the socket, it should be possible to associate an opaque
>> pointer to user-space data with it.
>>
>> The semantics of EFD_MASK are as follows:
>>
>> eventfd(2):
>>
>> If eventfd is created with EFD_MASK flag set, it is initialised in such a way
>> as to signal no events on the file descriptor when it is polled on. 'initval'
>> argument is ignored.
>>
>> write(2):
>>
>> User is allowed to write only buffers containing the following structure:
>>
>> struct efd_mask {
>>    short events;
>>    void *ptr;
>> };
>
> IMO that should be u64 ptr to avoid compat problems.

I was following the user space declaration of epoll_data:

            typedef union epoll_data {
                void        *ptr;  <-----
                int          fd;
                uint32_t     u32;
                uint64_t     u64;
            } epoll_data_t;

However, now I'm looking at the kernel side definition of the whole 
union which looks like this (obviously it assumes that pointer is never 
longer than 64 bits):

          __u64 data;

Hm, not very helpful. Anyway, I am not a kernel developer, so any 
concrete suggestion about what type to use to map cleanly to user-space 
void* is welcome.

>> The value of 'events' should be any combination of event flags as defined by
>> poll(2) function (POLLIN, POLLOUT, POLLERR, POLLHUP etc.) Specified events will
>> be signaled when polling (select, poll, epoll) on the eventfd is done later on.
>> 'ptr' is an opaque pointer that is not interpreted by eventfd object.
>
> How does this interact with EPOLLET?

That's an interesting question. The original eventfd code doesn't do 
anything specific to either edge or level mode. Neither does my patch.

Inspection of the code seems to suggest that edge vs. level distinction 
is handled elsewhere (ep_send_events_proc) where there is a separate 
list of ready events and the function, after returning the event, 
decides whether to leave the event in the list (level) or delete it from 
the list (edge).

In any case, review from someone with experience with epoll 
implementation would help.

Martin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-07  6:41 [PATCH 1/1] eventfd: implementation of EFD_MASK flag Martin Sustrik
  2013-02-07 19:12 ` Andy Lutomirski
@ 2013-02-07 22:44 ` Andrew Morton
  2013-02-07 23:30   ` Martin Sustrik
  2013-02-08 12:43   ` Martin Sustrik
  1 sibling, 2 replies; 21+ messages in thread
From: Andrew Morton @ 2013-02-07 22:44 UTC (permalink / raw)
  To: Martin Sustrik; +Cc: Alexander Viro, Sha Zhengju, linux-fsdevel, linux-kernel

On Thu,  7 Feb 2013 07:41:32 +0100
Martin Sustrik <sustrik@250bpm.com> wrote:

> When implementing network protocols in user space, one has to implement
> fake user-space file descriptors to represent the sockets for the protocol.
> 
> While all the BSD socket API functionality for such descriptors may be faked as
> well (myproto_send(), myproto_recv() etc.) this approach doesn't work for
> polling  (select, poll, epoll). For polling, real system-level file descriptor
> is needed.

That's a nice changelog but it omitted a critical thing: why do you
think the kernel needs this feature?  What's the value and use case for
being able to poll these descriptors?

So please update the changelog and then cc netdev@vger.kernel.org on
the patch - the netdev people are probably best-situated to comment on
the proposal.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-07 22:44 ` Andrew Morton
@ 2013-02-07 23:30   ` Martin Sustrik
  2013-02-08 12:43   ` Martin Sustrik
  1 sibling, 0 replies; 21+ messages in thread
From: Martin Sustrik @ 2013-02-07 23:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Alexander Viro, Sha Zhengju, linux-fsdevel, linux-kernel

On 07/02/13 23:44, Andrew Morton wrote:

> So please update the changelog and then cc netdev@vger.kernel.org on
> the patch - the netdev people are probably best-situated to comment on
> the proposal.

OK. Done. Thanks for the advice!

Martin



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-07 20:11   ` Martin Sustrik
@ 2013-02-08  1:03     ` Andy Lutomirski
  2013-02-08  5:26       ` Martin Sustrik
  2013-02-08 22:08       ` Eric Wong
  0 siblings, 2 replies; 21+ messages in thread
From: Andy Lutomirski @ 2013-02-08  1:03 UTC (permalink / raw)
  To: Martin Sustrik
  Cc: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel, linux-kernel

On Thu, Feb 7, 2013 at 12:11 PM, Martin Sustrik <sustrik@250bpm.com> wrote:
> On 07/02/13 20:12, Andy Lutomirski wrote:
>>
>> On 02/06/2013 10:41 PM, Martin Sustrik wrote:
>>>
>>> When implementing network protocols in user space, one has to implement
>>> fake user-space file descriptors to represent the sockets for the
>>> protocol.
>>>
>>> While all the BSD socket API functionality for such descriptors may be
>>> faked as
>>> well (myproto_send(), myproto_recv() etc.) this approach doesn't work for
>>> polling  (select, poll, epoll). For polling, real system-level file
>>> descriptor
>>> is needed.
>>>
>>> In theory, eventfd may be used for this purpose, except that it is well
>>> suited
>>> only for signaling POLLIN. With some hacking it can be also used to
>>> signal
>>> POLLOUT and POLLERR, however:
>>>
>>> I.  There's no way to signal POLLPRI, POLLHUP etc.
>>> II. There's no way to signal arbitraty combination of POLL* flags. Most
>>> notably,
>>>      !POLLIN&  !POLLOUT, which is a perfectly valid combination for a
>>> network
>>>
>>>      protocol (rx buffer is empty and tx buffer is full), cannot be
>>> signaled
>>>      using current implementation of eventfd.
>>>
>>> This patch implements new EFD_MASK flag which attempts to solve this
>>> problem.
>>>
>>> Additionally, when implementing network protocols in user space, there's
>>> a
>>> need to associate user-space state with the each "socket". If eventfd
>>> object is
>>> used as a reference to the socket, it should be possible to associate an
>>> opaque
>>> pointer to user-space data with it.
>>>
>>> The semantics of EFD_MASK are as follows:
>>>
>>> eventfd(2):
>>>
>>> If eventfd is created with EFD_MASK flag set, it is initialised in such a
>>> way
>>> as to signal no events on the file descriptor when it is polled on.
>>> 'initval'
>>> argument is ignored.
>>>
>>> write(2):
>>>
>>> User is allowed to write only buffers containing the following structure:
>>>
>>> struct efd_mask {
>>>    short events;
>>>    void *ptr;
>>> };
>>
>>
>> IMO that should be u64 ptr to avoid compat problems.
>
>
> I was following the user space declaration of epoll_data:
>
>            typedef union epoll_data {
>                void        *ptr;  <-----
>                int          fd;
>                uint32_t     u32;
>                uint64_t     u64;
>            } epoll_data_t;
>
> However, now I'm looking at the kernel side definition of the whole union
> which looks like this (obviously it assumes that pointer is never longer
> than 64 bits):
>
>          __u64 data;
>
> Hm, not very helpful. Anyway, I am not a kernel developer, so any concrete
> suggestion about what type to use to map cleanly to user-space void* is
> welcome.

Reusing epoll_data_t seems reasonable.  The main consideration is that
the size of the object should not vary between 32-bit and 64-bit
userspace.

>
>
>>> The value of 'events' should be any combination of event flags as defined
>>> by
>>> poll(2) function (POLLIN, POLLOUT, POLLERR, POLLHUP etc.) Specified
>>> events will
>>> be signaled when polling (select, poll, epoll) on the eventfd is done
>>> later on.
>>> 'ptr' is an opaque pointer that is not interpreted by eventfd object.
>>
>>
>> How does this interact with EPOLLET?
>
>
> That's an interesting question. The original eventfd code doesn't do
> anything specific to either edge or level mode. Neither does my patch.
>
> Inspection of the code seems to suggest that edge vs. level distinction is
> handled elsewhere (ep_send_events_proc) where there is a separate list of
> ready events and the function, after returning the event, decides whether to
> leave the event in the list (level) or delete it from the list (edge).

Hmm.  Having looked at the eventpoll.c source again, I remain
unconvinced that EPOLLET works the way that any userspace developer
would expect it to.  But your code probably has very little to do with
this, so maybe you shouldn't worry about it.  There may be some
advantage to adding (later on, if needed) an option to change the
flags set in:

+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked_poll(&ctx->wqh,
+				(unsigned long)ctx->mask.events);

(i.e. to allow the second parameter to omit some bits that were
already signaled.)  Allowing write to write a bigger struct in the
future won't break anything.

It may be a good idea to return EINVAL if anyone tries to write() an
unknown poll bit.

--Andy

>
> In any case, review from someone with experience with epoll implementation
> would help.
>
> Martin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-08  1:03     ` Andy Lutomirski
@ 2013-02-08  5:26       ` Martin Sustrik
  2013-02-08  6:36         ` Andy Lutomirski
  2013-02-08 22:08       ` Eric Wong
  1 sibling, 1 reply; 21+ messages in thread
From: Martin Sustrik @ 2013-02-08  5:26 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel, linux-kernel

Hi Andy,

On 08/02/13 02:03, Andy Lutomirski wrote:
> There may be some
> advantage to adding (later on, if needed) an option to change the
> flags set in:
>
> +		if (waitqueue_active(&ctx->wqh))
> +			wake_up_locked_poll(&ctx->wqh,
> +				(unsigned long)ctx->mask.events);
>
> (i.e. to allow the second parameter to omit some bits that were
> already signaled.)  Allowing write to write a bigger struct in the
> future won't break anything.

I think I don't follow. Either the second parameter is supposed to be 
*newly* signaled events, in which case the events that were already 
signaled in the past should be omitted, or it is meant to be *all* 
signaled events, in which case the current implementation is OK.

Martin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-08  5:26       ` Martin Sustrik
@ 2013-02-08  6:36         ` Andy Lutomirski
  2013-02-08  6:55           ` Martin Sustrik
  0 siblings, 1 reply; 21+ messages in thread
From: Andy Lutomirski @ 2013-02-08  6:36 UTC (permalink / raw)
  To: Martin Sustrik
  Cc: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel, linux-kernel

On Thu, Feb 7, 2013 at 9:26 PM, Martin Sustrik <sustrik@250bpm.com> wrote:
> Hi Andy,
>
>
> On 08/02/13 02:03, Andy Lutomirski wrote:
>>
>> There may be some
>> advantage to adding (later on, if needed) an option to change the
>> flags set in:
>>
>> +               if (waitqueue_active(&ctx->wqh))
>> +                       wake_up_locked_poll(&ctx->wqh,
>> +                               (unsigned long)ctx->mask.events);
>>
>> (i.e. to allow the second parameter to omit some bits that were
>> already signaled.)  Allowing write to write a bigger struct in the
>> future won't break anything.
>
>
> I think I don't follow. Either the second parameter is supposed to be
> *newly* signaled events, in which case the events that were already signaled
> in the past should be ommitted, or it is meant to be *all* signaled events,
> in which case the current implementation is OK.

I defer to the experts here.  But I suspect that if you want to
perfectly emulate sockets, you may need to vary what you specify.
(IIRC tcp sockets report an EPOLLIN edge every time data is received
even if the receive buffer wasn't empty.)

--Andy

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-08  6:36         ` Andy Lutomirski
@ 2013-02-08  6:55           ` Martin Sustrik
  0 siblings, 0 replies; 21+ messages in thread
From: Martin Sustrik @ 2013-02-08  6:55 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel, linux-kernel

On 08/02/13 07:36, Andy Lutomirski wrote:

>> On 08/02/13 02:03, Andy Lutomirski wrote:
>>>
>>> There may be some
>>> advantage to adding (later on, if needed) an option to change the
>>> flags set in:
>>>
>>> +               if (waitqueue_active(&ctx->wqh))
>>> +                       wake_up_locked_poll(&ctx->wqh,
>>> +                               (unsigned long)ctx->mask.events);
>>>
>>> (i.e. to allow the second parameter to omit some bits that were
>>> already signaled.)  Allowing write to write a bigger struct in the
>>> future won't break anything.
>>
>>
>> I think I don't follow. Either the second parameter is supposed to be
>> *newly* signaled events, in which case the events that were already signaled
>> in the past should be ommitted, or it is meant to be *all* signaled events,
>> in which case the current implementation is OK.
>
> I defer to the experts here.  But I suspect that if you want to
> perfectly emulate sockets, you may need to vary what you specify.
> (IIRC tcp sockets report an EPOLLIN edge every time data is received
> even if the receive buffer wasn't empty.)

Hm. That sounds like leaking protocol implementation details to the 
user. That's a bad design IMO and should not be encouraged.

Anyway, I have implemented your other suggestions.

Btw, one thing I am not sure about is how to submit improved patches to 
the ML. Should I use the same patch name? Doesn't that cause confusion?

Martin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-07 22:44 ` Andrew Morton
  2013-02-07 23:30   ` Martin Sustrik
@ 2013-02-08 12:43   ` Martin Sustrik
  2013-02-08 22:21     ` Eric Wong
  1 sibling, 1 reply; 21+ messages in thread
From: Martin Sustrik @ 2013-02-08 12:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Viro, Sha Zhengju, linux-fsdevel, linux-kernel, netdev

On 07/02/13 23:44, Andrew Morton wrote:

> That's a nice changelog but it omitted a critical thing: why do you
> think the kernel needs this feature?  What's the value and use case for
> being able to poll these descriptors?

To address the question, I've written down a detailed description of the
challenges of network protocol development in user space and how the
proposed feature addresses the problems.

It's too long to fit into ChangeLog, but it may be worth reading when 
trying to judge the merit of the patch.

It can be found here: http://www.250bpm.com/blog:16

Martin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-08  1:03     ` Andy Lutomirski
  2013-02-08  5:26       ` Martin Sustrik
@ 2013-02-08 22:08       ` Eric Wong
  2013-02-09  3:26         ` Martin Sustrik
  1 sibling, 1 reply; 21+ messages in thread
From: Eric Wong @ 2013-02-08 22:08 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Martin Sustrik, Alexander Viro, Andrew Morton, Sha Zhengju,
	linux-fsdevel, linux-kernel

Andy Lutomirski <luto@amacapital.net> wrote:
> On Thu, Feb 7, 2013 at 12:11 PM, Martin Sustrik <sustrik@250bpm.com> wrote:
> > On 07/02/13 20:12, Andy Lutomirski wrote:
> >> On 02/06/2013 10:41 PM, Martin Sustrik wrote:
> >>> The value of 'events' should be any combination of event flags as defined
> >>> by
> >>> poll(2) function (POLLIN, POLLOUT, POLLERR, POLLHUP etc.) Specified
> >>> events will
> >>> be signaled when polling (select, poll, epoll) on the eventfd is done
> >>> later on.
> >>> 'ptr' is an opaque pointer that is not interpreted by eventfd object.
> >>
> >> How does this interact with EPOLLET?
> >
> > That's an interesting question. The original eventfd code doesn't do
> > anything specific to either edge or level mode. Neither does my patch.
> >
> > Inspection of the code seems to suggest that edge vs. level distinction is
> > handled elsewhere (ep_send_events_proc) where there is a separate list of
> > ready events and the function, after returning the event, decides whether to
> > leave the event in the list (level) or delete it from the list (edge).

Right, the edge vs. level distinction is internal to epoll.

> Hmm.  Having looked at the eventpoll.c source again, I remain
> unconvinced that EPOLLET works the way that any userspace developer
> would expect it to.

As a userspace developer, EPOLLET seems to work as expected/documented;
but I realized EPOLLONESHOT is what I want to be using instead.

> > In any case, review from someone with experience with epoll implementation
> > would help.

I'm no expert, but I don't think eventfd (or any file type) needs to
care about what I/O notification scheme/options it's used with.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-08 12:43   ` Martin Sustrik
@ 2013-02-08 22:21     ` Eric Wong
  2013-02-09  2:40       ` Martin Sustrik
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Wong @ 2013-02-08 22:21 UTC (permalink / raw)
  To: Martin Sustrik
  Cc: Andrew Morton, Alexander Viro, Sha Zhengju, linux-fsdevel,
	linux-kernel, netdev

Martin Sustrik <sustrik@250bpm.com> wrote:
> On 07/02/13 23:44, Andrew Morton wrote:
> >That's a nice changelog but it omitted a critical thing: why do you
> >think the kernel needs this feature?  What's the value and use case for
> >being able to poll these descriptors?
> 
> To address the question, I've written down detailed description of
> the challenges of the network protocol development in user space and
> how the proposed feature addresses the problems.
> 
> It's too long to fit into ChangeLog, but it may be worth reading
> when trying to judge the merit of the patch.
> 
> It can be found here: http://www.250bpm.com/blog:16

Using one eventfd per userspace socket still seems a bit wasteful.

Couldn't you use a single pipe for all sockets and write the efd_mask to
the pipe for each socket?

A read from the pipe would behave like epoll_wait.

You might need to use one-shot semantics; but that's probably
the easiest thing in multithreaded apps anyways.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-08 22:21     ` Eric Wong
@ 2013-02-09  2:40       ` Martin Sustrik
  2013-02-09  3:54         ` Eric Wong
  0 siblings, 1 reply; 21+ messages in thread
From: Martin Sustrik @ 2013-02-09  2:40 UTC (permalink / raw)
  To: Eric Wong
  Cc: Andrew Morton, Alexander Viro, Sha Zhengju, linux-fsdevel,
	linux-kernel, netdev

Hi Eric,

On 08/02/13 23:21, Eric Wong wrote:
> Martin Sustrik<sustrik@250bpm.com>  wrote:
>> On 07/02/13 23:44, Andrew Morton wrote:
>>> That's a nice changelog but it omitted a critical thing: why do you
>>> think the kernel needs this feature?  What's the value and use case for
>>> being able to poll these descriptors?
>>
>> To address the question, I've written down detailed description of
>> the challenges of the network protocol development in user space and
>> how the proposed feature addresses the problems.
>>
>> It's too long to fit into ChangeLog, but it may be worth reading
>> when trying to judge the merit of the patch.
>>
>> It can be found here: http://www.250bpm.com/blog:16
>
> Using one eventfd per userspace socket still seems a bit wasteful.

Wasteful in what sense? Occupying a slot in file descriptor table? 
That's the price for having the socket uniquely identified by the fd.

> Couldn't you use a single pipe for all sockets and write the efd_mask to
> the pipe for each socket?
>
> A read from the pipe would behave like epoll_wait.
>
> You might need to use one-shot semantics; but that's probably
> the easiest thing in multithreaded apps anyways.

With multiple sockets represented by a single eventfd, how would you 
distinguish where individual events came from?

   struct pollfd pfd;
   ...
   poll (pfd, 1, -1);
   if (pfd.revents & POLLIN) /* Incoming data on which socket? */
     ...

Martin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-08 22:08       ` Eric Wong
@ 2013-02-09  3:26         ` Martin Sustrik
  0 siblings, 0 replies; 21+ messages in thread
From: Martin Sustrik @ 2013-02-09  3:26 UTC (permalink / raw)
  To: Eric Wong
  Cc: Andy Lutomirski, Alexander Viro, Andrew Morton, Sha Zhengju,
	linux-fsdevel, linux-kernel

On 08/02/13 23:08, Eric Wong wrote:

>>>>> poll(2) function (POLLIN, POLLOUT, POLLERR, POLLHUP etc.) Specified
>>>>> events will
>>>>> be signaled when polling (select, poll, epoll) on the eventfd is done
>>>>> later on.
>>>>> 'ptr' is an opaque pointer that is not interpreted by eventfd object.
>>>>
>>>> How does this interact with EPOLLET?
>>>
>>> That's an interesting question. The original eventfd code doesn't do
>>> anything specific to either edge or level mode. Neither does my patch.
>>>
>>> Inspection of the code seems to suggest that edge vs. level distinction is
>>> handled elsewhere (ep_send_events_proc) where there is a separate list of
>>> ready events and the function, after returning the event, decides whether to
>>> leave the event in the list (level) or delete it from the list (edge).
>
> Right, the edge vs. level distinction is internal to epoll.

I wrote a test program for EFD_MASK+EPOLLET and it seems to behave in 
intuitive kind of way:

int main ()
{
     int fd;
     struct efd_mask mask;
     ssize_t nbytes;
     int rc;
     int ep;
     struct epoll_event epe;

     fd = eventfd (0, EFD_MASK);

     ep = epoll_create (10);
     assert (ep != -1);
     epe.events = EPOLLIN | EPOLLET;
     rc = epoll_ctl (ep, EPOLL_CTL_ADD, fd, &epe);
     assert (rc != -1);

     mask.events = 0;
     nbytes = write (fd, &mask, sizeof (mask));
     assert (nbytes == sizeof (mask));
     rc = epoll_wait (ep, &epe, 1, 100);
     assert (rc == 0);

     mask.events = POLLIN;
     nbytes = write (fd, &mask, sizeof (mask));
     assert (nbytes == sizeof (mask));
     rc = epoll_wait (ep, &epe, 1, 100);
     assert (rc == 1 && epe.events == EPOLLIN);
     rc = epoll_wait (ep, &epe, 1, 100);
     assert (rc == 0);

     mask.events = POLLIN;
     nbytes = write (fd, &mask, sizeof (mask));
     mask.events = 0;
     nbytes = write (fd, &mask, sizeof (mask));
     rc = epoll_wait (ep, &epe, 1, 100);
     assert (rc == 0);

     rc = close (ep);
     assert (rc == 0);
     rc = close (fd);
     assert (rc == 0);

     return 0;
}

Martin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-09  2:40       ` Martin Sustrik
@ 2013-02-09  3:54         ` Eric Wong
  2013-02-09  7:36           ` Martin Sustrik
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Wong @ 2013-02-09  3:54 UTC (permalink / raw)
  To: Martin Sustrik
  Cc: Andrew Morton, Alexander Viro, Sha Zhengju, linux-fsdevel,
	linux-kernel, netdev

Martin Sustrik <sustrik@250bpm.com> wrote:
> On 08/02/13 23:21, Eric Wong wrote:
> >Martin Sustrik<sustrik@250bpm.com>  wrote:
> >>To address the question, I've written down detailed description of
> >>the challenges of the network protocol development in user space and
> >>how the proposed feature addresses the problems.
> >>
> >>It can be found here: http://www.250bpm.com/blog:16
> >
> >Using one eventfd per userspace socket still seems a bit wasteful.
> 
> Wasteful in what sense? Occupying a slot in file descriptor table?
> That's the price for having the socket uniquely identified by the
> fd.

Yes.  I realize eventfd is small, but I don't think eventfd is needed
at all, here.  Just one pipe.

> >Couldn't you use a single pipe for all sockets and write the efd_mask to
> >the pipe for each socket?
> >
> >A read from the pipe would behave like epoll_wait.
> >
> >You might need to use one-shot semantics; but that's probably
> >the easiest thing in multithreaded apps anyways.
> 
> With multiple sockets represented by a single eventfd, how would
> you distinguish where individual events came from?
> 
>   struct pollfd pfd;
>   ...
>   poll (pfd, 1, -1);
>   if (pfd.revents & POLLIN) /* Incoming data on which socket? */
>     ...

No eventfd; you just write the struct to the pipe, and consume the
struct into a fixed-size buffer:

/* trigger readiness notification for sock,
 * this probably needs a lock around it
 */
void sock_trigger(struct my_sock *sock, int events)
{
	struct efd_mask mask;

	/* check if the triggeered event is something sock wants: */
	events &= sock->watched_events;

	if (!events)
		return;

	mask.events = events;
	mask.ptr = sock;

	/*
	 * preventing sock from being in the pipe multiple times
	 * is probably required (or just a good idea).  Which is
	 * why I mentioned oneshot semantics are probably required.
	 */
	if (oneshot)
		sock->watched_events = 0;

	/*
	 * This is analogous to:
	 *   list_add_tail(&epi->rdllink, &ep->rdllist);
	 * in fs/eventpoll.c
	 *
	 * This may block, but that's why consumer_loop runs in different
	 * threads.  Or run some iteration of consumer_loop here if
	 * it blocks (beware of stack depth from recursion, though)
	 */
	write(pipe_wr, &mask, sizeof(mask));
}

/*
 * Consumer side of the pipe; runs in another thread (or several threads).
 *
 * Each iteration reads one struct efd_mask from pipe_rd, takes the socket
 * from mask.ptr, dispatches on the event bits in mask.events and then
 * updates sock->watched_events.  The loop never terminates.
 */
void consumer_loop(int pipe_rd)
{
	struct efd_mask mask;
	struct my_sock *sock;

	for (;;) {
		/*
		 * analogous to:
		 *    epoll_wait(.., maxevents=1, ...);
		 *
		 * You can read several masks at once if you have one thread,
		 * but I usually use maxevents=1 (+several threads) to
		 * distribute traffic between threads
		 *
		 * NOTE(review): the return value of read() is discarded, so
		 * an error or short read would go unnoticed in this sketch.
		 */
		read(pipe_rd, &mask, sizeof(mask));
		sock = mask.ptr;
		/* POLLIN takes precedence: POLLOUT is only handled when
		 * POLLIN is not set in this notification */
		if (mask.events & POLLIN)
			sock_read(sock);
		else if (mask.events & POLLOUT)
			sock_write(sock);
		...

		/* analogous to epoll_ctl() */
		if (sock->write_buffered)
			sock->watched_events |= POLLOUT;
		if (sock->wants_more_data)
			sock->watched_events |= POLLIN;

		/* onto the next ready event */
	}
}

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-09  3:54         ` Eric Wong
@ 2013-02-09  7:36           ` Martin Sustrik
  2013-02-09 11:51             ` Eric Wong
  0 siblings, 1 reply; 21+ messages in thread
From: Martin Sustrik @ 2013-02-09  7:36 UTC (permalink / raw)
  To: Eric Wong
  Cc: Andrew Morton, Alexander Viro, Sha Zhengju, linux-fsdevel,
	linux-kernel, netdev

On 09/02/13 04:54, Eric Wong wrote:

>>> Using one eventfd per userspace socket still seems a bit wasteful.
>>
>> Wasteful in what sense? Occupying a slot in file descriptor table?
>> That's the price for having the socket uniquely identified by the
>> fd.
>
> Yes.  I realize eventfd is small, but I don't think eventfd is needed
> at all, here.  Just one pipe.

Ah. Got you! You mean not to change the kernel, just use pipe for the 
purpose.

However, the convoluted pipe-style design is the problem I am trying to 
solve rather than the solution. It leads to convoluted APIs with 
convoluted semantics as described in the article. I've been using that 
kind of design for past 8 years and every time I have to deal with it I 
swear that one day I will implement a proper in-kernel solution to get 
rid of the hack.

And now I have finally done so.

Martin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-09  7:36           ` Martin Sustrik
@ 2013-02-09 11:51             ` Eric Wong
  2013-02-09 12:04               ` Martin Sustrik
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Wong @ 2013-02-09 11:51 UTC (permalink / raw)
  To: Martin Sustrik
  Cc: Andrew Morton, Alexander Viro, Sha Zhengju, linux-fsdevel,
	linux-kernel, netdev

Martin Sustrik <sustrik@250bpm.com> wrote:
> On 09/02/13 04:54, Eric Wong wrote:
> >>>Using one eventfd per userspace socket still seems a bit wasteful.
> >>
> >>Wasteful in what sense? Occupying a slot in file descriptor table?
> >>That's the price for having the socket uniquely identified by the
> >>fd.
> >
> >Yes.  I realize eventfd is small, but I don't think eventfd is needed
> >at all, here.  Just one pipe.
> 
> Ah. Got you! You mean not to change the kernel, just use pipe for
> the purpose.
> 
> However, the convoluted pipe-style design is the problem I am trying
> to solve rather than the solution. It leads to convoluted APIs with
> convoluted semantics as described in the article. I've been using
> that kind of design for past 8 years and every time I have to deal
> with it I swear that one day I will implement a proper in-kernel
> solution to get rid of the hack.
> 
> And now I have finally done so.

Yes, your eventfd change is probably the best way if you want/need
to only watch a subset of your sockets, especially if you want
poll/select to be an option.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-09 11:51             ` Eric Wong
@ 2013-02-09 12:04               ` Martin Sustrik
  0 siblings, 0 replies; 21+ messages in thread
From: Martin Sustrik @ 2013-02-09 12:04 UTC (permalink / raw)
  To: Eric Wong
  Cc: Andrew Morton, Alexander Viro, Sha Zhengju, linux-fsdevel,
	linux-kernel, netdev

On 2013-02-09 12:51, Eric Wong wrote:

> Yes, your eventfd change is probably the best way if you want/need
> to only watch a subset of your sockets, especially if you want
> poll/select to be an option.

Yes, the poll/select thing is the important point.

I wouldn't care if the only problem was that I, as the protocol 
implementer, would have to implement some kind of workaround in my 
protocol library. The problem is that these convoluted semantics leak -- 
through the use of poll, select et al. -- to the end user.

 From my personal experience I can say that end users have a pretty hard 
time using such complex workarounds instead of simply using a native 
file descriptor with standardised semantics.

Martin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
  2013-02-07 23:29 Martin Sustrik
@ 2013-02-15  2:45   ` Michał Mirosław
  0 siblings, 0 replies; 21+ messages in thread
From: Michał Mirosław @ 2013-02-15  2:45 UTC (permalink / raw)
  To: Martin Sustrik
  Cc: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel,
	linux-kernel, netdev

2013/2/8 Martin Sustrik <sustrik@250bpm.com>:
> When implementing network protocols in user space, one has to implement
> fake user-space file descriptors to represent the sockets for the protocol.
[...]
> This patch implements new EFD_MASK flag which attempts to solve this problem.
[...]
> @@ -55,6 +64,9 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>  {
>         unsigned long flags;
>
> +       /* This function should never be used with eventfd in the mask mode. */
> +       BUG_ON(ctx->flags & EFD_MASK);
> +
>         spin_lock_irqsave(&ctx->wqh.lock, flags);
>         if (ULLONG_MAX - ctx->count < n)
>                 n = ULLONG_MAX - ctx->count;
> @@ -123,12 +135,16 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
>         poll_wait(file, &ctx->wqh, wait);
>
>         spin_lock_irqsave(&ctx->wqh.lock, flags);
> -       if (ctx->count > 0)
> -               events |= POLLIN;
> -       if (ctx->count == ULLONG_MAX)
> -               events |= POLLERR;
> -       if (ULLONG_MAX - 1 > ctx->count)
> -               events |= POLLOUT;
> +       if (ctx->flags & EFD_MASK) {
> +               events = ctx->mask.events;
> +       } else {
> +               if (ctx->count > 0)
> +                       events |= POLLIN;
> +               if (ctx->count == ULLONG_MAX)
> +                       events |= POLLERR;
> +               if (ULLONG_MAX - 1 > ctx->count)
> +                       events |= POLLOUT;
> +       }
>         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>
>         return events;
[...]
> @@ -412,7 +464,12 @@ struct file *eventfd_file_create(unsigned int count, int flags)
>
>         kref_init(&ctx->kref);
>         init_waitqueue_head(&ctx->wqh);
> -       ctx->count = count;
> +       if (flags & EFD_MASK) {
> +               ctx->mask.events = 0;
> +               ctx->mask.ptr = NULL;
> +       } else {
> +               ctx->count = count;
> +       }
>         ctx->flags = flags;
>
>         file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,

Since EFD_MASK is a persistent flag for a fd's lifetime, maybe you
could instead of all those if/elses and BUG_ON()s use another
file_operations struct for this feature?

Best Regards,
Michał Mirosław

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/1] eventfd: implementation of EFD_MASK flag
@ 2013-02-15  2:45   ` Michał Mirosław
  0 siblings, 0 replies; 21+ messages in thread
From: Michał Mirosław @ 2013-02-15  2:45 UTC (permalink / raw)
  To: Martin Sustrik
  Cc: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel,
	linux-kernel, netdev

2013/2/8 Martin Sustrik <sustrik@250bpm.com>:
> When implementing network protocols in user space, one has to implement
> fake user-space file descriptors to represent the sockets for the protocol.
[...]
> This patch implements new EFD_MASK flag which attempts to solve this problem.
[...]
> @@ -55,6 +64,9 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>  {
>         unsigned long flags;
>
> +       /* This function should never be used with eventfd in the mask mode. */
> +       BUG_ON(ctx->flags & EFD_MASK);
> +
>         spin_lock_irqsave(&ctx->wqh.lock, flags);
>         if (ULLONG_MAX - ctx->count < n)
>                 n = ULLONG_MAX - ctx->count;
> @@ -123,12 +135,16 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
>         poll_wait(file, &ctx->wqh, wait);
>
>         spin_lock_irqsave(&ctx->wqh.lock, flags);
> -       if (ctx->count > 0)
> -               events |= POLLIN;
> -       if (ctx->count == ULLONG_MAX)
> -               events |= POLLERR;
> -       if (ULLONG_MAX - 1 > ctx->count)
> -               events |= POLLOUT;
> +       if (ctx->flags & EFD_MASK) {
> +               events = ctx->mask.events;
> +       } else {
> +               if (ctx->count > 0)
> +                       events |= POLLIN;
> +               if (ctx->count == ULLONG_MAX)
> +                       events |= POLLERR;
> +               if (ULLONG_MAX - 1 > ctx->count)
> +                       events |= POLLOUT;
> +       }
>         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>
>         return events;
[...]
> @@ -412,7 +464,12 @@ struct file *eventfd_file_create(unsigned int count, int flags)
>
>         kref_init(&ctx->kref);
>         init_waitqueue_head(&ctx->wqh);
> -       ctx->count = count;
> +       if (flags & EFD_MASK) {
> +               ctx->mask.events = 0;
> +               ctx->mask.ptr = NULL;
> +       } else {
> +               ctx->count = count;
> +       }
>         ctx->flags = flags;
>
>         file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,

Since EFD_MASK is a persistent flag for a fd's lifetime, maybe you
could instead of all those if/elses and BUG_ON()s use another
file_operations struct for this feature?

Best Regards,
Michał Mirosław
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/1] eventfd: implementation of EFD_MASK flag
@ 2013-02-07 23:29 Martin Sustrik
  2013-02-15  2:45   ` Michał Mirosław
  0 siblings, 1 reply; 21+ messages in thread
From: Martin Sustrik @ 2013-02-07 23:29 UTC (permalink / raw)
  To: Alexander Viro, Andrew Morton, Sha Zhengju, linux-fsdevel, linux-kernel
  Cc: netdev, Martin Sustrik

When implementing network protocols in user space, one has to implement
fake user-space file descriptors to represent the sockets for the protocol.

While all the BSD socket API functionality for such descriptors may be faked as
well (myproto_send(), myproto_recv() etc.) this approach doesn't work for
polling  (select, poll, epoll). And unfortunately, sockets that can't be polled
on allow only for building the simplest possible applications. Basically, you
can build a simple client, but once you want to implement a server handling
many sockets in parallel, you are stuck.

However, to do polling, a real system-level file descriptor is needed,
not a fake one.

In theory, eventfd may be used for this purpose, except that it is well suited
only for signaling POLLIN. With some hacking it can be also used to signal
POLLOUT and POLLERR, but:

I.  There's no way to signal POLLPRI, POLLHUP etc.
II. There's no way to signal an arbitrary combination of POLL* flags. Most notably,
    !POLLIN & !POLLOUT, which is a perfectly valid combination for a network
    protocol (rx buffer is empty and tx buffer is full), cannot be signaled
    using current implementation of eventfd.

This patch implements new EFD_MASK flag which attempts to solve this problem.

Additionally, when implementing network protocols in user space, there's a
need to associate user-space state with each "socket". If an eventfd object is
used as a reference to the socket, it should be possible to associate an opaque
pointer to user-space data with it.

The semantics of EFD_MASK are as follows:

eventfd(2):

If eventfd is created with EFD_MASK flag set, it is initialised in such a way
as to signal no events on the file descriptor when it is polled on. 'initval'
argument is ignored.

write(2):

User is allowed to write only buffers containing the following structure:

struct efd_mask {
  short events;  /* combination of poll(2) event flags (POLLIN, POLLOUT, ...) */
  void *ptr;     /* opaque user pointer; not interpreted by the eventfd object */
};

The value of 'events' should be any combination of event flags as defined by
poll(2) function (POLLIN, POLLOUT, POLLERR, POLLHUP etc.) Specified events will
be signaled when polling (select, poll, epoll) on the eventfd is done later on.
'ptr' is an opaque pointer that is not interpreted by eventfd object.

read(2):

User is allowed to read an efd_mask structure from the eventfd marked by
EFD_MASK. Returned value shall be the last one written to the eventfd.

select(2), poll(2) and similar:

When polling on the eventfd marked by EFD_MASK flag, all the events specified
in last written 'events' field shall be signaled.

Signed-off-by: Martin Sustrik <sustrik@250bpm.com>
---
 fs/eventfd.c            |  105 ++++++++++++++++++++++++++++++++++++-----------
 include/linux/eventfd.h |    3 +-
 2 files changed, 83 insertions(+), 25 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 35470d9..9fec49f 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -2,6 +2,7 @@
  *  fs/eventfd.c
  *
  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *  Copyright (C) 2013  Martin Sustrik <sustrik@250bpm.com>
  *
  */
 
@@ -22,18 +23,26 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
+struct eventfd_mask {
+	short events;
+	void *ptr;
+};
+
 struct eventfd_ctx {
 	struct kref kref;
 	wait_queue_head_t wqh;
-	/*
-	 * Every time that a write(2) is performed on an eventfd, the
-	 * value of the __u64 being written is added to "count" and a
-	 * wakeup is performed on "wqh". A read(2) will return the "count"
-	 * value to userspace, and will reset "count" to zero. The kernel
-	 * side eventfd_signal() also, adds to the "count" counter and
-	 * issue a wakeup.
-	 */
-	__u64 count;
+	union {
+		/*
+		 * Every time that a write(2) is performed on an eventfd, the
+		 * value of the __u64 being written is added to "count" and a
+		 * wakeup is performed on "wqh". A read(2) will return the
+		 * "count" value to userspace, and will reset "count" to zero.
+		 * The kernel side eventfd_signal() also, adds to the "count"
+		 * counter and issue a wakeup.
+		 */
+		__u64 count;
+		struct eventfd_mask mask;
+	};
 	unsigned int flags;
 };
 
@@ -55,6 +64,9 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 {
 	unsigned long flags;
 
+	/* This function should never be used with eventfd in the mask mode. */
+	BUG_ON(ctx->flags & EFD_MASK);
+
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	if (ULLONG_MAX - ctx->count < n)
 		n = ULLONG_MAX - ctx->count;
@@ -123,12 +135,16 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &ctx->wqh, wait);
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	if (ctx->count > 0)
-		events |= POLLIN;
-	if (ctx->count == ULLONG_MAX)
-		events |= POLLERR;
-	if (ULLONG_MAX - 1 > ctx->count)
-		events |= POLLOUT;
+	if (ctx->flags & EFD_MASK) {
+		events = ctx->mask.events;
+	} else {
+		if (ctx->count > 0)
+			events |= POLLIN;
+		if (ctx->count == ULLONG_MAX)
+			events |= POLLERR;
+		if (ULLONG_MAX - 1 > ctx->count)
+			events |= POLLOUT;
+	}
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return events;
@@ -158,6 +174,9 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
 {
 	unsigned long flags;
 
+	/* This function should never be used with eventfd in the mask mode. */
+	BUG_ON(ctx->flags & EFD_MASK);
+
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	eventfd_ctx_do_read(ctx, cnt);
 	__remove_wait_queue(&ctx->wqh, wait);
@@ -188,6 +207,9 @@ ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
 	ssize_t res;
 	DECLARE_WAITQUEUE(wait, current);
 
+	/* This function should never be used with eventfd in the mask mode. */
+	BUG_ON(ctx->flags & EFD_MASK);
+
 	spin_lock_irq(&ctx->wqh.lock);
 	*cnt = 0;
 	res = -EAGAIN;
@@ -230,13 +252,23 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 	ssize_t res;
 	__u64 cnt;
 
-	if (count < sizeof(cnt))
-		return -EINVAL;
-	res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
-	if (res < 0)
+	if (ctx->flags & EFD_MASK) {
+		spin_lock_irq(&ctx->wqh.lock);
+		if (count < sizeof(ctx->mask))
+			return -EINVAL;
+		res = copy_to_user(buf, &ctx->mask, sizeof(ctx->mask)) ?
+			-EFAULT : sizeof(ctx->mask);
+		spin_unlock_irq(&ctx->wqh.lock);
 		return res;
-
-	return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
+	} else {
+		if (count < sizeof(cnt))
+			return -EINVAL;
+		res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
+		if (res < 0)
+			return res;
+		return put_user(cnt, (__u64 __user *) buf) ?
+			-EFAULT : sizeof(cnt);
+	}
 }
 
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
@@ -247,6 +279,21 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	__u64 ucnt;
 	DECLARE_WAITQUEUE(wait, current);
 
+	if (ctx->flags & EFD_MASK) {
+		if (count < sizeof(ctx->mask))
+			return -EINVAL;
+		spin_lock_irq(&ctx->wqh.lock);
+		if (copy_from_user(&ctx->mask, buf, sizeof(ctx->mask))) {
+			spin_unlock_irq(&ctx->wqh.lock);
+			return -EFAULT;
+		}
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked_poll(&ctx->wqh,
+				(unsigned long)ctx->mask.events);
+		spin_unlock_irq(&ctx->wqh.lock);
+		return sizeof(ctx->mask);
+	}
+
 	if (count < sizeof(ucnt))
 		return -EINVAL;
 	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
@@ -293,8 +340,13 @@ static int eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 	int ret;
 
 	spin_lock_irq(&ctx->wqh.lock);
-	ret = seq_printf(m, "eventfd-count: %16llx\n",
-			 (unsigned long long)ctx->count);
+	if (ctx->flags & EFD_MASK) {
+		ret = seq_printf(m, "eventfd-mask: %x\n",
+				 (unsigned)ctx->mask.events);
+	} else {
+		ret = seq_printf(m, "eventfd-count: %16llx\n",
+				 (unsigned long long)ctx->count);
+	}
 	spin_unlock_irq(&ctx->wqh.lock);
 
 	return ret;
@@ -412,7 +464,12 @@ struct file *eventfd_file_create(unsigned int count, int flags)
 
 	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
-	ctx->count = count;
+	if (flags & EFD_MASK) {
+		ctx->mask.events = 0;
+		ctx->mask.ptr = NULL;
+	} else {
+		ctx->count = count;
+	}
 	ctx->flags = flags;
 
 	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 3c3ef19..b806d2b 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -20,11 +20,12 @@
  * shared O_* flags.
  */
 #define EFD_SEMAPHORE (1 << 0)
+#define EFD_MASK (1 << 1)
 #define EFD_CLOEXEC O_CLOEXEC
 #define EFD_NONBLOCK O_NONBLOCK
 
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
-#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
+#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_MASK)
 
 #ifdef CONFIG_EVENTFD
 
-- 
1.7.4.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2013-02-15  2:45 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-02-07  6:41 [PATCH 1/1] eventfd: implementation of EFD_MASK flag Martin Sustrik
2013-02-07 19:12 ` Andy Lutomirski
2013-02-07 20:11   ` Martin Sustrik
2013-02-08  1:03     ` Andy Lutomirski
2013-02-08  5:26       ` Martin Sustrik
2013-02-08  6:36         ` Andy Lutomirski
2013-02-08  6:55           ` Martin Sustrik
2013-02-08 22:08       ` Eric Wong
2013-02-09  3:26         ` Martin Sustrik
2013-02-07 22:44 ` Andrew Morton
2013-02-07 23:30   ` Martin Sustrik
2013-02-08 12:43   ` Martin Sustrik
2013-02-08 22:21     ` Eric Wong
2013-02-09  2:40       ` Martin Sustrik
2013-02-09  3:54         ` Eric Wong
2013-02-09  7:36           ` Martin Sustrik
2013-02-09 11:51             ` Eric Wong
2013-02-09 12:04               ` Martin Sustrik
2013-02-07 23:29 Martin Sustrik
2013-02-15  2:45 ` Michał Mirosław
2013-02-15  2:45   ` Michał Mirosław

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.