linux-kernel.vger.kernel.org archive mirror
* [PATCH] async poll for 2.5
@ 2002-10-14 22:36 Shailabh Nagar
  2002-10-14 22:54 ` John Myers
  2002-10-15 15:05 ` Benjamin LaHaise
  0 siblings, 2 replies; 138+ messages in thread
From: Shailabh Nagar @ 2002-10-14 22:36 UTC (permalink / raw)
  To: linux-kernel, linux-aio
  Cc: Andrew Morton, Ben LaHaise, David Miller, Linus Torvalds,
	Stephen Tweedie


As of today, there is no scalable alternative to poll/select in the 2.5
kernel even though the topic has been discussed a number of times
before. The case for a scalable poll has been made often so I won't
get into that.

Attached is a port of the 2.4 async poll code to 2.5.41, written by
David Stevens with assistance from Jay Vosburgh and Mingming Cao (a
port for 2.5.42 is in progress and will be posted shortly). The
patch is a clean port of the 2.4 design and eliminates the use of
worktodos, just as Ben had done through the do_hack() function. The
patch has been tested on 2.5.41 using simple poll tests. A performance
evaluation and further testing is underway.

Even though Ben has indicated, on linux-aio and in OLS, that the 2.4
design doesn't scale well enough, it is a lot better than normal poll.
With the absence of alternatives and the impending feature freeze,
this patch would be one way to ensure that users have at least one
alternative to regular poll.

Ben, are you working on a different async poll implementation that is
likely to be ready by the feature freeze ?

Regards,
Shailabh


[-- Attachment #2: aiopoll-2.5.41-5.patch --]
[-- Type: text/plain, Size: 8694 bytes --]

diff -ruN linux-2.5.41/fs/aio.c linux-2.5.41AIO/fs/aio.c
--- linux-2.5.41/fs/aio.c	Mon Oct  7 11:24:13 2002
+++ linux-2.5.41AIO/fs/aio.c	Mon Oct 14 12:27:05 2002
@@ -59,6 +59,8 @@
 static spinlock_t	fput_lock = SPIN_LOCK_UNLOCKED;
 LIST_HEAD(fput_head);
 
+int async_poll(struct kiocb *iocb, int events);
+
 /* aio_setup
  *	Creates the slab caches used by the aio routines, panic on
  *	failure as this is done early during the boot sequence.
@@ -893,6 +895,19 @@
 	return -EINVAL;
 }
 
+ssize_t generic_aio_poll(struct file *file, struct kiocb *req, struct iocb *iocb)
+{
+	unsigned events = iocb->aio_buf;
+
+	/* Did the user set any bits they weren't supposed to? (The
+	 * assignment above is actually a truncating cast.)
+	 */
+	if (unlikely(events != iocb->aio_buf))
+		return -EINVAL;
+	
+	return async_poll(req, events);
+}
+
 static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
 				  struct iocb *iocb));
 static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
@@ -978,12 +993,15 @@
 		if (file->f_op->aio_fsync)
 			ret = file->f_op->aio_fsync(req, 0);
 		break;
+	case IOCB_CMD_POLL:
+		ret = generic_aio_poll(file, req, iocb);
+		break;
 	default:
 		dprintk("EINVAL: io_submit: no operation provided\n");
 		ret = -EINVAL;
 	}
 
-	if (likely(EIOCBQUEUED == ret))
+	if (likely(-EIOCBQUEUED == ret))
 		return 0;
 	if (ret >= 0) {
 		aio_complete(req, ret, 0);
diff -ruN linux-2.5.41/fs/select.c linux-2.5.41AIO/fs/select.c
--- linux-2.5.41/fs/select.c	Mon Oct  7 11:23:21 2002
+++ linux-2.5.41AIO/fs/select.c	Mon Oct 14 13:39:58 2002
@@ -20,6 +20,8 @@
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/init.h>
 
 #include <asm/uaccess.h>
 
@@ -27,19 +29,34 @@
 #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 
 struct poll_table_entry {
-	struct file * filp;
 	wait_queue_t wait;
 	wait_queue_head_t * wait_address;
+	struct file * filp;
+	poll_table *p;
 };
 
 struct poll_table_page {
+	unsigned long size;
 	struct poll_table_page * next;
 	struct poll_table_entry * entry;
 	struct poll_table_entry entries[0];
 };
 
 #define POLL_TABLE_FULL(table) \
-	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
+	((unsigned long)((table)->entry+1) > \
+	 (table)->size + (unsigned long)(table))
+
+/* async poll uses only one entry per poll table as it is linked to an iocb */
+typedef struct async_poll_table_struct {
+	poll_table		pt;		
+	int			events;		/* event mask for async poll */
+	int			wake;
+	long			sync;
+	struct poll_table_page	pt_page;	/* one poll table page hdr */
+	struct poll_table_entry entries[1];	/* space for a single entry */
+} async_poll_table;
+
+static kmem_cache_t *async_poll_table_cache;
 
 /*
  * Ok, Peter made a complicated, but straightforward multiple_wait() function.
@@ -53,8 +70,7 @@
  * as all select/poll functions have to call it to add an entry to the
  * poll table.
  */
-
-void poll_freewait(poll_table* pt)
+void __poll_freewait(poll_table* pt, wait_queue_t *wait)
 {
 	struct poll_table_page * p = pt->table;
 	while (p) {
@@ -62,15 +78,141 @@
 		struct poll_table_page *old;
 
 		entry = p->entry;
+		if (entry == p->entries) /* may happen with async poll */
+			break;
 		do {
 			entry--;
-			remove_wait_queue(entry->wait_address,&entry->wait);
+			if (wait != &entry->wait)
+				remove_wait_queue(entry->wait_address,&entry->wait);
+			else
+				__remove_wait_queue(entry->wait_address,&entry->wait);
 			fput(entry->filp);
 		} while (entry > p->entries);
 		old = p;
 		p = p->next;
-		free_page((unsigned long) old);
+		if (old->size == PAGE_SIZE)
+			free_page((unsigned long) old);
 	}
+	if (pt->iocb)
+		kmem_cache_free(async_poll_table_cache, pt);
+}
+
+void poll_freewait(poll_table* pt)
+{
+	__poll_freewait(pt, NULL);
+}
+
+void async_poll_complete(void *data)
+{
+	async_poll_table *pasync = data;
+	poll_table *p = data;
+	struct kiocb	*iocb = p->iocb;
+	unsigned int	mask;
+
+	pasync->wake = 0;
+	wmb();
+	do {
+		mask = iocb->ki_filp->f_op->poll(iocb->ki_filp, p);
+		mask &= pasync->events | POLLERR | POLLHUP;
+		if (mask) {
+			poll_table *p2 = xchg(&iocb->ki_data, NULL);
+			if (p2) {
+				poll_freewait(p2); 
+				aio_complete(iocb, mask, 0);
+			}
+			return;
+		}
+		pasync->sync = 0;
+		wmb();
+	} while (pasync->wake);
+}
+
+static int async_poll_waiter(wait_queue_t *wait, unsigned mode, int sync)
+{
+	struct poll_table_entry *entry = (struct poll_table_entry *)wait;
+	async_poll_table *pasync = (async_poll_table *)(entry->p);
+	struct kiocb	*iocb;
+	unsigned int	mask;
+
+	iocb = pasync->pt.iocb;
+	mask = iocb->ki_filp->f_op->poll(iocb->ki_filp, NULL);
+	mask &= pasync->events | POLLERR | POLLHUP;
+	if (mask) {
+		poll_table *p2 = xchg(&iocb->ki_data, NULL);
+		if (p2) {
+			__poll_freewait(p2, wait); 
+			aio_complete(iocb, mask, 0);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+int async_poll_cancel(struct kiocb *iocb, struct io_event *res)
+{
+	poll_table *p;
+
+	/* FIXME: almost right */
+	p = xchg(&iocb->ki_data, NULL);
+	if (p) {
+		poll_freewait(p); 
+		aio_complete(iocb, 0, 0);
+		aio_put_req(iocb);
+		return 0;
+	}
+	aio_put_req(iocb);
+	return -EAGAIN;
+}
+
+int async_poll(struct kiocb *iocb, int events)
+{
+	unsigned int mask;
+	async_poll_table *pasync;
+	poll_table *p;
+
+	/* Fast path */
+	if (iocb->ki_filp->f_op && iocb->ki_filp->f_op->poll) {
+		mask = iocb->ki_filp->f_op->poll(iocb->ki_filp, NULL);
+		mask &= events | POLLERR | POLLHUP;
+		if (mask & events)
+			return events;
+	}
+
+	pasync = kmem_cache_alloc(async_poll_table_cache, SLAB_KERNEL);
+	if (!pasync)
+		return -ENOMEM;
+
+	p = (poll_table *)pasync;
+	poll_initwait(p);
+	p->iocb = iocb;
+	pasync->wake = 0;
+	pasync->sync = 0;
+	pasync->events = events;
+	pasync->pt_page.entry = pasync->pt_page.entries;
+	pasync->pt_page.size = sizeof(pasync->pt_page) + sizeof(pasync->entries);
+	pasync->pt_page.next = 0;
+	p->table = &pasync->pt_page;
+
+	iocb->ki_data = p;
+	wmb();
+	iocb->ki_cancel = async_poll_cancel;
+
+	mask = DEFAULT_POLLMASK;
+#warning broken
+	iocb->ki_users ++;
+	if (iocb->ki_filp->f_op && iocb->ki_filp->f_op->poll)
+		mask = iocb->ki_filp->f_op->poll(iocb->ki_filp, p);
+	mask &= events | POLLERR | POLLHUP;
+	if (mask && !test_and_set_bit(0, &pasync->sync))
+		aio_complete(iocb, mask, 0);
+
+	if (aio_put_req(iocb))
+		/* Must be freed after aio_complete to synchronise with 
+		 * cancellation of the request.
+		 */
+		poll_freewait(p);
+
+	return -EIOCBQUEUED;
 }
 
 void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
@@ -86,6 +228,7 @@
 			__set_current_state(TASK_RUNNING);
 			return;
 		}
+		new_table->size = PAGE_SIZE;
 		new_table->entry = new_table->entries;
 		new_table->next = table;
 		p->table = new_table;
@@ -99,7 +242,11 @@
 	 	get_file(filp);
 	 	entry->filp = filp;
 		entry->wait_address = wait_address;
-		init_waitqueue_entry(&entry->wait, current);
+		entry->p = p;
+		if (p->iocb) /* async poll */
+			init_waitqueue_func_entry(&entry->wait, async_poll_waiter);
+		else
+			init_waitqueue_entry(&entry->wait, current);
 		add_wait_queue(wait_address,&entry->wait);
 	}
 }
@@ -495,3 +642,14 @@
 	poll_freewait(&table);
 	return err;
 }
+
+static int __init async_poll_init(void)
+{
+	async_poll_table_cache = kmem_cache_create("async poll table",
+                        sizeof(async_poll_table), 0, 0, NULL, NULL);
+	if (!async_poll_table_cache)
+		panic("unable to alloc poll_table_cache");
+	return 0;
+}
+
+module_init(async_poll_init);
diff -ruN linux-2.5.41/include/linux/aio_abi.h linux-2.5.41AIO/include/linux/aio_abi.h
--- linux-2.5.41/include/linux/aio_abi.h	Mon Oct  7 11:23:28 2002
+++ linux-2.5.41AIO/include/linux/aio_abi.h	Mon Oct  7 16:33:36 2002
@@ -40,6 +40,7 @@
 	 * IOCB_CMD_PREADX = 4,
 	 * IOCB_CMD_POLL = 5,
 	 */
+	IOCB_CMD_POLL = 5,
 	IOCB_CMD_NOOP = 6,
 };
 
diff -ruN linux-2.5.41/include/linux/poll.h linux-2.5.41AIO/include/linux/poll.h
--- linux-2.5.41/include/linux/poll.h	Mon Oct  7 11:24:12 2002
+++ linux-2.5.41AIO/include/linux/poll.h	Tue Oct  8 12:06:44 2002
@@ -9,12 +9,14 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <asm/uaccess.h>
+#include <linux/workqueue.h>
 
 struct poll_table_page;
 
 typedef struct poll_table_struct {
-	int error;
-	struct poll_table_page * table;
+	int			error;
+	struct poll_table_page	*table;
+	struct kiocb		*iocb;		/* iocb for async poll */
 } poll_table;
 
 extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p);
@@ -29,6 +31,7 @@
 {
 	pt->error = 0;
 	pt->table = NULL;
+	pt->iocb = NULL;
 }
 extern void poll_freewait(poll_table* pt);
 


* Re: [PATCH] async poll for 2.5
  2002-10-14 22:36 [PATCH] async poll for 2.5 Shailabh Nagar
@ 2002-10-14 22:54 ` John Myers
  2002-10-15 15:05 ` Benjamin LaHaise
  1 sibling, 0 replies; 138+ messages in thread
From: John Myers @ 2002-10-14 22:54 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: linux-kernel, linux-aio, Andrew Morton, Ben LaHaise,
	David Miller, Linus Torvalds, Stephen Tweedie


Shailabh Nagar wrote:

> The patch has been tested on 2.5.41 using simple poll tests.

Your patch has numerous races, a kiocb leak, and incorrectly calls 
aio_complete() upon cancellation.  Please see the patch I sent to 
linux-aio@kvack.org on September 29.



* Re: [PATCH] async poll for 2.5
  2002-10-14 22:36 [PATCH] async poll for 2.5 Shailabh Nagar
  2002-10-14 22:54 ` John Myers
@ 2002-10-15 15:05 ` Benjamin LaHaise
  2002-10-15 17:06   ` Dan Kegel
  2002-10-15 17:38   ` [PATCH] async poll for 2.5 Shailabh Nagar
  1 sibling, 2 replies; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-15 15:05 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On Mon, Oct 14, 2002 at 06:36:45PM -0400, Shailabh Nagar wrote:
> As of today, there is no scalable alternative to poll/select in the 2.5
> kernel even though the topic has been discussed a number of times
> before. The case for a scalable poll has been made often so I won't
> get into that.

Have you bothered addressing the fact that async poll scales worse than 
/dev/epoll?  That was the original reason for dropping it.

		-ben


* Re: [PATCH] async poll for 2.5
  2002-10-15 17:06   ` Dan Kegel
@ 2002-10-15 17:03     ` Benjamin LaHaise
  2002-10-15 17:18       ` Dan Kegel
  2002-10-15 18:09     ` Shailabh Nagar
  1 sibling, 1 reply; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-15 17:03 UTC (permalink / raw)
  To: Dan Kegel
  Cc: Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Tue, Oct 15, 2002 at 10:06:22AM -0700, Dan Kegel wrote:
> Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
> alternative?

No.

		-ben


* Re: [PATCH] async poll for 2.5
  2002-10-15 15:05 ` Benjamin LaHaise
@ 2002-10-15 17:06   ` Dan Kegel
  2002-10-15 17:03     ` Benjamin LaHaise
  2002-10-15 18:09     ` Shailabh Nagar
  2002-10-15 17:38   ` [PATCH] async poll for 2.5 Shailabh Nagar
  1 sibling, 2 replies; 138+ messages in thread
From: Dan Kegel @ 2002-10-15 17:06 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

Benjamin LaHaise wrote:
> 
> On Mon, Oct 14, 2002 at 06:36:45PM -0400, Shailabh Nagar wrote:
> > As of today, there is no scalable alternative to poll/select in the 2.5
> > kernel even though the topic has been discussed a number of times
> > before. The case for a scalable poll has been made often so I won't
> > get into that.
> 
> Have you bothered addressing the fact that async poll scales worse than
> /dev/epoll?  That was the original reason for dropping it.

Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
alternative?  It's in 2.5 as far as I know.  It does suffer from using
the signal queue, but it's in production use on servers that handle
many thousands of concurrent connections, so it's pretty scalable.
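
For context, the mechanism being discussed looks roughly like this (a
minimal sketch; handle_event() is a placeholder, and a real program must
also catch SIGIO as the fallback for signal-queue overflow):

	/* per-fd setup: route readiness events to a queued rt signal */
	fcntl(fd, F_SETOWN, getpid());
	fcntl(fd, F_SETSIG, SIGRTMIN + 1);
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC | O_NONBLOCK);

	/* event loop: collect queued signals instead of calling poll() */
	sigset_t set;
	siginfo_t info;

	sigemptyset(&set);
	sigaddset(&set, SIGRTMIN + 1);
	sigprocmask(SIG_BLOCK, &set, NULL);
	for (;;) {
		if (sigwaitinfo(&set, &info) < 0)
			continue;	/* interrupted */
		handle_event(info.si_fd, info.si_band);	/* fd + poll flags */
	}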

- Dan


* Re: [PATCH] async poll for 2.5
  2002-10-15 17:03     ` Benjamin LaHaise
@ 2002-10-15 17:18       ` Dan Kegel
  2002-10-16  2:11         ` Lincoln Dale
  0 siblings, 1 reply; 138+ messages in thread
From: Dan Kegel @ 2002-10-15 17:18 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

Benjamin LaHaise wrote:
> 
> On Tue, Oct 15, 2002 at 10:06:22AM -0700, Dan Kegel wrote:
> > Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
> > alternative?
> 
> No.

What's the worst part about it?  The use of the signal queue?
- Dan


* Re: [PATCH] async poll for 2.5
  2002-10-15 15:05 ` Benjamin LaHaise
  2002-10-15 17:06   ` Dan Kegel
@ 2002-10-15 17:38   ` Shailabh Nagar
  2002-10-15 17:50     ` Benjamin LaHaise
  1 sibling, 1 reply; 138+ messages in thread
From: Shailabh Nagar @ 2002-10-15 17:38 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

Benjamin LaHaise wrote:

>On Mon, Oct 14, 2002 at 06:36:45PM -0400, Shailabh Nagar wrote:
>
>>As of today, there is no scalable alternative to poll/select in the 2.5
>>kernel even though the topic has been discussed a number of times
>>before. The case for a scalable poll has been made often so I won't
>>get into that.
>>
>
>Have you bothered addressing the fact that async poll scales worse than 
>/dev/epoll?  That was the original reason for dropping it.
>
>		-ben
>
Hi Ben,

I didn't address async poll's scalability vs. /dev/epoll because
/dev/epoll isn't in the kernel either. I wasn't sure whether you had a
better async poll in the making.

Either solution (/dev/epoll or async poll) would be a considerable 
improvement over what we have today.

So I guess the question would now be: what's keeping /dev/epoll from
being included in the kernel, given the time left before the feature freeze?

-- Shailabh






* Re: [PATCH] async poll for 2.5
  2002-10-15 17:38   ` [PATCH] async poll for 2.5 Shailabh Nagar
@ 2002-10-15 17:50     ` Benjamin LaHaise
  2002-10-15 18:16       ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-15 17:50 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
> So I guess the question would now be: what's keeping /dev/epoll from
> being included in the kernel, given the time left before the feature freeze?

We don't need yet another event reporting mechanism as /dev/epoll presents.  
I was thinking it should just be its own syscall but report its events in 
the same way as aio.

		-ben
-- 
"Do you seek knowledge in time travel?"


* Re: [PATCH] async poll for 2.5
  2002-10-15 17:06   ` Dan Kegel
  2002-10-15 17:03     ` Benjamin LaHaise
@ 2002-10-15 18:09     ` Shailabh Nagar
  2002-10-15 18:53       ` Dan Kegel
  1 sibling, 1 reply; 138+ messages in thread
From: Shailabh Nagar @ 2002-10-15 18:09 UTC (permalink / raw)
  To: Dan Kegel
  Cc: Benjamin LaHaise, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

Dan Kegel wrote:

>Benjamin LaHaise wrote:
>
>>On Mon, Oct 14, 2002 at 06:36:45PM -0400, Shailabh Nagar wrote:
>>
>>>As of today, there is no scalable alternative to poll/select in the 2.5
>>>kernel even though the topic has been discussed a number of times
>>>before. The case for a scalable poll has been made often so I won't
>>>get into that.
>>>
>>Have you bothered addressing the fact that async poll scales worse than
>>/dev/epoll?  That was the original reason for dropping it.
>>
>
>Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
>alternative?  It's in 2.5 as far as I know.  It does suffer from using
>the signal queue, but it's in production use on servers that handle
>many thousands of concurrent connections, so it's pretty scalable.
>
>- Dan
>

Dan,

Are there any performance numbers for F_SETSIG/F_SETOWN/SIGIO on 2.5?
Does it scale with the number of active connections too?

Signal-per-fd seems to be a decent alternative (from the measurements on 
Davide's /dev/epoll page) but Vitaly Luban's patch for that isn't available
for 2.5 so I'm not sure what other issues it might have.

-- Shailabh








* Re: [PATCH] async poll for 2.5
  2002-10-15 17:50     ` Benjamin LaHaise
@ 2002-10-15 18:16       ` Davide Libenzi
  2002-10-15 18:18         ` Shailabh Nagar
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 18:16 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Tue, 15 Oct 2002, Benjamin LaHaise wrote:

> On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
> > So I guess the question would now be: whats keeping /dev/epoll from
> > being included in the kernel given the time left before the feature freeze ?
>
> We don't need yet another event reporting mechanism as /dev/epoll presents.
> I was thinking it should just be its own syscall but report its events in
> the same way as aio.

Yes, Linus ( like myself ) hates magic inodes inside /dev. At that time it
was the fastest way to have a kernel interface exposed w/out having to beg
for a syscall. I'm all for a new syscall obviously, and IMHO /dev/epoll
might be a nice complement to AIO for specific applications.




- Davide




* Re: [PATCH] async poll for 2.5
  2002-10-15 18:16       ` Davide Libenzi
@ 2002-10-15 18:18         ` Shailabh Nagar
  2002-10-15 19:00           ` Davide Libenzi
  2002-10-15 19:02           ` Davide Libenzi
  0 siblings, 2 replies; 138+ messages in thread
From: Shailabh Nagar @ 2002-10-15 18:18 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

Davide Libenzi wrote:
> On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> 
> 
>>On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
>>
>>>So I guess the question would now be: whats keeping /dev/epoll from
>>>being included in the kernel given the time left before the feature freeze ?
>>
>>We don't need yet another event reporting mechanism as /dev/epoll presents.
>>I was thinking it should just be its own syscall but report its events in
>>the same way as aio.
> 
> 
> Yes, Linus ( like myself ) hates magic inodes inside /dev. At that time it
> was the fastest way to have a kernel interface exposed w/out having to beg
> for a syscall. I'm all for a new syscall obviously, and IMHO /dev/epoll
> might be a nice complement to AIO for specific applications.


So what would the syscall look like?  Could you give a few more details on the interface?

- Shailabh






* Re: [PATCH] async poll for 2.5
  2002-10-15 18:09     ` Shailabh Nagar
@ 2002-10-15 18:53       ` Dan Kegel
  2002-10-15 18:57         ` Benjamin LaHaise
  0 siblings, 1 reply; 138+ messages in thread
From: Dan Kegel @ 2002-10-15 18:53 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Benjamin LaHaise, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

Shailabh Nagar wrote:
> Are there any performance numbers for F_SETSIG/F_SETOWN/SIGIO on 2.5?

I don't recall any.  (I suppose I should get off my butt and start
running 2.5.)

> Does it scale with the number of active connections too?

The overhead is probably mostly linear with activity, but I don't know.
 
> Signal-per-fd seems to be a decent alternative (from the measurements on
> Davide's /dev/epoll page) but Vitaly Luban's patch for that isn't available
> for 2.5 so I'm not sure what other issues it might have.

Seems like the thing to do is to move /dev/epoll over to use
Ben's event system rather than worry about the old /dev/epoll interface.
But like signal-per-fd, we will want to collapse readiness events,
which is something Ben's event system might not do naturally.
(I wouldn't know -- I haven't actually looked at Ben's code.)

- Dan


* Re: [PATCH] async poll for 2.5
  2002-10-15 18:53       ` Dan Kegel
@ 2002-10-15 18:57         ` Benjamin LaHaise
  2002-10-15 20:25           ` John Gardiner Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-15 18:57 UTC (permalink / raw)
  To: Dan Kegel
  Cc: Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Tue, Oct 15, 2002 at 11:53:48AM -0700, Dan Kegel wrote:
> Seems like the thing to do is to move /dev/epoll over to use
> Ben's event system rather than worry about the old /dev/epoll interface.
> But like signal-per-fd, we will want to collapse readiness events,
> which is something Ben's event system might not do naturally.
> (I wouldn't know -- I haven't actually looked at Ben's code.)

If you look at how /dev/epoll does it, the collapsing of readiness 
events is very elegant: a given fd is only allowed to report a change 
in its state once per run through the event loop.  The ioctl that swaps 
event buffers acts as a barrier between the two possible reports.
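
For reference, the consumer side of that scheme, reconstructed from
Davide's 2.4 /dev/epoll patch (the EP_ALLOC/EP_POLL ioctls, EP_MAP_SIZE
and struct evpoll fields are recalled from that patch and may not match
its final revision; handle_event() is a placeholder):

	int kdpfd = open("/dev/epoll", O_RDWR);
	char *map;
	struct pollfd pfd, *events;
	struct evpoll evp;
	int i, nfds;

	ioctl(kdpfd, EP_ALLOC, maxfds);
	map = mmap(NULL, EP_MAP_SIZE(maxfds), PROT_READ,
		   MAP_PRIVATE, kdpfd, 0);

	/* register interest by writing a pollfd; events = 0 removes it */
	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT | POLLERR | POLLHUP;
	pfd.revents = 0;
	write(kdpfd, &pfd, sizeof(pfd));

	for (;;) {
		evp.ep_timeout = -1;	/* wait indefinitely (assumed) */
		evp.ep_resoff = 0;
		nfds = ioctl(kdpfd, EP_POLL, &evp);	/* the swap barrier */
		/* this batch sits in one half of the mapped window; the
		 * kernel now collapses new reports into the other half */
		events = (struct pollfd *)(map + evp.ep_resoff);
		for (i = 0; i < nfds; i++)
			handle_event(&events[i]);
	}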

As to how this would interact with the aio event loops, I thought the 
"barrier" syscall could be the point at which aio event slots are reserved 
and freed.  Interest registration would be the other syscall (which would 
naturally have to reserve an event for the descriptor in the current set 
of readiness notifications).  Anyways, just a few thoughts...

		-ben
-- 
"Do you seek knowledge in time travel?"


* Re: [PATCH] async poll for 2.5
  2002-10-15 19:02             ` Benjamin LaHaise
@ 2002-10-15 18:59               ` Shailabh Nagar
  2002-10-15 19:16               ` Davide Libenzi
  1 sibling, 0 replies; 138+ messages in thread
From: Shailabh Nagar @ 2002-10-15 18:59 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Davide Libenzi, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

Benjamin LaHaise wrote:
> On Tue, Oct 15, 2002 at 12:00:30PM -0700, Davide Libenzi wrote:
> 
>>Something like this might work :
>>
>>int sys_epoll_create(int maxfds);
>>void sys_epoll_close(int epd);
>>int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
>>
>>where sys_epoll_wait() return the number of events available, 0 for
>>timeout, -1 for error.
> 
> 
> There's no reason to make epoll_wait a new syscall -- poll events can 
> easily be returned via the aio_complete mechanism (with the existing 
> aio_poll experiment as a possible means for doing so).


So a user would set up an ioctx and use io_getevents to retrieve events on
an interest set of fds created and manipulated through the new system calls?

-- Shailabh



* Re: [PATCH] async poll for 2.5
  2002-10-15 18:18         ` Shailabh Nagar
@ 2002-10-15 19:00           ` Davide Libenzi
  2002-10-15 19:02             ` Benjamin LaHaise
  2002-10-15 19:02           ` Davide Libenzi
  1 sibling, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 19:00 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Benjamin LaHaise, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Tue, 15 Oct 2002, Shailabh Nagar wrote:

> Davide Libenzi wrote:
> > On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> >
> >
> >>On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
> >>
> >>>So I guess the question would now be: whats keeping /dev/epoll from
> >>>being included in the kernel given the time left before the feature freeze ?
> >>
> >>We don't need yet another event reporting mechanism as /dev/epoll presents.
> >>I was thinking it should just be its own syscall but report its events in
> >>the same way as aio.
> >
> >
> > Yes, Linus ( like myself ) hates magic inodes inside /dev. At that time it
> > was the fastest way to have a kernel interface exposed w/out having to beg
> > for a syscall. I'm all for a new syscall obviously, and IMHO /dev/epoll
> > might be a nice complement to AIO for specific applications.
>
>
> So what would the syscall look like?  Could you give a few more details on the interface?

Something like this might work:

int sys_epoll_create(int maxfds);
void sys_epoll_close(int epd);
int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);

where sys_epoll_wait() return the number of events available, 0 for
timeout, -1 for error.




- Davide




* Re: [PATCH] async poll for 2.5
  2002-10-15 19:00           ` Davide Libenzi
@ 2002-10-15 19:02             ` Benjamin LaHaise
  2002-10-15 18:59               ` Shailabh Nagar
  2002-10-15 19:16               ` Davide Libenzi
  0 siblings, 2 replies; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-15 19:02 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Tue, Oct 15, 2002 at 12:00:30PM -0700, Davide Libenzi wrote:
> Something like this might work :
> 
> int sys_epoll_create(int maxfds);
> void sys_epoll_close(int epd);
> int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> 
> where sys_epoll_wait() return the number of events available, 0 for
> timeout, -1 for error.

There's no reason to make epoll_wait a new syscall -- poll events can 
easily be returned via the aio_complete mechanism (with the existing 
aio_poll experiment as a possible means for doing so).

		-ben


* Re: [PATCH] async poll for 2.5
  2002-10-15 18:18         ` Shailabh Nagar
  2002-10-15 19:00           ` Davide Libenzi
@ 2002-10-15 19:02           ` Davide Libenzi
  1 sibling, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 19:02 UTC (permalink / raw)
  To: Shailabh Nagar
  Cc: Benjamin LaHaise, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Tue, 15 Oct 2002, Shailabh Nagar wrote:

> Davide Libenzi wrote:
> > On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> >
> >
> >>On Tue, Oct 15, 2002 at 01:38:53PM -0400, Shailabh Nagar wrote:
> >>
> >>>So I guess the question would now be: whats keeping /dev/epoll from
> >>>being included in the kernel given the time left before the feature freeze ?
> >>
> >>We don't need yet another event reporting mechanism as /dev/epoll presents.
> >>I was thinking it should just be its own syscall but report its events in
> >>the same way as aio.
> >
> >
> > Yes, Linus ( like myself ) hates magic inodes inside /dev. At that time it
> > was the fastest way to have a kernel interface exposed w/out having to beg
> > for a syscall. I'm all for a new syscall obviously, and IMHO /dev/epoll
> > might be a nice complement to AIO for specific applications.
>
>
> So what would the syscall look like?  Could you give a few more details on the interface?

Since I guess that w/out the ability to add fds to the monitored set
the API would be useless ...

int sys_epoll_addfd(int epd, int fd);
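
Putting the four proposed calls together, usage might look something like
this (purely hypothetical, since none of these syscalls exist yet;
handle_event() is a placeholder):

	int i, nevts;
	struct pollfd *evts;
	int epd = sys_epoll_create(maxfds);

	sys_epoll_addfd(epd, fd);	/* register each fd as it is born */

	for (;;) {
		/* the kernel returns a pointer into the shared event
		 * window; only the pointer itself is copied out */
		nevts = sys_epoll_wait(epd, &evts, -1);
		for (i = 0; i < nevts; i++)
			handle_event(evts[i].fd, evts[i].revents);
	}
	sys_epoll_close(epd);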




- Davide




* Re: [PATCH] async poll for 2.5
  2002-10-15 19:16               ` Davide Libenzi
@ 2002-10-15 19:12                 ` Benjamin LaHaise
  2002-10-15 19:31                   ` Davide Libenzi
  2002-10-15 20:36                   ` John Gardiner Myers
  0 siblings, 2 replies; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-15 19:12 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Tue, Oct 15, 2002 at 12:16:39PM -0700, Davide Libenzi wrote:
> Ben, one of the reasons for /dev/epoll's speed is how it returns events
> and how it collapses them. A memory mapped array is divided in two and
> while the user consumes events in one set, the kernel fills the other one.
> The next wait() will switch the pointers. There is no copy from kernel to
> user space. Doing:
> 
> int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> 
> the only data the kernel has to copy to userspace is the 4(8) bytes for
> the "pevts" pointer.

Erm, the aio interface has support for the event ringbuffer being accessed 
by userspace (it lives in user memory and the kernel acts as a writer, with 
userspace as a reader), that's one of its advantages -- completion events 
are directly accessible from userspace after being written to by an 
interrupt.  Ideally this is to be wrapped in a vsyscall, but we don't have 
support for that yet on x86, although much of the code written for x86-64 
should be reusable.
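
A sketch of that user-space fast path, assuming the struct aio_ring
layout from 2.5's fs/aio.c and that the io_context_t handle is the
mapped address of the ring (memory barriers omitted):

	struct aio_ring {
		unsigned	id;
		unsigned	nr;	/* ring size, in events */
		unsigned	head;	/* consumed by user space */
		unsigned	tail;	/* written by the kernel */
		unsigned	magic;
		unsigned	compat_features;
		unsigned	incompat_features;
		unsigned	header_length;
		struct io_event	io_events[0];
	};

	/* returns 1 if an event was consumed without entering the kernel */
	int user_getevent(io_context_t ctx, struct io_event *ev)
	{
		struct aio_ring *ring = (struct aio_ring *)ctx;

		if (ring->head == ring->tail)
			return 0;	/* empty; fall back to io_getevents() */
		*ev = ring->io_events[ring->head];
		ring->head = (ring->head + 1) % ring->nr;
		return 1;
	}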

		-ben


* Re: [PATCH] async poll for 2.5
  2002-10-15 19:02             ` Benjamin LaHaise
  2002-10-15 18:59               ` Shailabh Nagar
@ 2002-10-15 19:16               ` Davide Libenzi
  2002-10-15 19:12                 ` Benjamin LaHaise
  1 sibling, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 19:16 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Tue, 15 Oct 2002, Benjamin LaHaise wrote:

> On Tue, Oct 15, 2002 at 12:00:30PM -0700, Davide Libenzi wrote:
> > Something like this might work :
> >
> > int sys_epoll_create(int maxfds);
> > void sys_epoll_close(int epd);
> > int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> >
> > where sys_epoll_wait() return the number of events available, 0 for
> > timeout, -1 for error.
>
> There's no reason to make epoll_wait a new syscall -- poll events can
> easily be returned via the aio_complete mechanism (with the existing
> aio_poll experiment as a possible means for doing so).

Ben, one of the reasons for /dev/epoll's speed is how it returns events
and how it collapses them. A memory mapped array is divided in two and
while the user consumes events in one set, the kernel fills the other one.
The next wait() will switch the pointers. There is no copy from kernel to
user space. Doing:

int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);

the only data the kernel has to copy to userspace is the 4(8) bytes for
the "pevts" pointer.



- Davide




* Re: [PATCH] async poll for 2.5
  2002-10-15 19:12                 ` Benjamin LaHaise
@ 2002-10-15 19:31                   ` Davide Libenzi
  2002-10-15 19:38                     ` Dan Kegel
  2002-10-15 20:36                   ` John Gardiner Myers
  1 sibling, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 19:31 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Tue, 15 Oct 2002, Benjamin LaHaise wrote:

> On Tue, Oct 15, 2002 at 12:16:39PM -0700, Davide Libenzi wrote:
> > Ben, one of the reasons of the /dev/epoll speed is how it returns events
> > and how it collapses them. A memory mapped array is divided by two and
> > while the user consumes events in one set, the kernel fill the other one.
> > The next wait() will switch the pointers. There is no copy from kernel to
> > user space. Doing :
> >
> > int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> >
> > the only data the kernel has to copy to userspace is the 4(8) bytes for
> > the "pevts" pointer.
>
> Erm, the aio interface has support for the event ringbuffer being accessed
> by userspace (it lives in user memory and the kernel acts as a writer, with
> userspace as a reader), that's one of its advantages -- completion events
> are directly accessible from userspace after being written to by an
> interrupt.  Ideally this is to be wrapped in a vsyscall, but we don't have
> support for that yet on x86, although much of the code written for x86-64
> should be reusable.

In general I would like to have a "common" interface to retrieve IO
events, but IMHO the two solutions should be benchmarked before adopting
one or the other.



- Davide




* Re: [PATCH] async poll for 2.5
  2002-10-15 19:31                   ` Davide Libenzi
@ 2002-10-15 19:38                     ` Dan Kegel
  2002-10-15 19:55                       ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: Dan Kegel @ 2002-10-15 19:38 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Shailabh Nagar, linux-kernel, linux-aio,
	Andrew Morton, David Miller, Linus Torvalds, Stephen Tweedie

Davide Libenzi wrote:
> 
> On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> 
> > On Tue, Oct 15, 2002 at 12:16:39PM -0700, Davide Libenzi wrote:
> > > Ben, one of the reasons of the /dev/epoll speed is how it returns events
> > > and how it collapses them. A memory mapped array is divided by two and
> > > while the user consumes events in one set, the kernel fill the other one.
> > > The next wait() will switch the pointers. There is no copy from kernel to
> > > user space. Doing :
> > >
> > > int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> > >
> > > the only data the kernel has to copy to userspace is the 4(8) bytes for
> > > the "pevts" pointer.
> >
> > Erm, the aio interface has support for the event ringbuffer being accessed
> > by userspace (it lives in user memory and the kernel acts as a writer, with
> > userspace as a reader), that's one of its advantages -- completion events
> > are directly accessible from userspace after being written to by an
> > interrupt.  Ideally this is to be wrapped in a vsyscall, but we don't have
> > support for that yet on x86, although much of the code written for x86-64
> > should be reusable.
> 
> In general I would like to have a "common" interface to retrieve IO
> events, but IMHO the two solutions should be benchmarked before adopting
> one or the other.

Seems like /dev/epoll uses a double-buffering scheme rather than
a ring buffer, and this is not just a trivial difference; it's
related to how redundant events are collapsed, right?
- Dan


* Re: [PATCH] async poll for 2.5
  2002-10-15 19:38                     ` Dan Kegel
@ 2002-10-15 19:55                       ` Davide Libenzi
  0 siblings, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 19:55 UTC (permalink / raw)
  To: Dan Kegel
  Cc: Benjamin LaHaise, Shailabh Nagar, linux-kernel, linux-aio,
	Andrew Morton, David Miller, Linus Torvalds, Stephen Tweedie

On Tue, 15 Oct 2002, Dan Kegel wrote:

> Davide Libenzi wrote:
> >
> > On Tue, 15 Oct 2002, Benjamin LaHaise wrote:
> >
> > > On Tue, Oct 15, 2002 at 12:16:39PM -0700, Davide Libenzi wrote:
> > > > Ben, one of the reasons of the /dev/epoll speed is how it returns events
> > > > and how it collapses them. A memory mapped array is divided by two and
> > > > while the user consumes events in one set, the kernel fill the other one.
> > > > The next wait() will switch the pointers. There is no copy from kernel to
> > > > user space. Doing :
> > > >
> > > > int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);
> > > >
> > > > the only data the kernel has to copy to userspace is the 4(8) bytes for
> > > > the "pevts" pointer.
> > >
> > > Erm, the aio interface has support for the event ringbuffer being accessed
> > > by userspace (it lives in user memory and the kernel acts as a writer, with
> > > userspace as a reader), that's one of its advantages -- completion events
> > > are directly accessible from userspace after being written to by an
> > > interrupt.  Ideally this is to be wrapped in a vsyscall, but we don't have
> > > support for that yet on x86, although much of the code written for x86-64
> > > should be reusable.
> >
> > In general I would like to have a "common" interface to retrieve IO
> > events, but IMHO the two solutions should be benchmarked before adopting
> > the one or the other.
>
> Seems like /dev/epoll uses a double-buffering scheme rather than
> a ring buffer, and this is not just a trivial difference; it's
> related to how redundant events are collapsed, right?

It's just a matter of implementation. With a double buffer you clearly
have two distinct working zones, one being the user zone and the other
the kernel zone. With a ring buffer you have to mark the area that is
currently returned as the event set to the user and keep the kernel from
overflowing into that area. The double buffer is probably faster and
easier to implement ( for event collapsing ).
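
A toy illustration of the double-buffer side (conceptual only, not the
actual /dev/epoll code):

	/* kernel side: one mapped window, split into two zones */
	static struct pollfd *half[2];	/* the two halves of the window */
	static int kzone;		/* zone the kernel is filling */
	static int nfilled;		/* events collapsed into it so far */

	/* called from wait(): flip zones and hand the filled half to the
	 * user; no event data is copied, only the pointer changes */
	static struct pollfd *swap_zones(int *nevents)
	{
		*nevents = nfilled;
		nfilled = 0;
		kzone ^= 1;
		return half[kzone ^ 1];
	}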




- Davide





* Re: [PATCH] async poll for 2.5
  2002-10-15 18:57         ` Benjamin LaHaise
@ 2002-10-15 20:25           ` John Gardiner Myers
  2002-10-15 21:09             ` Dan Kegel
  2002-10-15 21:11             ` Davide Libenzi
  0 siblings, 2 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-15 20:25 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Dan Kegel, Shailabh Nagar, linux-kernel, linux-aio,
	Andrew Morton, David Miller, Linus Torvalds, Stephen Tweedie


Benjamin LaHaise wrote:

>If you look at how /dev/epoll does it, the collapsing of readiness 
>events is very elegant: a given fd is only allowed to report a change 
>in its state once per run through the event loop.
>
And the way /dev/epoll does it has a key flaw: it only works with single 
threaded callers.  If you have multiple threads simultaneously trying to 
get events, then race conditions abound.

>The ioctl that swaps 
>event buffers acts as a barrier between the two possible reports.
>
Which assumes there are only single threaded callers.  To work correctly 
with multithreaded callers, there needs to be a more explicit mechanism 
for a caller to indicate it has completed handling an event and wants to 
rearm its interest.

There are also additional interactions with cancellation.  How does the 
cancellation interface report and handle the case where an associated 
event is being delivered or handled by another thread?  What happens 
when that thread then tries to rearm the canceled interest?

I certainly hope /dev/epoll itself doesn't get accepted into the kernel, 
the interface is error prone.  Registering interest in a condition when 
the condition is already true should immediately generate an event, the 
epoll interface did not do that last time I saw it discussed.  This 
deficiency in the interface requires callers to include more complex 
workaround code and is likely to result in subtle, hard to diagnose bugs.



* Re: [PATCH] async poll for 2.5
  2002-10-15 19:12                 ` Benjamin LaHaise
  2002-10-15 19:31                   ` Davide Libenzi
@ 2002-10-15 20:36                   ` John Gardiner Myers
  2002-10-15 20:39                     ` Benjamin LaHaise
  1 sibling, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-15 20:36 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Davide Libenzi, Shailabh Nagar, linux-kernel, linux-aio,
	Andrew Morton, David Miller, Linus Torvalds, Stephen Tweedie


Benjamin LaHaise wrote:

>Erm, the aio interface has support for the event ringbuffer being accessed 
>by userspace
>
Making the event ringbuffer visible to userspace conflicts with being 
able to support event priorities.  To support event priorities, the 
ringbuffer would need to be replaced with some other data structure.




* Re: [PATCH] async poll for 2.5
  2002-10-15 20:36                   ` John Gardiner Myers
@ 2002-10-15 20:39                     ` Benjamin LaHaise
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-15 20:39 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Davide Libenzi, Shailabh Nagar, linux-kernel, linux-aio,
	Andrew Morton, David Miller, Linus Torvalds, Stephen Tweedie

On Tue, Oct 15, 2002 at 01:36:38PM -0700, John Gardiner Myers wrote:
> Benjamin LaHaise wrote:
> 
> >Erm, the aio interface has support for the event ringbuffer being accessed 
> >by userspace
> >
> Making the event ringbuffer visible to userspace conflicts with being 
> able to support event priorities.  To support event priorities, the 
> ringbuffer would need to be replaced with some other data structure.

No it does not.  Event priorities are easily accomplished via separate 
event queues for events of different priorities.  Most hardware implements 
event priorities in this fashion.
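
A sketch of that arrangement with the existing interface, assuming the
io_getevents(ctx, min_nr, nr, events, timeout) form and two contexts set
up by the caller with io_setup():

	io_context_t hi_ctx, lo_ctx;	/* one queue per priority level */

	/* Drain high-priority completions before low-priority ones; a
	 * zero timeout makes each check non-blocking, so the caller
	 * decides how to wait when both queues are empty. */
	int get_event_prioritized(struct io_event *ev)
	{
		struct timespec zero = { 0, 0 };

		if (io_getevents(hi_ctx, 0, 1, ev, &zero) == 1)
			return 1;
		return io_getevents(lo_ctx, 0, 1, ev, &zero) == 1;
	}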

		-ben
-- 
"Do you seek knowledge in time travel?"


* Re: [PATCH] async poll for 2.5
  2002-10-15 20:25           ` John Gardiner Myers
@ 2002-10-15 21:09             ` Dan Kegel
  2002-10-15 21:50               ` John Myers
  2002-10-15 21:11             ` Davide Libenzi
  1 sibling, 1 reply; 138+ messages in thread
From: Dan Kegel @ 2002-10-15 21:09 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Benjamin LaHaise, Shailabh Nagar, linux-kernel, linux-aio,
	Andrew Morton, David Miller, Linus Torvalds, Stephen Tweedie

John Gardiner Myers wrote:
> 
> Benjamin LaHaise wrote:
> 
> >If you look at how /dev/epoll does it, the collapsing of readiness
> >events is very elegant: a given fd is only allowed to report a change
> >in its state once per run through the event loop.
> >
> And the way /dev/epoll does it has a key flaw: it only works with single
> threaded callers.  If you have multiple threads simultaneously trying to
> get events, then race conditions abound.

Delaying the "get next batch of readiness events" call as long as
possible 
increases the amount of event collapsing possible, which is important
because 
the network stack seems to generate lots of spurious events.  Thus I
suspect 
you don't want multiple threads all calling the "get next batch of
events"
entry point frequently.
The most effective way to use something like /dev/epoll in a
multithreaded 
program might be to have one thread call "get next batch of events",
then divvy up the events across multiple threads.  
Thus I disagree that the way /dev/epoll does it is flawed.
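
A sketch of that divvying arrangement (get_next_events() is a
placeholder for the /dev/epoll wait; the queue is deliberately simple
and has no overflow check):

	#include <pthread.h>

	#define QSIZE 1024
	static struct pollfd evq[QSIZE];
	static unsigned qhead, qtail;
	static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  qcond = PTHREAD_COND_INITIALIZER;

	/* one thread fetches batches of readiness events... */
	void *fetcher(void *arg)
	{
		struct pollfd *evts;
		int i, n;

		for (;;) {
			n = get_next_events(&evts);	/* placeholder */
			pthread_mutex_lock(&qlock);
			for (i = 0; i < n; i++)
				evq[qtail++ % QSIZE] = evts[i];
			pthread_cond_broadcast(&qcond);
			pthread_mutex_unlock(&qlock);
		}
	}

	/* ...and N workers consume them, free to block in handle_event() */
	void *worker(void *arg)
	{
		struct pollfd ev;

		for (;;) {
			pthread_mutex_lock(&qlock);
			while (qhead == qtail)
				pthread_cond_wait(&qcond, &qlock);
			ev = evq[qhead++ % QSIZE];
			pthread_mutex_unlock(&qlock);
			handle_event(&ev);
		}
	}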

> I certainly hope /dev/epoll itself doesn't get accepted into the kernel,
> the interface is error prone.  Registering interest in a condition when
> the condition is already true should immediately generate an event, the
> epoll interface did not do that last time I saw it discussed.  This
> deficiency in the interface requires callers to include more complex
> workaround code and is likely to result in subtle, hard to diagnose bugs.

With queued readiness notification schemes like SIGIO and /dev/epoll,
it's safest to allow readiness notifications from the kernel to be wrong
sometimes; this happens at least in the case of accept readiness, and
possibly other places.  Once you allow that, it's easy to handle the
condition you're worried about by generating a spurious readiness
indication when registering a fd.  That's what I do in my wrapper
library.
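
The wrapper trick amounts to something like this (hypothetical names,
not Dan's actual library):

	/* Register interest and immediately queue a synthetic readiness
	 * event, so the caller always probes the fd once even if it
	 * became ready before the registration took effect. */
	void wrap_register(int epfd, int fd, short events)
	{
		kernel_register(epfd, fd, events); /* real registration */
		queue_user_event(fd, events);	   /* spurious indication */
	}

The consumer then treats every event, spurious or real, only as a hint
to retry the non-blocking operation until it returns EAGAIN.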

Also, because /dev/epoll and friends are single-shot notifications of
*changes* in readiness, there is little reason to register interest in
this or that event, and change that interest over time; instead, apps
should simply register interest in any event they might ever be
interested in.  The number of extra events they then have to ignore is
very small, since if you take no action on a 'read ready' event, no more
of those events will occur.

So I pretty much disagree all around :-) but I do understand where you're
coming from.  I used to feel similarly until I figured out the 'right'
way to use one-shot readiness notification systems (sometime last
week :-)

- Dan


* Re: [PATCH] async poll for 2.5
  2002-10-15 20:25           ` John Gardiner Myers
  2002-10-15 21:09             ` Dan Kegel
@ 2002-10-15 21:11             ` Davide Libenzi
  2002-10-15 22:01               ` John Gardiner Myers
  1 sibling, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 21:11 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Tue, 15 Oct 2002, John Gardiner Myers wrote:

> Benjamin LaHaise wrote:
>
> >If you look at how /dev/epoll does it, the collapsing of readiness
> >events is very elegant: a given fd is only allowed to report a change
> >in its state once per run through the event loop.
> >
> And the way /dev/epoll does it has a key flaw: it only works with single
> threaded callers.  If you have multiple threads simultaneously trying to
> get events, then race conditions abound.
>
> >The ioctl that swaps
> >event buffers acts as a barrier between the two possible reports.
> >
> Which assumes there are only single threaded callers.  To work correctly
> with multithreaded callers, there needs to be a more explicit mechanism
> for a caller to indicate it has completed handling an event and wants to
> rearm its interest.
>
> There are also additional interactions with cancellation.  How does the
> cancellation interface report and handle the case where an associated
> event is being delivered or handled by another thread?  What happens
> when that thread then tries to rearm the canceled interest?
>

Why would you need to use threads with a multiplex-like interface like
/dev/epoll? The point of these ( poll()/select()//dev/epoll//dev/poll )
interfaces is to be able to handle many file descriptors inside a _single_
task.



> I certainly hope /dev/epoll itself doesn't get accepted into the kernel,
> the interface is error prone.  Registering interest in a condition when
> the condition is already true should immediately generate an event, the
> epoll interface did not do that last time I saw it discussed.  This
> deficiency in the interface requires callers to include more complex
> workaround code and is likely to result in subtle, hard to diagnose bugs.

It works exactly like rt-signals and all you have to do is to change your
code from:

int myread(...) {

	if (wait(POLLIN))
		read();

}

to:

int myread(...) {

	while (read() == EAGAIN)
		wait(POLLIN);

}




- Davide




* Re: [PATCH] async poll for 2.5
  2002-10-15 21:09             ` Dan Kegel
@ 2002-10-15 21:50               ` John Myers
  2002-10-15 22:33                 ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: John Myers @ 2002-10-15 21:50 UTC (permalink / raw)
  To: Dan Kegel
  Cc: Benjamin LaHaise, Shailabh Nagar, linux-kernel, linux-aio,
	Andrew Morton, David Miller, Linus Torvalds, Stephen Tweedie


Dan Kegel wrote:

>The most effective way to use something like /dev/epoll in a
>multithreaded program might be to have one thread call "get next batch
>of events", then divvy up the events across multiple threads.
>
That is a tautology.  As /dev/epoll is inherently a single threaded
interface, of course the most effective way for a multithreaded program
to use it is to have one thread call it then divvy up the events.
That's the *only* way a multithreaded program can deal with a single
threaded interface.

The cost to divvy up the events can be substantial, not to mention the
cost of funneling them all through a single CPU.  This cost can easily
be greater than what one saves by combining events.  io_getevents() is a
great model for divvying events across multiple threads; the poll
facility should work with that model.

The solution for optimizing the amount of event collapsing is to 
implement concurrency control.

>Once you allow that, it's easy to handle the
>condition you're worried about by generating a spurious readiness
>indication when registering a fd.  That's what I do in my wrapper
>library.
>
This is a workaround.  You are adding additional code and complexity to 
the caller in order to deal with a deficiency in the interface.  This 
has several problems:

Someone writing to the /dev/epoll interface needs to know that they need 
to write such workaround code.  If they don't know to write the code or 
if they make an error in the workaround code, it is still likely that 
the program will pass testing.  What will result are intermittent, hard 
to diagnose failures in production.

User space does not have the information to address this as well as the 
kernel can.  As a result, the workaround requires more processing in the 
common, non racy case.

The kernel can clearly handle this situation better than user space.  It
can do the necessary checks while it is still holding the necessary
locks.  It is less error prone to handle the situation in the kernel.
The logic clearly belongs in the kernel.

>Also, because /dev/epoll and friends are single-shot notifications of
>*changes* in readiness, there is little reason to register interest in
>this or that event, and change that interest over time; instead,
>apps should simply register interest in any event they might ever
>be interested in.
>
You're making assumptions about the structure and flow of an 
application.  Sometimes when an application stops having interest in 
this or that event, it needs to free/invalidate the context associated 
with that event.

So that's fine as a strategy for amortizing the cost of 
registration/deregistration, but it isn't universally applicable.



* Re: [PATCH] async poll for 2.5
  2002-10-15 21:11             ` Davide Libenzi
@ 2002-10-15 22:01               ` John Gardiner Myers
  2002-10-15 22:27                 ` Davide Libenzi
  2002-10-16 20:03                 ` Dan Kegel
  0 siblings, 2 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-15 22:01 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie


Davide Libenzi wrote:

>Why would you need to use threads with a multiplex-like interface like
>/dev/epoll?
>
Because in some applications processing an event can cause the thread to 
block, potentially for a long time.  Multiple threads are needed to 
isolate that block to the context associated with the event.

>	while (read() == EAGAIN)
>		wait(POLLIN);
>  
>
Assuming registration of interest is inside wait(), this has a race.  If 
the file becomes readable between the time that read() returns and the 
time that wait() can register interest, the connection will hang.



* Re: [PATCH] async poll for 2.5
  2002-10-15 22:01               ` John Gardiner Myers
@ 2002-10-15 22:27                 ` Davide Libenzi
  2002-10-15 22:36                   ` John Gardiner Myers
  2002-10-16 20:03                 ` Dan Kegel
  1 sibling, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 22:27 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Tue, 15 Oct 2002, John Gardiner Myers wrote:

> Davide Libenzi wrote:
>
> >Why would you need to use threads with a multiplex-like interface like
> >/dev/epoll ?
> >
> Because in some applications processing an event can cause the thread to
> block, potentially for a long time.  Multiple threads are needed to
> isolate that block to the context associated with the event.

I don't want this to become the latest pro/against threads debate, but if
your processing thread blocks for a long time you should consider handling
the blocking condition asynchronously. If your processing thread blocks,
your application model should very likely be redesigned, or you should
just go with threads ( and you do not need any multiplex interface ).



> >	while (read() == EAGAIN)
> >		wait(POLLIN);
> >
> >
> Assuming registration of interest is inside wait(), this has a race.  If
> the file becomes readable between the time that read() returns and the
> time that wait() can register interest, the connection will hang.

Your assumption is wrong, the registration is done as soon as the fd is
"born" ( socket() or accept() for example ) and is typically removed when
it dies.



- Davide




* Re: [PATCH] async poll for 2.5
  2002-10-15 21:50               ` John Myers
@ 2002-10-15 22:33                 ` Davide Libenzi
  2002-10-15 22:56                   ` John Gardiner Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 22:33 UTC (permalink / raw)
  To: John Myers
  Cc: Dan Kegel, Benjamin LaHaise, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Tue, 15 Oct 2002, John Myers wrote:

> Dan Kegel wrote:
>
> >The most effective way to use something like /dev/epoll in a
> >multithreaded
> >program might be to have one thread call "get next batch of events",
> >then divvy up the events across multiple threads.
> >
> That is a tautology.  As /dev/epoll is inherently a single threaded
> interface, of course the most effective way for a multithreaded program
> to use it is to have one thread call it then divvy up the events.
>  That's the *only* way a multithreaded program can deal with a single
> threaded interface.
>
> The cost to divvy up the events can be substantial, not to mention the
> cost of funneling them all through a single CPU.  This cost can easily
> be greater than what one saves by combining events.  io_getevents() is a
> great model for divvying events across multiple threads; the poll
> facility should work with that model.
>
> The solution for optimizing the amount of event collapsing is to
> implement concurrency control.

There are many ways to have /dev/epoll working in a threaded environment if
you think about it, and no, you don't need to have a single thread fetching
events. You can have, if you really like threads, N fetching threads (
working on N private /dev/epoll fds ), feeding M queues, polled by P
service threads. If you like it ...
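
As a sketch ( epoll_fetch(), queue_push(), queue_pop() and handle_io()
are hypothetical names, and the queues are assumed to be thread safe ):

	/* N of these, each one owning a private /dev/epoll fd */
	void *fetcher(void *arg)
	{
		struct fetcher_ctx *c = arg;
		int i, n;

		for (;;) {
			n = epoll_fetch(c->epfd, c->evts, MAXEVTS);
			for (i = 0; i < n; i++)	/* spread over the M queues */
				queue_push(c->q[c->evts[i].fd % c->nq],
					   &c->evts[i]);
		}
	}

	/* P of these, popping from the M queues */
	void *service(void *arg)
	{
		struct queue *q = arg;

		for (;;) {
			struct pollfd *ev = queue_pop(q);	/* blocks */
			handle_io(ev->fd, ev->revents);
		}
	}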




- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 22:27                 ` Davide Libenzi
@ 2002-10-15 22:36                   ` John Gardiner Myers
  2002-10-15 22:41                     ` Benjamin LaHaise
                                       ` (2 more replies)
  0 siblings, 3 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-15 22:36 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 1284 bytes --]

Davide Libenzi wrote:

>I don't want this to become the latest pro/against threads debate, but if your
>processing thread blocks for a long time you should consider handling the
>blocking condition asynchronously. If your processing thread blocks, your
>application model should very likely be redesigned, or you just go with
>threads ( and you do not need any multiplex interface ).
>
Rewriting the code to handle the blocking condition asynchronously can 
be inordinately expensive and time consuming.  This is particularly true 
when using third party code (such as the system DNS resolver) which only 
has blocking interfaces.

A much more cost-effective and timely methodology is to code only the 
most important conditions asynchronously, leaving threads to handle the 
rest.

>Your assumption is wrong: the registration is done as soon as the fd is
>"born" ( at socket() or accept() for example ) and is typically removed when
>it dies.
>
Nonetheless, the requirement for user space to test the condition after 
the registration, not before, is subtle.  A program which does these in 
the wrong order is still likely to pass QA and will fail in production 
in a way that will be difficult to diagnose.  There is no rational 
reason for the kernel to not test the condition upon registration.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 22:36                   ` John Gardiner Myers
@ 2002-10-15 22:41                     ` Benjamin LaHaise
  2002-10-15 23:26                       ` John Gardiner Myers
  2002-10-15 23:05                     ` Davide Libenzi
  2002-10-16 19:59                     ` Dan Kegel
  2 siblings, 1 reply; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-15 22:41 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Davide Libenzi, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Tue, Oct 15, 2002 at 03:36:09PM -0700, John Gardiner Myers wrote:
> Nonetheless, the requirement for user space to test the condition after 
> the registration, not before, is subtle.  A program which does these in 
> the wrong order is still likely to pass QA and will fail in production 
> in a way that will be difficult to diagnose.  There is no rational 
> reason for the kernel to not test the condition upon registration.

I suppose one way of getting the async poll code up to snuff would be to 
cache the poll registration in the file descriptor.  Alternatively, the 
iocb could simply persist until it is cancelled or a refire is permitted 
(so that the event queue does not get overrun).  Would you care to try 
crunching the numbers with polltest on 2.5?

		-ben
-- 
"Do you seek knowledge in time travel?"

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 22:33                 ` Davide Libenzi
@ 2002-10-15 22:56                   ` John Gardiner Myers
  2002-10-15 23:23                     ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-15 22:56 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Dan Kegel, Benjamin LaHaise, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 476 bytes --]

Davide Libenzi wrote:

>There are many ways to have /dev/epoll working in a threaded environment if
>you think about it, and no, you don't need to have a single thread fetching
>events. You can have, if you really like threads, N fetching threads (
>working on N private /dev/epoll fds ), feeding M queues
>
In such models, you still have to pay the cost of divvying up the events 
after you receive them.  You also have to worry about keeping load 
distributed evenly enough.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 22:36                   ` John Gardiner Myers
  2002-10-15 22:41                     ` Benjamin LaHaise
@ 2002-10-15 23:05                     ` Davide Libenzi
  2002-10-15 23:33                       ` John Gardiner Myers
  2002-10-16 19:59                     ` Dan Kegel
  2 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 23:05 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Tue, 15 Oct 2002, John Gardiner Myers wrote:

> Nonetheless, the requirement for user space to test the condition after
> the registration, not before, is subtle.  A program which does these in
> the wrong order is still likely to pass QA and will fail in production
> in a way that will be difficult to diagnose.  There is no rational
> reason for the kernel to not test the condition upon registration.

All APIs have their own specifications and if you do not follow them, or
you're using a different interface just because the name looks similar
to other APIs, you're going to have problems. The problem is not inside
the API but inside the user ...



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 22:56                   ` John Gardiner Myers
@ 2002-10-15 23:23                     ` Davide Libenzi
  2002-10-16 19:16                       ` John Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-15 23:23 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Dan Kegel, Benjamin LaHaise, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Tue, 15 Oct 2002, John Gardiner Myers wrote:

> Davide Libenzi wrote:
>
> >There are many ways to have /dev/epoll working in a threaded environment if
> >you think about it, and no, you don't need to have a single thread fetching
> >events. You can have, if you really like threads, N fetching threads (
> >working on N private /dev/epoll fds ), feeding M queues
> >
> In such models, you still have to pay the cost of divvying up the events
> after you receive them.  You also have to worry about keeping load
> distributed evenly enough.

That's exactly the reason why you don't want to use many threads. Typical
applications that use multiplex interfaces use only one task ( possibly
one for each CPU ) that handles many connections. And again, you
can also use threads, if you design your application correctly. It is not
that expensive having service threads popping from an array. Yes, you have
a lock to be acquired but this is a price you have to pay as soon as you
choose threads.



- Davide




^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 22:41                     ` Benjamin LaHaise
@ 2002-10-15 23:26                       ` John Gardiner Myers
  0 siblings, 0 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-15 23:26 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Davide Libenzi, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 2285 bytes --]

Benjamin LaHaise wrote:

>I suppose one way of getting the async poll code up to snuff would be to 
>cache the poll registration in the file descriptor.  Alternatively, the 
>iocb could simply persist until it is cancelled or a refire is permitted 
>(so that the event queue does not get overrun).
>
First, can you confirm that your only problem with the async poll code 
is the fact that it doesn't amortize the registration/deregistration 
cost across multiple events?

I would say that the fact that the async poll code doesn't do this 
amortization is not sufficient reason to hold up the patch.  A 
non-amortizing async poll is sufficiently useful to warrant inclusion. 
 It is conceivable that some applications will not want to amortize some 
of their poll requests.  The non-amortizing interface can later be 
extended to an amortizing one by defining an "amortize me" bit in the 
events request mask.

I don't think caching the registration in the file descriptor is a good 
idea--there can be multiple registrations against a given fd.  The 
registration should be cached close to the iocb--either in the iocb or 
in some structure that directly references the iocb.

The model for multiple-events-per-iocb I was thinking of is as follows:

Add a concept of a "partial completion".  Unlike a normal completion, 
when a partial completion event fires, the iocb is not freed.  Instead 
the iocb goes into a "fired" state.

When an iocb is in the fired state, it will not generate any more 
completion events until it is "rearmed" by user space.  The method by 
which user space rearms an iocb is to be determined.  Upon rearming, the 
iocb is once again able to generate either a partial or normal 
completion.  As with submission, rearming can generate this completion 
immediately if the situation warrants.

Canceling an iocb in the rearmed state is the same as canceling an iocb 
that has never generated a completion event.  Canceling an iocb in the 
fired state returns a normal completion through the cancellation 
interface and returns a distinct error code (not 0 or -EAGAIN) to inform 
the caller that there is an outstanding event to be synchronized with.

Attempting to rearm a canceled iocb returns an error, probably -EAGAIN 
to be consistent with cancellation.
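
The state machine this implies, as a sketch ( the state names are mine,
nothing below exists in the patch ):

	enum iocb_state { SUBMITTED, FIRED, REARMED, CANCELED };

	/* SUBMITTED --partial completion--> FIRED
	 * FIRED     --user rearms---------> REARMED ( may fire again
	 *                                   immediately if warranted )
	 * REARMED   --cancel--------------> CANCELED, as if never fired
	 * FIRED     --cancel--------------> CANCELED; a normal completion
	 *                                   comes back through the
	 *                                   cancellation interface
	 * CANCELED  --rearm---------------> error, probably -EAGAIN
	 */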



[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 23:05                     ` Davide Libenzi
@ 2002-10-15 23:33                       ` John Gardiner Myers
  2002-10-16  0:05                         ` Davide Libenzi
  2002-10-16  2:45                         ` [PATCH] async poll for 2.5 Charles 'Buck' Krasic
  0 siblings, 2 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-15 23:33 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 643 bytes --]

Davide Libenzi wrote:

>All APIs have their own specifications and if you do not follow them, or
>you're using a different interface just because the name looks similar
>to other APIs, you're going to have problems. The problem is not inside
>the API but inside the user ...
>  
>
The epoll API is deficient--it is subtly error prone and it forces work 
on user space that is better done in the kernel.  That the API is 
specified in a deficient way does not make it any less deficient.

Again, there is no rational justification for the kernel to not test the 
condition upon registration.  There is ample justification for it to do so.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 23:33                       ` John Gardiner Myers
@ 2002-10-16  0:05                         ` Davide Libenzi
  2002-10-16  0:15                           ` John Myers
  2002-10-16  2:45                         ` [PATCH] async poll for 2.5 Charles 'Buck' Krasic
  1 sibling, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-16  0:05 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Tue, 15 Oct 2002, John Gardiner Myers wrote:

> The epoll API is deficient--it is subtly error prone and it forces work
> on user space that is better done in the kernel.  That the API is
> specified in a deficient way does not make it any less deficient.

Just a simple question : Have you ever used the RT-Signal API ? Is it the API
"deficient" or is it the user who does not understand it ? Do you know the
difference between level triggered ( poll() - select() - /dev/poll ) and
edge triggered ( /dev/epoll - RT-Signal ) interfaces ?




- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16  0:05                         ` Davide Libenzi
@ 2002-10-16  0:15                           ` John Myers
  2002-10-16 14:25                             ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: John Myers @ 2002-10-16  0:15 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 531 bytes --]

Davide Libenzi wrote:

>Just a simple question : Have you ever used the RT-Signal API ? Is it the API
>"deficient" [...] ?
>
No.  Yes.  The (fixed) size of the signal queue is far too small.  One 
either gets catastrophic failure on overload or one has to pay to do 
redundant accounting of interest.

>Do you know the
>difference between level triggered ( poll() - select() - /dev/poll ) and
>edge triggered ( /dev/epoll - RT-Signal ) interfaces ?
>  
>
Yes.  The registration of interest can itself be considered an edge 
condition.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 17:18       ` Dan Kegel
@ 2002-10-16  2:11         ` Lincoln Dale
  0 siblings, 0 replies; 138+ messages in thread
From: Lincoln Dale @ 2002-10-16  2:11 UTC (permalink / raw)
  To: Dan Kegel
  Cc: Benjamin LaHaise, Shailabh Nagar, linux-kernel, linux-aio,
	Andrew Morton, David Miller, Linus Torvalds, Stephen Tweedie

At 10:18 AM 15/10/2002 -0700, Dan Kegel wrote:
>Benjamin LaHaise wrote:
> >
> > On Tue, Oct 15, 2002 at 10:06:22AM -0700, Dan Kegel wrote:
> > > Doesn't the F_SETSIG/F_SETOWN/SIGIO stuff qualify as a scalable
> > > alternative?
> >
> > No.
>
>What's the worst part about it?  The use of the signal queue?

there are four things that really suck about sigio.
in order of most-significant-suckage to least-significant-suckage, i see 
them as:

  [1] signals are very heavy.
      thousands of signals/second do not scale on SMP due to their
      serialization in the kernel.
      (just look at the code path for delivering a signal)

      signals also resulted in 128 u32's being transferred from kernel
      to userspace for every signal.  that's a lot of memory i/o
      bandwidth consumed at 1000's of concurrent sockets
      and tens-of-thousands of events/sec happening.

  [2] SIGIO only exposes half of the POLL_OUT semantics.
      with poll(), you can use POLL_OUT to indicate if there
      is free buffer space to write into or not.
      with SIGIO, for most applications, you can only find out
      by issuing a write() and getting back -EWOULDBLOCK
      to indicate !POLL_OUT.
      (perhaps that has been addressed in the last 12 months or so;
      but i doubt it)

  [3] SIGIO had no easy recovery path if you hit the maximum-
      queue-limit for number of signals queued to userspace.
      (ok, you *could* do a poll() and start again, but it couldn't
      be done in a 100% race-free manner)

  [4] you couldn't enable SIGIO on an incoming socket
      accept()ed without there being a race window where
      something happened to that socket between accept() and
      enable-SIGIO.  [sort-of related to (3); you could work-around
      it by doing a poll() on the socket after enable-SIGIO, as in
      the sketch below, but it makes a clean interface a horrible
      interface]

other miscellaneous things that make SIGIO less usable in the real-world:
  - you can only get one event at a time -- that means tens-of-thousands
    to hundreds-of-thousands of system calls / second just to get event
    status, when it'd probably make more sense to poll for multiple signals
    at the same time
  - SIGIO only addressed "event notification".  it did nothing to address
    the other large scalability problem that you typically hit when writing
    high-performance i/o systems: the overhead of memory-copy between
    userspace and kernelspace.  various zerocopy mechanisms help address
    that side of things, but if you're comparing aio to SIGIO, aio *is*
    addressing a much larger problem than just SIGIO on its own
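
for reference, the enable-SIGIO setup looks roughly like this ( a
sketch; the trailing poll() is the work-around for the race in [4] ):

	#define _GNU_SOURCE		/* for F_SETSIG */
	#include <fcntl.h>
	#include <sys/poll.h>
	#include <unistd.h>

	static int enable_sigio(int fd, int rtsig)
	{
		struct pollfd pfd;

		fcntl(fd, F_SETOWN, getpid());	/* deliver signals to us */
		fcntl(fd, F_SETSIG, rtsig);	/* queue an rt-signal, not SIGIO */
		fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK | O_ASYNC);

		/* pick up whatever happened to the socket between accept()
		 * and the O_ASYNC above */
		pfd.fd = fd;
		pfd.events = POLLIN | POLLOUT;
		pfd.revents = 0;
		return poll(&pfd, 1, 0);
	}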


cheers,

lincoln.


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 23:33                       ` John Gardiner Myers
  2002-10-16  0:05                         ` Davide Libenzi
@ 2002-10-16  2:45                         ` Charles 'Buck' Krasic
  2002-10-16 14:28                           ` Davide Libenzi
  2002-10-16 18:29                           ` John Gardiner Myers
  1 sibling, 2 replies; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-16  2:45 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Davide Libenzi, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie


John Gardiner Myers <jgmyers@netscape.com> writes:

> The epoll API is deficient--it is subtly error prone and it forces
> work on user space that is better done in the kernel.  That the API
> is specified in a deficient way does not make it any less deficient.

You can argue that any API is subtly error prone.  The whole sockets
API is that way.  That's why the W. Richard Stevens network
programming books are such gems.  That's why having access to kernel
source is invaluable.  You have to pay attention to details to avoid
errors.

With /dev/epoll, it is perfectly feasible to write user level
wrapper libraries that help avoid the potential pitfalls.

I think it was Dan Kegel who has already mentioned one. 

I've written one myself, and I'm very confident in it.  I've written a
traffic generator application on top of my library that stresses the
Linux kernel protocol stack to the extreme.  It generates the
proverbial 10k cps, saturates gigabit networks, etc.

It has no problem running over /dev/epoll.  

IMHO, the code inside my wrapper library for the epoll case is
significantly easier to understand than the code for the case that
uses the legacy poll() interface.

If /dev/epoll were so error prone as you say it is, I think I would
have noticed it.  

-- Buck

ps If anybody cares, I can give them a pointer to my code.

> Again, there is no rational justification for the kernel to not test
> the condition upon registration.  There is ample justification for
> it to do so.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16  0:15                           ` John Myers
@ 2002-10-16 14:25                             ` Davide Libenzi
  2002-10-16 18:15                               ` John Gardiner Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-16 14:25 UTC (permalink / raw)
  To: John Myers
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Tue, 15 Oct 2002, John Myers wrote:

> Davide Libenzi wrote:
>
> >Just a simple question : Have you ever used the RT-Signal API ? Is it the API
> >"deficient" [...] ?
> >
> No.  Yes.  The (fixed) size of the signal queue is far too small.  One
> either gets catastrophic failure on overload or one has to pay to do
> redundant accounting of interest.
>
> >Do you know the
> >difference between level triggered ( poll() - select() - /dev/poll ) and
> >edge triggered ( /dev/epoll - RT-Signal ) interfaces ?
> >
> >
> Yes.  The registration of interest can itself be considered an edge
> condition.

I knew you were going there, aka you do not understand how edge triggered
APIs have to be used. Even if the API will drop an event at registration
time you still cannot use this code scheme :

int my_io(...) {

	if (event_wait(...))
		do_io(...);

}

You CAN NOT. And look, it is not an API problem, it's your problem that
you want to use the API as if it were a poll()-like API. The code scheme for an
edge triggered API is :

int my_io(...) {

	while (do_io(...) == EAGAIN)
		event_wait(...);

}

This is because you have to consume the I/O space to push the level to 0 so
that a 0->1 transition can happen and you can happily receive your
events.
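
On a nonblocking socket the scheme expands to something like ( a sketch;
consume() and event_wait() are placeholders ) :

	for (;;) {
		ssize_t n = read(fd, buf, sizeof(buf));

		if (n > 0) {
			consume(buf, n);	/* keep draining ... */
			continue;
		}
		if (n == 0 || errno != EAGAIN)
			break;			/* EOF or real error */
		event_wait(fd, POLLIN);		/* level is 0 now, the next
						 * 0->1 transition is visible */
	}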



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16  2:45                         ` [PATCH] async poll for 2.5 Charles 'Buck' Krasic
@ 2002-10-16 14:28                           ` Davide Libenzi
  2002-10-17 18:47                             ` Charles 'Buck' Krasic
  2002-10-16 18:29                           ` John Gardiner Myers
  1 sibling, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-16 14:28 UTC (permalink / raw)
  To: Charles 'Buck' Krasic
  Cc: John Gardiner Myers, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On 15 Oct 2002, Charles 'Buck' Krasic wrote:

>
> John Gardiner Myers <jgmyers@netscape.com> writes:
>
> > The epoll API is deficient--it is subtly error prone and it forces
> > work on user space that is better done in the kernel.  That the API
> > is specified in a deficient way does not make it any less deficient.
>
> You can argue that any API is subtly error prone.  The whole sockets
> API is that way.  That's why the W. Richard Stevens network
> programming books are such gems.  That's why having access to kernel
> source is invaluable.  You have to pay attention to details to avoid
> errors.
>
> With /dev/epoll, it is perfectly feasible to write user level
> wrapper libraries that help avoid the potential pitfalls.
>
> I think it was Dan Kegel who has already mentioned one.
>
> I've written one myself, and I'm very confident in it.  I've written a
> traffic generator application on top of my library that stresses the
> Linux kernel protocol stack to the extreme.  It generates the
> proverbial 10k cps, saturates gigabit networks, etc.
>
> It has no problem running over /dev/epoll.
>
> IMHO, the code inside my wrapper library for the epoll case is
> significantly easier to understand than the code for the case that
> uses the legacy poll() interface.
>
> If /dev/epoll were so error prone as you say it is, I think I would
> have noticed it.

The /dev/epoll usage is IMHO very simple. Once the I/O fd is created you
register it with POLLIN|POLLOUT and you leave it inside the monitor set
as long as it is needed ( mainly until you close() it ). It is not necessary to
continuously switch the event mask between POLLIN and POLLOUT. A hypothetical
syscall API should look like :

int sys_epoll_create(int maxfds);
void sys_epoll_close(int epd);
int sys_epoll_addfd(int epd, int fd, int evtmask);
int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);

with the option ( if benchmarks give positive results ), like Ben
suggested, of using the AIO event collector instead of sys_epoll_wait().
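
Usage would be something like ( a sketch against the hypothetical calls
above, nothing of this exists today; my_io() is the drain-to-EAGAIN
function from earlier in the thread ) :

	struct pollfd *evts;
	int i, nfds, epd = sys_epoll_create(MAXFDS);

	sys_epoll_addfd(epd, fd, POLLIN | POLLOUT);	/* once, at fd birth */
	for (;;) {
		nfds = sys_epoll_wait(epd, &evts, TIMEOUT);
		for (i = 0; i < nfds; i++)
			my_io(evts[i].fd);	/* drains to EAGAIN inside */
	}
	sys_epoll_close(epd);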




- Davide





^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16 14:25                             ` Davide Libenzi
@ 2002-10-16 18:15                               ` John Gardiner Myers
  2002-10-16 19:20                                 ` Davide Libenzi
  2002-10-16 20:06                                 ` [PATCH] async poll for 2.5 Mark Mielke
  0 siblings, 2 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-16 18:15 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 1107 bytes --]

Davide Libenzi wrote:

>I knew you were going there, aka you do not understand how edge triggered
>APIs have to be used.
>
Nonsense.

>Even if the API will drop an event at registration
>time you still cannot use this code scheme :
>
>int my_io(...) {
>
>	if (event_wait(...))
>		do_io(...);
>
>}
>
>You CAN NOT. And look, it is not an API problem, it's your problem that
>you want to use the API as if it were a poll()-like API.
>
You have insufficient basis upon which to claim I would write code as 
broken as above.

>This is because you have to consume the I/O space to push the level to 0 so
>that a 0->1 transition can happen and you can happily receive your
>events.
>  
>
Of course you have to consume the I/O space to push the level to 0. 
 What do you think I am, stupid?

This is done with something like:

for (;;) {
     fd = event_wait(...);
     while (do_io(fd) != EAGAIN);
}

Trying to do as much work as one can at once on a given fd helps keep 
that fd's context information in cache.  If one needs to have the fd 
yield the CPU in order to reduce system latency, one generates a 
user-mode event.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16  2:45                         ` [PATCH] async poll for 2.5 Charles 'Buck' Krasic
  2002-10-16 14:28                           ` Davide Libenzi
@ 2002-10-16 18:29                           ` John Gardiner Myers
  2002-10-16 20:39                             ` Charles 'Buck' Krasic
  2002-10-21 16:58                             ` [PATCH] async poll for 2.5 Alan Cox
  1 sibling, 2 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-16 18:29 UTC (permalink / raw)
  To: Charles 'Buck' Krasic
  Cc: Davide Libenzi, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 779 bytes --]

Charles 'Buck' Krasic wrote:

>You can argue that any API is subtly error prone.
>
You can also argue that the earth is flat.  It's just that some 
arguments have more basis than others.

>With /dev/epoll, it is perfectly feasible to write user level
>wrapper libraries that help avoid the potential pitfalls.
>
In other words, you don't deny the problem.  Instead, you work around it 
in user space.

Better to fix the API.  The kernel has more information than user space 
and can do a better job.  In the kernel, the problem can be fixed once 
and for all, not over and over again in each different wrapper library. 
 It's not even as if the change would break programs correctly written 
to the old API, not that we particularly care about programs written to 
the old API.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 23:23                     ` Davide Libenzi
@ 2002-10-16 19:16                       ` John Myers
  0 siblings, 0 replies; 138+ messages in thread
From: John Myers @ 2002-10-16 19:16 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Dan Kegel, Benjamin LaHaise, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 1315 bytes --]

Davide Libenzi wrote:

>Typical
>applications that use multiplex interfaces use only one task ( possibly
>one for each CPU ) that handles many connections.
>
As I mentioned before, this only works if you have the luxury of being 
able to write your application and its supporting libraries from 
scratch.  If you don't have that luxury, you need additional threads in 
order to isolate the latency caused by blocking operations.

>Yes, you have
>a lock to be acquired but this is a price you have to pay as soon as you
>choose threads.
>  
>
You have to pay for the lock needed to receive the events.  What I am 
objecting to is having to pay again for a second lock (and condition 
variable) to redistribute those events from the thread they were 
received on to the thread they will be processed on.  An interface that 
supports multithreading well will permit events to be delivered directly 
to the threads that need to process them.

io_getevents() is an example of an interface that supports multithreaded 
callers well.  Since the divvying up is done in the kernel, the 
information exists for its implementation to be later refined to be more 
intelligent about when to deliver which events to which threads.  For 
example, the interface can later implement CPU affinity of events and 
concurrency control.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16 18:15                               ` John Gardiner Myers
@ 2002-10-16 19:20                                 ` Davide Libenzi
  2002-10-16 23:31                                   ` epoll (was Re: [PATCH] async poll for 2.5) John Gardiner Myers
  2002-10-16 20:06                                 ` [PATCH] async poll for 2.5 Mark Mielke
  1 sibling, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-16 19:20 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Wed, 16 Oct 2002, John Gardiner Myers wrote:

> Davide Libenzi wrote:
>
> >I knew you were going there, aka you do not understand how edge triggered
> >APIs have to be used.
> >
> Nonsense.
>
> >Even if the API will drop an event at registration
> >time you still cannot use this code scheme :
> >
> >int my_io(...) {
> >
> >	if (event_wait(...))
> >		do_io(...);
> >
> >}
> >
> >You CAN NOT. And look, it is not an API problem, it's your problem that
> >you want to use the API as if it were a poll()-like API.
> >
> You have insufficient basis upon which to claim I would write code as
> broken as above.

Yes I have, look down 15 lines ...


> >This is because you have to consume the I/O space to push the level to 0 so
> >that a 0->1 transition can happen and you can happily receive your
> >events.
> >
> >
> Of course you have to consume the I/O space to push the level to 0.
>  What do you think I am, stupid?
>
> This is done with something like:
>
> for (;;) {
>      fd = event_wait(...);
>      while (do_io(fd) != EAGAIN);
> }

I told you that you did not understand the API, this code won't work for edge
triggered APIs. Please consider investigating a little bit more before
shooting at perfectly working APIs.




- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 22:36                   ` John Gardiner Myers
  2002-10-15 22:41                     ` Benjamin LaHaise
  2002-10-15 23:05                     ` Davide Libenzi
@ 2002-10-16 19:59                     ` Dan Kegel
  2 siblings, 0 replies; 138+ messages in thread
From: Dan Kegel @ 2002-10-16 19:59 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Davide Libenzi, Benjamin LaHaise, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

John Gardiner Myers wrote:
> Nonetheless, the requirement for user space to test the condition after 
> the registration, not before, is subtle.  A program which does these in 
> the wrong order is still likely to pass QA and will fail in production 
> in a way that will be difficult to diagnose.  There is no rational 
> reason for the kernel to not test the condition upon registration.

As long as we agree that the kernel may provide spurious readiness
notifications on occasion, I agree.  Then /dev/epoll can easily fulfill
this by signaling readiness on everything at registration; more
accurate notifications could be added later as an optimization.
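
A handler that tolerates spurious readiness is cheap anyway; a sketch
( consume() and handle_error() are placeholders ):

	void on_readable(int fd)
	{
		char buf[4096];
		ssize_t n;

		while ((n = read(fd, buf, sizeof(buf))) > 0)
			consume(buf, n);
		if (n < 0 && errno != EAGAIN)
			handle_error(fd);	/* EAGAIN is simply normal */
	}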

- Dan


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-15 22:01               ` John Gardiner Myers
  2002-10-15 22:27                 ` Davide Libenzi
@ 2002-10-16 20:03                 ` Dan Kegel
  2002-10-17 17:43                   ` epoll (was Re: [PATCH] async poll for 2.5) John Myers
  1 sibling, 1 reply; 138+ messages in thread
From: Dan Kegel @ 2002-10-16 20:03 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Davide Libenzi, Benjamin LaHaise, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

John Gardiner Myers wrote:
>>     while (read() == EAGAIN)
>>         wait(POLLIN);
>>
> Assuming registration of interest is inside wait(), this has a race.  If 
> the file becomes readable between the time that read() returns and the 
> time that wait() can register interest, the connection will hang.

Shouldn't the interest be rearmed inside read() when it returns EAGAIN?
That's how I do it in my wrapper library these days.
No reason to have a race.

- Dan


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16 18:15                               ` John Gardiner Myers
  2002-10-16 19:20                                 ` Davide Libenzi
@ 2002-10-16 20:06                                 ` Mark Mielke
  2002-10-16 23:48                                   ` epoll (was Re: [PATCH] async poll for 2.5) John Gardiner Myers
  1 sibling, 1 reply; 138+ messages in thread
From: Mark Mielke @ 2002-10-16 20:06 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Davide Libenzi, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On Wed, Oct 16, 2002 at 11:15:50AM -0700, John Gardiner Myers wrote:
> Davide Libenzi wrote:
> >This is because you have to consume the I/O space to push the level to 0 so
> >that a 0->1 transition can happen and you can happily receive your
> >events.
> This is done with something like:
> for (;;) {
>     fd = event_wait(...);
>     while (do_io(fd) != EAGAIN);
> }
> Trying to do at once as much work as one can on a given fd helps keep 
> that fd's context information in cache.  If one needs to have the fd 
> yield the CPU in order to reduce system latency, one generates a 
> user-mode event.

Not to enter into any of the other discussions on this issue, I wouldn't
usually do what you suggest above. Sure, for operations like accept() that
are inherently inefficient, I would loop until EAGAIN, but if I did
a recv() or read() of 2K, and I only received 1K, there is no reason why
another system call should be invoked on the resource that likely will not
have any data ready.

mark

-- 
mark@mielke.cc/markm@ncf.ca/markm@nortelnetworks.com __________________________
.  .  _  ._  . .   .__    .  . ._. .__ .   . . .__  | Neighbourhood Coder
|\/| |_| |_| |/    |_     |\/|  |  |_  |   |/  |_   | 
|  | | | | \ | \   |__ .  |  | .|. |__ |__ | \ |__  | Ottawa, Ontario, Canada

  One ring to rule them all, one ring to find them, one ring to bring them all
                       and in the darkness bind them...

                           http://mark.mielke.cc/


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16 18:29                           ` John Gardiner Myers
@ 2002-10-16 20:39                             ` Charles 'Buck' Krasic
  2002-10-17 17:59                               ` epoll (was Re: [PATCH] async poll for 2.5) John Gardiner Myers
  2002-10-21 16:58                             ` [PATCH] async poll for 2.5 Alan Cox
  1 sibling, 1 reply; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-16 20:39 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Davide Libenzi, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

John Gardiner Myers <jgmyers@netscape.com> writes:

> Charles 'Buck' Krasic wrote:
> 
> In other words, you don't deny the problem.  Instead, you work around
> it in user space.

Not exactly.  I'm saying that the context in which /dev/epoll is used
(at least originally) is non-blocking socket IO.  Anybody who has
worked with that API can tell you there are subtleties, and that if
they're ignored, they will certainly lead to pitfalls.  These are not the
fault of the /dev/epoll interface.

> Better to fix the API.  

> The kernel has more information than user space and can do a better
> job.  

I think we're talking at cross purposes. 

I know this is the AIO list, but I think epoll has value independent
of AIO.  They're complementary, not mutually exclusive.  

> In the kernel, the problem can be fixed once and for all, not
> over and over again in each different wrapper library. It's not even
> as if the change would break programs correctly written to the old
> API, not that we particularly care about programs written to the old
> API.

I agree if you are talking about AIO as a whole.  But epoll is more
limited in its scope; it really relates only to poll()/select(), not the
whole IO API.

-- Buck




^ permalink raw reply	[flat|nested] 138+ messages in thread

* epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-16 19:20                                 ` Davide Libenzi
@ 2002-10-16 23:31                                   ` John Gardiner Myers
  2002-10-16 23:51                                     ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-16 23:31 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 230 bytes --]

Davide Libenzi wrote:

>I told you that you did not understand the API, this code won't work for edge
>triggered APIs.
>
Nonsense.  If you wish to make such a claim, you need to provide an 
example of a situation in which it won't work.



[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-16 20:06                                 ` [PATCH] async poll for 2.5 Mark Mielke
@ 2002-10-16 23:48                                   ` John Gardiner Myers
  2002-10-17  0:23                                     ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-16 23:48 UTC (permalink / raw)
  To: Mark Mielke
  Cc: Davide Libenzi, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 841 bytes --]

Mark Mielke wrote:

>Not to enter into any of the other discussions on this issue, I wouldn't
>usually do what you suggest above. [...] if I did
>a recv() or read() of 2K, and I only received 1K, there is no reason why
>another system call should be invoked on the resource that likely will not
>have any data ready.
>  
>
You're into the minutiae here.  Sure, you can optimize the read() in 
some cases, but Mr. Libenzi's example of a correct code scheme is no 
better than mine when it comes to this.

In some situations, I've found that optimizing in the other direction is 
useful.  When a peer is feeding you data slowly you can increase 
throughput by having the thread block on read for a couple of 
milliseconds before going back to the pool.  A large part of that was 
that the particular event delivery subsystem was expensive.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-16 23:31                                   ` epoll (was Re: [PATCH] async poll for 2.5) John Gardiner Myers
@ 2002-10-16 23:51                                     ` Davide Libenzi
  2002-10-17 18:06                                       ` John Gardiner Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-16 23:51 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Wed, 16 Oct 2002, John Gardiner Myers wrote:

> Davide Libenzi wrote:
>
> >I told you that you did not understand the API, this code won't work for edge
> >triggered APIs.
> >
> Nonsense.  If you wish to make such a claim, you need to provide an
> example of a situation in which it won't work.

You're welcome. This is your code :

for (;;) {
     fd = event_wait(...);
     while (do_io(fd) != EAGAIN);
}

If the I/O space is not exhausted when you call event_wait(...); you'll
never receive the event because you'll be waiting for a 0->1 transition
without bringing the level to 0 ( I/O space exhausted ). That one is a
typical use of poll() - select() - /dev/poll and you showed pretty clearly
that you do not seem to understand edge triggered event APIs. If you code
your I/O function like :

int my_io(...) {

	if (event_wait(...))
		do_io(...);

}

and you consume only part of the I/O space with the first call to my_io(),
the second call will block _infinitely_.
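
Concretely, a sketch of the failure :

	/* peer sends 2048 bytes, once             level 0->1, event queued
	 * 1st my_io(): event_wait() returns,
	 *              do_io() reads 1024 bytes   level stays 1 ( data left )
	 * 2nd my_io(): event_wait() sleeps        no 0->1 transition will
	 *                                         ever come -> blocks forever
	 */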



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-16 23:48                                   ` epoll (was Re: [PATCH] async poll for 2.5) John Gardiner Myers
@ 2002-10-17  0:23                                     ` Davide Libenzi
  2002-10-17 17:45                                       ` John Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-17  0:23 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Mark Mielke, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On Wed, 16 Oct 2002, John Gardiner Myers wrote:

> Mark Mielke wrote:
>
> >Not to enter into any of the other discussions on this issue, I wouldn't
> >usually do what you suggest above. [...] if I did
> >a recv() or read() of 2K, and I only received 1K, there is no reason why
> >another system call should be invoked on the resource that likely will not
> >have any data ready.
> >
> >
> You're into the minutiae here.  Sure, you can optimize the read() in
> some cases, but Mr. Libenzi's example of a correct code scheme is no
> better than mine when it comes to this.

The poll()-like code :

int my_io(...) {

	if (poll(...))
		do_io(...);

}

The epoll-like code :

int my_io(...) {

	while (do_io(...) == EAGAIN)
		event_wait(...);

}

I would say that the epoll-like code generates fewer system calls because
if you call my_io() while processing small chunks of the I/O space, the
epoll-like code will generate only one system call while the poll()-like
code generates two. In the case of I/O that ends up waiting, the poll()-like
code generates two system calls while the epoll-like code generates three.
Globally the number of system calls is about the same and from a performance
point of view /dev/epoll looks "pretty good" ( see the /dev/epoll page ).




- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-16 20:03                 ` Dan Kegel
@ 2002-10-17 17:43                   ` John Myers
  2002-10-18 17:00                     ` Mark Mielke
  0 siblings, 1 reply; 138+ messages in thread
From: John Myers @ 2002-10-17 17:43 UTC (permalink / raw)
  To: Dan Kegel
  Cc: Davide Libenzi, Benjamin LaHaise, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 959 bytes --]

Dan Kegel wrote:

> As long as we agree that the kernel may provide spurious readiness
> notifications on occasion, I agree.

Great!  We agree!  Progress!

>>>     while (read() == EAGAIN)
>>>         wait(POLLIN);
>>>
>> Assuming registration of interest is inside wait(), this has a race.  
>> If the file becomes readable between the time that read() returns and 
>> the time that wait() can register interest, the connection will hang.
>
>
> Shouldn't the interest be rearmed inside read() when it returns EAGAIN?

The key phrase is "assuming registration of interest is inside wait()." 
 The code fragment didn't cover when registration of interest occurs. 
 If registration of interest occurs before the read() or if registration 
of interest while the fd is ready generates an event, there is no race. 
 If registration of interest occurs after the read() and registration of 
interest while the fd is ready does not generate an event, there is a race.
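
Side by side ( a sketch; register_interest() and wait_event() stand in 
for whatever primitives the API provides ):

	/* race free: interest exists before the draining read() */
	register_interest(fd, POLLIN);
	while (read(fd, buf, sizeof(buf)) < 0 && errno == EAGAIN)
		wait_event(fd);

	/* racy, if registering while ready generates no event */
	n = read(fd, buf, sizeof(buf));		/* -1/EAGAIN */
	/* <-- data may arrive here, before interest exists */
	register_interest(fd, POLLIN);
	wait_event(fd);				/* may hang forever */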


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-17  0:23                                     ` Davide Libenzi
@ 2002-10-17 17:45                                       ` John Myers
  0 siblings, 0 replies; 138+ messages in thread
From: John Myers @ 2002-10-17 17:45 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Mark Mielke, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 220 bytes --]

Davide Libenzi wrote:

>The poll()-like code :
>
>int my_io(...) {
>
>	if (poll(...))
>		do_io(...);
>
>}
>  
>
This is not my example of a correct code scheme.  You've made a strawman 
argument, which proves nothing.



[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-16 20:39                             ` Charles 'Buck' Krasic
@ 2002-10-17 17:59                               ` John Gardiner Myers
  0 siblings, 0 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-17 17:59 UTC (permalink / raw)
  To: Charles 'Buck' Krasic
  Cc: Davide Libenzi, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 822 bytes --]

Charles 'Buck' Krasic wrote:

>Not exactly.  I'm saying that the context in which /dev/epoll is used
>(at least originally) is non-blocking socket IO.  Anybody who has
>worked with that API can tell you there are subtleties, and that if
>they're ignored, they will certainly lead to pitfalls.  These are not the
>fault of the /dev/epoll interface.
>
The particular subtlety I am pointing out is the fault of the currently 
defined /dev/epoll interface and can be fixed by making a minor change 
to the /dev/epoll interface.

>I agree if you are talking about AIO as a whole.  But epoll is more
>limited in its scope; it really relates only to poll()/select(), not the
>whole IO API.
>  
>
My objection to the current /dev/epoll API does apply to said "limited 
scope"; it is not dependent on the scope that is "AIO as a whole."


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-16 23:51                                     ` Davide Libenzi
@ 2002-10-17 18:06                                       ` John Gardiner Myers
  2002-10-17 18:33                                         ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-17 18:06 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

[-- Attachment #1: Type: text/plain, Size: 981 bytes --]

Davide Libenzi wrote:

>>Nonsense.  If you wish to make such a claim, you need to provide an
>>example of a situation in which it won't work.
>>    
>>
>
>You're welcome. This is your code :
>
>for (;;) {
>     fd = event_wait(...);
>     while (do_io(fd) != EAGAIN);
>}
>
>If the I/O space is not exhausted when you call event_wait(...); you'll
>never receive the event because you'll be waiting for a 0->1 transition
>without bringing the level to 0 ( I/O space exhausted ).
>
My code above does exhaust the I/O space.

> That one is a
>typical use of poll() - select() - /dev/poll and you showed pretty clearly
>that you do not seem to understand edge triggered event APIs. If you code
>your I/O function like :
>
>int my_io(...) {
>
>	if (event_wait(...))
>		do_io(...);
>
>}
>
This is not how my example is coded.

while (do_io(...) != EAGAIN);

is not equivalent to:

do_io(...);

The former is guaranteed to exhaust the I/O space, the latter is not.

You're spouting nonsense.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3537 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-17 18:06                                       ` John Gardiner Myers
@ 2002-10-17 18:33                                         ` Davide Libenzi
  2002-10-18 19:02                                           ` John Gardiner Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-17 18:33 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Thu, 17 Oct 2002, John Gardiner Myers wrote:

> Davide Libenzi wrote:
>
> >>Nonsense.  If you wish to make such a claim, you need to provide an
> >>example of a situation in which it won't work.
> >>
> >>
> >
> >You're welcome. This is your code :
> >
> >for (;;) {
> >     fd = event_wait(...);
> >     while (do_io(fd) != EAGAIN);
> >}
> >
> >If the I/O space is not exhausted when you call event_wait(...); you'll
> >never receive the event because you'll be waiting for a 0->1 transition
> >without bringing the level to 0 ( I/O space exhausted ).
> >
> My code above does exhaust the I/O space.

Look, I'm usually very polite but you're really wasting my time. You
should know that an instruction at line N is usually executed before an
instruction at line N+1. Now this IS your code :

[N-1] for (;;) {
[N  ]     fd = event_wait(...);
[N+1]     while (do_io(fd) != EAGAIN);
[N+2] }

I will leave it to you as an exercise to understand what happens when you
call the first event_wait(...) and there is still data to be read/written
on the file descriptor. The reason you're asking /dev/epoll to drop an
event at fd insertion time shows very clearly that you're going to use the
API in the WRONG way and that you do not understand how such APIs work. And
the fact that there are users currently using the rt-sig and epoll APIs
means that either those guys are geniuses or you're missing something.




- Davide




^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16 14:28                           ` Davide Libenzi
@ 2002-10-17 18:47                             ` Charles 'Buck' Krasic
  2002-10-17 19:20                               ` Davide Libenzi
  2002-10-18  3:30                               ` Dan Kegel
  0 siblings, 2 replies; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-17 18:47 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: John Gardiner Myers, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie


Hi Davide,

On thinking about this a bit, I wonder if the evtmask isn't superfluous
in sys_epoll_addfd? (And in the existing epoll interface where the
application writes to /dev/epoll).

As you say, the normal usage will be to register for all events
anyway.  My wrapper library does exactly that.  As you say, not having
to continuously switch the mask is the simpler way to go.  If
registering for all events is the only sensible approach, the argument
isn't needed at all.

What do you think?  It's a minor detail, I know.

Taking the idea further, I would prefer that ALL non-blocking sockets
are automatically added to the epoll interest set if the application
has already called epoll_create().  Maybe that behaviour could be an
option to epoll_create().   

BTW, I'm not clear on another aspect of the API below, is there still
an mmap() for the pollfd buffers?   

-- Buck

Davide Libenzi <davidel@xmailserver.org> writes:

> The /dev/epoll usage is IMHO very simple. Once the I/O fd is created you
> register it with POLLIN|POLLOUT and you leave it inside the monitor set
> as long as it is needed ( mainly until you close() it ). It is not necessary to
> continuously switch the event mask between POLLIN and POLLOUT. A hypothetical
> syscall API should look like :

> int sys_epoll_create(int maxfds);
> void sys_epoll_close(int epd);
> int sys_epoll_addfd(int epd, int fd, int evtmask);
> int sys_epoll_wait(int epd, struct pollfd **pevts, int timeout);

> with the option ( if benchmarks give positive results ), like Ben
> suggested, of using the AIO event collector instead of sys_epoll_wait().

> - Davide

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-17 18:47                             ` Charles 'Buck' Krasic
@ 2002-10-17 19:20                               ` Davide Libenzi
  2002-10-18  3:30                               ` Dan Kegel
  1 sibling, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-17 19:20 UTC (permalink / raw)
  To: Charles 'Buck' Krasic
  Cc: John Gardiner Myers, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On 17 Oct 2002, Charles 'Buck' Krasic wrote:

>
> Hi Davide,
>
> On thinking about this a bit, I wonder if the evtmask isn't superfluous
> in sys_epoll_addfd? (And in the existing epoll interface where the
> application writes to /dev/epoll).
>
> As you say, the normal usage will be to register for all events
> anyway.  My wrapper library does exactly that.  As you say, not having
> to continuously switch the mask is the simpler way to go.  If
> registering for all events is the only sensible approach, the argument
> isn't needed at all.
>
> What do you think?  It's a minor detail, I know.

Even if it is the fastest way to use the API, I would still prefer such
behaviour to be encoded in wrapper libraries instead of inside the API
itself. Having a choice is usually better than not having it, if the cost
of having a choice is not too much ( and in this particular case it is not ).


> Taking the idea further, I would prefer that ALL non-blocking sockets
> are automatically added to the epoll interest set if the application
> has already called epoll_create().  Maybe that behaviour could be an
> option to epoll_create().

Same thing, I would leave this task to your my_socket() and my_accept().
I think what is really missing for /dev/epoll is an easy-to-use
interface library, so that users are not misled by the presence of "poll"
inside its name into using it like select()/poll().


> BTW, I'm not clear on another aspect of the API below: is there still
> an mmap() for the pollfd buffers?

Yes, it creates a mapping shared between the kernel and the user space.



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-17 18:47                             ` Charles 'Buck' Krasic
  2002-10-17 19:20                               ` Davide Libenzi
@ 2002-10-18  3:30                               ` Dan Kegel
  1 sibling, 0 replies; 138+ messages in thread
From: Dan Kegel @ 2002-10-18  3:30 UTC (permalink / raw)
  To: Charles 'Buck' Krasic
  Cc: Davide Libenzi, John Gardiner Myers, Benjamin LaHaise,
	Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

Charles 'Buck' Krasic wrote:
> On thinking about this a bit, I wonder if the evtmask isn't superfluous
> in sys_epoll_addfd? ... As you say, the normal usage will be to
> register for all events anyway.

I agree... but we might eventually have events that apps aren't
interested in.  No harm in letting the app specify an interest mask once.

> Taking the idea further, I would prefer that ALL non-blocking sockets
> are automatically added to the epoll interest set if the application
> has already called epoll_create().

That would prevent apps from having more than one i/o readiness
notification event source.  This is a problem for modular
software, where you try to combine libraries in a multithreaded
program.

- Dan



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-17 17:43                   ` epoll (was Re: [PATCH] async poll for 2.5) John Myers
@ 2002-10-18 17:00                     ` Mark Mielke
  2002-10-18 17:28                       ` Dan Kegel
  2002-10-18 18:55                       ` Chris Friesen
  0 siblings, 2 replies; 138+ messages in thread
From: Mark Mielke @ 2002-10-18 17:00 UTC (permalink / raw)
  To: John Myers
  Cc: Dan Kegel, Davide Libenzi, Benjamin LaHaise, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

> >>>    while (read() == EAGAIN)
> >>>        wait(POLLIN);

I find myself still not understanding this thread. Lots of examples of
code that should or should not be used, but I would always choose:

   ... ensure file descriptor is blocking ...
   for (;;) {
       int nread = read(...);
       ...
   }

Over the above, or any derivative of the above.

What would be the point of using an event notification mechanism for
synchronous reads with no other multiplexed options?

A 'proper' event loop is significantly more complicated. Since everybody
here knows this... I'm still confused...

mark

-- 
mark@mielke.cc/markm@ncf.ca/markm@nortelnetworks.com __________________________
.  .  _  ._  . .   .__    .  . ._. .__ .   . . .__  | Neighbourhood Coder
|\/| |_| |_| |/    |_     |\/|  |  |_  |   |/  |_   | 
|  | | | | \ | \   |__ .  |  | .|. |__ |__ | \ |__  | Ottawa, Ontario, Canada

  One ring to rule them all, one ring to find them, one ring to bring them all
                       and in the darkness bind them...

                           http://mark.mielke.cc/


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 17:00                     ` Mark Mielke
@ 2002-10-18 17:28                       ` Dan Kegel
  2002-10-18 17:41                         ` Davide Libenzi
  2002-10-18 18:55                       ` Chris Friesen
  1 sibling, 1 reply; 138+ messages in thread
From: Dan Kegel @ 2002-10-18 17:28 UTC (permalink / raw)
  To: Mark Mielke
  Cc: John Myers, Davide Libenzi, Benjamin LaHaise, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

Mark Mielke wrote:
>>>>>   while (read() == EAGAIN)
>>>>>       wait(POLLIN);
> 
> I find myself still not understanding this thread. Lots of examples of
> code that should or should not be used, but I would always choose:
> 
>    ... ensure file descriptor is blocking ...
>    for (;;) {
>        int nread = read(...);
>        ...
>    }
> 
> Over the above, or any derivative of the above.
> 
> What would be the point of using an event notification mechanism for
> synchronous reads with no other multiplexed options?
> 
> A 'proper' event loop is significantly more complicated. Since everybody
> here knows this... I'm still confused...

I was afraid someone would be confused by the examples.  Davide loves
coroutines (check out http://www.xmailserver.org/linux-patches/nio-improve.html )
and I think his examples are written in that style.  He really means
what you think he should be meaning :-)
which is something like
     while (1) {
         grab next bunch of events from epoll
         for each event
             while (do_io(event->fd) != EAGAIN);
     }
I'm pretty sure.

- Dan



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 17:28                       ` Dan Kegel
@ 2002-10-18 17:41                         ` Davide Libenzi
  2002-10-18 18:55                           ` Mark Mielke
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-18 17:41 UTC (permalink / raw)
  To: Dan Kegel
  Cc: Mark Mielke, John Myers, Benjamin LaHaise, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On Fri, 18 Oct 2002, Dan Kegel wrote:

> I was afraid someone would be confused by the examples.  Davide loves
> coroutines (check out http://www.xmailserver.org/linux-patches/nio-improve.html )
> and I think his examples are written in that style.  He really means
> what you think he should be meaning :-)
> which is something like
>      while (1) {
>          grab next bunch of events from epoll
>          for each event
>              while (do_io(event->fd) != EAGAIN);
>      }
> I'm pretty sure.

Yes, I like coroutines :) even if sometimes you have to be careful with
the stack usage ( at least if you do not want to waste all your memory ).
Since there are N coroutines/stacks for N connections, even 4Kb means
something when N is about 100000. The other solution is a state machine,
cheaper in memory, a little bit more complex to code. Coroutines,
though, help a graceful migration from a thread-based application to a
multiplexed one. If you take a threaded application and you code your
connect()/accept()/recv()/send() like the ones coded in the example http
server linked from the epoll page, you can easily migrate it
by simply adding a distribution loop like :

for (;;) {
	get_events();
	for_each_fd
		call_coroutines_associated_with_ready_fd();
}



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 17:00                     ` Mark Mielke
  2002-10-18 17:28                       ` Dan Kegel
@ 2002-10-18 18:55                       ` Chris Friesen
  2002-10-18 19:00                         ` Mark Mielke
  1 sibling, 1 reply; 138+ messages in thread
From: Chris Friesen @ 2002-10-18 18:55 UTC (permalink / raw)
  To: Mark Mielke
  Cc: John Myers, Dan Kegel, Davide Libenzi, Benjamin LaHaise,
	Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

Mark Mielke wrote:
>>>>>   while (read() == EAGAIN)
>>>>>       wait(POLLIN);
>>>>>
> 
> I find myself still not understanding this thread. Lots of examples of
> code that should or should not be used, but I would always choose:
> 
>    ... ensure file descriptor is blocking ...
>    for (;;) {
>        int nread = read(...);
>        ...
>    }
> 
> Over the above, or any derivative of the above.

The main point here is determining which of many open connections need servicing.

select() and poll() do not scale well, so this is where stuff like /dev/epoll comes in--to tell you 
which of those file descriptors need to be serviced.
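
For reference, a rough sketch of how the current /dev/epoll interface is
driven ( the EP_ALLOC and write()-to-register steps are assumptions based
on Davide's patch; the EP_POLL step matches the dph_scheduler code quoted
later in this thread; maxfds, mapsize, sfd and timeout are placeholders ):

int nfds, kdpfd = open("/dev/epoll", O_RDWR);
char *map;
struct pollfd pfd, *pfds;
struct evpoll evp;

ioctl(kdpfd, EP_ALLOC, maxfds);		/* size the interest set */
map = mmap(NULL, mapsize, PROT_READ, MAP_PRIVATE, kdpfd, 0);

pfd.fd = sfd;				/* add one fd to the interest set */
pfd.events = POLLIN | POLLOUT;
write(kdpfd, &pfd, sizeof(pfd));

evp.ep_timeout = timeout;
evp.ep_resoff = 0;
nfds = ioctl(kdpfd, EP_POLL, &evp);	/* block until fds are ready */
pfds = (struct pollfd *) (map + evp.ep_resoff);	/* results in shared map */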

Chris



-- 
Chris Friesen                    | MailStop: 043/33/F10
Nortel Networks                  | work: (613) 765-0557
3500 Carling Avenue              | fax:  (613) 765-2986
Nepean, ON K2H 8E9 Canada        | email: cfriesen@nortelnetworks.com


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 17:41                         ` Davide Libenzi
@ 2002-10-18 18:55                           ` Mark Mielke
  2002-10-18 19:16                             ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: Mark Mielke @ 2002-10-18 18:55 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Dan Kegel, John Myers, Benjamin LaHaise, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On Fri, Oct 18, 2002 at 10:41:28AM -0700, Davide Libenzi wrote:
> Yes, I like coroutines :) even if sometimes you have to be careful with
> the stack usage ( at least if you do not want to waste all your memory ).
> Since there are N coroutines/stacks for N connections, even 4Kb means
> something when N is about 100000. The other solution is a state machine,
> cheaper in memory, a little bit more complex to code. Coroutines,
> though, help a graceful migration from a thread-based application to a
> multiplexed one. If you take a threaded application and you code your
> connect()/accept()/recv()/send() like the ones coded in the example http
> server linked from the epoll page, you can easily migrate it
> by simply adding a distribution loop like :

> for (;;) {
> 	get_events();
> 	for_each_fd
> 		call_coroutines_associated_with_ready_fd();
> }

If each of these co-routines does "while (read() != EAGAIN) wait",
your implementation is seriously flawed, unless you do not mind
file descriptors with lower numbers effectively having a higher
real-time priority than file descriptors with higher numbers.

If efficiency is the true goal, the event loop itself needs to be
abstracted, not just the query and dispatch routines. Using /dev/epoll
in a way that is compatible with other applications is an exercise
in abuse that will only show positive results because the alternatives
to /dev/epoll are ineffective, not because /dev/epoll is a better model.

I like the idea of /dev/epoll, which means that if I used it, I would
implement an efficient model with it, not a traditional model that, through
hackery, functions transparently on top of /dev/epoll.

But that might just be me...

mark



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 18:55                       ` Chris Friesen
@ 2002-10-18 19:00                         ` Mark Mielke
  0 siblings, 0 replies; 138+ messages in thread
From: Mark Mielke @ 2002-10-18 19:00 UTC (permalink / raw)
  To: Chris Friesen
  Cc: John Myers, Dan Kegel, Davide Libenzi, Benjamin LaHaise,
	Shailabh Nagar, linux-kernel, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Fri, Oct 18, 2002 at 02:55:20PM -0400, Chris Friesen wrote:
> Mark Mielke wrote:
> >I find myself still not understanding this thread. Lots of examples of
> The main point here is determining which of many open connections need 
> servicing.

> select() and poll() do not scale well, so this is where stuff like 
> /dev/epoll comes in--to tell you which of those file descriptors need to be 
> serviced.

I know what the point is, and I know the concept behind /dev/epoll.

I'm speaking more about the snippets of code that fail to show the
real benefits of /dev/epoll, and the following discussion about which
snippet is better than which other snippet.

mark



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-17 18:33                                         ` Davide Libenzi
@ 2002-10-18 19:02                                           ` John Gardiner Myers
  2002-10-18 19:52                                             ` Davide Libenzi
  2002-10-18 21:01                                             ` Charles 'Buck' Krasic
  0 siblings, 2 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-18 19:02 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

Davide Libenzi wrote:

>Look, I'm usually very polite but you're really wasting my time. You
>should know that an instruction at line N is usually executed before an
>instruction at line N+1. Now this IS your code :
>
>[N-1] for (;;) {
>[N  ]     fd = event_wait(...);
>[N+1]     while (do_io(fd) != EAGAIN);
>[N+2] }
>
>I will leave you as an exercise to understand what happens when you call
>the first event_wait(...); and there is still data to be read/written on the
>file descriptor.
>
Your claim was that even if the API will drop an event at registration 
time, my code scheme would not work.  Thus, we can take "the API will 
drop an event at registration time" as postulated.  That being 
postulated, if there is still data to be read/written on the file 
descriptor then the first event_wait will return immediately.

In fact, given that postulate and the appropriate axioms about the 
behavior of event_wait() and do_io(), one can prove that my code scheme 
is equivalent to yours.  The logical conclusion from that and your claim 
would be that you don't understand how edge triggered APIs have to be used.

>The reason you're asking /dev/epoll to drop an event at
>fd insertion time shows very clearly that you're going to use the API in
>the WRONG way and that you do not understand how such APIs work.
>
The wrong way as defined by what?  Having /dev/epoll drop appropriate 
events at registration time permits a useful simplification/optimization 
and makes the system significantly less prone to subtle programming errors.

I do understand how such APIs work, to the extent that I am pointing out 
a flaw in their current models.

>And the fact that there're users currently using the rt-sig and epoll APIs means
>that either those guys are geniuses or you're missing something.
>  
>
Nonsense.  People are able to use flawed APIs all of the time.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 18:55                           ` Mark Mielke
@ 2002-10-18 19:16                             ` Davide Libenzi
  2002-10-19  6:56                               ` Mark Mielke
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-18 19:16 UTC (permalink / raw)
  To: Mark Mielke
  Cc: Dan Kegel, John Myers, Benjamin LaHaise, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On Fri, 18 Oct 2002, Mark Mielke wrote:

> On Fri, Oct 18, 2002 at 10:41:28AM -0700, Davide Libenzi wrote:
> > Yes, I like coroutines :) even if sometimes you have to be careful with
> > the stack usage ( at least if you do not want to waste all your memory ).
> > Since there are N coroutines/stacks for N connections, even 4Kb means
> > something when N is about 100000. The other solution is a state machine,
> > cheaper in memory, a little bit more complex to code. Coroutines,
> > though, help a graceful migration from a thread-based application to a
> > multiplexed one. If you take a threaded application and you code your
> > connect()/accept()/recv()/send() like the ones coded in the example http
> > server linked from the epoll page, you can easily migrate it
> > by simply adding a distribution loop like :
>
> > for (;;) {
> > 	get_events();
> > 	for_each_fd
> > 		call_coroutines_associated_with_ready_fd();
> > }
>
> If each of these co-routines does "while (read() != EAGAIN) wait",
> your implementation is seriously flawed, unless you do not mind
> file descriptors with lower numbers effectively having a higher
> real-time priority than file descriptors with higher numbers.

No, once a file descriptor is ready the associated coroutine is called
with a co_call() and, for example, the read function is :

int dph_read(struct dph_conn *conn, char *buf, int nbyte)
{
    int n;

    while ((n = read(conn->sfd, buf, nbyte)) < 0) {
        if (errno == EINTR)
            continue;           /* interrupted, just retry */
        if (errno != EAGAIN && errno != EWOULDBLOCK)
            return -1;          /* real error */
        conn->events = POLLIN | POLLERR | POLLHUP;
        co_resume(conn);        /* yield until the fd is readable again */
    }
    return n;
}

where co_resume() preempts the current coroutine and runs another one ( that
is the scheduler coroutine ). The scheduler coroutine looks like :

static int dph_scheduler(int loop, unsigned int timeout)
{
    int ii;
    static int nfds = 0;
    struct dph_conn *conn;
    static struct pollfd *pfds = NULL;

    do {
        if (!nfds) {
            struct evpoll evp;

            evp.ep_timeout = timeout;
            evp.ep_resoff = 0;

            /* fetch the next batch of ready fds from /dev/epoll; the
             * results live in the kernel/user shared mmap area */
            nfds = ioctl(kdpfd, EP_POLL, &evp);
            pfds = (struct pollfd *) (map + evp.ep_resoff);
        }
        /* dispatch each ready fd to the coroutine that owns it */
        for (ii = 0; ii < EPLIMTEVENTS && nfds > 0; ii++, nfds--, pfds++) {
            if ((conn = dph_find(pfds->fd))) {
                conn->revents = pfds->revents;

                if (conn->revents & conn->events)
                    co_call(conn->co, conn);
            }
        }
    } while (loop);
    return 0;
}

These functions are taken from the really simple example http server used
to test/compare /dev/epoll with poll()/select()/rt-sig//dev/poll :

http://www.xmailserver.org/linux-patches/dphttpd_last.tar.gz




- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 19:02                                           ` John Gardiner Myers
@ 2002-10-18 19:52                                             ` Davide Libenzi
  2002-10-19  0:55                                               ` John Myers
  2002-10-18 21:01                                             ` Charles 'Buck' Krasic
  1 sibling, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-18 19:52 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

On Fri, 18 Oct 2002, John Gardiner Myers wrote:

> Your claim was that even if the API will drop an event at registration
> time, my code scheme would not work.  Thus, we can take "the API will
> drop an event at registration time" as postulated.  That being
> postulated, if there is still data to be read/written on the file
> descriptor then the first event_wait will return immediately.
>
> In fact, given that postulate and the appropriate axioms about the
> behavior of event_wait() and do_io(), one can prove that my code scheme
> is equivalent to yours.  The logical conclusion from that and your claim
> would be that you don't understand how edge triggered APIs have to be used.

No, the concept of edge triggered APIs is that you have to use the fd
until EAGAIN. It's a very simple concept. That means that after a
connect()/accept() you have to start using the fd, because I/O space might
be available for read()/write(). Dropping an event is an attempt at using
the API like poll() & Co., where, after an fd is born, it is put inside the
set to be woken up later. You're basically saying "the kernel should drop an
event at creation time" and I'm saying that, to keep the API usage
consistent with "use the fd until EAGAIN", you have to use the fd as soon as
it becomes available.
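
A minimal illustration of that rule at accept() time ( set_nonblocking(),
epoll_register() and do_io() are invented helpers ):

new_fd = accept(listen_fd, NULL, NULL);
set_nonblocking(new_fd);
epoll_register(new_fd, POLLIN | POLLOUT);
while (do_io(new_fd) != EAGAIN)
	;	/* consume the I/O space that may already be there */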



> >The reason you're asking /dev/epoll to drop an event at
> >fd insertion time shows very clearly that you're going to use the API in
> >the WRONG way and that you do not understand how such APIs work.
> >
> The wrong way as defined by what?  Having /dev/epoll drop appropriate
> events at registration time permits a useful simplification/optimization
> and makes the system significantly less prone to subtle programming errors.
>
> I do understand how such APIs work, to the extent that I am pointing out
> a flaw in their current models.

I'm sorry, but why do you want to sell your mistakes as API flaws ?



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 19:02                                           ` John Gardiner Myers
  2002-10-18 19:52                                             ` Davide Libenzi
@ 2002-10-18 21:01                                             ` Charles 'Buck' Krasic
  2002-10-18 21:33                                               ` Davide Libenzi
  2002-10-19  1:05                                               ` John Myers
  1 sibling, 2 replies; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-18 21:01 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Davide Libenzi, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie



> >[N-1] for (;;) {
> >[N  ]     fd = event_wait(...);
> >[N+1]     while (do_io(fd) != EAGAIN);
> >[N+2} }

I'm getting confused over what minute details are being disputed here.

This debate might get clearer, to me anyway, if the example code
fragments were more concrete.

So if anybody still cares at this point, here is my stab at clarifying
some things.

PART I:  THE RACE

Suppose we have the following:

1 for(;;) {
2      fd = event_wait(...);
3      if(fd == my_listen_fd) {
4           /* new connections */
5           while((new_fd = my_accept(my_listen_fd, ...)) != EAGAIN)
6                   epoll_addf(new_fd, ...);
7       } else {
8           /* established connections */
9           while(do_io(fd) != EAGAIN)
10      }
11 }

With the current epoll/rtsig semantics, there is a race condition
above.  I think this essentially the same race condition as the
snippet at the top of this message.  

Just to be clear, I walk completely through the steps in the race
scenario, as follows.

We start with our application blocked in line 2.  

A new connection is initiated by the application on other side.

The kernels exchange SYNs, causing the connection to be established.

The kernel on our side queues the new connection, waiting for the
application on this side to call accept().  In the process it fires an
edge POLLIN on the listen_fd, which wakes up the kernel side of line
2.  However, some time may pass before we actually wake up.

Meanwhile, the other side immediately sends some application level
data. The other side is going to wait for us to read the application
level data and respond.  So it is now blocked.

All of this happens before our application runs line 5 to pick up the
new connection from the kernel.  

Here comes the race:

Before we reach line 6, new_fd is not in epoll mode, so packet
arrivals do not trigger a POLLIN edge notification on new_fd.

After line 6, there will be no data from the other side, so there will
still be no POLLIN edge notification for new_fd.

Therefore, line 2 will never yield a POLLIN event for new_fd, and the
new connection is now deadlocked.

Is this the kind of race we're talking about?

If so, we proceed as follows.

PART 2: SOLUTIONS

A race-free alternative to the code above is as follows.  Only
one new line (marked with *) is added.

1 for(;;) {
2      fd = event_wait(...);
3      if(fd == my_listen_fd) {
4           /* new connections */
5           while((new_fd = my_accept(my_listen_fd, ...)) != EAGAIN) {
6                    epoll_addf(new_fd, ...);
7*                   while(do_io(new_fd) != EAGAIN);
8           }
9       } else {
10           /* established connections */
11           while(do_io(fd) != EAGAIN)
12      }
13 }

The example above works with current epoll and rtsig semantics.  This
is just rephrasing what Davide has been saying: "Never call event_wait
without first ensuring that IO space is definitively exhausted".

Or we could have (to make John happier?):

1 for(;;) {
2      fd = event_wait(...);
3      if(fd == my_listen_fd) {
4           /* new connections */
5           while((new_fd = my_accept(my_listen_fd, ...)) != EAGAIN) {
6*                  epoll_addf(new_fd, &pfd, ...);
7*                  if(pfd.revents & POLLIN) {
7*                      while(do_io(new_fd) != EAGAIN);
8*                  } 
8           }
9       } else {
10           /* established connections */
11           while(do_io(fd) != EAGAIN)
12      }
13 }

Here, the epoll_addf primitive has been modified to return the initial
status, presumably so we avoid the first call to do_io if there is
nothing to do yet.

If it's easy to do (change the add primitive, that is), why not?

The first solution works either way.

-- Buck










^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 21:01                                             ` Charles 'Buck' Krasic
@ 2002-10-18 21:33                                               ` Davide Libenzi
  2002-10-19  1:05                                               ` John Myers
  1 sibling, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-18 21:33 UTC (permalink / raw)
  To: Charles 'Buck' Krasic
  Cc: John Gardiner Myers, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On 18 Oct 2002, Charles 'Buck' Krasic wrote:

> I'm getting confused over what minute details are being disputed here.
>
> This debate might get clearer, to me anyway, if the example code
> fragments were more concrete.
>
> So if anybody still cares at this point, here is my stab at clarifying
> some things.
>
> PART I:  THE RACE
>
> Suppose we have the following:
>
> 1 for(;;) {
> 2      fd = event_wait(...);
> 3      if(fd == my_listen_fd) {
> 4           /* new connections */
> 5           while((new_fd = my_accept(my_listen_fd, ...)) != EAGAIN)
> 6                   epoll_addf(new_fd, ...);
> 7       } else {
> 8           /* established connections */
> 9           while(do_io(fd) != EAGAIN)
> 10      }
> 11 }
>
> With the current epoll/rtsig semantics, there is a race condition
> above.  I think this is essentially the same race condition as the
> snippet at the top of this message.
>
> Just to be clear, I walk completely through the steps in the race
> scenario, as follows.
>
> We start with our application blocked in line 2.
>
> A new connection is initiated by the application on other side.
>
> The kernels exchange SYNs, causing the connection to be established.
>
> The kernel on our side queues the new connection, waiting for the
> application on this side to call accept().  In the process it fires an
> edge POLLIN on the listen_fd, which wakes up the kernel side of line
> 2.  However, some time may pass before we actually wake up.
>
> Meanwhile, the other side immediately sends some application level
> data. The other side is going to wait for us to read the application
> level data and respond.  So it is now blocked.
>
> All of this happens before our application runs line 5 to pick up the
> new connection from the kernel.
>
> Here comes the race:
>
> Before we reach line 6, new_fd is not in epoll mode, so packet
> arrivals do not trigger a POLLIN edge notification on new_fd.
>
> After line 6, there will be no data from the other side, so there will
> still be no POLLIN edge notification for new_fd.
>
> Therefore, line 2 will never yield a POLLIN event for new_fd, and the
> new connection is now deadlocked.
>
> Is this the kind of race we're talking about?

Exactly, you're going to wait for an event w/out having consumed the
possibly available I/O space.



> If so, we proceed as follows.
>
> PART 2: SOLUTIONS
>
> A race-free alternative to the code above is as follows.  Only
> one new line (marked with *) is added.
>
> 1 for(;;) {
> 2      fd = event_wait(...);
> 3      if(fd == my_listen_fd) {
> 4           /* new connections */
> 5           while((new_fd = my_accept(my_listen_fd, ...)) != EAGAIN) {
> 6                    epoll_addf(new_fd, ...);
> 7*                   while(do_io(new_fd) != EAGAIN);
> 8           }
> 9       } else {
> 10           /* established connections */
> 11           while(do_io(fd) != EAGAIN)
> 12      }
> 13 }

Exactly, this is the sketched solution ( but event_wait() returns more than
one fd, though ).




- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 19:52                                             ` Davide Libenzi
@ 2002-10-19  0:55                                               ` John Myers
  2002-10-19  5:40                                                 ` Davide Libenzi
  2002-10-19  6:59                                                 ` Mark Mielke
  0 siblings, 2 replies; 138+ messages in thread
From: John Myers @ 2002-10-19  0:55 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Benjamin LaHaise, Dan Kegel, Shailabh Nagar, linux-kernel,
	linux-aio, Andrew Morton, David Miller, Linus Torvalds,
	Stephen Tweedie

Davide Libenzi wrote:

>No, the concept of edge triggered APIs is that you have to use the fd
>until EAGAIN.
>
Which my code does, given the postulate.

>It's a very simple concept. That means that after a
>connect()/accept() you have to start using the fd, because I/O space might
>be available for read()/write(). Dropping an event is an attempt at using
>the API like poll() & Co., where, after an fd is born, it is put inside the
>set to be woken up later. You're basically saying "the kernel should drop an
>event at creation time" and I'm saying that, to keep the API usage
>consistent with "use the fd until EAGAIN", you have to use the fd as soon as
>it becomes available.
>
Here's where your argument is inconsistent with the Linux philosophy.

Linux has a strong philosophy of practicality.  The goal of Linux is to 
do useful things, including providing applications with the semantics they
need to do useful things.  The criteria for deciding what goes into 
Linux is heavily weighted towards what works best in practice.

Whether or not some API matches someone's Platonic ideal of an OS
interface is not a criterion.  In Linux, APIs are judged by their 
practical merits.  This is why Linux does not have such things as 
message passing and separate address spaces for drivers.

So whether or not a proposed set of epoll semantics is consistent with 
your Platonic ideal of "use the fd until EAGAIN" is simply not an issue. 
 What matters is what works best in practice.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 21:01                                             ` Charles 'Buck' Krasic
  2002-10-18 21:33                                               ` Davide Libenzi
@ 2002-10-19  1:05                                               ` John Myers
  2002-10-19  1:27                                                 ` Tervel Atanassov
  2002-10-19  4:07                                                 ` Charles 'Buck' Krasic
  1 sibling, 2 replies; 138+ messages in thread
From: John Myers @ 2002-10-19  1:05 UTC (permalink / raw)
  To: Charles 'Buck' Krasic
  Cc: Davide Libenzi, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

Charles 'Buck' Krasic wrote:

>Or we could have (to make John happier?):
>
>1 for(;;) {
>2      fd = event_wait(...);
>3      if(fd == my_listen_fd) {
>4           /* new connections */
>5           while((new_fd = my_accept(my_listen_fd, ...)) != EAGAIN) {
>6*                  epoll_addf(new_fd, &pfd, ...);
>7*                  if(pfd.revents & POLLIN) {
>7*                      while(do_io(new_fd) != EAGAIN);
>8*                  } 
>8           }
>9       } else {
>10           /* established connections */
>11           while(do_io(fd) != EAGAIN)
>12      }
>13 }
>  
>
Close.  What we would have is a modification of the epoll_addf() 
semantics such that it would have an additional postcondition that if 
the new_fd is in the ready state (has data available) then at least one 
notification has been generated.  In the code above, the three lines 
comprising the if statement labeled "7*" would be removed.
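
In kernel terms, a rough sketch of that postcondition ( everything here
is invented except f_op->poll(), which is the 2.5 readiness hook; a NULL
wait table makes it a non-blocking probe, like poll() with a zero
timeout ):

static int ep_insert(struct eventpoll *ep, struct file *file, int fd,
		     unsigned int events)
{
	unsigned int revents;

	add_fd_to_interest_set(ep, file, fd, events);
	revents = file->f_op->poll(file, NULL);	/* synchronous probe */
	if (revents & events)
		queue_initial_event(ep, fd, revents & events);
	return 0;
}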



^ permalink raw reply	[flat|nested] 138+ messages in thread

* RE: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19  1:05                                               ` John Myers
@ 2002-10-19  1:27                                                 ` Tervel Atanassov
  2002-10-19 18:52                                                   ` John G. Myers
  2002-10-19  4:07                                                 ` Charles 'Buck' Krasic
  1 sibling, 1 reply; 138+ messages in thread
From: Tervel Atanassov @ 2002-10-19  1:27 UTC (permalink / raw)
  To: 'John Myers', 'Charles 'Buck' Krasic'
  Cc: 'Davide Libenzi', 'Benjamin LaHaise',
	'Dan Kegel', 'Shailabh Nagar',
	'linux-kernel', 'linux-aio',
	'Andrew Morton', 'David Miller',
	'Linus Torvalds', 'Stephen Tweedie'

I am just joining your discussion today for the first time.  I come from
a Windows implementation of async I/O, so please don't hold it against
me.  I can't say that I am following 100%, but I think you guys
are talking about what the user API will look like, correct?

Assuming the answer is yes, here are my two cents.  The code you have
below seems a bit awkward -- the line while(do_io(fd) != EAGAIN) appears
twice.  I think the reason for that is that you're trying to do too many
things at once, namely, you're trying to handle both the initial
accept/setup of the socket and its steady state servicing.  I don't see
any benefit to that -- it definitely doesn't make for cleaner code.  Why
not do things separately?

1.  Have a setup phase which more or less does:

*  listen()
*  accept()
*  add the new fd/socket to an "event" which all the worker threads are
waiting on.

2.  Have the worker tread/steady state operation be:

*  event_wait() which returns the fd, some descriptor of what exactly
happened (read/write), the number of bytes transferred.
*  based upon the return from event_wait() the user updates his state, and
posts the next operation (read/write).

Thanks,

Tervel Atanassov



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19  1:05                                               ` John Myers
  2002-10-19  1:27                                                 ` Tervel Atanassov
@ 2002-10-19  4:07                                                 ` Charles 'Buck' Krasic
  1 sibling, 0 replies; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-19  4:07 UTC (permalink / raw)
  To: John Myers; +Cc: linux-kernel, linux-aio


jgmyers@netscape.com (John Myers) writes:

> Close.  What we would have is a modification of the epoll_addf()
> semantics such that it would have an additional postcondition that if
> the new_fd is in the ready state (has data available) then at least
> one notification has been generated.  In the code above, the three
> lines comprising the if statement labeled "7*" would be removed.

I see.

I assume the kernel implementation is no big deal: epoll_addf() has to
call the kernel internal equivalent to poll() with a zero timeout.

This wouldn't break the first "solution" in my earlier post, but it
would cause every new connection to experience one extra EAGAIN.  

I see three possibilities:

  1) keep the current epoll_addf()
  2) modify it as John suggests, posting the initial ready state in 
     the next epoll_getevents()
  3) both: add an option to epoll_addf() that says which of 1 or 2 is desired.

-- Buck














^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19  0:55                                               ` John Myers
@ 2002-10-19  5:40                                                 ` Davide Libenzi
  2002-10-19  6:59                                                 ` Mark Mielke
  1 sibling, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-19  5:40 UTC (permalink / raw)
  To: John Myers; +Cc: linux-kernel, linux-aio

On Fri, 18 Oct 2002, John Myers wrote:

> >It's a very simple concept. That means that after a
> >connect()/accept() you have to start using the fd, because I/O space might
> >be available for read()/write(). Dropping an event is an attempt at using
> >the API like poll() & Co., where, after an fd is born, it is put inside the
> >set to be woken up later. You're basically saying "the kernel should drop an
> >event at creation time" and I'm saying that, to keep the API usage
> >consistent with "use the fd until EAGAIN", you have to use the fd as soon as
> >it becomes available.
> >
> Here's where your argument is inconsistent with the Linux philosophy.
>
> Linux has a strong philosophy of practicality.  The goal of Linux is to
> do useful things, including providing applications with the semantics they
> need to do useful things.  The criteria for deciding what goes into
> Linux is heavily weighted towards what works best in practice.
>
> Whether or not some API matches someone's Platonic ideal of an OS
> interface is not a criterion.  In Linux, APIs are judged by their
> practical merits.  This is why Linux does not have such things as
> message passing and separate address spaces for drivers.
>
> So whether or not a proposed set of epoll semantics is consistent with
> your Platonic ideal of "use the fd until EAGAIN" is simply not an issue.
>  What matters is what works best in practice.

Luckily enough, being the only one I wasted my time on in these couple of
days arguing against the API semantics, you are pretty far down the list of
people able to decide what "works best in practice".



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-18 19:16                             ` Davide Libenzi
@ 2002-10-19  6:56                               ` Mark Mielke
  2002-10-19 16:10                                 ` Charles 'Buck' Krasic
  2002-10-19 17:19                                 ` epoll (was Re: [PATCH] async poll for 2.5) Davide Libenzi
  0 siblings, 2 replies; 138+ messages in thread
From: Mark Mielke @ 2002-10-19  6:56 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Dan Kegel, John Myers, Benjamin LaHaise, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On Fri, Oct 18, 2002 at 12:16:48PM -0700, Davide Libenzi wrote:
> These functions are taken from the really simple example http server used
> to test/compare /dev/epoll with poll()/select()/rt-sig//dev/poll :

They still represent an excessively complicated model that attempts to
use /dev/epoll the same way that one would use poll()/select().

Sometimes the answer isn't emulation, or compatibility.

Sometimes the answer is innovation.

mark



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19  0:55                                               ` John Myers
  2002-10-19  5:40                                                 ` Davide Libenzi
@ 2002-10-19  6:59                                                 ` Mark Mielke
  2002-10-19 17:26                                                   ` Davide Libenzi
  2002-10-19 17:48                                                   ` Dan Kegel
  1 sibling, 2 replies; 138+ messages in thread
From: Mark Mielke @ 2002-10-19  6:59 UTC (permalink / raw)
  To: John Myers
  Cc: Davide Libenzi, Benjamin LaHaise, Dan Kegel, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

On Fri, Oct 18, 2002 at 05:55:21PM -0700, John Myers wrote:
> So whether or not a proposed set of epoll semantics is consistent with 
> your Platonic ideal of "use the fd until EAGAIN" is simply not an issue. 
> What matters is what works best in practice.

From this side of the fence: One vote for "use the fd until EAGAIN" being
flawed. If I wanted a method of monopolizing the event loop with real time
priorities, I would implement real time priorities within the event loop.

mark



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19  6:56                               ` Mark Mielke
@ 2002-10-19 16:10                                 ` Charles 'Buck' Krasic
  2002-10-22 17:22                                   ` Mark Mielke
  2002-10-19 17:19                                 ` epoll (was Re: [PATCH] async poll for 2.5) Davide Libenzi
  1 sibling, 1 reply; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-19 16:10 UTC (permalink / raw)
  To: Mark Mielke; +Cc: linux-kernel, linux-aio


Mark Mielke <mark@mark.mielke.cc> writes:

> They still represent an excessively complicated model that attempts to
> use /dev/epoll the same way that one would use poll()/select().

epoll is about fixing one aspect of an otherwise well established api.
That is, fixing the scalability of poll()/select() for applications
based on non-blocking sockets.

Yes, programming non-blocking sockets has its complexity.  Most people
probably end up writing their own API to simplify things, building
wrapper libraries above.  However, a wrapper library can not fix the
performance problems of poll()/select().

epoll() is a relatively modest kernel modification that delivers
impressive performance benefits.  It isn't exactly a drop-in
replacement for poll()/select(), especially because of the difference
between level and edge semantics.  That's why wrapper libraries make
sense.   

> Sometimes the answer isn't emulation, or compatibility.
> 
> Sometimes the answer is innovation.

> mark

Yes, building a better API into the kernel makes sense.  Not only to
eliminate wrappers, but to generalize beyond sockets.  I went to a local
user group meeting this week where a guy gave an overview of what's
new in kernel 2.5.  When AIO was presented, the first question was
"why not use nonblocking IO?".  I bet more than half the programmers
in the room had no idea that nonblocking flags have absolutely no
effect for files.    

It seems to me that most people pushing on AIO so far have been using
it for files, e.g.  Oracle.  They have no choice.  It's either that or
use clumsy worker thread arrangements.  Network code on the other
hand, has had non-blocking IO available for years.  That's why there
are all kinds of examples of web servers that use threads only for the
disk side of things.

But let's not mix them up.  AIO is a new model (for linux anyway),
whose implementation is necessitating fairly major architectural
changes to the kernel.

epoll is a nice, well-contained fix that fits neatly into the existing
architecture.

As it happens, epoll will probably have a place in AIO too.  But there
is no mutual dependency between them.  They are both important on
their own.

-- Buck


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19  6:56                               ` Mark Mielke
  2002-10-19 16:10                                 ` Charles 'Buck' Krasic
@ 2002-10-19 17:19                                 ` Davide Libenzi
  1 sibling, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-19 17:19 UTC (permalink / raw)
  To: Mark Mielke; +Cc: linux-kernel, linux-aio

On Sat, 19 Oct 2002, Mark Mielke wrote:

> On Fri, Oct 18, 2002 at 12:16:48PM -0700, Davide Libenzi wrote:
> > These functions are taken from the really simple example http server used
> > to test/compare /dev/epoll with poll()/select()/rt-sig//dev/poll :
>
> They still represent an excessively complicated model that attempts to
> use /dev/epoll the same way that one would use poll()/select().
>
> Sometimes the answer isn't emulation, or compatibility.
>
> Sometimes the answer is innovation.

Hem ... they're about 100 lines of code and they represent a complete I/O
dispatching engine. And yes, as you could guess from the source code,
the same skeleton was used, with different event retrieval methods, to
test poll() , rt-sig , /dev/poll and /dev/epoll. And, as I said in the
previous email, you could have implemented an I/O driven state machine. I
personally like coroutines a little bit more; that is the reason for such an
implementation.



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19  6:59                                                 ` Mark Mielke
@ 2002-10-19 17:26                                                   ` Davide Libenzi
  2002-10-19 17:48                                                   ` Dan Kegel
  1 sibling, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-19 17:26 UTC (permalink / raw)
  To: Mark Mielke; +Cc: linux-kernel, linux-aio

On Sat, 19 Oct 2002, Mark Mielke wrote:

> On Fri, Oct 18, 2002 at 05:55:21PM -0700, John Myers wrote:
> > So whether or not a proposed set of epoll semantics is consistent with
> > your Platonic ideal of "use the fd until EAGAIN" is simply not an issue.
> > What matters is what works best in practice.
>
> From this side of the fence: One vote for "use the fd until EAGAIN" being
> flawed. If I wanted a method of monopolizing the event loop with real time
> priorities, I would implement real time priorities within the event loop.

You don't need to "use the fd until EAGAIN"; you can consume even only one
byte out of 10000 and stop using the fd, as long as you keep such an fd in
your ready-list. As soon as you receive an EAGAIN from that fd, you remove
it from your ready-list, and the next time you go fishing for events it
will reemerge as soon as it has something for you. The concept is very
simple: "you don't have to go waiting for events for a given fd before
having consumed its I/O space".
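
A minimal sketch of the ready-list idea ( the helpers are invented ): an
fd leaves the list only on EAGAIN, and the next edge event puts it back.

for (;;) {
	add_new_events_to_ready_list();		/* fetch fresh epoll edges */
	for_each_ready_fd(fd) {
		n = read(fd, buf, sizeof(buf));	/* consume as much, or as
						   little, as you like */
		if (n < 0 && errno == EAGAIN)
			ready_list_remove(fd);	/* re-armed by next edge */
	}
}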



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19  6:59                                                 ` Mark Mielke
  2002-10-19 17:26                                                   ` Davide Libenzi
@ 2002-10-19 17:48                                                   ` Dan Kegel
  2002-10-19 18:52                                                     ` Charles 'Buck' Krasic
  2002-10-22 19:35                                                     ` John Gardiner Myers
  1 sibling, 2 replies; 138+ messages in thread
From: Dan Kegel @ 2002-10-19 17:48 UTC (permalink / raw)
  To: Mark Mielke
  Cc: John Myers, Davide Libenzi, Benjamin LaHaise, Shailabh Nagar,
	linux-kernel, linux-aio, Andrew Morton, David Miller,
	Linus Torvalds, Stephen Tweedie

Mark Mielke wrote:
> On Fri, Oct 18, 2002 at 05:55:21PM -0700, John Myers wrote:
> 
>>So whether or not a proposed set of epoll semantics is consistent with 
>>your Platonic ideal of "use the fd until EAGAIN" is simply not an issue. 
>>What matters is what works best in practice.
> 
> 
> From this side of the fence: One vote for "use the fd until EAGAIN" being
> flawed. If I wanted a method of monopolizing the event loop with real time
> priorities, I would implement real time priorities within the event loop.

The choice I see is between:
1. re-arming the one-shot notification when the user gets EAGAIN
2. re-arming the one-shot notification when the user reads all the data
    that was waiting (such that the very next read would return EAGAIN).

#1 is what Davide wants; I think John and Mark are arguing for #2.

I suspect that Davide would be happy with #2, but advises
programmers to read until EAGAIN anyway just to make things clear.

If the programmer is smart enough to figure out how to do that without
hitting EAGAIN, that's fine.  Essentially, if he tries to get away
without getting an EAGAIN, and his program stalls because he didn't
read all the data that's available and thereby doesn't reset the
one-shot readiness event, it's his own damn fault, and he should
go back to using level-triggered techniques like classical poll()
or blocking i/o.
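
A sketch of the two styles side by side ( fd, buf, n, can_wait and
consume() are assumed; style 2 relies on the re-arm-on-drain semantics,
and has the EOF caveat noted later in the thread ):

/* Style 1: drain until read() reports EAGAIN. */
while ((n = read(fd, buf, sizeof(buf))) > 0)
	consume(buf, n);
can_wait = (n < 0 && errno == EAGAIN);

/* Style 2: stop on a short count; a partial buffer means the
 * kernel-side queue was drained. */
n = read(fd, buf, sizeof(buf));
if (n > 0)
	consume(buf, n);
can_wait = (n >= 0 && (size_t) n < sizeof(buf));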

- Dan



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19 17:48                                                   ` Dan Kegel
@ 2002-10-19 18:52                                                     ` Charles 'Buck' Krasic
  2002-10-19 20:18                                                       ` Charles 'Buck' Krasic
  2002-10-22 19:35                                                     ` John Gardiner Myers
  1 sibling, 1 reply; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-19 18:52 UTC (permalink / raw)
  To: Dan Kegel; +Cc: linux-kernel, linux-aio


Dan Kegel <dank@kegel.com> writes:

> The choice I see is between:
> 1. re-arming the one-shot notification when the user gets EAGAIN
> 2. re-arming the one-shot notification when the user reads all the data
>     that was waiting (such that the very next read would return EAGAIN).

> #1 is what Davide wants; I think John and Mark are arguing for #2.

I thought the debate was over how the initial arming of the one-shot
was done.  

The choice above is a different issue.

Neither of 1 or 2 above accurately reflects what the code in the
kernel actually does.

Hitting EAGAIN does not "re-arm" the one-shot notification.  

Consider TCP.

The tcp write code issues a POLLOUT edge when the socket-buffer fill
level drops below a hi-water mark (tcp_min_write_space()). 

For reads, AFAIK, tcp issues POLLIN for every new TCP segment that
arrives; these get coalesced automatically by virtue of the getevents
barrier.

A short count means the application has hit an extreme end of the
buffer (completely full or empty).  EAGAIN means the buffer was
already at the extreme.  Either way you know just as well that new
activity will trigger a new edge event.

In summary, a short count is every bit as reliable as EAGAIN to know
that it is safe to wait on epoll_getevents.

Davide?

-- Buck

> I suspect that Davide would be happy with #2, but advises
> programmers to read until EAGAIN anyway just to make things clear.

It's a minor point, but based on the logic above, I think this advice
is overkill.

-- Buck

> If the programmer is smart enough to figure out how to do that without
> hitting EAGAIN, that's fine.  Essentially, if he tries to get away
> without getting an EAGAIN, and his program stalls because he didn't
> read all the data that's available and thereby doesn't reset the
> one-shot readiness event, it's his own damn fault, and he should
> go back to using level-triggered techniques like classical poll()
> or blocking i/o.

> - Dan

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19  1:27                                                 ` Tervel Atanassov
@ 2002-10-19 18:52                                                   ` John G. Myers
  0 siblings, 0 replies; 138+ messages in thread
From: John G. Myers @ 2002-10-19 18:52 UTC (permalink / raw)
  To: Tervel Atanassov
  Cc: 'Benjamin LaHaise', 'linux-kernel', 'linux-aio'


On Friday, October 18, 2002, at 06:27  PM, Tervel Atanassov wrote:
>  The code you have
> below seems a bit awkward -- the line while(do_io(fd) != EAGAIN) 
> appears
> twice.  I think the reason for that is that you're trying to do too 
> many
> things at once, namely, you're trying to handle both the initial
> accept/setup of the socket and its steady state servicing.  I don't see
> any benefit to that -- it definitely doesn't make for cleaner code.  
> Why
> not do things separately.

If you carefully reread the message you replied to, you will see that 
this is exactly what I am proposing.  The redundant copy of the line 
you consider awkward would be removed.


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19 18:52                                                     ` Charles 'Buck' Krasic
@ 2002-10-19 20:18                                                       ` Charles 'Buck' Krasic
  2002-10-19 21:08                                                         ` Dan Kegel
  0 siblings, 1 reply; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-19 20:18 UTC (permalink / raw)
  To: Dan Kegel; +Cc: linux-kernel, linux-aio


Whoops.  I just realized a flaw in my own argument.  

With read, a short count might precede EOF.  Indeed, in that case,
calling epoll_getevents would cause the connection to get stuck.

Never mind my earlier message then. 

-- Buck

"Charles 'Buck' Krasic" <krasic@acm.org> writes:

> In summary, a short count is every bit as reliable as EAGAIN to know
> that it is safe to wait on epoll_getevents.

> -- Buck

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19 20:18                                                       ` Charles 'Buck' Krasic
@ 2002-10-19 21:08                                                         ` Dan Kegel
  0 siblings, 0 replies; 138+ messages in thread
From: Dan Kegel @ 2002-10-19 21:08 UTC (permalink / raw)
  To: Charles 'Buck' Krasic; +Cc: linux-kernel, linux-aio

Charles 'Buck' Krasic wrote:
 >>In summary, a short count is every bit as reliable as EAGAIN to know
 >>that it is safe to wait on epoll_getevents.
 >
> Whoops.  I just realized a flaw in my own argument.  
> 
> With read, a short count might precede EOF.  Indeed, in that case,
> calling epoll_getevents would cause the connection to get stuck.

Maybe epoll should be extended with a specific EOF event.
Then short reads would be fine.
- Dan


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-21 16:58                             ` [PATCH] async poll for 2.5 Alan Cox
@ 2002-10-21 16:50                               ` Benjamin LaHaise
  0 siblings, 0 replies; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-21 16:50 UTC (permalink / raw)
  To: Alan Cox
  Cc: John Gardiner Myers, Charles 'Buck' Krasic,
	Davide Libenzi, Dan Kegel, Shailabh Nagar,
	Linux Kernel Mailing List, linux-aio, Andrew Morton,
	David Miller, Linus Torvalds, Stephen Tweedie

On Mon, Oct 21, 2002 at 05:58:17PM +0100, Alan Cox wrote:
> I think a chunk of the poll scaling problem is better addressed by
> futexes. If I can say "this futex list for this fd for events X Y and Z"
> I can construct almost all the efficient stuff I need out of the futex
> interfaces, much like doing it with SIGIO setting flags but a lot less
> clocks.

I've structured the aio userland structure so that this is possible, just 
not implemented yet.  There are fields for compatible and incompatible 
features, as well as the length of the header.  This way, the library can 
implement a faster getevents call when futex support is added, but it 
always has the option of falling back to the syscall should it not understand 
any changes we make to the data structure.
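
For reference, the header Ben describes looks roughly like this in the
2.5 aio code (a sketch from the source; treat the details as
approximate, and the feature mask below as hypothetical):

	struct aio_ring {
		unsigned	id;	/* kernel internal index */
		unsigned	nr;	/* number of io_events */
		unsigned	head;
		unsigned	tail;

		unsigned	magic;
		unsigned	compat_features;
		unsigned	incompat_features;
		unsigned	header_length;	/* size of this header */

		struct io_event	io_events[0];
	};

	/* Library-side check (sketch): if the kernel sets incompatible
	 * feature bits the library does not understand, fall back to
	 * the io_getevents syscall instead of reading the ring.
	 */
	if (ring->incompat_features & ~KNOWN_INCOMPAT_FEATURES)
		return do_io_getevents_syscall(ctx, min_nr, nr, events, timeout);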

		-ben
-- 
"Do you seek knowledge in time travel?"

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: [PATCH] async poll for 2.5
  2002-10-16 18:29                           ` John Gardiner Myers
  2002-10-16 20:39                             ` Charles 'Buck' Krasic
@ 2002-10-21 16:58                             ` Alan Cox
  2002-10-21 16:50                               ` Benjamin LaHaise
  1 sibling, 1 reply; 138+ messages in thread
From: Alan Cox @ 2002-10-21 16:58 UTC (permalink / raw)
  To: John Gardiner Myers
  Cc: Charles 'Buck' Krasic, Davide Libenzi, Benjamin LaHaise,
	Dan Kegel, Shailabh Nagar, Linux Kernel Mailing List, linux-aio,
	Andrew Morton, David Miller, Linus Torvalds, Stephen Tweedie

On Wed, 2002-10-16 at 19:29, John Gardiner Myers wrote:
> Better to fix the API.  The kernel has more information than user space 
> and can do a better job.  In the kernel, the problem can be fixed once 
> and for all, not over and over again in each different wrapper library. 
>  It's not even as if the change would break programs correctly written 
> to the old API, not that we particularly care about programs written to 
> the old API.

I think a chunk of the poll scaling problem is better addressed by
futexes. If I can say "this futex list for this fd for events X Y and Z"
I can construct almost all the efficient stuff I need out of the futex
interfaces, much like doing it with SIGIO setting flags but a lot less
clocks.
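
A purely hypothetical sketch of that idea -- the binding call is
invented, only the futex syscall itself is real:

	#include <linux/futex.h>		/* FUTEX_WAIT */
	#include <sys/syscall.h>

	int ready = 0;	/* word the kernel would bump on fd events */

	/* invented: "this futex for this fd for events X, Y and Z" */
	bind_fd_events_to_futex(fd, POLLIN | POLLOUT, &ready);

	for (;;) {
		int seen = ready;
		do_io_until_eagain(fd);		/* hypothetical */
		/* sleep until the kernel bumps and wakes the futex;
		 * returns immediately if ready changed meanwhile */
		syscall(SYS_futex, &ready, FUTEX_WAIT, seen, NULL);
	}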

Alan

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19 16:10                                 ` Charles 'Buck' Krasic
@ 2002-10-22 17:22                                   ` Mark Mielke
  2002-10-22 17:46                                     ` Dan Kegel
                                                       ` (2 more replies)
  0 siblings, 3 replies; 138+ messages in thread
From: Mark Mielke @ 2002-10-22 17:22 UTC (permalink / raw)
  To: Charles 'Buck' Krasic; +Cc: linux-kernel, linux-aio

On Sat, Oct 19, 2002 at 09:10:52AM -0700, Charles 'Buck' Krasic wrote:
> Mark Mielke <mark@mark.mielke.cc> writes:
> > They still represent an excessively complicated model that attempts to
> > implement /dev/epoll the same way that one would implement poll()/select().
> epoll is about fixing one aspect of an otherwise well established api.
> That is, fixing the scalability of poll()/select() for applications
> based on non-blocking sockets.

epoll is not a poll()/select() enhancement (unless it is used in
conjunction with poll()/select()). It is a poll()/select()
replacement.

Meaning... purposefully creating an API that is designed the way one
would design a poll()/select() loop is purposefully limiting the benefits
of /dev/epoll.

It's like inventing a power drill to replace the common screw driver,
but rather than plugging the power drill in, manually turning the
drill as if it was a socket wrench for the drill bit.

I find it an exercise in self-defeat... except that /dev/epoll used the
same way one would use poll()/select() happens to perform better even
when it is crippled.

mark

-- 
mark@mielke.cc/markm@ncf.ca/markm@nortelnetworks.com __________________________
.  .  _  ._  . .   .__    .  . ._. .__ .   . . .__  | Neighbourhood Coder
|\/| |_| |_| |/    |_     |\/|  |  |_  |   |/  |_   | 
|  | | | | \ | \   |__ .  |  | .|. |__ |__ | \ |__  | Ottawa, Ontario, Canada

  One ring to rule them all, one ring to find them, one ring to bring them all
                       and in the darkness bind them...

                           http://mark.mielke.cc/


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 17:22                                   ` Mark Mielke
@ 2002-10-22 17:46                                     ` Dan Kegel
  2002-10-22 17:47                                     ` Davide Libenzi
  2002-10-22 18:42                                     ` Charles 'Buck' Krasic
  2 siblings, 0 replies; 138+ messages in thread
From: Dan Kegel @ 2002-10-22 17:46 UTC (permalink / raw)
  To: Mark Mielke; +Cc: Charles 'Buck' Krasic, linux-kernel, linux-aio

Mark Mielke wrote:
> epoll is not a poll()/select() enhancement (unless it is used in
> conjunction with poll()/select()). It is a poll()/select()
> replacement.
> 
> Meaning... purposefully creating an API that is designed the way one
> would design a poll()/select() loop is purposefully limiting the benefits
> of /dev/epoll.
> 
> It's like inventing a power drill to replace the common screw driver,
> but rather than plugging the power drill in, manually turning the
> drill as if it was a socket wrench for the drill bit.
> 
> I find it an exercise in self-defeat... except that /dev/epoll used the
> same way one would use poll()/select() happens to perform better even
> when it is crippled.

Agreed.
- Dan



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 17:22                                   ` Mark Mielke
  2002-10-22 17:46                                     ` Dan Kegel
@ 2002-10-22 17:47                                     ` Davide Libenzi
  2002-10-22 18:13                                       ` Alan Cox
  2002-10-22 18:42                                     ` Charles 'Buck' Krasic
  2 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-22 17:47 UTC (permalink / raw)
  To: Mark Mielke; +Cc: Charles 'Buck' Krasic, linux-kernel, linux-aio

On Tue, 22 Oct 2002, Mark Mielke wrote:

> On Sat, Oct 19, 2002 at 09:10:52AM -0700, Charles 'Buck' Krasic wrote:
> > Mark Mielke <mark@mark.mielke.cc> writes:
> > > They still represent an excessively complicated model that attempts to
> > > implement /dev/epoll the same way that one would implement poll()/select().
> > epoll is about fixing one aspect of an otherwise well established api.
> > That is, fixing the scalability of poll()/select() for applications
> > based on non-blocking sockets.
>
> epoll is not a poll()/select() enhancement (unless it is used in
> conjunction with poll()/select()). It is a poll()/select()
> replacement.
>
> Meaning... purposefully creating an API that is designed the way one
> would design a poll()/select() loop is purposefully limiting the benefits
> of /dev/epoll.
>
> It's like inventing a power drill to replace the common screw driver,
> but rather than plugging the power drill in, manually turning the
> drill as if it was a socket wrench for the drill bit.
>
> I find it an exercise in self-defeat... except that /dev/epoll used the
> same way one would use poll()/select() happens to perform better even
> when it is crippled.

Since the sys_epoll (and /dev/epoll) fd supports standard polling, you
can mix sys_epoll handling with other methods like poll() and AIO's
POLL function when it'll be ready. For example, for devices that sys_epoll
intentionally does not support, you can use a method like:

	put_sys_epoll_fd_inside_XXX();
	...
	wait_for_XXX_events();
	...
	if (XXX_event_fd() == sys_epoll_fd) {
		sys_epoll_wait();
		for_each_sys_epoll_event {
			handle_fd_event();
		}
	}
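
One concrete instance of that pattern, taking "XXX" to be plain poll()
and writing the sys_epoll calls with the names epoll eventually got in
mainline (an assumption; the syscall names under discussion here differ):

	struct pollfd pfd[2] = {
		{ .fd = other_dev_fd, .events = POLLIN },	/* device sys_epoll won't cover */
		{ .fd = epfd,         .events = POLLIN },	/* the sys_epoll fd itself */
	};

	poll(pfd, 2, -1);
	if (pfd[1].revents & POLLIN) {
		struct epoll_event evs[64];
		int i, n = epoll_wait(epfd, evs, 64, 0);

		for (i = 0; i < n; i++)
			handle_fd_event(evs[i].data.fd);
	}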



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 17:47                                     ` Davide Libenzi
@ 2002-10-22 18:13                                       ` Alan Cox
  2002-10-22 18:18                                         ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: Alan Cox @ 2002-10-22 18:13 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Mark Mielke, Charles 'Buck' Krasic,
	Linux Kernel Mailing List, linux-aio

On Tue, 2002-10-22 at 18:47, Davide Libenzi wrote:
> Since the sys_epoll (and /dev/epoll) fd supports standard polling, you
> can mix sys_epoll handling with other methods like poll() and AIO's
> POLL function when it'll be ready. For example, for devices that sys_epoll
> intentionally does not support, you can use a method like:

The more important question is why you need epoll at all. Asynchronous I/O
completions setting a list of futexes can already be made to do the job,
and they are much more flexible.


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 18:13                                       ` Alan Cox
@ 2002-10-22 18:18                                         ` Davide Libenzi
  2002-10-22 18:37                                           ` Benjamin LaHaise
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-22 18:18 UTC (permalink / raw)
  To: Alan Cox
  Cc: Mark Mielke, Charles 'Buck' Krasic,
	Linux Kernel Mailing List, linux-aio

On 22 Oct 2002, Alan Cox wrote:

> On Tue, 2002-10-22 at 18:47, Davide Libenzi wrote:
> > Since the sys_epoll (and /dev/epoll) fd supports standard polling, you
> > can mix sys_epoll handling with other methods like poll() and AIO's
> > POLL function when it'll be ready. For example, for devices that sys_epoll
> > intentionally does not support, you can use a method like:
>
> The more important question is why you need epoll at all. Asynchronous I/O
> completions setting a list of futexes can already be made to do the job,
> and they are much more flexible.

Alan, could you provide a code snippet to show how easy it is and how well
it fits a 1:N (one task/thread, N connections) architecture? And
looking at Ben's presentation about benchmarks (and for pipes), you'll
discover that both poll() and AIO are "a little bit slower" than
sys_epoll. Anyway, I do not want anything superfluous added to the kernel
without reason; that's why, besides Ben's presentation, there are currently
people benchmarking existing solutions.




- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 18:18                                         ` Davide Libenzi
@ 2002-10-22 18:37                                           ` Benjamin LaHaise
  2002-10-22 19:22                                             ` John Gardiner Myers
  2002-10-22 19:49                                             ` epoll " Davide Libenzi
  0 siblings, 2 replies; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-22 18:37 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Alan Cox, Mark Mielke, Charles 'Buck' Krasic,
	Linux Kernel Mailing List, linux-aio

On Tue, Oct 22, 2002 at 11:18:20AM -0700, Davide Libenzi wrote:
> Alan, could you provide a code snippet to show how easy it is and how well
> it fits a 1:N (one task/thread, N connections) architecture? And
> looking at Ben's presentation about benchmarks (and for pipes), you'll
> discover that both poll() and AIO are "a little bit slower" than
> sys_epoll. Anyway, I do not want anything superfluous added to the kernel
> without reason; that's why, besides Ben's presentation, there are currently
> people benchmarking existing solutions.

That's why I was hoping async poll would get fixed to have the same 
performance characteristics as /dev/epoll.  But.... :-/

		-ben
-- 
"Do you seek knowledge in time travel?"

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 17:22                                   ` Mark Mielke
  2002-10-22 17:46                                     ` Dan Kegel
  2002-10-22 17:47                                     ` Davide Libenzi
@ 2002-10-22 18:42                                     ` Charles 'Buck' Krasic
  2002-10-22 19:35                                       ` Davide Libenzi
  2 siblings, 1 reply; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-22 18:42 UTC (permalink / raw)
  To: Mark Mielke; +Cc: linux-kernel, linux-aio


I don't think the big picture is that complicated.   

epoll is useful for programs that use the old nonblocking socket API.
It improves performance significantly for a case where poll() and
select() are deficient (large numbers of slow or idle connections).

There are a number of people, including myself, who already use epoll.
At least some of us don't think it is too complicated.  I claim most
of the complication is in the nonblocking socket API, in which case
the complexity falls under the category of "the devil you know...".

The old nonblocking socket API (and hence epoll) does nothing for file
IO, and it just doesn't make sense relative to file IO.  (EAGAIN,
POLLIN, POLLOUT, etc. aren't terribly useful signals from a disk
device).

So, it's a great thing that the new AIO API is forthcoming.

So maybe epoll's moment of utility is only transient.  It should have
been in the kernel a long time ago.  Is it too late now that AIO is
imminent?  

-- Buck

Mark Mielke <mark@mark.mielke.cc> writes:

> On Sat, Oct 19, 2002 at 09:10:52AM -0700, Charles 'Buck' Krasic wrote:
> > Mark Mielke <mark@mark.mielke.cc> writes:
> > > They still represent an excessively complicated model that attempts to
> > > implement /dev/epoll the same way that one would implement poll()/select().
> > epoll is about fixing one aspect of an otherwise well established api.
> > That is, fixing the scalability of poll()/select() for applications
> > based on non-blocking sockets.
> 
> epoll is not a poll()/select() enhancement (unless it is used in
> conjunction with poll()/select()). It is a poll()/select()
> replacement.
> 
> Meaning... purposefully creating an API that is designed the way one
> would design a poll()/select() loop is purposefully limiting the benefits
> of /dev/epoll.
> 
> It's like inventing a power drill to replace the common screw driver,
> but rather than plugging the power drill in, manually turning the
> drill as if it was a socket wrench for the drill bit.
> 
> I find it an exercise in self-defeat... except that /dev/epoll used the
> same way one would use poll()/select() happens to perform better even
> when it is crippled.
> 
> mark

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 18:37                                           ` Benjamin LaHaise
@ 2002-10-22 19:22                                             ` John Gardiner Myers
  2002-10-22 19:28                                               ` Benjamin LaHaise
  2002-10-22 19:49                                             ` epoll " Davide Libenzi
  1 sibling, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-22 19:22 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Linux Kernel Mailing List, linux-aio

[-- Attachment #1: Type: text/plain, Size: 348 bytes --]



Benjamin LaHaise wrote:

>That's why I was hoping async poll would get fixed to have the same 
>performance characteristics as /dev/epoll.  But.... :-/
>
If you would like this to happen, it would help if you would respond to 
questions asked and proposals made on the linux-aio mailing list.  It's 
a little difficult trying to read your mind.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 19:22                                             ` John Gardiner Myers
@ 2002-10-22 19:28                                               ` Benjamin LaHaise
  2002-10-22 19:50                                                 ` John Gardiner Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-22 19:28 UTC (permalink / raw)
  To: John Gardiner Myers; +Cc: Linux Kernel Mailing List, linux-aio

On Tue, Oct 22, 2002 at 12:22:59PM -0700, John Gardiner Myers wrote:
> 
> 
> Benjamin LaHaise wrote:
> 
> >That's why I was hoping async poll would get fixed to have the same 
> >performance characteristics as /dev/epoll.  But.... :-/
> >
> If you would like this to happen, it would help if you would respond to 
> questions asked and proposals made on the linux-aio mailing list.  It's 
> a little difficult trying to read your mind.

*Which* proposals?  There was enough of a discussion that I don't know 
what people had decided on.

		-ben
-- 
"Do you seek knowledge in time travel?"

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 18:42                                     ` Charles 'Buck' Krasic
@ 2002-10-22 19:35                                       ` Davide Libenzi
  2002-10-23 16:49                                         ` Dan Kegel
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-22 19:35 UTC (permalink / raw)
  To: Charles 'Buck' Krasic; +Cc: Mark Mielke, linux-kernel, linux-aio

On 22 Oct 2002, Charles 'Buck' Krasic wrote:

> So maybe epoll's moment of utility is only transient.  It should have
> been in the kernel a long time ago.  Is it too late now that AIO is
> imminent?

This is not my call, actually. But besides comparing actual performance
between AIO and sys_epoll, one of the advantages the patch has is
this:

arch/i386/kernel/entry.S  |    4
drivers/char/Makefile     |    4
fs/Makefile               |    4
fs/file_table.c           |    4
fs/pipe.c                 |   36 +
include/asm-i386/poll.h   |    1
include/asm-i386/unistd.h |    3
include/linux/fs.h        |    4
include/linux/list.h      |    5
include/linux/pipe_fs_i.h |    4
include/linux/sys.h       |    2
include/net/sock.h        |   10
net/ipv4/tcp.c            |    4

That is, it has very little "intrusion" into the original code, plugging
into the existing architecture.



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-19 17:48                                                   ` Dan Kegel
  2002-10-19 18:52                                                     ` Charles 'Buck' Krasic
@ 2002-10-22 19:35                                                     ` John Gardiner Myers
  2002-10-22 20:06                                                       ` Davide Libenzi
  1 sibling, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-22 19:35 UTC (permalink / raw)
  To: linux-kernel, linux-aio; +Cc: Benjamin LaHaise

[-- Attachment #1: Type: text/plain, Size: 811 bytes --]



Dan Kegel wrote:

> The choice I see is between:
> 1. re-arming the one-shot notification when the user gets EAGAIN
> 2. re-arming the one-shot notification when the user reads all the data
>    that was waiting (such that the very next read would return EAGAIN).
>
> #1 is what Davide wants; I think John and Mark are arguing for #2. 

No, this is not what I'm arguing.  Once an event arrives for a fd, my 
proposed semantics are no different than Mr. Libenzi's.  The only 
difference is what happens upon registration of interest for a fd.  With 
my semantics, the kernel guarantees that if the fd is ready then at 
least one event has been generated.  With Mr Libenzi's semantics, there 
is no such guarantee and the application is required to behave as if an 
event had been generated upon registration.
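
The practical difference, sketched with hypothetical helper names:

	/* Myers' semantics: registration synthesizes an event if the
	 * fd is already ready, so a single loop suffices.
	 */
	register_interest(fd);
	for (;;) {
		wait_for_events();	/* initial readiness arrives here too */
		do_io_until_eagain(fd);
	}

	/* Libenzi's semantics: no event is generated at registration,
	 * so the application must behave as if one had been.
	 */
	register_interest(fd);
	do_io_until_eagain(fd);		/* mandatory first drain */
	for (;;) {
		wait_for_events();
		do_io_until_eagain(fd);
	}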




^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 18:37                                           ` Benjamin LaHaise
  2002-10-22 19:22                                             ` John Gardiner Myers
@ 2002-10-22 19:49                                             ` Davide Libenzi
  1 sibling, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-22 19:49 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Linux Kernel Mailing List, linux-aio

On Tue, 22 Oct 2002, Benjamin LaHaise wrote:

> On Tue, Oct 22, 2002 at 11:18:20AM -0700, Davide Libenzi wrote:
> > Alan, could you provide a code snippet to show how easy it is and how well
> > it fits a 1:N (one task/thread, N connections) architecture? And
> > looking at Ben's presentation about benchmarks (and for pipes), you'll
> > discover that both poll() and AIO are "a little bit slower" than
> > sys_epoll. Anyway, I do not want anything superfluous added to the kernel
> > without reason; that's why, besides Ben's presentation, there are currently
> > people benchmarking existing solutions.
>
> That's why I was hoping async poll would get fixed to have the same
> performance characteristics as /dev/epoll.  But.... :-/

Yep, like I wrote you yesterday (and you did not answer), I was trying
to add support for AIO to the test HTTP server I use for benchmark
tests ( http://www.xmailserver.org/linux-patches/ephttpd-0.1.tar.gz ),
but since there's no support for AIO POLL, it is impossible (at least to
my knowledge) to compare sys_epoll and AIO on a "real HTTP load"
networking test.




- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 19:28                                               ` Benjamin LaHaise
@ 2002-10-22 19:50                                                 ` John Gardiner Myers
  2002-10-22 20:00                                                   ` Benjamin LaHaise
  0 siblings, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-22 19:50 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Linux Kernel Mailing List, linux-aio

[-- Attachment #1: Type: text/plain, Size: 366 bytes --]



Benjamin LaHaise wrote:

>*Which* proposals?  There was enough of a discussion that I don't know 
>what people had decided on.
>
Primarily the ones in my message of Tue, 15 Oct 2002 16:26:59 -0700. In 
that I repeat a question I posed in my message of Tue, 01 Oct 2002 
14:16:23 -0700.

There's also the IOCB_CMD_NOOP strawman of Fri, 18 Oct 2002 17:16:41 -0700.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 19:50                                                 ` John Gardiner Myers
@ 2002-10-22 20:00                                                   ` Benjamin LaHaise
  2002-10-22 20:23                                                     ` async poll John Myers
  2002-10-23 11:10                                                     ` Latest aio code (was Re: [PATCH] async poll for 2.5) Suparna Bhattacharya
  0 siblings, 2 replies; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-22 20:00 UTC (permalink / raw)
  To: John Gardiner Myers; +Cc: Linux Kernel Mailing List, linux-aio

On Tue, Oct 22, 2002 at 12:50:44PM -0700, John Gardiner Myers wrote:
> 
> 
> Benjamin LaHaise wrote:
> 
> >*Which* proposals?  There was enough of a discussion that I don't know 
> >what people had decided on.
> >
> Primarily the ones in my message of Tue, 15 Oct 2002 16:26:59 -0700. In 
> that I repeat a question I posed in my message of Tue, 01 Oct 2002 
> 14:16:23 -0700.

How does it perform?

> There's also the IOCB_CMD_NOOP strawman of Fri, 18 Oct 2002 17:16:41 -0700.

That's going in unless there are any other objections to it from folks.
Part of it was that I had problems with 2.5.43-bk not working on my
test machines last week, which delayed a few things.

		-ben
-- 
"Do you seek knowledge in time travel?"

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 19:35                                                     ` John Gardiner Myers
@ 2002-10-22 20:06                                                       ` Davide Libenzi
  2002-10-22 21:54                                                         ` Erich Nahum
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-22 20:06 UTC (permalink / raw)
  To: John Gardiner Myers; +Cc: linux-kernel, linux-aio

On Tue, 22 Oct 2002, John Gardiner Myers wrote:

>
>
> Dan Kegel wrote:
>
> > The choice I see is between:
> > 1. re-arming the one-shot notification when the user gets EAGAIN
> > 2. re-arming the one-shot notification when the user reads all the data
> > >    that was waiting (such that the very next read would return EAGAIN).
> >
> > #1 is what Davide wants; I think John and Mark are arguing for #2.
>
> No, this is not what I'm arguing.  Once an event arrives for a fd, my
> proposed semantics are no different than Mr. Libenzi's.  The only
> difference is what happens upon registration of interest for a fd.  With
> my semantics, the kernel guarantees that if the fd is ready then at
> least one event has been generated.  With Mr Libenzi's semantics, there
> is no such guarantee and the application is required to behave as if an
> event had been generated upon registration.

sed s/Mr. Libenzi/Davide/g ... I'm not that old :)
There are a couple of reasons why dropping the initial event is a waste
of time:

1) The I/O write space is completely available at fd creation
2) For sockets it's very likely that the first packet brought something
	more than the SYN == The I/O read space might have something for you

I strongly believe that the concept "use the fd until EAGAIN" should be
applied even at creation time, w/out making exceptions to what is the
API's rule to follow.



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* async poll
  2002-10-22 20:00                                                   ` Benjamin LaHaise
@ 2002-10-22 20:23                                                     ` John Myers
  2002-10-23 11:10                                                     ` Latest aio code (was Re: [PATCH] async poll for 2.5) Suparna Bhattacharya
  1 sibling, 0 replies; 138+ messages in thread
From: John Myers @ 2002-10-22 20:23 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Linux Kernel Mailing List, linux-aio

[-- Attachment #1: Type: text/plain, Size: 466 bytes --]



Benjamin LaHaise wrote:

>How does it perform?
>  
>
That was one of the questions, since you claimed it didn't "scale"
without being specific as to which axes that claim referred to.

If the problem is the cost of reregistration (which seems likely) then 
fixing that requires extending the model upon which the aio framework is 
based. If the proposed extended model is not acceptable, it would be 
best to fix that before spending the time to implement it.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 20:06                                                       ` Davide Libenzi
@ 2002-10-22 21:54                                                         ` Erich Nahum
  2002-10-22 22:17                                                           ` Dan Kegel
  2002-10-22 22:25                                                           ` Davide Libenzi
  0 siblings, 2 replies; 138+ messages in thread
From: Erich Nahum @ 2002-10-22 21:54 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: John Gardiner Myers, linux-kernel, linux-aio

Davide Libenzi writes:
> On Tue, 22 Oct 2002, John Gardiner Myers wrote:
> 
> > > 1. re-arming the one-shot notification when the user gets EAGAIN
> > > 2. re-arming the one-shot notification when the user reads all the data
> > >    that was waiting (such that the very next read would return EAGAIN).
> > >
> > > #1 is what Davide wants; I think John and Mark are arguing for #2.
> >
> > No, this is not what I'm arguing.  Once an event arrives for a fd, my
> > proposed semantics are no different than Mr. Libenzi's.  The only
> > difference is what happens upon registration of interest for a fd.  With
> > my semantics, the kernel guarantees that if the fd is ready then at
> > least one event has been generated.  With Mr Libenzi's semantics, there
> > is no such guarantee and the application is required to behave as if an
> > event had been generated upon registration.
> 
> There are a couple of reasons why dropping the initial event is a waste
> of time:
> 
> 1) The I/O write space is completely available at fd creation
> 2) For sockets it's very likely that the first packet brought something
> 	more than the SYN == The I/O read space might have something for you
> 
> I strongly believe that the concept "use the fd until EAGAIN" should be
> applied even at creation time, w/out making exceptions to what is the
> API's rule to follow.

There is a third way, described in the original Banga/Mogul/Druschel
paper, available via Dan Kegel's web site: extend the accept() call to 
return whether an event has already happened on that FD.  That way you 
can service a ready FD without reading /dev/epoll or calling
sigtimedwait, and you don't have to waste a read() call on the socket
only to find out you got EAGAIN.

Of course, this changes the accept API, which is another matter.  But
if we're talking a new API then there's no problem.
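
A sketch of what such an extended accept might look like -- the
signature is invented here, only the idea is from the paper:

	int revents;
	int fd = accept_ex(listen_fd, &addr, &addrlen, &revents);

	if (fd >= 0 && (revents & POLLIN))
		do_io_until_eagain(fd);	/* data arrived with the connection,
					 * no probing read() needed */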

-Erich


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 21:54                                                         ` Erich Nahum
@ 2002-10-22 22:17                                                           ` Dan Kegel
  2002-10-22 22:25                                                           ` Davide Libenzi
  1 sibling, 0 replies; 138+ messages in thread
From: Dan Kegel @ 2002-10-22 22:17 UTC (permalink / raw)
  To: Erich M. Nahum
  Cc: Davide Libenzi, John Gardiner Myers, linux-kernel, linux-aio

Erich Nahum wrote:
>>There are a couple of reasons why dropping the initial event is a waste
>>of time:
>>
>>1) The I/O write space is completely available at fd creation
>>2) For sockets it's very likely that the first packet brought something
>>	more than the SYN == The I/O read space might have something for you
>>
>>I strongly believe that the concept "use the fd until EAGAIN" should be
>>applied even at creation time, w/out making exceptions to what is the
>>API's rule to follow.
> 
> 
> There is a third way, described in the original Banga/Mogul/Druschel
> paper, available via Dan Kegel's web site: extend the accept() call to 
> return whether an event has already happened on that FD.  That way you 
> can service a ready FD without reading /dev/epoll or calling
> sigtimedwait, and you don't have to waste a read() call on the socket
> only to find out you got EAGAIN.
> 
> Of course, this changes the accept API, which is another matter.  But
> if we're talking a new API then there's no problem.

That would be the fastest way of finding out, maybe.
But I'd rather use a uniform way of notifying about readiness events.
Rather than using a new API (acceptEx :-) or a rule ("always ready initially"),
why not just deliver the initial readiness event via the usual channel?
And for ease of coding for the moment, Davide can just deliver
a 'ready for everything' event initially, unconditionally.

No API changes, just a simple, uniform way of tickling the user's
"I gotta do I/O" code.
- Dan



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 21:54                                                         ` Erich Nahum
  2002-10-22 22:17                                                           ` Dan Kegel
@ 2002-10-22 22:25                                                           ` Davide Libenzi
  1 sibling, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-22 22:25 UTC (permalink / raw)
  To: Erich Nahum; +Cc: linux-kernel, linux-aio

On Tue, 22 Oct 2002, Erich Nahum wrote:

> There is a third way, described in the original Banga/Mogul/Druschel
> paper, available via Dan Kegel's web site: extend the accept() call to
> return whether an event has already happened on that FD.  That way you
> can service a ready FD without reading /dev/epoll or calling
> sigtimedwait, and you don't have to waste a read() call on the socket
> only to find out you got EAGAIN.
>
> Of course, this changes the accept API, which is another matter.  But
> if we're talking a new API then there's no problem.

Why differentiate between connect and accept? At that point you should
also handle connect as a particular case; that's the point. And that's why
I like the API's rule to be consistent, and I would not like to put
explicit event dispatch for accept/connect inside the kernel source code.



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Latest aio code (was Re: [PATCH] async poll for 2.5)
  2002-10-22 20:00                                                   ` Benjamin LaHaise
  2002-10-22 20:23                                                     ` async poll John Myers
@ 2002-10-23 11:10                                                     ` Suparna Bhattacharya
  1 sibling, 0 replies; 138+ messages in thread
From: Suparna Bhattacharya @ 2002-10-23 11:10 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: John Gardiner Myers, Linux Kernel Mailing List, linux-aio

On Tue, Oct 22, 2002 at 04:00:22PM -0400, Benjamin LaHaise wrote:
> On Tue, Oct 22, 2002 at 12:50:44PM -0700, John Gardiner Myers wrote:
> > 
> > 
> > Benjamin LaHaise wrote:
> > 
> > >*Which* proposals?  There was enough of a discussion that I don't know 
> > >what people had decided on.
> > >
> > Primarily the ones in my message of Tue, 15 Oct 2002 16:26:59 -0700. In 
> > that I repeat a question I posed in my message of Tue, 01 Oct 2002 
> > 14:16:23 -0700.
> 
> How does it perform?
> 
> > There's also the IOCB_CMD_NOOP strawman of Fri, 18 Oct 2002 17:16:41 -0700.
> 
> That's going in unless there are any other objections to it from folks.
> Part of it was that I had problems with 2.5.43-bk not working on my
> test machines last week, which delayed a few things.

Ben,

Is there a patch against 2.5.44 with all the latest fixes that
we can sync up with ?

Regards
Suparna

> 
> 		-ben
> -- 
> "Do you seek knowledge in time travel?"
> --
> To unsubscribe, send a message with 'unsubscribe linux-aio' in
> the body to majordomo@kvack.org.  For more info on Linux AIO,
> see: http://www.kvack.org/aio/

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-22 19:35                                       ` Davide Libenzi
@ 2002-10-23 16:49                                         ` Dan Kegel
  2002-10-23 17:39                                           ` Benjamin LaHaise
  2002-10-23 17:49                                           ` Charles 'Buck' Krasic
  0 siblings, 2 replies; 138+ messages in thread
From: Dan Kegel @ 2002-10-23 16:49 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Charles 'Buck' Krasic, Mark Mielke, linux-kernel, linux-aio

Davide Libenzi <davidel@xmailserver.org> wrote:
 > On 22 Oct 2002, Charles 'Buck' Krasic wrote:
 >
 >> So maybe epoll's moment of utility is only transient.  It should have
 >> been in the kernel a long time ago.  Is it too late now that AIO is
 >> imminent?
 >
 > This is not my call, actually. But besides comparing actual performance
 > between AIO and sys_epoll, one of the advantages the patch has is
 > ... it has very little "intrusion" into the original code, plugging
 > into the existing architecture.

epoll has another benefit: it works with read() and write().  That
makes it easier to use with existing libraries like OpenSSL
without having to recode them to use aio_read() and aio_write().

Furthermore, epoll is nice because it delivers one-shot readiness change
notification (I used to think that was a drawback, but coding
nonblocking OpenSSL apps has convinced me otherwise).
I may be confused, but I suspect the async poll being proposed by
Ben only delivers absolute readiness, not changes in readiness.
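
The OpenSSL point in a sketch -- the SSL calls are the real API, the
wait_for() re-arm helper is hypothetical.  An SSL_read() can require
the socket to become *writable* (e.g. during renegotiation), so the
events of interest change from call to call, which fits one-shot
re-arming naturally:

	char buf[4096];
	int n = SSL_read(ssl, buf, sizeof(buf));

	if (n <= 0) {
		switch (SSL_get_error(ssl, n)) {
		case SSL_ERROR_WANT_READ:
			wait_for(fd, POLLIN);	/* re-arm for readability */
			break;
		case SSL_ERROR_WANT_WRITE:
			wait_for(fd, POLLOUT);	/* re-arm for writability */
			break;
		default:
			/* real error or clean shutdown */
			break;
		}
	}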

I think epoll is worth having, even if Ben's AIO already handled
networking properly.
- Dan


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-23 16:49                                         ` Dan Kegel
@ 2002-10-23 17:39                                           ` Benjamin LaHaise
  2002-10-23 18:47                                             ` Davide Libenzi
  2002-10-23 17:49                                           ` Charles 'Buck' Krasic
  1 sibling, 1 reply; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-23 17:39 UTC (permalink / raw)
  To: Dan Kegel
  Cc: Davide Libenzi, Charles 'Buck' Krasic, Mark Mielke,
	linux-kernel, linux-aio

On Wed, Oct 23, 2002 at 09:49:54AM -0700, Dan Kegel wrote:
> Furthermore, epoll is nice because it delivers one-shot readiness change
> notification (I used to think that was a drawback, but coding
> nonblocking OpenSSL apps has convinced me otherwise).
> I may be confused, but I suspect the async poll being proposed by
> Ben only delivers absolute readiness, not changes in readiness.
> 
> I think epoll is worth having, even if Ben's AIO already handled
> networking properly.

That depends on how it compares to async read/write, which hasn't 
been looked into yet.  The way the pipe code worked involved walking 
the page tables, which is still quite expensive for small data sizes.  
With the new code, the CPU's TLB will be used, which will make a big
difference, especially for the case where only a single address space 
is in use on the system.

		-ben
-- 
"Do you seek knowledge in time travel?"

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-23 16:49                                         ` Dan Kegel
  2002-10-23 17:39                                           ` Benjamin LaHaise
@ 2002-10-23 17:49                                           ` Charles 'Buck' Krasic
  2002-10-23 18:14                                             ` Davide Libenzi
  1 sibling, 1 reply; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-23 17:49 UTC (permalink / raw)
  To: Dan Kegel; +Cc: Davide Libenzi, Mark Mielke, linux-kernel, linux-aio


Dan Kegel <dank@kegel.com> writes:

> Davide Libenzi <davidel@xmailserver.org> wrote:

> I may be confused, but I suspect the async poll being proposed by
> Ben only delivers absolute readiness, not changes in readiness.

> I think epoll is worth having, even if Ben's AIO already handled
> networking properly.

> - Dan

Can someone remind me why poll is needed in the AIO API at all?

How would it be used?

-- Buck

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-23 17:49                                           ` Charles 'Buck' Krasic
@ 2002-10-23 18:14                                             ` Davide Libenzi
  2002-10-23 18:32                                               ` Charles 'Buck' Krasic
  2002-10-23 20:36                                               ` async poll John Myers
  0 siblings, 2 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 18:14 UTC (permalink / raw)
  To: Charles 'Buck' Krasic; +Cc: linux-kernel, linux-aio

On 23 Oct 2002, Charles 'Buck' Krasic wrote:

>
> Dan Kegel <dank@kegel.com> writes:
>
> > Davide Libenzi <davidel@xmailserver.org> wrote:
>
> > I may be confused, but I suspect the async poll being proposed by
> > Ben only delivers absolute readiness, not changes in readiness.
>
> > I think epoll is worth having, even if Ben's AIO already handled
> > networking properly.
>
> > - Dan
>
> Can someone remind me why poll is needed in the AIO api at all?
>
> How would it be used?

Maybe my understanding of AIO on Linux is limited but how would you do
async accept/connect ? Will you be using std poll/select for that, and
then you'll switch to AIO for read/write requests ?



- Davide


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-23 18:14                                             ` Davide Libenzi
@ 2002-10-23 18:32                                               ` Charles 'Buck' Krasic
  2002-10-23 20:36                                               ` async poll John Myers
  1 sibling, 0 replies; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-23 18:32 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, linux-aio


I see.  Adding async accept/connect would seem to make more sense to me.

-- Buck

Davide Libenzi <davidel@xmailserver.org> writes:

> Maybe my understanding of AIO on Linux is limited but how would you do
> async accept/connect ? Will you be using std poll/select for that, and
> then you'll switch to AIO for read/write requests ?

> - Davide

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-23 17:39                                           ` Benjamin LaHaise
@ 2002-10-23 18:47                                             ` Davide Libenzi
  2002-10-23 21:18                                               ` Benjamin LaHaise
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 18:47 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-kernel, linux-aio

On Wed, 23 Oct 2002, Benjamin LaHaise wrote:

> On Wed, Oct 23, 2002 at 09:49:54AM -0700, Dan Kegel wrote:
> > Furthermore, epoll is nice because it delivers one-shot readiness change
> > notification (I used to think that was a drawback, but coding
> > nonblocking OpenSSL apps has convinced me otherwise).
> > I may be confused, but I suspect the async poll being proposed by
> > Ben only delivers absolute readiness, not changes in readiness.
> >
> > I think epoll is worth having, even if Ben's AIO already handled
> > networking properly.
>
> That depends on how it compares to async read/write, which hasn't
> been looked into yet.  The way the pipe code worked involved walking
> the page tables, which is still quite expensive for small data sizes.
> With the new code, the CPU's tlb will be used, which will make a big
> difference, especially for the case where only a single address space
> is in use on the system.

Ben, do read/write requests on sockets currently work at all? I
would like to test AIO on networking using my test HTTP server, and I was
thinking about using poll() for async accept and AIO for read/write. The
poll() should be pretty fast because there's only one fd in the set, and
the remaining code will use AIO for read/write. Might this work currently?



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* async poll
  2002-10-23 18:14                                             ` Davide Libenzi
  2002-10-23 18:32                                               ` Charles 'Buck' Krasic
@ 2002-10-23 20:36                                               ` John Myers
  2002-10-23 20:57                                                 ` Dan Kegel
  2002-10-23 21:13                                                 ` Charles 'Buck' Krasic
  1 sibling, 2 replies; 138+ messages in thread
From: John Myers @ 2002-10-23 20:36 UTC (permalink / raw)
  To: linux-aio; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 921 bytes --]



Davide Libenzi wrote:

>Maybe my understanding of AIO on Linux is limited but how would you do
>async accept/connect ? Will you be using std poll/select for that, and
>then you'll switch to AIO for read/write requests ?
>
If a connection is likely to be idle, one would want to use an async 
read poll instead of an async read in order to avoid having to allocate 
input buffers to idle connections.  (What one really wants is a variant 
of async read that allocates an input buffer to an fd at completion 
time, not submission time).

Sometimes one wants to use a library which only has a nonblocking 
interface, so when the library says WOULDBLOCK you have to do an async 
write poll.

Sometimes one wants to use a kernel interface (e.g. sendfile) that does 
not yet have an async equivalent.  Accept/connect are in this 
class--there should be nothing to prevent us from creating async 
versions of accept/connect.
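
For the async read poll case above, submission with the interface
discussed in this thread would look roughly like this (the poll events
ride in the iocb's aio_buf field, per the patch that started the
thread; ctx is an io context from io_setup(), conn a hypothetical
per-connection pointer, and the rest a sketch):

	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes     = fd;
	cb.aio_lio_opcode = IOCB_CMD_POLL;
	cb.aio_buf        = POLLIN;			/* events of interest */
	cb.aio_data       = (unsigned long)conn;	/* per-connection cookie */

	io_submit(ctx, 1, cbs);

	/* The completion hands the cookie back; only now does the
	 * application need to allocate an input buffer for conn.
	 */
	io_getevents(ctx, 1, 1, &ev, NULL);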



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 20:36                                               ` async poll John Myers
@ 2002-10-23 20:57                                                 ` Dan Kegel
  2002-10-23 21:23                                                   ` John Gardiner Myers
  2002-10-23 21:13                                                 ` Charles 'Buck' Krasic
  1 sibling, 1 reply; 138+ messages in thread
From: Dan Kegel @ 2002-10-23 20:57 UTC (permalink / raw)
  To: John Myers; +Cc: linux-aio, linux-kernel

John Myers wrote:
> Davide Libenzi wrote:
> 
>> Maybe my understanding of AIO on Linux is limited but how would you do
>> async accept/connect ? Will you be using std poll/select for that, and
>> then you'll switch to AIO for read/write requests ?
>>
> If a connection is likely to be idle, one would want to use an async 
> read poll instead of an async read in order to avoid having to allocate 
> input buffers to idle connections.  (What one really wants is a variant 
> of async read that allocates an input buffer to an fd at completion 
> time, not submission time).

In that situation, why not just add the fd to an epoll, and have the
epoll deliver events through Ben's interface?  That way you'd get
notified of changes in readability, and wouldn't have to issue
the read poll call over and over.

- Dan


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 20:36                                               ` async poll John Myers
  2002-10-23 20:57                                                 ` Dan Kegel
@ 2002-10-23 21:13                                                 ` Charles 'Buck' Krasic
  1 sibling, 0 replies; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-23 21:13 UTC (permalink / raw)
  To: John Myers; +Cc: linux-aio, linux-kernel



jgmyers@netscape.com (John Myers) writes:

> If a connection is likely to be idle, one would want to use an async
> read poll instead of an async read in order to avoid having to
> allocate input buffers to idle connections.  (What one really wants
> is a variant of async read that allocates an input buffer to an fd
> at completion time, not submission time).

Right.  This does make sense for a server with thousands of idle
connections.   You could have lots of unused memory in that case.

> Sometimes one wants to use a library which only has a nonblocking
> interface, so when the library says WOULDBLOCK you have to do an
> async write poll.

Right.  I can see this too.  This might be thorny for epoll though,
since it's entirely conceivable that such libraries would be expecting
level-triggered poll semantics.

> Sometimes one wants to use a kernel interface (e.g. sendfile) that
> does not yet have an async equivalent.  Accept/connect are in this
> class--there should be nothing to prevent us from creating async
> versions of accept/connect.

Right.  However in this case, I think using poll is a stop gap
solution.

It would be better to give accept, connect, and sendfile (and
sendfile64) native status in AIO.  Even if the implementations are not
going to be ready for a while, I think their spots in the API should
be reserved now.

That said, the first two points above convince me that there are valid
reasons why poll is also needed in AIO.

-- Buck


^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-23 18:47                                             ` Davide Libenzi
@ 2002-10-23 21:18                                               ` Benjamin LaHaise
  2002-10-23 21:35                                                 ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: Benjamin LaHaise @ 2002-10-23 21:18 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel, linux-aio

On Wed, Oct 23, 2002 at 11:47:33AM -0700, Davide Libenzi wrote:
> Ben, do read/write requests on sockets currently work at all? I
> would like to test AIO on networking using my test HTTP server, and I was
> thinking about using poll() for async accept and AIO for read/write. The
> poll() should be pretty fast because there's only one fd in the set, and
> the remaining code will use AIO for read/write. Might this work currently?

The socket async read/write code is not yet in the kernel.

		-ben
-- 
"Do you seek knowledge in time travel?"

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 20:57                                                 ` Dan Kegel
@ 2002-10-23 21:23                                                   ` John Gardiner Myers
  2002-10-23 21:51                                                     ` Davide Libenzi
  2002-10-23 22:24                                                     ` Dan Kegel
  0 siblings, 2 replies; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-23 21:23 UTC (permalink / raw)
  To: Dan Kegel; +Cc: linux-aio, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 346 bytes --]



Dan Kegel wrote:

> In that situation, why not just add the fd to an epoll, and have the
> epoll deliver events through Ben's interface?

Because you might need to use the aio_data facility of the iocb 
interface.  Because you might want to keep the kernel from 
simultaneously delivering two events for the same fd to two different 
threads.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-23 21:18                                               ` Benjamin LaHaise
@ 2002-10-23 21:35                                                 ` Davide Libenzi
  2002-10-23 21:39                                                   ` John Gardiner Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 21:35 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-kernel, linux-aio

On Wed, 23 Oct 2002, Benjamin LaHaise wrote:

> On Wed, Oct 23, 2002 at 11:47:33AM -0700, Davide Libenzi wrote:
> > Ben, do read/write requests on sockets currently work at all? I
> > would like to test AIO on networking using my test HTTP server, and I was
> > thinking about using poll() for async accept and AIO for read/write. The
> > poll() should be pretty fast because there's only one fd in the set, and
> > the remaining code will use AIO for read/write. Might this work currently?
>
> The socket async read/write code is not yet in the kernel.

Ok, this pretty much stops every attempt to test/compare AIO with sys_epoll ...
ETA ?



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-23 21:35                                                 ` Davide Libenzi
@ 2002-10-23 21:39                                                   ` John Gardiner Myers
  2002-10-23 21:54                                                     ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-23 21:39 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Benjamin LaHaise, linux-kernel, linux-aio

[-- Attachment #1: Type: text/plain, Size: 386 bytes --]



Davide Libenzi wrote:

>Ok, this pretty much stops every attempt to test/compare AIO with sys_epoll ...
>
It would be useful to compare async poll (the patch that started this 
thread) with sys_epoll.  sys_epoll is expected to perform better since 
it ignores multithreading issues and amortizes registration across 
multiple events, but it would be interesting to know by how much.



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 21:51                                                     ` Davide Libenzi
@ 2002-10-23 21:51                                                       ` bert hubert
  2002-10-23 22:10                                                         ` Davide Libenzi
  2002-10-23 21:54                                                       ` John Gardiner Myers
  1 sibling, 1 reply; 138+ messages in thread
From: bert hubert @ 2002-10-23 21:51 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: John Gardiner Myers, linux-aio, linux-kernel

On Wed, Oct 23, 2002 at 02:51:21PM -0700, Davide Libenzi wrote:

> Why would you want to have a single fd simultaneously handled by two
> different threads with all the locking issues that would arise ? I can
> understand loving threads but this seems to be too much :)

We in fact tried to do this, and for good reason. Our nameserver software gets
great benefit when two processes listen to the same socket on an SMP system.
In some cases, this means 70% more packets/second, which is close to the
theoretical maximum benefit.

We would heavily prefer to have two *threads* listening to the same socket
instead of to processes. The two processes do not share caching information
now because that expects to live in the same memory.

Right now, we can't do that because of very weird locking behaviour, which
is documented here: http://www.mysql.com/doc/en/Linux.html and leads to
250.000 context switches/second and dysmal peformance.

I expect NPTL to fix this situation and I would just love to be able to call
select() or poll() or recvfrom() on the same fd(s) from different threads.
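
A minimal sketch of the pattern being asked for, with two threads blocking
in recvfrom() on one shared UDP socket (error handling omitted; the port
number is arbitrary):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>

/* Each worker blocks in recvfrom() on the *same* UDP socket; the
 * kernel hands each datagram to exactly one of the sleeping threads. */
static void *worker(void *arg)
{
	int fd = *(int *)arg;
	char buf[512];

	for (;;) {
		struct sockaddr_in peer;
		socklen_t plen = sizeof(peer);
		ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
				     (struct sockaddr *)&peer, &plen);
		if (n < 0)
			break;
		/* ... parse query, send reply ... */
	}
	return NULL;
}

int main(void)
{
	struct sockaddr_in addr;
	pthread_t t1, t2;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(5300);	/* arbitrary test port */
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	pthread_create(&t1, NULL, worker, &fd);
	pthread_create(&t2, NULL, worker, &fd);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}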

Regards,

bert hubert

-- 
http://www.PowerDNS.com          Versatile DNS Software & Services
http://lartc.org           Linux Advanced Routing & Traffic Control HOWTO

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 21:23                                                   ` John Gardiner Myers
@ 2002-10-23 21:51                                                     ` Davide Libenzi
  2002-10-23 21:51                                                       ` bert hubert
  2002-10-23 21:54                                                       ` John Gardiner Myers
  2002-10-23 22:24                                                     ` Dan Kegel
  1 sibling, 2 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 21:51 UTC (permalink / raw)
  To: John Gardiner Myers; +Cc: linux-aio, linux-kernel

On Wed, 23 Oct 2002, John Gardiner Myers wrote:

> > In that situation, why not just add the fd to an epoll, and have the
> > epoll deliver events through Ben's interface?
>
> Because you might need to use the aio_data facility of the iocb
> interface.  Because you might want to keep the kernel from
> simultaneously delivering two events for the same fd to two different
> threads.

Why would you want to have a single fd simultaneously handled by two
different threads with all the locking issues that would arise ? I can
understand loving threads but this seems to be too much :)



- Davide




^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 21:51                                                     ` Davide Libenzi
  2002-10-23 21:51                                                       ` bert hubert
@ 2002-10-23 21:54                                                       ` John Gardiner Myers
  2002-10-23 22:22                                                         ` Davide Libenzi
  1 sibling, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-23 21:54 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-aio, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 384 bytes --]



Davide Libenzi wrote:

>Why would you want to have a single fd simultaneously handled by two
>different threads with all the locking issues that would arise ?
>
You would not want this to happen.  Thus you would want the poll
facility to prevent returning event N+1 until the thread that got
event N has indicated that it has finished handling the event.
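
A minimal sketch of that semantic, written against the epoll interface as
it later settled (EPOLLONESHOT and EPOLL_CTL_MOD here are assumptions
about a future API, not part of the patches in this thread):

#include <sys/epoll.h>

/* With a one-shot registration the kernel disarms the fd after
 * delivering one event, so no other thread can be handed event N+1
 * until the thread that took event N re-arms the fd. */
void handle_one_event(int epfd)
{
	struct epoll_event ev;

	if (epoll_wait(epfd, &ev, 1, -1) == 1) {
		int fd = ev.data.fd;

		/* ... process event N ... */

		/* done with event N: re-arm so event N+1 can be delivered */
		ev.events = EPOLLIN | EPOLLONESHOT;
		ev.data.fd = fd;
		epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
	}
}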



[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3711 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: epoll (was Re: [PATCH] async poll for 2.5)
  2002-10-23 21:39                                                   ` John Gardiner Myers
@ 2002-10-23 21:54                                                     ` Davide Libenzi
  0 siblings, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 21:54 UTC (permalink / raw)
  To: John Gardiner Myers; +Cc: linux-kernel, linux-aio

On Wed, 23 Oct 2002, John Gardiner Myers wrote:

>
>
> Davide Libenzi wrote:
>
> >Ok, this pretty much stops every attempt to test/compare AIO with sys_epoll ...
> >
> It would be useful to compare async poll (the patch that started this
> thread) with sys_epoll.  sys_epoll is expected to perform better since
> it ignores multithreading issues and amortizes registration across
> multiple events, but it would be interesting to know by how much.

I think this can be done; David Stevens (IBM) has already offered me the
patch to test.



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 21:51                                                       ` bert hubert
@ 2002-10-23 22:10                                                         ` Davide Libenzi
  0 siblings, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 22:10 UTC (permalink / raw)
  To: bert hubert; +Cc: linux-aio, linux-kernel

On Wed, 23 Oct 2002, bert hubert wrote:

> On Wed, Oct 23, 2002 at 02:51:21PM -0700, Davide Libenzi wrote:
>
> > Why would you want to have a single fd simultaneously handled by two
> > different threads with all the locking issues that would arise ? I can
> > understand loving threads but this seems to be too much :)
>
> We in fact tried to do this, and for good reason. Our nameserver software
> gets a great benefit when two processes listen on the same socket on an SMP
> system. In some cases, this means 70% more packets/second, which is close
> to the theoretical maximum benefit.
>
> We would heavily prefer to have two *threads* listening on the same socket
> instead of two processes. The two processes do not currently share caching
> information, because the cache expects to live in the same memory.
>
> Right now, we can't do that because of very weird locking behaviour, which
> is documented here: http://www.mysql.com/doc/en/Linux.html and leads to
> 250,000 context switches/second and dismal performance.
>
> I expect NPTL to fix this situation and I would just love to be able to call
> select() or poll() or recvfrom() on the same fd(s) from different threads.

I feel this topic is going somewhere else ;) ... but if you need your
processes to share memory, and hence would like them to be threads, you're
very likely going to need some form of synchronization mechanism to
access the shared area, aren't you? So, isn't it better to have two
separate tasks that can freely access their memory w/out locks, instead of
N threads?



- Davide




^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 21:54                                                       ` John Gardiner Myers
@ 2002-10-23 22:22                                                         ` Davide Libenzi
  2002-10-23 22:29                                                           ` John Gardiner Myers
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 22:22 UTC (permalink / raw)
  To: John Gardiner Myers; +Cc: linux-aio, linux-kernel

On Wed, 23 Oct 2002, John Gardiner Myers wrote:

> Davide Libenzi wrote:
>
> >Why would you want to have a single fd simultaneously handled by two
> >different threads with all the locking issues that would arise ?
> >
> You would not want this to happen.  Thus you would want the poll
> facility to prevent returning event N+1 until the thread that got
> event N has indicated that it has finished handling the event.

We're again looping around threads and fds being bounced between threads.
It seems that we have very different opinions about the use of threads and
how server applications should be designed. IMHO, if you're thinking of
bouncing fds among threads for their handling, you're doing something
somehow wrong, but this is just my opinion ...



- Davide





^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 21:23                                                   ` John Gardiner Myers
  2002-10-23 21:51                                                     ` Davide Libenzi
@ 2002-10-23 22:24                                                     ` Dan Kegel
  2002-10-23 22:30                                                       ` Davide Libenzi
  1 sibling, 1 reply; 138+ messages in thread
From: Dan Kegel @ 2002-10-23 22:24 UTC (permalink / raw)
  To: John Gardiner Myers; +Cc: linux-aio, linux-kernel

John Gardiner Myers wrote:
> 
> Dan Kegel wrote:
> 
>> In that situation, why not just add the fd to an epoll, and have the
>> epoll deliver events through Ben's interface?
> 
> 
> Because you might need to use the aio_data facility of the iocb 
> interface.

Presumably epoll_add could be enhanced to let the user specify a user
data word.

> Because you might want to keep the kernel from
> simultaneously delivering two events for the same fd to two different
> threads.

You might want to use aio_write() for writes, and read() for reads,
in which case you could tell epoll you're not interested in
write readiness events.  Then there'd be no double notification
for reads or writes on the same fd.
It's a bit contrived, but I can imagine it being useful.
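
For what it's worth, both points map onto a registration call that carries
an interest mask and a user data word; a sketch against the epoll interface
as it eventually stabilized (struct epoll_event and its data field are
assumptions about that later API, not the sys_epoll patch discussed here):

#include <sys/epoll.h>

/* Register for read readiness only; writes are left to aio_write(),
 * so epoll never reports write readiness for this fd.  The data
 * pointer comes back verbatim with each delivered event. */
int watch_read_only(int epfd, int fd, void *conn_state)
{
	struct epoll_event ev;

	ev.events = EPOLLIN;		/* interest mask: no EPOLLOUT */
	ev.data.ptr = conn_state;	/* user data word */
	return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}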

- Dan




^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 22:22                                                         ` Davide Libenzi
@ 2002-10-23 22:29                                                           ` John Gardiner Myers
  2002-10-23 22:50                                                             ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: John Gardiner Myers @ 2002-10-23 22:29 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-aio, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1042 bytes --]



Davide Libenzi wrote:

>We're again looping around threads and fds being bounced between threads.
>It seems that we have very different opinions about the use of threads and
>how server applications should be designed. IMHO, if you're thinking of
>bouncing fds among threads for their handling, you're doing something
>somehow wrong, but this is just my opinion ...
>
Again, it comes down to whether or not one has the luxury of being able 
to (re)write one's application and supporting libraries from scratch. 
 If one can ensure that the code for handling an event will never block 
for any significant amount of time, then single-threaded process-per-CPU 
will most likely perform best.  If, on the other hand, the code for 
handling an event can occasionally block, then one needs a thread pool 
in order to have reasonable latency.

A thread pool based server that is released will trivially outperform a 
single threaded server that needs a few more years development to 
convert all the blocking calls to use the event subsystem.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 3711 bytes --]

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 22:24                                                     ` Dan Kegel
@ 2002-10-23 22:30                                                       ` Davide Libenzi
  2002-10-23 22:53                                                         ` Davide Libenzi
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 22:30 UTC (permalink / raw)
  To: Dan Kegel; +Cc: linux-aio, linux-kernel

On Wed, 23 Oct 2002, Dan Kegel wrote:

> John Gardiner Myers wrote:
> >
> > Dan Kegel wrote:
> >
> >> In that situation, why not just add the fd to an epoll, and have the
> >> epoll deliver events through Ben's interface?
> >
> >
> > Because you might need to use the aio_data facility of the iocb
> > interface.
>
> Presumably epoll_add could be enhanced to let the user specify a user
> data word.

It'll take 2 minutes to do such a thing. Actually, the pollfd struct
contains the "events" field, which is wasted when returning events and
could be used for something more useful.
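
For reference, the layout in question:

struct pollfd {
	int   fd;       /* file descriptor */
	short events;   /* requested events: input only, dead weight on return */
	short revents;  /* returned events: the only output field */
};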



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 22:29                                                           ` John Gardiner Myers
@ 2002-10-23 22:50                                                             ` Davide Libenzi
  2002-10-24  7:32                                                               ` Eduardo Pérez
  0 siblings, 1 reply; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 22:50 UTC (permalink / raw)
  To: John Gardiner Myers; +Cc: linux-aio, linux-kernel

On Wed, 23 Oct 2002, John Gardiner Myers wrote:

> Again, it comes down to whether or not one has the luxury of being able
> to (re)write one's application and supporting libraries from scratch.
>  If one can ensure that the code for handling an event will never block
> for any significant amount of time, then single-threaded process-per-CPU
> will most likely perform best.  If, on the other hand, the code for
> handling an event can occasionally block, then one needs a thread pool
> in order to have reasonable latency.
>
> A thread pool based server that is released will trivially outperform a
> single threaded server that needs a few more years development to
> convert all the blocking calls to use the event subsystem.

I beg your pardon, but what is an application possibly waiting for ?
Couldn't it be waiting on something somehow identifiable as a file ? So,
supposing that an interface like sys_epoll ( or AIO, or whatever )
delivers you events for all the file descriptors your application is
waiting on, why would you need threads ? In fact, I personally find that
coroutines make the threaded->single-task transition very easy. Your virtual
threads share everything by default w/out having a single lock inside
your application.
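
A minimal sketch of that coroutine style using the ucontext primitives
(the names are illustrative): two flows of control cooperatively yield
inside one task, so the shared counter needs no lock.

#include <stdio.h>
#include <ucontext.h>

static ucontext_t main_ctx, co_ctx;
static int shared_counter;	/* shared by default, no lock needed */

static void coroutine(void)
{
	int i;

	for (i = 0; i < 3; i++) {
		shared_counter++;			/* free access to shared state */
		swapcontext(&co_ctx, &main_ctx);	/* yield back */
	}
}

int main(void)
{
	static char stack[16 * 1024];
	int i;

	getcontext(&co_ctx);
	co_ctx.uc_stack.ss_sp = stack;
	co_ctx.uc_stack.ss_size = sizeof(stack);
	co_ctx.uc_link = &main_ctx;
	makecontext(&co_ctx, coroutine, 0);

	for (i = 0; i < 3; i++) {
		swapcontext(&main_ctx, &co_ctx);	/* resume coroutine */
		printf("counter = %d\n", shared_counter);
	}
	return 0;
}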



- Davide




^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 22:30                                                       ` Davide Libenzi
@ 2002-10-23 22:53                                                         ` Davide Libenzi
  0 siblings, 0 replies; 138+ messages in thread
From: Davide Libenzi @ 2002-10-23 22:53 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-aio, linux-kernel

On Wed, 23 Oct 2002, Davide Libenzi wrote:

> It'll take 2 minutes to do such a thing. Actually, the pollfd struct
> contains the "events" field, which is wasted when returning events and
> could be used for something more useful.

Also, I was just wondering if this might be useful:

asmlinkage int sys_epoll_wait(int epfd, int minevents, struct pollfd **events, int timeout);

Where "minevents" represents the minimum number of events returned by
sys_epoll ...
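
For comparison, the wait call that eventually went in takes a maximum
rather than a minimum and fills a caller-supplied array. A "wait for at
least N events" semantic can be layered on top of it in userspace; a
sketch, where epoll_wait_min() is a made-up helper:

#include <sys/epoll.h>

/* Sketch: emulate a "minevents" semantic on top of
 *   int epoll_wait(int epfd, struct epoll_event *events,
 *                  int maxevents, int timeout);
 * by looping until at least minevents events have arrived. */
int epoll_wait_min(int epfd, struct epoll_event *evs,
		   int maxevents, int minevents)
{
	int n = 0;

	while (n < minevents) {
		int r = epoll_wait(epfd, evs + n, maxevents - n, -1);
		if (r < 0)
			return r;	/* error, e.g. EINTR */
		n += r;
	}
	return n;
}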



- Davide



^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-23 22:50                                                             ` Davide Libenzi
@ 2002-10-24  7:32                                                               ` Eduardo Pérez
  2002-10-24 15:05                                                                 ` Charles 'Buck' Krasic
  0 siblings, 1 reply; 138+ messages in thread
From: Eduardo Pérez @ 2002-10-24  7:32 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-aio, linux-kernel

On 2002-10-23 15:50:43 -0700, Davide Libenzi wrote:
> On Wed, 23 Oct 2002, John Gardiner Myers wrote:
> > Again, it comes down to whether or not one has the luxury of being able
> > to (re)write one's application and supporting libraries from scratch.
> >  If one can ensure that the code for handling an event will never block
> > for any significant amount of time, then single-threaded process-per-CPU
> > will most likely perform best.  If, on the other hand, the code for
> > handling an event can occasionally block, then one needs a thread pool
> > in order to have reasonable latency.
> >
> > A thread pool based server that is released will trivially outperform a
> > single threaded server that needs a few more years development to
> > convert all the blocking calls to use the event subsystem.
> 
> I beg your pardon, but what is an application possibly waiting for ?
> Couldn't it be waiting on something somehow identifiable as a file ? So,
> supposing that an interface like sys_epoll ( or AIO, or whatever )
> delivers you events for all the file descriptors your application is
> waiting on, why would you need threads ? In fact, I personally find that
> coroutines make the threaded->single-task transition very easy. Your virtual
> threads share everything by default w/out having a single lock inside
> your application.

The only uses of threads in a full aio application are task independence
(or interactivity) and process context separation.

Example from the GUI side:
Suppose your web (http) client is fully ported to aio (thus only one
thread). If you have two windows and one window receives a big,
complicated html page that needs much CPU time to render, this window
can block the other one. If you have a thread for each window, once the
html parser has used up its timeslice, the other window can continue
parsing or displaying its (tiny html) page.
(In fact you should use two (or more) threads per window, as html parsing
shouldn't block widget redrawing (like menus and toolbars).)

^ permalink raw reply	[flat|nested] 138+ messages in thread

* Re: async poll
  2002-10-24  7:32                                                               ` Eduardo Pérez
@ 2002-10-24 15:05                                                                 ` Charles 'Buck' Krasic
  0 siblings, 0 replies; 138+ messages in thread
From: Charles 'Buck' Krasic @ 2002-10-24 15:05 UTC (permalink / raw)
  To: Eduardo Pérez; +Cc: linux-aio, linux-kernel

Eduardo Pérez <100018135@alumnos.uc3m.es> writes:

> The only uses of threads in a full aio application are task
> independence (or interactivity) and process context separation.

> Example from the GUI side: Suppose your web (http) client is fully
> ported to aio (thus only one thread). If you have two windows and
> one window receives a big, complicated html page that needs much CPU
> time to render, this window can block the other one. If you have a
> thread for each window, once the html parser has used up its
> timeslice, the other window can continue parsing or displaying its
> (tiny html) page.  (In fact you should use two (or more) threads per
> window, as html parsing shouldn't block widget redrawing (like menus
> and toolbars).)

It's not strictly necessary to use threads here.  At least not with
gtk+.  I don't know whether other toolkits are the same.

Anyway, with gtk+ you can install "idle callbacks".  These are
functions to be called whenever the GUI code has nothing to do.  If
you can transform your html parser(s) into idle function(s), then it
won't necessarily disrupt the GUI.  You just have to make sure that
the parser yields periodically (returns to gtk+).  Since parsers are
loop oriented, this shouldn't be too hard: you just make each call to
the function do one iteration of the top-level loop, as in the sketch
below.
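
A sketch of that shape against the glib main loop; parse_state and
parse_one_step() are made up for illustration, while g_idle_add() is the
real glib registration call:

#include <glib.h>

/* Illustrative parser state: pretend the document has 1000 chunks. */
struct parse_state {
	int chunks_left;
};

/* One iteration of the parser's top-level loop (made up for the sketch). */
static gboolean parse_one_step(struct parse_state *ps)
{
	/* ... parse one chunk of html ... */
	return --ps->chunks_left > 0;
}

/* Idle callback: do a single iteration, then yield back to gtk+ so
 * widget redrawing is never blocked for long. */
static gboolean parse_idle(gpointer data)
{
	struct parse_state *ps = data;

	if (parse_one_step(ps))
		return TRUE;	/* more to do: run again when idle */
	return FALSE;		/* finished: remove the idle handler */
}

/* registration, somewhere in the window setup code:
 *	g_idle_add(parse_idle, ps);
 */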

My traffic generator, mxtraf, works this way.  It has a software
oscilloscope (gscope) that graphically displays signals in real time.
At the same time it does lots of IO work to generate synthetic network
traffic.  mxtraf is single threaded.

-- Buck

--
> To unsubscribe, send a message with 'unsubscribe linux-aio' in
> the body to majordomo@kvack.org.  For more info on Linux AIO,
> see: http://www.kvack.org/aio/

^ permalink raw reply	[flat|nested] 138+ messages in thread

end of thread, other threads:[~2002-10-24 14:59 UTC | newest]

Thread overview: 138+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-10-14 22:36 [PATCH] async poll for 2.5 Shailabh Nagar
2002-10-14 22:54 ` John Myers
2002-10-15 15:05 ` Benjamin LaHaise
2002-10-15 17:06   ` Dan Kegel
2002-10-15 17:03     ` Benjamin LaHaise
2002-10-15 17:18       ` Dan Kegel
2002-10-16  2:11         ` Lincoln Dale
2002-10-15 18:09     ` Shailabh Nagar
2002-10-15 18:53       ` Dan Kegel
2002-10-15 18:57         ` Benjamin LaHaise
2002-10-15 20:25           ` John Gardiner Myers
2002-10-15 21:09             ` Dan Kegel
2002-10-15 21:50               ` John Myers
2002-10-15 22:33                 ` Davide Libenzi
2002-10-15 22:56                   ` John Gardiner Myers
2002-10-15 23:23                     ` Davide Libenzi
2002-10-16 19:16                       ` John Myers
2002-10-15 21:11             ` Davide Libenzi
2002-10-15 22:01               ` John Gardiner Myers
2002-10-15 22:27                 ` Davide Libenzi
2002-10-15 22:36                   ` John Gardiner Myers
2002-10-15 22:41                     ` Benjamin LaHaise
2002-10-15 23:26                       ` John Gardiner Myers
2002-10-15 23:05                     ` Davide Libenzi
2002-10-15 23:33                       ` John Gardiner Myers
2002-10-16  0:05                         ` Davide Libenzi
2002-10-16  0:15                           ` John Myers
2002-10-16 14:25                             ` Davide Libenzi
2002-10-16 18:15                               ` John Gardiner Myers
2002-10-16 19:20                                 ` Davide Libenzi
2002-10-16 23:31                                   ` epoll (was Re: [PATCH] async poll for 2.5) John Gardiner Myers
2002-10-16 23:51                                     ` Davide Libenzi
2002-10-17 18:06                                       ` John Gardiner Myers
2002-10-17 18:33                                         ` Davide Libenzi
2002-10-18 19:02                                           ` John Gardiner Myers
2002-10-18 19:52                                             ` Davide Libenzi
2002-10-19  0:55                                               ` John Myers
2002-10-19  5:40                                                 ` Davide Libenzi
2002-10-19  6:59                                                 ` Mark Mielke
2002-10-19 17:26                                                   ` Davide Libenzi
2002-10-19 17:48                                                   ` Dan Kegel
2002-10-19 18:52                                                     ` Charles 'Buck' Krasic
2002-10-19 20:18                                                       ` Charles 'Buck' Krasic
2002-10-19 21:08                                                         ` Dan Kegel
2002-10-22 19:35                                                     ` John Gardiner Myers
2002-10-22 20:06                                                       ` Davide Libenzi
2002-10-22 21:54                                                         ` Erich Nahum
2002-10-22 22:17                                                           ` Dan Kegel
2002-10-22 22:25                                                           ` Davide Libenzi
2002-10-18 21:01                                             ` Charles 'Buck' Krasic
2002-10-18 21:33                                               ` Davide Libenzi
2002-10-19  1:05                                               ` John Myers
2002-10-19  1:27                                                 ` Tervel Atanassov
2002-10-19 18:52                                                   ` John G. Myers
2002-10-19  4:07                                                 ` Charles 'Buck' Krasic
2002-10-16 20:06                                 ` [PATCH] async poll for 2.5 Mark Mielke
2002-10-16 23:48                                   ` epoll (was Re: [PATCH] async poll for 2.5) John Gardiner Myers
2002-10-17  0:23                                     ` Davide Libenzi
2002-10-17 17:45                                       ` John Myers
2002-10-16  2:45                         ` [PATCH] async poll for 2.5 Charles 'Buck' Krasic
2002-10-16 14:28                           ` Davide Libenzi
2002-10-17 18:47                             ` Charles 'Buck' Krasic
2002-10-17 19:20                               ` Davide Libenzi
2002-10-18  3:30                               ` Dan Kegel
2002-10-16 18:29                           ` John Gardiner Myers
2002-10-16 20:39                             ` Charles 'Buck' Krasic
2002-10-17 17:59                               ` epoll (was Re: [PATCH] async poll for 2.5) John Gardiner Myers
2002-10-21 16:58                             ` [PATCH] async poll for 2.5 Alan Cox
2002-10-21 16:50                               ` Benjamin LaHaise
2002-10-16 19:59                     ` Dan Kegel
2002-10-16 20:03                 ` Dan Kegel
2002-10-17 17:43                   ` epoll (was Re: [PATCH] async poll for 2.5) John Myers
2002-10-18 17:00                     ` Mark Mielke
2002-10-18 17:28                       ` Dan Kegel
2002-10-18 17:41                         ` Davide Libenzi
2002-10-18 18:55                           ` Mark Mielke
2002-10-18 19:16                             ` Davide Libenzi
2002-10-19  6:56                               ` Mark Mielke
2002-10-19 16:10                                 ` Charles 'Buck' Krasic
2002-10-22 17:22                                   ` Mark Mielke
2002-10-22 17:46                                     ` Dan Kegel
2002-10-22 17:47                                     ` Davide Libenzi
2002-10-22 18:13                                       ` Alan Cox
2002-10-22 18:18                                         ` Davide Libenzi
2002-10-22 18:37                                           ` Benjamin LaHaise
2002-10-22 19:22                                             ` John Gardiner Myers
2002-10-22 19:28                                               ` Benjamin LaHaise
2002-10-22 19:50                                                 ` John Gardiner Myers
2002-10-22 20:00                                                   ` Benjamin LaHaise
2002-10-22 20:23                                                     ` async poll John Myers
2002-10-23 11:10                                                     ` Latest aio code (was Re: [PATCH] async poll for 2.5) Suparna Bhattacharya
2002-10-22 19:49                                             ` epoll " Davide Libenzi
2002-10-22 18:42                                     ` Charles 'Buck' Krasic
2002-10-22 19:35                                       ` Davide Libenzi
2002-10-23 16:49                                         ` Dan Kegel
2002-10-23 17:39                                           ` Benjamin LaHaise
2002-10-23 18:47                                             ` Davide Libenzi
2002-10-23 21:18                                               ` Benjamin LaHaise
2002-10-23 21:35                                                 ` Davide Libenzi
2002-10-23 21:39                                                   ` John Gardiner Myers
2002-10-23 21:54                                                     ` Davide Libenzi
2002-10-23 17:49                                           ` Charles 'Buck' Krasic
2002-10-23 18:14                                             ` Davide Libenzi
2002-10-23 18:32                                               ` Charles 'Buck' Krasic
2002-10-23 20:36                                               ` async poll John Myers
2002-10-23 20:57                                                 ` Dan Kegel
2002-10-23 21:23                                                   ` John Gardiner Myers
2002-10-23 21:51                                                     ` Davide Libenzi
2002-10-23 21:51                                                       ` bert hubert
2002-10-23 22:10                                                         ` Davide Libenzi
2002-10-23 21:54                                                       ` John Gardiner Myers
2002-10-23 22:22                                                         ` Davide Libenzi
2002-10-23 22:29                                                           ` John Gardiner Myers
2002-10-23 22:50                                                             ` Davide Libenzi
2002-10-24  7:32                                                               ` Eduardo Pérez
2002-10-24 15:05                                                                 ` Charles 'Buck' Krasic
2002-10-23 22:24                                                     ` Dan Kegel
2002-10-23 22:30                                                       ` Davide Libenzi
2002-10-23 22:53                                                         ` Davide Libenzi
2002-10-23 21:13                                                 ` Charles 'Buck' Krasic
2002-10-19 17:19                                 ` epoll (was Re: [PATCH] async poll for 2.5) Davide Libenzi
2002-10-18 18:55                       ` Chris Friesen
2002-10-18 19:00                         ` Mark Mielke
2002-10-15 17:38   ` [PATCH] async poll for 2.5 Shailabh Nagar
2002-10-15 17:50     ` Benjamin LaHaise
2002-10-15 18:16       ` Davide Libenzi
2002-10-15 18:18         ` Shailabh Nagar
2002-10-15 19:00           ` Davide Libenzi
2002-10-15 19:02             ` Benjamin LaHaise
2002-10-15 18:59               ` Shailabh Nagar
2002-10-15 19:16               ` Davide Libenzi
2002-10-15 19:12                 ` Benjamin LaHaise
2002-10-15 19:31                   ` Davide Libenzi
2002-10-15 19:38                     ` Dan Kegel
2002-10-15 19:55                       ` Davide Libenzi
2002-10-15 20:36                   ` John Gardiner Myers
2002-10-15 20:39                     ` Benjamin LaHaise
2002-10-15 19:02           ` Davide Libenzi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).