linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC,PATCH] use rcu for fasync_lock
@ 2003-12-20 18:20 Manfred Spraul
  2003-12-20 21:10 ` [Lse-tech] " Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Manfred Spraul @ 2003-12-20 18:20 UTC (permalink / raw)
  To: lse-tech, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 931 bytes --]

Hi,

kill_fasync and fasync_helper were intended for mice and similar, rare 
users, thus they use a simple rwlock for the locking. This is not true 
anymore: e.g. every pipe read and write operation calls kill_fasync, 
which must acquire the rwlock before handling the fasync list.
What about switching to rcu? I did a reaim run on a 4-way pIII with STP, 
and it reduced the time within kill_fasync by 80%:

diffprofile reaim_End_stock reaim_End_rcu          
     21166     1.2% default_idle
     18882     0.9% total
       290    12.8% page_address
       269    23.5% group_send_sig_info
       259    41.1% do_brk
       244     6.3% current_kernel_time
[ delta < 200: skipped]
      -205   -16.1% get_signal_to_deliver
      -240    -3.7% page_add_rmap
      -364    -4.7% __might_sleep
      -369    -8.4% page_remove_rmap
      -975   -81.2% kill_fasync

What do you think? Patch against 2.6.0 is attached.

--
    Manfred


[-- Attachment #2: patch-fasync-rcu --]
[-- Type: text/plain, Size: 2890 bytes --]

--- 2.6/fs/fcntl.c	2003-12-04 19:44:38.000000000 +0100
+++ build-2.6/fs/fcntl.c	2003-12-20 10:56:23.344256035 +0100
@@ -537,9 +537,19 @@
 	return ret;
 }
 
-static rwlock_t fasync_lock = RW_LOCK_UNLOCKED;
+static spinlock_t fasync_lock = SPIN_LOCK_UNLOCKED;
 static kmem_cache_t *fasync_cache;
 
+struct fasync_rcu_struct {
+	struct fasync_struct data;
+	struct rcu_head rcu;
+};
+
+static void fasync_free(void *data)
+{
+	kmem_cache_free(fasync_cache, data);
+}
+
 /*
  * fasync_helper() is used by some character device drivers (mainly mice)
  * to set up the fasync queue. It returns negative on error, 0 if it did
@@ -548,7 +558,7 @@
 int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
 {
 	struct fasync_struct *fa, **fp;
-	struct fasync_struct *new = NULL;
+	struct fasync_rcu_struct *new = NULL;
 	int result = 0;
 
 	if (on) {
@@ -556,15 +566,23 @@
 		if (!new)
 			return -ENOMEM;
 	}
-	write_lock_irq(&fasync_lock);
+	spin_lock(&fasync_lock);
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file == filp) {
 			if(on) {
+				/* RCU violation:
+				 * We are modifying a struct that's visible by
+				 * readers. If there is a fasync notification
+				 * right now, then it could go to either the
+				 * old or the new fd. Shouldn't matter.
+				 * 	Manfred <manfred@colorfullife.com>
+				 */
 				fa->fa_fd = fd;
 				kmem_cache_free(fasync_cache, new);
 			} else {
 				*fp = fa->fa_next;
-				kmem_cache_free(fasync_cache, fa);
+				new = container_of(fa, struct fasync_rcu_struct, data);
+				call_rcu(&new->rcu, fasync_free, new);
 				result = 1;
 			}
 			goto out;
@@ -572,15 +590,16 @@
 	}
 
 	if (on) {
-		new->magic = FASYNC_MAGIC;
-		new->fa_file = filp;
-		new->fa_fd = fd;
-		new->fa_next = *fapp;
-		*fapp = new;
+		new->data.magic = FASYNC_MAGIC;
+		new->data.fa_file = filp;
+		new->data.fa_fd = fd;
+		new->data.fa_next = *fapp;
+		smp_wmb();
+		*fapp = &new->data;
 		result = 1;
 	}
 out:
-	write_unlock_irq(&fasync_lock);
+	spin_unlock(&fasync_lock);
 	return result;
 }
 
@@ -590,7 +609,8 @@
 {
 	while (fa) {
 		struct fown_struct * fown;
-		if (fa->magic != FASYNC_MAGIC) {
+		read_barrier_depends();
+		if (unlikely(fa->magic != FASYNC_MAGIC)) {
 			printk(KERN_ERR "kill_fasync: bad magic number in "
 			       "fasync_struct!\n");
 			return;
@@ -609,9 +629,9 @@
 
 void kill_fasync(struct fasync_struct **fp, int sig, int band)
 {
-	read_lock(&fasync_lock);
+	rcu_read_lock();
 	__kill_fasync(*fp, sig, band);
-	read_unlock(&fasync_lock);
+	rcu_read_unlock();
 }
 
 EXPORT_SYMBOL(kill_fasync);
@@ -619,7 +639,7 @@
 static int __init fasync_init(void)
 {
 	fasync_cache = kmem_cache_create("fasync_cache",
-		sizeof(struct fasync_struct), 0, 0, NULL, NULL);
+		sizeof(struct fasync_rcu_struct), 0, 0, NULL, NULL);
 	if (!fasync_cache)
 		panic("cannot create fasync slab cache");
 	return 0;

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Lse-tech] [RFC,PATCH] use rcu for fasync_lock
  2003-12-20 18:20 [RFC,PATCH] use rcu for fasync_lock Manfred Spraul
@ 2003-12-20 21:10 ` Stephen Hemminger
  2003-12-20 21:35   ` Manfred Spraul
  2003-12-21 11:36 ` Jamie Lokier
  2003-12-21 18:38 ` OGAWA Hirofumi
  2 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2003-12-20 21:10 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: lse-tech, linux-kernel

Manfred Spraul wrote:

> Hi,
>
> kill_fasync and fasync_helper were intended for mice and similar, rare 
> users, thus it uses a simple rwlock for the locking. This is not true 
> anymore: e.g. every pipe read and write operation calls kill_fasync, 
> which must acquire the rwlock before handling the fasync list.
> What about switching to rcu? I did a reaim run on a 4-way pIII with 
> STP, and it reduced the time within kill_fasync by 80%:
>
> diffprofile reaim_End_stock reaim_End_rcu              21166     1.2% 
> default_idle
>     18882     0.9% total
>       290    12.8% page_address
>       269    23.5% group_send_sig_info
>       259    41.1% do_brk
>       244     6.3% current_kernel_time
> [ delta < 200: skipped]
>      -205   -16.1% get_signal_to_deliver
>      -240    -3.7% page_add_rmap
>      -364    -4.7% __might_sleep
>      -369    -8.4% page_remove_rmap
>      -975   -81.2% kill_fasync
>
> What do you think? Patch against 2.6.0 is attached.
>
> -- 
>    Manfred
>
>------------------------------------------------------------------------
>
>--- 2.6/fs/fcntl.c	2003-12-04 19:44:38.000000000 +0100
>+++ build-2.6/fs/fcntl.c	2003-12-20 10:56:23.344256035 +0100
>@@ -537,9 +537,19 @@
> 	return ret;
> }
> 
>-static rwlock_t fasync_lock = RW_LOCK_UNLOCKED;
>+static spinlock_t fasync_lock = SPIN_LOCK_UNLOCKED;
> static kmem_cache_t *fasync_cache;
> 
>+struct fasync_rcu_struct {
>+	struct fasync_struct data;
>+	struct rcu_head rcu;
>+};
>  
>
Why do needless wrapping of the existing structure? Just add an rcu element 
to it!



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Lse-tech] [RFC,PATCH] use rcu for fasync_lock
  2003-12-20 21:10 ` [Lse-tech] " Stephen Hemminger
@ 2003-12-20 21:35   ` Manfred Spraul
  0 siblings, 0 replies; 28+ messages in thread
From: Manfred Spraul @ 2003-12-20 21:35 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: lse-tech, linux-kernel

Stephen Hemminger wrote:

>>
>> +struct fasync_rcu_struct {
>> +    struct fasync_struct data;
>> +    struct rcu_head rcu;
>> +};
>>  
>>
> Why do needless wrapping of existing structure? Just add and rcu 
> element to it!
>
There are two independent users of fasync_struct
- networking does its own locking and allocation and uses __kill_fasync 
directly.
- everyone else uses fasync_helper and calls kill_fasync, with the 
locking logic in fcntl.c.

I didn't convert the network code, thus I couldn't add the rcu member 
into fasync_struct.

--
    Manfred


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-20 18:20 [RFC,PATCH] use rcu for fasync_lock Manfred Spraul
  2003-12-20 21:10 ` [Lse-tech] " Stephen Hemminger
@ 2003-12-21 11:36 ` Jamie Lokier
  2003-12-21 12:40   ` Manfred Spraul
  2003-12-21 18:38 ` OGAWA Hirofumi
  2 siblings, 1 reply; 28+ messages in thread
From: Jamie Lokier @ 2003-12-21 11:36 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: lse-tech, linux-kernel

Manfred Spraul wrote:
> What about switching to rcu?

What about killing fasync_helper altogether and using the method that
epoll uses to register "listeners" which send a signal when the poll
state of a device changes?

That would trim off code all over the place, make the fast paths a
little bit faster (in the case that there aren't any listeners), and
most importantly make SIGIO reliable for every kind of file descriptor,
instead of the pot luck you get now.

Just an idea :)

-- Jamie

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 11:36 ` Jamie Lokier
@ 2003-12-21 12:40   ` Manfred Spraul
  2003-12-21 14:14     ` Jamie Lokier
  2003-12-21 15:14     ` Davide Libenzi
  0 siblings, 2 replies; 28+ messages in thread
From: Manfred Spraul @ 2003-12-21 12:40 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: lse-tech, linux-kernel

Jamie Lokier wrote:

>Manfred Spraul wrote:
>  
>
>>What about switching to rcu?
>>    
>>
>
>What about killing fasync_helper altogether and using the method that
>epoll uses to register "listeners" which send a signal when the poll
>state of a device changes?
>
I think it would be a step in the wrong direction: poll should go away 
from a simple wake-up to an interface that transfers the band info 
(POLL_IN, POLL_OUT, etc). Right now at least two passes over the f_poll 
functions are necessary, because the info which event actually triggered 
is lost. kill_fasync transfers the band info, thus I don't want to 
remove it.

>
>That would trim off code all over the place, make the fast paths a
>little bit faster (in the case that there aren't any listeners), and
>most importantly make SIGIO reliable for every kind of file descriptor,
>instead of the pot luck you get now.
>
>Just an idea :)
>
It's a good idea, but requires lots of changes - perhaps it will be 
necessary to change the pollwait and f_poll prototypes.

--
    Manfred


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 12:40   ` Manfred Spraul
@ 2003-12-21 14:14     ` Jamie Lokier
  2003-12-21 14:59       ` Manfred Spraul
  2004-01-02 21:15       ` Bill Davidsen
  2003-12-21 15:14     ` Davide Libenzi
  1 sibling, 2 replies; 28+ messages in thread
From: Jamie Lokier @ 2003-12-21 14:14 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: lse-tech, linux-kernel

Manfred Spraul wrote:
> >What about killing fasync_helper altogether and using the method that
> >epoll uses to register "listeners" which send a signal when the poll
> >state of a device changes?
> >
> I think it would be a step in the wrong direction: poll should go away 
> from a simple wake-up to an interface that transfers the band info 
> (POLL_IN, POLL_OUT, etc). Right now at least two passes over the f_poll 
> functions are necessary, because the info which event actually triggered 
> is lost. kill_fasync transfers the band info, thus I don't want to 
> remove it.

I agree with the principle of the poll wakeup passing the event
information to the wakeup function - that would make select, poll,
epoll _and_ this new version of fsync faster.  That may be easier to
implement now than it was in 2.4, because we have wakeup functions,
although it is still a big change and it would be hard to get right in
some drivers.  Perhaps very hard.

We have found the performance impact of the extra ->poll calls
negligible with epoll.  They're simply not slow calls.  It's
only when you're doing select() or poll() of many descriptors
repeatedly that you notice, and that's already poor usage in other
ways.

So I am not convinced that such an invasive change is worthwhile,
particularly as drivers would become more complicated.  (Those drivers
which already call kill_fasync have the right logic, assuming there
are no bugs, but many don't and a big ->poll interface change implies they
all have to have it).

> It's a good idea, but requires lots of changes - perhaps it will be 
> necessary to change the pollwait and f_poll prototypes.

However, the two changes: fasync -> eventpoll-like waiter, and poll ->
fewer function calls are really quite orthogonal.

The fasync change is best done separately, with no changes to pollwait
and f_poll and virtually no changes to the drivers except to remove
calls to kill_fasync.

I don't think you need to change pollwait or ->poll, because the band
information for the signal is available, as you say, by calling ->poll
after the wakeup.

Put it this way: Davide thought epoll needed special hooks in all the
devices, until I convinced him they weren't needed.  He tried it and
not only did all the hooks go away, epoll became simpler and smaller,
it worked with every pollable fd instead of just the ones useful for
web servers, and surprisingly ran a bit faster too.

-- Jamie


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 14:14     ` Jamie Lokier
@ 2003-12-21 14:59       ` Manfred Spraul
  2003-12-21 15:08         ` Jamie Lokier
  2004-01-02 21:15       ` Bill Davidsen
  1 sibling, 1 reply; 28+ messages in thread
From: Manfred Spraul @ 2003-12-21 14:59 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: lse-tech, linux-kernel

Jamie Lokier wrote:

>I don't think you need to change pollwait or ->poll, because the band
>information for the signal is available, as you say, by calling ->poll
>after the wakeup.
>
I'm not convinced:
The wakeup happens at irq time. The band info is necessary for 
send_sigio(). Calling f_poll at irq time is not an option - it will 
definitely cause breakage. schedule_work() for every call is IMHO not 
an option. And even that is not reliable: fasync users might expect 
separate POLL_OUT and POLL_IN signals.


--
    Manfred



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 14:59       ` Manfred Spraul
@ 2003-12-21 15:08         ` Jamie Lokier
  0 siblings, 0 replies; 28+ messages in thread
From: Jamie Lokier @ 2003-12-21 15:08 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: lse-tech, linux-kernel

Manfred Spraul wrote:
> The wakeup happens at irq time. The band info is necessary for 
> send_sigio(). Calling f_poll at irq time is not an option - it will 
> definitively cause breakage.

Agree * 3.

> schedule_work() for every call is IMHO not an option.

Agree, the latency would suck and it wouldn't even work for RT processes.

> And even that is not reliable: fasync users might expect seperate
> POLL_OUT and POLL_IN signals.

They might, although they probably shouldn't (band is a bitmask for a
reason).

Anyway, you can handle all these problems by computing the band at
signal delivery time.  Yes it sounds like it would complicate the
signal delivery code, but sigio should really be handled specially
anyway, so that a signal queue entry for every fd is guaranteed and
queue overflow is not possible.  Somebody already has a patch for
that, it might be worth working from.

-- Jamie

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 12:40   ` Manfred Spraul
  2003-12-21 14:14     ` Jamie Lokier
@ 2003-12-21 15:14     ` Davide Libenzi
  2003-12-21 15:17       ` Davide Libenzi
  2003-12-21 15:28       ` Jamie Lokier
  1 sibling, 2 replies; 28+ messages in thread
From: Davide Libenzi @ 2003-12-21 15:14 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Jamie Lokier, lse-tech, Linux Kernel Mailing List

On Sun, 21 Dec 2003, Manfred Spraul wrote:

> >What about killing fasync_helper altogether and using the method that
> >epoll uses to register "listeners" which send a signal when the poll
> >state of a device changes?
> >
> I think it would be a step in the wrong direction: poll should go away 
> from a simple wake-up to an interface that transfers the band info 
> (POLL_IN, POLL_OUT, etc). Right now at least two passes over the f_poll 
> functions are necessary, because the info which event actually triggered 
> is lost. kill_fasync transfers the band info, thus I don't want to 
> remove it.

It is my plan to propose (Linus is not contrary, in principle) a change of 
the poll/wake infrastructure for 2.7. There are two areas that can be 
improved. First, f_op->poll() does not allow you to send an event mask, 
and this requires the driver to indiscriminately wake up both IN and OUT 
waiters. The second area will be to let the driver specify some "info" 
for the wake up. Something like:

wake_up_info(&wq, XXXX);

And add to the wait queue item storage for the passed info. Where "info" 
could be anything from an event mask, up to an allocated object with its 
own destructor. In this way the callback'd waked up will have the "info" 
ready w/out issuing an extra f_op->poll(). The code is pretty much 
trivial, even if changes will touch a bunch of code. The good thing is 
that migration can be gradual, beside the initial dumb compile fixing to 
suit the new f_op->poll() interface.



- Davide



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 15:14     ` Davide Libenzi
@ 2003-12-21 15:17       ` Davide Libenzi
  2003-12-21 15:28       ` Jamie Lokier
  1 sibling, 0 replies; 28+ messages in thread
From: Davide Libenzi @ 2003-12-21 15:17 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Manfred Spraul, Jamie Lokier, lse-tech, Linux Kernel Mailing List

On Sun, 21 Dec 2003, Davide Libenzi wrote:

> On Sun, 21 Dec 2003, Manfred Spraul wrote:
> 
> > >What about killing fasync_helper altogether and using the method that
> > >epoll uses to register "listeners" which send a signal when the poll
> > >state of a device changes?
> > >
> > I think it would be a step in the wrong direction: poll should go away 
> > from a simple wake-up to an interface that transfers the band info 
> > (POLL_IN, POLL_OUT, etc). Right now at least two passes over the f_poll 
> > functions are necessary, because the info which event actually triggered 
> > is lost. kill_fasync transfers the band info, thus I don't want to 
> > remove it.
> 
> It is my plan to propose (Linus is not contrary, in principle) a change of 
> the poll/wake infrastructure for 2.7. There are two areas that can be 
> improved. First, f_op->poll() does not allow you to send and event mask, 

Sorry, poll_wait() does not allow you to specify an event mask ...



- Davide



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 15:14     ` Davide Libenzi
  2003-12-21 15:17       ` Davide Libenzi
@ 2003-12-21 15:28       ` Jamie Lokier
  1 sibling, 0 replies; 28+ messages in thread
From: Jamie Lokier @ 2003-12-21 15:28 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Manfred Spraul, lse-tech, Linux Kernel Mailing List

Davide Libenzi wrote:
> First, f_op->poll() does not allow you to send and event mask, 
> and this requires the driver to indiscriminately wake up both IN and OUT 
> waiters. The second area will be to give the driver to specify some "info" 
> 
> wake_up_info(&wq, XXXX);

I agree totally, both of these are (and always were, isn't it amazing
how long these things take) the way to do it "properly".

> The good thing is that migration can be gradual, beside the initial
> dumb compile fixing to suite the new f_op->poll() interface.

Even that's trivial, if a little time consuming, as it's only a
function signature change.  Actually using the extra argument is
optional for each driver.

-- Jamie

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-20 18:20 [RFC,PATCH] use rcu for fasync_lock Manfred Spraul
  2003-12-20 21:10 ` [Lse-tech] " Stephen Hemminger
  2003-12-21 11:36 ` Jamie Lokier
@ 2003-12-21 18:38 ` OGAWA Hirofumi
  2003-12-21 19:14   ` Manfred Spraul
  2 siblings, 1 reply; 28+ messages in thread
From: OGAWA Hirofumi @ 2003-12-21 18:38 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: lse-tech, linux-kernel

Manfred Spraul <manfred@colorfullife.com> writes:

>  void kill_fasync(struct fasync_struct **fp, int sig, int band)
>  {
> -	read_lock(&fasync_lock);
> +	rcu_read_lock();
>  	__kill_fasync(*fp, sig, band);
> -	read_unlock(&fasync_lock);
> +	rcu_read_unlock();
>  }

Usually *fp is NULL, I think. So what about the following test?

void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
	if (*fp) {
		rcu_read_lock();
		__kill_fasync(*fp, sig, band);
		rcu_read_unlock();
	}
}

Or use inline function for testing *fp.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 18:38 ` OGAWA Hirofumi
@ 2003-12-21 19:14   ` Manfred Spraul
  2003-12-21 20:51     ` Linus Torvalds
  0 siblings, 1 reply; 28+ messages in thread
From: Manfred Spraul @ 2003-12-21 19:14 UTC (permalink / raw)
  To: OGAWA Hirofumi; +Cc: lse-tech, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 362 bytes --]

OGAWA Hirofumi wrote:

>Or use inline function for testing *fp.
>  
>
Initially I tried to keep the patch as tiny as possible, thus I avoided 
adding an inline function. But Stephen Hemminger convinced me to update 
the network code, and thus it didn't matter and I've switched to an 
inline function.
What do you think about the attached patch?

--
    Manfred

[-- Attachment #2: patch-fasync-rcu --]
[-- Type: text/plain, Size: 8775 bytes --]

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 6
//  SUBLEVEL = 0
//  EXTRAVERSION =
--- 2.6/include/linux/fs.h	2003-12-04 19:44:40.000000000 +0100
+++ build-2.6/include/linux/fs.h	2003-12-21 09:31:16.000000000 +0100
@@ -636,17 +636,36 @@ struct fasync_struct {
 	int	fa_fd;
 	struct	fasync_struct	*fa_next; /* singly linked list */
 	struct	file 		*fa_file;
+	struct  rcu_head	rcu;
 };
 
-#define FASYNC_MAGIC 0x4601
+#define FASYNC_MAGIC 0x4602
 
-/* SMP safe fasync helpers: */
-extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
-/* can be called from interrupts */
-extern void kill_fasync(struct fasync_struct **, int, int);
-/* only for net: no internal synchronization */
-extern void __kill_fasync(struct fasync_struct *, int, int);
+/* fasync helper functions: */
+/* 1) setting up the fasync queue. The functions return negative on error, 0
+ * if they did no changes and positive if they added/deleted the entry.
+ * Networking uses lock_sock, everyone else a global spinlock for locking.
+ */
+struct sock;
+extern int __fasync_helper(int fd, struct file *filp, int on, struct fasync_struct **fapp, struct sock *s);
+static inline int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+{
+	return __fasync_helper(fd, filp, on, fapp, NULL);
+}
+
+/* 2) send out signals. Can be called from interrupts */
+extern void fasync_send_sig(struct fasync_struct *fa, int sig, int band);
+static inline void kill_fasync(struct fasync_struct **fp, int sig, int band)
+{
+	struct fasync_struct *fa;
+	rcu_read_lock();
+	fa = *fp;
+	if (fa)
+		fasync_send_sig(fa, sig, band);
+	rcu_read_unlock();
+}
 
+/* 3) high level helpers */
 extern int f_setown(struct file *filp, unsigned long arg, int force);
 extern void f_delown(struct file *filp);
 extern int send_sigurg(struct fown_struct *fown);
--- 2.6/fs/fcntl.c	2003-12-04 19:44:38.000000000 +0100
+++ build-2.6/fs/fcntl.c	2003-12-21 09:29:16.000000000 +0100
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
+#include <net/sock.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -537,15 +538,15 @@ int send_sigurg(struct fown_struct *fown
 	return ret;
 }
 
-static rwlock_t fasync_lock = RW_LOCK_UNLOCKED;
+static spinlock_t fasync_lock = SPIN_LOCK_UNLOCKED;
 static kmem_cache_t *fasync_cache;
 
-/*
- * fasync_helper() is used by some character device drivers (mainly mice)
- * to set up the fasync queue. It returns negative on error, 0 if it did
- * no changes and positive if it added/deleted the entry.
- */
-int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+static void fasync_free(void *data)
+{
+	kmem_cache_free(fasync_cache, data);
+}
+
+int __fasync_helper(int fd, struct file *filp, int on, struct fasync_struct **fapp, struct sock *s)
 {
 	struct fasync_struct *fa, **fp;
 	struct fasync_struct *new = NULL;
@@ -556,15 +557,27 @@ int fasync_helper(int fd, struct file * 
 		if (!new)
 			return -ENOMEM;
 	}
-	write_lock_irq(&fasync_lock);
+	if (s)
+		lock_sock(s);
+	else
+		spin_lock(&fasync_lock);
+
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file == filp) {
 			if(on) {
+				/* RCU violation:
+				 * We are modifying a struct that can be seen
+				 * by readers. If there is a fasync
+				 * notification right now, then it could go
+				 * to either the old or the new fd. Shouldn't
+				 * matter.
+				 * 	Manfred <manfred@colorfullife.com>
+				 */
 				fa->fa_fd = fd;
 				kmem_cache_free(fasync_cache, new);
 			} else {
 				*fp = fa->fa_next;
-				kmem_cache_free(fasync_cache, fa);
+				call_rcu(&fa->rcu, fasync_free, fa);
 				result = 1;
 			}
 			goto out;
@@ -576,21 +589,27 @@ int fasync_helper(int fd, struct file * 
 		new->fa_file = filp;
 		new->fa_fd = fd;
 		new->fa_next = *fapp;
+		smp_wmb();
 		*fapp = new;
 		result = 1;
 	}
 out:
-	write_unlock_irq(&fasync_lock);
+	if (s)
+		release_sock(s);
+	else
+		spin_unlock(&fasync_lock);
+	spin_unlock(&fasync_lock);
 	return result;
 }
 
-EXPORT_SYMBOL(fasync_helper);
+EXPORT_SYMBOL(__fasync_helper);
 
-void __kill_fasync(struct fasync_struct *fa, int sig, int band)
+void fasync_send_sig(struct fasync_struct *fa, int sig, int band)
 {
-	while (fa) {
+	do {
 		struct fown_struct * fown;
-		if (fa->magic != FASYNC_MAGIC) {
+		smp_read_barrier_depends();
+		if (unlikely(fa->magic != FASYNC_MAGIC)) {
 			printk(KERN_ERR "kill_fasync: bad magic number in "
 			       "fasync_struct!\n");
 			return;
@@ -602,19 +621,10 @@ void __kill_fasync(struct fasync_struct 
 		if (!(sig == SIGURG && fown->signum == 0))
 			send_sigio(fown, fa->fa_fd, band);
 		fa = fa->fa_next;
-	}
-}
-
-EXPORT_SYMBOL(__kill_fasync);
-
-void kill_fasync(struct fasync_struct **fp, int sig, int band)
-{
-	read_lock(&fasync_lock);
-	__kill_fasync(*fp, sig, band);
-	read_unlock(&fasync_lock);
+	} while(fa);
 }
 
-EXPORT_SYMBOL(kill_fasync);
+EXPORT_SYMBOL(fasync_send_sig);
 
 static int __init fasync_init(void)
 {
--- 2.6/net/socket.c	2003-12-04 19:44:40.000000000 +0100
+++ build-2.6/net/socket.c	2003-12-21 09:01:48.000000000 +0100
@@ -878,81 +878,29 @@ int sock_close(struct inode *inode, stru
  *	Fasync_list locking strategy.
  *
  *	1. fasync_list is modified only under process context socket lock
- *	   i.e. under semaphore.
- *	2. fasync_list is used under read_lock(&sk->sk_callback_lock)
- *	   or under socket lock.
- *	3. fasync_list can be used from softirq context, so that
- *	   modification under socket lock have to be enhanced with
- *	   write_lock_bh(&sk->sk_callback_lock).
- *							--ANK (990710)
+ *	   i.e. under semaphore, following rcu rules.
+ *	2. All users are either under read_lock(&sk->sk_callback_lock),
+ *	   or rcu_read_lock().
+ *	3. Exception: wakeup calls are skipped if sock->fasync_list is NULL,
+ *	   and these checks are done without locking. Doesn't matter.
+ *				Manfred <manfred@colorfullife.com>
  */
 
 static int sock_fasync(int fd, struct file *filp, int on)
 {
-	struct fasync_struct *fa, *fna=NULL, **prev;
 	struct socket *sock;
 	struct sock *sk;
 
-	if (on)
-	{
-		fna=(struct fasync_struct *)kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
-		if(fna==NULL)
-			return -ENOMEM;
-	}
-
 	sock = SOCKET_I(filp->f_dentry->d_inode);
 
 	if ((sk=sock->sk) == NULL) {
-		if (fna)
-			kfree(fna);
 		return -EINVAL;
 	}
 
-	lock_sock(sk);
-
-	prev=&(sock->fasync_list);
-
-	for (fa=*prev; fa!=NULL; prev=&fa->fa_next,fa=*prev)
-		if (fa->fa_file==filp)
-			break;
-
-	if(on)
-	{
-		if(fa!=NULL)
-		{
-			write_lock_bh(&sk->sk_callback_lock);
-			fa->fa_fd=fd;
-			write_unlock_bh(&sk->sk_callback_lock);
-
-			kfree(fna);
-			goto out;
-		}
-		fna->fa_file=filp;
-		fna->fa_fd=fd;
-		fna->magic=FASYNC_MAGIC;
-		fna->fa_next=sock->fasync_list;
-		write_lock_bh(&sk->sk_callback_lock);
-		sock->fasync_list=fna;
-		write_unlock_bh(&sk->sk_callback_lock);
-	}
-	else
-	{
-		if (fa!=NULL)
-		{
-			write_lock_bh(&sk->sk_callback_lock);
-			*prev=fa->fa_next;
-			write_unlock_bh(&sk->sk_callback_lock);
-			kfree(fa);
-		}
-	}
-
-out:
-	release_sock(sock->sk);
-	return 0;
+	return __fasync_helper(fd, filp, on, &sock->fasync_list, sk);
 }
 
 /* This function may be called only under socket lock or callback_lock */
-
 int sock_wake_async(struct socket *sock, int how, int band)
 {
 	if (!sock || !sock->fasync_list)
@@ -970,10 +918,10 @@ int sock_wake_async(struct socket *sock,
 		/* fall through */
 	case 0:
 	call_kill:
-		__kill_fasync(sock->fasync_list, SIGIO, band);
+		kill_fasync(&sock->fasync_list, SIGIO, band);
 		break;
 	case 3:
-		__kill_fasync(sock->fasync_list, SIGURG, band);
+		kill_fasync(&sock->fasync_list, SIGURG, band);
 	}
 	return 0;
 }
--- 2.6/arch/sparc64/solaris/timod.c	2003-10-25 20:43:27.000000000 +0200
+++ build-2.6/arch/sparc64/solaris/timod.c	2003-12-21 08:31:34.000000000 +0100
@@ -151,7 +151,7 @@ static void timod_wake_socket(unsigned i
 	wake_up_interruptible(&sock->wait);
 	read_lock(&sock->sk->sk_callback_lock);
 	if (sock->fasync_list && !test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
-		__kill_fasync(sock->fasync_list, SIGIO, POLL_IN);
+		kill_fasync(&sock->fasync_list, SIGIO, POLL_IN);
 	read_unlock(&sock->sk->sk_callback_lock);
 	SOLD("done");
 }
--- 2.6/net/decnet/dn_nsp_in.c	2003-10-25 20:43:22.000000000 +0200
+++ build-2.6/net/decnet/dn_nsp_in.c	2003-12-21 08:31:27.000000000 +0100
@@ -602,7 +602,7 @@ static __inline__ int dn_queue_skb(struc
 		wake_up_interruptible(sk->sk_sleep);
 		if (sock && sock->fasync_list &&
 		    !test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
-			__kill_fasync(sock->fasync_list, sig, 
+			kill_fasync(&sock->fasync_list, sig, 
 				    (sig == SIGURG) ? POLL_PRI : POLL_IN);
 	}
 	read_unlock(&sk->sk_callback_lock);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 19:14   ` Manfred Spraul
@ 2003-12-21 20:51     ` Linus Torvalds
  2003-12-21 21:08       ` Manfred Spraul
  0 siblings, 1 reply; 28+ messages in thread
From: Linus Torvalds @ 2003-12-21 20:51 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: OGAWA Hirofumi, lse-tech, linux-kernel



On Sun, 21 Dec 2003, Manfred Spraul wrote:
>
> Initially I tried to keep the patch as tiny as possible, thus I avoided 
> adding an inline function. But Stephen Hemminger convinced me to update 
> the network code, and thus it didn't matter and I've switched to an 
> inline function.
> What do you think about the attached patch?

Please, NO!

Stuff like this

	-       write_lock_irq(&fasync_lock);
	+       if (s)
	+               lock_sock(s);
	+       else
	+               spin_lock(&fasync_lock);
	+

should not be allowed. That's especially true since the choice really is a 
static one depending on the caller.

Just make the caller do the locking.

		Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 20:51     ` Linus Torvalds
@ 2003-12-21 21:08       ` Manfred Spraul
  2003-12-21 21:19         ` Linus Torvalds
  0 siblings, 1 reply; 28+ messages in thread
From: Manfred Spraul @ 2003-12-21 21:08 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: OGAWA Hirofumi, lse-tech, linux-kernel

Linus Torvalds wrote:

>On Sun, 21 Dec 2003, Manfred Spraul wrote:
>  
>
>>Initially I tried to keep the patch as tiny as possible, thus I avoided 
>>adding an inline function. But Stephen Hemminger convinced me to update 
>>the network code, and thus it didn't matter and I've switched to an 
>>inline function.
>>What do you think about the attached patch?
>>    
>>
>
>Please, NO!
>
>Stuff like this
>
>	-       write_lock_irq(&fasync_lock);
>	+       if (s)
>	+               lock_sock(s);
>	+       else
>	+               spin_lock(&fasync_lock);
>	+
>
>should not be allowed. That's especially true since the choice really is a 
>static one depending on the caller.
>
>Just make the caller do the locking.
>  
>
It's not that simple: the function does
    kmalloc();
    spin_lock();
    use_allocation.
If the caller does the locking, then the kmalloc would have to use 
GFP_ATOMIC, or the caller would have to do the alloc.
But: as far as I can see, these lines usually run under lock_kernel(). 
If this is true, then the spin_lock(&fasync_lock) won't cause any 
scalability regression, and I'll use that lock instead of lock_sock, 
even for network sockets.

--
    Manfred


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 21:08       ` Manfred Spraul
@ 2003-12-21 21:19         ` Linus Torvalds
  2003-12-21 21:54           ` Manfred Spraul
  0 siblings, 1 reply; 28+ messages in thread
From: Linus Torvalds @ 2003-12-21 21:19 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: OGAWA Hirofumi, lse-tech, linux-kernel



On Sun, 21 Dec 2003, Manfred Spraul wrote:
> >
> >Just make the caller do the locking.
>
> It's not that simple:

It _is_ that simple.

The choices are:
 - let the caller do the locking
 - make the callee locking be statically determinable

Those are the choices. Your kind of code is not going to be integrated.

> the function does
>     kmalloc();
>     spin_lock();
>     use_allocation.

This is trivially handled by splitting out the allocation as a separate 
phase. 

Yes, it requires that the caller be changed, but if the choice is between 
insane locking and making a caller change, then the choice is very very 
clear.

> But: as far as I can see, these lines usually run under lock_kernel(). 
> If this is true, then the spin_lock(&fasync_lock) won't cause any 
> scalability regression, and I'll use that lock instead of lock_sock, 
> even for network sockets.

Don't.

Here's a big clue: if you make code worse than it is today, it won't be 
accepted. I don't even see why you'd bother in the first place.

So go back to the drawing board, and just do it _right_. Or don't do it at 
all. There's no point to making the code look and behave worse than it 
does today.

		Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 21:19         ` Linus Torvalds
@ 2003-12-21 21:54           ` Manfred Spraul
  2003-12-21 22:05             ` Linus Torvalds
  0 siblings, 1 reply; 28+ messages in thread
From: Manfred Spraul @ 2003-12-21 21:54 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: OGAWA Hirofumi, lse-tech, linux-kernel

Linus Torvalds wrote:

>Here's a big clue: if you make code worse than it is today, it won't be 
>accepted. I don't even see why you'd bother in the first place.
>  
>
fasync_helper != kill_fasync
fasync_helper is rare, and usually running under lock_kernel().
kill_fasync is far more common (every pipe_read and _write), I want to 
remove the unconditional read_lock(&global_lock).

>So go back to the drawing board, and just do it _right_. Or don't do it at 
>all. There's no point to making the code look and behave worse than it 
>does today.
>
Today's solution is two copies of fasync_helper: one with lock_sock in 
net/socket.c, one with write_lock_irq(&fasync_lock) in fs/fcntl.c.

Perhaps just a "if (*fp == NULL) return;" before grabbing the read_lock 
in kill_fasync, without touching fasync_helper - that would be 
sufficient to fix pipe_read and _write.

--
    Manfred


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 21:54           ` Manfred Spraul
@ 2003-12-21 22:05             ` Linus Torvalds
  2003-12-25  1:21               ` Manfred Spraul
  0 siblings, 1 reply; 28+ messages in thread
From: Linus Torvalds @ 2003-12-21 22:05 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: OGAWA Hirofumi, lse-tech, linux-kernel



On Sun, 21 Dec 2003, Manfred Spraul wrote:
>
> >Here's a big clue: if you make code worse than it is today, it won't be 
> >accepted. I don't even see why you'd bother in the first place.
>
> fasync_helper != kill_fasync
> fasync_helper is rare, and usually running under lock_kernel().

But we want to get rid of lock_kernel(), not create new code that depends 
on it.

And _especially_ if fasync_helper() is rarely used, that means that 
changing the callers to have a nicer calling convention would not be 
painful.

> kill_fasync is far more common (every pipe_read and _write), I want to 
> remove the unconditional read_lock(&global_lock).

Note that my personal preference would be to kill off "kill_fasync()" 
entirely.

We actually have almost all the infrastructure in place already: it's 
called a "wait queue". In 2.5.x it took a callback function, and the only 
thing missing is really the "band" information at wakeup time.

So if we instead made the whole fasync infrastructure use the existing 
wait-queues, and made wakeup() say what kind of wakeup it is, we could 
probably get rid of the specific fasync datastructures entirely. And we'd 
only take locks that we take _anyway_.

I dunno. But to me that at least sounds like a real cleanup.

> Today's solution is two copies of fasync_helper: one with lock_sock in 
> net/socket.c, one with write_lock_irq(&fasync_lock) in fs/fcntl.c.

And two functions that statically do something different is actually 
_better_ than one function that does two different things dynamically.

And if the two cases have different locking, then they should remain as 
two separate cases.

			Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 22:05             ` Linus Torvalds
@ 2003-12-25  1:21               ` Manfred Spraul
  2003-12-25 15:11                 ` OGAWA Hirofumi
  0 siblings, 1 reply; 28+ messages in thread
From: Manfred Spraul @ 2003-12-25  1:21 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: OGAWA Hirofumi, lse-tech, linux-kernel



On Sun, 21 Dec 2003, Linus Torvalds wrote:

>
> > kill_fasync is far more common (every pipe_read and _write), I want to
> > remove the unconditional read_lock(&global_lock).
>
> Note that my personal preference would be to kill off "kill_fasync()"
> entirely.
>
We've discussed that earlier in the thread, and came to the same
conclusion. Unfortunately it touches several drivers, and is not a simple
patch. Viro's summary of fasync in Documentation/filesystem/Locking is
"fasync is a mess" - converting kill_fasync to wake_up_band() is 2.7
stuff.

What about this minimal approach:

<<<<
--- 2.6/fs/fcntl.c	2003-12-04 19:44:38.000000000 +0100
+++ build-2.6/fs/fcntl.c	2003-12-24 00:15:16.000000000 +0100
@@ -609,9 +609,15 @@

 void kill_fasync(struct fasync_struct **fp, int sig, int band)
 {
-	read_lock(&fasync_lock);
-	__kill_fasync(*fp, sig, band);
-	read_unlock(&fasync_lock);
+	/* First a quick test without locking: usually
+	 * the list is empty.
+	 */
+	if (*fp) {
+		read_lock(&fasync_lock);
+		/* reread *fp after obtaining the lock */
+		__kill_fasync(*fp, sig, band);
+		read_unlock(&fasync_lock);
+	}
 }

 EXPORT_SYMBOL(kill_fasync);
<<<<


--
	Manfred


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-25  1:21               ` Manfred Spraul
@ 2003-12-25 15:11                 ` OGAWA Hirofumi
  0 siblings, 0 replies; 28+ messages in thread
From: OGAWA Hirofumi @ 2003-12-25 15:11 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: Linus Torvalds, lse-tech, linux-kernel

Manfred Spraul <manfred@colorfullife.com> writes:

> --- 2.6/fs/fcntl.c	2003-12-04 19:44:38.000000000 +0100
> +++ build-2.6/fs/fcntl.c	2003-12-24 00:15:16.000000000 +0100
> @@ -609,9 +609,15 @@
> 
>  void kill_fasync(struct fasync_struct **fp, int sig, int band)
>  {
> -	read_lock(&fasync_lock);
> -	__kill_fasync(*fp, sig, band);
> -	read_unlock(&fasync_lock);
> +	/* First a quick test without locking: usually
> +	 * the list is empty.
> +	 */
> +	if (*fp) {
> +		read_lock(&fasync_lock);
> +		/* reread *fp after obtaining the lock */
> +		__kill_fasync(*fp, sig, band);
> +		read_unlock(&fasync_lock);
> +	}
>  }

Looks good to me. This should be the enough effect for usual path.

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2003-12-21 14:14     ` Jamie Lokier
  2003-12-21 14:59       ` Manfred Spraul
@ 2004-01-02 21:15       ` Bill Davidsen
  2004-01-02 22:41         ` Jamie Lokier
  1 sibling, 1 reply; 28+ messages in thread
From: Bill Davidsen @ 2004-01-02 21:15 UTC (permalink / raw)
  To: linux-kernel, Jamie Lokier; +Cc: Manfred Spraul, lse-tech, linux-kernel

Jamie Lokier wrote:

> We have found the performance impact of the extra ->poll calls
> negligible with epoll.  They're simply not slow calls.  It's
> only when you're doing select() or poll() of many descriptors
> repeatedly that you notice, and that's already poor usage in other
> ways.

I do agree with you, but there is a lot of old software, and software 
written on/for BSD, which does do this. I'm not prepared to say that BSD 
does it better, but it's easier to fix in one place, the kernel, than 
many other places.

Your point about the complexity is also correct, but perhaps someone 
will offer a better solution to speeding up select(). I think anything 
as major as this might be better off in a development series, and that's 
a clear prod for someone to find a simpler way to do it ;-)

Old programs grow; INN uses select and worked fine with 10-20 peers, 
with 200 peers sharing 2m articles and 1 TB of data it seems to work 
less well on Linux than BSD or Solaris. I'd love to see faster, there 
are lots of other servers out there as well.

-- 
bill davidsen <davidsen@tmr.com>
   CTO TMR Associates, Inc
   Doing interesting things with small computers since 1979

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2004-01-02 21:15       ` Bill Davidsen
@ 2004-01-02 22:41         ` Jamie Lokier
  2004-01-03  1:09           ` Mike Fedyk
  0 siblings, 1 reply; 28+ messages in thread
From: Jamie Lokier @ 2004-01-02 22:41 UTC (permalink / raw)
  To: Bill Davidsen; +Cc: Manfred Spraul, lse-tech, linux-kernel

Bill Davidsen wrote:
> Jamie Lokier wrote:
> >We have found the performance impact of the extra ->poll calls
> >negligable with epoll.  They're simply not slow calls.  It's
> >only when you're doing select() or poll() of many descriptors
> >repeatedly that you notice, and that's already poor usage in other
> >ways.
> 
> I do agree with you, but there is a lot of old software, and software 
> written on/for BSD, which does do this. I'm not prepared to say that BSD 
> does it better, but it's easier to fix in one place, the kernel, than 
> many other places.
> 
> Your point about the complexity is also correct, but perhaps someone 
> will offer a better solution to speeding up select(). I think anything 
> as major as this might be better off in a development series, and that's 
> a clear prod for someone to find a simpler way to do it ;-)

Eliminating up to half of the ->poll calls using wake_up_info() and
reducing the number of wakeups using an event mask argument to ->poll
are not the best ways to speed up select() or poll() for large numbers
of descriptors.

The best way is to maintain poll state in each "struct file".  The
order of complexity for the bitmap scan is still significant, but
->poll calls are limited to the number of transitions which actually
happen.

I think somebody, maybe Richard Gooch, has a patch to do this that's
several years old by now.

-- Jamie

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2004-01-02 22:41         ` Jamie Lokier
@ 2004-01-03  1:09           ` Mike Fedyk
  2004-01-03 21:28             ` Jamie Lokier
  0 siblings, 1 reply; 28+ messages in thread
From: Mike Fedyk @ 2004-01-03  1:09 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Bill Davidsen, Manfred Spraul, lse-tech, linux-kernel

On Fri, Jan 02, 2004 at 10:41:50PM +0000, Jamie Lokier wrote:
> The best way is to maintain poll state in each "struct file".  The
> order of complexity for the bitmap scan is still significant, but
> ->poll calls are limited to the number of transitions which actually
> happen.

What's the drawback to this approach?

Where is the poll state kept now?

> I think somebody, maybe Richard Gooch, has a patch to do this that's
> several years old by now.

Why wasn't it merged?  

Implementation issues?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2004-01-03  1:09           ` Mike Fedyk
@ 2004-01-03 21:28             ` Jamie Lokier
  2004-01-04 19:01               ` Ingo Oeser
  0 siblings, 1 reply; 28+ messages in thread
From: Jamie Lokier @ 2004-01-03 21:28 UTC (permalink / raw)
  To: Bill Davidsen, Manfred Spraul, lse-tech, linux-kernel

Mike Fedyk wrote:
> On Fri, Jan 02, 2004 at 10:41:50PM +0000, Jamie Lokier wrote:
> > The best way is to maintain poll state in each "struct file".  The
> > order of complexity for the bitmap scan is still significant, but
> > ->poll calls are limited to the number of transitions which actually
> > happen.
> 
> What's the drawback to this approach?
> 
> Where is the poll state kept now?

The poll state is not maintained at all _between_ calls to poll/select
at the moment, so at least one fresh call to ->poll is required per
file descriptor.  That is something that can be changed.

> > I think somebody, maybe Richard Gooch, has a patch to do this that's
> > several years old by now.
> 
> Why wasn't it merged?  
> Implementation issues?

The impression I had was that the code is quite complicated and
invasive, and select/poll aren't considered worth optimising because
epoll is an overall better solution (which is true; optimising
select/poll would change the complexity of the slow part but not
reduce the complexity of the API part, while epoll does both).

See ftp://ftp.atnf.csiro.au/pub/people/rgooch/linux/kernel-patches/v2.1/fastpoll-readme

-- Jamie

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2004-01-03 21:28             ` Jamie Lokier
@ 2004-01-04 19:01               ` Ingo Oeser
  2004-01-04 19:20                 ` Davide Libenzi
  0 siblings, 1 reply; 28+ messages in thread
From: Ingo Oeser @ 2004-01-04 19:01 UTC (permalink / raw)
  To: Jamie Lokier, Bill Davidsen, Manfred Spraul, lse-tech, linux-kernel

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On Saturday 03 January 2004 22:28, Jamie Lokier wrote:
> Mike Fedyk wrote:
> > On Fri, Jan 02, 2004 at 10:41:50PM +0000, Jamie Lokier wrote:
> > > The best way is to maintain poll state in each "struct file".  The
> > > order of complexity for the bitmap scan is still significant, but
> > > ->poll calls are limited to the number of transitions which actually
> > > happen.
> >
> > What's the drawback to this approach?
> >
> > Where is the poll state kept now?
>
> The poll state is not maintained at all _between_ calls to poll/select
> at the moment, so at least one fresh call to ->poll is required per
> file descriptor.  That is something that can be changed.

Yes, file->f_mode can be hijacked for this. Only 2 bits of it are used at the
moment. More headache is clearing this state again, but this might not be
necessary, since we can always return EAGAIN, if the cache is stale,
right?

> The impression I had was that the code is quite complicated and
> invasive, and select/poll aren't considered worth optimising because
> epoll is an overall better solution (which is true; optimising
> select/poll would change the complexity of the slow part but not
> reduce the complexity of the API part, while epoll does both).

This is true. But old software continues to exist and for INN there is
pretty much nothing else in this category available, I've been told by
several admins. Nobody really likes it, but it is used and improved
where necessary (epoll might be on the list already).

Regards 

Ingo Oeser

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.2 (GNU/Linux)

iD8DBQE/+GMqU56oYWuOrkARAlC5AJ4sX3OvARw0lE7n35tvr0NfeUkJGgCgmUt6
PuPC9O9DMZt+bCNIiUa/viU=
=d23f
-----END PGP SIGNATURE-----


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2004-01-04 19:01               ` Ingo Oeser
@ 2004-01-04 19:20                 ` Davide Libenzi
  2004-01-05 21:17                   ` Ingo Oeser
  0 siblings, 1 reply; 28+ messages in thread
From: Davide Libenzi @ 2004-01-04 19:20 UTC (permalink / raw)
  To: Ingo Oeser
  Cc: Jamie Lokier, Bill Davidsen, Manfred Spraul, lse-tech,
	Linux Kernel Mailing List

On Sun, 4 Jan 2004, Ingo Oeser wrote:

> > The impression I had was that the code is quite complicated and
> > invasive, and select/poll aren't considered worth optimising because
> > epoll is an overall better solution (which is true; optimising
> > select/poll would change the complexity of the slow part but not
> > reduce the complexity of the API part, while epoll does both).
> 
> This is true. But old software continues to exist and for INN there is
> pretty much nothing else in this category available, I've been told by
> several admins. Nobody really likes it, but it is used and improved
> where necessary (epoll might be on the list already).

The problem with poll/select is not the Linux implementation. It is the 
API that is flawed when applied to large fd sets. Every call pass to the 
system the whole fd set, and this makes the API O(N) by definition. While 
poll/select are perfectly ok for small fd sets, epoll LT might enable the 
application to migrate from poll/select to epoll pretty quickly (if the 
application architecture is fairly sane). For example, it took about 15 
minutes to me to make an epoll'd thttpd.




- Davide



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2004-01-04 19:20                 ` Davide Libenzi
@ 2004-01-05 21:17                   ` Ingo Oeser
  2004-01-05 22:24                     ` Davide Libenzi
  0 siblings, 1 reply; 28+ messages in thread
From: Ingo Oeser @ 2004-01-05 21:17 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: linux-kernel

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Hi Davide,
hi lkml,

On Sunday 04 January 2004 20:20, you wrote:
> The problem with poll/select is not the Linux implementation. It is the
> API that is flawed when applied to large fd sets. Every call pass to the
> system the whole fd set, and this makes the API O(N) by definition. While
> poll/select are perfectly ok for small fd sets, epoll LT might enable the
> application to migrate from poll/select to epoll pretty quickly (if the
> application architecture is fairly sane). For example, it took about 15
> minutes to me to make an epoll'd thttpd.

Yes, I've read your analysis several years ago already and I'm the first
one lobbying for epoll, but look at the posting stating, that INN sucks
under Linux currently, but doesn't suck that hard under FreeBSD and
Solaris.

There are already enough things you cannot do properly under Linux
(which are mostly not Linux' fault, but still), so I don't want to add
another one. Especially in the server market, where the M$ lobbyists are
growing their market share.


But if there is some minimal funding available (50 EUR?), I would do it
myself and push the patches upstream ;-)


Regards

Ingo Oeser


-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.2 (GNU/Linux)

iD8DBQE/+dRTU56oYWuOrkARAo60AJ9kYn39UEOvf/XR/Jx6aR4yIZWIYwCggPiT
zB84XIY75b3Z05KXS7qewbw=
=+wI/
-----END PGP SIGNATURE-----


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC,PATCH] use rcu for fasync_lock
  2004-01-05 21:17                   ` Ingo Oeser
@ 2004-01-05 22:24                     ` Davide Libenzi
  0 siblings, 0 replies; 28+ messages in thread
From: Davide Libenzi @ 2004-01-05 22:24 UTC (permalink / raw)
  To: Ingo Oeser; +Cc: Linux Kernel Mailing List

On Mon, 5 Jan 2004, Ingo Oeser wrote:

> On Sunday 04 January 2004 20:20, you wrote:
> > The problem with poll/select is not the Linux implementation. It is the
> > API that is flawed when applied to large fd sets. Every call pass to the
> > system the whole fd set, and this makes the API O(N) by definition. While
> > poll/select are perfectly ok for small fd sets, epoll LT might enable the
> > application to migrate from poll/select to epoll pretty quickly (if the
> > application architecture is fairly sane). For example, it took about 15
> > minutes to me to make an epoll'd thttpd.
> 
> Yes, I've read your analysis several years ago already and I'm the first
> one lobbying for epoll, but look at the posting stating, that INN sucks
> under Linux currently, but doesn't suck that hard under FreeBSD and
> Solaris.
> 
> There are already enough things you cannot do properly under Linux
> (which are mostly not Linux' fault, but still), so I don't want to add
> another one. Especially in the server market, where the M$ lobbyists are
> growing their market share.
> 
> 
> But if there is some minimal funding available (50 EUR?), I would do it
> myself and push the patches upstream ;-)

IIRC INN was not using multiplexing multiple client with a single task. 
Wasn't it a fork-and-handle kinda server?



- Davide



^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2004-01-05 22:28 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-12-20 18:20 [RFC,PATCH] use rcu for fasync_lock Manfred Spraul
2003-12-20 21:10 ` [Lse-tech] " Stephen Hemminger
2003-12-20 21:35   ` Manfred Spraul
2003-12-21 11:36 ` Jamie Lokier
2003-12-21 12:40   ` Manfred Spraul
2003-12-21 14:14     ` Jamie Lokier
2003-12-21 14:59       ` Manfred Spraul
2003-12-21 15:08         ` Jamie Lokier
2004-01-02 21:15       ` Bill Davidsen
2004-01-02 22:41         ` Jamie Lokier
2004-01-03  1:09           ` Mike Fedyk
2004-01-03 21:28             ` Jamie Lokier
2004-01-04 19:01               ` Ingo Oeser
2004-01-04 19:20                 ` Davide Libenzi
2004-01-05 21:17                   ` Ingo Oeser
2004-01-05 22:24                     ` Davide Libenzi
2003-12-21 15:14     ` Davide Libenzi
2003-12-21 15:17       ` Davide Libenzi
2003-12-21 15:28       ` Jamie Lokier
2003-12-21 18:38 ` OGAWA Hirofumi
2003-12-21 19:14   ` Manfred Spraul
2003-12-21 20:51     ` Linus Torvalds
2003-12-21 21:08       ` Manfred Spraul
2003-12-21 21:19         ` Linus Torvalds
2003-12-21 21:54           ` Manfred Spraul
2003-12-21 22:05             ` Linus Torvalds
2003-12-25  1:21               ` Manfred Spraul
2003-12-25 15:11                 ` OGAWA Hirofumi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).