linux-kernel.vger.kernel.org archive mirror
* [RFC][PATCH] iowait statistics
@ 2002-05-14  1:19 Rik van Riel
  2002-05-14  2:18 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Rik van Riel @ 2002-05-14  1:19 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-mm

Hi,

the following patch implements iowait statistics in a simple way:

1) if we go to sleep while waiting on a page or buffer, we
   increment nr_iowait_tasks; note that this is done only in
   the slow path, so the overhead shouldn't even be measurable

2) if no process is running, the timer interrupt adds a jiffy
   to the iowait time

3) iowait time is counted separately from user/system/idle and
   can overlap with either system or idle (when no process is
   running the system can still be busy processing interrupts)

4) on SMP systems the iowait time can be overestimated, no big
   deal IMHO but cheap suggestions for improvement are welcome

The only open issue I see with this patch is (3): should iowait
be counted the same way as user/system/idle, changing the
/proc/stat format, or should we keep the format backward
compatible and have iowait accounted for "differently"?
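
For reference, with this patch the "cpu" lines in /proc/stat simply
grow a fifth field (numbers below are made up):

	cpu  10907 127 4515 812896 3121

where the columns are user, nice, system, idle and the new iowait
time, in jiffies; as per (3), iowait can overlap with system or idle.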

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


===== fs/buffer.c 1.64 vs edited =====
--- 1.64/fs/buffer.c	Mon May 13 19:04:59 2002
+++ edited/fs/buffer.c	Mon May 13 19:16:57 2002
@@ -156,8 +156,10 @@
 	get_bh(bh);
 	add_wait_queue(&bh->b_wait, &wait);
 	do {
+		atomic_inc(&nr_iowait_tasks);
 		run_task_queue(&tq_disk);
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		atomic_dec(&nr_iowait_tasks);
 		if (!buffer_locked(bh))
 			break;
 		schedule();
===== fs/proc/proc_misc.c 1.14 vs edited =====
--- 1.14/fs/proc/proc_misc.c	Sun Apr  7 18:04:14 2002
+++ edited/fs/proc/proc_misc.c	Mon May 13 19:16:59 2002
@@ -169,7 +169,7 @@
 		"Active:       %8u kB\n"
 		"Inact_dirty:  %8u kB\n"
 		"Inact_clean:  %8u kB\n"
-		"Inact_target: %8lu kB\n"
+		"Inact_target: %8u kB\n"
 		"HighTotal:    %8lu kB\n"
 		"HighFree:     %8lu kB\n"
 		"LowTotal:     %8lu kB\n"
@@ -266,7 +266,7 @@
 	int i, len;
 	extern unsigned long total_forks;
 	unsigned long jif = jiffies;
-	unsigned int sum = 0, user = 0, nice = 0, system = 0;
+	unsigned int sum = 0, user = 0, nice = 0, system = 0, iowait = 0;
 	int major, disk;

 	for (i = 0 ; i < smp_num_cpus; i++) {
@@ -275,23 +275,26 @@
 		user += kstat.per_cpu_user[cpu];
 		nice += kstat.per_cpu_nice[cpu];
 		system += kstat.per_cpu_system[cpu];
+		iowait += kstat.per_cpu_iowait[cpu];
 #if !defined(CONFIG_ARCH_S390)
 		for (j = 0 ; j < NR_IRQS ; j++)
 			sum += kstat.irqs[cpu][j];
 #endif
 	}

-	len = sprintf(page, "cpu  %u %u %u %lu\n", user, nice, system,
-		      jif * smp_num_cpus - (user + nice + system));
+	len = sprintf(page, "cpu  %u %u %u %lu %u\n", user, nice, system,
+		      jif * smp_num_cpus - (user + nice + system),
+		      iowait);
 	for (i = 0 ; i < smp_num_cpus; i++)
-		len += sprintf(page + len, "cpu%d %u %u %u %lu\n",
+		len += sprintf(page + len, "cpu%d %u %u %u %lu %u\n",
 			i,
 			kstat.per_cpu_user[cpu_logical_map(i)],
 			kstat.per_cpu_nice[cpu_logical_map(i)],
 			kstat.per_cpu_system[cpu_logical_map(i)],
 			jif - (  kstat.per_cpu_user[cpu_logical_map(i)] \
 				   + kstat.per_cpu_nice[cpu_logical_map(i)] \
-				   + kstat.per_cpu_system[cpu_logical_map(i)]));
+				   + kstat.per_cpu_system[cpu_logical_map(i)]),
+			kstat.per_cpu_iowait[cpu_logical_map(i)]);
 	len += sprintf(page + len,
 		"page %u %u\n"
 		"swap %u %u\n"
===== include/linux/kernel_stat.h 1.3 vs edited =====
--- 1.3/include/linux/kernel_stat.h	Thu Apr 11 01:27:34 2002
+++ edited/include/linux/kernel_stat.h	Mon May 13 19:31:31 2002
@@ -18,7 +18,8 @@
 struct kernel_stat {
 	unsigned int per_cpu_user[NR_CPUS],
 	             per_cpu_nice[NR_CPUS],
-	             per_cpu_system[NR_CPUS];
+	             per_cpu_system[NR_CPUS],
+	             per_cpu_iowait[NR_CPUS];
 	unsigned int dk_drive[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_rio[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK];
===== include/linux/swap.h 1.35 vs edited =====
--- 1.35/include/linux/swap.h	Mon May 13 19:04:59 2002
+++ edited/include/linux/swap.h	Mon May 13 19:17:32 2002
@@ -90,6 +90,7 @@
 extern int nr_inactive_clean_pages;
 extern atomic_t page_cache_size;
 extern atomic_t buffermem_pages;
+extern atomic_t nr_iowait_tasks;

 extern spinlock_cacheline_t pagecache_lock_cacheline;
 #define pagecache_lock (pagecache_lock_cacheline.lock)
===== kernel/timer.c 1.4 vs edited =====
--- 1.4/kernel/timer.c	Tue Apr 30 13:38:16 2002
+++ edited/kernel/timer.c	Mon May 13 22:04:48 2002
@@ -608,8 +608,16 @@
 		else
 			kstat.per_cpu_user[cpu] += user_tick;
 		kstat.per_cpu_system[cpu] += system;
-	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
-		kstat.per_cpu_system[cpu] += system;
+	} else {
+		/*
+		 * No process is running, but if we're handling interrupts
+		 * or processes are waiting on disk IO, we're not really idle.
+		 */
+		if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+			kstat.per_cpu_system[cpu] += system;
+		if (atomic_read(&nr_iowait_tasks) > 0)
+			kstat.per_cpu_iowait[cpu] += system;
+	}
 }

 /*
===== mm/filemap.c 1.69 vs edited =====
--- 1.69/mm/filemap.c	Mon May 13 19:05:00 2002
+++ edited/mm/filemap.c	Mon May 13 22:04:18 2002
@@ -44,6 +44,7 @@
  */

 atomic_t page_cache_size = ATOMIC_INIT(0);
+atomic_t nr_iowait_tasks = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;

@@ -828,8 +829,10 @@
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (!PageLocked(page))
 			break;
+		atomic_inc(&nr_iowait_tasks);
 		sync_page(page);
 		schedule();
+		atomic_dec(&nr_iowait_tasks);
 	} while (PageLocked(page));
 	__set_task_state(tsk, TASK_RUNNING);
 	remove_wait_queue(waitqueue, &wait);
@@ -864,8 +867,10 @@
 	for (;;) {
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (PageLocked(page)) {
+			atomic_inc(&nr_iowait_tasks);
 			sync_page(page);
 			schedule();
+			atomic_dec(&nr_iowait_tasks);
 		}
 		if (!TryLockPage(page))
 			break;



* Re: [RFC][PATCH] iowait statistics
  2002-05-14  1:19 [RFC][PATCH] iowait statistics Rik van Riel
@ 2002-05-14  2:18 ` Andrew Morton
  2002-05-14 12:30   ` Rik van Riel
  2002-05-15 17:02   ` Denis Vlasenko
  2002-05-14 15:39 ` William Lee Irwin III
  2002-05-15  1:31 ` Bill Davidsen
  2 siblings, 2 replies; 28+ messages in thread
From: Andrew Morton @ 2002-05-14  2:18 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-kernel, linux-mm

Rik van Riel wrote:
> 
> Hi,
> 
> the following patch implements iowait statistics in a simple way:
> 
> 1) if we go to sleep while waiting on a page or buffer, we
>    increment nr_iowait_tasks; note that this is done only in
>    the slow path, so the overhead shouldn't even be measurable
> 
> 2) if no process is running, the timer interrupt adds a jiffy
>    to the iowait time
> 
> 3) iowait time is counted separately from user/system/idle and
>    can overlap with either system or idle (when no process is
>    running the system can still be busy processing interrupts)
> 
> 4) on SMP systems the iowait time can be overestimated, no big
>    deal IMHO but cheap suggestions for improvement are welcome

I suspect that a number of these statistical accounting mechanisms
are going to break.  The new irq-affinity code works awfully well.

The kernel profiler in 2.5 doesn't work very well at present.
When investigating this, I ran a busy-wait process.  It attached
itself to CPU #3 and that CPU received precisely zero interrupts
across a five-minute period.  So the profiler cunningly avoids profiling
busy CPUs, which is rather counter-productive.  Fortunate that oprofile
uses NMI.

> ...
> ===== fs/buffer.c 1.64 vs edited =====
> --- 1.64/fs/buffer.c    Mon May 13 19:04:59 2002
> +++ edited/fs/buffer.c  Mon May 13 19:16:57 2002
> @@ -156,8 +156,10 @@
>         get_bh(bh);
>         add_wait_queue(&bh->b_wait, &wait);
>         do {
> +               atomic_inc(&nr_iowait_tasks);
>                 run_task_queue(&tq_disk);
>                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
> +               atomic_dec(&nr_iowait_tasks);
>                 if (!buffer_locked(bh))
>                         break;
>                 schedule();

Shouldn't the atomic_inc cover the schedule()?


-


* Re: [RFC][PATCH] iowait statistics
  2002-05-14  2:18 ` Andrew Morton
@ 2002-05-14 12:30   ` Rik van Riel
  2002-05-15 17:02   ` Denis Vlasenko
  1 sibling, 0 replies; 28+ messages in thread
From: Rik van Riel @ 2002-05-14 12:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, linux-mm

On Mon, 13 May 2002, Andrew Morton wrote:

> > ===== fs/buffer.c 1.64 vs edited =====
> > --- 1.64/fs/buffer.c    Mon May 13 19:04:59 2002
> > +++ edited/fs/buffer.c  Mon May 13 19:16:57 2002
> > @@ -156,8 +156,10 @@
> >         get_bh(bh);
> >         add_wait_queue(&bh->b_wait, &wait);
> >         do {
> > +               atomic_inc(&nr_iowait_tasks);
> >                 run_task_queue(&tq_disk);
> >                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
> > +               atomic_dec(&nr_iowait_tasks);
> >                 if (!buffer_locked(bh))
> >                         break;
> >                 schedule();
>
> Shouldn't the atomic_inc cover the schedule()?

DOH, indeed.  Placed in the wrong place ;/
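
For the record, something like this should cover the sleep itself
(untested sketch):

	get_bh(bh);
	add_wait_queue(&bh->b_wait, &wait);
	do {
		run_task_queue(&tq_disk);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!buffer_locked(bh))
			break;
		atomic_inc(&nr_iowait_tasks);	/* account the sleep */
		schedule();
		atomic_dec(&nr_iowait_tasks);
	} while (buffer_locked(bh));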

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/



* Re: [RFC][PATCH] iowait statistics
  2002-05-14  1:19 [RFC][PATCH] iowait statistics Rik van Riel
  2002-05-14  2:18 ` Andrew Morton
@ 2002-05-14 15:39 ` William Lee Irwin III
  2002-05-14 16:36   ` Rik van Riel
  2002-05-15  1:31 ` Bill Davidsen
  2 siblings, 1 reply; 28+ messages in thread
From: William Lee Irwin III @ 2002-05-14 15:39 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-kernel, linux-mm

On Mon, May 13, 2002 at 10:19:26PM -0300, Rik van Riel wrote:
> 2) if no process is running, the timer interrupt adds a jiffy
>    to the iowait time
[...]
> 4) on SMP systems the iowait time can be overestimated, no big
>    deal IMHO but cheap suggestions for improvement are welcome

This appears to be global across all cpu's. Maybe nr_iowait_tasks
should be accounted on a per-cpu basis, where

	(1) If a task sleeps for an io while bound to a cpu it
		counts toward the cpu's number of iowait tasks.

	(2) iowait time is accounted and reports are generated already
		on a per-cpu basis, so there's nothing to do there.

	(3) The global statistic does not need to be entirely accurate;
		a lockfree approximation by summing across all cpus'
		local counters should suffice for global iowait. I also
		suspect it will not fluctuate rapidly enough for truly
		horribly inaccurate results to occur.

	(4) A per-cpu nr_iowait_tasks counter may still well need
		to be atomic as other cpu's may be stealing sleeping
		tasks purportedly bound to a given cpu at migration
		time (in order to prevent going negative) and in that
		process altering other cpus' counters.

	(5) A flag marking a task as in iowait may well need to be kept
		in the task_struct so that at migration time the
		appropriate counter adjustments can be made.

	(6) Given sufficient cpu affinity in the scheduler the case
		where one cpu's counter needs alteration from another
		should be relatively uncommon.

The scheduler already participates in keeping per_cpu_user[],
per_cpu_system[], and per_cpu_nice[] up-to-date, so it's not
unreasonable to expect its support for per_cpu_iowait[].
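
To make the above concrete, a minimal sketch of the per-cpu scheme,
keyed to the points above (all names hypothetical and untested):

	/* (1) per-cpu iowait task counts */
	static atomic_t nr_iowait_percpu[NR_CPUS];

	static inline void iowait_start(task_t *p)
	{
		/* (5) remember where we slept, for migration time */
		p->iowait_cpu = p->cpu;
		atomic_inc(&nr_iowait_percpu[p->iowait_cpu]);
	}

	static inline void iowait_stop(task_t *p)
	{
		atomic_dec(&nr_iowait_percpu[p->iowait_cpu]);
	}

	/* (3) lock-free approximation of the global count */
	static inline int nr_iowait(void)
	{
		int i, sum = 0;
		for (i = 0; i < smp_num_cpus; i++)
			sum += atomic_read(&nr_iowait_percpu[cpu_logical_map(i)]);
		return sum;
	}

At migration time the migration thread would move the count from
p->iowait_cpu to the destination cpu, which is why (4) wants the
counters to stay atomic.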


Cheers,
Bill


* Re: [RFC][PATCH] iowait statistics
  2002-05-14 15:39 ` William Lee Irwin III
@ 2002-05-14 16:36   ` Rik van Riel
  2002-05-14 16:54     ` William Lee Irwin III
  2002-05-14 18:19     ` Martin J. Bligh
  0 siblings, 2 replies; 28+ messages in thread
From: Rik van Riel @ 2002-05-14 16:36 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: linux-kernel, linux-mm

On Tue, 14 May 2002, William Lee Irwin III wrote:
> On Mon, May 13, 2002 at 10:19:26PM -0300, Rik van Riel wrote:
> > 2) if no process is running, the timer interrupt adds a jiffy
> >    to the iowait time
> [...]
> > 4) on SMP systems the iowait time can be overestimated, no big
> >    deal IMHO but cheap suggestions for improvement are welcome
                     ^^^^^
> This appears to be global across all cpu's. Maybe nr_iowait_tasks
> should be accounted on a per-cpu basis, where

While your proposal should work, somehow I doubt it's worth
the complexity. It's just a statistic to help sysadmins ;)

regards,

Rik
-- 
	http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/		http://distro.conectiva.com/



* Re: [RFC][PATCH] iowait statistics
  2002-05-14 16:36   ` Rik van Riel
@ 2002-05-14 16:54     ` William Lee Irwin III
  2002-05-15 17:17       ` Denis Vlasenko
  2002-05-14 18:19     ` Martin J. Bligh
  1 sibling, 1 reply; 28+ messages in thread
From: William Lee Irwin III @ 2002-05-14 16:54 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-kernel, linux-mm

On Tue, 14 May 2002, William Lee Irwin III wrote:
>> This appears to be global across all cpu's. Maybe nr_iowait_tasks
>> should be accounted on a per-cpu basis, where

On Tue, May 14, 2002 at 01:36:00PM -0300, Rik van Riel wrote:
> While your proposal should work, somehow I doubt it's worth
> the complexity. It's just a statistic to help sysadmins ;)

I reserved judgment on that in order to present a possible mechanism.
I'm not sure it is either; we'll know it matters if sysadmins scream.


Cheers,
Bill


* Re: [RFC][PATCH] iowait statistics
  2002-05-14 16:36   ` Rik van Riel
  2002-05-14 16:54     ` William Lee Irwin III
@ 2002-05-14 18:19     ` Martin J. Bligh
  1 sibling, 0 replies; 28+ messages in thread
From: Martin J. Bligh @ 2002-05-14 18:19 UTC (permalink / raw)
  To: Rik van Riel, William Lee Irwin III; +Cc: linux-kernel, linux-mm

>> This appears to be global across all cpu's. Maybe nr_iowait_tasks
>> should be accounted on a per-cpu basis, where
> 
> While your proposal should work, somehow I doubt it's worth
> the complexity. It's just a statistic to help sysadmins ;)

Depends how often you're going to end up bouncing that cacheline 
around ... do you do this for every IO?

M.



* Re: [RFC][PATCH] iowait statistics
  2002-05-14  1:19 [RFC][PATCH] iowait statistics Rik van Riel
  2002-05-14  2:18 ` Andrew Morton
  2002-05-14 15:39 ` William Lee Irwin III
@ 2002-05-15  1:31 ` Bill Davidsen
  2002-05-15  1:41   ` William Lee Irwin III
  2 siblings, 1 reply; 28+ messages in thread
From: Bill Davidsen @ 2002-05-15  1:31 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-kernel, linux-mm

[-- Attachment #1: Type: TEXT/PLAIN, Size: 957 bytes --]

On Mon, 13 May 2002, Rik van Riel wrote:

> Hi,
> 
> the following patch implements iowait statistics in a simple way:

This follows some work I was doing back in 2.4.10 or so WRT just measuring
the delay caused by waiting for page IO. Attached is a patch against
2.4.19-pre8-ac3 which is the offspring of preempt-kernel and Rik's waitio
patch. All I can claim is that it still runs and boots, seems to have
working preempt, and returns reasonable numbers for iowait on a
uniprocessor. I intend to test more, but I know that this is a popular ac
version, and hopefully this will be useful.

  Until I try this on a real machine and see what tuning of things like
hdparm and elvtune do, test rmap against -aa, etc, this is an exercise in
getting it to work. I'd be interested in feedback, and I hope Rik will
continue his development.

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.

[-- Attachment #2: Type: TEXT/PLAIN, Size: 69768 bytes --]

*** ./Makefile	Tue May 14 14:59:18 2002
--- ../linux-2.4.19-pre8-ac3p/./Makefile	Tue May 14 17:01:49 2002
***************
*** 1,7 ****
  VERSION = 2
  PATCHLEVEL = 4
  SUBLEVEL = 19
! EXTRAVERSION = -pre8-ac3
  
  KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
  
--- 1,7 ----
  VERSION = 2
  PATCHLEVEL = 4
  SUBLEVEL = 19
! EXTRAVERSION = -pre8-ac3.1p+iow
  
  KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
  
*** ./fs/buffer.c	Tue May 14 14:59:19 2002
--- ../linux-2.4.19-pre8-ac3p/./fs/buffer.c	Tue May 14 16:43:04 2002
***************
*** 154,164 ****
--- 154,166 ----
  	get_bh(bh);
  	add_wait_queue(&bh->b_wait, &wait);
  	do {
+ 		atomic_inc(&nr_iowait_tasks);
  		run_task_queue(&tq_disk);
  		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
  		if (!buffer_locked(bh))
  			break;
  		schedule();
+ 		atomic_dec(&nr_iowait_tasks);
  	} while (buffer_locked(bh));
  	tsk->state = TASK_RUNNING;
  	remove_wait_queue(&bh->b_wait, &wait);
*** ./fs/exec.c	Tue May 14 14:59:19 2002
--- ../linux-2.4.19-pre8-ac3p/./fs/exec.c	Tue May 14 15:09:17 2002
***************
*** 427,434 ****
  		active_mm = current->active_mm;
  		current->mm = mm;
  		current->active_mm = mm;
- 		task_unlock(current);
  		activate_mm(active_mm, mm);
  		mm_release();
  		if (old_mm) {
  			if (active_mm != old_mm) BUG();
--- 427,434 ----
  		active_mm = current->active_mm;
  		current->mm = mm;
  		current->active_mm = mm;
  		activate_mm(active_mm, mm);
+ 		task_unlock(current);
  		mm_release();
  		if (old_mm) {
  			if (active_mm != old_mm) BUG();
*** ./fs/proc/proc_misc.c	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./fs/proc/proc_misc.c	Tue May 14 16:48:00 2002
***************
*** 273,279 ****
  	int i, len;
  	extern unsigned long total_forks;
  	unsigned long jif = jiffies;
! 	unsigned int sum = 0, user = 0, nice = 0, system = 0;
  	int major, disk;
  
  	for (i = 0 ; i < smp_num_cpus; i++) {
--- 273,279 ----
  	int i, len;
  	extern unsigned long total_forks;
  	unsigned long jif = jiffies;
! 	unsigned int sum = 0, user = 0, nice = 0, system = 0, iowait = 0;
  	int major, disk;
  
  	for (i = 0 ; i < smp_num_cpus; i++) {
***************
*** 282,304 ****
  		user += kstat.per_cpu_user[cpu];
  		nice += kstat.per_cpu_nice[cpu];
  		system += kstat.per_cpu_system[cpu];
  #if !defined(CONFIG_ARCH_S390)
  		for (j = 0 ; j < NR_IRQS ; j++)
  			sum += kstat.irqs[cpu][j];
  #endif
  	}
  
! 	len = sprintf(page, "cpu  %u %u %u %lu\n", user, nice, system,
! 		      jif * smp_num_cpus - (user + nice + system));
  	for (i = 0 ; i < smp_num_cpus; i++)
! 		len += sprintf(page + len, "cpu%d %u %u %u %lu\n",
  			i,
  			kstat.per_cpu_user[cpu_logical_map(i)],
  			kstat.per_cpu_nice[cpu_logical_map(i)],
  			kstat.per_cpu_system[cpu_logical_map(i)],
  			jif - (  kstat.per_cpu_user[cpu_logical_map(i)] \
  				   + kstat.per_cpu_nice[cpu_logical_map(i)] \
! 				   + kstat.per_cpu_system[cpu_logical_map(i)]));
  	len += sprintf(page + len,
  		"page %u %u\n"
  		"swap %u %u\n"
--- 282,307 ----
  		user += kstat.per_cpu_user[cpu];
  		nice += kstat.per_cpu_nice[cpu];
  		system += kstat.per_cpu_system[cpu];
+ 		iowait += kstat.per_cpu_iowait[cpu];
  #if !defined(CONFIG_ARCH_S390)
  		for (j = 0 ; j < NR_IRQS ; j++)
  			sum += kstat.irqs[cpu][j];
  #endif
  	}
  
! 	len = sprintf(page, "cpu  %u %u %u %lu %u\n", user, nice, system,
! 		      jif * smp_num_cpus - (user + nice + system),
! 		      iowait);
  	for (i = 0 ; i < smp_num_cpus; i++)
! 		len += sprintf(page + len, "cpu%d %u %u %u %lu %u\n",
  			i,
  			kstat.per_cpu_user[cpu_logical_map(i)],
  			kstat.per_cpu_nice[cpu_logical_map(i)],
  			kstat.per_cpu_system[cpu_logical_map(i)],
  			jif - (  kstat.per_cpu_user[cpu_logical_map(i)] \
  				   + kstat.per_cpu_nice[cpu_logical_map(i)] \
! 				   + kstat.per_cpu_system[cpu_logical_map(i)]),
! 			kstat.per_cpu_iowait[cpu_logical_map(i)]);
  	len += sprintf(page + len,
  		"page %u %u\n"
  		"swap %u %u\n"
*** ./fs/fat/cache.c	Fri Oct 12 16:48:42 2001
--- ../linux-2.4.19-pre8-ac3p/./fs/fat/cache.c	Tue May 14 15:09:17 2002
***************
*** 14,19 ****
--- 14,20 ----
  #include <linux/string.h>
  #include <linux/stat.h>
  #include <linux/fat_cvf.h>
+ #include <linux/sched.h>
  
  #if 0
  #  define PRINTK(x) printk x
*** ./fs/nls/nls_base.c	Tue May 14 14:55:54 2002
--- ../linux-2.4.19-pre8-ac3p/./fs/nls/nls_base.c	Tue May 14 15:09:18 2002
***************
*** 18,23 ****
--- 18,24 ----
  #ifdef CONFIG_KMOD
  #include <linux/kmod.h>
  #endif
+ #include <linux/sched.h>
  #include <linux/spinlock.h>
  
  static struct nls_table *tables;
*** ./fs/adfs/map.c	Thu Oct 25 16:53:53 2001
--- ../linux-2.4.19-pre8-ac3p/./fs/adfs/map.c	Tue May 14 15:09:17 2002
***************
*** 12,17 ****
--- 12,18 ----
  #include <linux/fs.h>
  #include <linux/adfs_fs.h>
  #include <linux/spinlock.h>
+ #include <linux/sched.h>
  
  #include "adfs.h"
  
*** ./kernel/sched.c	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./kernel/sched.c	Tue May 14 15:09:18 2002
***************
*** 165,174 ****
--- 165,176 ----
  	struct runqueue *rq;
  
  repeat_lock_task:
+ 	preempt_disable();
  	rq = task_rq(p);
  	spin_lock_irqsave(&rq->lock, *flags);
  	if (unlikely(rq != task_rq(p))) {
  		spin_unlock_irqrestore(&rq->lock, *flags);
+ 		preempt_enable();
  		goto repeat_lock_task;
  	}
  	return rq;
***************
*** 177,182 ****
--- 179,185 ----
  static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
  {
  	spin_unlock_irqrestore(&rq->lock, *flags);
+ 	preempt_enable();
  }
  
  /*
***************
*** 257,267 ****
--- 260,272 ----
  {
  	int need_resched;
  
+ 	preempt_disable();
  	need_resched = p->need_resched;
  	wmb();
  	set_tsk_need_resched(p);
  	if (!need_resched && (p->cpu != smp_processor_id()))
  		smp_send_reschedule(p->cpu);
+ 	preempt_enable();
  }
  
  #ifdef CONFIG_SMP
***************
*** 276,281 ****
--- 281,287 ----
  	runqueue_t *rq;
  
  repeat:
+ 	preempt_disable();
  	rq = task_rq(p);
  	while (unlikely(rq->curr == p)) {
  		cpu_relax();
***************
*** 284,292 ****
--- 290,300 ----
  	rq = task_rq_lock(p, &flags);
  	if (unlikely(rq->curr == p)) {
  		task_rq_unlock(rq, &flags);
+ 		preempt_enable();
  		goto repeat;
  	}
  	task_rq_unlock(rq, &flags);
+ 	preempt_enable();
  }
  
  /*
***************
*** 340,345 ****
--- 348,354 ----
  {
  	runqueue_t *rq;
  
+ 	preempt_disable();
  	rq = this_rq();
  	spin_lock_irq(&rq->lock);
  
***************
*** 357,362 ****
--- 366,372 ----
  	p->cpu = smp_processor_id();
  	activate_task(p, rq);
  	spin_unlock_irq(&rq->lock);
+ 	preempt_enable();
  }
  
  /*
***************
*** 384,390 ****
  			p->sleep_avg) / (EXIT_WEIGHT + 1);
  }
  
! #if CONFIG_SMP
  asmlinkage void schedule_tail(task_t *prev)
  {
  	spin_unlock_irq(&this_rq()->frozen);
--- 394,400 ----
  			p->sleep_avg) / (EXIT_WEIGHT + 1);
  }
  
! #if CONFIG_SMP || CONFIG_PREEMPT
  asmlinkage void schedule_tail(task_t *prev)
  {
  	spin_unlock_irq(&this_rq()->frozen);
***************
*** 739,744 ****
--- 749,755 ----
  	BUG_ON(in_interrupt());
  
  need_resched:
+ 	preempt_disable();
  	prev = current;
  	rq = this_rq();
  
***************
*** 746,751 ****
--- 757,769 ----
  	prev->sleep_timestamp = jiffies;
  	spin_lock_irq(&rq->lock);
  
+ 	/*
+ 	 * if entering from preempt_schedule, off a kernel preemption,
+ 	 * go straight to picking the next task.
+ 	 */
+ 	if (unlikely(preempt_get_count() & PREEMPT_ACTIVE))
+ 		goto pick_next_task;
+ 
  	switch (prev->state) {
  	case TASK_INTERRUPTIBLE:
  		if (unlikely(signal_pending(prev))) {
***************
*** 757,765 ****
  	case TASK_RUNNING:
  		;
  	}
- #if CONFIG_SMP
  pick_next_task:
- #endif
  	if (unlikely(!rq->nr_running)) {
  #if CONFIG_SMP
  		load_balance(rq, 1);
--- 775,781 ----
***************
*** 810,820 ****
--- 826,855 ----
  	}
  
  	reacquire_kernel_lock(current);
+ 	preempt_enable_no_resched();
  	if (need_resched())
  		goto need_resched;
  	return;
  }
  
+ #ifdef CONFIG_PREEMPT
+ /*
+  * this is the entry point to schedule() from in-kernel preemption.
+  */
+ asmlinkage void preempt_schedule(void)
+ {
+ need_resched:
+ 	current->preempt_count += PREEMPT_ACTIVE;
+ 	schedule();
+  	current->preempt_count -= PREEMPT_ACTIVE;
+ 
+ 	/* we could miss a preemption between schedule() and now */
+  	barrier();
+ 	if (unlikely((current->need_resched)))
+ 		goto need_resched;
+ }
+ #endif /* CONFIG_PREEMPT */
+ 
  /*
   * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
   * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
***************
*** 1192,1197 ****
--- 1227,1233 ----
  	runqueue_t *rq;
  	prio_array_t *array;
  
+ 	preempt_disable();
  	rq = this_rq();
  
  	/*
***************
*** 1220,1225 ****
--- 1256,1262 ----
  		__set_bit(current->prio, array->bitmap);
  	}
  	spin_unlock(&rq->lock);
+ 	preempt_enable_no_resched();
  
  	schedule();
  
***************
*** 1424,1429 ****
--- 1461,1469 ----
  	double_rq_unlock(idle_rq, rq);
  	set_tsk_need_resched(idle);
  	__restore_flags(flags);
+ 
+ 	/* Set the preempt count _outside_ the spinlocks! */
+ 	idle->preempt_count = (idle->lock_depth >= 0);
  }
  
  extern void init_timervecs(void);
***************
*** 1520,1525 ****
--- 1560,1566 ----
  	if (!new_mask)
  		BUG();
  
+ 	preempt_disable();
  	rq = task_rq_lock(p, &flags);
  	p->cpus_allowed = new_mask;
  	/*
***************
*** 1528,1534 ****
  	 */
  	if (new_mask & (1UL << p->cpu)) {
  		task_rq_unlock(rq, &flags);
! 		return;
  	}
  
  	init_MUTEX_LOCKED(&req.sem);
--- 1569,1575 ----
  	 */
  	if (new_mask & (1UL << p->cpu)) {
  		task_rq_unlock(rq, &flags);
! 		goto out;
  	}
  
  	init_MUTEX_LOCKED(&req.sem);
***************
*** 1538,1543 ****
--- 1579,1586 ----
  	wake_up_process(rq->migration_thread);
  
  	down(&req.sem);
+ out:
+ 	preempt_enable();
  }
  
  static int migration_thread(void * bind_cpu)
***************
*** 1592,1609 ****
  		cpu_dest = __ffs(p->cpus_allowed);
  		rq_dest = cpu_rq(cpu_dest);
  repeat:
! 		cpu_src = p->thread_info->cpu;
  		rq_src = cpu_rq(cpu_src);
  
  		local_irq_save(flags);
  		double_rq_lock(rq_src, rq_dest);
! 		if (p->thread_info->cpu != cpu_src) {
  			double_rq_unlock(rq_src, rq_dest);
  			local_irq_restore(flags);
  			goto repeat;
  		}
  		if (rq_src == rq) {
! 			p->thread_info->cpu = cpu_dest;
  			if (p->array) {
  				deactivate_task(p, rq_src);
  				activate_task(p, rq_dest);
--- 1635,1652 ----
  		cpu_dest = __ffs(p->cpus_allowed);
  		rq_dest = cpu_rq(cpu_dest);
  repeat:
! 		cpu_src = p->cpu;
  		rq_src = cpu_rq(cpu_src);
  
  		local_irq_save(flags);
  		double_rq_lock(rq_src, rq_dest);
! 		if (p->cpu != cpu_src) {
  			double_rq_unlock(rq_src, rq_dest);
  			local_irq_restore(flags);
  			goto repeat;
  		}
  		if (rq_src == rq) {
! 			p->cpu = cpu_dest;
  			if (p->array) {
  				deactivate_task(p, rq_src);
  				activate_task(p, rq_dest);
*** ./kernel/exit.c	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./kernel/exit.c	Tue May 14 15:09:18 2002
***************
*** 373,380 ****
  		/* more a memory barrier than a real lock */
  		task_lock(tsk);
  		tsk->mm = NULL;
- 		task_unlock(tsk);
  		enter_lazy_tlb(mm, current, smp_processor_id());
  		mmput(mm);
  	}
  }
--- 373,380 ----
  		/* more a memory barrier than a real lock */
  		task_lock(tsk);
  		tsk->mm = NULL;
  		enter_lazy_tlb(mm, current, smp_processor_id());
+ 		task_unlock(tsk);
  		mmput(mm);
  	}
  }
***************
*** 494,499 ****
--- 494,504 ----
  		panic("Attempted to kill init!");
  	tsk->flags |= PF_EXITING;
  	del_timer_sync(&tsk->real_timer);
+ 
+ 	if (unlikely(preempt_get_count()))
+ 		printk(KERN_ERR "%s[%d] exited with preempt_count %d\n",
+ 				current->comm, current->pid,
+ 				preempt_get_count());
  
  fake_volatile:
  #ifdef CONFIG_BSD_PROCESS_ACCT
*** ./kernel/fork.c	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./kernel/fork.c	Tue May 14 15:09:18 2002
***************
*** 640,645 ****
--- 640,652 ----
  	if (p->binfmt && p->binfmt->module)
  		__MOD_INC_USE_COUNT(p->binfmt->module);
  
+ #ifdef CONFIG_PREEMPT
+ 	/*
+ 	 * schedule_tail drops this_rq()->lock so compensate with a count
+ 	 * of 1.  Also, we want to start with kernel preemption disabled.
+ 	 */
+ 	p->preempt_count = 1;
+ #endif
  	p->did_exec = 0;
  	p->swappable = 0;
  	p->state = TASK_UNINTERRUPTIBLE;
*** ./kernel/ksyms.c	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./kernel/ksyms.c	Tue May 14 15:09:18 2002
***************
*** 442,447 ****
--- 442,450 ----
  EXPORT_SYMBOL(interruptible_sleep_on);
  EXPORT_SYMBOL(interruptible_sleep_on_timeout);
  EXPORT_SYMBOL(schedule);
+ #ifdef CONFIG_PREEMPT
+ EXPORT_SYMBOL(preempt_schedule);
+ #endif
  EXPORT_SYMBOL(schedule_timeout);
  EXPORT_SYMBOL(sys_sched_yield);
  EXPORT_SYMBOL(set_user_nice);
*** ./kernel/timer.c	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./kernel/timer.c	Tue May 14 18:42:46 2002
***************
*** 585,590 ****
--- 585,592 ----
  {
  	p->per_cpu_utime[cpu] += user;
  	p->per_cpu_stime[cpu] += system;
+ 	if (atomic_read(&nr_iowait_tasks) > 0)
+ 		kstat.per_cpu_iowait[cpu] += system;
  	do_process_times(p, user, system);
  	do_it_virt(p, user);
  	do_it_prof(p);
*** ./lib/dec_and_lock.c	Wed Oct  3 12:11:26 2001
--- ../linux-2.4.19-pre8-ac3p/./lib/dec_and_lock.c	Tue May 14 15:09:18 2002
***************
*** 1,5 ****
--- 1,6 ----
  #include <linux/module.h>
  #include <linux/spinlock.h>
+ #include <linux/sched.h>
  #include <asm/atomic.h>
  
  /*
*** ./mm/filemap.c	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./mm/filemap.c	Tue May 14 17:01:27 2002
***************
*** 45,50 ****
--- 45,51 ----
   */
  
  atomic_t page_cache_size = ATOMIC_INIT(0);
+ atomic_t nr_iowait_tasks = ATOMIC_INIT(0);
  unsigned int page_hash_bits;
  struct page **page_hash_table;
  
***************
*** 828,835 ****
--- 829,838 ----
  		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
  		if (!PageLocked(page))
  			break;
+ 		atomic_inc(&nr_iowait_tasks);
  		sync_page(page);
  		schedule();
+ 		atomic_dec(&nr_iowait_tasks);
  	} while (PageLocked(page));
  	__set_task_state(tsk, TASK_RUNNING);
  	remove_wait_queue(waitqueue, &wait);
***************
*** 875,882 ****
--- 878,887 ----
  	for (;;) {
  		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
  		if (PageLocked(page)) {
+ 			atomic_inc(&nr_iowait_tasks);
  			sync_page(page);
  			schedule();
+ 			atomic_dec(&nr_iowait_tasks);
  		}
  		if (!TryLockPage(page))
  			break;
*** ./mm/slab.c	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./mm/slab.c	Tue May 14 15:09:18 2002
***************
*** 49,55 ****
   *  constructors and destructors are called without any locking.
   *  Several members in kmem_cache_t and slab_t never change, they
   *	are accessed without any locking.
!  *  The per-cpu arrays are never accessed from the wrong cpu, no locking.
   *  The non-constant members are protected with a per-cache irq spinlock.
   *
   * Further notes from the original documentation:
--- 49,56 ----
   *  constructors and destructors are called without any locking.
   *  Several members in kmem_cache_t and slab_t never change, they
   *	are accessed without any locking.
!  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
!  *  	and local interrupts are disabled so slab code is preempt-safe.
   *  The non-constant members are protected with a per-cache irq spinlock.
   *
   * Further notes from the original documentation:
*** ./CREDITS	Tue May 14 14:59:18 2002
--- ../linux-2.4.19-pre8-ac3p/./CREDITS	Tue May 14 15:09:17 2002
***************
*** 996,1003 ****
  
  N: Nigel Gamble
  E: nigel@nrg.org
- E: nigel@sgi.com
  D: Interrupt-driven printer driver
  S: 120 Alley Way
  S: Mountain View, California 94040
  S: USA
--- 996,1003 ----
  
  N: Nigel Gamble
  E: nigel@nrg.org
  D: Interrupt-driven printer driver
+ D: Preemptible kernel
  S: 120 Alley Way
  S: Mountain View, California 94040
  S: USA
*** ./include/linux/sched.h	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./include/linux/sched.h	Tue May 14 18:09:05 2002
***************
*** 91,96 ****
--- 91,97 ----
  #define TASK_UNINTERRUPTIBLE	2
  #define TASK_ZOMBIE		4
  #define TASK_STOPPED		8
+ #define PREEMPT_ACTIVE		0x4000000
  
  #define __set_task_state(tsk, state_value)		\
  	do { (tsk)->state = (state_value); } while (0)
***************
*** 156,161 ****
--- 157,165 ----
  #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
  extern signed long FASTCALL(schedule_timeout(signed long timeout));
  asmlinkage void schedule(void);
+ #ifdef CONFIG_PREEMPT
+ asmlinkage void preempt_schedule(void);
+ #endif
  
  extern int schedule_task(struct tq_struct *task);
  extern void flush_scheduled_tasks(void);
***************
*** 291,297 ****
  	 * offsets of these are hardcoded elsewhere - touch with care
  	 */
  	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
! 	unsigned long flags;	/* per process flags, defined below */
  	int sigpending;
  	mm_segment_t addr_limit;	/* thread address space:
  					 	0-0xBFFFFFFF for user-thead
--- 295,301 ----
  	 * offsets of these are hardcoded elsewhere - touch with care
  	 */
  	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
! 	int preempt_count;	/* 0 => preemptable, <0 => BUG */
  	int sigpending;
  	mm_segment_t addr_limit;	/* thread address space:
  					 	0-0xBFFFFFFF for user-thead
***************
*** 317,322 ****
--- 321,327 ----
  	unsigned long policy;
  	unsigned long cpus_allowed;
  	unsigned int time_slice;
+ 	unsigned long flags;
  
  	task_t *next_task, *prev_task;
  
***************
*** 358,363 ****
--- 363,369 ----
  	struct tms times;
  	unsigned long start_time;
  	long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
+ 	long per_cpu_iowait[NR_CPUS];
  /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
  	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
  	int swappable:1;
***************
*** 942,947 ****
--- 948,958 ----
  {
  	return unlikely(current->need_resched);
  }
+ 
+ #define _TASK_STRUCT_DEFINED
+ #include <linux/dcache.h>
+ #include <linux/tqueue.h>
+ #include <linux/fs_struct.h>
  
  #endif /* __KERNEL__ */
  
*** ./include/linux/tqueue.h	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./include/linux/tqueue.h	Tue May 14 15:20:25 2002
***************
*** 94,99 ****
--- 94,115 ----
  extern spinlock_t tqueue_lock;
  
  /*
+  * Call all "bottom halfs" on a given list.
+  */
+ 
+ extern void __run_task_queue(task_queue *list);
+ 
+ static inline void run_task_queue(task_queue *list)
+ {
+ 	if (TQ_ACTIVE(*list))
+ 		__run_task_queue(list);
+ }
+ 
+ #endif /* _LINUX_TQUEUE_H */
+ 
+ #if !defined(_LINUX_TQUEUE_H_INLINES) && defined(_TASK_STRUCT_DEFINED)
+ #define _LINUX_TQUEUE_H_INLINES
+ /*
   * Queue a task on a tq.  Return non-zero if it was successfully
   * added.
   */
***************
*** 109,125 ****
  	}
  	return ret;
  }
! 
! /*
!  * Call all "bottom halfs" on a given list.
!  */
! 
! extern void __run_task_queue(task_queue *list);
! 
! static inline void run_task_queue(task_queue *list)
! {
! 	if (TQ_ACTIVE(*list))
! 		__run_task_queue(list);
! }
! 
! #endif /* _LINUX_TQUEUE_H */
--- 125,128 ----
  	}
  	return ret;
  }
! #endif
*** ./include/linux/kernel_stat.h	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./include/linux/kernel_stat.h	Tue May 14 16:49:04 2002
***************
*** 18,24 ****
  struct kernel_stat {
  	unsigned int per_cpu_user[NR_CPUS],
  	             per_cpu_nice[NR_CPUS],
! 	             per_cpu_system[NR_CPUS];
  	unsigned int dk_drive[DK_MAX_MAJOR][DK_MAX_DISK];
  	unsigned int dk_drive_rio[DK_MAX_MAJOR][DK_MAX_DISK];
  	unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK];
--- 18,25 ----
  struct kernel_stat {
  	unsigned int per_cpu_user[NR_CPUS],
  	             per_cpu_nice[NR_CPUS],
! 	             per_cpu_system[NR_CPUS],
! 	             per_cpu_iowait[NR_CPUS];
  	unsigned int dk_drive[DK_MAX_MAJOR][DK_MAX_DISK];
  	unsigned int dk_drive_rio[DK_MAX_MAJOR][DK_MAX_DISK];
  	unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK];
*** ./include/linux/swap.h	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./include/linux/swap.h	Tue May 14 16:49:58 2002
***************
*** 90,95 ****
--- 90,96 ----
  extern int nr_inactive_clean_pages;
  extern atomic_t page_cache_size;
  extern atomic_t buffermem_pages;
+ extern atomic_t nr_iowait_tasks;
  extern spinlock_cacheline_t pagecache_lock_cacheline;
  #define pagecache_lock (pagecache_lock_cacheline.lock)
  extern void __remove_inode_page(struct page *);
*** ./include/linux/smp_lock.h	Thu Nov 22 14:46:27 2001
--- ../linux-2.4.19-pre8-ac3p/./include/linux/smp_lock.h	Tue May 14 18:09:08 2002
***************
*** 3,9 ****
  
  #include <linux/config.h>
  
! #ifndef CONFIG_SMP
  
  #define lock_kernel()				do { } while(0)
  #define unlock_kernel()				do { } while(0)
--- 3,9 ----
  
  #include <linux/config.h>
  
! #if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT)
  
  #define lock_kernel()				do { } while(0)
  #define unlock_kernel()				do { } while(0)
*** ./include/linux/dcache.h	Tue May 14 14:55:57 2002
--- ../linux-2.4.19-pre8-ac3p/./include/linux/dcache.h	Tue May 14 15:20:25 2002
***************
*** 126,156 ****
  
  extern spinlock_t dcache_lock;
  
- /**
-  * d_drop - drop a dentry
-  * @dentry: dentry to drop
-  *
-  * d_drop() unhashes the entry from the parent
-  * dentry hashes, so that it won't be found through
-  * a VFS lookup any more. Note that this is different
-  * from deleting the dentry - d_delete will try to
-  * mark the dentry negative if possible, giving a
-  * successful _negative_ lookup, while d_drop will
-  * just make the cache lookup fail.
-  *
-  * d_drop() is used mainly for stuff that wants
-  * to invalidate a dentry for some reason (NFS
-  * timeouts or autofs deletes).
-  */
- 
- static __inline__ void d_drop(struct dentry * dentry)
- {
- 	spin_lock(&dcache_lock);
- 	list_del(&dentry->d_hash);
- 	INIT_LIST_HEAD(&dentry->d_hash);
- 	spin_unlock(&dcache_lock);
- }
- 
  static __inline__ int dname_external(struct dentry *d)
  {
  	return d->d_name.name != d->d_iname; 
--- 126,131 ----
***************
*** 275,277 ****
--- 250,283 ----
  #endif /* __KERNEL__ */
  
  #endif	/* __LINUX_DCACHE_H */
+ 
+ #if !defined(__LINUX_DCACHE_H_INLINES) && defined(_TASK_STRUCT_DEFINED)
+ #define __LINUX_DCACHE_H_INLINES
+ 
+ #ifdef __KERNEL__
+ /**
+  * d_drop - drop a dentry
+  * @dentry: dentry to drop
+  *
+  * d_drop() unhashes the entry from the parent
+  * dentry hashes, so that it won't be found through
+  * a VFS lookup any more. Note that this is different
+  * from deleting the dentry - d_delete will try to
+  * mark the dentry negative if possible, giving a
+  * successful _negative_ lookup, while d_drop will
+  * just make the cache lookup fail.
+  *
+  * d_drop() is used mainly for stuff that wants
+  * to invalidate a dentry for some reason (NFS
+  * timeouts or autofs deletes).
+  */
+ 
+ static __inline__ void d_drop(struct dentry * dentry)
+ {
+ 	spin_lock(&dcache_lock);
+ 	list_del(&dentry->d_hash);
+ 	INIT_LIST_HEAD(&dentry->d_hash);
+ 	spin_unlock(&dcache_lock);
+ }
+ #endif
+ #endif
*** ./include/linux/smp.h	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./include/linux/smp.h	Tue May 14 15:20:25 2002
***************
*** 81,87 ****
--- 81,89 ----
  #define smp_processor_id()			0
  #define hard_smp_processor_id()			0
  #define smp_threads_ready			1
+ #ifndef CONFIG_PREEMPT
  #define kernel_lock()
+ #endif
  #define cpu_logical_map(cpu)			0
  #define cpu_number_map(cpu)			0
  #define smp_call_function(func,info,retry,wait)	({ 0; })
*** ./include/linux/spinlock.h	Tue May 14 14:55:58 2002
--- ../linux-2.4.19-pre8-ac3p/./include/linux/spinlock.h	Tue May 14 15:20:25 2002
***************
*** 2,7 ****
--- 2,8 ----
  #define __LINUX_SPINLOCK_H
  
  #include <linux/config.h>
+ #include <linux/compiler.h>
  
  /*
   * These are the generic versions of the spinlocks and read-write
***************
*** 62,69 ****
--- 63,72 ----
  
  #if (DEBUG_SPINLOCKS < 1)
  
+ #ifndef CONFIG_PREEMPT
  #define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic)
  #define ATOMIC_DEC_AND_LOCK
+ #endif
  
  /*
   * Your basic spinlocks, allowing only a single CPU anywhere
***************
*** 79,89 ****
  #endif
  
  #define spin_lock_init(lock)	do { } while(0)
! #define spin_lock(lock)		(void)(lock) /* Not "unused variable". */
  #define spin_is_locked(lock)	(0)
! #define spin_trylock(lock)	({1; })
  #define spin_unlock_wait(lock)	do { } while(0)
! #define spin_unlock(lock)	do { } while(0)
  
  #elif (DEBUG_SPINLOCKS < 2)
  
--- 82,92 ----
  #endif
  
  #define spin_lock_init(lock)	do { } while(0)
! #define _raw_spin_lock(lock)	(void)(lock) /* Not "unused variable". */
  #define spin_is_locked(lock)	(0)
! #define _raw_spin_trylock(lock)	({1; })
  #define spin_unlock_wait(lock)	do { } while(0)
! #define _raw_spin_unlock(lock)	do { } while(0)
  
  #elif (DEBUG_SPINLOCKS < 2)
  
***************
*** 142,153 ****
  #endif
  
  #define rwlock_init(lock)	do { } while(0)
! #define read_lock(lock)		(void)(lock) /* Not "unused variable". */
! #define read_unlock(lock)	do { } while(0)
! #define write_lock(lock)	(void)(lock) /* Not "unused variable". */
! #define write_unlock(lock)	do { } while(0)
  
  #endif /* !SMP */
  
  /* "lock on reference count zero" */
  #ifndef ATOMIC_DEC_AND_LOCK
--- 145,219 ----
  #endif
  
  #define rwlock_init(lock)	do { } while(0)
! #define _raw_read_lock(lock)	(void)(lock) /* Not "unused variable". */
! #define _raw_read_unlock(lock)	do { } while(0)
! #define _raw_write_lock(lock)	(void)(lock) /* Not "unused variable". */
! #define _raw_write_unlock(lock)	do { } while(0)
  
  #endif /* !SMP */
+ 
+ #ifdef CONFIG_PREEMPT
+ 
+ #define preempt_get_count() (current->preempt_count)
+ 
+ #define preempt_disable() \
+ do { \
+ 	++current->preempt_count; \
+ 	barrier(); \
+ } while (0)
+ 
+ #define preempt_enable_no_resched() \
+ do { \
+ 	--current->preempt_count; \
+ 	barrier(); \
+ } while (0)
+ 
+ #define preempt_enable() \
+ do { \
+ 	--current->preempt_count; \
+ 	barrier(); \
+ 	if (unlikely(current->preempt_count < current->need_resched)) \
+ 		preempt_schedule(); \
+ } while (0)
+ 
+ #define spin_lock(lock)	\
+ do { \
+ 	preempt_disable(); \
+ 	_raw_spin_lock(lock); \
+ } while(0)
+ 
+ #define spin_trylock(lock)	({preempt_disable(); _raw_spin_trylock(lock) ? \
+ 				1 : ({preempt_enable(); 0;});})
+ #define spin_unlock(lock) \
+ do { \
+ 	_raw_spin_unlock(lock); \
+ 	preempt_enable(); \
+ } while (0)
+ 
+ #define read_lock(lock)		({preempt_disable(); _raw_read_lock(lock);})
+ #define read_unlock(lock)	({_raw_read_unlock(lock); preempt_enable();})
+ #define write_lock(lock)	({preempt_disable(); _raw_write_lock(lock);})
+ #define write_unlock(lock)	({_raw_write_unlock(lock); preempt_enable();})
+ #define write_trylock(lock)	({preempt_disable();_raw_write_trylock(lock) ? \
+ 				1 : ({preempt_enable(); 0;});})
+ 
+ #else
+ 
+ #define preempt_get_count()	(0)
+ #define preempt_disable()	do { } while (0)
+ #define preempt_enable_no_resched()	do {} while(0)
+ #define preempt_enable()	do { } while (0)
+ 
+ #define spin_lock(lock)		_raw_spin_lock(lock)
+ #define spin_trylock(lock)	_raw_spin_trylock(lock)
+ #define spin_unlock(lock)	_raw_spin_unlock(lock)
+ 
+ #define read_lock(lock)		_raw_read_lock(lock)
+ #define read_unlock(lock)	_raw_read_unlock(lock)
+ #define write_lock(lock)	_raw_write_lock(lock)
+ #define write_unlock(lock)	_raw_write_unlock(lock)
+ #define write_trylock(lock)	_raw_write_trylock(lock)
+ #endif
  
  /* "lock on reference count zero" */
  #ifndef ATOMIC_DEC_AND_LOCK
*** ./include/linux/fs_struct.h	Fri Jul 13 18:10:44 2001
--- ../linux-2.4.19-pre8-ac3p/./include/linux/fs_struct.h	Tue May 14 15:09:18 2002
***************
*** 20,25 ****
--- 20,34 ----
  extern void exit_fs(struct task_struct *);
  extern void set_fs_altroot(void);
  
+ struct fs_struct *copy_fs_struct(struct fs_struct *old);
+ void put_fs_struct(struct fs_struct *fs);
+ 
+ #endif
+ #endif
+ 
+ #if !defined(_LINUX_FS_STRUCT_H_INLINES) && defined(_TASK_STRUCT_DEFINED)
+ #define _LINUX_FS_STRUCT_H_INLINES
+ #ifdef __KERNEL__
  /*
   * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
   * It can block. Requires the big lock held.
***************
*** 65,73 ****
  		mntput(old_pwdmnt);
  	}
  }
- 
- struct fs_struct *copy_fs_struct(struct fs_struct *old);
- void put_fs_struct(struct fs_struct *fs);
- 
  #endif
  #endif
--- 74,78 ----
*** ./include/linux/brlock.h	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./include/linux/brlock.h	Tue May 14 15:20:27 2002
***************
*** 173,183 ****
  }
  
  #else
! # define br_read_lock(idx)	((void)(idx))
! # define br_read_unlock(idx)	((void)(idx))
! # define br_write_lock(idx)	((void)(idx))
! # define br_write_unlock(idx)	((void)(idx))
! #endif
  
  /*
   * Now enumerate all of the possible sw/hw IRQ protected
--- 173,183 ----
  }
  
  #else
! # define br_read_lock(idx)	({ (void)(idx); preempt_disable(); })
! # define br_read_unlock(idx)	({ (void)(idx); preempt_enable(); })
! # define br_write_lock(idx)	({ (void)(idx); preempt_disable(); })
! # define br_write_unlock(idx)	({ (void)(idx); preempt_enable(); })
! #endif	/* CONFIG_SMP */
  
  /*
   * Now enumerate all of the possible sw/hw IRQ protected
*** ./include/asm-i386/hardirq.h	Thu Nov 22 14:46:19 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-i386/hardirq.h	Tue May 14 15:20:25 2002
***************
*** 36,41 ****
--- 36,43 ----
  
  #define synchronize_irq()	barrier()
  
+ #define release_irqlock(cpu)	do { } while (0)
+ 
  #else
  
  #include <asm/atomic.h>
*** ./include/asm-i386/softirq.h	Tue May 14 14:55:55 2002
--- ../linux-2.4.19-pre8-ac3p/./include/asm-i386/softirq.h	Tue May 14 15:20:25 2002
***************
*** 5,13 ****
  #include <asm/hardirq.h>
  
  #define __cpu_bh_enable(cpu) \
! 		do { barrier(); local_bh_count(cpu)--; } while (0)
  #define cpu_bh_disable(cpu) \
! 		do { local_bh_count(cpu)++; barrier(); } while (0)
  
  #define local_bh_disable()	cpu_bh_disable(smp_processor_id())
  #define __local_bh_enable()	__cpu_bh_enable(smp_processor_id())
--- 5,13 ----
  #include <asm/hardirq.h>
  
  #define __cpu_bh_enable(cpu) \
! 		do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0)
  #define cpu_bh_disable(cpu) \
! 		do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0)
  
  #define local_bh_disable()	cpu_bh_disable(smp_processor_id())
  #define __local_bh_enable()	__cpu_bh_enable(smp_processor_id())
***************
*** 22,28 ****
   * If you change the offsets in irq_stat then you have to
   * update this code as well.
   */
! #define local_bh_enable()						\
  do {									\
  	unsigned int *ptr = &local_bh_count(smp_processor_id());	\
  									\
--- 22,28 ----
   * If you change the offsets in irq_stat then you have to
   * update this code as well.
   */
! #define _local_bh_enable()						\
  do {									\
  	unsigned int *ptr = &local_bh_count(smp_processor_id());	\
  									\
***************
*** 44,48 ****
--- 44,50 ----
  		: "r" (ptr), "i" (do_softirq)				\
  		/* no registers clobbered */ );				\
  } while (0)
+ 
+ #define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0)
  
  #endif	/* __ASM_SOFTIRQ_H */
*** ./include/asm-i386/spinlock.h	Tue May 14 14:55:55 2002
--- ../linux-2.4.19-pre8-ac3p/./include/asm-i386/spinlock.h	Tue May 14 15:20:25 2002
***************
*** 77,83 ****
  		:"=m" (lock->lock) : : "memory"
  
  
! static inline void spin_unlock(spinlock_t *lock)
  {
  #if SPINLOCK_DEBUG
  	if (lock->magic != SPINLOCK_MAGIC)
--- 77,83 ----
  		:"=m" (lock->lock) : : "memory"
  
  
! static inline void _raw_spin_unlock(spinlock_t *lock)
  {
  #if SPINLOCK_DEBUG
  	if (lock->magic != SPINLOCK_MAGIC)
***************
*** 97,103 ****
  		:"=q" (oldval), "=m" (lock->lock) \
  		:"0" (oldval) : "memory"
  
! static inline void spin_unlock(spinlock_t *lock)
  {
  	char oldval = 1;
  #if SPINLOCK_DEBUG
--- 97,103 ----
  		:"=q" (oldval), "=m" (lock->lock) \
  		:"0" (oldval) : "memory"
  
! static inline void _raw_spin_unlock(spinlock_t *lock)
  {
  	char oldval = 1;
  #if SPINLOCK_DEBUG
***************
*** 113,119 ****
  
  #endif
  
! static inline int spin_trylock(spinlock_t *lock)
  {
  	char oldval;
  	__asm__ __volatile__(
--- 113,119 ----
  
  #endif
  
! static inline int _raw_spin_trylock(spinlock_t *lock)
  {
  	char oldval;
  	__asm__ __volatile__(
***************
*** 123,129 ****
  	return oldval > 0;
  }
  
! static inline void spin_lock(spinlock_t *lock)
  {
  #if SPINLOCK_DEBUG
  	__label__ here;
--- 123,129 ----
  	return oldval > 0;
  }
  
! static inline void _raw_spin_lock(spinlock_t *lock)
  {
  #if SPINLOCK_DEBUG
  	__label__ here;
***************
*** 179,185 ****
   */
  /* the spinlock helpers are in arch/i386/kernel/semaphore.c */
  
! static inline void read_lock(rwlock_t *rw)
  {
  #if SPINLOCK_DEBUG
  	if (rw->magic != RWLOCK_MAGIC)
--- 179,185 ----
   */
  /* the spinlock helpers are in arch/i386/kernel/semaphore.c */
  
! static inline void _raw_read_lock(rwlock_t *rw)
  {
  #if SPINLOCK_DEBUG
  	if (rw->magic != RWLOCK_MAGIC)
***************
*** 188,194 ****
  	__build_read_lock(rw, "__read_lock_failed");
  }
  
! static inline void write_lock(rwlock_t *rw)
  {
  #if SPINLOCK_DEBUG
  	if (rw->magic != RWLOCK_MAGIC)
--- 188,194 ----
  	__build_read_lock(rw, "__read_lock_failed");
  }
  
! static inline void _raw_write_lock(rwlock_t *rw)
  {
  #if SPINLOCK_DEBUG
  	if (rw->magic != RWLOCK_MAGIC)
***************
*** 197,206 ****
  	__build_write_lock(rw, "__write_lock_failed");
  }
  
! #define read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
! #define write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
  
! static inline int write_trylock(rwlock_t *lock)
  {
  	atomic_t *count = (atomic_t *)lock;
  	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
--- 197,206 ----
  	__build_write_lock(rw, "__write_lock_failed");
  }
  
! #define _raw_read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
! #define _raw_write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
  
! static inline int _raw_write_trylock(rwlock_t *lock)
  {
  	atomic_t *count = (atomic_t *)lock;
  	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
*** ./include/asm-i386/smplock.h	Tue May 14 14:55:55 2002
--- ../linux-2.4.19-pre8-ac3p/./include/asm-i386/smplock.h	Tue May 14 18:09:08 2002
***************
*** 11,17 ****
--- 11,25 ----
  extern spinlock_cacheline_t kernel_flag_cacheline;  
  #define kernel_flag kernel_flag_cacheline.lock      
  
+ #ifdef CONFIG_SMP
  #define kernel_locked()		spin_is_locked(&kernel_flag)
+ #else
+ #ifdef CONFIG_PREEMPT
+ #define kernel_locked()		preempt_get_count()
+ #else
+ #define kernel_locked()		1
+ #endif
+ #endif
  
  /*
   * Release global kernel lock and global interrupt lock
***************
*** 43,48 ****
--- 51,61 ----
   */
  static __inline__ void lock_kernel(void)
  {
+ #ifdef CONFIG_PREEMPT
+ 	if (current->lock_depth == -1)
+ 		spin_lock(&kernel_flag);
+ 	++current->lock_depth;
+ #else
  #if 1
  	if (!++current->lock_depth)
  		spin_lock(&kernel_flag);
***************
*** 54,59 ****
--- 67,73 ----
  		"\n9:"
  		:"=m" (__dummy_lock(&kernel_flag)),
  		 "=m" (current->lock_depth));
+ #endif
  #endif
  }
  
*** ./include/asm-i386/hw_irq.h	Thu Nov 22 14:46:18 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-i386/hw_irq.h	Tue May 14 15:20:25 2002
***************
*** 95,100 ****
--- 95,112 ----
  #define __STR(x) #x
  #define STR(x) __STR(x)
  
+ #define GET_CURRENT \
+ 	"movl %esp, %ebx\n\t" \
+ 	"andl $-8192, %ebx\n\t"
+ 
+ #ifdef CONFIG_PREEMPT
+ #define BUMP_LOCK_COUNT \
+ 	GET_CURRENT \
+ 	"incl 4(%ebx)\n\t"
+ #else
+ #define BUMP_LOCK_COUNT
+ #endif
+ 
  #define SAVE_ALL \
  	"cld\n\t" \
  	"pushl %es\n\t" \
***************
*** 108,121 ****
  	"pushl %ebx\n\t" \
  	"movl $" STR(__KERNEL_DS) ",%edx\n\t" \
  	"movl %edx,%ds\n\t" \
! 	"movl %edx,%es\n\t"
  
  #define IRQ_NAME2(nr) nr##_interrupt(void)
  #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
- 
- #define GET_CURRENT \
- 	"movl %esp, %ebx\n\t" \
- 	"andl $-8192, %ebx\n\t"
  
  /*
   *	SMP has a few special interrupts for IPI messages
--- 120,130 ----
  	"pushl %ebx\n\t" \
  	"movl $" STR(__KERNEL_DS) ",%edx\n\t" \
  	"movl %edx,%ds\n\t" \
! 	"movl %edx,%es\n\t" \
! 	BUMP_LOCK_COUNT
  
  #define IRQ_NAME2(nr) nr##_interrupt(void)
  #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
  
  /*
   *	SMP has a few special interrupts for IPI messages
*** ./include/asm-i386/highmem.h	Tue May 14 14:55:55 2002
--- ../linux-2.4.19-pre8-ac3p/./include/asm-i386/highmem.h	Tue May 14 15:20:25 2002
***************
*** 88,93 ****
--- 88,94 ----
  	enum fixed_addresses idx;
  	unsigned long vaddr;
  
+ 	preempt_disable();
  	if (page < highmem_start_page)
  		return page_address(page);
  
***************
*** 109,116 ****
  	unsigned long vaddr = (unsigned long) kvaddr;
  	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
  
! 	if (vaddr < FIXADDR_START) // FIXME
  		return;
  
  	if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
  		out_of_line_bug();
--- 110,119 ----
  	unsigned long vaddr = (unsigned long) kvaddr;
  	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
  
! 	if (vaddr < FIXADDR_START) { // FIXME
! 		preempt_enable();
  		return;
+ 	}
  
  	if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
  		out_of_line_bug();
***************
*** 122,127 ****
--- 125,132 ----
  	pte_clear(kmap_pte-idx);
  	__flush_tlb_one(vaddr);
  #endif
+ 
+ 	preempt_enable();
  }
  
  #endif /* __KERNEL__ */
*** ./include/asm-i386/pgalloc.h	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./include/asm-i386/pgalloc.h	Tue May 14 15:20:25 2002
***************
*** 75,94 ****
  {
  	unsigned long *ret;
  
  	if ((ret = pgd_quicklist) != NULL) {
  		pgd_quicklist = (unsigned long *)(*ret);
  		ret[0] = 0;
  		pgtable_cache_size--;
! 	} else
  		ret = (unsigned long *)get_pgd_slow();
  	return (pgd_t *)ret;
  }
  
  static inline void free_pgd_fast(pgd_t *pgd)
  {
  	*(unsigned long *)pgd = (unsigned long) pgd_quicklist;
  	pgd_quicklist = (unsigned long *) pgd;
  	pgtable_cache_size++;
  }
  
  static inline void free_pgd_slow(pgd_t *pgd)
--- 75,100 ----
  {
  	unsigned long *ret;
  
+ 	preempt_disable();
  	if ((ret = pgd_quicklist) != NULL) {
  		pgd_quicklist = (unsigned long *)(*ret);
  		ret[0] = 0;
  		pgtable_cache_size--;
! 		preempt_enable();
! 	} else {
! 		preempt_enable();
  		ret = (unsigned long *)get_pgd_slow();
+ 	}
  	return (pgd_t *)ret;
  }
  
  static inline void free_pgd_fast(pgd_t *pgd)
  {
+ 	preempt_disable();
  	*(unsigned long *)pgd = (unsigned long) pgd_quicklist;
  	pgd_quicklist = (unsigned long *) pgd;
  	pgtable_cache_size++;
+ 	preempt_enable();
  }
  
  static inline void free_pgd_slow(pgd_t *pgd)
***************
*** 119,129 ****
--- 125,137 ----
  {
  	unsigned long *ret;
  
+ 	preempt_disable();
  	if ((ret = (unsigned long *)pte_quicklist) != NULL) {
  		pte_quicklist = (unsigned long *)(*ret);
  		ret[0] = ret[1];
  		pgtable_cache_size--;
  	}
+ 	preempt_enable();
  	return (pte_t *)ret;
  }
  
***************
*** 135,146 ****
--- 143,156 ----
  extern int pgt_cache_water[];
  static inline void pte_free_fast(pte_t *pte)
  {
+ 	preempt_disable();
  	if (pgtable_cache_size < pgt_cache_water[1]) {
  		*(unsigned long *)pte = (unsigned long) pte_quicklist;
  		pte_quicklist = (unsigned long *) pte;
  		pgtable_cache_size++;
  	} else
  		pte_free_slow(pte);
+ 	preempt_enable();
  }
  
  #define pte_free(pte)		pte_free_fast(pte)
*** ./include/asm-i386/i387.h	Thu Nov 22 14:48:58 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-i386/i387.h	Tue May 14 18:09:08 2002
***************
*** 12,17 ****
--- 12,18 ----
  #define __ASM_I386_I387_H
  
  #include <linux/sched.h>
+ #include <linux/spinlock.h>
  #include <asm/processor.h>
  #include <asm/sigcontext.h>
  #include <asm/user.h>
***************
*** 24,30 ****
  extern void restore_fpu( struct task_struct *tsk );
  
  extern void kernel_fpu_begin(void);
! #define kernel_fpu_end() stts()
  
  
  #define unlazy_fpu( tsk ) do { \
--- 25,31 ----
  extern void restore_fpu( struct task_struct *tsk );
  
  extern void kernel_fpu_begin(void);
! #define kernel_fpu_end() do { stts(); preempt_enable(); } while(0)
  
  
  #define unlazy_fpu( tsk ) do { \
*** ./include/asm-arm/dma.h	Sun Aug 12 14:14:00 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-arm/dma.h	Tue May 14 15:09:18 2002
***************
*** 5,10 ****
--- 5,11 ----
  
  #include <linux/config.h>
  #include <linux/spinlock.h>
+ #include <linux/sched.h>
  #include <asm/system.h>
  #include <asm/memory.h>
  #include <asm/scatterlist.h>
*** ./include/asm-arm/hardirq.h	Thu Oct 11 12:04:57 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-arm/hardirq.h	Tue May 14 15:09:18 2002
***************
*** 34,39 ****
--- 34,40 ----
  #define irq_exit(cpu,irq)	(local_irq_count(cpu)--)
  
  #define synchronize_irq()	do { } while (0)
+ #define release_irqlock(cpu)	do { } while (0)
  
  #else
  #error SMP not supported
*** ./include/asm-arm/smplock.h	Sun Aug 12 14:14:00 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-arm/smplock.h	Tue May 14 15:09:18 2002
***************
*** 3,14 ****
--- 3,19 ----
   *
   * Default SMP lock implementation
   */
+ #include <linux/config.h>
  #include <linux/interrupt.h>
  #include <linux/spinlock.h>
  
  extern spinlock_t kernel_flag;
  
+ #ifdef CONFIG_PREEMPT
+ #define kernel_locked()		preempt_get_count()
+ #else
  #define kernel_locked()		spin_is_locked(&kernel_flag)
+ #endif
  
  /*
   * Release global kernel lock and global interrupt lock
***************
*** 40,47 ****
--- 45,58 ----
   */
  static inline void lock_kernel(void)
  {
+ #ifdef CONFIG_PREEMPT
+ 	if (current->lock_depth == -1)
+ 		spin_lock(&kernel_flag);
+ 	++current->lock_depth;
+ #else
  	if (!++current->lock_depth)
  		spin_lock(&kernel_flag);
+ #endif
  }
  
  static inline void unlock_kernel(void)
*** ./include/asm-arm/softirq.h	Sat Sep  8 15:02:31 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-arm/softirq.h	Tue May 14 15:09:18 2002
***************
*** 5,24 ****
  #include <asm/hardirq.h>
  
  #define __cpu_bh_enable(cpu) \
! 		do { barrier(); local_bh_count(cpu)--; } while (0)
  #define cpu_bh_disable(cpu) \
! 		do { local_bh_count(cpu)++; barrier(); } while (0)
  
  #define local_bh_disable()	cpu_bh_disable(smp_processor_id())
  #define __local_bh_enable()	__cpu_bh_enable(smp_processor_id())
  
  #define in_softirq()		(local_bh_count(smp_processor_id()) != 0)
  
! #define local_bh_enable()						\
  do {									\
  	unsigned int *ptr = &local_bh_count(smp_processor_id());	\
  	if (!--*ptr && ptr[-2])						\
  		__asm__("bl%? __do_softirq": : : "lr");/* out of line */\
  } while (0)
  
  #endif	/* __ASM_SOFTIRQ_H */
--- 5,26 ----
  #include <asm/hardirq.h>
  
  #define __cpu_bh_enable(cpu) \
! 		do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0)
  #define cpu_bh_disable(cpu) \
! 		do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0)
  
  #define local_bh_disable()	cpu_bh_disable(smp_processor_id())
  #define __local_bh_enable()	__cpu_bh_enable(smp_processor_id())
  
  #define in_softirq()		(local_bh_count(smp_processor_id()) != 0)
  
! #define _local_bh_enable()						\
  do {									\
  	unsigned int *ptr = &local_bh_count(smp_processor_id());	\
  	if (!--*ptr && ptr[-2])						\
  		__asm__("bl%? __do_softirq": : : "lr");/* out of line */\
  } while (0)
+ 
+ #define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0)
  
  #endif	/* __ASM_SOFTIRQ_H */
*** ./include/asm-arm/pgalloc.h	Sun Aug 12 14:14:00 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-arm/pgalloc.h	Tue May 14 15:09:18 2002
***************
*** 57,96 ****
--- 57,104 ----
  {
  	unsigned long *ret;
  
+ 	preempt_disable();
  	if ((ret = pgd_quicklist) != NULL) {
  		pgd_quicklist = (unsigned long *)__pgd_next(ret);
  		ret[1] = ret[2];
  		clean_dcache_entry(ret + 1);
  		pgtable_cache_size--;
  	}
+ 	preempt_enable();
  	return (pgd_t *)ret;
  }
  
  static inline void free_pgd_fast(pgd_t *pgd)
  {
+ 	preempt_disable();
  	__pgd_next(pgd) = (unsigned long) pgd_quicklist;
  	pgd_quicklist = (unsigned long *) pgd;
  	pgtable_cache_size++;
+ 	preempt_enable();
  }
  
  static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address)
  {
  	unsigned long *ret;
  
+ 	preempt_disable();
  	if((ret = pte_quicklist) != NULL) {
  		pte_quicklist = (unsigned long *)__pte_next(ret);
  		ret[0] = 0;
  		clean_dcache_entry(ret);
  		pgtable_cache_size--;
  	}
+ 	preempt_enable();
  	return (pte_t *)ret;
  }
  
  static inline void free_pte_fast(pte_t *pte)
  {
+ 	preempt_disable();
  	__pte_next(pte) = (unsigned long) pte_quicklist;
  	pte_quicklist = (unsigned long *) pte;
  	pgtable_cache_size++;
+ 	preempt_enable();
  }
  
  #else	/* CONFIG_NO_PGT_CACHE */
*** ./include/asm-sh/hardirq.h	Sat Sep  8 15:29:09 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-sh/hardirq.h	Tue May 14 15:09:18 2002
***************
*** 34,39 ****
--- 34,41 ----
  
  #define synchronize_irq()	barrier()
  
+ #define release_irqlock(cpu)	do { } while (0)
+ 
  #else
  
  #error Super-H SMP is not available
*** ./include/asm-sh/smplock.h	Sat Sep  8 15:29:09 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-sh/smplock.h	Tue May 14 15:09:18 2002
***************
*** 9,23 ****
  
  #include <linux/config.h>
  
! #ifndef CONFIG_SMP
! 
  #define lock_kernel()				do { } while(0)
  #define unlock_kernel()				do { } while(0)
! #define release_kernel_lock(task, cpu, depth)	((depth) = 1)
! #define reacquire_kernel_lock(task, cpu, depth)	do { } while(0)
  
  #else
! #error "We do not support SMP on SH"
! #endif /* CONFIG_SMP */
  
  #endif /* __ASM_SH_SMPLOCK_H */
--- 9,96 ----
  
  #include <linux/config.h>
  
! #if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT)
! /*
!  * Should never happen, since linux/smp_lock.h catches this case;
!  * but in case this file is included directly with neither SMP nor
!  * PREEMPT configuration, provide the same dummies as linux/smp_lock.h
!  */
  #define lock_kernel()				do { } while(0)
  #define unlock_kernel()				do { } while(0)
! #define release_kernel_lock(task, cpu)		do { } while(0)
! #define reacquire_kernel_lock(task)		do { } while(0)
! #define kernel_locked()		1
! 
! #else /* CONFIG_SMP || CONFIG_PREEMPT */
! 
! #if CONFIG_SMP
! #error "We do not support SMP on SH yet"
! #endif
! /*
!  * Default SMP lock implementation (i.e. the i386 version)
!  */
! 
! #include <linux/interrupt.h>
! #include <linux/spinlock.h>
! 
! extern spinlock_t kernel_flag;
! #define lock_bkl() spin_lock(&kernel_flag)
! #define unlock_bkl() spin_unlock(&kernel_flag)
  
+ #ifdef CONFIG_SMP
+ #define kernel_locked()		spin_is_locked(&kernel_flag)
+ #elif  CONFIG_PREEMPT
+ #define kernel_locked()		preempt_get_count()
+ #else  /* neither */
+ #define kernel_locked()		1
+ #endif
+ 
+ /*
+  * Release global kernel lock and global interrupt lock
+  */
+ #define release_kernel_lock(task, cpu) \
+ do { \
+ 	if (task->lock_depth >= 0) \
+ 		spin_unlock(&kernel_flag); \
+ 	release_irqlock(cpu); \
+ 	__sti(); \
+ } while (0)
+ 
+ /*
+  * Re-acquire the kernel lock
+  */
+ #define reacquire_kernel_lock(task) \
+ do { \
+ 	if (task->lock_depth >= 0) \
+ 		spin_lock(&kernel_flag); \
+ } while (0)
+ 
+ /*
+  * Getting the big kernel lock.
+  *
+  * This cannot happen asynchronously,
+  * so we only need to worry about other
+  * CPU's.
+  */
+ static __inline__ void lock_kernel(void)
+ {
+ #ifdef CONFIG_PREEMPT
+ 	if (current->lock_depth == -1)
+ 		spin_lock(&kernel_flag);
+ 	++current->lock_depth;
  #else
! 	if (!++current->lock_depth)
! 		spin_lock(&kernel_flag);
! #endif
! }
! 
! static __inline__ void unlock_kernel(void)
! {
! 	if (current->lock_depth < 0)
! 		BUG();
! 	if (--current->lock_depth < 0)
! 		spin_unlock(&kernel_flag);
! }
! #endif /* CONFIG_SMP || CONFIG_PREEMPT */
  
  #endif /* __ASM_SH_SMPLOCK_H */
*** ./include/asm-sh/softirq.h	Sat Sep  8 15:29:09 2001
--- ../linux-2.4.19-pre8-ac3p/./include/asm-sh/softirq.h	Tue May 14 15:09:18 2002
***************
*** 6,11 ****
--- 6,12 ----
  
  #define local_bh_disable()			\
  do {						\
+ 	preempt_disable();			\
  	local_bh_count(smp_processor_id())++;	\
  	barrier();				\
  } while (0)
***************
*** 14,19 ****
--- 15,21 ----
  do {						\
  	barrier();				\
  	local_bh_count(smp_processor_id())--;	\
+ 	preempt_enable();			\
  } while (0)
  
  #define local_bh_enable()				\
***************
*** 23,28 ****
--- 25,31 ----
  	    && softirq_pending(smp_processor_id())) {	\
  		do_softirq();				\
  	}						\
+ 	preempt_enable();				\
  } while (0)
  
  #define in_softirq() (local_bh_count(smp_processor_id()) != 0)
*** ./net/socket.c	Tue May 14 14:59:20 2002
--- ../linux-2.4.19-pre8-ac3p/./net/socket.c	Tue May 14 15:09:18 2002
***************
*** 132,138 ****
  
  static struct net_proto_family *net_families[NPROTO];
  
! #ifdef CONFIG_SMP
  static atomic_t net_family_lockct = ATOMIC_INIT(0);
  static spinlock_t net_family_lock = SPIN_LOCK_UNLOCKED;
  
--- 132,138 ----
  
  static struct net_proto_family *net_families[NPROTO];
  
! #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
  static atomic_t net_family_lockct = ATOMIC_INIT(0);
  static spinlock_t net_family_lock = SPIN_LOCK_UNLOCKED;
  
*** ./net/sunrpc/pmap_clnt.c	Tue May 14 14:56:02 2002
--- ../linux-2.4.19-pre8-ac3p/./net/sunrpc/pmap_clnt.c	Tue May 14 15:09:18 2002
***************
*** 12,17 ****
--- 12,18 ----
  #include <linux/config.h>
  #include <linux/types.h>
  #include <linux/socket.h>
+ #include <linux/sched.h>
  #include <linux/kernel.h>
  #include <linux/errno.h>
  #include <linux/uio.h>
*** ./MAINTAINERS	Tue May 14 14:59:18 2002
--- ../linux-2.4.19-pre8-ac3p/./MAINTAINERS	Tue May 14 15:09:17 2002
***************
*** 1317,1322 ****
--- 1317,1330 ----
  M:	mostrows@styx.uwaterloo.ca
  S:	Maintained
  
+ PREEMPTIBLE KERNEL
+ P:	Robert M. Love
+ M:	rml@tech9.net
+ L:	linux-kernel@vger.kernel.org
+ L:	kpreempt-tech@lists.sourceforge.net
+ W:	http://tech9.net/rml/linux
+ S:	Supported
+ 
  PROMISE DC4030 CACHING DISK CONTROLLER DRIVER
  P:	Peter Denison
  M:	promise@pnd-pc.demon.co.uk
*** ./drivers/sound/sound_core.c	Sun Sep 30 15:26:08 2001
--- ../linux-2.4.19-pre8-ac3p/./drivers/sound/sound_core.c	Tue May 14 15:09:17 2002
***************
*** 37,42 ****
--- 37,43 ----
  #include <linux/config.h>
  #include <linux/module.h>
  #include <linux/init.h>
+ #include <linux/sched.h>
  #include <linux/slab.h>
  #include <linux/types.h>
  #include <linux/kernel.h>
*** ./drivers/ieee1394/csr.c	Tue May 14 14:55:42 2002
--- ../linux-2.4.19-pre8-ac3p/./drivers/ieee1394/csr.c	Tue May 14 15:09:17 2002
***************
*** 10,15 ****
--- 10,16 ----
   */
  
  #include <linux/string.h>
+ #include <linux/sched.h>
  
  #include "ieee1394_types.h"
  #include "hosts.h"
*** ./arch/i386/config.in	Tue May 14 14:59:18 2002
--- ../linux-2.4.19-pre8-ac3p/./arch/i386/config.in	Tue May 14 15:09:17 2002
***************
*** 199,204 ****
--- 199,205 ----
  bool 'Math emulation' CONFIG_MATH_EMULATION
  bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
  bool 'Symmetric multi-processing support' CONFIG_SMP
+ bool 'Preemptible Kernel' CONFIG_PREEMPT
  if [ "$CONFIG_SMP" != "y" ]; then
     bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC
     dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC
***************
*** 212,220 ****
     bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
  fi
  
! if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
!    define_bool CONFIG_HAVE_DEC_LOCK y
  fi
  endmenu
  
  mainmenu_option next_comment
--- 213,224 ----
     bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
  fi
  
! if [ "$CONFIG_SMP" = "y" -o "$CONFIG_PREEMPT" = "y" ]; then
!    if [ "$CONFIG_X86_CMPXCHG" = "y" ]; then
!       define_bool CONFIG_HAVE_DEC_LOCK y
!    fi
  fi
+ 
  endmenu
  
  mainmenu_option next_comment
*** ./arch/i386/lib/dec_and_lock.c	Fri Jul  7 21:20:16 2000
--- ../linux-2.4.19-pre8-ac3p/./arch/i386/lib/dec_and_lock.c	Tue May 14 15:09:17 2002
***************
*** 8,13 ****
--- 8,14 ----
   */
  
  #include <linux/spinlock.h>
+ #include <linux/sched.h>
  #include <asm/atomic.h>
  
  int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
*** ./arch/i386/kernel/entry.S	Tue May 14 14:59:18 2002
--- ../linux-2.4.19-pre8-ac3p/./arch/i386/kernel/entry.S	Tue May 14 15:09:17 2002
***************
*** 71,77 ****
   * these are offsets into the task-struct.
   */
  state		=  0
! flags		=  4
  sigpending	=  8
  addr_limit	= 12
  exec_domain	= 16
--- 71,77 ----
   * these are offsets into the task-struct.
   */
  state		=  0
! preempt_count	=  4
  sigpending	=  8
  addr_limit	= 12
  exec_domain	= 16
***************
*** 79,86 ****
--- 79,106 ----
  tsk_ptrace	= 24
  cpu		= 32
  
+ /* These are offsets into the irq_stat structure
+  * There is one per cpu and it is aligned to 32
+  * byte boundary (we put that here as a shift count)
+  */
+ irq_array_shift                 = CONFIG_X86_L1_CACHE_SHIFT
+ 
+ irq_stat_local_irq_count        = 4
+ irq_stat_local_bh_count         = 8
+ 
  ENOSYS = 38
  
+ #ifdef CONFIG_SMP
+ #define GET_CPU_INDX	movl cpu(%ebx),%eax;  \
+                         shll $irq_array_shift,%eax
+ #define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx); \
+                              GET_CPU_INDX
+ #define CPU_INDX (,%eax)
+ #else
+ #define GET_CPU_INDX
+ #define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx)
+ #define CPU_INDX
+ #endif
  
  #define SAVE_ALL \
  	cld; \
***************
*** 176,182 ****
  
  
  ENTRY(ret_from_fork)
! #if CONFIG_SMP
  	pushl %ebx
  	call SYMBOL_NAME(schedule_tail)
  	addl $4, %esp
--- 196,202 ----
  
  
  ENTRY(ret_from_fork)
! #if CONFIG_SMP || CONFIG_PREEMPT
  	pushl %ebx
  	call SYMBOL_NAME(schedule_tail)
  	addl $4, %esp
***************
*** 249,260 ****
--- 269,298 ----
  	ALIGN
  ENTRY(ret_from_intr)
  	GET_CURRENT(%ebx)
+ #ifdef CONFIG_PREEMPT
+ 	cli
+ 	decl preempt_count(%ebx)
+ #endif
  ret_from_exception:
  	movl EFLAGS(%esp),%eax		# mix EFLAGS and CS
  	movb CS(%esp),%al
  	testl $(VM_MASK | 3),%eax	# return to VM86 mode or non-supervisor?
  	jne ret_from_sys_call
+ #ifdef CONFIG_PREEMPT
+ 	cmpl $0,preempt_count(%ebx)
+ 	jnz restore_all
+ 	cmpl $0,need_resched(%ebx)
+ 	jz restore_all
+ 	movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx
+ 	addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx
+ 	jnz restore_all
+ 	incl preempt_count(%ebx)
+ 	sti
+ 	call SYMBOL_NAME(preempt_schedule)
+ 	jmp ret_from_intr
+ #else
  	jmp restore_all
+ #endif
  
  	ALIGN
  reschedule:
***************
*** 291,296 ****
--- 329,337 ----
  	GET_CURRENT(%ebx)
  	call *%edi
  	addl $8,%esp
+ #ifdef CONFIG_PREEMPT
+ 	cli
+ #endif
  	jmp ret_from_exception
  
  ENTRY(coprocessor_error)
***************
*** 310,321 ****
--- 351,368 ----
  	movl %cr0,%eax
  	testl $0x4,%eax			# EM (math emulation bit)
  	jne device_not_available_emulate
+ #ifdef CONFIG_PREEMPT
+ 	cli
+ #endif
  	call SYMBOL_NAME(math_state_restore)
  	jmp ret_from_exception
  device_not_available_emulate:
  	pushl $0		# temporary storage for ORIG_EIP
  	call  SYMBOL_NAME(math_emulate)
  	addl $4,%esp
+ #ifdef CONFIG_PREEMPT
+ 	cli
+ #endif
  	jmp ret_from_exception
  
  ENTRY(debug)
*** ./arch/i386/kernel/traps.c	Tue May 14 14:59:18 2002
--- ../linux-2.4.19-pre8-ac3p/./arch/i386/kernel/traps.c	Tue May 14 15:09:17 2002
***************
*** 756,761 ****
--- 756,763 ----
   *
   * Careful.. There are problems with IBM-designed IRQ13 behaviour.
   * Don't touch unless you *really* know how it works.
+  *
+  * Must be called with kernel preemption disabled.
   */
  asmlinkage void math_state_restore(struct pt_regs regs)
  {
*** ./arch/i386/kernel/i387.c	Fri Feb 23 13:09:08 2001
--- ../linux-2.4.19-pre8-ac3p/./arch/i386/kernel/i387.c	Tue May 14 15:09:17 2002
***************
*** 10,15 ****
--- 10,16 ----
  
  #include <linux/config.h>
  #include <linux/sched.h>
+ #include <linux/spinlock.h>
  #include <asm/processor.h>
  #include <asm/i387.h>
  #include <asm/math_emu.h>
***************
*** 65,70 ****
--- 66,73 ----
  {
  	struct task_struct *tsk = current;
  
+ 	preempt_disable();
+ 	
  	if (tsk->flags & PF_USEDFPU) {
  		__save_init_fpu(tsk);
  		return;
*** ./arch/arm/config.in	Tue May 14 14:59:18 2002
--- ../linux-2.4.19-pre8-ac3p/./arch/arm/config.in	Tue May 14 15:09:17 2002
***************
*** 552,557 ****
--- 552,558 ----
  if [ "$CONFIG_ISDN" != "n" ]; then
     source drivers/isdn/Config.in
  fi
+ dep_bool 'Preemptible Kernel' CONFIG_PREEMPT $CONFIG_CPU_32
  endmenu
  
  #
*** ./arch/arm/kernel/entry-armv.S	Tue May 14 14:55:40 2002
--- ../linux-2.4.19-pre8-ac3p/./arch/arm/kernel/entry-armv.S	Tue May 14 15:09:17 2002
***************
*** 697,702 ****
--- 697,708 ----
  		add	r4, sp, #S_SP
  		mov	r6, lr
  		stmia	r4, {r5, r6, r7, r8, r9}	@ save sp_SVC, lr_SVC, pc, cpsr, old_ro
+ #ifdef CONFIG_PREEMPT
+ 		get_current_task r9
+ 		ldr	r8, [r9, #TSK_PREEMPT]
+ 		add	r8, r8, #1
+ 		str	r8, [r9, #TSK_PREEMPT]
+ #endif
  1:		get_irqnr_and_base r0, r6, r5, lr
  		movne	r1, sp
  		@
***************
*** 704,709 ****
--- 710,734 ----
  		@
  		adrsvc	ne, lr, 1b
  		bne	do_IRQ
+ #ifdef CONFIG_PREEMPT
+ 2:		ldr	r8, [r9, #TSK_PREEMPT]
+ 		subs	r8, r8, #1
+ 		bne	3f
+ 		ldr	r7, [r9, #TSK_NEED_RESCHED]
+ 		teq	r7, #0
+ 		beq	3f
+ 		ldr	r6, .LCirqstat
+ 		ldr	r0, [r6, #IRQSTAT_BH_COUNT]
+ 		teq	r0, #0
+ 		bne	3f
+ 		mov	r0, #MODE_SVC
+ 		msr	cpsr_c, r0		@ enable interrupts
+ 		bl	SYMBOL_NAME(preempt_schedule)
+ 		mov	r0, #I_BIT | MODE_SVC
+ 		msr	cpsr_c, r0              @ disable interrupts
+ 		b	2b
+ 3:		str	r8, [r9, #TSK_PREEMPT]
+ #endif
  		ldr	r0, [sp, #S_PSR]		@ irqs are already disabled
  		msr	spsr, r0
  		ldmia	sp, {r0 - pc}^			@ load r0 - pc, cpsr
***************
*** 761,766 ****
--- 786,794 ----
  .LCprocfns:	.word	SYMBOL_NAME(processor)
  #endif
  .LCfp:		.word	SYMBOL_NAME(fp_enter)
+ #ifdef CONFIG_PREEMPT
+ .LCirqstat:	.word	SYMBOL_NAME(irq_stat)
+ #endif
  
  		irq_prio_table
  
***************
*** 801,806 ****
--- 829,840 ----
  		stmdb	r8, {sp, lr}^
  		alignment_trap r4, r7, __temp_irq
  		zero_fp
+ 		get_current_task tsk
+ #ifdef CONFIG_PREEMPT
+ 		ldr	r0, [tsk, #TSK_PREEMPT]
+ 		add	r0, r0, #1
+ 		str	r0, [tsk, #TSK_PREEMPT]
+ #endif
  1:		get_irqnr_and_base r0, r6, r5, lr
  		movne	r1, sp
  		adrsvc	ne, lr, 1b
***************
*** 808,815 ****
  		@ routine called with r0 = irq number, r1 = struct pt_regs *
  		@
  		bne	do_IRQ
  		mov	why, #0
- 		get_current_task tsk
  		b	ret_to_user
  
  		.align	5
--- 842,853 ----
  		@ routine called with r0 = irq number, r1 = struct pt_regs *
  		@
  		bne	do_IRQ
+ #ifdef CONFIG_PREEMPT
+ 		ldr	r0, [tsk, #TSK_PREEMPT]
+ 		sub	r0, r0, #1
+ 		str	r0, [tsk, #TSK_PREEMPT]
+ #endif
  		mov	why, #0
  		b	ret_to_user
  
  		.align	5
*** ./arch/arm/tools/getconstants.c	Thu Oct 11 12:04:57 2001
--- ../linux-2.4.19-pre8-ac3p/./arch/arm/tools/getconstants.c	Tue May 14 15:09:17 2002
***************
*** 13,18 ****
--- 13,19 ----
  
  #include <asm/pgtable.h>
  #include <asm/uaccess.h>
+ #include <asm/hardirq.h>
  
  /*
   * Make sure that the compiler and target are compatible.
***************
*** 38,43 ****
--- 39,49 ----
  
  DEFN("TSS_SAVE",		OFF_TSK(thread.save));
  DEFN("TSS_FPESAVE",		OFF_TSK(thread.fpstate.soft.save));
+ 
+ #ifdef CONFIG_PREEMPT
+ DEFN("TSK_PREEMPT",		OFF_TSK(preempt_count));
+ DEFN("IRQSTAT_BH_COUNT",	(unsigned long)&(((irq_cpustat_t *)0)->__local_bh_count));
+ #endif
  
  #ifdef CONFIG_CPU_32
  DEFN("TSS_DOMAIN",		OFF_TSK(thread.domain));
*** ./arch/sh/config.in	Tue May 14 14:59:18 2002
--- ../linux-2.4.19-pre8-ac3p/./arch/sh/config.in	Tue May 14 15:09:17 2002
***************
*** 124,129 ****
--- 124,130 ----
     hex 'Physical memory start address' CONFIG_MEMORY_START 08000000
     hex 'Physical memory size' CONFIG_MEMORY_SIZE 00400000
  fi
+ bool 'Preemptible Kernel' CONFIG_PREEMPT
  endmenu
  
  if [ "$CONFIG_SH_HP690" = "y" ]; then
*** ./arch/sh/kernel/entry.S	Tue May 14 14:55:42 2002
--- ../linux-2.4.19-pre8-ac3p/./arch/sh/kernel/entry.S	Tue May 14 15:09:17 2002
***************
*** 60,69 ****
  /*
   * These are offsets into the task-struct.
   */
! flags		=  4
  sigpending	=  8
  need_resched	= 20
  tsk_ptrace	= 24
  
  PT_TRACESYS  = 0x00000002
  PF_USEDFPU   = 0x00100000
--- 60,77 ----
  /*
   * These are offsets into the task-struct.
   */
! preempt_count	=  4
  sigpending	=  8
  need_resched	= 20
  tsk_ptrace	= 24
+ flags		= 84
+ 
+ /*
+  * These offsets are into irq_stat.
+  * (Find irq_cpustat_t in asm-sh/hardirq.h)
+  */
+ local_irq_count =  8
+ local_bh_count  = 12
  
  PT_TRACESYS  = 0x00000002
  PF_USEDFPU   = 0x00100000
***************
*** 143,149 ****
  	mov.l	__INV_IMASK, r11;	\
  	stc	sr, r10;		\
  	and	r11, r10;		\
! 	stc	k_g_imask, r11;	\
  	or	r11, r10;		\
  	ldc	r10, sr
  
--- 151,157 ----
  	mov.l	__INV_IMASK, r11;	\
  	stc	sr, r10;		\
  	and	r11, r10;		\
! 	stc	k_g_imask, r11;		\
  	or	r11, r10;		\
  	ldc	r10, sr
  
***************
*** 304,311 ****
  	mov.l	@(tsk_ptrace,r0), r0	! Is current PTRACE_SYSCALL'd?
  	mov	#PT_TRACESYS, r1
  	tst	r1, r0
! 	bt	ret_from_syscall
! 	bra	syscall_ret_trace
  	 nop	 
  
  	.align	2
--- 312,319 ----
  	mov.l	@(tsk_ptrace,r0), r0	! Is current PTRACE_SYSCALL'd?
  	mov	#PT_TRACESYS, r1
  	tst	r1, r0
! 	bf	syscall_ret_trace
! 	bra	ret_from_syscall
  	 nop	 
  
  	.align	2
***************
*** 505,512 ****
  	.long	syscall_ret_trace
  __syscall_ret:
  	.long	syscall_ret
- __INV_IMASK:
- 	.long	0xffffff0f	! ~(IMASK)
  
  
  	.align	2
--- 513,518 ----
***************
*** 518,524 ****
--- 524,607 ----
  	.align	2
  1:	.long	SYMBOL_NAME(schedule)
  
+ #ifdef CONFIG_PREEMPT	
+ 	!
+ 	! Returning from interrupt during kernel mode: check if
+ 	! preempt_schedule should be called. If need_resched flag
+ 	! is set, preempt_count is zero, and we're not currently
+ 	! in an interrupt handler (local irq or bottom half) then
+ 	! call preempt_schedule. 
+ 	!
+ 	! Increment preempt_count to prevent a nested interrupt
+ 	! from reentering preempt_schedule, then decrement after
+ 	! and drop through to regular interrupt return which will
+ 	! jump back and check again in case such an interrupt did
+ 	! come in (and didn't preempt due to preempt_count).
+ 	!
+ 	! NOTE:	because we just checked that preempt_count was
+ 	! zero before getting to the call, can't we use immediate
+ 	! values (1 and 0) rather than inc/dec? Also, rather than
+ 	! drop through to ret_from_irq, we already know this thread
+ 	! is kernel mode, can't we go direct to ret_from_kirq? In
+ 	! fact, with proper interrupt nesting and so forth could
+ 	! the loop simply be on the need_resched w/o checking the
+ 	! other stuff again? Optimize later...
+ 	!
+ 	.align	2
+ ret_from_kirq:
+ 	! Nonzero preempt_count prevents scheduling
+ 	stc	k_current, r1
+ 	mov.l	@(preempt_count,r1), r0
+ 	cmp/eq	#0, r0
+ 	bf	restore_all
+ 	! Zero need_resched prevents scheduling
+ 	mov.l	@(need_resched,r1), r0
+ 	cmp/eq	#0, r0
+ 	bt	restore_all
+ 	! If in_interrupt(), don't schedule
+ 	mov.l	__irq_stat, r1
+ 	mov.l	@(local_irq_count,r1), r0
+ 	mov.l	@(local_bh_count,r1), r1
+ 	or	r1, r0
+ 	cmp/eq	#0, r0
+ 	bf	restore_all
+ 	! Allow scheduling using preempt_schedule
+ 	! Adjust preempt_count and SR as needed.
+ 	stc	k_current, r1
+ 	mov.l	@(preempt_count,r1), r0	! Could replace this ...
+ 	add	#1, r0			! ... and this w/mov #1?
+ 	mov.l	r0, @(preempt_count,r1)
+ 	STI()
+ 	mov.l	__preempt_schedule, r0
+ 	jsr	@r0
+ 	 nop	
+ 	/* CLI */
+ 	stc	sr, r0
+ 	or	#0xf0, r0
+ 	ldc	r0, sr
+ 	!
+ 	stc	k_current, r1
+ 	mov.l	@(preempt_count,r1), r0	! Could replace this ...
+ 	add	#-1, r0			! ... and this w/mov #0?
+ 	mov.l	r0, @(preempt_count,r1)
+ 	! Maybe should bra ret_from_kirq, or loop over need_resched?
+ 	! For now, fall through to ret_from_irq again...
+ #endif /* CONFIG_PREEMPT */
+ 	
  ret_from_irq:
+ 	mov	#OFF_SR, r0
+ 	mov.l	@(r0,r15), r0	! get status register
+ 	shll	r0
+ 	shll	r0		! kernel space?
+ #ifndef CONFIG_PREEMPT
+ 	bt	restore_all	! Yes, it's from kernel, go back soon
+ #else /* CONFIG_PREEMPT */
+ 	bt	ret_from_kirq	! From kernel: maybe preempt_schedule
+ #endif /* CONFIG_PREEMPT */
+ 	!
+ 	bra	ret_from_syscall
+ 	 nop
+ 
  ret_from_exception:
  	mov	#OFF_SR, r0
  	mov.l	@(r0,r15), r0	! get status register
***************
*** 564,569 ****
--- 647,659 ----
  	.long	SYMBOL_NAME(do_signal)
  __irq_stat:
  	.long	SYMBOL_NAME(irq_stat)
+ #ifdef CONFIG_PREEMPT
+ __preempt_schedule:
+ 	.long	SYMBOL_NAME(preempt_schedule)
+ #endif /* CONFIG_PREEMPT */	
+ __INV_IMASK:
+ 	.long	0xffffff0f	! ~(IMASK)
+ 
  
  	.align 2
  restore_all:
***************
*** 679,685 ****
  __fpu_prepare_fd:
  	.long	SYMBOL_NAME(fpu_prepare_fd)
  __init_task_flags:
! 	.long	SYMBOL_NAME(init_task_union)+4
  __PF_USEDFPU:
  	.long	PF_USEDFPU
  #endif
--- 769,775 ----
  __fpu_prepare_fd:
  	.long	SYMBOL_NAME(fpu_prepare_fd)
  __init_task_flags:
! 	.long	SYMBOL_NAME(init_task_union)+flags
  __PF_USEDFPU:
  	.long	PF_USEDFPU
  #endif
*** ./arch/sh/kernel/irq.c	Sat Sep  8 15:29:09 2001
--- ../linux-2.4.19-pre8-ac3p/./arch/sh/kernel/irq.c	Tue May 14 15:09:17 2002
***************
*** 229,234 ****
--- 229,242 ----
  	struct irqaction * action;
  	unsigned int status;
  
+ 	/*
+ 	 * At this point we're now about to actually call handlers,
+ 	 * and interrupts might get reenabled during them... bump
+ 	 * preempt_count to prevent any preemption while the handler
+  	 * called here is pending...
+  	 */
+  	preempt_disable();
+ 
  	/* Get IRQ number */
  	asm volatile("stc	r2_bank, %0\n\t"
  		     "shlr2	%0\n\t"
***************
*** 298,305 ****
--- 306,322 ----
  	desc->handler->end(irq);
  	spin_unlock(&desc->lock);
  
+ 
  	if (softirq_pending(cpu))
  		do_softirq();
+ 
+ 	/*
+ 	 * We're done with the handlers, interrupts should be
+ 	 * currently disabled; decrement preempt_count now so
+ 	 * that preemption may be allowed as we return...
+ 	 */
+ 	preempt_enable_no_resched();
+ 
  	return 1;
  }
  
*** ./Documentation/Configure.help	Tue May 14 14:59:18 2002
--- ../linux-2.4.19-pre8-ac3p/./Documentation/Configure.help	Tue May 14 15:09:17 2002
***************
*** 291,296 ****
--- 291,307 ----
    If you have a system with several CPUs, you do not need to say Y
    here: the local APIC will be used automatically.
  
+ Preemptible Kernel
+ CONFIG_PREEMPT
+   This option reduces the latency of the kernel when reacting to
+   real-time or interactive events by allowing a low priority process to
+   be preempted even if it is in kernel mode executing a system call.
+   This allows applications to run more reliably even when the system is
+   under load.
+ 
+   Say Y here if you are building a kernel for a desktop, embedded or
+   real-time system.  Say N if you are unsure.
+ 
  Kernel math emulation
  CONFIG_MATH_EMULATION
    Linux can emulate a math coprocessor (used for floating point

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] iowait statistics
  2002-05-15  1:31 ` Bill Davidsen
@ 2002-05-15  1:41   ` William Lee Irwin III
  2002-05-15 14:39     ` Bill Davidsen
  0 siblings, 1 reply; 28+ messages in thread
From: William Lee Irwin III @ 2002-05-15  1:41 UTC (permalink / raw)
  To: Bill Davidsen; +Cc: Rik van Riel, linux-kernel, linux-mm

On Tue, May 14, 2002 at 09:31:30PM -0400, Bill Davidsen wrote:
> *** ./Makefile	Tue May 14 14:59:18 2002
> --- ../linux-2.4.19-pre8-ac3p/./Makefile	Tue May 14 17:01:49 2002
> ***************
> *** 1,7 ****
>   VERSION = 2
>   PATCHLEVEL = 4
>   SUBLEVEL = 19
> ! EXTRAVERSION = -pre8-ac3
>   
>   KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)

These look like patch rejects, not patches themselves.
Resend?


Cheers,
Bill

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 17:17       ` Denis Vlasenko
@ 2002-05-15 14:03         ` Rik van Riel
  2002-05-15 20:17           ` Denis Vlasenko
  2002-05-15 15:15         ` Bill Davidsen
  1 sibling, 1 reply; 28+ messages in thread
From: Rik van Riel @ 2002-05-15 14:03 UTC (permalink / raw)
  To: Denis Vlasenko; +Cc: William Lee Irwin III, linux-kernel, linux-mm

On Wed, 15 May 2002, Denis Vlasenko wrote:

> I was investigating why top sometimes shows an idle % like
> 9384729374923.43%. It was caused by the idle count in /proc/stat
> sometimes going backward.

Thanks for tracking down this bug.

> It can be fixed for SMP:
> * add a spinlock
> or
> * add per_cpu_idle, account it too at the timer/APIC interrupt,
>   and get rid of the idle % calculation for /proc/stat
>
> As a user, I vote for glitchless statistics even if they
> consume an extra i++ cycle every timer interrupt on every CPU.

Same for me. The last option is probably easiest to implement
and cheapest at run time. The extra "cost" will approach zero
once somebody takes the time to put the per-cpu stats on
per-cpu cache lines, which I'm sure somebody will do once we have
enough per-cpu stats ;)
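
To illustrate what I mean (just a sketch -- the struct layout, the
idle field standing in for per_cpu_idle, the 32 byte line size and
NR_CPUS value are all made up here, this is not the real kstat
layout):

	/* one cache line per CPU, so tick counts never
	 * bounce between CPUs (line size is assumed) */
	#define CACHE_LINE_BYTES 32
	#define NR_CPUS 32

	struct cpu_ticks {
		unsigned int user, nice, system, iowait;
		unsigned int idle;	/* counted, never derived */
	} __attribute__((aligned(CACHE_LINE_BYTES)));

	static struct cpu_ticks cpu_ticks[NR_CPUS];

	/* called once per timer/APIC tick on each CPU;
	 * nice and iowait accounting omitted for brevity */
	void account_tick(int cpu, int user_mode, int was_idle)
	{
		struct cpu_ticks *t = &cpu_ticks[cpu];

		if (was_idle)
			t->idle++;
		else if (user_mode)
			t->user++;
		else
			t->system++;
	}

Because idle is counted directly instead of being derived as
jiffies minus everything else, the value /proc/stat prints can
never go backward, and each CPU only ever dirties its own line.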

cheers,

Rik
-- 
	http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid.  Go buy yourself a real t-shirt"

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] iowait statistics
  2002-05-15  1:41   ` William Lee Irwin III
@ 2002-05-15 14:39     ` Bill Davidsen
  0 siblings, 0 replies; 28+ messages in thread
From: Bill Davidsen @ 2002-05-15 14:39 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Rik van Riel, linux-kernel, linux-mm

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1159 bytes --]

On Tue, 14 May 2002, William Lee Irwin III wrote:

> On Tue, May 14, 2002 at 09:31:30PM -0400, Bill Davidsen wrote:
> > *** ./Makefile	Tue May 14 14:59:18 2002
> > --- ../linux-2.4.19-pre8-ac3p/./Makefile	Tue May 14 17:01:49 2002
> > ***************
> > *** 1,7 ****
> >   VERSION = 2
> >   PATCHLEVEL = 4
> >   SUBLEVEL = 19
> > ! EXTRAVERSION = -pre8-ac3
> >   
> >   KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
> 
> These look like patch rejects, not patches themselves.
> Resend?

Looks like a context diff to me; I only diffed the files which had
changed, because the test machine is really slow (it is NOT named
"glacial" because it's so cool). I did a "make distclean" and then a full
recursive diff, which only took 46 minutes; it's attached. I did a
--dry-run against the pre8-ac3 source and got no error messages, but the
earlier context diff seemed to work as well.

The benefit of a really slow test machine is that improvements are
magnified, so I can quickly see whether a performance patch does
anything ;-)

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.

[-- Attachment #2: Type: TEXT/PLAIN, Size: 69168 bytes --]

diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/CREDITS linux-2.4.19-pre8-ac3p/CREDITS
--- linux-2.4.19-pre8-ac3/CREDITS	Tue May 14 14:59:18 2002
+++ linux-2.4.19-pre8-ac3p/CREDITS	Tue May 14 15:09:17 2002
@@ -996,8 +996,8 @@
 
 N: Nigel Gamble
 E: nigel@nrg.org
-E: nigel@sgi.com
 D: Interrupt-driven printer driver
+D: Preemptible kernel
 S: 120 Alley Way
 S: Mountain View, California 94040
 S: USA
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/Documentation/Configure.help linux-2.4.19-pre8-ac3p/Documentation/Configure.help
--- linux-2.4.19-pre8-ac3/Documentation/Configure.help	Tue May 14 14:59:18 2002
+++ linux-2.4.19-pre8-ac3p/Documentation/Configure.help	Tue May 14 15:09:17 2002
@@ -291,6 +291,17 @@
   If you have a system with several CPUs, you do not need to say Y
   here: the local APIC will be used automatically.
 
+Preemptible Kernel
+CONFIG_PREEMPT
+  This option reduces the latency of the kernel when reacting to
+  real-time or interactive events by allowing a low priority process to
+  be preempted even if it is in kernel mode executing a system call.
+  This allows applications to run more reliably even when the system is
+  under load.
+
+  Say Y here if you are building a kernel for a desktop, embedded or
+  real-time system.  Say N if you are unsure.
+
 Kernel math emulation
 CONFIG_MATH_EMULATION
   Linux can emulate a math coprocessor (used for floating point
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/Documentation/preempt-locking.txt linux-2.4.19-pre8-ac3p/Documentation/preempt-locking.txt
--- linux-2.4.19-pre8-ac3/Documentation/preempt-locking.txt	Wed Dec 31 19:00:00 1969
+++ linux-2.4.19-pre8-ac3p/Documentation/preempt-locking.txt	Tue May 14 15:09:17 2002
@@ -0,0 +1,104 @@
+		  Proper Locking Under a Preemptible Kernel:
+		       Keeping Kernel Code Preempt-Safe
+			  Robert Love <rml@tech9.net>
+			   Last Updated: 22 Jan 2002
+
+
+INTRODUCTION
+
+
+A preemptible kernel creates new locking issues.  The issues are the same as
+those under SMP: concurrency and reentrancy.  Thankfully, the Linux preemptible
+kernel model leverages existing SMP locking mechanisms.  Thus, the kernel
+requires explicit additional locking for very few additional situations.
+
+This document is for all kernel hackers.  Developing code in the kernel
+requires protecting these situations.
+ 
+
+RULE #1: Per-CPU data structures need explicit protection
+
+
+Two similar problems arise. An example code snippet:
+
+	struct this_needs_locking tux[NR_CPUS];
+	tux[smp_processor_id()] = some_value;
+	/* task is preempted here... */
+	something = tux[smp_processor_id()];
+
+First, since the data is per-CPU, it may not have explicit SMP locking, but
+it still requires protection under preemption.  Second, when a preempted task
+is finally rescheduled, the previous value of smp_processor_id may not equal
+the current one.  You must protect these situations by disabling preemption.
+
+
+RULE #2: CPU state must be protected.
+
+
+Under preemption, the state of the CPU must be protected.  This is arch-
+dependent, but includes CPU structures and state not preserved over a context
+switch.  For example, on x86, entering and exiting FPU mode is now a critical
+section that must occur while preemption is disabled.  Think what would happen
+if the kernel is executing a floating-point instruction and is then preempted.
+Remember, the kernel does not save FPU state except for user tasks.  Therefore,
+upon preemption, the FPU registers will be sold to the lowest bidder.  Thus,
+preemption must be disabled around such regions.
+
+Note, some FPU functions are already explicitly preempt safe.  For example,
+kernel_fpu_begin and kernel_fpu_end will disable and enable preemption.
+However, math_state_restore must be called with preemption disabled.
+
+
+RULE #3: Lock acquire and release must be performed by the same task
+
+
+A lock acquired in one task must be released by the same task.  This
+means you can't do oddball things like acquire a lock and go off to
+play while another task releases it.  If you want to do something
+like this, acquire and release the lock in the same code path and
+have the caller wait on an event signalled by the other task.
+
+
+SOLUTION
+
+
+Data protection under preemption is achieved by disabling preemption for the
+duration of the critical region.
+
+preempt_enable()		decrement the preempt counter
+preempt_disable()		increment the preempt counter
+preempt_enable_no_resched()	decrement, but do not immediately preempt
+preempt_get_count()		return the preempt counter
+
+The functions are nestable.  In other words, you can call preempt_disable
+n-times in a code path, and preemption will not be reenabled until the n-th
+call to preempt_enable.  The preempt statements compile away to nothing if
+preemption is not configured.
+
+Note that you do not need to explicitly prevent preemption if you are holding
+any locks or interrupts are disabled, since preemption is implicitly disabled
+in those cases.
+
+Example:
+
+	cpucache_t *cc; /* this is per-CPU */
+	preempt_disable();
+	cc = cc_data(searchp);
+	if (cc && cc->avail) {
+		__free_block(searchp, cc_entry(cc), cc->avail);
+		cc->avail = 0;
+	}
+	preempt_enable();
+	return 0;
+
+Notice how the preemption statements must encompass every reference of the
+critical variables.  Another example:
+
+	int buf[NR_CPUS];
+	set_cpu_val(buf);
+	if (buf[smp_processor_id()] == -1) printk(KERN_INFO "wee!\n");
+	spin_lock(&buf_lock);
+	/* ... */
+
+This code is not preempt-safe, but see how easily we can fix it by simply
+moving the spin_lock up two lines.
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/MAINTAINERS linux-2.4.19-pre8-ac3p/MAINTAINERS
--- linux-2.4.19-pre8-ac3/MAINTAINERS	Tue May 14 14:59:18 2002
+++ linux-2.4.19-pre8-ac3p/MAINTAINERS	Tue May 14 15:09:17 2002
@@ -1317,6 +1317,14 @@
 M:	mostrows@styx.uwaterloo.ca
 S:	Maintained
 
+PREEMPTIBLE KERNEL
+P:	Robert M. Love
+M:	rml@tech9.net
+L:	linux-kernel@vger.kernel.org
+L:	kpreempt-tech@lists.sourceforge.net
+W:	http://tech9.net/rml/linux
+S:	Supported
+
 PROMISE DC4030 CACHING DISK CONTROLLER DRIVER
 P:	Peter Denison
 M:	promise@pnd-pc.demon.co.uk
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/Makefile linux-2.4.19-pre8-ac3p/Makefile
--- linux-2.4.19-pre8-ac3/Makefile	Tue May 14 14:59:18 2002
+++ linux-2.4.19-pre8-ac3p/Makefile	Tue May 14 17:01:49 2002
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 19
-EXTRAVERSION = -pre8-ac3
+EXTRAVERSION = -pre8-ac3.1p+iow
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/arm/config.in linux-2.4.19-pre8-ac3p/arch/arm/config.in
--- linux-2.4.19-pre8-ac3/arch/arm/config.in	Tue May 14 14:59:18 2002
+++ linux-2.4.19-pre8-ac3p/arch/arm/config.in	Tue May 14 15:09:17 2002
@@ -552,6 +552,7 @@
 if [ "$CONFIG_ISDN" != "n" ]; then
    source drivers/isdn/Config.in
 fi
+dep_bool 'Preemptible Kernel' CONFIG_PREEMPT $CONFIG_CPU_32
 endmenu
 
 #
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/arm/kernel/entry-armv.S linux-2.4.19-pre8-ac3p/arch/arm/kernel/entry-armv.S
--- linux-2.4.19-pre8-ac3/arch/arm/kernel/entry-armv.S	Tue May 14 14:55:40 2002
+++ linux-2.4.19-pre8-ac3p/arch/arm/kernel/entry-armv.S	Tue May 14 15:09:17 2002
@@ -697,6 +697,12 @@
 		add	r4, sp, #S_SP
 		mov	r6, lr
 		stmia	r4, {r5, r6, r7, r8, r9}	@ save sp_SVC, lr_SVC, pc, cpsr, old_ro
+#ifdef CONFIG_PREEMPT
+		get_current_task r9
+		ldr	r8, [r9, #TSK_PREEMPT]
+		add	r8, r8, #1
+		str	r8, [r9, #TSK_PREEMPT]
+#endif
 1:		get_irqnr_and_base r0, r6, r5, lr
 		movne	r1, sp
 		@
@@ -704,6 +710,25 @@
 		@
 		adrsvc	ne, lr, 1b
 		bne	do_IRQ
+#ifdef CONFIG_PREEMPT
+2:		ldr	r8, [r9, #TSK_PREEMPT]
+		subs	r8, r8, #1
+		bne	3f
+		ldr	r7, [r9, #TSK_NEED_RESCHED]
+		teq	r7, #0
+		beq	3f
+		ldr	r6, .LCirqstat
+		ldr	r0, [r6, #IRQSTAT_BH_COUNT]
+		teq	r0, #0
+		bne	3f
+		mov	r0, #MODE_SVC
+		msr	cpsr_c, r0		@ enable interrupts
+		bl	SYMBOL_NAME(preempt_schedule)
+		mov	r0, #I_BIT | MODE_SVC
+		msr	cpsr_c, r0              @ disable interrupts
+		b	2b
+3:		str	r8, [r9, #TSK_PREEMPT]
+#endif
 		ldr	r0, [sp, #S_PSR]		@ irqs are already disabled
 		msr	spsr, r0
 		ldmia	sp, {r0 - pc}^			@ load r0 - pc, cpsr
@@ -761,6 +786,9 @@
 .LCprocfns:	.word	SYMBOL_NAME(processor)
 #endif
 .LCfp:		.word	SYMBOL_NAME(fp_enter)
+#ifdef CONFIG_PREEMPT
+.LCirqstat:	.word	SYMBOL_NAME(irq_stat)
+#endif
 
 		irq_prio_table
 
@@ -801,6 +829,12 @@
 		stmdb	r8, {sp, lr}^
 		alignment_trap r4, r7, __temp_irq
 		zero_fp
+		get_current_task tsk
+#ifdef CONFIG_PREEMPT
+		ldr	r0, [tsk, #TSK_PREEMPT]
+		add	r0, r0, #1
+		str	r0, [tsk, #TSK_PREEMPT]
+#endif
 1:		get_irqnr_and_base r0, r6, r5, lr
 		movne	r1, sp
 		adrsvc	ne, lr, 1b
@@ -808,8 +842,12 @@
 		@ routine called with r0 = irq number, r1 = struct pt_regs *
 		@
 		bne	do_IRQ
+#ifdef CONFIG_PREEMPT
+		ldr	r0, [tsk, #TSK_PREEMPT]
+		sub	r0, r0, #1
+		str	r0, [tsk, #TSK_PREEMPT]
+#endif
 		mov	why, #0
-		get_current_task tsk
 		b	ret_to_user
 
 		.align	5
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/arm/tools/getconstants.c linux-2.4.19-pre8-ac3p/arch/arm/tools/getconstants.c
--- linux-2.4.19-pre8-ac3/arch/arm/tools/getconstants.c	Thu Oct 11 12:04:57 2001
+++ linux-2.4.19-pre8-ac3p/arch/arm/tools/getconstants.c	Tue May 14 15:09:17 2002
@@ -13,6 +13,7 @@
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
+#include <asm/hardirq.h>
 
 /*
  * Make sure that the compiler and target are compatible.
@@ -38,6 +39,11 @@
 
 DEFN("TSS_SAVE",		OFF_TSK(thread.save));
 DEFN("TSS_FPESAVE",		OFF_TSK(thread.fpstate.soft.save));
+
+#ifdef CONFIG_PREEMPT
+DEFN("TSK_PREEMPT",		OFF_TSK(preempt_count));
+DEFN("IRQSTAT_BH_COUNT",	(unsigned long)&(((irq_cpustat_t *)0)->__local_bh_count));
+#endif
 
 #ifdef CONFIG_CPU_32
 DEFN("TSS_DOMAIN",		OFF_TSK(thread.domain));
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/i386/config.in linux-2.4.19-pre8-ac3p/arch/i386/config.in
--- linux-2.4.19-pre8-ac3/arch/i386/config.in	Tue May 14 14:59:18 2002
+++ linux-2.4.19-pre8-ac3p/arch/i386/config.in	Tue May 14 15:09:17 2002
@@ -199,6 +199,7 @@
 bool 'Math emulation' CONFIG_MATH_EMULATION
 bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
 bool 'Symmetric multi-processing support' CONFIG_SMP
+bool 'Preemptible Kernel' CONFIG_PREEMPT
 if [ "$CONFIG_SMP" != "y" ]; then
    bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC
    dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC
@@ -212,9 +213,12 @@
    bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
 fi
 
-if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
-   define_bool CONFIG_HAVE_DEC_LOCK y
+if [ "$CONFIG_SMP" = "y" -o "$CONFIG_PREEMPT" = "y" ]; then
+   if [ "$CONFIG_X86_CMPXCHG" = "y" ]; then
+      define_bool CONFIG_HAVE_DEC_LOCK y
+   fi
 fi
+
 endmenu
 
 mainmenu_option next_comment
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/i386/kernel/entry.S linux-2.4.19-pre8-ac3p/arch/i386/kernel/entry.S
--- linux-2.4.19-pre8-ac3/arch/i386/kernel/entry.S	Tue May 14 14:59:18 2002
+++ linux-2.4.19-pre8-ac3p/arch/i386/kernel/entry.S	Tue May 14 15:09:17 2002
@@ -71,7 +71,7 @@
  * these are offsets into the task-struct.
  */
 state		=  0
-flags		=  4
+preempt_count	=  4
 sigpending	=  8
 addr_limit	= 12
 exec_domain	= 16
@@ -79,8 +79,28 @@
 tsk_ptrace	= 24
 cpu		= 32
 
+/* These are offsets into the irq_stat structure
+ * There is one per cpu and it is aligned to 32
+ * byte boundary (we put that here as a shift count)
+ */
+irq_array_shift                 = CONFIG_X86_L1_CACHE_SHIFT
+
+irq_stat_local_irq_count        = 4
+irq_stat_local_bh_count         = 8
+
 ENOSYS = 38
 
+#ifdef CONFIG_SMP
+#define GET_CPU_INDX	movl cpu(%ebx),%eax;  \
+                        shll $irq_array_shift,%eax
+#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx); \
+                             GET_CPU_INDX
+#define CPU_INDX (,%eax)
+#else
+#define GET_CPU_INDX
+#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx)
+#define CPU_INDX
+#endif
 
 #define SAVE_ALL \
 	cld; \
@@ -176,7 +196,7 @@
 
 
 ENTRY(ret_from_fork)
-#if CONFIG_SMP
+#if CONFIG_SMP || CONFIG_PREEMPT
 	pushl %ebx
 	call SYMBOL_NAME(schedule_tail)
 	addl $4, %esp
@@ -249,12 +269,30 @@
 	ALIGN
 ENTRY(ret_from_intr)
 	GET_CURRENT(%ebx)
+#ifdef CONFIG_PREEMPT
+	cli
+	decl preempt_count(%ebx)
+#endif
 ret_from_exception:
 	movl EFLAGS(%esp),%eax		# mix EFLAGS and CS
 	movb CS(%esp),%al
 	testl $(VM_MASK | 3),%eax	# return to VM86 mode or non-supervisor?
 	jne ret_from_sys_call
+#ifdef CONFIG_PREEMPT
+	cmpl $0,preempt_count(%ebx)
+	jnz restore_all
+	cmpl $0,need_resched(%ebx)
+	jz restore_all
+	movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx
+	addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx
+	jnz restore_all
+	incl preempt_count(%ebx)
+	sti
+	call SYMBOL_NAME(preempt_schedule)
+	jmp ret_from_intr
+#else
 	jmp restore_all
+#endif
 
 	ALIGN
 reschedule:
@@ -291,6 +329,9 @@
 	GET_CURRENT(%ebx)
 	call *%edi
 	addl $8,%esp
+#ifdef CONFIG_PREEMPT
+	cli
+#endif
 	jmp ret_from_exception
 
 ENTRY(coprocessor_error)
@@ -310,12 +351,18 @@
 	movl %cr0,%eax
 	testl $0x4,%eax			# EM (math emulation bit)
 	jne device_not_available_emulate
+#ifdef CONFIG_PREEMPT
+	cli
+#endif
 	call SYMBOL_NAME(math_state_restore)
 	jmp ret_from_exception
 device_not_available_emulate:
 	pushl $0		# temporary storage for ORIG_EIP
 	call  SYMBOL_NAME(math_emulate)
 	addl $4,%esp
+#ifdef CONFIG_PREEMPT
+	cli
+#endif
 	jmp ret_from_exception
 
 ENTRY(debug)
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/i386/kernel/i387.c linux-2.4.19-pre8-ac3p/arch/i386/kernel/i387.c
--- linux-2.4.19-pre8-ac3/arch/i386/kernel/i387.c	Fri Feb 23 13:09:08 2001
+++ linux-2.4.19-pre8-ac3p/arch/i386/kernel/i387.c	Tue May 14 15:09:17 2002
@@ -10,6 +10,7 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/math_emu.h>
@@ -65,6 +66,8 @@
 {
 	struct task_struct *tsk = current;
 
+	preempt_disable();
+	
 	if (tsk->flags & PF_USEDFPU) {
 		__save_init_fpu(tsk);
 		return;
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/i386/kernel/traps.c linux-2.4.19-pre8-ac3p/arch/i386/kernel/traps.c
--- linux-2.4.19-pre8-ac3/arch/i386/kernel/traps.c	Tue May 14 14:59:18 2002
+++ linux-2.4.19-pre8-ac3p/arch/i386/kernel/traps.c	Tue May 14 15:09:17 2002
@@ -756,6 +756,8 @@
  *
  * Careful.. There are problems with IBM-designed IRQ13 behaviour.
  * Don't touch unless you *really* know how it works.
+ *
+ * Must be called with kernel preemption disabled.
  */
 asmlinkage void math_state_restore(struct pt_regs regs)
 {
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/i386/lib/dec_and_lock.c linux-2.4.19-pre8-ac3p/arch/i386/lib/dec_and_lock.c
--- linux-2.4.19-pre8-ac3/arch/i386/lib/dec_and_lock.c	Fri Jul  7 21:20:16 2000
+++ linux-2.4.19-pre8-ac3p/arch/i386/lib/dec_and_lock.c	Tue May 14 15:09:17 2002
@@ -8,6 +8,7 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/sched.h>
 #include <asm/atomic.h>
 
 int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/sh/config.in linux-2.4.19-pre8-ac3p/arch/sh/config.in
--- linux-2.4.19-pre8-ac3/arch/sh/config.in	Tue May 14 14:59:18 2002
+++ linux-2.4.19-pre8-ac3p/arch/sh/config.in	Tue May 14 15:09:17 2002
@@ -124,6 +124,7 @@
    hex 'Physical memory start address' CONFIG_MEMORY_START 08000000
    hex 'Physical memory size' CONFIG_MEMORY_SIZE 00400000
 fi
+bool 'Preemptible Kernel' CONFIG_PREEMPT
 endmenu
 
 if [ "$CONFIG_SH_HP690" = "y" ]; then
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/sh/kernel/entry.S linux-2.4.19-pre8-ac3p/arch/sh/kernel/entry.S
--- linux-2.4.19-pre8-ac3/arch/sh/kernel/entry.S	Tue May 14 14:55:42 2002
+++ linux-2.4.19-pre8-ac3p/arch/sh/kernel/entry.S	Tue May 14 15:09:17 2002
@@ -60,10 +60,18 @@
 /*
  * These are offsets into the task-struct.
  */
-flags		=  4
+preempt_count	=  4
 sigpending	=  8
 need_resched	= 20
 tsk_ptrace	= 24
+flags		= 84
+
+/*
+ * These offsets are into irq_stat.
+ * (Find irq_cpustat_t in asm-sh/hardirq.h)
+ */
+local_irq_count =  8
+local_bh_count  = 12
 
 PT_TRACESYS  = 0x00000002
 PF_USEDFPU   = 0x00100000
@@ -143,7 +151,7 @@
 	mov.l	__INV_IMASK, r11;	\
 	stc	sr, r10;		\
 	and	r11, r10;		\
-	stc	k_g_imask, r11;	\
+	stc	k_g_imask, r11;		\
 	or	r11, r10;		\
 	ldc	r10, sr
 
@@ -304,8 +312,8 @@
 	mov.l	@(tsk_ptrace,r0), r0	! Is current PTRACE_SYSCALL'd?
 	mov	#PT_TRACESYS, r1
 	tst	r1, r0
-	bt	ret_from_syscall
-	bra	syscall_ret_trace
+	bf	syscall_ret_trace
+	bra	ret_from_syscall
 	 nop	 
 
 	.align	2
@@ -505,8 +513,6 @@
 	.long	syscall_ret_trace
 __syscall_ret:
 	.long	syscall_ret
-__INV_IMASK:
-	.long	0xffffff0f	! ~(IMASK)
 
 
 	.align	2
@@ -518,7 +524,84 @@
 	.align	2
 1:	.long	SYMBOL_NAME(schedule)
 
+#ifdef CONFIG_PREEMPT	
+	!
+	! Returning from interrupt during kernel mode: check if
+	! preempt_schedule should be called. If need_resched flag
+	! is set, preempt_count is zero, and we're not currently
+	! in an interrupt handler (local irq or bottom half) then
+	! call preempt_schedule. 
+	!
+	! Increment preempt_count to prevent a nested interrupt
+	! from reentering preempt_schedule, then decrement after
+	! and drop through to regular interrupt return which will
+	! jump back and check again in case such an interrupt did
+	! come in (and didn't preempt due to preempt_count).
+	!
+	! NOTE:	because we just checked that preempt_count was
+	! zero before getting to the call, can't we use immediate
+	! values (1 and 0) rather than inc/dec? Also, rather than
+	! drop through to ret_from_irq, we already know this thread
+	! is kernel mode, can't we go direct to ret_from_kirq? In
+	! fact, with proper interrupt nesting and so forth could
+	! the loop simply be on the need_resched w/o checking the
+	! other stuff again? Optimize later...
+	!
+	.align	2
+ret_from_kirq:
+	! Nonzero preempt_count prevents scheduling
+	stc	k_current, r1
+	mov.l	@(preempt_count,r1), r0
+	cmp/eq	#0, r0
+	bf	restore_all
+	! Zero need_resched prevents scheduling
+	mov.l	@(need_resched,r1), r0
+	cmp/eq	#0, r0
+	bt	restore_all
+	! If in_interrupt(), don't schedule
+	mov.l	__irq_stat, r1
+	mov.l	@(local_irq_count,r1), r0
+	mov.l	@(local_bh_count,r1), r1
+	or	r1, r0
+	cmp/eq	#0, r0
+	bf	restore_all
+	! Allow scheduling using preempt_schedule
+	! Adjust preempt_count and SR as needed.
+	stc	k_current, r1
+	mov.l	@(preempt_count,r1), r0	! Could replace this ...
+	add	#1, r0			! ... and this w/mov #1?
+	mov.l	r0, @(preempt_count,r1)
+	STI()
+	mov.l	__preempt_schedule, r0
+	jsr	@r0
+	 nop	
+	/* CLI */
+	stc	sr, r0
+	or	#0xf0, r0
+	ldc	r0, sr
+	!
+	stc	k_current, r1
+	mov.l	@(preempt_count,r1), r0	! Could replace this ...
+	add	#-1, r0			! ... and this w/mov #0?
+	mov.l	r0, @(preempt_count,r1)
+	! Maybe should bra ret_from_kirq, or loop over need_resched?
+	! For now, fall through to ret_from_irq again...
+#endif /* CONFIG_PREEMPT */
+	
 ret_from_irq:
+	mov	#OFF_SR, r0
+	mov.l	@(r0,r15), r0	! get status register
+	shll	r0
+	shll	r0		! kernel space?
+#ifndef CONFIG_PREEMPT
+	bt	restore_all	! Yes, it's from kernel, go back soon
+#else /* CONFIG_PREEMPT */
+	bt	ret_from_kirq	! From kernel: maybe preempt_schedule
+#endif /* CONFIG_PREEMPT */
+	!
+	bra	ret_from_syscall
+	 nop
+
 ret_from_exception:
 	mov	#OFF_SR, r0
 	mov.l	@(r0,r15), r0	! get status register
@@ -564,6 +647,13 @@
 	.long	SYMBOL_NAME(do_signal)
 __irq_stat:
 	.long	SYMBOL_NAME(irq_stat)
+#ifdef CONFIG_PREEMPT
+__preempt_schedule:
+	.long	SYMBOL_NAME(preempt_schedule)
+#endif /* CONFIG_PREEMPT */	
+__INV_IMASK:
+	.long	0xffffff0f	! ~(IMASK)
+
 
 	.align 2
 restore_all:
@@ -679,7 +769,7 @@
 __fpu_prepare_fd:
 	.long	SYMBOL_NAME(fpu_prepare_fd)
 __init_task_flags:
-	.long	SYMBOL_NAME(init_task_union)+4
+	.long	SYMBOL_NAME(init_task_union)+flags
 __PF_USEDFPU:
 	.long	PF_USEDFPU
 #endif
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/arch/sh/kernel/irq.c linux-2.4.19-pre8-ac3p/arch/sh/kernel/irq.c
--- linux-2.4.19-pre8-ac3/arch/sh/kernel/irq.c	Sat Sep  8 15:29:09 2001
+++ linux-2.4.19-pre8-ac3p/arch/sh/kernel/irq.c	Tue May 14 15:09:17 2002
@@ -229,6 +229,14 @@
 	struct irqaction * action;
 	unsigned int status;
 
+	/*
+	 * At this point we're now about to actually call handlers,
+	 * and interrupts might get reenabled during them... bump
+	 * preempt_count to prevent any preemption while the handler
+ 	 * called here is pending...
+ 	 */
+ 	preempt_disable();
+
 	/* Get IRQ number */
 	asm volatile("stc	r2_bank, %0\n\t"
 		     "shlr2	%0\n\t"
@@ -298,8 +306,17 @@
 	desc->handler->end(irq);
 	spin_unlock(&desc->lock);
 
+
 	if (softirq_pending(cpu))
 		do_softirq();
+
+	/*
+	 * We're done with the handlers, interrupts should be
+	 * currently disabled; decrement preempt_count now so
+	 * that preemption may be allowed as we return...
+	 */
+	preempt_enable_no_resched();
+
 	return 1;
 }
 
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/drivers/ieee1394/csr.c linux-2.4.19-pre8-ac3p/drivers/ieee1394/csr.c
--- linux-2.4.19-pre8-ac3/drivers/ieee1394/csr.c	Tue May 14 14:55:42 2002
+++ linux-2.4.19-pre8-ac3p/drivers/ieee1394/csr.c	Tue May 14 15:09:17 2002
@@ -10,6 +10,7 @@
  */
 
 #include <linux/string.h>
+#include <linux/sched.h>
 
 #include "ieee1394_types.h"
 #include "hosts.h"
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/drivers/sound/sound_core.c linux-2.4.19-pre8-ac3p/drivers/sound/sound_core.c
--- linux-2.4.19-pre8-ac3/drivers/sound/sound_core.c	Sun Sep 30 15:26:08 2001
+++ linux-2.4.19-pre8-ac3p/drivers/sound/sound_core.c	Tue May 14 15:09:17 2002
@@ -37,6 +37,7 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/fs/adfs/map.c linux-2.4.19-pre8-ac3p/fs/adfs/map.c
--- linux-2.4.19-pre8-ac3/fs/adfs/map.c	Thu Oct 25 16:53:53 2001
+++ linux-2.4.19-pre8-ac3p/fs/adfs/map.c	Tue May 14 15:09:17 2002
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/adfs_fs.h>
 #include <linux/spinlock.h>
+#include <linux/sched.h>
 
 #include "adfs.h"
 
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/fs/buffer.c linux-2.4.19-pre8-ac3p/fs/buffer.c
--- linux-2.4.19-pre8-ac3/fs/buffer.c	Tue May 14 14:59:19 2002
+++ linux-2.4.19-pre8-ac3p/fs/buffer.c	Tue May 14 16:43:04 2002
@@ -154,11 +154,13 @@
 	get_bh(bh);
 	add_wait_queue(&bh->b_wait, &wait);
 	do {
+		atomic_inc(&nr_iowait_tasks);
 		run_task_queue(&tq_disk);
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (!buffer_locked(bh))
 			break;
 		schedule();
+		atomic_dec(&nr_iowait_tasks);
 	} while (buffer_locked(bh));
 	tsk->state = TASK_RUNNING;
 	remove_wait_queue(&bh->b_wait, &wait);
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/fs/exec.c linux-2.4.19-pre8-ac3p/fs/exec.c
--- linux-2.4.19-pre8-ac3/fs/exec.c	Tue May 14 14:59:19 2002
+++ linux-2.4.19-pre8-ac3p/fs/exec.c	Tue May 14 15:09:17 2002
@@ -427,8 +427,8 @@
 		active_mm = current->active_mm;
 		current->mm = mm;
 		current->active_mm = mm;
-		task_unlock(current);
 		activate_mm(active_mm, mm);
+		task_unlock(current);
 		mm_release();
 		if (old_mm) {
 			if (active_mm != old_mm) BUG();
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/fs/fat/cache.c linux-2.4.19-pre8-ac3p/fs/fat/cache.c
--- linux-2.4.19-pre8-ac3/fs/fat/cache.c	Fri Oct 12 16:48:42 2001
+++ linux-2.4.19-pre8-ac3p/fs/fat/cache.c	Tue May 14 15:09:17 2002
@@ -14,6 +14,7 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/fat_cvf.h>
+#include <linux/sched.h>
 
 #if 0
 #  define PRINTK(x) printk x
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/fs/nls/nls_base.c linux-2.4.19-pre8-ac3p/fs/nls/nls_base.c
--- linux-2.4.19-pre8-ac3/fs/nls/nls_base.c	Tue May 14 14:55:54 2002
+++ linux-2.4.19-pre8-ac3p/fs/nls/nls_base.c	Tue May 14 15:09:18 2002
@@ -18,6 +18,7 @@
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 
 static struct nls_table *tables;
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/fs/proc/proc_misc.c linux-2.4.19-pre8-ac3p/fs/proc/proc_misc.c
--- linux-2.4.19-pre8-ac3/fs/proc/proc_misc.c	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/fs/proc/proc_misc.c	Tue May 14 16:48:00 2002
@@ -273,7 +273,7 @@
 	int i, len;
 	extern unsigned long total_forks;
 	unsigned long jif = jiffies;
-	unsigned int sum = 0, user = 0, nice = 0, system = 0;
+	unsigned int sum = 0, user = 0, nice = 0, system = 0, iowait = 0;
 	int major, disk;
 
 	for (i = 0 ; i < smp_num_cpus; i++) {
@@ -282,23 +282,26 @@
 		user += kstat.per_cpu_user[cpu];
 		nice += kstat.per_cpu_nice[cpu];
 		system += kstat.per_cpu_system[cpu];
+		iowait += kstat.per_cpu_iowait[cpu];
 #if !defined(CONFIG_ARCH_S390)
 		for (j = 0 ; j < NR_IRQS ; j++)
 			sum += kstat.irqs[cpu][j];
 #endif
 	}
 
-	len = sprintf(page, "cpu  %u %u %u %lu\n", user, nice, system,
-		      jif * smp_num_cpus - (user + nice + system));
+	len = sprintf(page, "cpu  %u %u %u %lu %u\n", user, nice, system,
+		      jif * smp_num_cpus - (user + nice + system),
+		      iowait);
 	for (i = 0 ; i < smp_num_cpus; i++)
-		len += sprintf(page + len, "cpu%d %u %u %u %lu\n",
+		len += sprintf(page + len, "cpu%d %u %u %u %lu %u\n",
 			i,
 			kstat.per_cpu_user[cpu_logical_map(i)],
 			kstat.per_cpu_nice[cpu_logical_map(i)],
 			kstat.per_cpu_system[cpu_logical_map(i)],
 			jif - (  kstat.per_cpu_user[cpu_logical_map(i)] \
 				   + kstat.per_cpu_nice[cpu_logical_map(i)] \
-				   + kstat.per_cpu_system[cpu_logical_map(i)]));
+				   + kstat.per_cpu_system[cpu_logical_map(i)]),
+			kstat.per_cpu_iowait[cpu_logical_map(i)]);
 	len += sprintf(page + len,
 		"page %u %u\n"
 		"swap %u %u\n"
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-arm/dma.h linux-2.4.19-pre8-ac3p/include/asm-arm/dma.h
--- linux-2.4.19-pre8-ac3/include/asm-arm/dma.h	Sun Aug 12 14:14:00 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-arm/dma.h	Tue May 14 15:09:18 2002
@@ -5,6 +5,7 @@
 
 #include <linux/config.h>
 #include <linux/spinlock.h>
+#include <linux/sched.h>
 #include <asm/system.h>
 #include <asm/memory.h>
 #include <asm/scatterlist.h>
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-arm/hardirq.h linux-2.4.19-pre8-ac3p/include/asm-arm/hardirq.h
--- linux-2.4.19-pre8-ac3/include/asm-arm/hardirq.h	Thu Oct 11 12:04:57 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-arm/hardirq.h	Tue May 14 15:09:18 2002
@@ -34,6 +34,7 @@
 #define irq_exit(cpu,irq)	(local_irq_count(cpu)--)
 
 #define synchronize_irq()	do { } while (0)
+#define release_irqlock(cpu)	do { } while (0)
 
 #else
 #error SMP not supported
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-arm/pgalloc.h linux-2.4.19-pre8-ac3p/include/asm-arm/pgalloc.h
--- linux-2.4.19-pre8-ac3/include/asm-arm/pgalloc.h	Sun Aug 12 14:14:00 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-arm/pgalloc.h	Tue May 14 15:09:18 2002
@@ -57,40 +57,48 @@
 {
 	unsigned long *ret;
 
+	preempt_disable();
 	if ((ret = pgd_quicklist) != NULL) {
 		pgd_quicklist = (unsigned long *)__pgd_next(ret);
 		ret[1] = ret[2];
 		clean_dcache_entry(ret + 1);
 		pgtable_cache_size--;
 	}
+	preempt_enable();
 	return (pgd_t *)ret;
 }
 
 static inline void free_pgd_fast(pgd_t *pgd)
 {
+	preempt_disable();
 	__pgd_next(pgd) = (unsigned long) pgd_quicklist;
 	pgd_quicklist = (unsigned long *) pgd;
 	pgtable_cache_size++;
+	preempt_enable();
 }
 
 static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address)
 {
 	unsigned long *ret;
 
+	preempt_disable();
 	if((ret = pte_quicklist) != NULL) {
 		pte_quicklist = (unsigned long *)__pte_next(ret);
 		ret[0] = 0;
 		clean_dcache_entry(ret);
 		pgtable_cache_size--;
 	}
+	preempt_enable();
 	return (pte_t *)ret;
 }
 
 static inline void free_pte_fast(pte_t *pte)
 {
+	preempt_disable();
 	__pte_next(pte) = (unsigned long) pte_quicklist;
 	pte_quicklist = (unsigned long *) pte;
 	pgtable_cache_size++;
+	preempt_enable();
 }
 
 #else	/* CONFIG_NO_PGT_CACHE */
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-arm/smplock.h linux-2.4.19-pre8-ac3p/include/asm-arm/smplock.h
--- linux-2.4.19-pre8-ac3/include/asm-arm/smplock.h	Sun Aug 12 14:14:00 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-arm/smplock.h	Tue May 14 15:09:18 2002
@@ -3,12 +3,17 @@
  *
  * Default SMP lock implementation
  */
+#include <linux/config.h>
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 
 extern spinlock_t kernel_flag;
 
+#ifdef CONFIG_PREEMPT
+#define kernel_locked()		preempt_get_count()
+#else
 #define kernel_locked()		spin_is_locked(&kernel_flag)
+#endif
 
 /*
  * Release global kernel lock and global interrupt lock
@@ -40,8 +45,14 @@
  */
 static inline void lock_kernel(void)
 {
+#ifdef CONFIG_PREEMPT
+	if (current->lock_depth == -1)
+		spin_lock(&kernel_flag);
+	++current->lock_depth;
+#else
 	if (!++current->lock_depth)
 		spin_lock(&kernel_flag);
+#endif
 }
 
 static inline void unlock_kernel(void)
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-arm/softirq.h linux-2.4.19-pre8-ac3p/include/asm-arm/softirq.h
--- linux-2.4.19-pre8-ac3/include/asm-arm/softirq.h	Sat Sep  8 15:02:31 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-arm/softirq.h	Tue May 14 15:09:18 2002
@@ -5,20 +5,22 @@
 #include <asm/hardirq.h>
 
 #define __cpu_bh_enable(cpu) \
-		do { barrier(); local_bh_count(cpu)--; } while (0)
+		do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0)
 #define cpu_bh_disable(cpu) \
-		do { local_bh_count(cpu)++; barrier(); } while (0)
+		do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0)
 
 #define local_bh_disable()	cpu_bh_disable(smp_processor_id())
 #define __local_bh_enable()	__cpu_bh_enable(smp_processor_id())
 
 #define in_softirq()		(local_bh_count(smp_processor_id()) != 0)
 
-#define local_bh_enable()						\
+#define _local_bh_enable()						\
 do {									\
 	unsigned int *ptr = &local_bh_count(smp_processor_id());	\
 	if (!--*ptr && ptr[-2])						\
 		__asm__("bl%? __do_softirq": : : "lr");/* out of line */\
 } while (0)
+
+#define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0)
 
 #endif	/* __ASM_SOFTIRQ_H */
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-i386/hardirq.h linux-2.4.19-pre8-ac3p/include/asm-i386/hardirq.h
--- linux-2.4.19-pre8-ac3/include/asm-i386/hardirq.h	Thu Nov 22 14:46:19 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-i386/hardirq.h	Tue May 14 15:20:25 2002
@@ -36,6 +36,8 @@
 
 #define synchronize_irq()	barrier()
 
+#define release_irqlock(cpu)	do { } while (0)
+
 #else
 
 #include <asm/atomic.h>
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-i386/highmem.h linux-2.4.19-pre8-ac3p/include/asm-i386/highmem.h
--- linux-2.4.19-pre8-ac3/include/asm-i386/highmem.h	Tue May 14 14:55:55 2002
+++ linux-2.4.19-pre8-ac3p/include/asm-i386/highmem.h	Tue May 14 15:20:25 2002
@@ -88,6 +88,7 @@
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
+	preempt_disable();
 	if (page < highmem_start_page)
 		return page_address(page);
 
@@ -109,8 +110,10 @@
 	unsigned long vaddr = (unsigned long) kvaddr;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
-	if (vaddr < FIXADDR_START) // FIXME
+	if (vaddr < FIXADDR_START) { // FIXME
+		preempt_enable();
 		return;
+	}
 
 	if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
 		out_of_line_bug();
@@ -122,6 +125,8 @@
 	pte_clear(kmap_pte-idx);
 	__flush_tlb_one(vaddr);
 #endif
+
+	preempt_enable();
 }
 
 #endif /* __KERNEL__ */
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-i386/hw_irq.h linux-2.4.19-pre8-ac3p/include/asm-i386/hw_irq.h
--- linux-2.4.19-pre8-ac3/include/asm-i386/hw_irq.h	Thu Nov 22 14:46:18 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-i386/hw_irq.h	Tue May 14 15:20:25 2002
@@ -95,6 +95,18 @@
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
+#define GET_CURRENT \
+	"movl %esp, %ebx\n\t" \
+	"andl $-8192, %ebx\n\t"
+
+#ifdef CONFIG_PREEMPT
+#define BUMP_LOCK_COUNT \
+	GET_CURRENT \
+	"incl 4(%ebx)\n\t"
+#else
+#define BUMP_LOCK_COUNT
+#endif
+
 #define SAVE_ALL \
 	"cld\n\t" \
 	"pushl %es\n\t" \
@@ -108,14 +120,11 @@
 	"pushl %ebx\n\t" \
 	"movl $" STR(__KERNEL_DS) ",%edx\n\t" \
 	"movl %edx,%ds\n\t" \
-	"movl %edx,%es\n\t"
+	"movl %edx,%es\n\t" \
+	BUMP_LOCK_COUNT
 
 #define IRQ_NAME2(nr) nr##_interrupt(void)
 #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
-
-#define GET_CURRENT \
-	"movl %esp, %ebx\n\t" \
-	"andl $-8192, %ebx\n\t"
 
 /*
  *	SMP has a few special interrupts for IPI messages
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-i386/i387.h linux-2.4.19-pre8-ac3p/include/asm-i386/i387.h
--- linux-2.4.19-pre8-ac3/include/asm-i386/i387.h	Thu Nov 22 14:48:58 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-i386/i387.h	Tue May 14 18:09:08 2002
@@ -12,6 +12,7 @@
 #define __ASM_I386_I387_H
 
 #include <linux/sched.h>
+#include <linux/spinlock.h>
 #include <asm/processor.h>
 #include <asm/sigcontext.h>
 #include <asm/user.h>
@@ -24,7 +25,7 @@
 extern void restore_fpu( struct task_struct *tsk );
 
 extern void kernel_fpu_begin(void);
-#define kernel_fpu_end() stts()
+#define kernel_fpu_end() do { stts(); preempt_enable(); } while(0)
 
 
 #define unlazy_fpu( tsk ) do { \
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-i386/pgalloc.h linux-2.4.19-pre8-ac3p/include/asm-i386/pgalloc.h
--- linux-2.4.19-pre8-ac3/include/asm-i386/pgalloc.h	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/include/asm-i386/pgalloc.h	Tue May 14 15:20:25 2002
@@ -75,20 +75,26 @@
 {
 	unsigned long *ret;
 
+	preempt_disable();
 	if ((ret = pgd_quicklist) != NULL) {
 		pgd_quicklist = (unsigned long *)(*ret);
 		ret[0] = 0;
 		pgtable_cache_size--;
-	} else
+		preempt_enable();
+	} else {
+		preempt_enable();
 		ret = (unsigned long *)get_pgd_slow();
+	}
 	return (pgd_t *)ret;
 }
 
 static inline void free_pgd_fast(pgd_t *pgd)
 {
+	preempt_disable();
 	*(unsigned long *)pgd = (unsigned long) pgd_quicklist;
 	pgd_quicklist = (unsigned long *) pgd;
 	pgtable_cache_size++;
+	preempt_enable();
 }
 
 static inline void free_pgd_slow(pgd_t *pgd)
@@ -119,11 +125,13 @@
 {
 	unsigned long *ret;
 
+	preempt_disable();
 	if ((ret = (unsigned long *)pte_quicklist) != NULL) {
 		pte_quicklist = (unsigned long *)(*ret);
 		ret[0] = ret[1];
 		pgtable_cache_size--;
 	}
+	preempt_enable();
 	return (pte_t *)ret;
 }
 
@@ -135,12 +143,14 @@
 extern int pgt_cache_water[];
 static inline void pte_free_fast(pte_t *pte)
 {
+	preempt_disable();
 	if (pgtable_cache_size < pgt_cache_water[1]) {
 		*(unsigned long *)pte = (unsigned long) pte_quicklist;
 		pte_quicklist = (unsigned long *) pte;
 		pgtable_cache_size++;
 	} else
 		pte_free_slow(pte);
+	preempt_enable();
 }
 
 #define pte_free(pte)		pte_free_fast(pte)
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-i386/smplock.h linux-2.4.19-pre8-ac3p/include/asm-i386/smplock.h
--- linux-2.4.19-pre8-ac3/include/asm-i386/smplock.h	Tue May 14 14:55:55 2002
+++ linux-2.4.19-pre8-ac3p/include/asm-i386/smplock.h	Tue May 14 18:09:08 2002
@@ -11,7 +11,15 @@
 extern spinlock_cacheline_t kernel_flag_cacheline;  
 #define kernel_flag kernel_flag_cacheline.lock      
 
+#ifdef CONFIG_SMP
 #define kernel_locked()		spin_is_locked(&kernel_flag)
+#else
+#ifdef CONFIG_PREEMPT
+#define kernel_locked()		preempt_get_count()
+#else
+#define kernel_locked()		1
+#endif
+#endif
 
 /*
  * Release global kernel lock and global interrupt lock
@@ -43,6 +51,11 @@
  */
 static __inline__ void lock_kernel(void)
 {
+#ifdef CONFIG_PREEMPT
+	if (current->lock_depth == -1)
+		spin_lock(&kernel_flag);
+	++current->lock_depth;
+#else
 #if 1
 	if (!++current->lock_depth)
 		spin_lock(&kernel_flag);
@@ -54,6 +67,7 @@
 		"\n9:"
 		:"=m" (__dummy_lock(&kernel_flag)),
 		 "=m" (current->lock_depth));
+#endif
 #endif
 }
 
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-i386/softirq.h linux-2.4.19-pre8-ac3p/include/asm-i386/softirq.h
--- linux-2.4.19-pre8-ac3/include/asm-i386/softirq.h	Tue May 14 14:55:55 2002
+++ linux-2.4.19-pre8-ac3p/include/asm-i386/softirq.h	Tue May 14 15:20:25 2002
@@ -5,9 +5,9 @@
 #include <asm/hardirq.h>
 
 #define __cpu_bh_enable(cpu) \
-		do { barrier(); local_bh_count(cpu)--; } while (0)
+		do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0)
 #define cpu_bh_disable(cpu) \
-		do { local_bh_count(cpu)++; barrier(); } while (0)
+		do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0)
 
 #define local_bh_disable()	cpu_bh_disable(smp_processor_id())
 #define __local_bh_enable()	__cpu_bh_enable(smp_processor_id())
@@ -22,7 +22,7 @@
  * If you change the offsets in irq_stat then you have to
  * update this code as well.
  */
-#define local_bh_enable()						\
+#define _local_bh_enable()						\
 do {									\
 	unsigned int *ptr = &local_bh_count(smp_processor_id());	\
 									\
@@ -44,5 +44,7 @@
 		: "r" (ptr), "i" (do_softirq)				\
 		/* no registers clobbered */ );				\
 } while (0)
+
+#define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0)
 
 #endif	/* __ASM_SOFTIRQ_H */
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-i386/spinlock.h linux-2.4.19-pre8-ac3p/include/asm-i386/spinlock.h
--- linux-2.4.19-pre8-ac3/include/asm-i386/spinlock.h	Tue May 14 14:55:55 2002
+++ linux-2.4.19-pre8-ac3p/include/asm-i386/spinlock.h	Tue May 14 15:20:25 2002
@@ -77,7 +77,7 @@
 		:"=m" (lock->lock) : : "memory"
 
 
-static inline void spin_unlock(spinlock_t *lock)
+static inline void _raw_spin_unlock(spinlock_t *lock)
 {
 #if SPINLOCK_DEBUG
 	if (lock->magic != SPINLOCK_MAGIC)
@@ -97,7 +97,7 @@
 		:"=q" (oldval), "=m" (lock->lock) \
 		:"0" (oldval) : "memory"
 
-static inline void spin_unlock(spinlock_t *lock)
+static inline void _raw_spin_unlock(spinlock_t *lock)
 {
 	char oldval = 1;
 #if SPINLOCK_DEBUG
@@ -113,7 +113,7 @@
 
 #endif
 
-static inline int spin_trylock(spinlock_t *lock)
+static inline int _raw_spin_trylock(spinlock_t *lock)
 {
 	char oldval;
 	__asm__ __volatile__(
@@ -123,7 +123,7 @@
 	return oldval > 0;
 }
 
-static inline void spin_lock(spinlock_t *lock)
+static inline void _raw_spin_lock(spinlock_t *lock)
 {
 #if SPINLOCK_DEBUG
 	__label__ here;
@@ -179,7 +179,7 @@
  */
 /* the spinlock helpers are in arch/i386/kernel/semaphore.c */
 
-static inline void read_lock(rwlock_t *rw)
+static inline void _raw_read_lock(rwlock_t *rw)
 {
 #if SPINLOCK_DEBUG
 	if (rw->magic != RWLOCK_MAGIC)
@@ -188,7 +188,7 @@
 	__build_read_lock(rw, "__read_lock_failed");
 }
 
-static inline void write_lock(rwlock_t *rw)
+static inline void _raw_write_lock(rwlock_t *rw)
 {
 #if SPINLOCK_DEBUG
 	if (rw->magic != RWLOCK_MAGIC)
@@ -197,10 +197,10 @@
 	__build_write_lock(rw, "__write_lock_failed");
 }
 
-#define read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
-#define write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
+#define _raw_read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
+#define _raw_write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
 
-static inline int write_trylock(rwlock_t *lock)
+static inline int _raw_write_trylock(rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-sh/hardirq.h linux-2.4.19-pre8-ac3p/include/asm-sh/hardirq.h
--- linux-2.4.19-pre8-ac3/include/asm-sh/hardirq.h	Sat Sep  8 15:29:09 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-sh/hardirq.h	Tue May 14 15:09:18 2002
@@ -34,6 +34,8 @@
 
 #define synchronize_irq()	barrier()
 
+#define release_irqlock(cpu)	do { } while (0)
+
 #else
 
 #error Super-H SMP is not available
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-sh/smplock.h linux-2.4.19-pre8-ac3p/include/asm-sh/smplock.h
--- linux-2.4.19-pre8-ac3/include/asm-sh/smplock.h	Sat Sep  8 15:29:09 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-sh/smplock.h	Tue May 14 15:09:18 2002
@@ -9,15 +9,88 @@
 
 #include <linux/config.h>
 
-#ifndef CONFIG_SMP
-
+#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT)
+/*
+ * Should never happen, since linux/smp_lock.h catches this case;
+ * but in case this file is included directly with neither SMP nor
+ * PREEMPT configuration, provide the same dummies as linux/smp_lock.h
+ */
 #define lock_kernel()				do { } while(0)
 #define unlock_kernel()				do { } while(0)
-#define release_kernel_lock(task, cpu, depth)	((depth) = 1)
-#define reacquire_kernel_lock(task, cpu, depth)	do { } while(0)
+#define release_kernel_lock(task, cpu)		do { } while(0)
+#define reacquire_kernel_lock(task)		do { } while(0)
+#define kernel_locked()		1
+
+#else /* CONFIG_SMP || CONFIG_PREEMPT */
+
+#if CONFIG_SMP
+#error "We do not support SMP on SH yet"
+#endif
+/*
+ * Default SMP lock implementation (i.e. the i386 version)
+ */
+
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+
+extern spinlock_t kernel_flag;
+#define lock_bkl() spin_lock(&kernel_flag)
+#define unlock_bkl() spin_unlock(&kernel_flag)
 
+#ifdef CONFIG_SMP
+#define kernel_locked()		spin_is_locked(&kernel_flag)
+#elif  CONFIG_PREEMPT
+#define kernel_locked()		preempt_get_count()
+#else  /* neither */
+#define kernel_locked()		1
+#endif
+
+/*
+ * Release global kernel lock and global interrupt lock
+ */
+#define release_kernel_lock(task, cpu) \
+do { \
+	if (task->lock_depth >= 0) \
+		spin_unlock(&kernel_flag); \
+	release_irqlock(cpu); \
+	__sti(); \
+} while (0)
+
+/*
+ * Re-acquire the kernel lock
+ */
+#define reacquire_kernel_lock(task) \
+do { \
+	if (task->lock_depth >= 0) \
+		spin_lock(&kernel_flag); \
+} while (0)
+
+/*
+ * Getting the big kernel lock.
+ *
+ * This cannot happen asynchronously,
+ * so we only need to worry about other
+ * CPU's.
+ */
+static __inline__ void lock_kernel(void)
+{
+#ifdef CONFIG_PREEMPT
+	if (current->lock_depth == -1)
+		spin_lock(&kernel_flag);
+	++current->lock_depth;
 #else
-#error "We do not support SMP on SH"
-#endif /* CONFIG_SMP */
+	if (!++current->lock_depth)
+		spin_lock(&kernel_flag);
+#endif
+}
+
+static __inline__ void unlock_kernel(void)
+{
+	if (current->lock_depth < 0)
+		BUG();
+	if (--current->lock_depth < 0)
+		spin_unlock(&kernel_flag);
+}
+#endif /* CONFIG_SMP || CONFIG_PREEMPT */
 
 #endif /* __ASM_SH_SMPLOCK_H */
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/asm-sh/softirq.h linux-2.4.19-pre8-ac3p/include/asm-sh/softirq.h
--- linux-2.4.19-pre8-ac3/include/asm-sh/softirq.h	Sat Sep  8 15:29:09 2001
+++ linux-2.4.19-pre8-ac3p/include/asm-sh/softirq.h	Tue May 14 15:09:18 2002
@@ -6,6 +6,7 @@
 
 #define local_bh_disable()			\
 do {						\
+	preempt_disable();			\
 	local_bh_count(smp_processor_id())++;	\
 	barrier();				\
 } while (0)
@@ -14,6 +15,7 @@
 do {						\
 	barrier();				\
 	local_bh_count(smp_processor_id())--;	\
+	preempt_enable();			\
 } while (0)
 
 #define local_bh_enable()				\
@@ -23,6 +25,7 @@
 	    && softirq_pending(smp_processor_id())) {	\
 		do_softirq();				\
 	}						\
+	preempt_enable();				\
 } while (0)
 
 #define in_softirq() (local_bh_count(smp_processor_id()) != 0)
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/brlock.h linux-2.4.19-pre8-ac3p/include/linux/brlock.h
--- linux-2.4.19-pre8-ac3/include/linux/brlock.h	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/include/linux/brlock.h	Tue May 14 15:20:27 2002
@@ -173,11 +173,11 @@
 }
 
 #else
-# define br_read_lock(idx)	((void)(idx))
-# define br_read_unlock(idx)	((void)(idx))
-# define br_write_lock(idx)	((void)(idx))
-# define br_write_unlock(idx)	((void)(idx))
-#endif
+# define br_read_lock(idx)	({ (void)(idx); preempt_disable(); })
+# define br_read_unlock(idx)	({ (void)(idx); preempt_enable(); })
+# define br_write_lock(idx)	({ (void)(idx); preempt_disable(); })
+# define br_write_unlock(idx)	({ (void)(idx); preempt_enable(); })
+#endif	/* CONFIG_SMP */
 
 /*
  * Now enumerate all of the possible sw/hw IRQ protected
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/dcache.h linux-2.4.19-pre8-ac3p/include/linux/dcache.h
--- linux-2.4.19-pre8-ac3/include/linux/dcache.h	Tue May 14 14:55:57 2002
+++ linux-2.4.19-pre8-ac3p/include/linux/dcache.h	Tue May 14 15:20:25 2002
@@ -126,31 +126,6 @@
 
 extern spinlock_t dcache_lock;
 
-/**
- * d_drop - drop a dentry
- * @dentry: dentry to drop
- *
- * d_drop() unhashes the entry from the parent
- * dentry hashes, so that it won't be found through
- * a VFS lookup any more. Note that this is different
- * from deleting the dentry - d_delete will try to
- * mark the dentry negative if possible, giving a
- * successful _negative_ lookup, while d_drop will
- * just make the cache lookup fail.
- *
- * d_drop() is used mainly for stuff that wants
- * to invalidate a dentry for some reason (NFS
- * timeouts or autofs deletes).
- */
-
-static __inline__ void d_drop(struct dentry * dentry)
-{
-	spin_lock(&dcache_lock);
-	list_del(&dentry->d_hash);
-	INIT_LIST_HEAD(&dentry->d_hash);
-	spin_unlock(&dcache_lock);
-}
-
 static __inline__ int dname_external(struct dentry *d)
 {
 	return d->d_name.name != d->d_iname; 
@@ -275,3 +250,34 @@
 #endif /* __KERNEL__ */
 
 #endif	/* __LINUX_DCACHE_H */
+
+#if !defined(__LINUX_DCACHE_H_INLINES) && defined(_TASK_STRUCT_DEFINED)
+#define __LINUX_DCACHE_H_INLINES
+
+#ifdef __KERNEL__
+/**
+ * d_drop - drop a dentry
+ * @dentry: dentry to drop
+ *
+ * d_drop() unhashes the entry from the parent
+ * dentry hashes, so that it won't be found through
+ * a VFS lookup any more. Note that this is different
+ * from deleting the dentry - d_delete will try to
+ * mark the dentry negative if possible, giving a
+ * successful _negative_ lookup, while d_drop will
+ * just make the cache lookup fail.
+ *
+ * d_drop() is used mainly for stuff that wants
+ * to invalidate a dentry for some reason (NFS
+ * timeouts or autofs deletes).
+ */
+
+static __inline__ void d_drop(struct dentry * dentry)
+{
+	spin_lock(&dcache_lock);
+	list_del(&dentry->d_hash);
+	INIT_LIST_HEAD(&dentry->d_hash);
+	spin_unlock(&dcache_lock);
+}
+#endif
+#endif
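
The d_drop() move above (and the similar fs_struct.h and tqueue.h moves
below) all follow one pattern: inlines that now take a spinlock, and
therefore touch current->preempt_count, can only be compiled once
task_struct exists, so they go behind a _TASK_STRUCT_DEFINED guard and
sched.h re-includes the headers at its end. A stand-alone model of the
trick, with hypothetical names, that compiles and runs as written:

#include <stdio.h>

struct task { int preempt_count; };	/* stands in for task_struct */

/* On a real first inclusion the guard below stays closed because
 * _TASK_STRUCT_DEFINED is not yet set; sched.h defines it and then
 * re-includes the header, which opens the guard. */
#define _TASK_STRUCT_DEFINED

#if !defined(_HDR_INLINES) && defined(_TASK_STRUCT_DEFINED)
#define _HDR_INLINES
static inline int task_count(struct task *t)
{
	return t->preempt_count;	/* needs the full struct */
}
#endif

int main(void)
{
	struct task t = { 1 };
	printf("%d\n", task_count(&t));
	return 0;
}
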
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/fs_struct.h linux-2.4.19-pre8-ac3p/include/linux/fs_struct.h
--- linux-2.4.19-pre8-ac3/include/linux/fs_struct.h	Fri Jul 13 18:10:44 2001
+++ linux-2.4.19-pre8-ac3p/include/linux/fs_struct.h	Tue May 14 15:09:18 2002
@@ -20,6 +20,15 @@
 extern void exit_fs(struct task_struct *);
 extern void set_fs_altroot(void);
 
+struct fs_struct *copy_fs_struct(struct fs_struct *old);
+void put_fs_struct(struct fs_struct *fs);
+
+#endif
+#endif
+
+#if !defined(_LINUX_FS_STRUCT_H_INLINES) && defined(_TASK_STRUCT_DEFINED)
+#define _LINUX_FS_STRUCT_H_INLINES
+#ifdef __KERNEL__
 /*
  * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
  * It can block. Requires the big lock held.
@@ -65,9 +74,5 @@
 		mntput(old_pwdmnt);
 	}
 }
-
-struct fs_struct *copy_fs_struct(struct fs_struct *old);
-void put_fs_struct(struct fs_struct *fs);
-
 #endif
 #endif
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/kernel_stat.h linux-2.4.19-pre8-ac3p/include/linux/kernel_stat.h
--- linux-2.4.19-pre8-ac3/include/linux/kernel_stat.h	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/include/linux/kernel_stat.h	Tue May 14 16:49:04 2002
@@ -18,7 +18,8 @@
 struct kernel_stat {
 	unsigned int per_cpu_user[NR_CPUS],
 	             per_cpu_nice[NR_CPUS],
-	             per_cpu_system[NR_CPUS];
+	             per_cpu_system[NR_CPUS],
+	             per_cpu_iowait[NR_CPUS];
 	unsigned int dk_drive[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_rio[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK];
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/sched.h linux-2.4.19-pre8-ac3p/include/linux/sched.h
--- linux-2.4.19-pre8-ac3/include/linux/sched.h	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/include/linux/sched.h	Tue May 14 18:09:05 2002
@@ -91,6 +91,7 @@
 #define TASK_UNINTERRUPTIBLE	2
 #define TASK_ZOMBIE		4
 #define TASK_STOPPED		8
+#define PREEMPT_ACTIVE		0x4000000
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
@@ -156,6 +157,9 @@
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long FASTCALL(schedule_timeout(signed long timeout));
 asmlinkage void schedule(void);
+#ifdef CONFIG_PREEMPT
+asmlinkage void preempt_schedule(void);
+#endif
 
 extern int schedule_task(struct tq_struct *task);
 extern void flush_scheduled_tasks(void);
@@ -291,7 +295,7 @@
 	 * offsets of these are hardcoded elsewhere - touch with care
 	 */
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
-	unsigned long flags;	/* per process flags, defined below */
+	int preempt_count;	/* 0 => preemptable, <0 => BUG */
 	int sigpending;
 	mm_segment_t addr_limit;	/* thread address space:
 					 	0-0xBFFFFFFF for user-thead
@@ -317,6 +321,7 @@
 	unsigned long policy;
 	unsigned long cpus_allowed;
 	unsigned int time_slice;
+	unsigned long flags;
 
 	task_t *next_task, *prev_task;
 
@@ -358,6 +363,7 @@
 	struct tms times;
 	unsigned long start_time;
 	long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
+	long per_cpu_iowait[NR_CPUS];
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
@@ -942,6 +948,11 @@
 {
 	return unlikely(current->need_resched);
 }
+
+#define _TASK_STRUCT_DEFINED
+#include <linux/dcache.h>
+#include <linux/tqueue.h>
+#include <linux/fs_struct.h>
 
 #endif /* __KERNEL__ */
 
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/smp.h linux-2.4.19-pre8-ac3p/include/linux/smp.h
--- linux-2.4.19-pre8-ac3/include/linux/smp.h	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/include/linux/smp.h	Tue May 14 15:20:25 2002
@@ -81,7 +81,9 @@
 #define smp_processor_id()			0
 #define hard_smp_processor_id()			0
 #define smp_threads_ready			1
+#ifndef CONFIG_PREEMPT
 #define kernel_lock()
+#endif
 #define cpu_logical_map(cpu)			0
 #define cpu_number_map(cpu)			0
 #define smp_call_function(func,info,retry,wait)	({ 0; })
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/smp_lock.h linux-2.4.19-pre8-ac3p/include/linux/smp_lock.h
--- linux-2.4.19-pre8-ac3/include/linux/smp_lock.h	Thu Nov 22 14:46:27 2001
+++ linux-2.4.19-pre8-ac3p/include/linux/smp_lock.h	Tue May 14 18:09:08 2002
@@ -3,7 +3,7 @@
 
 #include <linux/config.h>
 
-#ifndef CONFIG_SMP
+#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT)
 
 #define lock_kernel()				do { } while(0)
 #define unlock_kernel()				do { } while(0)
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/spinlock.h linux-2.4.19-pre8-ac3p/include/linux/spinlock.h
--- linux-2.4.19-pre8-ac3/include/linux/spinlock.h	Tue May 14 14:55:58 2002
+++ linux-2.4.19-pre8-ac3p/include/linux/spinlock.h	Tue May 14 15:20:25 2002
@@ -2,6 +2,7 @@
 #define __LINUX_SPINLOCK_H
 
 #include <linux/config.h>
+#include <linux/compiler.h>
 
 /*
  * These are the generic versions of the spinlocks and read-write
@@ -62,8 +63,10 @@
 
 #if (DEBUG_SPINLOCKS < 1)
 
+#ifndef CONFIG_PREEMPT
 #define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic)
 #define ATOMIC_DEC_AND_LOCK
+#endif
 
 /*
  * Your basic spinlocks, allowing only a single CPU anywhere
@@ -79,11 +82,11 @@
 #endif
 
 #define spin_lock_init(lock)	do { } while(0)
-#define spin_lock(lock)		(void)(lock) /* Not "unused variable". */
+#define _raw_spin_lock(lock)	(void)(lock) /* Not "unused variable". */
 #define spin_is_locked(lock)	(0)
-#define spin_trylock(lock)	({1; })
+#define _raw_spin_trylock(lock)	({1; })
 #define spin_unlock_wait(lock)	do { } while(0)
-#define spin_unlock(lock)	do { } while(0)
+#define _raw_spin_unlock(lock)	do { } while(0)
 
 #elif (DEBUG_SPINLOCKS < 2)
 
@@ -142,12 +145,75 @@
 #endif
 
 #define rwlock_init(lock)	do { } while(0)
-#define read_lock(lock)		(void)(lock) /* Not "unused variable". */
-#define read_unlock(lock)	do { } while(0)
-#define write_lock(lock)	(void)(lock) /* Not "unused variable". */
-#define write_unlock(lock)	do { } while(0)
+#define _raw_read_lock(lock)	(void)(lock) /* Not "unused variable". */
+#define _raw_read_unlock(lock)	do { } while(0)
+#define _raw_write_lock(lock)	(void)(lock) /* Not "unused variable". */
+#define _raw_write_unlock(lock)	do { } while(0)
 
 #endif /* !SMP */
+
+#ifdef CONFIG_PREEMPT
+
+#define preempt_get_count() (current->preempt_count)
+
+#define preempt_disable() \
+do { \
+	++current->preempt_count; \
+	barrier(); \
+} while (0)
+
+#define preempt_enable_no_resched() \
+do { \
+	--current->preempt_count; \
+	barrier(); \
+} while (0)
+
+#define preempt_enable() \
+do { \
+	--current->preempt_count; \
+	barrier(); \
+	if (unlikely(current->preempt_count < current->need_resched)) \
+		preempt_schedule(); \
+} while (0)
+
+#define spin_lock(lock)	\
+do { \
+	preempt_disable(); \
+	_raw_spin_lock(lock); \
+} while(0)
+
+#define spin_trylock(lock)	({preempt_disable(); _raw_spin_trylock(lock) ? \
+				1 : ({preempt_enable(); 0;});})
+#define spin_unlock(lock) \
+do { \
+	_raw_spin_unlock(lock); \
+	preempt_enable(); \
+} while (0)
+
+#define read_lock(lock)		({preempt_disable(); _raw_read_lock(lock);})
+#define read_unlock(lock)	({_raw_read_unlock(lock); preempt_enable();})
+#define write_lock(lock)	({preempt_disable(); _raw_write_lock(lock);})
+#define write_unlock(lock)	({_raw_write_unlock(lock); preempt_enable();})
+#define write_trylock(lock)	({preempt_disable();_raw_write_trylock(lock) ? \
+				1 : ({preempt_enable(); 0;});})
+
+#else
+
+#define preempt_get_count()	(0)
+#define preempt_disable()	do { } while (0)
+#define preempt_enable_no_resched()	do {} while(0)
+#define preempt_enable()	do { } while (0)
+
+#define spin_lock(lock)		_raw_spin_lock(lock)
+#define spin_trylock(lock)	_raw_spin_trylock(lock)
+#define spin_unlock(lock)	_raw_spin_unlock(lock)
+
+#define read_lock(lock)		_raw_read_lock(lock)
+#define read_unlock(lock)	_raw_read_unlock(lock)
+#define write_lock(lock)	_raw_write_lock(lock)
+#define write_unlock(lock)	_raw_write_unlock(lock)
+#define write_trylock(lock)	_raw_write_trylock(lock)
+#endif
 
 /* "lock on reference count zero" */
 #ifndef ATOMIC_DEC_AND_LOCK
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/swap.h linux-2.4.19-pre8-ac3p/include/linux/swap.h
--- linux-2.4.19-pre8-ac3/include/linux/swap.h	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/include/linux/swap.h	Tue May 14 16:49:58 2002
@@ -90,6 +90,7 @@
 extern int nr_inactive_clean_pages;
 extern atomic_t page_cache_size;
 extern atomic_t buffermem_pages;
+extern atomic_t nr_iowait_tasks;
 extern spinlock_cacheline_t pagecache_lock_cacheline;
 #define pagecache_lock (pagecache_lock_cacheline.lock)
 extern void __remove_inode_page(struct page *);
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/include/linux/tqueue.h linux-2.4.19-pre8-ac3p/include/linux/tqueue.h
--- linux-2.4.19-pre8-ac3/include/linux/tqueue.h	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/include/linux/tqueue.h	Tue May 14 15:20:25 2002
@@ -94,6 +94,22 @@
 extern spinlock_t tqueue_lock;
 
 /*
+ * Call all "bottom halfs" on a given list.
+ */
+
+extern void __run_task_queue(task_queue *list);
+
+static inline void run_task_queue(task_queue *list)
+{
+	if (TQ_ACTIVE(*list))
+		__run_task_queue(list);
+}
+
+#endif /* _LINUX_TQUEUE_H */
+
+#if !defined(_LINUX_TQUEUE_H_INLINES) && defined(_TASK_STRUCT_DEFINED)
+#define _LINUX_TQUEUE_H_INLINES
+/*
  * Queue a task on a tq.  Return non-zero if it was successfully
  * added.
  */
@@ -109,17 +125,4 @@
 	}
 	return ret;
 }
-
-/*
- * Call all "bottom halfs" on a given list.
- */
-
-extern void __run_task_queue(task_queue *list);
-
-static inline void run_task_queue(task_queue *list)
-{
-	if (TQ_ACTIVE(*list))
-		__run_task_queue(list);
-}
-
-#endif /* _LINUX_TQUEUE_H */
+#endif
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/kernel/exit.c linux-2.4.19-pre8-ac3p/kernel/exit.c
--- linux-2.4.19-pre8-ac3/kernel/exit.c	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/kernel/exit.c	Tue May 14 15:09:18 2002
@@ -373,8 +373,8 @@
 		/* more a memory barrier than a real lock */
 		task_lock(tsk);
 		tsk->mm = NULL;
-		task_unlock(tsk);
 		enter_lazy_tlb(mm, current, smp_processor_id());
+		task_unlock(tsk);
 		mmput(mm);
 	}
 }
@@ -494,6 +494,11 @@
 		panic("Attempted to kill init!");
 	tsk->flags |= PF_EXITING;
 	del_timer_sync(&tsk->real_timer);
+
+	if (unlikely(preempt_get_count()))
+		printk(KERN_ERR "%s[%d] exited with preempt_count %d\n",
+				current->comm, current->pid,
+				preempt_get_count());
 
 fake_volatile:
 #ifdef CONFIG_BSD_PROCESS_ACCT
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/kernel/fork.c linux-2.4.19-pre8-ac3p/kernel/fork.c
--- linux-2.4.19-pre8-ac3/kernel/fork.c	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/kernel/fork.c	Tue May 14 15:09:18 2002
@@ -640,6 +640,13 @@
 	if (p->binfmt && p->binfmt->module)
 		__MOD_INC_USE_COUNT(p->binfmt->module);
 
+#ifdef CONFIG_PREEMPT
+	/*
+	 * schedule_tail drops this_rq()->lock so compensate with a count
+	 * of 1.  Also, we want to start with kernel preemption disabled.
+	 */
+	p->preempt_count = 1;
+#endif
 	p->did_exec = 0;
 	p->swappable = 0;
 	p->state = TASK_UNINTERRUPTIBLE;
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/kernel/ksyms.c linux-2.4.19-pre8-ac3p/kernel/ksyms.c
--- linux-2.4.19-pre8-ac3/kernel/ksyms.c	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/kernel/ksyms.c	Tue May 14 15:09:18 2002
@@ -442,6 +442,9 @@
 EXPORT_SYMBOL(interruptible_sleep_on);
 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
 EXPORT_SYMBOL(schedule);
+#ifdef CONFIG_PREEMPT
+EXPORT_SYMBOL(preempt_schedule);
+#endif
 EXPORT_SYMBOL(schedule_timeout);
 EXPORT_SYMBOL(sys_sched_yield);
 EXPORT_SYMBOL(set_user_nice);
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/kernel/sched.c linux-2.4.19-pre8-ac3p/kernel/sched.c
--- linux-2.4.19-pre8-ac3/kernel/sched.c	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/kernel/sched.c	Tue May 14 15:09:18 2002
@@ -165,10 +165,12 @@
 	struct runqueue *rq;
 
 repeat_lock_task:
+	preempt_disable();
 	rq = task_rq(p);
 	spin_lock_irqsave(&rq->lock, *flags);
 	if (unlikely(rq != task_rq(p))) {
 		spin_unlock_irqrestore(&rq->lock, *flags);
+		preempt_enable();
 		goto repeat_lock_task;
 	}
 	return rq;
@@ -177,6 +179,7 @@
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 {
 	spin_unlock_irqrestore(&rq->lock, *flags);
+	preempt_enable();
 }
 
 /*
@@ -257,11 +260,13 @@
 {
 	int need_resched;
 
+	preempt_disable();
 	need_resched = p->need_resched;
 	wmb();
 	set_tsk_need_resched(p);
 	if (!need_resched && (p->cpu != smp_processor_id()))
 		smp_send_reschedule(p->cpu);
+	preempt_enable();
 }
 
 #ifdef CONFIG_SMP
@@ -276,6 +281,7 @@
 	runqueue_t *rq;
 
 repeat:
+	preempt_disable();
 	rq = task_rq(p);
 	while (unlikely(rq->curr == p)) {
 		cpu_relax();
@@ -284,9 +290,11 @@
 	rq = task_rq_lock(p, &flags);
 	if (unlikely(rq->curr == p)) {
 		task_rq_unlock(rq, &flags);
+		preempt_enable();
 		goto repeat;
 	}
 	task_rq_unlock(rq, &flags);
+	preempt_enable();
 }
 
 /*
@@ -340,6 +348,7 @@
 {
 	runqueue_t *rq;
 
+	preempt_disable();
 	rq = this_rq();
 	spin_lock_irq(&rq->lock);
 
@@ -357,6 +366,7 @@
 	p->cpu = smp_processor_id();
 	activate_task(p, rq);
 	spin_unlock_irq(&rq->lock);
+	preempt_enable();
 }
 
 /*
@@ -384,7 +394,7 @@
 			p->sleep_avg) / (EXIT_WEIGHT + 1);
 }
 
-#if CONFIG_SMP
+#if CONFIG_SMP || CONFIG_PREEMPT
 asmlinkage void schedule_tail(task_t *prev)
 {
 	spin_unlock_irq(&this_rq()->frozen);
@@ -739,6 +749,7 @@
 	BUG_ON(in_interrupt());
 
 need_resched:
+	preempt_disable();
 	prev = current;
 	rq = this_rq();
 
@@ -746,6 +757,13 @@
 	prev->sleep_timestamp = jiffies;
 	spin_lock_irq(&rq->lock);
 
+	/*
+	 * if entering from preempt_schedule, off a kernel preemption,
+	 * go straight to picking the next task.
+	 */
+	if (unlikely(preempt_get_count() & PREEMPT_ACTIVE))
+		goto pick_next_task;
+
 	switch (prev->state) {
 	case TASK_INTERRUPTIBLE:
 		if (unlikely(signal_pending(prev))) {
@@ -757,9 +775,7 @@
 	case TASK_RUNNING:
 		;
 	}
-#if CONFIG_SMP
 pick_next_task:
-#endif
 	if (unlikely(!rq->nr_running)) {
 #if CONFIG_SMP
 		load_balance(rq, 1);
@@ -810,11 +826,30 @@
 	}
 
 	reacquire_kernel_lock(current);
+	preempt_enable_no_resched();
 	if (need_resched())
 		goto need_resched;
 	return;
 }
 
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption.
+ */
+asmlinkage void preempt_schedule(void)
+{
+need_resched:
+	current->preempt_count += PREEMPT_ACTIVE;
+	schedule();
+ 	current->preempt_count -= PREEMPT_ACTIVE;
+
+	/* we could miss a preemption between schedule() and now */
+ 	barrier();
+	if (unlikely((current->need_resched)))
+		goto need_resched;
+}
+#endif /* CONFIG_PREEMPT */
+
 /*
  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
@@ -1192,6 +1227,7 @@
 	runqueue_t *rq;
 	prio_array_t *array;
 
+	preempt_disable();
 	rq = this_rq();
 
 	/*
@@ -1220,6 +1256,7 @@
 		__set_bit(current->prio, array->bitmap);
 	}
 	spin_unlock(&rq->lock);
+	preempt_enable_no_resched();
 
 	schedule();
 
@@ -1424,6 +1461,9 @@
 	double_rq_unlock(idle_rq, rq);
 	set_tsk_need_resched(idle);
 	__restore_flags(flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+	idle->preempt_count = (idle->lock_depth >= 0);
 }
 
 extern void init_timervecs(void);
@@ -1520,6 +1560,7 @@
 	if (!new_mask)
 		BUG();
 
+	preempt_disable();
 	rq = task_rq_lock(p, &flags);
 	p->cpus_allowed = new_mask;
 	/*
@@ -1528,7 +1569,7 @@
 	 */
 	if (new_mask & (1UL << p->cpu)) {
 		task_rq_unlock(rq, &flags);
-		return;
+		goto out;
 	}
 
 	init_MUTEX_LOCKED(&req.sem);
@@ -1538,6 +1579,8 @@
 	wake_up_process(rq->migration_thread);
 
 	down(&req.sem);
+out:
+	preempt_enable();
 }
 
 static int migration_thread(void * bind_cpu)
@@ -1592,18 +1635,18 @@
 		cpu_dest = __ffs(p->cpus_allowed);
 		rq_dest = cpu_rq(cpu_dest);
 repeat:
-		cpu_src = p->thread_info->cpu;
+		cpu_src = p->cpu;
 		rq_src = cpu_rq(cpu_src);
 
 		local_irq_save(flags);
 		double_rq_lock(rq_src, rq_dest);
-		if (p->thread_info->cpu != cpu_src) {
+		if (p->cpu != cpu_src) {
 			double_rq_unlock(rq_src, rq_dest);
 			local_irq_restore(flags);
 			goto repeat;
 		}
 		if (rq_src == rq) {
-			p->thread_info->cpu = cpu_dest;
+			p->cpu = cpu_dest;
 			if (p->array) {
 				deactivate_task(p, rq_src);
 				activate_task(p, rq_dest);
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/kernel/timer.c linux-2.4.19-pre8-ac3p/kernel/timer.c
--- linux-2.4.19-pre8-ac3/kernel/timer.c	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/kernel/timer.c	Tue May 14 18:42:46 2002
@@ -585,6 +585,8 @@
 {
 	p->per_cpu_utime[cpu] += user;
 	p->per_cpu_stime[cpu] += system;
+	if (atomic_read(&nr_iowait_tasks) > 0)
+		kstat.per_cpu_iowait[cpu] += system;
 	do_process_times(p, user, system);
 	do_it_virt(p, user);
 	do_it_prof(p);
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/lib/dec_and_lock.c linux-2.4.19-pre8-ac3p/lib/dec_and_lock.c
--- linux-2.4.19-pre8-ac3/lib/dec_and_lock.c	Wed Oct  3 12:11:26 2001
+++ linux-2.4.19-pre8-ac3p/lib/dec_and_lock.c	Tue May 14 15:09:18 2002
@@ -1,5 +1,6 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/sched.h>
 #include <asm/atomic.h>
 
 /*
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/mm/filemap.c linux-2.4.19-pre8-ac3p/mm/filemap.c
--- linux-2.4.19-pre8-ac3/mm/filemap.c	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/mm/filemap.c	Tue May 14 17:01:27 2002
@@ -45,6 +45,7 @@
  */
 
 atomic_t page_cache_size = ATOMIC_INIT(0);
+atomic_t nr_iowait_tasks = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
 
@@ -828,8 +829,10 @@
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (!PageLocked(page))
 			break;
+		atomic_inc(&nr_iowait_tasks);
 		sync_page(page);
 		schedule();
+		atomic_dec(&nr_iowait_tasks);
 	} while (PageLocked(page));
 	__set_task_state(tsk, TASK_RUNNING);
 	remove_wait_queue(waitqueue, &wait);
@@ -875,8 +878,10 @@
 	for (;;) {
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (PageLocked(page)) {
+			atomic_inc(&nr_iowait_tasks);
 			sync_page(page);
 			schedule();
+			atomic_dec(&nr_iowait_tasks);
 		}
 		if (!TryLockPage(page))
 			break;
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/mm/slab.c linux-2.4.19-pre8-ac3p/mm/slab.c
--- linux-2.4.19-pre8-ac3/mm/slab.c	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/mm/slab.c	Tue May 14 15:09:18 2002
@@ -49,7 +49,8 @@
  *  constructors and destructors are called without any locking.
  *  Several members in kmem_cache_t and slab_t never change, they
  *	are accessed without any locking.
- *  The per-cpu arrays are never accessed from the wrong cpu, no locking.
+ *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
+ *  	and local interrupts are disabled so slab code is preempt-safe.
  *  The non-constant members are protected with a per-cache irq spinlock.
  *
  * Further notes from the original documentation:
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/net/socket.c linux-2.4.19-pre8-ac3p/net/socket.c
--- linux-2.4.19-pre8-ac3/net/socket.c	Tue May 14 14:59:20 2002
+++ linux-2.4.19-pre8-ac3p/net/socket.c	Tue May 14 15:09:18 2002
@@ -132,7 +132,7 @@
 
 static struct net_proto_family *net_families[NPROTO];
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 static atomic_t net_family_lockct = ATOMIC_INIT(0);
 static spinlock_t net_family_lock = SPIN_LOCK_UNLOCKED;
 
diff -u -r --new-file -X /usr/src/exclude linux-2.4.19-pre8-ac3/net/sunrpc/pmap_clnt.c linux-2.4.19-pre8-ac3p/net/sunrpc/pmap_clnt.c
--- linux-2.4.19-pre8-ac3/net/sunrpc/pmap_clnt.c	Tue May 14 14:56:02 2002
+++ linux-2.4.19-pre8-ac3p/net/sunrpc/pmap_clnt.c	Tue May 14 15:09:18 2002
@@ -12,6 +12,7 @@
 #include <linux/config.h>
 #include <linux/types.h>
 #include <linux/socket.h>
+#include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/uio.h>
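
In outline, most of the patch above reduces to one rule: a task may be
preempted only while its preempt_count is zero, so every raw lock and
unlock (and every bh/irq section) nests the count, and dropping the
count back to zero with a reschedule pending enters preempt_schedule().
A user-space model of that rule, all names hypothetical and the enable
test simplified from the include/linux/spinlock.h hunk above, which
compiles and runs as written:

#include <stdio.h>

struct task { int preempt_count; int need_resched; };
static struct task cur = { 0, 0 };

static void preempt_schedule_model(void)
{
	printf("reschedule\n");		/* kernel: preempt_schedule() */
}

static void preempt_disable_model(void)
{
	cur.preempt_count++;		/* kernel: preempt_disable() */
}

static void preempt_enable_model(void)
{
	/* kernel: preempt_enable(); preemption fires only at count 0 */
	if (--cur.preempt_count == 0 && cur.need_resched)
		preempt_schedule_model();
}

static void spin_lock_model(void)   { preempt_disable_model(); /* _raw_spin_lock */ }
static void spin_unlock_model(void) { /* _raw_spin_unlock */ preempt_enable_model(); }

int main(void)
{
	spin_lock_model();
	cur.need_resched = 1;	/* a wakeup arrives while the lock is held */
	spin_lock_model();	/* nested lock: count goes to 2 */
	spin_unlock_model();	/* count 1: still no preemption */
	spin_unlock_model();	/* count 0: the deferred reschedule runs now */
	return 0;
}
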

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 17:17       ` Denis Vlasenko
  2002-05-15 14:03         ` Rik van Riel
@ 2002-05-15 15:15         ` Bill Davidsen
  2002-05-16 10:58           ` Denis Vlasenko
  1 sibling, 1 reply; 28+ messages in thread
From: Bill Davidsen @ 2002-05-15 15:15 UTC (permalink / raw)
  To: Denis Vlasenko; +Cc: Rik van Riel, Linux Kernel Mailing List, linux-mm

On Wed, 15 May 2002, Denis Vlasenko wrote:

> Since you are working on this piece of the kernel,
> 
> I was investigating why I sometimes see an idle % in top like
> 9384729374923.43%. It was caused by the idle count in /proc/stat
> sometimes going backward.
> 
> I found the race responsible for that and have a fix for it
> (attached below). It checks for a jiffies change and regenerates
> the stats if jiffies++ hit us.
> 
> Unfortunately it is for the UP case only; in SMP the race still
> exists, even with an SMP kernel on a UP box.
> 
> Why: system/user/idle[/iowait] stats are collected at timer int
> on UP but _on local APIC int_ on SMP.
> 
> It can be fixed for SMP:
> * add a spinlock
> or
> * add per_cpu_idle, account it too at the timer/APIC int
>   and get rid of the idle % calculation for /proc/stat
> 
> As a user, I vote for glitchless statistics even if they
> consume an extra i++ cycle every timer int on every CPU.

You have pointed out the problem, but since your fix is UP only and
doesn't have the iowait stuff, I think more of the same is needed. I
don't recall seeing this with preempt, but I am not a top user unless
I'm looking for problems.

Thanks for the pointer.

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 20:17           ` Denis Vlasenko
@ 2002-05-15 16:13             ` Rik van Riel
  2002-05-15 16:21               ` William Lee Irwin III
                                 ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Rik van Riel @ 2002-05-15 16:13 UTC (permalink / raw)
  To: Denis Vlasenko; +Cc: linux-kernel, linux-mm

On Wed, 15 May 2002, Denis Vlasenko wrote:

> I think two patches for the same kernel piece at the same time is
> too many. Go ahead and code this if you want.

OK, here it is.   Changes against yesterday's patch:

1) make sure idle time can never go backwards by incrementing
   the idle time in the timer interrupt too (surely we can
   take this overhead if we're idle anyway ;))

2) get_request_wait also raises nr_iowait_tasks (thanks akpm)

This patch is against the latest 2.5 kernel from bk and
pretty much untested. If you have the time, please test
it and let me know if it works.
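
With this patch the "cpu" lines in /proc/stat carry five fields, with
idle reported directly instead of derived by subtraction. A minimal
user-space reader, assuming the new format (it is not part of the
patch), which fails gracefully on the old four-field layout:

#include <stdio.h>

int main(void)
{
	unsigned long user, nice, system, idle, iowait;
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;
	/* fscanf returns 4 on an unpatched kernel, so the check below
	 * rejects the old four-field line instead of misparsing it */
	if (fscanf(f, "cpu %lu %lu %lu %lu %lu",
		   &user, &nice, &system, &idle, &iowait) == 5)
		printf("idle=%lu iowait=%lu\n", idle, iowait);
	fclose(f);
	return 0;
}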

regards,

Rik
-- 

 drivers/block/ll_rw_blk.c   |    5 ++++-
 fs/buffer.c                 |    2 ++
 fs/proc/proc_misc.c         |   15 ++++++++-------
 include/linux/kernel_stat.h |    4 +++-
 include/linux/swap.h        |    1 +
 kernel/sched.c              |    4 ++++
 mm/filemap.c                |    5 +++++
 7 files changed, 27 insertions(+), 9 deletions(-)


===== drivers/block/ll_rw_blk.c 1.65 vs edited =====
--- 1.65/drivers/block/ll_rw_blk.c	Mon May  6 12:17:09 2002
+++ edited/drivers/block/ll_rw_blk.c	Wed May 15 13:06:49 2002
@@ -1068,8 +1068,11 @@
 	add_wait_queue_exclusive(&rl->wait, &wait);
 	do {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!rl->count)
+		if (!rl->count) {
+			atomic_inc(&nr_iowait_tasks);
 			schedule();
+			atomic_dec(&nr_iowait_tasks);
+		}
 		spin_lock_irq(q->queue_lock);
 		rq = get_request(q, rw);
 		spin_unlock_irq(q->queue_lock);
===== fs/buffer.c 1.96 vs edited =====
--- 1.96/fs/buffer.c	Sat May  4 20:46:31 2002
+++ edited/fs/buffer.c	Tue May 14 14:06:40 2002
@@ -142,7 +142,9 @@
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (!buffer_locked(bh))
 			break;
+		atomic_inc(&nr_iowait_tasks);
 		schedule();
+		atomic_dec(&nr_iowait_tasks);
 	} while (buffer_locked(bh));
 	tsk->state = TASK_RUNNING;
 	remove_wait_queue(wq, &wait);
===== fs/proc/proc_misc.c 1.24 vs edited =====
--- 1.24/fs/proc/proc_misc.c	Fri May  3 02:01:31 2002
+++ edited/fs/proc/proc_misc.c	Wed May 15 13:05:21 2002
@@ -282,7 +282,7 @@
 	int i, len;
 	extern unsigned long total_forks;
 	unsigned long jif = jiffies;
-	unsigned int sum = 0, user = 0, nice = 0, system = 0;
+	unsigned int sum = 0, user = 0, nice = 0, system = 0, idle = 0, iowait = 0;
 	int major, disk;

 	for (i = 0 ; i < smp_num_cpus; i++) {
@@ -291,23 +291,24 @@
 		user += kstat.per_cpu_user[cpu];
 		nice += kstat.per_cpu_nice[cpu];
 		system += kstat.per_cpu_system[cpu];
+		idle += kstat.per_cpu_idle[cpu];
+		iowait += kstat.per_cpu_iowait[cpu];
 #if !defined(CONFIG_ARCH_S390)
 		for (j = 0 ; j < NR_IRQS ; j++)
 			sum += kstat.irqs[cpu][j];
 #endif
 	}

-	len = sprintf(page, "cpu  %u %u %u %lu\n", user, nice, system,
-		      jif * smp_num_cpus - (user + nice + system));
+	len = sprintf(page, "cpu  %u %u %u %u %u\n", user, nice, system,
+		      idle, iowait);
 	for (i = 0 ; i < smp_num_cpus; i++)
-		len += sprintf(page + len, "cpu%d %u %u %u %lu\n",
+		len += sprintf(page + len, "cpu%d %u %u %u %u %u\n",
 			i,
 			kstat.per_cpu_user[cpu_logical_map(i)],
 			kstat.per_cpu_nice[cpu_logical_map(i)],
 			kstat.per_cpu_system[cpu_logical_map(i)],
-			jif - (  kstat.per_cpu_user[cpu_logical_map(i)] \
-				   + kstat.per_cpu_nice[cpu_logical_map(i)] \
-				   + kstat.per_cpu_system[cpu_logical_map(i)]));
+			kstat.per_cpu_idle[cpu_logical_map(i)],
+			kstat.per_cpu_iowait[cpu_logical_map(i)]);
 	len += sprintf(page + len,
 		"page %u %u\n"
 		"swap %u %u\n"
===== include/linux/kernel_stat.h 1.4 vs edited =====
--- 1.4/include/linux/kernel_stat.h	Thu Apr 11 01:25:39 2002
+++ edited/include/linux/kernel_stat.h	Wed May 15 12:58:38 2002
@@ -18,7 +18,9 @@
 struct kernel_stat {
 	unsigned int per_cpu_user[NR_CPUS],
 	             per_cpu_nice[NR_CPUS],
-	             per_cpu_system[NR_CPUS];
+	             per_cpu_system[NR_CPUS],
+	             per_cpu_idle[NR_CPUS],
+	             per_cpu_iowait[NR_CPUS];
 	unsigned int dk_drive[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_rio[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK];
===== include/linux/swap.h 1.42 vs edited =====
--- 1.42/include/linux/swap.h	Sun May  5 13:55:39 2002
+++ edited/include/linux/swap.h	Tue May 14 14:07:52 2002
@@ -108,6 +108,7 @@
 extern atomic_t buffermem_pages;
 extern spinlock_t pagecache_lock;
 extern void __remove_inode_page(struct page *);
+extern atomic_t nr_iowait_tasks;

 /* Incomplete types for prototype declarations: */
 struct task_struct;
===== kernel/sched.c 1.73 vs edited =====
--- 1.73/kernel/sched.c	Mon Apr 29 09:16:24 2002
+++ edited/kernel/sched.c	Wed May 15 12:58:18 2002
@@ -679,6 +679,10 @@
 	if (p == rq->idle) {
 		if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
 			kstat.per_cpu_system[cpu] += system;
+		else if (atomic_read(&nr_iowait_tasks) > 0)
+			kstat.per_cpu_iowait[cpu] += system;
+		else
+			kstat.per_cpu_idle[cpu] += system;
 #if CONFIG_SMP
 		idle_tick();
 #endif
===== mm/filemap.c 1.87 vs edited =====
--- 1.87/mm/filemap.c	Mon May  6 12:12:36 2002
+++ edited/mm/filemap.c	Tue May 14 14:12:03 2002
@@ -48,6 +48,7 @@
  *        ->sb_lock		(fs/fs-writeback.c)
  */
 spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+atomic_t nr_iowait_tasks = ATOMIC_INIT(0);

 /*
  * Remove a page from the page cache and free it. Caller has to make
@@ -611,8 +612,10 @@
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (!test_bit(bit_nr, &page->flags))
 			break;
+		atomic_inc(&nr_iowait_tasks);
 		sync_page(page);
 		schedule();
+		atomic_dec(&nr_iowait_tasks);
 	} while (test_bit(bit_nr, &page->flags));
 	__set_task_state(tsk, TASK_RUNNING);
 	remove_wait_queue(waitqueue, &wait);
@@ -675,8 +678,10 @@
 	for (;;) {
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (PageLocked(page)) {
+			atomic_inc(&nr_iowait_tasks);
 			sync_page(page);
 			schedule();
+			atomic_dec(&nr_iowait_tasks);
 		}
 		if (!TestSetPageLocked(page))
 			break;


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 16:13             ` Rik van Riel
@ 2002-05-15 16:21               ` William Lee Irwin III
  2002-05-15 17:00               ` William Lee Irwin III
  2002-05-16 11:14               ` Denis Vlasenko
  2 siblings, 0 replies; 28+ messages in thread
From: William Lee Irwin III @ 2002-05-15 16:21 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Denis Vlasenko, linux-kernel, linux-mm

On Wed, 15 May 2002, Denis Vlasenko wrote:
>> I think two patches for the same kernel piece at the same time is
>> too many. Go ahead and code this if you want.

On Wed, May 15, 2002 at 01:13:33PM -0300, Rik van Riel wrote:
> OK, here it is.   Changes against yesterday's patch:
> 1) make sure idle time can never go backwards by incrementing
>    the idle time in the timer interrupt too (surely we can
>    take this overhead if we're idle anyway ;))
> 2) get_request_wait also raises nr_iowait_tasks (thanks akpm)
> This patch is against the latest 2.5 kernel from bk and
> pretty much untested. If you have the time, please test
> it and let me know if it works.

I'll take it for a spin on my 8-way HT box; I can remove enough of
the non-compiling device subsystems to get test boots & runs in there.


Cheers,
Bill

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 16:13             ` Rik van Riel
  2002-05-15 16:21               ` William Lee Irwin III
@ 2002-05-15 17:00               ` William Lee Irwin III
  2002-05-15 18:16                 ` Bill Davidsen
  2002-05-15 18:30                 ` William Lee Irwin III
  2002-05-16 11:14               ` Denis Vlasenko
  2 siblings, 2 replies; 28+ messages in thread
From: William Lee Irwin III @ 2002-05-15 17:00 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Denis Vlasenko, linux-kernel, linux-mm

On Wed, 15 May 2002, Denis Vlasenko wrote:
>> I think two patches for the same kernel piece at the same time is
>> too many. Go ahead and code this if you want.

On Wed, May 15, 2002 at 01:13:33PM -0300, Rik van Riel wrote:
> OK, here it is.   Changes against yesterday's patch:
> 1) make sure idle time can never go backwards by incrementing
>    the idle time in the timer interrupt too (surely we can
>    take this overhead if we're idle anyway ;))
> 2) get_request_wait also raises nr_iowait_tasks (thanks akpm)
> This patch is against the latest 2.5 kernel from bk and
> pretty much untested. If you have the time, please test
> it and let me know if it works.

Boots, compiles and runs on a 4-way physical HT box. I didn't wake
the evil twins, to cut down on the number of variables, so it stayed
4-way despite the ability to go 8-way.

Sliding window of 120 seconds, sampled every 15 seconds, under a
repetitive kernel compile load:

Wed May 15 09:56:37 PDT 2002
cpu  60701 0 5137 203545 9327
cpu0 15048 0 1566 50868 2298
cpu1 15257 0 1176 50818 2392
cpu2 15248 0 1346 50802 2247
cpu3 15148 0 1049 51057 2390

Wed May 15 09:56:52 PDT 2002
cpu  66304 0 5543 203545 9327
cpu0 16460 0 1656 50868 2298
cpu1 16606 0 1330 50818 2392
cpu2 16655 0 1441 50802 2247
cpu3 16583 0 1116 51057 2390

Wed May 15 09:57:07 PDT 2002
cpu  71877 0 5980 203545 9327
cpu0 17849 0 1769 50868 2298
cpu1 17972 0 1466 50818 2392
cpu2 18060 0 1539 50802 2247
cpu3 17996 0 1206 51057 2390

Wed May 15 09:57:22 PDT 2002
cpu  77446 0 6420 203545 9327
cpu0 19269 0 1852 50868 2298
cpu1 19328 0 1612 50818 2392
cpu2 19448 0 1653 50802 2247
cpu3 19401 0 1303 51057 2390

Wed May 15 09:57:37 PDT 2002
cpu  83031 0 6843 203545 9327
cpu0 20699 0 1924 50868 2298
cpu1 20704 0 1738 50818 2392
cpu2 20846 0 1757 50802 2247
cpu3 20782 0 1424 51057 2390

Wed May 15 09:57:52 PDT 2002
cpu  87432 0 7216 204779 9328
cpu0 21788 0 2000 51205 2298
cpu1 21737 0 1845 51180 2393
cpu2 21871 0 1806 51230 2247
cpu3 22036 0 1565 51164 2390

Wed May 15 09:58:07 PDT 2002
cpu  93003 0 7653 204779 9328
cpu0 23178 0 2112 51205 2298
cpu1 23134 0 1950 51180 2393
cpu2 23281 0 1898 51230 2247
cpu3 23410 0 1693 51164 2390

Wed May 15 09:58:22 PDT 2002
cpu  98583 0 8082 204779 9328
cpu0 24538 0 2254 51205 2298
cpu1 24521 0 2065 51180 2393
cpu2 24704 0 1978 51230 2247
cpu3 24820 0 1785 51164 2390


It looks very constant; I'm not sure whether it should be otherwise.


Cheers,
Bill

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] iowait statistics
  2002-05-14  2:18 ` Andrew Morton
  2002-05-14 12:30   ` Rik van Riel
@ 2002-05-15 17:02   ` Denis Vlasenko
  2002-05-16  7:41     ` Andrew Morton
  1 sibling, 1 reply; 28+ messages in thread
From: Denis Vlasenko @ 2002-05-15 17:02 UTC (permalink / raw)
  To: Andrew Morton, Rik van Riel; +Cc: linux-kernel, linux-mm

On 14 May 2002 00:18, Andrew Morton wrote:
> Rik van Riel wrote:
> > 4) on SMP systems the iowait time can be overestimated, no big
> >    deal IMHO but cheap suggestions for improvement are welcome
>
> I suspect that a number of these statistical accounting mechanisms
> are going to break.  The new irq-affinity code works awfully well.
>
> The kernel profiler in 2.5 doesn't work very well at present.
> When investigating this, I ran a busy-wait process.  It attached
> itself to CPU #3 and that CPU received precisely zero interrupts
> across a five minute period.  So the profiler cunningly avoids profiling
> busy CPUs, which is rather counter-productive.  Fortunate that oprofile
> uses NMI.

What, even local APIC interrupts did not happen on CPU#3
during those five minutes?
--
vda

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC][PATCH] iowait statistics
  2002-05-14 16:54     ` William Lee Irwin III
@ 2002-05-15 17:17       ` Denis Vlasenko
  2002-05-15 14:03         ` Rik van Riel
  2002-05-15 15:15         ` Bill Davidsen
  0 siblings, 2 replies; 28+ messages in thread
From: Denis Vlasenko @ 2002-05-15 17:17 UTC (permalink / raw)
  To: William Lee Irwin III, Rik van Riel; +Cc: linux-kernel, linux-mm

On 14 May 2002 14:54, William Lee Irwin III wrote:
> On Tue, 14 May 2002, William Lee Irwin III wrote:
> >> This appears to be global across all cpu's. Maybe nr_iowait_tasks
> >> should be accounted on a per-cpu basis, where
>
> On Tue, May 14, 2002 at 01:36:00PM -0300, Rik van Riel wrote:
> > While your proposal should work, somehow I doubt it's worth
> > the complexity. It's just a statistic to help sysadmins ;)
>
> I reserved judgment on that in order to present a possible mechanism.
> I'm not sure it is either; we'll know it matters if sysadmins scream.

Hi Rik,

Since you are working on this piece of the kernel,

I was investigating why I sometimes see an idle % in top like
9384729374923.43%. It was caused by the idle count in /proc/stat
sometimes going backward.

I found the race responsible for that and have a fix for it
(attached below). It checks for a jiffies change and regenerates
the stats if jiffies++ hit us.

Unfortunately it is for the UP case only; in SMP the race still
exists, even with an SMP kernel on a UP box.

Why: system/user/idle[/iowait] stats are collected at timer int
on UP but _on local APIC int_ on SMP.

It can be fixed for SMP:
* add a spinlock
or
* add per_cpu_idle, account it too at the timer/APIC int
  and get rid of the idle % calculation for /proc/stat

As a user, I vote for glitchless statistics even if they
consume an extra i++ cycle every timer int on every CPU.
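
The absurd percentages follow directly from the unsigned arithmetic:
idle is derived as jif - (user + nice + system), so if a tick bumps the
counters after jif was sampled, the subtraction goes negative and
wraps. A short demonstration with made-up counter values:

#include <stdio.h>

int main(void)
{
	unsigned long jif = 100000;	/* jiffies, sampled first */
	/* counters bumped by a timer tick after jif was read: */
	unsigned long user = 40000, nice = 0, system = 60001;

	/* wraps to ULONG_MAX instead of -1; top turns values like
	 * this into idle percentages like 9384729374923.43% */
	printf("idle = %lu\n", jif - (user + nice + system));
	return 0;
}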

Now you hear the very first scream :-)
--
vda

--- fs/proc/proc_misc.c.orig	Wed Nov 21 03:29:09 2001
+++ fs/proc/proc_misc.c	Thu Apr 25 13:57:55 2002
@@ -239,38 +239,47 @@
 				 int count, int *eof, void *data)
 {
 	int i, len;
-	extern unsigned long total_forks;
-	unsigned long jif = jiffies;
-	unsigned int sum = 0, user = 0, nice = 0, system = 0;
+	extern unsigned long total_forks; /*FIXME: move into a .h */
+	unsigned long jif, sum, user, nice, system;
 	int major, disk;

-	for (i = 0 ; i < smp_num_cpus; i++) {
-		int cpu = cpu_logical_map(i), j;
-
-		user += kstat.per_cpu_user[cpu];
-		nice += kstat.per_cpu_nice[cpu];
-		system += kstat.per_cpu_system[cpu];
+	do {
+		jif=jiffies;
+		sum = user = nice = system = 0;
+		for (i = 0 ; i < smp_num_cpus; i++) {
+			int cpu = cpu_logical_map(i), j;
+			user += kstat.per_cpu_user[cpu];
+			nice += kstat.per_cpu_nice[cpu];
+			system += kstat.per_cpu_system[cpu];
 #if !defined(CONFIG_ARCH_S390)
-		for (j = 0 ; j < NR_IRQS ; j++)
-			sum += kstat.irqs[cpu][j];
+			for (j = 0 ; j < NR_IRQS ; j++)
+				sum += kstat.irqs[cpu][j];
 #endif
-	}
-
-	len = sprintf(page, "cpu  %u %u %u %lu\n", user, nice, system,
-		      jif * smp_num_cpus - (user + nice + system));
-	for (i = 0 ; i < smp_num_cpus; i++)
-		len += sprintf(page + len, "cpu%d %u %u %u %lu\n",
-			i,
-			kstat.per_cpu_user[cpu_logical_map(i)],
-			kstat.per_cpu_nice[cpu_logical_map(i)],
-			kstat.per_cpu_system[cpu_logical_map(i)],
-			jif - (  kstat.per_cpu_user[cpu_logical_map(i)] \
-				   + kstat.per_cpu_nice[cpu_logical_map(i)] \
-				   + kstat.per_cpu_system[cpu_logical_map(i)]));
+		}
+
+		len = sprintf(page, "cpu  %lu %lu %lu %lu\n",
+			    user, nice, system,
+			    jif*smp_num_cpus - (user+nice+system)
+			    );
+		for (i = 0 ; i < smp_num_cpus; i++) {
+			int cpu = cpu_logical_map(i);
+			len += sprintf(page + len, "cpu%d %lu %lu %lu %lu\n",
+				i,
+				(unsigned long)kstat.per_cpu_user[cpu],
+				(unsigned long)kstat.per_cpu_nice[cpu],
+				(unsigned long)kstat.per_cpu_system[cpu],
+				jif - ( kstat.per_cpu_user[cpu]
+					+ kstat.per_cpu_nice[cpu]
+					+ kstat.per_cpu_system[cpu]));
+		}
+	} while(jif!=jiffies); /* regenerate if there was a timer interrupt */
+				/* TODO: check SMP case: SMP uses local APIC ints
+				for kstat updates, not a timer int... */
+
 	len += sprintf(page + len,
 		"page %u %u\n"
 		"swap %u %u\n"
-		"intr %u",
+		"intr %lu",
 			kstat.pgpgin >> 1,
 			kstat.pgpgout >> 1,
 			kstat.pswpin,

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 17:00               ` William Lee Irwin III
@ 2002-05-15 18:16                 ` Bill Davidsen
  2002-05-15 18:30                 ` William Lee Irwin III
  1 sibling, 0 replies; 28+ messages in thread
From: Bill Davidsen @ 2002-05-15 18:16 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Rik van Riel, Denis Vlasenko, linux-kernel, linux-mm

On Wed, 15 May 2002, William Lee Irwin III wrote:

> Boots, compiles and runs on a 4-way physical HT box. I didn't wake
> the evil twins, to cut down on the number of variables, so it stayed
> 4-way despite the ability to go 8-way.
> 
> Sliding window of 120 seconds, sampled every 15 seconds, under a
> repetitive kernel compile load:
> 
> Wed May 15 09:56:37 PDT 2002
> cpu  60701 0 5137 203545 9327
> cpu0 15048 0 1566 50868 2298
> cpu1 15257 0 1176 50818 2392
> cpu2 15248 0 1346 50802 2247
> cpu3 15148 0 1049 51057 2390
	[... snip ...]
> Wed May 15 09:58:22 PDT 2002
> cpu  98583 0 8082 204779 9328
> cpu0 24538 0 2254 51205 2298
> cpu1 24521 0 2065 51180 2393
> cpu2 24704 0 1978 51230 2247
> cpu3 24820 0 1785 51164 2390
> 
> 
> It looks very constant, not sure if it should be otherwise.

You show-offs with your big memory and everything in it... Okay, boot that
puppy with mem=256m and try that again, particularly with -j4 (or -j8 with
HT on). I bet THAT will give you some IOwait!

I think you do want to try HT after you find out the memory is small
enough. Pure curiosity on my part; I assume it will work, although the
results might not be what I expect.

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.


* Re: [RFC][PATCH] iowait statistics
  2002-05-15 17:00               ` William Lee Irwin III
  2002-05-15 18:16                 ` Bill Davidsen
@ 2002-05-15 18:30                 ` William Lee Irwin III
  2002-05-15 18:33                   ` Rik van Riel
  1 sibling, 1 reply; 28+ messages in thread
From: William Lee Irwin III @ 2002-05-15 18:30 UTC (permalink / raw)
  To: Rik van Riel, Denis Vlasenko, linux-kernel, linux-mm

On Wed, May 15, 2002 at 10:00:25AM -0700, William Lee Irwin III wrote:
> Wed May 15 09:58:22 PDT 2002
> cpu  98583 0 8082 204779 9328
> cpu0 24538 0 2254 51205 2298
> cpu1 24521 0 2065 51180 2393
> cpu2 24704 0 1978 51230 2247
> cpu3 24820 0 1785 51164 2390
> 
> It looks very constant, not sure if it should be otherwise.

Not quite constant, just slowly varying:

Wed May 15 11:30:47 PDT 2002
cpu  2095183 0 158967 263950 20705
cpu0 524201 0 40781 64795 5026
cpu1 523034 0 39953 66328 5352
cpu2 525737 0 37989 65826 5115
cpu3 522211 0 40244 67001 5212


Cheers,
Bill

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 18:30                 ` William Lee Irwin III
@ 2002-05-15 18:33                   ` Rik van Riel
  2002-05-15 18:46                     ` William Lee Irwin III
  0 siblings, 1 reply; 28+ messages in thread
From: Rik van Riel @ 2002-05-15 18:33 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Denis Vlasenko, linux-kernel, linux-mm

On Wed, 15 May 2002, William Lee Irwin III wrote:
> On Wed, May 15, 2002 at 10:00:25AM -0700, William Lee Irwin III wrote:
> > Wed May 15 09:58:22 PDT 2002
> > cpu  98583 0 8082 204779 9328
> >
> > It looks very constant, not sure if it should be otherwise.
>
> Not quite constant, just slowly varying:
>
> Wed May 15 11:30:47 PDT 2002
> cpu  2095183 0 158967 263950 20705

Well, with the amount of memory you have in the machine
I expect the time spent in idle and iowait to be fairly
limited during a repetitive kernel compile ;)

If everything "looks" normal in top and vmstat things
should be ok.

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


* Re: [RFC][PATCH] iowait statistics
  2002-05-15 18:33                   ` Rik van Riel
@ 2002-05-15 18:46                     ` William Lee Irwin III
  2002-05-15 19:00                       ` Rik van Riel
  0 siblings, 1 reply; 28+ messages in thread
From: William Lee Irwin III @ 2002-05-15 18:46 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Denis Vlasenko, linux-kernel, linux-mm

On Wed, May 15, 2002 at 03:33:58PM -0300, Rik van Riel wrote:
> Well, with the amount of memory you have in the machine
> I expect the time spent in idle and iowait to be fairly
> limited during a repetitive kernel compile ;)
> If everything "looks" normal in top and vmstat things
> should be ok.

On Wed, May 15, 2002 at 03:33:58PM -0300, Rik van Riel wrote:
$ vmstat 1
   procs                      memory    swap          io     system         cpu
 r  b  w   swpd   free   buff  cache  si  so    bi    bo   in    cs  us  sy  id
17  0  0      0 182880  32364  55180   0   0     2    77   29   229  84   6  10
17  0  0      0 178840  32380  55268   0   0     0  1240  169   766  95   5   0
16  0  0      0 168056  32380  55480   0   0     0     0  104   829  95   5   0
20  0  0      0 197960  32380  55616   0   0     0     0  105  1156  94   6   0
16  0  0      0 179416  32380  55632   0   0     0     0  108   994  95   5   0
18  0  0      0 184428  32380  55692   0   0     0     0  109   954  94   6   0
16  0  0      0 165684  32396  55720   0   0     0  1148  182  1069  96   4   0
17  0  0      0 155572  32396  55732   0   0     0     0  104   854  96   4   0
17  0  0      0 181108  32400  56636   0   0     0     0  104   853  95   5   0
18  0  0      0 194772  32400  56692   0   0     0     0  103   955  94   6   0
16  0  0      0 185068  32400  56856   0   0     0     0  104  1006  95   5   0
18  1  0      0 191744  32408  56972   0   0     0  1808  286  1062  95   5   0
16  0  0      0 202512  32408  57092   0   0     0     0  103   956  94   6   0
17  0  0      0 191116  32408  57140   0   0     0     0  103  1143  91   9   0
17  0  0      0 202032  32408  57288   0   0     0     0  103   899  93   7   0
17  0  0      0 187764  32408  57320   0   0     0     0  103   928  92   8   0
18  1  0      0 198048  32408  57400   0   0     0  1776  294  1026  94   6   0
16  1  0      0 205108  32408  57436   0   0     0     0  104   842  96   4   0
16  0  0      0 181964  32408  57488   0   0     0     0  103  1028  95   5   0
16  0  0      0 175224  32408  57524   0   0     0     0  104   961  95   5   0

All good there. OTOH:

$ top
fscanf failed on /proc/stat for cpu 1
fscanf failed on /proc/stat for cpu 2
fscanf failed on /proc/stat for cpu 3

... and it proceeds thus:

 11:45am  up  2:00,  4 users,  load average: 17.18, 16.86, 16.94
52 processes: 50 sleeping, 2 running, 0 zombie, 0 stopped
CPU0 states: 72.2% user, 27.1% system,  0.0% nice,  0.2% idle
fscanf failed on /proc/stat for cpu 1
Mem:   499088K av,  174936K used,  324152K free,       0K shrd,   32684K buff
Swap: 1052248K av,       0K used, 1052248K free                   61812K cached

  PID USER     PRI  NI  SIZE  RSS SHARE STAT %CPU %MEM   TIME COMMAND
26200 wli       25   0 13780  13M  2204 R    10.0  2.7   0:01 cc1
26246 wli       15   0  1060 1060   804 R     0.1  0.2   0:00 top
    1 root      15   0   492  492   428 S     0.0  0.0   0:00 init
    2 root      0K   0     0    0     0 SW    0.0  0.0   0:00 migration_CPU0
    3 root      0K   0     0    0     0 SW    0.0  0.0   0:00 migration_CPU1
    4 root      0K   0     0    0     0 SW    0.0  0.0   0:00 migration_CPU2
    5 root      0K   0     0    0     0 SW    0.0  0.0   0:00 migration_CPU3
    6 root      15   0     0    0     0 SW    0.0  0.0   0:00 keventd
    7 root      34  19     0    0     0 SWN   0.0  0.0   0:00 ksoftirqd_CPU0
    8 root      34  19     0    0     0 SWN   0.0  0.0   0:00 ksoftirqd_CPU1
    9 root      34  19     0    0     0 SWN   0.0  0.0   0:00 ksoftirqd_CPU2
   10 root      34  19     0    0     0 SWN   0.0  0.0   0:00 ksoftirqd_CPU3
   11 root      25   0     0    0     0 SW    0.0  0.0   0:00 kswapd
   12 root      25   0     0    0     0 SW    0.0  0.0   0:00 pdflush



... i.e. a little bit of recurring fscanf stuff.

Cheers,
Bill

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 18:46                     ` William Lee Irwin III
@ 2002-05-15 19:00                       ` Rik van Riel
  2002-05-16 11:42                         ` Denis Vlasenko
  0 siblings, 1 reply; 28+ messages in thread
From: Rik van Riel @ 2002-05-15 19:00 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: Denis Vlasenko, linux-kernel, linux-mm

On Wed, 15 May 2002, William Lee Irwin III wrote:

> $ vmstat 1
>    procs                      memory    swap          io     system         cpu
>  r  b  w   swpd   free   buff  cache  si  so    bi    bo   in    cs  us  sy  id
> 17  0  0      0 182880  32364  55180   0   0     2    77   29   229  84   6  10
> 16  0  0      0 175224  32408  57524   0   0     0     0  104   961  95   5   0
>
> All good there. OTOH:
>
> $ top
> fscanf failed on /proc/stat for cpu 1

Doh, take a look at top.c around line 1460:

              for(i = 0; i < nr_cpu; i++) {
                if(fscanf(file, "cpu%*d %d %d %d %d\n",
                          &u_ticks, &n_ticks, &s_ticks, &i_ticks) != 4) {
                  fprintf(stderr, "fscanf failed on /proc/stat for cpu %d\n", i);

It would have been OK (like vmstat) if it didn't expect the \n
right after the fourth number ;/

Oh well, time for another procps patch ;)
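A sketch of one possible fix (not the actual procps patch): read the
whole line first, then parse only the leading fields, so any trailing
columns such as the new iowait one are simply ignored:

	char buf[256];
	if (!fgets(buf, sizeof(buf), file) ||
	    sscanf(buf, "cpu%*d %d %d %d %d",
		   &u_ticks, &n_ticks, &s_ticks, &i_ticks) != 4)
		fprintf(stderr, "fscanf failed on /proc/stat for cpu %d\n", i);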

cheers,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


* Re: [RFC][PATCH] iowait statistics
  2002-05-15 14:03         ` Rik van Riel
@ 2002-05-15 20:17           ` Denis Vlasenko
  2002-05-15 16:13             ` Rik van Riel
  0 siblings, 1 reply; 28+ messages in thread
From: Denis Vlasenko @ 2002-05-15 20:17 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-kernel, linux-mm

On 15 May 2002 12:03, Rik van Riel wrote:
> > I was investigating why top sometimes shows an idle % like
> > 9384729374923.43%. It turned out to be caused by the idle count in
> > /proc/stat occasionally going backward.
>
> Thanks for tracking down this bug.
>
> > It can be fixed for SMP:
> > * add a spinlock
> > or
> > * add per_cpu_idle, account it too at the timer/APIC int,
> >   and get rid of the idle % calculation for /proc/stat
> >
> > As a user, I vote for glitchless statistics even if they
> > consume an extra i++ cycle every timer int on every CPU.
>
> Same for me. The last option is probably easiest to implement
> and cheapest at run time.

I think two patches for the same piece of the kernel at the same
time is one too many. Go ahead and code this if you want.

> The extra "cost" will approach zero
> once somebody takes the time to put the per-cpu stats on per
> cpu cache lines, which I'm sure somebody will do once we have
> enough per-cpu stats ;)

I thought about that too: a per_cpu_xxx[cpu] -> per_cpu[cpu].xxx
type of thing.
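Something like this, perhaps (a sketch of the layout idea only; the
struct and its name are made up):

	/* one stats block per CPU, padded out to its own cache
	   line so that CPUs never write to a shared line */
	struct kstat_percpu {
		unsigned int user, nice, system, iowait, idle;
	} ____cacheline_aligned;

	struct kstat_percpu per_cpu_stats[NR_CPUS];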
--
vda

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 17:02   ` Denis Vlasenko
@ 2002-05-16  7:41     ` Andrew Morton
  2002-05-16 14:04       ` Denis Vlasenko
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew Morton @ 2002-05-16  7:41 UTC (permalink / raw)
  To: vda; +Cc: Rik van Riel, linux-kernel, linux-mm

Denis Vlasenko wrote:
> 
> On 14 May 2002 00:18, Andrew Morton wrote:
> > Rik van Riel wrote:
> > > 4) on SMP systems the iowait time can be overestimated, no big
> > >    deal IMHO but cheap suggestions for improvement are welcome
> >
> > I suspect that a number of these statistical accounting mechanisms
> > are going to break.  The new irq-affinity code works awfully well.
> >
> > The kernel profiler in 2.5 doesn't work very well at present.
> > When investigating this, I ran a busy-wait process.  It attached
> > itself to CPU #3 and that CPU received precisely zero interrupts
> > across a five minute period.  So the profiler cunningly avoids profiling
> > busy CPUs, which is rather counter-productive.  Fortunate that oprofile
> > uses NMI.
> 
> What, even local APIC interrupts did not happen on CPU#3
> in these five mins?

CPU1 is busy; note that its IO-APIC timer count stays frozen across
the ten seconds while the LOC (local APIC) counts keep advancing on
all four CPUs:

quad:/home/akpm> cat /proc/interrupts ; sleep 10 ; cat /proc/interrupts
           CPU0       CPU1       CPU2       CPU3       
  0:      36059      33847      38948      33846    IO-APIC-edge  timer
  1:          1          1          1          4    IO-APIC-edge  keyboard
  2:          0          0          0          0          XT-PIC  cascade
  4:          1          1          1          0    IO-APIC-edge  GDB-stub
  8:          0          0          0          1    IO-APIC-edge  rtc
 12:          0          1          0          0    IO-APIC-edge  PS/2 Mouse
 14:          1          2          0          3    IO-APIC-edge  ide0
 15:       7558       7557       7633       8025    IO-APIC-edge  ide1
 19:      17088      17707      17210      18610   IO-APIC-level  ide2, ide3, ide4, ide5
 35:         38         71         56        174   IO-APIC-level  aic7xxx
 38:        955       1798        584        517   IO-APIC-level  eth0
 58:      25368      19911      27931      20695   IO-APIC-level  aic7xxx
NMI:     164030     164030     164030     164030 
LOC:     142543     142543     142542     142542 
ERR:          0
MIS:          0
           CPU0       CPU1       CPU2       CPU3       
  0:      36388      33847      39289      34178    IO-APIC-edge  timer
  1:          1          1          1          4    IO-APIC-edge  keyboard
  2:          0          0          0          0          XT-PIC  cascade
  4:          1          1          1          0    IO-APIC-edge  GDB-stub
  8:          0          0          0          1    IO-APIC-edge  rtc
 12:          0          1          0          0    IO-APIC-edge  PS/2 Mouse
 14:          1          2          0          3    IO-APIC-edge  ide0
 15:       7565       7557       7633       8026    IO-APIC-edge  ide1
 19:      17088      17707      17210      18610   IO-APIC-level  ide2, ide3, ide4, ide5
 35:         38         71         56        174   IO-APIC-level  aic7xxx
 38:        969       1798        590        525   IO-APIC-level  eth0
 58:      25368      19911      27931      20695   IO-APIC-level  aic7xxx
NMI:     165032     165032     165032     165032 
LOC:     143545     143545     143544     143544 
ERR:          0
MIS:          0


-

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 15:15         ` Bill Davidsen
@ 2002-05-16 10:58           ` Denis Vlasenko
  0 siblings, 0 replies; 28+ messages in thread
From: Denis Vlasenko @ 2002-05-16 10:58 UTC (permalink / raw)
  To: Bill Davidsen; +Cc: Rik van Riel, Linux Kernel Mailing List, linux-mm

On 15 May 2002 13:15, you wrote:
> On Wed, 15 May 2002, Denis Vlasenko wrote:
> > It can be fixed for SMP:
> > * add a spinlock
> > or
> > * add per_cpu_idle, account it too at the timer/APIC int,
> >   and get rid of the idle % calculation for /proc/stat
> >
> > As a user, I vote for glitchless statistics even if they
> > consume an extra i++ cycle every timer int on every CPU.
>
> You have pointed out the problem, but since your fix is UP only and
> doesn't have the iowait stuff, I think more of same is needed. I don't
> recall seeing this with preempt, but I am not a top user unless I'm
> looking for problems.

I just wanted to inform Rik of this small problem. Since he's going
to fiddle with the stats, he can fix this on the way.
BTW, the bug is easily triggered on an SMP kernel and very hard to
see (but it definitely happens) on UP; I bet you'll see it with
preempt too.

Try these two scripts:

#!/bin/sh
# Prints dots until the bad thing happens,
# then prints old_idle_cnt -> new_idle_cnt
prev=0
while true; do cat /proc/stat; done | \
grep -F 'cpu  ' | \
cut -d ' ' -f 6 | \
while read next; do
    echo -n .
    diff=$(($next-$prev))
    if test $diff -lt 0; then
	echo "$prev -> $next"
    fi
    prev=$next
done

#!/bin/sh
# Prints the cpu line from /proc/stat repeatedly.
# When the bad thing happens, flags the line with '<<<'
prev=0
while true; do cat /proc/stat; done | \
grep -F 'cpu  ' | \
while read line; do
    next=`echo "$line" | cut -d ' ' -f 6`
    diff=$(($next-$prev))
    if test $diff -lt 0; then
	echo "$line <<<"
    else
	echo "$line"
    fi
    prev=$next
done

--
vda

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 16:13             ` Rik van Riel
  2002-05-15 16:21               ` William Lee Irwin III
  2002-05-15 17:00               ` William Lee Irwin III
@ 2002-05-16 11:14               ` Denis Vlasenko
  2 siblings, 0 replies; 28+ messages in thread
From: Denis Vlasenko @ 2002-05-16 11:14 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-kernel, linux-mm

On 15 May 2002 14:13, Rik van Riel wrote:
> On Wed, 15 May 2002, Denis Vlasenko wrote:
> > I think two patches for the same piece of the kernel at the same
> > time is one too many. Go ahead and code this if you want.
>
> OK, here it is.   Changes against yesterday's patch:
>
> 1) make sure idle time can never go backwards by incrementing
>    the idle time in the timer interrupt too (surely we can
>    take this overhead if we're idle anyway ;))
>
> 2) get_request_wait also raises nr_iowait_tasks (thanks akpm)
>
> This patch is against the latest 2.5 kernel from bk and
> pretty much untested. If you have the time, please test
> it and let me know if it works.


--- 1.73/kernel/sched.c Mon Apr 29 09:16:24 2002
+++ edited/kernel/sched.c       Wed May 15 12:58:18 2002
@@ -679,6 +679,10 @@
        if (p == rq->idle) {
		[*]
                if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
                        kstat.per_cpu_system[cpu] += system;
+               else if (atomic_read(&nr_iowait_tasks) > 0)
+                       kstat.per_cpu_iowait[cpu] += system;
+               else
+                       kstat.per_cpu_idle[cpu] += system;

[*] Maybe add an if (system != 0) check there?
--
vda

* Re: [RFC][PATCH] iowait statistics
  2002-05-15 19:00                       ` Rik van Riel
@ 2002-05-16 11:42                         ` Denis Vlasenko
  0 siblings, 0 replies; 28+ messages in thread
From: Denis Vlasenko @ 2002-05-16 11:42 UTC (permalink / raw)
  To: Rik van Riel, William Lee Irwin III; +Cc: linux-kernel, linux-mm

On 15 May 2002 17:00, Rik van Riel wrote:
> > $ top
> > fscanf failed on /proc/stat for cpu 1
>
> Doh, take a look at top.c around line 1460:
>
>               for(i = 0; i < nr_cpu; i++) {
>                 if(fscanf(file, "cpu%*d %d %d %d %d\n",
>                           &u_ticks, &n_ticks, &s_ticks, &i_ticks) != 4) {
>                   fprintf(stderr, "fscanf failed on /proc/stat for cpu %d\n", i);
>
> It would have been ok (like vmstat) if it didn't expect the \n
> after the fourth number ;/
>
> Oh well, time for another procps patch ;)

While you're at it:

          printf("CPU states:"
                 " %2ld.%ld%% user, %2ld.%ld%% system,"
                 " %2ld.%ld%% nice, %2ld.%ld%% idle",
                 user_ticks / 10UL, user_ticks % 10UL,
                 system_ticks / 10UL, system_ticks % 10UL,
                 nice_ticks / 10UL, nice_ticks % 10UL,
                 idle_ticks / 10UL, idle_ticks % 10UL);

" %2ld" -> "%3ld" will make 100.00% look much nicer:
Current code: " 34.56%" " 100.00%" (i.e. 100% is one char wider!)
New code:     " 34.56%" "100.00%"
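A quick sanity check of the widths, with made-up tick values and the
one-decimal format above:

	printf("%3ld.%ld%%\n",  345L / 10,  345L % 10);	/* " 34.5%" */
	printf("%3ld.%ld%%\n", 1000L / 10, 1000L % 10);	/* "100.0%" */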

Same here:

          printf ("CPU%d states: %2d.%-d%% user, %2d.%-d%% system,"
                  " %2d.%-d%% nice, %2d.%-d%% idle",
                  cpumap,

Another thing: in light of the move towards 64-bit jiffies, wouldn't
it be wise to use unsigned long long (or an explicit u64) for all
these numbers?
--
vda

* Re: [RFC][PATCH] iowait statistics
  2002-05-16  7:41     ` Andrew Morton
@ 2002-05-16 14:04       ` Denis Vlasenko
  0 siblings, 0 replies; 28+ messages in thread
From: Denis Vlasenko @ 2002-05-16 14:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

On 16 May 2002 05:41, Andrew Morton wrote:
> > What, even local APIC interrupts did not happen on CPU#3
> > in these five mins?
>
> CPU1 is busy:
>
> quad:/home/akpm> cat /proc/interrupts ; sleep 10 ; cat /proc/interrupts
>            CPU0       CPU1       CPU2       CPU3
> LOC:     142543     142543     142542     142542
> LOC:     143545     143545     143544     143544

OK, local APIC ints *are* delivered just fine.
--
vda


Thread overview: 28+ messages
2002-05-14  1:19 [RFC][PATCH] iowait statistics Rik van Riel
2002-05-14  2:18 ` Andrew Morton
2002-05-14 12:30   ` Rik van Riel
2002-05-15 17:02   ` Denis Vlasenko
2002-05-16  7:41     ` Andrew Morton
2002-05-16 14:04       ` Denis Vlasenko
2002-05-14 15:39 ` William Lee Irwin III
2002-05-14 16:36   ` Rik van Riel
2002-05-14 16:54     ` William Lee Irwin III
2002-05-15 17:17       ` Denis Vlasenko
2002-05-15 14:03         ` Rik van Riel
2002-05-15 20:17           ` Denis Vlasenko
2002-05-15 16:13             ` Rik van Riel
2002-05-15 16:21               ` William Lee Irwin III
2002-05-15 17:00               ` William Lee Irwin III
2002-05-15 18:16                 ` Bill Davidsen
2002-05-15 18:30                 ` William Lee Irwin III
2002-05-15 18:33                   ` Rik van Riel
2002-05-15 18:46                     ` William Lee Irwin III
2002-05-15 19:00                       ` Rik van Riel
2002-05-16 11:42                         ` Denis Vlasenko
2002-05-16 11:14               ` Denis Vlasenko
2002-05-15 15:15         ` Bill Davidsen
2002-05-16 10:58           ` Denis Vlasenko
2002-05-14 18:19     ` Martin J. Bligh
2002-05-15  1:31 ` Bill Davidsen
2002-05-15  1:41   ` William Lee Irwin III
2002-05-15 14:39     ` Bill Davidsen
