linux-kernel.vger.kernel.org archive mirror
* [PATCH] Nick's scheduler policy
@ 2003-08-24 12:35 Nick Piggin
  2003-08-24 14:29 ` Felipe Alfaro Solana
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Nick Piggin @ 2003-08-24 12:35 UTC (permalink / raw)
  To: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 849 bytes --]

Hi,
Patch against 2.6.0-test4. It fixes a lot of problems here vs
previous versions. There aren't really any open issues for me, so
testers would be welcome.

The big change is more dynamic timeslices, which allows "interactive"
tasks to get very small timeslices while more compute intensive loads
can be given bigger timeslices than usual. This works properly with
nice (niced processes will tend to get bigger timeslices).
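
For the curious: the interactivity estimate behind this is now a
decaying sleep/total time ratio instead of the old bonus counter. A
simplified userspace restatement of the add_task_time() logic from the
attached patch (illustration only, not the kernel code itself):

#include <stdio.h>

#define MAX_SLEEP 1000	/* one second's worth of jiffies at HZ=1000 */

struct est { unsigned long total, sleep, avg; };

static void add_time(struct est *e, unsigned long time, int slept)
{
	if (time == 0)
		return;
	if (time > MAX_SLEEP) {
		/* ancient history: start afresh */
		time = MAX_SLEEP;
		e->total = e->sleep = 0;
	} else {
		/* decay old samples in proportion to the new interval */
		unsigned long r = MAX_SLEEP - time;
		e->total = (r * e->total + MAX_SLEEP/2) / MAX_SLEEP;
		e->sleep = (r * e->sleep + MAX_SLEEP/2) / MAX_SLEEP;
	}
	e->total += 1000 * time;
	if (slept)
		e->sleep += 1000 * time;
	e->avg = (1000 * e->sleep) / e->total;	/* 0..1000 */
}

int main(void)
{
	struct est e = { 0, 0, 0 };

	add_time(&e, 900, 1);	/* sleeps for 900ms */
	add_time(&e, 100, 0);	/* then runs for 100ms */
	printf("sleep_avg = %lu\n", e.avg);	/* ~890: rated interactive */
	return 0;
}

An avg near 1000 means the task has mostly been sleeping, so it gets a
priority boost and a small timeslice; an avg near 0 marks a CPU hog.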

I think I have cured test-starve too.

The worst-case scheduling latency I have observed for a very nice (+19)
process is about 210ms, during a niced make -j3 and frantic window
moving in X, on a UP box... I did see one of Con's recent kernels have
a worst case of >7s.

On the other hand, I expect the best cases and maybe most usual cases
would be better on Con's... and Con might have since done some work
in the latency area.



[-- Attachment #2: sched-policy-6 --]
[-- Type: text/plain, Size: 20993 bytes --]

--- archs/linux-2.6/include/linux/sched.h	2003-08-24 18:58:59.000000000 +1000
+++ linux-2.6/include/linux/sched.h	2003-08-24 20:29:37.000000000 +1000
@@ -281,7 +281,9 @@ struct signal_struct {
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
- 
+
+#define rt_task(p)		((p)->prio < MAX_RT_PRIO)
+
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -339,12 +341,16 @@ struct task_struct {
 	struct list_head run_list;
 	prio_array_t *array;
 
+	unsigned long array_sequence;
+	unsigned long timestamp;
+
+	unsigned long total_time, sleep_time;
 	unsigned long sleep_avg;
-	unsigned long last_run;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
 	unsigned int time_slice, first_time_slice;
+	unsigned int used_slice;
 
 	struct list_head tasks;
 	struct list_head ptrace_children;
@@ -552,6 +558,7 @@ extern int FASTCALL(wake_up_state(struct
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
 extern int FASTCALL(wake_up_process_kick(struct task_struct * tsk));
 extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(sched_fork(task_t * p));
 extern void FASTCALL(sched_exit(task_t * p));
 
 asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
--- archs/linux-2.6/kernel/fork.c	2003-08-24 18:58:59.000000000 +1000
+++ linux-2.6/kernel/fork.c	2003-08-24 20:32:15.000000000 +1000
@@ -911,33 +911,9 @@ struct task_struct *copy_process(unsigne
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 
-	/*
-	 * Share the timeslice between parent and child, thus the
-	 * total amount of pending timeslices in the system doesn't change,
-	 * resulting in more scheduling fairness.
-	 */
-	local_irq_disable();
-        p->time_slice = (current->time_slice + 1) >> 1;
-	/*
-	 * The remainder of the first timeslice might be recovered by
-	 * the parent if the child exits early enough.
-	 */
-	p->first_time_slice = 1;
-	current->time_slice >>= 1;
-	p->last_run = jiffies;
-	if (!current->time_slice) {
-		/*
-	 	 * This case is rare, it happens when the parent has only
-	 	 * a single jiffy left from its timeslice. Taking the
-		 * runqueue lock is not a problem.
-		 */
-		current->time_slice = 1;
-		preempt_disable();
-		scheduler_tick(0, 0);
-		local_irq_enable();
-		preempt_enable();
-	} else
-		local_irq_enable();
+	/* Perform scheduler related accounting */
+	sched_fork(p);
+
 	/*
 	 * Ok, add it to the run-queues and make it
 	 * visible to the rest of the system.
--- archs/linux-2.6/kernel/sched.c	2003-08-24 18:58:59.000000000 +1000
+++ linux-2.6/kernel/sched.c	2003-08-24 22:15:30.000000000 +1000
@@ -66,73 +66,17 @@
  * maximum timeslice is 200 msecs. Timeslices get refilled after
  * they expire.
  */
-#define MIN_TIMESLICE		( 10 * HZ / 1000)
-#define MAX_TIMESLICE		(200 * HZ / 1000)
-#define CHILD_PENALTY		50
-#define PARENT_PENALTY		100
-#define EXIT_WEIGHT		3
-#define PRIO_BONUS_RATIO	25
-#define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(10*HZ)
-#define STARVATION_LIMIT	(10*HZ)
-#define NODE_THRESHOLD		125
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- *  priority range a task can explore, a value of '1' means the
- *  task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define SCALE(v1,v1_max,v2_max) \
-	(v1) * (v2_max) / (v1_max)
+#define MIN_TIMESLICE		((1 * HZ / 1000) ? 1 * HZ / 1000 : 1)
+#define MAX_TIMESLICE		(20 * MIN_TIMESLICE) /* This cannot be changed */
 
-#define DELTA(p) \
-	(SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
-		INTERACTIVE_DELTA)
+#define MAX_SLEEP		(HZ)
 
-#define TASK_INTERACTIVE(p) \
-	((p)->prio <= (p)->static_prio - DELTA(p))
-
-/*
- * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- *
- * task_timeslice() is the interface that is used by the scheduler.
- */
-
-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
-	((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
+#define NODE_THRESHOLD		125
 
-static inline unsigned int task_timeslice(task_t *p)
-{
-	return BASE_TIMESLICE(p);
-}
+#define TASK_PREEMPTS_CURR(p, rq)			\
+	( (p)->prio < (rq)->curr->prio			\
+	 	|| ((p)->prio == (rq)->curr->prio	\
+			 && (p)->static_prio < (rq)->curr->static_prio) )
 
 /*
  * These are the runqueue data structures:
@@ -157,7 +101,8 @@ struct prio_array {
  */
 struct runqueue {
 	spinlock_t lock;
-	unsigned long nr_running, nr_switches, expired_timestamp,
+	unsigned long array_sequence;
+	unsigned long nr_running, nr_switches,
 			nr_uninterruptible;
 	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
@@ -179,7 +124,6 @@ static DEFINE_PER_CPU(struct runqueue, r
 #define this_rq()		(&__get_cpu_var(runqueues))
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
-#define rt_task(p)		((p)->prio < MAX_RT_PRIO)
 
 /*
  * Default context-switch locking:
@@ -298,35 +242,79 @@ static inline void enqueue_task(struct t
 	p->array = array;
 }
 
+static void add_task_time(task_t *p, unsigned long time, int sleep)
+{
+	if (time == 0)
+		return;
+
+	if (time > MAX_SLEEP) {
+		time = MAX_SLEEP;
+		p->total_time = 0;
+		p->sleep_time = 0;
+	} else {
+		unsigned long r;
+
+		r = MAX_SLEEP - time;
+		p->total_time = (r*p->total_time + MAX_SLEEP/2) / MAX_SLEEP;
+		p->sleep_time = (r*p->sleep_time + MAX_SLEEP/2) / MAX_SLEEP;
+	}
+
+	p->total_time += 1000 * time;
+	if (sleep)
+		p->sleep_time += 1000 * time;
+
+	p->sleep_avg = (1000 * p->sleep_time) / p->total_time;
+}
+
 /*
- * effective_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
  */
-static int effective_prio(task_t *p)
+static unsigned int task_timeslice(task_t *p, runqueue_t *rq)
+{
+	int idx, delta;
+	unsigned int base, timeslice;
+
+	if (rt_task(p))
+		return MAX_TIMESLICE;
+	
+#if 0
+	unsigned int timeslice = MIN_TIMESLICE +
+		( (MAX_USER_PRIO - USER_PRIO(p->prio))
+		* (MAX_TIMESLICE - MIN_TIMESLICE) )
+		/ MAX_USER_PRIO;
+#endif
+	idx = min(find_next_bit(rq->active->bitmap, MAX_PRIO, MAX_RT_PRIO),
+			find_next_bit(rq->expired->bitmap, MAX_PRIO, MAX_RT_PRIO));
+	idx = min(idx, p->prio);
+	delta = p->prio - idx;
+
+	base = MIN_TIMESLICE * MAX_USER_PRIO / (delta + 1);
+	timeslice = base * (USER_PRIO(idx) + 4) / 4;
+
+	if (timeslice == 0)
+		timeslice = 1;
+
+	return timeslice;
+}
+
+static unsigned long task_priority(task_t *p)
 {
 	int bonus, prio;
 
 	if (rt_task(p))
 		return p->prio;
 
-	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
-			MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+	bonus = (MAX_USER_PRIO * p->sleep_avg) / 1000 / 2;
+	prio = USER_PRIO(p->static_prio) / 2 + (MAX_USER_PRIO / 2);
 
-	prio = p->static_prio - bonus;
+	prio = MAX_RT_PRIO + prio - bonus;
 	if (prio < MAX_RT_PRIO)
 		prio = MAX_RT_PRIO;
 	if (prio > MAX_PRIO-1)
 		prio = MAX_PRIO-1;
+
 	return prio;
 }
 
@@ -347,34 +335,38 @@ static inline void __activate_task(task_
  */
 static inline void activate_task(task_t *p, runqueue_t *rq)
 {
-	long sleep_time = jiffies - p->last_run - 1;
+	unsigned long sleep = jiffies - p->timestamp;
+	p->timestamp = jiffies;
 
-	if (sleep_time > 0) {
-		int sleep_avg;
+	if (sleep > MAX_SLEEP)
+		sleep = MAX_SLEEP;
 
-		/*
-		 * This code gives a bonus to interactive tasks.
-		 *
-		 * The boost works by updating the 'average sleep time'
-		 * value here, based on ->last_run. The more time a task
-		 * spends sleeping, the higher the average gets - and the
-		 * higher the priority boost gets as well.
-		 */
-		sleep_avg = p->sleep_avg + sleep_time;
+	if (!in_interrupt() && current->mm) {
+		unsigned long boost = sleep / 2;
+		add_task_time(current, boost, 1);
+		add_task_time(p, sleep - boost, 1);
+	} else {
+		add_task_time(p, sleep, 1);
+		
+		if (in_interrupt())
+			add_task_time(p, sleep / 2, 1);
+	}
 
-		/*
-		 * 'Overflow' bonus ticks go to the waker as well, so the
-		 * ticks are not lost. This has the effect of further
-		 * boosting tasks that are related to maximum-interactive
-		 * tasks.
-		 */
-		if (sleep_avg > MAX_SLEEP_AVG)
-			sleep_avg = MAX_SLEEP_AVG;
-		if (p->sleep_avg != sleep_avg) {
-			p->sleep_avg = sleep_avg;
-			p->prio = effective_prio(p);
-		}
+	p->prio = task_priority(p);
+
+	if (rq->array_sequence != p->array_sequence) {
+		p->used_slice = 0;
 	}
+
+	if (!in_interrupt() && current->mm) {
+		unsigned long steal;
+
+		steal = min((unsigned int)sleep / 2,
+				(task_timeslice(p, rq) - p->used_slice) / 2);
+		p->used_slice += steal;
+		current->used_slice -= steal;
+	}
+
 	__activate_task(p, rq);
 }
 
@@ -383,6 +375,7 @@ static inline void activate_task(task_t 
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
+	p->array_sequence = rq->array_sequence;
 	nr_running_dec(rq);
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
@@ -426,7 +419,7 @@ static inline void resched_task(task_t *
  * be called with interrupts off, or it may introduce deadlock with
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
- */
+ */
 void wait_task_inactive(task_t * p)
 {
 	unsigned long flags;
@@ -497,11 +490,9 @@ repeat_lock_task:
 			}
 			if (old_state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible--;
-			if (sync)
-				__activate_task(p, rq);
-			else {
-				activate_task(p, rq);
-				if (p->prio < rq->curr->prio)
+			activate_task(p, rq);
+			if (!sync) {
+				if (TASK_PREEMPTS_CURR(p, rq))
 					resched_task(rq->curr);
 			}
 			success = 1;
@@ -534,36 +525,74 @@ int wake_up_state(task_t *p, unsigned in
 }
 
 /*
+ * Perform scheduler related accounting for a newly forked process @p.
+ * @p is forked by current.
+ */
+void sched_fork(task_t *p)
+{
+	unsigned long ts;
+	unsigned long flags;
+	runqueue_t *rq;
+
+	/*
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness.
+	 */
+	local_irq_disable();
+	p->timestamp = jiffies;
+	rq = task_rq_lock(current, &flags);
+	ts = task_timeslice(current, rq);
+	task_rq_unlock(rq, &flags);
+	p->used_slice = current->used_slice + (ts - current->used_slice + 1) / 2;
+	current->used_slice += (ts - current->used_slice) / 2;
+	/*
+	 * The remainder of the first timeslice might be recovered by
+	 * the parent if the child exits early enough.
+	 */
+	p->first_time_slice = 1;
+	if (current->used_slice >= ts) {
+		/*
+	 	 * This case is rare, it happens when the parent has only
+	 	 * a single jiffy left from its timeslice. Taking the
+		 * runqueue lock is not a problem.
+		 */
+		current->used_slice = ts + 1;
+		preempt_disable();
+		scheduler_tick(0, 0);
+		local_irq_enable();
+		preempt_enable();
+	} else
+		local_irq_enable();
+}
+	
+/*
  * wake_up_forked_process - wake up a freshly forked process.
  *
  * This function will do some initial scheduler statistics housekeeping
  * that must be done for every newly created process.
  */
-void wake_up_forked_process(task_t * p)
+void wake_up_forked_process(task_t *p)
 {
 	unsigned long flags;
 	runqueue_t *rq = task_rq_lock(current, &flags);
 
 	p->state = TASK_RUNNING;
-	/*
-	 * We decrease the sleep average of forking parents
-	 * and children as well, to keep max-interactive tasks
-	 * from forking tasks that are max-interactive.
-	 */
-	current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
-	p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
-	p->prio = effective_prio(p);
+
 	set_task_cpu(p, smp_processor_id());
 
-	if (unlikely(!current->array))
-		__activate_task(p, rq);
-	else {
-		p->prio = current->prio;
-		list_add_tail(&p->run_list, &current->run_list);
-		p->array = current->array;
-		p->array->nr_active++;
-		nr_running_inc(rq);
-	}
+	p->sleep_time = current->sleep_time / 40;
+	p->total_time = current->total_time / 40;
+	p->sleep_avg = current->sleep_avg;
+
+	current->sleep_time = 2 * (current->sleep_time) / 3;
+	if (current->total_time != 0)
+		current->sleep_avg = (1000 * current->sleep_time)
+						/ current->total_time;
+
+	p->prio = task_priority(p);
+	__activate_task(p, rq);
+
 	task_rq_unlock(rq, &flags);
 }
 
@@ -582,18 +611,13 @@ void sched_exit(task_t * p)
 
 	local_irq_save(flags);
 	if (p->first_time_slice) {
-		p->parent->time_slice += p->time_slice;
-		if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
-			p->parent->time_slice = MAX_TIMESLICE;
+		unsigned long flags;
+		runqueue_t *rq;
+		rq = task_rq_lock(p, &flags);
+		p->parent->used_slice -= task_timeslice(p, rq) - p->used_slice;
+		task_rq_unlock(rq, &flags);
 	}
 	local_irq_restore(flags);
-	/*
-	 * If the child was a (relative-) CPU hog then decrease
-	 * the sleep_avg of the parent as well.
-	 */
-	if (p->sleep_avg < p->parent->sleep_avg)
-		p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
-			p->sleep_avg) / (EXIT_WEIGHT + 1);
 }
 
 /**
@@ -995,13 +1019,29 @@ static inline void pull_task(runqueue_t 
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	if (p->prio < this_rq->curr->prio)
+	if (TASK_PREEMPTS_CURR(p, this_rq))
 		set_need_resched();
-	else {
-		if (p->prio == this_rq->curr->prio &&
-				p->time_slice > this_rq->curr->time_slice)
-			set_need_resched();
-	}
+}
+
+/*
+ * Skip running tasks, pinned tasks and, when idle balancing, cache-hot tasks.
+ */
+
+static inline int
+can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
+{
+	unsigned long delta;
+
+	if (task_running(rq, tsk))
+		return 0;
+	if (!cpu_isset(this_cpu, tsk->cpus_allowed))
+		return 0;
+
+	delta = jiffies - tsk->timestamp;
+	if (idle && (delta <= cache_decay_ticks))
+		return 0;
+
+	return 1;
 }
 
 /*
@@ -1063,14 +1103,9 @@ skip_queue:
 	 * 3) are cache-hot on their current CPU.
 	 */
 
-#define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
-	((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) &&	\
-		!task_running(rq, p) &&					\
-			cpu_isset(this_cpu, (p)->cpus_allowed))
-
 	curr = curr->prev;
 
-	if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+	if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1171,20 +1206,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
- */
-#define EXPIRED_STARVING(rq) \
-		(STARVATION_LIMIT && ((rq)->expired_timestamp && \
-		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
-
-/*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  *
@@ -1201,17 +1222,11 @@ void scheduler_tick(int user_ticks, int 
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_ticks);
 
-	/* note: this timer irq context must be accounted for as well */
-	if (hardirq_count() - HARDIRQ_OFFSET) {
-		cpustat->irq += sys_ticks;
-		sys_ticks = 0;
-	} else if (softirq_count()) {
-		cpustat->softirq += sys_ticks;
-		sys_ticks = 0;
-	}
-
 	if (p == rq->idle) {
-		if (atomic_read(&rq->nr_iowait) > 0)
+		/* note: this timer irq context must be accounted for as well */
+		if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
+			cpustat->system += sys_ticks;
+		else if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait += sys_ticks;
 		else
 			cpustat->idle += sys_ticks;
@@ -1232,43 +1247,39 @@ void scheduler_tick(int user_ticks, int 
 	spin_lock(&rq->lock);
 	/*
 	 * The task was running during this tick - update the
-	 * time slice counter and the sleep average. Note: we
-	 * do not update a thread's priority until it either
-	 * goes to sleep or uses up its timeslice. This makes
-	 * it possible for interactive tasks to use up their
-	 * timeslices at their highest priority levels.
+	 * time slice counter. Note: we do not update a thread's
+	 * priority until it either goes to sleep or uses up its
+	 * timeslice.
 	 */
-	if (p->sleep_avg)
-		p->sleep_avg--;
 	if (unlikely(rt_task(p))) {
 		/*
 		 * RR tasks need a special form of timeslice management.
 		 * FIFO tasks have no timeslices.
 		 */
-		if ((p->policy == SCHED_RR) && !--p->time_slice) {
-			p->time_slice = task_timeslice(p);
-			p->first_time_slice = 0;
-			set_tsk_need_resched(p);
-
-			/* put it at the end of the queue: */
-			dequeue_task(p, rq->active);
-			enqueue_task(p, rq->active);
+		if (p->policy == SCHED_RR) {
+			p->used_slice++;
+			if (p->used_slice >= task_timeslice(p, rq)) {
+				p->used_slice = 0;
+				p->first_time_slice = 0;
+				set_tsk_need_resched(p);
+
+				/* put it at the end of the queue: */
+				dequeue_task(p, rq->active);
+				enqueue_task(p, rq->active);
+			}
 		}
 		goto out_unlock;
 	}
-	if (!--p->time_slice) {
+
+	p->used_slice++;
+	if (p->used_slice >= task_timeslice(p, rq)) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
+		p->prio = task_priority(p);
+		p->used_slice = 0;
 		p->first_time_slice = 0;
 
-		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
-			if (!rq->expired_timestamp)
-				rq->expired_timestamp = jiffies;
-			enqueue_task(p, rq->expired);
-		} else
-			enqueue_task(p, rq->active);
+		enqueue_task(p, rq->expired);
 	}
 out_unlock:
 	spin_unlock(&rq->lock);
@@ -1287,6 +1298,8 @@ asmlinkage void schedule(void)
 	runqueue_t *rq;
 	prio_array_t *array;
 	struct list_head *queue;
+	unsigned long now;
+	unsigned long run_time;
 	int idx;
 
 	/*
@@ -1307,7 +1320,11 @@ need_resched:
 	rq = this_rq();
 
 	release_kernel_lock(prev);
-	prev->last_run = jiffies;
+	now = jiffies;
+	run_time = now - prev->timestamp;
+
+	add_task_time(prev, run_time, 0);
+
 	spin_lock_irq(&rq->lock);
 
 	/*
@@ -1336,7 +1353,6 @@ pick_next_task:
 			goto pick_next_task;
 #endif
 		next = rq->idle;
-		rq->expired_timestamp = 0;
 		goto switch_tasks;
 	}
 
@@ -1345,10 +1361,10 @@ pick_next_task:
 		/*
 		 * Switch the active and expired arrays.
 		 */
+		rq->array_sequence++;
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
-		rq->expired_timestamp = 0;
 	}
 
 	idx = sched_find_first_bit(array->bitmap);
@@ -1360,7 +1376,9 @@ switch_tasks:
 	clear_tsk_need_resched(prev);
 	RCU_qsctr(task_cpu(prev))++;
 
+	prev->timestamp = now;
 	if (likely(prev != next)) {
+		next->timestamp = now;
 		rq->nr_switches++;
 		rq->curr = next;
 
@@ -2139,6 +2168,8 @@ asmlinkage long sys_sched_rr_get_interva
 	int retval = -EINVAL;
 	struct timespec t;
 	task_t *p;
+	unsigned long flags;
+	runqueue_t *rq;
 
 	if (pid < 0)
 		goto out_nounlock;
@@ -2153,8 +2184,10 @@ asmlinkage long sys_sched_rr_get_interva
 	if (retval)
 		goto out_unlock;
 
+	rq = task_rq_lock(p, &flags);
 	jiffies_to_timespec(p->policy & SCHED_FIFO ?
-				0 : task_timeslice(p), &t);
+				0 : task_timeslice(p, rq), &t);
+	task_rq_unlock(rq, &flags);
 	read_unlock(&tasklist_lock);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
 out_nounlock:


* Re: [PATCH] Nick's scheduler policy
  2003-08-24 12:35 [PATCH] Nick's scheduler policy Nick Piggin
@ 2003-08-24 14:29 ` Felipe Alfaro Solana
  2003-08-25  3:05   ` Nick Piggin
  2003-08-25 22:30   ` Bill Davidsen
  2003-08-24 16:55 ` Martin J. Bligh
  2003-08-25  3:27 ` [PATCH] Nick's scheduler policy Randy.Dunlap
  2 siblings, 2 replies; 16+ messages in thread
From: Felipe Alfaro Solana @ 2003-08-24 14:29 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel

On Sun, 2003-08-24 at 14:35, Nick Piggin wrote:
> Hi,
> Patch against 2.6.0-test4. It fixes a lot of problems here vs
> previous versions. There aren't really any open issues for me, so
> testers would be welcome.
> 
> The big change is more dynamic timeslices, which allows "interactive"
> tasks to get very small timeslices while more compute intensive loads
> can be given bigger timeslices than usual. This works properly with
> nice (niced processes will tend to get bigger timeslices).
> 
> I think I have cured test-starve too.

I still haven't found any starvation cases, but forking time when the
system is under heavy load has increased considerably with respect to
vanilla or Con's O18.1int:

1. On a Konsole session, run "while true; do a=2; done"
2. Now, try opening a new Konsole session and you'll see it takes
approximately twice as long as it does when the system is under no
load (the quick timing hack below puts a number on it).
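
A trivial way to measure it (quick userspace hack for illustration,
not part of any patch):

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct timeval t0, t1;
	double usec;
	int i, n = 100;

	gettimeofday(&t0, NULL);
	for (i = 0; i < n; i++) {
		pid_t pid = fork();

		if (pid == 0)
			_exit(0);	/* child exits immediately */
		if (pid < 0) {
			perror("fork");
			exit(1);
		}
		waitpid(pid, NULL, 0);
	}
	gettimeofday(&t1, NULL);

	usec = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
	printf("avg fork+exit+wait: %.0f usec\n", usec / n);
	return 0;
}

Run it once on an idle system and once with the busy loop going; the
ratio between the two runs is the interesting figure.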

Also, renicing X to -20 helps X interactivity, while with Con's patches,
renicing X to -20 makes it feel worse.



* Re: [PATCH] Nick's scheduler policy
  2003-08-24 12:35 [PATCH] Nick's scheduler policy Nick Piggin
  2003-08-24 14:29 ` Felipe Alfaro Solana
@ 2003-08-24 16:55 ` Martin J. Bligh
  2003-08-25  3:00   ` Nick Piggin
  2003-08-25  3:27 ` [PATCH] Nick's scheduler policy Randy.Dunlap
  2 siblings, 1 reply; 16+ messages in thread
From: Martin J. Bligh @ 2003-08-24 16:55 UTC (permalink / raw)
  To: Nick Piggin, linux-kernel

Seems to do badly with CPU intensive tasks:

Kernbench: (make -j vmlinux, maximal tasks)
                              Elapsed      System        User         CPU
              2.6.0-test3       46.00      115.49      571.94     1494.25
         2.6.0-test4-nick       49.37      131.31      611.15     1500.75

Oddly, schedule itself is significantly cheaper, but you seem
to end up with much more expense elsewhere. Thrashing tasks between
CPUs, maybe (esp given the increased user time)? I'll do a proper 
baseline against test4, but I don't expect it to be any different, really.

diffprofile {2.6.0-test3,2.6.0-test4-nick}/kernbench/0/profile
     12314     7.4% total
      3843    16.3% page_remove_rmap
      1657    20.8% __d_lookup
      1322     9.4% do_anonymous_page
      1143    21.5% __copy_to_user_ll
      1034    55.3% atomic_dec_and_lock
       683    48.4% free_hot_cold_page
       669   393.5% filp_close
       553   147.5% .text.lock.file_table
       484   479.2% file_ra_state_init
       409    24.3% buffered_rmqueue
       391    24.9% kmem_cache_free
       362    11.3% zap_pte_range
       304    16.5% path_lookup
       247    31.5% pte_alloc_one
       237    68.3% pgd_ctor
       229    19.0% file_move
       224    16.7% link_path_walk
       220    57.7% file_kill
       188     7.9% do_no_page
       164    24.6% __wake_up
       162     4.4% find_get_page
       146    24.4% generic_file_open
       141    87.6% .text.lock.dcache
       139    11.5% release_pages
       131    51.4% vfs_read
       127    40.2% dnotify_parent
...
      -144    -4.2% __copy_from_user_ll
      -149    -2.3% page_add_rmap
      -352   -25.1% schedule
     -3469    -6.8% default_idle



* Re: [PATCH] Nick's scheduler policy
  2003-08-24 16:55 ` Martin J. Bligh
@ 2003-08-25  3:00   ` Nick Piggin
  2003-08-25 10:41     ` [PATCH] Nick's scheduler policy v7 Nick Piggin
  0 siblings, 1 reply; 16+ messages in thread
From: Nick Piggin @ 2003-08-25  3:00 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel



Martin J. Bligh wrote:

>Seems to do badly with CPU intensive tasks:
>
>Kernbench: (make -j vmlinux, maximal tasks)
>                              Elapsed      System        User         CPU
>              2.6.0-test3       46.00      115.49      571.94     1494.25
>         2.6.0-test4-nick       49.37      131.31      611.15     1500.75
>

Thanks Martin.

>
>Oddly, schedule itself is significantly cheaper, but you seem
>to end up with much more expense elsewhere. Thrashing tasks between
>CPUs, maybe (esp given the increased user time)? I'll do a proper 
>baseline against test4, but I don't expect it to be any different, really.
>

Yeah, I'd say most if not all of that would be my fault though. What
happens is that a lowest-priority process will get a 1ms timeslice if
there is a highest-priority process on the same runqueue, though I
think it will get about 275ms if there are only other low-priority
processes there.

A kernbench probably has enough IO to keep priorities up a bit and
keep timeslices short. The timeslice stuff could probably still use
a bit of tuning. On the other hand, nice -20 processes should get
big timeslices, while other schedulers give them small timeslices.
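
To put numbers on that: a runnable rework of the task_timeslice()
arithmetic from sched-policy-6, assuming HZ=1000, MAX_RT_PRIO=100 and
MAX_USER_PRIO=40 (illustration only, not the patch verbatim):

#include <stdio.h>

#define MIN_TIMESLICE	1	/* jiffies, with HZ=1000 */
#define MAX_RT_PRIO	100
#define MAX_USER_PRIO	40
#define USER_PRIO(p)	((p) - MAX_RT_PRIO)

/* best_prio is the best (numerically lowest) priority queued */
static unsigned int slice(int prio, int best_prio)
{
	int idx = best_prio < prio ? best_prio : prio;
	int delta = prio - idx;
	unsigned int base = MIN_TIMESLICE * MAX_USER_PRIO / (delta + 1);
	unsigned int t = base * (USER_PRIO(idx) + 4) / 4;

	return t ? t : 1;
}

int main(void)
{
	/* lowest prio next to a prio-100 task: 1 jiffy, ie. 1ms */
	printf("%u\n", slice(139, 100));
	/* the same task among only prio-139 tasks: 430 jiffies */
	printf("%u\n", slice(139, 139));
	return 0;
}

(sched-policy-7a later in the thread changes the final /4 to /8,
halving the large case.)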




* Re: [PATCH] Nick's scheduler policy
  2003-08-24 14:29 ` Felipe Alfaro Solana
@ 2003-08-25  3:05   ` Nick Piggin
  2003-08-25 22:30   ` Bill Davidsen
  1 sibling, 0 replies; 16+ messages in thread
From: Nick Piggin @ 2003-08-25  3:05 UTC (permalink / raw)
  To: Felipe Alfaro Solana; +Cc: linux-kernel



Felipe Alfaro Solana wrote:

>On Sun, 2003-08-24 at 14:35, Nick Piggin wrote:
>
>>Hi,
>>Patch against 2.6.0-test4. It fixes a lot of problems here vs
>>previous versions. There aren't really any open issues for me, so
>>testers would be welcome.
>>
>>The big change is more dynamic timeslices, which allows "interactive"
>>tasks to get very small timeslices while more compute intensive loads
>>can be given bigger timeslices than usual. This works properly with
>>nice (niced processes will tend to get bigger timeslices).
>>
>>I think I have cured test-starve too.
>>
>
>I still haven't found any starvation cases, but forking time when the
>system is under heavy load has increased considerably with respect to
>vanilla or Con's O18.1int:
>
>1. On a Konsole session, run "while true; do a=2; done"
>2. Now, try opening a new Konsole session and you'll see it takes
>approximately twice as long as it does when the system is under no load.
>

Yeah, it probably penalises parents and children too much on fork, and
doesn't penalise parents of exiting cpu hogs enough. I have noticed
this too.

>
>Also, renicing X to -20 helps X interactivity, while with Con's patches,
>renicing X to -20 makes it feel worse.
>

Renicing is IMO a lot saner in my patches, although others might
disagree. In Con's patches, when you renice X to -20, it gets huge
timeslices. In my version, it will get lots of smaller timeslices.

Thanks again for testing.

Nick



* Re: [PATCH] Nick's scheduler policy
  2003-08-24 12:35 [PATCH] Nick's scheduler policy Nick Piggin
  2003-08-24 14:29 ` Felipe Alfaro Solana
  2003-08-24 16:55 ` Martin J. Bligh
@ 2003-08-25  3:27 ` Randy.Dunlap
  2003-08-25  3:36   ` Nick Piggin
  2 siblings, 1 reply; 16+ messages in thread
From: Randy.Dunlap @ 2003-08-25  3:27 UTC (permalink / raw)
  To: piggin; +Cc: linux-kernel

> Hi,
> Patch against 2.6.0-test4. It fixes a lot of problems here vs
> previous versions. There aren't really any open issues for me, so
> testers would be welcome.
>
...
>
> On the other hand, I expect the best cases and maybe most usual cases would
> be better on Con's... and Con might have since done some work in the latency
> area.

Has anyone developed a (run-time) scheduler [policy] selector, via
sysctl or sysfs, so that different kernel builds aren't required?

I know that I have heard discussions of this previously.

~Randy





* Re: [PATCH] Nick's scheduler policy
  2003-08-25  3:27 ` [PATCH] Nick's scheduler policy Randy.Dunlap
@ 2003-08-25  3:36   ` Nick Piggin
  2003-08-26  3:16     ` Mike Fedyk
  0 siblings, 1 reply; 16+ messages in thread
From: Nick Piggin @ 2003-08-25  3:36 UTC (permalink / raw)
  To: Randy.Dunlap; +Cc: linux-kernel



Randy.Dunlap wrote:

>>Hi,
>>Patch against 2.6.0-test4. It fixes a lot of problems here vs
>>previous versions. There aren't really any open issues for me, so
>>testers would be welcome.
>>
>>
>...
>
>>On the other hand, I expect the best cases and maybe most usual cases would
>>be better on Con's... and Con might have since done some work in the latency
>>area.
>>
>
>Has anyone developed a (run-time) scheduler [policy] selector, via
>sysctl or sysfs, so that different kernel builds aren't required?
>
>I know that I have heard discussions of this previously.
>

Not that I know of. This would probably require an extra layer of
indirection in the standard Linux form of a struct of function
pointers, with the usual schedule functions as wrappers.
I think it would be highly unlikely for this to get into a standard
kernel, but it might make a nice testing tool...
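
The shape would presumably be something like this (invented names,
purely to illustrate the indirection; nothing like it exists in-tree):

#include <stdio.h>

struct task;	/* stand-in for task_t */

struct sched_policy_ops {
	const char *name;
	struct task *(*pick_next)(void);
	/* enqueue/dequeue/tick hooks would live here too */
};

static struct task *nick_pick_next(void) { return NULL; }
static struct task *con_pick_next(void) { return NULL; }

static struct sched_policy_ops nick_ops = { "nicksched", nick_pick_next };
static struct sched_policy_ops con_ops = { "O18.1int", con_pick_next };

/* what a sysctl write handler would flip, with suitable locking */
static struct sched_policy_ops *active = &nick_ops;

int main(void)
{
	printf("policy: %s\n", active->name);
	active = &con_ops;	/* the "sysctl" firing */
	printf("policy: %s\n", active->name);
	return 0;
}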

In fact this might end up being incompatible with architectures
like SPARC... but I'm sure someone could make it work if they really
wanted to.

I think the present boot-time selector (selecting different kernels
at boot) will have to suffice for now :P




* [PATCH] Nick's scheduler policy v7
  2003-08-25  3:00   ` Nick Piggin
@ 2003-08-25 10:41     ` Nick Piggin
  2003-08-25 11:03       ` Felipe Alfaro Solana
                         ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Nick Piggin @ 2003-08-25 10:41 UTC (permalink / raw)
  To: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 496 bytes --]

Hi,
I didn't miss 5 revisions, I'll just stick to using my internal
numbering for releases.

This one has a few changes. Children now get a priority boost
on fork, and parents retain more priority after forking a child;
however, exiting CPU hogs will now penalise their parents a bit.
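
For reference, the fork-time statistics handoff works out as below (a
standalone restatement of the wake_up_forked_process() arithmetic in
the attached sched-policy-7a, for illustration):

#include <stdio.h>

struct stats { unsigned long total, sleep, avg; };

static void fork_stats(struct stats *parent, struct stats *child)
{
	/* the child inherits a doubled sleep ratio: the priority boost */
	child->sleep = parent->sleep / 10;
	child->total = parent->total / 20;
	child->avg = parent->avg * 2;

	/* the parent keeps 3/4 of its recorded sleep time: mild penalty */
	parent->sleep = 3 * parent->sleep / 4;
	if (parent->total != 0)
		parent->avg = (1000 * parent->sleep) / parent->total;
}

int main(void)
{
	struct stats p = { 1000, 600, 600 }, c;	/* parent 60% asleep */

	fork_stats(&p, &c);
	printf("child avg %lu, parent avg %lu\n", c.avg, p.avg);
	/* prints 1200 and 450; an avg above 1000 simply means maximum
	 * boost until real run time accrues */
	return 0;
}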

Timeslice scaling was tweaked a bit. Oh and remember raising X's
priority should _help_ interactivity with this patch, and IMO is
not an unreasonable thing to be doing.

Please test. I'm not getting enough feedback!


[-- Attachment #2: sched-policy-7a --]
[-- Type: text/plain, Size: 22405 bytes --]

--- linux-2.6/include/linux/sched.h.orig	2003-08-25 20:30:17.000000000 +1000
+++ linux-2.6/include/linux/sched.h	2003-08-25 20:30:24.000000000 +1000
@@ -281,7 +281,9 @@ struct signal_struct {
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
- 
+
+#define rt_task(p)		((p)->prio < MAX_RT_PRIO)
+
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -339,12 +341,16 @@ struct task_struct {
 	struct list_head run_list;
 	prio_array_t *array;
 
+	unsigned long array_sequence;
+	unsigned long timestamp;
+
+	unsigned long total_time, sleep_time;
 	unsigned long sleep_avg;
-	unsigned long last_run;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
 	unsigned int time_slice, first_time_slice;
+	unsigned int used_slice;
 
 	struct list_head tasks;
 	struct list_head ptrace_children;
@@ -552,6 +558,7 @@ extern int FASTCALL(wake_up_state(struct
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
 extern int FASTCALL(wake_up_process_kick(struct task_struct * tsk));
 extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(sched_fork(task_t * p));
 extern void FASTCALL(sched_exit(task_t * p));
 
 asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
--- linux-2.6/kernel/fork.c.orig	2003-08-25 20:30:13.000000000 +1000
+++ linux-2.6/kernel/fork.c	2003-08-25 20:30:24.000000000 +1000
@@ -911,33 +911,9 @@ struct task_struct *copy_process(unsigne
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 
-	/*
-	 * Share the timeslice between parent and child, thus the
-	 * total amount of pending timeslices in the system doesn't change,
-	 * resulting in more scheduling fairness.
-	 */
-	local_irq_disable();
-        p->time_slice = (current->time_slice + 1) >> 1;
-	/*
-	 * The remainder of the first timeslice might be recovered by
-	 * the parent if the child exits early enough.
-	 */
-	p->first_time_slice = 1;
-	current->time_slice >>= 1;
-	p->last_run = jiffies;
-	if (!current->time_slice) {
-		/*
-	 	 * This case is rare, it happens when the parent has only
-	 	 * a single jiffy left from its timeslice. Taking the
-		 * runqueue lock is not a problem.
-		 */
-		current->time_slice = 1;
-		preempt_disable();
-		scheduler_tick(0, 0);
-		local_irq_enable();
-		preempt_enable();
-	} else
-		local_irq_enable();
+	/* Perform scheduler related accounting */
+	sched_fork(p);
+
 	/*
 	 * Ok, add it to the run-queues and make it
 	 * visible to the rest of the system.
--- linux-2.6/kernel/sched.c.orig	2003-08-25 20:30:10.000000000 +1000
+++ linux-2.6/kernel/sched.c	2003-08-25 20:30:24.000000000 +1000
@@ -66,73 +66,17 @@
  * maximum timeslice is 200 msecs. Timeslices get refilled after
  * they expire.
  */
-#define MIN_TIMESLICE		( 10 * HZ / 1000)
-#define MAX_TIMESLICE		(200 * HZ / 1000)
-#define CHILD_PENALTY		50
-#define PARENT_PENALTY		100
-#define EXIT_WEIGHT		3
-#define PRIO_BONUS_RATIO	25
-#define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(10*HZ)
-#define STARVATION_LIMIT	(10*HZ)
-#define NODE_THRESHOLD		125
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- *  priority range a task can explore, a value of '1' means the
- *  task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define SCALE(v1,v1_max,v2_max) \
-	(v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
-	(SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
-		INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
-	((p)->prio <= (p)->static_prio - DELTA(p))
+#define MIN_TIMESLICE		((1 * HZ / 1000) ? 1 * HZ / 1000 : 1)
+#define MAX_TIMESLICE		(20 * MIN_TIMESLICE) /* This cannot be changed */
 
-/*
- * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- *
- * task_timeslice() is the interface that is used by the scheduler.
- */
+#define MAX_SLEEP		(HZ)
 
-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
-	((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
+#define NODE_THRESHOLD		125
 
-static inline unsigned int task_timeslice(task_t *p)
-{
-	return BASE_TIMESLICE(p);
-}
+#define TASK_PREEMPTS_CURR(p, rq)			\
+	( (p)->prio < (rq)->curr->prio			\
+	 	|| ((p)->prio == (rq)->curr->prio	\
+			 && (p)->static_prio < (rq)->curr->static_prio) )
 
 /*
  * These are the runqueue data structures:
@@ -157,7 +101,8 @@ struct prio_array {
  */
 struct runqueue {
 	spinlock_t lock;
-	unsigned long nr_running, nr_switches, expired_timestamp,
+	unsigned long array_sequence;
+	unsigned long nr_running, nr_switches,
 			nr_uninterruptible;
 	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
@@ -179,7 +124,6 @@ static DEFINE_PER_CPU(struct runqueue, r
 #define this_rq()		(&__get_cpu_var(runqueues))
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
-#define rt_task(p)		((p)->prio < MAX_RT_PRIO)
 
 /*
  * Default context-switch locking:
@@ -298,35 +242,73 @@ static inline void enqueue_task(struct t
 	p->array = array;
 }
 
+static void add_task_time(task_t *p, unsigned long time, int sleep)
+{
+	if (time == 0)
+		return;
+
+	if (time > MAX_SLEEP) {
+		time = MAX_SLEEP;
+		p->total_time = 0;
+		p->sleep_time = 0;
+	} else {
+		unsigned long r;
+
+		r = MAX_SLEEP - time;
+		p->total_time = (r*p->total_time + MAX_SLEEP/2) / MAX_SLEEP;
+		p->sleep_time = (r*p->sleep_time + MAX_SLEEP/2) / MAX_SLEEP;
+	}
+
+	p->total_time += 1000 * time;
+	if (sleep)
+		p->sleep_time += 1000 * time;
+
+	p->sleep_avg = (1000 * p->sleep_time) / p->total_time;
+}
+
 /*
- * effective_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
  */
-static int effective_prio(task_t *p)
+static unsigned int task_timeslice(task_t *p, runqueue_t *rq)
+{
+	int idx, delta;
+	unsigned int base, timeslice;
+
+	if (rt_task(p))
+		return MAX_TIMESLICE;
+	
+	idx = min(find_next_bit(rq->active->bitmap, MAX_PRIO, MAX_RT_PRIO),
+			find_next_bit(rq->expired->bitmap, MAX_PRIO, MAX_RT_PRIO));
+	idx = min(idx, p->prio);
+	delta = p->prio - idx;
+
+	base = MIN_TIMESLICE * MAX_USER_PRIO / (delta + 1);
+	timeslice = base * (USER_PRIO(idx) + 4) / 8;
+
+	if (timeslice == 0)
+		timeslice = 1;
+
+	return timeslice;
+}
+
+static unsigned long task_priority(task_t *p)
 {
 	int bonus, prio;
 
 	if (rt_task(p))
 		return p->prio;
 
-	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
-			MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+	bonus = (MAX_USER_PRIO * p->sleep_avg) / 1000 / 2;
+	prio = USER_PRIO(p->static_prio) / 2 + (MAX_USER_PRIO / 2);
 
-	prio = p->static_prio - bonus;
+	prio = MAX_RT_PRIO + prio - bonus;
 	if (prio < MAX_RT_PRIO)
 		prio = MAX_RT_PRIO;
 	if (prio > MAX_PRIO-1)
 		prio = MAX_PRIO-1;
+
 	return prio;
 }
 
@@ -347,34 +329,38 @@ static inline void __activate_task(task_
  */
 static inline void activate_task(task_t *p, runqueue_t *rq)
 {
-	long sleep_time = jiffies - p->last_run - 1;
+	unsigned long sleep = jiffies - p->timestamp;
+	p->timestamp = jiffies;
 
-	if (sleep_time > 0) {
-		int sleep_avg;
+	if (sleep > MAX_SLEEP)
+		sleep = MAX_SLEEP;
 
-		/*
-		 * This code gives a bonus to interactive tasks.
-		 *
-		 * The boost works by updating the 'average sleep time'
-		 * value here, based on ->last_run. The more time a task
-		 * spends sleeping, the higher the average gets - and the
-		 * higher the priority boost gets as well.
-		 */
-		sleep_avg = p->sleep_avg + sleep_time;
+	if (!in_interrupt() && current->mm) {
+		unsigned long boost = sleep / 2;
+		add_task_time(current, boost, 1);
+		add_task_time(p, sleep - boost, 1);
+	} else {
+		add_task_time(p, sleep, 1);
+		
+		if (in_interrupt())
+			add_task_time(p, sleep / 2, 1);
+	}
 
-		/*
-		 * 'Overflow' bonus ticks go to the waker as well, so the
-		 * ticks are not lost. This has the effect of further
-		 * boosting tasks that are related to maximum-interactive
-		 * tasks.
-		 */
-		if (sleep_avg > MAX_SLEEP_AVG)
-			sleep_avg = MAX_SLEEP_AVG;
-		if (p->sleep_avg != sleep_avg) {
-			p->sleep_avg = sleep_avg;
-			p->prio = effective_prio(p);
-		}
+	p->prio = task_priority(p);
+
+	if (rq->array_sequence != p->array_sequence) {
+		p->used_slice = 0;
 	}
+
+	if (!in_interrupt() && current->mm) {
+		unsigned long steal;
+
+		steal = min((unsigned int)sleep / 2,
+				(task_timeslice(p, rq) - p->used_slice) / 2);
+		p->used_slice += steal;
+		current->used_slice -= steal;
+	}
+
 	__activate_task(p, rq);
 }
 
@@ -383,6 +369,7 @@ static inline void activate_task(task_t 
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
+	p->array_sequence = rq->array_sequence;
 	nr_running_dec(rq);
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
@@ -426,7 +413,7 @@ static inline void resched_task(task_t *
  * be called with interrupts off, or it may introduce deadlock with
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
- */
+ */
 void wait_task_inactive(task_t * p)
 {
 	unsigned long flags;
@@ -497,11 +484,9 @@ repeat_lock_task:
 			}
 			if (old_state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible--;
-			if (sync)
-				__activate_task(p, rq);
-			else {
-				activate_task(p, rq);
-				if (p->prio < rq->curr->prio)
+			activate_task(p, rq);
+			if (!sync) {
+				if (TASK_PREEMPTS_CURR(p, rq))
 					resched_task(rq->curr);
 			}
 			success = 1;
@@ -534,36 +519,74 @@ int wake_up_state(task_t *p, unsigned in
 }
 
 /*
+ * Perform scheduler related accounting for a newly forked process @p.
+ * @p is forked by current.
+ */
+void sched_fork(task_t *p)
+{
+	unsigned long ts;
+	unsigned long flags;
+	runqueue_t *rq;
+
+	/*
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness.
+	 */
+	local_irq_disable();
+	p->timestamp = jiffies;
+	rq = task_rq_lock(current, &flags);
+	ts = task_timeslice(current, rq);
+	task_rq_unlock(rq, &flags);
+	p->used_slice = current->used_slice + (ts - current->used_slice) / 2;
+	current->used_slice += (ts - current->used_slice + 1) / 2;
+	/*
+	 * The remainder of the first timeslice might be recovered by
+	 * the parent if the child exits early enough.
+	 */
+	p->first_time_slice = 1;
+	if (current->used_slice >= ts) {
+		/*
+	 	 * This case is rare, it happens when the parent has only
+	 	 * a single jiffy left from its timeslice. Taking the
+		 * runqueue lock is not a problem.
+		 */
+		current->used_slice = ts - 1;
+		preempt_disable();
+		scheduler_tick(0, 0);
+		local_irq_enable();
+		preempt_enable();
+	} else
+		local_irq_enable();
+}
+	
+/*
  * wake_up_forked_process - wake up a freshly forked process.
  *
  * This function will do some initial scheduler statistics housekeeping
  * that must be done for every newly created process.
  */
-void wake_up_forked_process(task_t * p)
+void wake_up_forked_process(task_t *p)
 {
 	unsigned long flags;
 	runqueue_t *rq = task_rq_lock(current, &flags);
 
 	p->state = TASK_RUNNING;
-	/*
-	 * We decrease the sleep average of forking parents
-	 * and children as well, to keep max-interactive tasks
-	 * from forking tasks that are max-interactive.
-	 */
-	current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
-	p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
-	p->prio = effective_prio(p);
+
 	set_task_cpu(p, smp_processor_id());
 
-	if (unlikely(!current->array))
-		__activate_task(p, rq);
-	else {
-		p->prio = current->prio;
-		list_add_tail(&p->run_list, &current->run_list);
-		p->array = current->array;
-		p->array->nr_active++;
-		nr_running_inc(rq);
-	}
+	p->sleep_time = current->sleep_time / 10;
+	p->total_time = current->total_time / 20;
+	p->sleep_avg = current->sleep_avg * 2;
+
+	current->sleep_time = 3 * (current->sleep_time) / 4;
+	if (current->total_time != 0)
+		current->sleep_avg = (1000 * current->sleep_time)
+						/ current->total_time;
+
+	p->prio = task_priority(p);
+	__activate_task(p, rq);
+
 	task_rq_unlock(rq, &flags);
 }
 
@@ -582,18 +605,17 @@ void sched_exit(task_t * p)
 
 	local_irq_save(flags);
 	if (p->first_time_slice) {
-		p->parent->time_slice += p->time_slice;
-		if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
-			p->parent->time_slice = MAX_TIMESLICE;
+		unsigned long flags;
+		runqueue_t *rq;
+		rq = task_rq_lock(p, &flags);
+		p->parent->used_slice -= task_timeslice(p, rq) - p->used_slice;
+		task_rq_unlock(rq, &flags);
 	}
-	local_irq_restore(flags);
-	/*
-	 * If the child was a (relative-) CPU hog then decrease
-	 * the sleep_avg of the parent as well.
-	 */
+
 	if (p->sleep_avg < p->parent->sleep_avg)
-		p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
-			p->sleep_avg) / (EXIT_WEIGHT + 1);
+		add_task_time(p->parent, (p->parent->sleep_avg - p->sleep_avg)/2, 0);
+	
+	local_irq_restore(flags);
 }
 
 /**
@@ -995,13 +1017,29 @@ static inline void pull_task(runqueue_t 
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	if (p->prio < this_rq->curr->prio)
+	if (TASK_PREEMPTS_CURR(p, this_rq))
 		set_need_resched();
-	else {
-		if (p->prio == this_rq->curr->prio &&
-				p->time_slice > this_rq->curr->time_slice)
-			set_need_resched();
-	}
+}
+
+/*
+ * Skip running tasks, pinned tasks and, when idle balancing, cache-hot tasks.
+ */
+
+static inline int
+can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
+{
+	unsigned long delta;
+
+	if (task_running(rq, tsk))
+		return 0;
+	if (!cpu_isset(this_cpu, tsk->cpus_allowed))
+		return 0;
+
+	delta = jiffies - tsk->timestamp;
+	if (idle && (delta <= cache_decay_ticks))
+		return 0;
+
+	return 1;
 }
 
 /*
@@ -1063,14 +1101,9 @@ skip_queue:
 	 * 3) are cache-hot on their current CPU.
 	 */
 
-#define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
-	((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) &&	\
-		!task_running(rq, p) &&					\
-			cpu_isset(this_cpu, (p)->cpus_allowed))
-
 	curr = curr->prev;
 
-	if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+	if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1171,20 +1204,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
- */
-#define EXPIRED_STARVING(rq) \
-		(STARVATION_LIMIT && ((rq)->expired_timestamp && \
-		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
-
-/*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  *
@@ -1201,17 +1220,11 @@ void scheduler_tick(int user_ticks, int 
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_ticks);
 
-	/* note: this timer irq context must be accounted for as well */
-	if (hardirq_count() - HARDIRQ_OFFSET) {
-		cpustat->irq += sys_ticks;
-		sys_ticks = 0;
-	} else if (softirq_count()) {
-		cpustat->softirq += sys_ticks;
-		sys_ticks = 0;
-	}
-
 	if (p == rq->idle) {
-		if (atomic_read(&rq->nr_iowait) > 0)
+		/* note: this timer irq context must be accounted for as well */
+		if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
+			cpustat->system += sys_ticks;
+		else if (atomic_read(&rq->nr_iowait) > 0)
 			cpustat->iowait += sys_ticks;
 		else
 			cpustat->idle += sys_ticks;
@@ -1232,43 +1245,39 @@ void scheduler_tick(int user_ticks, int 
 	spin_lock(&rq->lock);
 	/*
 	 * The task was running during this tick - update the
-	 * time slice counter and the sleep average. Note: we
-	 * do not update a thread's priority until it either
-	 * goes to sleep or uses up its timeslice. This makes
-	 * it possible for interactive tasks to use up their
-	 * timeslices at their highest priority levels.
+	 * time slice counter. Note: we do not update a thread's
+	 * priority until it either goes to sleep or uses up its
+	 * timeslice.
 	 */
-	if (p->sleep_avg)
-		p->sleep_avg--;
 	if (unlikely(rt_task(p))) {
 		/*
 		 * RR tasks need a special form of timeslice management.
 		 * FIFO tasks have no timeslices.
 		 */
-		if ((p->policy == SCHED_RR) && !--p->time_slice) {
-			p->time_slice = task_timeslice(p);
-			p->first_time_slice = 0;
-			set_tsk_need_resched(p);
-
-			/* put it at the end of the queue: */
-			dequeue_task(p, rq->active);
-			enqueue_task(p, rq->active);
+		if (p->policy == SCHED_RR) {
+			p->used_slice++;
+			if (p->used_slice >= task_timeslice(p, rq)) {
+				p->used_slice = 0;
+				p->first_time_slice = 0;
+				set_tsk_need_resched(p);
+
+				/* put it at the end of the queue: */
+				dequeue_task(p, rq->active);
+				enqueue_task(p, rq->active);
+			}
 		}
 		goto out_unlock;
 	}
-	if (!--p->time_slice) {
+
+	p->used_slice++;
+	if (p->used_slice >= task_timeslice(p, rq)) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
+		p->prio = task_priority(p);
+		p->used_slice = 0;
 		p->first_time_slice = 0;
 
-		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
-			if (!rq->expired_timestamp)
-				rq->expired_timestamp = jiffies;
-			enqueue_task(p, rq->expired);
-		} else
-			enqueue_task(p, rq->active);
+		enqueue_task(p, rq->expired);
 	}
 out_unlock:
 	spin_unlock(&rq->lock);
@@ -1287,6 +1296,8 @@ asmlinkage void schedule(void)
 	runqueue_t *rq;
 	prio_array_t *array;
 	struct list_head *queue;
+	unsigned long now;
+	unsigned long run_time;
 	int idx;
 
 	/*
@@ -1307,7 +1318,11 @@ need_resched:
 	rq = this_rq();
 
 	release_kernel_lock(prev);
-	prev->last_run = jiffies;
+	now = jiffies;
+	run_time = now - prev->timestamp;
+
+	add_task_time(prev, run_time, 0);
+
 	spin_lock_irq(&rq->lock);
 
 	/*
@@ -1336,7 +1351,6 @@ pick_next_task:
 			goto pick_next_task;
 #endif
 		next = rq->idle;
-		rq->expired_timestamp = 0;
 		goto switch_tasks;
 	}
 
@@ -1345,10 +1359,10 @@ pick_next_task:
 		/*
 		 * Switch the active and expired arrays.
 		 */
+		rq->array_sequence++;
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
-		rq->expired_timestamp = 0;
 	}
 
 	idx = sched_find_first_bit(array->bitmap);
@@ -1360,7 +1374,9 @@ switch_tasks:
 	clear_tsk_need_resched(prev);
 	RCU_qsctr(task_cpu(prev))++;
 
+	prev->timestamp = now;
 	if (likely(prev != next)) {
+		next->timestamp = now;
 		rq->nr_switches++;
 		rq->curr = next;
 
@@ -1600,6 +1616,7 @@ void set_user_nice(task_t *p, long nice)
 	unsigned long flags;
 	prio_array_t *array;
 	runqueue_t *rq;
+	int old_prio, new_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
@@ -1608,6 +1625,12 @@ void set_user_nice(task_t *p, long nice)
 	 * the task might be in the middle of scheduling on another CPU.
 	 */
 	rq = task_rq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it won't have any effect on scheduling until the task
+	 * becomes SCHED_NORMAL again:
+	 */
 	if (rt_task(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
@@ -1615,16 +1638,20 @@ void set_user_nice(task_t *p, long nice)
 	array = p->array;
 	if (array)
 		dequeue_task(p, array);
+
+	old_prio = p->prio;
+	new_prio = NICE_TO_PRIO(nice);
+	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
-	p->prio = NICE_TO_PRIO(nice);
+	p->prio += delta;
+
 	if (array) {
 		enqueue_task(p, array);
 		/*
-		 * If the task is running and lowered its priority,
-		 * or increased its priority then reschedule its CPU:
+		 * If the task increased its priority or is running and
+		 * lowered its priority, then reschedule its CPU:
 		 */
-		if ((NICE_TO_PRIO(nice) < p->static_prio) ||
-							task_running(rq, p))
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
 			resched_task(rq->curr);
 	}
 out_unlock:
@@ -2139,6 +2166,8 @@ asmlinkage long sys_sched_rr_get_interva
 	int retval = -EINVAL;
 	struct timespec t;
 	task_t *p;
+	unsigned long flags;
+	runqueue_t *rq;
 
 	if (pid < 0)
 		goto out_nounlock;
@@ -2153,8 +2182,10 @@ asmlinkage long sys_sched_rr_get_interva
 	if (retval)
 		goto out_unlock;
 
+	rq = task_rq_lock(p, &flags);
 	jiffies_to_timespec(p->policy & SCHED_FIFO ?
-				0 : task_timeslice(p), &t);
+				0 : task_timeslice(p, rq), &t);
+	task_rq_unlock(rq, &flags);
 	read_unlock(&tasklist_lock);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
 out_nounlock:


* Re: [PATCH] Nick's scheduler policy v7
  2003-08-25 10:41     ` [PATCH] Nick's scheduler policy v7 Nick Piggin
@ 2003-08-25 11:03       ` Felipe Alfaro Solana
  2003-08-25 14:36       ` Måns Rullgård
                         ` (3 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Felipe Alfaro Solana @ 2003-08-25 11:03 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel

On Mon, 2003-08-25 at 12:41, Nick Piggin wrote:

[...]
> Please test. I'm not getting enough feedback!

I'm testing, I'm testing ;-)
Just give me a few hours to play with it...



* Re: [PATCH] Nick's scheduler policy v7
  2003-08-25 10:41     ` [PATCH] Nick's scheduler policy v7 Nick Piggin
  2003-08-25 11:03       ` Felipe Alfaro Solana
@ 2003-08-25 14:36       ` Måns Rullgård
  2003-08-26  3:24       ` Martin J. Bligh
                         ` (2 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Måns Rullgård @ 2003-08-25 14:36 UTC (permalink / raw)
  To: linux-kernel

Nick Piggin <piggin@cyberone.com.au> writes:

> This one has a few changes. Children now get a priority boost
> on fork, and parents retain more priority after forking a child;
> however, exiting CPU hogs will now penalise their parents a bit.
>
> Timeslice scaling was tweaked a bit. Oh and remember raising X's
> priority should _help_ interactivity with this patch, and IMO is
> not an unreasonable thing to be doing.
>
> Please test. I'm not getting enough feedback!

OK, if you test my software.

Seriously, though, it seems OK at first glance.  I can't reproduce the
XEmacs problems I had with Con's recent versions.  I'll have to run it
for a while and see what it seems like.

-- 
Måns Rullgård
mru@users.sf.net



* Re: [PATCH] Nick's scheduler policy
  2003-08-24 14:29 ` Felipe Alfaro Solana
  2003-08-25  3:05   ` Nick Piggin
@ 2003-08-25 22:30   ` Bill Davidsen
  1 sibling, 0 replies; 16+ messages in thread
From: Bill Davidsen @ 2003-08-25 22:30 UTC (permalink / raw)
  To: Felipe Alfaro Solana; +Cc: Nick Piggin, linux-kernel

On Sun, 24 Aug 2003, Felipe Alfaro Solana wrote:

> On Sun, 2003-08-24 at 14:35, Nick Piggin wrote:
> > Hi,
> > Patch against 2.6.0-test4. It fixes a lot of problems here vs
> > previous versions. There aren't really any open issues for me, so
> > testers would be welcome.
> > 
> > The big change is more dynamic timeslices, which allows "interactive"
> > tasks to get very small timeslices while more compute intensive loads
> > can be given bigger timeslices than usual. This works properly with
> > nice (niced processes will tend to get bigger timeslices).
> > 
> > I think I have cured test-starve too.
> 
> I still haven't found any starvation cases, but forking time when the
> system is under heavy load has increased considerably with respect to
> vanilla or Con's O18.1int:

Not having starvation cases may be worth a little overhead. I am more
concerned about avoiding "jackpot cases" than throughput, at least on a
desktop.

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.



* Re: [PATCH] Nick's scheduler policy
  2003-08-25  3:36   ` Nick Piggin
@ 2003-08-26  3:16     ` Mike Fedyk
  0 siblings, 0 replies; 16+ messages in thread
From: Mike Fedyk @ 2003-08-26  3:16 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Randy.Dunlap, linux-kernel

On Mon, Aug 25, 2003 at 01:36:25PM +1000, Nick Piggin wrote:
> 
> 
> Randy.Dunlap wrote:
> >Has anyone developed a (run-time) scheduler [policy] selector, via
> >sysctl or sysfs, so that different kernel builds aren't required?
> >
> >I know that I have heard discussions of this previously.
> >
> 
> Not that I know of. This would probably require an extra layer of
...
> In fact this might end up being incompatible with architectures
> like SPARC... but I'm sure someone could make it work if they really
> wanted to.
> 

Why wouldn't it work on sparc?
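
The "extra layer" being discussed would amount to an ops table that the
core scheduler calls through, with a sysctl or sysfs write flipping
which table is live. A purely hypothetical sketch of that indirection -
none of these names exist in the kernel - in plain C:

#include <stdio.h>

/* Hypothetical per-policy hooks the core would dispatch through. */
struct sched_policy_ops {
	const char *name;
	int (*pick_next)(void);	/* choose the next task (toy stand-in) */
};

static int pick_ingo(void) { return 1; }
static int pick_nick(void) { return 2; }

static struct sched_policy_ops policies[] = {
	{ "ingo", pick_ingo },
	{ "nick", pick_nick },
};

/*
 * What a sysctl handler would boil down to: swap the pointer that
 * the hot paths dereference. Every schedule() would then pay one
 * extra indirect call.
 */
static struct sched_policy_ops *active = &policies[0];

int main(void)
{
	active = &policies[1];	/* "echo nick > /proc/sys/kernel/..." (made up) */
	printf("policy %s picks %d\n", active->name, active->pick_next());
	return 0;
}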


* Re: [PATCH] Nick's scheduler policy v7
  2003-08-25 10:41     ` [PATCH] Nick's scheduler policy v7 Nick Piggin
  2003-08-25 11:03       ` Felipe Alfaro Solana
  2003-08-25 14:36       ` Måns Rullgård
@ 2003-08-26  3:24       ` Martin J. Bligh
  2003-08-26  4:04         ` Nick Piggin
  2003-08-26  9:44       ` Yaroslav Rastrigin
  2003-08-27  9:28       ` Mike Galbraith
  4 siblings, 1 reply; 16+ messages in thread
From: Martin J. Bligh @ 2003-08-26  3:24 UTC (permalink / raw)
  To: Nick Piggin, linux-kernel

> I didn't miss 5 revisions, I'll just stick to using my internal
> numbering for releases.
> 
> This one has a few changes. Children now get a priority boost
> on fork, and parents retain more priority after forking a child,
> however exiting CPU hogs will now penalise parents a bit.
> 
> Timeslice scaling was tweaked a bit. Oh and remember raising X's
> priority should _help_ interactivity with this patch, and IMO is
> not an unreasonable thing to be doing.
> 
> Please test. I'm not getting enough feedback!

Well, it's actually a bit faster than either mainline or your previous
rev whilst running SDET:

SDET 128  (see disclaimer)
                           Throughput    Std. Dev
              2.6.0-test4       100.0%         0.3%
         2.6.0-test4-nick       102.9%         0.3%
       2.6.0-test4-nick7a       105.1%         0.5%

But kernbench is significantly slower. The increase in sys time has 
dropped from last time, but user time is up.

Kernbench: (make -j vmlinux, maximal tasks)
                              Elapsed      System        User         CPU
              2.6.0-test4       45.87      116.92      571.10     1499.00
         2.6.0-test4-nick       49.37      131.31      611.15     1500.75
       2.6.0-test4-nick7a       49.48      125.95      617.71     1502.00

diffprofile {2.6.0-test4,2.6.0-test4-nick7a}/kernbench/0/profile

     13989     8.6% total
      4402     9.6% default_idle
      3385    14.4% page_remove_rmap
      1093    13.8% __d_lookup
       702     5.0% do_anonymous_page
       613    11.5% __copy_to_user_ll
       613    32.9% atomic_dec_and_lock
       565    40.9% free_hot_cold_page
       322    18.7% buffered_rmqueue
       296     9.4% zap_pte_range
       282    75.6% .text.lock.file_table
       185    11.4% kmem_cache_free
       183     9.8% path_lookup
       164    12.2% link_path_walk
       154    12.7% release_pages
       152    43.1% pgd_ctor
       127    33.0% file_kill
       126    15.8% pte_alloc_one
       123    10.4% file_move
       107    75.4% .text.lock.dcache
...
       -59    -9.5% copy_process
       -94   -22.5% release_task
      -146    -2.3% page_add_rmap
      -352   -24.7% schedule
     -1026   -29.4% __copy_from_user_ll

Not sure why you're beating up on rmap so much more from a scheduler
change.

diffprofile {2.6.0-test4,2.6.0-test4-nick7a}/sdetbench/128/profile
       513    19.1% .text.lock.filemap
       246     2.6% find_get_page
       150     4.4% copy_mm
        86    46.0% try_to_wake_up
        82    24.1% kunmap_high
        76     0.0% sched_fork
        74     0.5% copy_page_range
        67     1.1% do_no_page
        54    17.3% __pagevec_lru_add_active
        53    10.2% radix_tree_lookup
        51     7.4% __wake_up
...
      -101    -8.6% __block_prepare_write
      -105   -65.2% release_blocks
      -108    -4.7% link_path_walk
      -112   -15.2% mmgrab
      -116    -5.5% buffered_rmqueue
      -118    -5.9% path_release
      -119    -2.9% do_wp_page
      -125    -3.9% pte_alloc_one
      -125   -15.6% proc_pid_status
      -127    -5.5% free_hot_cold_page
      -132   -10.2% exit_notify
      -138   -11.7% __read_lock_failed
      -146    -9.7% number
      -154   -28.5% proc_check_root
      -155   -20.9% proc_root_link
      -176   -10.3% d_alloc
      -179   -13.6% task_mem
      -186    -9.8% .text.lock.dcache
      -186    -7.6% proc_pid_stat
      -193   -11.1% ext2_new_inode
      -230    -4.0% kmem_cache_free
      -239    -8.2% .text.lock.dec_and_lock
      -244   -11.5% schedule
      -250   -38.3% __blk_queue_bounce
      -257   -15.0% current_kernel_time
      -307   -17.6% release_task
      -327    -1.8% zap_pte_range
      -338    -7.7% clear_page_tables
      -384   -20.7% lookup_mnt
      -406   -26.5% __find_get_block
      -412   -18.5% follow_mount
      -565    -9.7% path_lookup
      -729   -11.6% atomic_dec_and_lock
      -865   -46.1% grab_block
     -1185   -10.5% __d_lookup
     -2145    -0.5% default_idle
     -2786    -7.0% page_add_rmap
    -12702   -14.2% page_remove_rmap
    -29467    -3.8% total

Again, rmap and dlookup. Very odd. Some sort of locality thing, I guess.

M.



* Re: [PATCH] Nick's scheduler policy v7
  2003-08-26  3:24       ` Martin J. Bligh
@ 2003-08-26  4:04         ` Nick Piggin
  0 siblings, 0 replies; 16+ messages in thread
From: Nick Piggin @ 2003-08-26  4:04 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel

Martin J. Bligh wrote:

>>I didn't miss 5 revisions, I'll just stick to using my internal
>>numbering for releases.
>>
>>This one has a few changes. Children now get a priority boost
>>on fork, and parents retain more priority after forking a child,
>>however exiting CPU hogs will now penalise parents a bit.
>>
>>Timeslice scaling was tweaked a bit. Oh and remember raising X's
>>priority should _help_ interactivity with this patch, and IMO is
>>not an unreasonable thing to be doing.
>>
>>Please test. I'm not getting enough feedback!
>>
>
>Well, it's actually a bit faster than either mainline or your previous
>rev whilst running SDET:
>
>SDET 128  (see disclaimer)
>                           Throughput    Std. Dev
>              2.6.0-test4       100.0%         0.3%
>         2.6.0-test4-nick       102.9%         0.3%
>       2.6.0-test4-nick7a       105.1%         0.5%
>
>But kernbench is significantly slower. The increase in sys time has 
>dropped from last time, but user time is up.
>
>Kernbench: (make -j vmlinux, maximal tasks)
>                              Elapsed      System        User         CPU
>              2.6.0-test4       45.87      116.92      571.10     1499.00
>         2.6.0-test4-nick       49.37      131.31      611.15     1500.75
>       2.6.0-test4-nick7a       49.48      125.95      617.71     1502.00
>

Thanks Martin. OK, so the drop in kernbench is quite likely to be what
I thought - elevated priorities (caused by eg. make waiting for children)
causing timeslices to shrink. As long as it's not a fundamental problem,
it should be possible to tweak this back.

Yeah, I guess the scatter in kernel and user times is probably down to
cache effects.
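
To picture the mechanism being blamed: in this policy the timeslice is
derived from the dynamic priority, so a task that keeps earning a
better priority - like make blocking on its children - keeps being
handed shorter slices, and a parallel build ends up context-switching
more than it needs to. A toy model of that relationship (the constants
are invented for illustration, not the patch's actual scaling):

#include <stdio.h>

#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MIN_SLICE_MS	10
#define MAX_SLICE_MS	200

/*
 * Toy model: the better (numerically lower) the dynamic priority,
 * the shorter the timeslice; CPU hogs and niced tasks drift toward
 * long slices.
 */
static int timeslice_ms(int prio)
{
	int range = MAX_PRIO - 1 - MAX_RT_PRIO;		/* 39 */
	int pos = prio - MAX_RT_PRIO;			/* 0 = most interactive */

	return MIN_SLICE_MS +
		(MAX_SLICE_MS - MIN_SLICE_MS) * pos / range;
}

int main(void)
{
	printf("interactive: %3d ms\n", timeslice_ms(MAX_RT_PRIO));
	printf("middling:    %3d ms\n", timeslice_ms(MAX_RT_PRIO + 20));
	printf("cpu hog:     %3d ms\n", timeslice_ms(MAX_PRIO - 1));
	return 0;
}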





* Re: [PATCH] Nick's scheduler policy v7
  2003-08-25 10:41     ` [PATCH] Nick's scheduler policy v7 Nick Piggin
                         ` (2 preceding siblings ...)
  2003-08-26  3:24       ` Martin J. Bligh
@ 2003-08-26  9:44       ` Yaroslav Rastrigin
  2003-08-27  9:28       ` Mike Galbraith
  4 siblings, 0 replies; 16+ messages in thread
From: Yaroslav Rastrigin @ 2003-08-26  9:44 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel

Hi !
>
> This one has a few changes. Children now get a priority boost
> on fork, and parents retain more priority after forking a child,
> however exiting CPU hogs will now penalise parents a bit.
>
> Timeslice scaling was tweaked a bit. Oh and remember raising X's
> priority should _help_ interactivity with this patch, and IMO is
> not an unreasonable thing to be doing.
>
> Please test. I'm not getting enough feedback!
And here's my report (almost no numbers, everything is purely subjective).
I haven't tested Con's O18.1 yet, so my comparison is against -test4 vanilla.
First (and most subjective) impression - great. Under casual load (Opera
rendering a page and using its motifwrapper to handle Flash (a CPU hog by
itself) + occasional compilation + XMMS using ALSA), everything feels very
smooth _and_ responsive: no XMMS jerks, windows move nicely - X is
definitely not starved - ps ax in an xterm displays its output almost
instantly, and application startup time is also pleasantly low. All in
all, great.
Under unusual load (make -j 4 bzImage + the aforementioned activity) I
noticed one jerk in XMMS (which I wasn't able to reproduce, so this is
acceptable). Application startup time is slightly worse but still usable,
and once started all apps are quite smooth. X, however, is definitely
starved, and that has a great impact on the WM - window movement is jerky
and feels bad. However, renice -20 <X pid> cures this behaviour completely,
without any noticeable penalty on other apps - music keeps playing nicely,
and page rendering is still clean and nice.

Overall - very nice.
I'll stick to your scheduling policy for a while.

System specs:

IBM ThinkPad T21, PIII-800Mhz , 256Mb. 
Linux 2.6.0-test4, APM, ALSA, anticipatory IO scheduling
XFree 4.3.99.9 

P.S. make clean && make -j 4 bzImage completed while I was writing this 
letter, so I'm assuming throughput is also OK for me.

-- 
With all the best, yarick at relex dot ru.



* Re: [PATCH] Nick's scheduler policy v7
  2003-08-25 10:41     ` [PATCH] Nick's scheduler policy v7 Nick Piggin
                         ` (3 preceding siblings ...)
  2003-08-26  9:44       ` Yaroslav Rastrigin
@ 2003-08-27  9:28       ` Mike Galbraith
  4 siblings, 0 replies; 16+ messages in thread
From: Mike Galbraith @ 2003-08-27  9:28 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1230 bytes --]

At 08:41 PM 8/25/2003 +1000, Nick Piggin wrote:
>Hi,

Greetings,

>I didn't miss 5 revisions, I'll just stick to using my internal
>numbering for releases.
>
>This one has a few changes. Children now get a priority boost
>on fork, and parents retain more priority after forking a child,
>however exiting CPU hogs will now penalise parents a bit.
>
>Timeslice scaling was tweaked a bit. Oh and remember raising X's
>priority should _help_ interactivity with this patch, and IMO is
>not an unreasonable thing to be doing.
>
>Please test. I'm not getting enough feedback!

Heavy parallel make throughput is still down a little over 10%, but X 
choppiness is markedly improved.  Test-starve is now working.  One thing 
that I noticed is that irman takes quite a bit longer to complete than with 
stock.  I've attached the results of that, plus some contest numbers.  My 
local variant of contest has a couple of differences to stock: dbench 
doesn't run so many instances, and list_load is just a small tree being 
md5summed.  There are two additional loads as well.  ab_load is an ancient 
apache bench jabbering with an also ancient apache (boring static page via 
localhost)... you can guess what irman2_load is :)
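
For reference, a typical apache bench invocation of that sort (the
flags here are assumed for illustration, not lifted from the actual
load script) would be:

    ab -n 100000 -c 10 http://localhost/index.html

i.e. 100000 requests at concurrency 10 against a static page on
localhost.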

         -Mike

[-- Attachment #2: xx --]
[-- Type: application/octet-stream, Size: 5274 bytes --]

[root]:# time ./irman
Response time measurements (milliseconds) for: 2.6.0-test4.v7
      Load       Max       Min       Avg Std. Dev.
      NULL     0.287     0.010     0.011     0.002
    MEMORY   170.052     0.010     0.049     0.534
   FILE_IO   220.056     0.010     0.090     1.013
   PROCESS   553.947     0.010     0.083     4.084

real    7m58.026s
user    0m43.192s
sys     0m24.150s

[root]:# time ./irman
Response time measurements (milliseconds) for: 2.6.0-test4
      Load       Max       Min       Avg Std. Dev.
      NULL     0.426     0.009     0.011     0.002
    MEMORY   111.117     0.009     0.020     0.981
   FILE_IO   306.099     0.009     0.029     2.410
   PROCESS   303.476     0.010     0.032     0.817

real    2m15.407s
user    0m24.167s
sys     0m21.514s

no_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	153	94.8	0.0	0.0	1.00
2.5.70                1	153	94.1	0.0	0.0	1.00
2.6.0-test1           1	153	94.1	0.0	0.0	1.00
2.6.0-test1-mm1       1	152	94.7	0.0	0.0	1.00
2.6.0-test4           1	153	94.8	0.0	0.0	1.00
2.6.0-test4.v7        1	156	94.9	0.0	0.0	1.00
cacherun:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	146	98.6	0.0	0.0	0.95
2.5.70                1	146	98.6	0.0	0.0	0.95
2.6.0-test1           1	146	98.6	0.0	0.0	0.95
2.6.0-test1-mm1       1	146	98.6	0.0	0.0	0.96
2.6.0-test4           1	147	98.0	0.0	0.0	0.96
2.6.0-test4.v7        1	149	99.3	0.0	0.0	0.96
process_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	331	43.8	90.0	55.3	2.16
2.5.70                1	199	72.4	27.0	25.5	1.30
2.6.0-test1           1	264	54.5	61.0	44.3	1.73
2.6.0-test1-mm1       1	323	44.9	88.0	54.2	2.12
2.6.0-test4           1	199	72.9	26.0	26.1	1.30
2.6.0-test4.v7        1	197	76.1	22.0	22.3	1.26
ctar_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	190	77.9	0.0	0.0	1.24
2.5.70                1	186	80.1	0.0	0.0	1.22
2.6.0-test1           1	213	70.4	0.0	0.0	1.39
2.6.0-test1-mm1       1	207	72.5	0.0	0.0	1.36
2.6.0-test4           1	201	74.6	0.0	0.0	1.31
2.6.0-test4.v7        1	208	73.6	0.0	0.0	1.33
xtar_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	196	75.0	0.0	3.1	1.28
2.5.70                1	195	75.9	0.0	3.1	1.27
2.6.0-test1           1	193	76.7	1.0	4.1	1.26
2.6.0-test1-mm1       1	195	75.9	1.0	4.1	1.28
2.6.0-test4           1	193	76.7	1.0	5.2	1.26
2.6.0-test4.v7        1	190	79.5	0.0	4.2	1.22
io_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	437	34.6	69.1	15.1	2.86
2.5.70                1	401	37.7	72.3	17.4	2.62
2.6.0-test1           1	243	61.3	48.1	17.3	1.59
2.6.0-test1-mm1       1	336	44.9	64.5	17.3	2.21
2.6.0-test4           1	231	64.5	38.2	16.0	1.51
2.6.0-test4.v7        1	225	68.9	38.8	16.0	1.44
io_other:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	387	38.8	69.3	17.3	2.53
2.5.70                1	422	36.0	75.3	17.1	2.76
2.6.0-test1           1	258	57.8	55.8	19.0	1.69
2.6.0-test1-mm1       1	330	45.5	63.2	17.2	2.17
2.6.0-test4           1	232	64.7	36.9	15.3	1.52
2.6.0-test4.v7        1	227	67.8	38.6	15.4	1.46
read_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	221	67.0	9.4	3.6	1.44
2.5.70                1	220	67.3	9.4	3.6	1.44
2.6.0-test1           1	200	74.0	9.7	4.0	1.31
2.6.0-test1-mm1       1	190	78.4	9.2	4.2	1.25
2.6.0-test4           1	204	73.0	8.1	3.4	1.33
2.6.0-test4.v7        1	200	76.0	8.6	3.5	1.28
list_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	203	71.4	99.0	20.2	1.33
2.5.70                1	205	70.7	104.0	20.5	1.34
2.6.0-test1           1	199	72.4	102.0	21.6	1.30
2.6.0-test1-mm1       1	193	75.1	91.0	19.7	1.27
2.6.0-test1-mm1X      1	194	74.7	91.0	19.6	1.27
2.6.0-test1X          1	201	71.6	104.0	21.4	1.31
2.6.0-test4           1	201	72.1	103.0	21.4	1.31
2.6.0-test4.v7        1	176	84.7	32.0	8.0	1.13
mem_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	256	57.8	34.0	1.2	1.67
2.5.70                1	252	58.7	33.0	1.2	1.65
2.6.0-test1           1	309	48.2	75.0	2.3	2.02
2.6.0-test1-mm1       1	277	53.8	38.0	1.4	1.82
2.6.0-test4           1	427	35.1	88.0	1.9	2.79
2.6.0-test4.v7        1	296	51.7	75.0	2.4	1.90
dbench_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	517	28.8	5.0	35.6	3.38
2.5.70                1	424	35.1	3.0	26.7	2.77
2.6.0-test1           1	347	42.7	4.0	39.5	2.27
2.6.0-test1-mm1       1	377	39.8	4.0	36.9	2.48
2.6.0-test4           1	444	33.6	5.0	44.4	2.90
2.6.0-test4.v7        1	259	59.5	1.0	13.5	1.66
ab_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.5.69                1	300	48.3	46.0	21.7	1.96
2.5.70                1	300	48.0	46.0	22.0	1.96
2.6.0-test1           1	281	51.6	50.0	25.6	1.84
2.6.0-test1-mm1       1	229	63.3	30.0	18.3	1.51
2.6.0-test4           1	318	45.6	51.0	23.0	2.08
2.6.0-test4.v7        1	230	65.2	22.0	9.6	1.47
irman2_load:
Kernel           [runs]	Time	CPU%	Loads	LCPU%	Ratio
2.6.0-test1           1	999	14.5	31.0	0.0	6.53
2.6.0-test1-mm1       1	210	69.0	6.0	0.0	1.38
2.6.0-test4           1	1001	14.5	31.0	0.0	6.54
2.6.0-test4.v7        1	364	40.9	11.0	0.0	2.33


Thread overview: 16+ messages
2003-08-24 12:35 [PATCH] Nick's scheduler policy Nick Piggin
2003-08-24 14:29 ` Felipe Alfaro Solana
2003-08-25  3:05   ` Nick Piggin
2003-08-25 22:30   ` Bill Davidsen
2003-08-24 16:55 ` Martin J. Bligh
2003-08-25  3:00   ` Nick Piggin
2003-08-25 10:41     ` [PATCH] Nick's scheduler policy v7 Nick Piggin
2003-08-25 11:03       ` Felipe Alfaro Solana
2003-08-25 14:36       ` Måns Rullgård
2003-08-26  3:24       ` Martin J. Bligh
2003-08-26  4:04         ` Nick Piggin
2003-08-26  9:44       ` Yaroslav Rastrigin
2003-08-27  9:28       ` Mike Galbraith
2003-08-25  3:27 ` [PATCH] Nick's scheduler policy Randy.Dunlap
2003-08-25  3:36   ` Nick Piggin
2003-08-26  3:16     ` Mike Fedyk
