* [patch] "HT scheduler", sched-2.5.63-B3
@ 2003-02-28  9:50 Ingo Molnar
  2003-02-28 21:12 ` Andrew Morton
  2003-03-06 10:16 ` [patch] "HT scheduler", sched-2.5.63-B3 Chris Wedgwood
  0 siblings, 2 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-02-28  9:50 UTC (permalink / raw)
  To: linux-kernel; +Cc: Linus Torvalds, Andrew Morton


this is the latest HT scheduler patch, against 2.5.63-BK. Changes in -B3,
relative to the -A3 patch:

 - fix bootup hang caused by an SMP init race. [the race only triggered 
   when not using serial logging - ugh.]

 - fix bootup hang on non-HT SMP boxes compiled with HT-scheduling
   support; the active balancer's loop did not handle this case correctly.

 - fix over-eager active-balancing due to over-inflated nr_running value 
   when active-balancing via the migration thread.

This patch has been test-compiled and test-booted on an HT, an SMP and a UP
box; everything works.

	Ingo

--- linux/include/linux/sched.h.orig	
+++ linux/include/linux/sched.h	
@@ -147,6 +147,24 @@ typedef struct task_struct task_t;
 extern void sched_init(void);
 extern void init_idle(task_t *idle, int cpu);
 
+/*
+ * Is there a way to do this via Kconfig?
+ */
+#if CONFIG_NR_SIBLINGS_2
+# define CONFIG_NR_SIBLINGS 2
+#elif CONFIG_NR_SIBLINGS_4
+# define CONFIG_NR_SIBLINGS 4
+#else
+# define CONFIG_NR_SIBLINGS 0
+#endif
+
+#if CONFIG_NR_SIBLINGS
+# define CONFIG_SHARE_RUNQUEUE 1
+#else
+# define CONFIG_SHARE_RUNQUEUE 0
+#endif
+extern void sched_map_runqueue(int cpu1, int cpu2);
+
 extern void show_state(void);
 extern void show_trace(unsigned long *stack);
 extern void show_stack(unsigned long *stack);
@@ -328,7 +346,7 @@ struct task_struct {
 	prio_array_t *array;
 
 	unsigned long sleep_avg;
-	unsigned long sleep_timestamp;
+	unsigned long last_run;
 
 	unsigned long policy;
 	unsigned long cpus_allowed;
@@ -803,11 +821,6 @@ static inline unsigned int task_cpu(stru
 	return p->thread_info->cpu;
 }
 
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-	p->thread_info->cpu = cpu;
-}
-
 #else
 
 static inline unsigned int task_cpu(struct task_struct *p)
@@ -815,10 +828,6 @@ static inline unsigned int task_cpu(stru
 	return 0;
 }
 
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-}
-
 #endif /* CONFIG_SMP */
 
 #endif /* __KERNEL__ */
--- linux/include/asm-i386/apic.h.orig	
+++ linux/include/asm-i386/apic.h	
@@ -98,4 +98,6 @@ extern unsigned int nmi_watchdog;
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+extern int phys_proc_id[NR_CPUS];
+
 #endif /* __ASM_APIC_H */
--- linux/arch/i386/kernel/cpu/proc.c.orig	
+++ linux/arch/i386/kernel/cpu/proc.c	
@@ -1,4 +1,5 @@
 #include <linux/smp.h>
+#include <linux/sched.h>
 #include <linux/timex.h>
 #include <linux/string.h>
 #include <asm/semaphore.h>
@@ -101,6 +102,13 @@ static int show_cpuinfo(struct seq_file 
 		     fpu_exception ? "yes" : "no",
 		     c->cpuid_level,
 		     c->wp_works_ok ? "yes" : "no");
+#if CONFIG_SHARE_RUNQUEUE
+{
+	extern long __rq_idx[NR_CPUS];
+
+	seq_printf(m, "\nrunqueue\t: %ld\n", __rq_idx[n]);
+}
+#endif
 
 	for ( i = 0 ; i < 32*NCAPINTS ; i++ )
 		if ( test_bit(i, c->x86_capability) &&
--- linux/arch/i386/kernel/smpboot.c.orig	
+++ linux/arch/i386/kernel/smpboot.c	
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 
 #include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/kernel_stat.h>
 #include <linux/smp_lock.h>
 #include <linux/irq.h>
@@ -939,6 +940,16 @@ void *xquad_portio;
 
 int cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 
+static int test_ht;
+
+static int __init ht_setup(char *str)
+{
+	test_ht = 1;
+	return 1;
+}
+
+__setup("test_ht", ht_setup);
+
 static void __init smp_boot_cpus(unsigned int max_cpus)
 {
 	int apicid, cpu, bit;
@@ -1079,7 +1090,20 @@ static void __init smp_boot_cpus(unsigne
 	Dprintk("Boot done.\n");
 
 	/*
-	 * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so
+	 * Here we can be sure that there is an IO-APIC in the system. Let's
+	 * go and set it up:
+	 */
+	smpboot_setup_io_apic();
+
+	setup_boot_APIC_clock();
+
+	/*
+	 * Synchronize the TSC with the AP
+	 */
+	if (cpu_has_tsc && cpucount)
+		synchronize_tsc_bp();
+	/*
+	 * If Hyper-Threading is available, construct cpu_sibling_map[], so
 	 * that we can tell the sibling CPU efficiently.
 	 */
 	if (cpu_has_ht && smp_num_siblings > 1) {
@@ -1088,7 +1112,8 @@ static void __init smp_boot_cpus(unsigne
 		
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
 			int 	i;
-			if (!test_bit(cpu, &cpu_callout_map)) continue;
+			if (!test_bit(cpu, &cpu_callout_map))
+				continue;
 
 			for (i = 0; i < NR_CPUS; i++) {
 				if (i == cpu || !test_bit(i, &cpu_callout_map))
@@ -1104,17 +1129,41 @@ static void __init smp_boot_cpus(unsigne
 				printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
 			}
 		}
-	}
-
-	smpboot_setup_io_apic();
-
-	setup_boot_APIC_clock();
+#if CONFIG_SHARE_RUNQUEUE
+		/*
+		 * At this point APs would have synchronised TSC and
+		 * waiting for smp_commenced, with their APIC timer
+		 * disabled. So BP can go ahead do some initialization
+		 * for Hyper-Threading (if needed).
+		 */
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			int i;
+			if (!test_bit(cpu, &cpu_callout_map))
+				continue;
+			for (i = 0; i < NR_CPUS; i++) {
+				if (i <= cpu)
+					continue;
+				if (!test_bit(i, &cpu_callout_map))
+					continue;
 
-	/*
-	 * Synchronize the TSC with the AP
-	 */
-	if (cpu_has_tsc && cpucount)
-		synchronize_tsc_bp();
+				if (phys_proc_id[cpu] != phys_proc_id[i])
+					continue;
+				/*
+				 * merge runqueues, resulting in one
+				 * runqueue per package:
+				 */
+				sched_map_runqueue(cpu, i);
+				break;
+			}
+		}
+#endif
+	}
+#if CONFIG_SHARE_RUNQUEUE
+	if (smp_num_siblings == 1 && test_ht) {
+		printk("Simulating shared runqueues - mapping CPU#1's runqueue to CPU#0's runqueue.\n");
+		sched_map_runqueue(0, 1);
+	}
+#endif
 }
 
 /* These are wrappers to interface to the new boot process.  Someone
--- linux/arch/i386/Kconfig.orig	
+++ linux/arch/i386/Kconfig	
@@ -422,6 +422,24 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
+choice
+
+	prompt "Hyperthreading Support"
+	depends on SMP
+	default NR_SIBLINGS_0
+
+config NR_SIBLINGS_0
+	bool "off"
+
+config NR_SIBLINGS_2
+	bool "2 siblings"
+
+config NR_SIBLINGS_4
+	bool "4 siblings"
+
+endchoice
+
+
 config PREEMPT
 	bool "Preemptible Kernel"
 	help
--- linux/kernel/fork.c.orig	
+++ linux/kernel/fork.c	
@@ -916,7 +916,7 @@ static struct task_struct *copy_process(
 	 */
 	p->first_time_slice = 1;
 	current->time_slice >>= 1;
-	p->sleep_timestamp = jiffies;
+	p->last_run = jiffies;
 	if (!current->time_slice) {
 		/*
 	 	 * This case is rare, it happens when the parent has only
--- linux/kernel/sched.c.orig	
+++ linux/kernel/sched.c	
@@ -67,6 +67,7 @@
 #define INTERACTIVE_DELTA	2
 #define MAX_SLEEP_AVG		(2*HZ)
 #define STARVATION_LIMIT	(2*HZ)
+#define AGRESSIVE_IDLE_STEAL	1
 #define NODE_THRESHOLD          125
 
 /*
@@ -141,6 +142,48 @@ struct prio_array {
 };
 
 /*
+ * It's possible for two CPUs to share the same runqueue.
+ * This makes sense if they eg. share caches.
+ *
+ * We take the common 1:1 (SMP, UP) case and optimize it,
+ * the rest goes via remapping: rq_idx(cpu) gives the
+ * runqueue on which a particular cpu is on, cpu_idx(cpu)
+ * gives the rq-specific index of the cpu.
+ *
+ * (Note that the generic scheduler code does not impose any
+ *  restrictions on the mappings - there can be 4 CPUs per
+ *  runqueue or even assymetric mappings.)
+ */
+#if CONFIG_SHARE_RUNQUEUE
+# define MAX_NR_SIBLINGS CONFIG_NR_SIBLINGS
+  long __rq_idx[NR_CPUS] __cacheline_aligned;
+  static long __cpu_idx[NR_CPUS] __cacheline_aligned;
+# define rq_idx(cpu) (__rq_idx[(cpu)])
+# define cpu_idx(cpu) (__cpu_idx[(cpu)])
+# define for_each_sibling(idx, rq) \
+		for ((idx) = 0; (idx) < (rq)->nr_cpus; (idx)++)
+# define rq_nr_cpus(rq) ((rq)->nr_cpus)
+# define cpu_active_balance(c) (cpu_rq(c)->cpu[0].active_balance)
+#else
+# define MAX_NR_SIBLINGS 1
+# define rq_idx(cpu) (cpu)
+# define cpu_idx(cpu) 0
+# define for_each_sibling(idx, rq) while (0)
+# define cpu_active_balance(c) 0
+# define do_active_balance(rq, cpu) do { } while (0)
+# define rq_nr_cpus(rq) 1
+  static inline void active_load_balance(runqueue_t *rq, int this_cpu) { }
+#endif
+
+typedef struct cpu_s {
+	task_t *curr, *idle;
+	task_t *migration_thread;
+	struct list_head migration_queue;
+	int active_balance;
+	int cpu;
+} cpu_t;
+
+/*
  * This is the main, per-CPU runqueue data structure.
  *
  * Locking rule: those places that want to lock multiple runqueues
@@ -151,7 +194,6 @@ struct runqueue {
 	spinlock_t lock;
 	unsigned long nr_running, nr_switches, expired_timestamp,
 			nr_uninterruptible;
-	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_nr_running[NR_CPUS];
@@ -160,27 +202,39 @@ struct runqueue {
 	unsigned int nr_balanced;
 	int prev_node_load[MAX_NUMNODES];
 #endif
-	task_t *migration_thread;
-	struct list_head migration_queue;
+	int nr_cpus;
+	cpu_t cpu[MAX_NR_SIBLINGS];
 
 	atomic_t nr_iowait;
 } ____cacheline_aligned;
 
 static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
 
-#define cpu_rq(cpu)		(runqueues + (cpu))
+#define cpu_rq(cpu)		(runqueues + (rq_idx(cpu)))
+#define cpu_int(c)		((cpu_rq(c))->cpu + cpu_idx(c))
+#define cpu_curr_ptr(cpu)	(cpu_int(cpu)->curr)
+#define cpu_idle_ptr(cpu)	(cpu_int(cpu)->idle)
+
 #define this_rq()		cpu_rq(smp_processor_id())
 #define task_rq(p)		cpu_rq(task_cpu(p))
-#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define rt_task(p)		((p)->prio < MAX_RT_PRIO)
 
+#define migration_thread(cpu)	(cpu_int(cpu)->migration_thread)
+#define migration_queue(cpu)	(&cpu_int(cpu)->migration_queue)
+
+#if NR_CPUS > 1
+# define task_allowed(p, cpu)	((p)->cpus_allowed & (1UL << (cpu)))
+#else
+# define task_allowed(p, cpu)	1
+#endif
+
 /*
  * Default context-switch locking:
  */
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(rq, next)	do { } while(0)
 # define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)		((rq)->curr == (p))
+# define task_running(p)		(cpu_curr_ptr(task_cpu(p)) == (p))
 #endif
 
 #ifdef CONFIG_NUMA
@@ -323,16 +377,21 @@ static inline int effective_prio(task_t 
  * Also update all the scheduling statistics stuff. (sleep average
  * calculation, priority modifiers, etc.)
  */
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task(p, rq->active);
+	nr_running_inc(rq);
+}
+
 static inline void activate_task(task_t *p, runqueue_t *rq)
 {
-	unsigned long sleep_time = jiffies - p->sleep_timestamp;
-	prio_array_t *array = rq->active;
+	unsigned long sleep_time = jiffies - p->last_run;
 
 	if (!rt_task(p) && sleep_time) {
 		/*
 		 * This code gives a bonus to interactive tasks. We update
 		 * an 'average sleep time' value here, based on
-		 * sleep_timestamp. The more time a task spends sleeping,
+		 * ->last_run. The more time a task spends sleeping,
 		 * the higher the average gets - and the higher the priority
 		 * boost gets as well.
 		 */
@@ -341,8 +400,7 @@ static inline void activate_task(task_t 
 			p->sleep_avg = MAX_SLEEP_AVG;
 		p->prio = effective_prio(p);
 	}
-	enqueue_task(p, array);
-	nr_running_inc(rq);
+	__activate_task(p, rq);
 }
 
 /*
@@ -383,8 +441,18 @@ static inline void resched_task(task_t *
 #endif
 }
 
+static inline void resched_cpu(int cpu)
+{
+	resched_task(cpu_curr_ptr(cpu));
+}
+
 #ifdef CONFIG_SMP
 
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+		p->thread_info->cpu = cpu;
+}
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -402,7 +470,7 @@ void wait_task_inactive(task_t * p)
 repeat:
 	preempt_disable();
 	rq = task_rq(p);
-	if (unlikely(task_running(rq, p))) {
+	if (unlikely(task_running(p))) {
 		cpu_relax();
 		/*
 		 * enable/disable preemption just to make this
@@ -413,7 +481,7 @@ repeat:
 		goto repeat;
 	}
 	rq = task_rq_lock(p, &flags);
-	if (unlikely(task_running(rq, p))) {
+	if (unlikely(task_running(p))) {
 		task_rq_unlock(rq, &flags);
 		preempt_enable();
 		goto repeat;
@@ -421,6 +489,13 @@ repeat:
 	task_rq_unlock(rq, &flags);
 	preempt_enable();
 }
+
+#else
+
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+}
+
 #endif
 
 /*
@@ -435,10 +510,39 @@ repeat:
  */
 void kick_if_running(task_t * p)
 {
-	if ((task_running(task_rq(p), p)) && (task_cpu(p) != smp_processor_id()))
+	if ((task_running(p)) && (task_cpu(p) != smp_processor_id()))
 		resched_task(p);
 }
 
+static void wake_up_cpu(runqueue_t *rq, int cpu, task_t *p)
+{
+	cpu_t *curr_cpu;
+	task_t *curr;
+	int idx;
+
+	if (idle_cpu(cpu))
+		return resched_cpu(cpu);
+
+	for_each_sibling(idx, rq) {
+		curr_cpu = rq->cpu + idx;
+		if (!task_allowed(p, curr_cpu->cpu))
+			continue;
+		if (curr_cpu->idle == curr_cpu->curr)
+			return resched_cpu(curr_cpu->cpu);
+	}
+
+	if (p->prio < cpu_curr_ptr(cpu)->prio)
+		return resched_task(cpu_curr_ptr(cpu));
+
+	for_each_sibling(idx, rq) {
+		curr_cpu = rq->cpu + idx;
+		if (!task_allowed(p, curr_cpu->cpu))
+			continue;
+		curr = curr_cpu->curr;
+		if (p->prio < curr->prio)
+			return resched_task(curr);
+	}
+}
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -460,6 +564,7 @@ static int try_to_wake_up(task_t * p, un
 	long old_state;
 	runqueue_t *rq;
 
+	BUG_ON(!p);
 repeat_lock_task:
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
@@ -469,7 +574,7 @@ repeat_lock_task:
 			 * Fast-migrate the task if it's not running or runnable
 			 * currently. Do not violate hard affinity.
 			 */
-			if (unlikely(sync && !task_running(rq, p) &&
+			if (unlikely(sync && !task_running(p) &&
 				(task_cpu(p) != smp_processor_id()) &&
 				(p->cpus_allowed & (1UL << smp_processor_id())))) {
 
@@ -479,10 +584,12 @@ repeat_lock_task:
 			}
 			if (old_state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible--;
-			activate_task(p, rq);
-	
-			if (p->prio < rq->curr->prio)
-				resched_task(rq->curr);
+			if (sync)
+				 __activate_task(p, rq);
+			else {
+				activate_task(p, rq);
+				wake_up_cpu(rq, task_cpu(p), p);
+			}
 			success = 1;
 		}
 		p->state = TASK_RUNNING;
@@ -605,7 +712,7 @@ static inline task_t * context_switch(ru
 {
 	struct mm_struct *mm = next->mm;
 	struct mm_struct *oldmm = prev->active_mm;
-
+  
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
@@ -637,8 +744,9 @@ unsigned long nr_running(void)
 	unsigned long i, sum = 0;
 
 	for (i = 0; i < NR_CPUS; i++)
-		sum += cpu_rq(i)->nr_running;
-
+		/* Shared runqueues are counted only once. */
+		if (!cpu_idx(i))
+			sum += cpu_rq(i)->nr_running;
 	return sum;
 }
 
@@ -649,7 +757,9 @@ unsigned long nr_uninterruptible(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		if (!cpu_online(i))
 			continue;
-		sum += cpu_rq(i)->nr_uninterruptible;
+		/* Shared runqueues are counted only once. */
+		if (!cpu_idx(i))
+			sum += cpu_rq(i)->nr_uninterruptible;
 	}
 	return sum;
 }
@@ -831,7 +941,23 @@ static inline unsigned long cpus_to_bala
 
 #endif /* CONFIG_NUMA */
 
-#if CONFIG_SMP
+/*
+ * One of the idle_cpu_tick() and busy_cpu_tick() functions will
+ * get called every timer tick, on every CPU. Our balancing action
+ * frequency and balancing agressivity depends on whether the CPU is
+ * idle or not.
+ *
+ * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
+ * systems with HZ=100, every 10 msecs.)
+ */
+#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
+#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
+
+#if !CONFIG_SMP
+
+static inline void load_balance(runqueue_t *rq, int this_cpu, int idle) { }
+
+#else
 
 /*
  * double_lock_balance - lock the busiest runqueue
@@ -947,12 +1073,7 @@ static inline void pull_task(runqueue_t 
 	set_task_cpu(p, this_cpu);
 	nr_running_inc(this_rq);
 	enqueue_task(p, this_rq->active);
-	/*
-	 * Note that idle threads have a prio of MAX_PRIO, for this test
-	 * to be always true for them.
-	 */
-	if (p->prio < this_rq->curr->prio)
-		set_need_resched();
+	wake_up_cpu(this_rq, this_cpu, p);
 }
 
 /*
@@ -963,9 +1084,9 @@ static inline void pull_task(runqueue_t 
  * We call this with the current runqueue locked,
  * irqs disabled.
  */
-static void load_balance(runqueue_t *this_rq, int idle)
+static void load_balance(runqueue_t *this_rq, int this_cpu, int idle)
 {
-	int imbalance, idx, this_cpu = smp_processor_id();
+	int imbalance, idx;
 	runqueue_t *busiest;
 	prio_array_t *array;
 	struct list_head *head, *curr;
@@ -1013,12 +1134,15 @@ skip_queue:
 	 * 1) running (obviously), or
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
+	 *
+	 * (except if we are in idle mode which is a more agressive
+	 *  form of rebalancing.)
 	 */
 
-#define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
-	((jiffies - (p)->sleep_timestamp > cache_decay_ticks) &&	\
-		!task_running(rq, p) &&					\
-			((p)->cpus_allowed & (1UL << (this_cpu))))
+#define CAN_MIGRATE_TASK(p,rq,cpu)		\
+	(((idle && AGRESSIVE_IDLE_STEAL) ||	\
+		(jiffies - (p)->last_run > cache_decay_ticks)) && \
+			!task_running(p) && task_allowed(p, cpu))
 
 	curr = curr->prev;
 
@@ -1041,31 +1165,144 @@ out:
 	;
 }
 
+#if CONFIG_SHARE_RUNQUEUE
+static void active_load_balance(runqueue_t *this_rq, int this_cpu)
+{
+	runqueue_t *rq;
+	int i, idx;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i))
+			continue;
+		rq = cpu_rq(i);
+		if (rq == this_rq)
+			continue;
+ 		/*
+		 * If there's only one CPU mapped to this runqueue
+		 * then there can be no SMT imbalance:
+		 */
+		if (rq->nr_cpus == 1)
+			continue;
+		/*
+		 * Any SMT-specific imbalance?
+		 */
+		for_each_sibling(idx, rq)
+			if (rq->cpu[idx].idle == rq->cpu[idx].curr)
+				continue;
+
+		/*
+		 * At this point it's sure that we have a SMT
+		 * imbalance: this (physical) CPU is idle but
+		 * another CPU has two (or more) tasks running.
+		 *
+		 * We wake up one of the migration threads (it
+		 * doesnt matter which one) and let it fix things up:
+		 */
+		if (!cpu_active_balance(i)) {
+			cpu_active_balance(i) = 1;
+			spin_unlock(&this_rq->lock);
+			if (rq->cpu[0].migration_thread)
+				wake_up_process(rq->cpu[0].migration_thread);
+			spin_lock(&this_rq->lock);
+		}
+	}
+}
+
+static void do_active_balance(runqueue_t *this_rq, int this_cpu)
+{
+	runqueue_t *rq;
+	int i, idx;
+
+	spin_unlock(&this_rq->lock);
+
+	cpu_active_balance(this_cpu) = 0;
+
+	/*
+	 * Is the imbalance still present?
+	 */
+	for_each_sibling(idx, this_rq)
+		if (this_rq->cpu[idx].idle == this_rq->cpu[idx].curr)
+			goto out;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i))
+			continue;
+		rq = cpu_rq(i);
+		if (rq == this_rq)
+			continue;
+
+		/* completely idle CPU? */
+		if (rq->nr_running)
+			continue;
+
+		/*
+		 * At this point it's reasonably sure that we have an
+		 * imbalance. Since we are the migration thread, try to
+	 	 * balance a thread over to the target queue.
+		 */
+		spin_lock(&this_rq->lock);
+		this_rq->nr_running--;
+		spin_unlock(&this_rq->lock);
+
+		spin_lock(&rq->lock);
+		load_balance(rq, i, 1);
+		spin_unlock(&rq->lock);
+
+		spin_lock(&this_rq->lock);
+		this_rq->nr_running++;
+		spin_unlock(&this_rq->lock);
+		goto out;
+	}
+out:
+	spin_lock(&this_rq->lock);
+}
+
 /*
- * One of the idle_cpu_tick() and busy_cpu_tick() functions will
- * get called every timer tick, on every CPU. Our balancing action
- * frequency and balancing agressivity depends on whether the CPU is
- * idle or not.
+ * This routine is called to map a CPU into another CPU's runqueue.
  *
- * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
- * systems with HZ=100, every 10 msecs.)
+ * This must be called during bootup with the merged runqueue having
+ * no tasks.
  */
-#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
-#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
+void sched_map_runqueue(int cpu1, int cpu2)
+{
+	runqueue_t *rq1 = cpu_rq(cpu1);
+	runqueue_t *rq2 = cpu_rq(cpu2);
+	int cpu2_idx_orig = cpu_idx(cpu2), cpu2_idx;
+
+	BUG_ON(rq1 == rq2 || rq2->nr_running || rq_idx(cpu1) != cpu1);
+	/*
+	 * At this point, we dont have anything in the runqueue yet. So,
+	 * there is no need to move processes between the runqueues.
+	 * Only, the idle processes should be combined and accessed
+	 * properly.
+	 */
+	cpu2_idx = rq1->nr_cpus++;
+
+	rq_idx(cpu2) = cpu1;
+	cpu_idx(cpu2) = cpu2_idx;
+	rq1->cpu[cpu2_idx].cpu = cpu2;
+	rq1->cpu[cpu2_idx].idle = rq2->cpu[cpu2_idx_orig].idle;
+	rq1->cpu[cpu2_idx].curr = rq2->cpu[cpu2_idx_orig].curr;
+	INIT_LIST_HEAD(&rq1->cpu[cpu2_idx].migration_queue);
 
-static inline void idle_tick(runqueue_t *rq)
+	/* just to be safe: */
+	rq2->cpu[cpu2_idx_orig].idle = NULL;
+	rq2->cpu[cpu2_idx_orig].curr = NULL;
+}
+#endif
+#endif
+
+DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
+
+static inline void idle_tick(runqueue_t *rq, unsigned long j)
 {
-	if (jiffies % IDLE_REBALANCE_TICK)
+	if (j % IDLE_REBALANCE_TICK)
 		return;
 	spin_lock(&rq->lock);
-	load_balance(rq, 1);
+	load_balance(rq, smp_processor_id(), 1);
 	spin_unlock(&rq->lock);
 }
 
-#endif
-
-DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
-
 /*
  * We place interactive tasks back into the active array, if possible.
  *
@@ -1076,9 +1313,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
  * increasing number of running tasks:
  */
 #define EXPIRED_STARVING(rq) \
-		((rq)->expired_timestamp && \
+		(STARVATION_LIMIT && ((rq)->expired_timestamp && \
 		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1))
+			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
 
 /*
  * This function gets called by the timer code, with HZ frequency.
@@ -1091,12 +1328,13 @@ void scheduler_tick(int user_ticks, int 
 {
 	int cpu = smp_processor_id();
 	runqueue_t *rq = this_rq();
+	unsigned long j = jiffies;
 	task_t *p = current;
 
  	if (rcu_pending(cpu))
  		rcu_check_callbacks(cpu, user_ticks);
 
-	if (p == rq->idle) {
+	if (p == cpu_idle_ptr(cpu)) {
 		/* note: this timer irq context must be accounted for as well */
 		if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
 			kstat_cpu(cpu).cpustat.system += sys_ticks;
@@ -1104,9 +1342,7 @@ void scheduler_tick(int user_ticks, int 
 			kstat_cpu(cpu).cpustat.iowait += sys_ticks;
 		else
 			kstat_cpu(cpu).cpustat.idle += sys_ticks;
-#if CONFIG_SMP
-		idle_tick(rq);
-#endif
+		idle_tick(rq, j);
 		return;
 	}
 	if (TASK_NICE(p) > 0)
@@ -1115,12 +1351,13 @@ void scheduler_tick(int user_ticks, int 
 		kstat_cpu(cpu).cpustat.user += user_ticks;
 	kstat_cpu(cpu).cpustat.system += sys_ticks;
 
+	spin_lock(&rq->lock);
 	/* Task might have expired already, but not scheduled off yet */
 	if (p->array != rq->active) {
 		set_tsk_need_resched(p);
+		spin_unlock(&rq->lock);
 		return;
 	}
-	spin_lock(&rq->lock);
 	if (unlikely(rt_task(p))) {
 		/*
 		 * RR tasks need a special form of timeslice management.
@@ -1156,16 +1393,14 @@ void scheduler_tick(int user_ticks, int 
 
 		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
 			if (!rq->expired_timestamp)
-				rq->expired_timestamp = jiffies;
+				rq->expired_timestamp = j;
 			enqueue_task(p, rq->expired);
 		} else
 			enqueue_task(p, rq->active);
 	}
 out:
-#if CONFIG_SMP
-	if (!(jiffies % BUSY_REBALANCE_TICK))
-		load_balance(rq, 0);
-#endif
+	if (!(j % BUSY_REBALANCE_TICK))
+		load_balance(rq, smp_processor_id(), 0);
 	spin_unlock(&rq->lock);
 }
 
@@ -1176,11 +1411,11 @@ void scheduling_functions_start_here(voi
  */
 asmlinkage void schedule(void)
 {
+	int idx, this_cpu, retry = 0;
+	struct list_head *queue;
 	task_t *prev, *next;
-	runqueue_t *rq;
 	prio_array_t *array;
-	struct list_head *queue;
-	int idx;
+	runqueue_t *rq;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -1193,15 +1428,15 @@ asmlinkage void schedule(void)
 			dump_stack();
 		}
 	}
-
-	check_highmem_ptes();
 need_resched:
+	check_highmem_ptes();
+	this_cpu = smp_processor_id();
 	preempt_disable();
 	prev = current;
 	rq = this_rq();
 
 	release_kernel_lock(prev);
-	prev->sleep_timestamp = jiffies;
+	prev->last_run = jiffies;
 	spin_lock_irq(&rq->lock);
 
 	/*
@@ -1224,12 +1459,14 @@ need_resched:
 	}
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
-#if CONFIG_SMP
-		load_balance(rq, 1);
+		load_balance(rq, this_cpu, 1);
 		if (rq->nr_running)
 			goto pick_next_task;
-#endif
-		next = rq->idle;
+		active_load_balance(rq, this_cpu);
+		if (rq->nr_running)
+			goto pick_next_task;
+pick_idle:
+		next = cpu_idle_ptr(this_cpu);
 		rq->expired_timestamp = 0;
 		goto switch_tasks;
 	}
@@ -1245,18 +1482,49 @@ pick_next_task:
 		rq->expired_timestamp = 0;
 	}
 
+new_array:
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
+	if ((next != prev) && (rq_nr_cpus(rq) > 1)) {
+		struct list_head *tmp = queue->next;
+
+		while ((task_running(next) && (next != prev)) || !task_allowed(next, this_cpu)) {
+			tmp = tmp->next;
+			if (tmp != queue) {
+				next = list_entry(tmp, task_t, run_list);
+				continue;
+			}
+			idx = find_next_bit(array->bitmap, MAX_PRIO, ++idx);
+			if (idx == MAX_PRIO) {
+				if (retry || !rq->expired->nr_active) {
+					goto pick_idle;
+				}
+				/*
+				 * To avoid infinite changing of arrays,
+				 * when we have only tasks runnable by
+				 * sibling.
+				 */
+				retry = 1;
+
+				array = rq->expired;
+				goto new_array;
+			}
+			queue = array->queue + idx;
+			tmp = queue->next;
+			next = list_entry(tmp, task_t, run_list);
+		}
+	}
 
 switch_tasks:
 	prefetch(next);
 	clear_tsk_need_resched(prev);
-	RCU_qsctr(prev->thread_info->cpu)++;
+	RCU_qsctr(task_cpu(prev))++;
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
-		rq->curr = next;
+		cpu_curr_ptr(this_cpu) = next;
+		set_task_cpu(next, this_cpu);
 	
 		prepare_arch_switch(rq, next);
 		prev = context_switch(rq, prev, next);
@@ -1522,9 +1790,8 @@ void set_user_nice(task_t *p, long nice)
 		 * If the task is running and lowered its priority,
 		 * or increased its priority then reschedule its CPU:
 		 */
-		if ((NICE_TO_PRIO(nice) < p->static_prio) ||
-							task_running(rq, p))
-			resched_task(rq->curr);
+		if ((NICE_TO_PRIO(nice) < p->static_prio) || task_running(p))
+			resched_task(cpu_curr_ptr(task_cpu(p)));
 	}
 out_unlock:
 	task_rq_unlock(rq, &flags);
@@ -1602,7 +1869,7 @@ int task_nice(task_t *p)
  */
 int task_curr(task_t *p)
 {
-	return cpu_curr(task_cpu(p)) == p;
+	return cpu_curr_ptr(task_cpu(p)) == p;
 }
 
 /**
@@ -1611,7 +1878,7 @@ int task_curr(task_t *p)
  */
 int idle_cpu(int cpu)
 {
-	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+	return cpu_curr_ptr(cpu) == cpu_idle_ptr(cpu);
 }
 
 /**
@@ -1701,7 +1968,7 @@ static int setscheduler(pid_t pid, int p
 	else
 		p->prio = p->static_prio;
 	if (array)
-		activate_task(p, task_rq(p));
+		__activate_task(p, task_rq(p));
 
 out_unlock:
 	task_rq_unlock(rq, &flags);
@@ -2161,7 +2428,7 @@ void __init init_idle(task_t *idle, int 
 	local_irq_save(flags);
 	double_rq_lock(idle_rq, rq);
 
-	idle_rq->curr = idle_rq->idle = idle;
+	cpu_curr_ptr(cpu) = cpu_idle_ptr(cpu) = idle;
 	deactivate_task(idle, rq);
 	idle->array = NULL;
 	idle->prio = MAX_PRIO;
@@ -2216,6 +2483,7 @@ void set_cpus_allowed(task_t *p, unsigne
 	unsigned long flags;
 	migration_req_t req;
 	runqueue_t *rq;
+	int cpu;
 
 #if 0 /* FIXME: Grab cpu_lock, return error on this case. --RR */
 	new_mask &= cpu_online_map;
@@ -2237,31 +2505,31 @@ void set_cpus_allowed(task_t *p, unsigne
 	 * If the task is not on a runqueue (and not running), then
 	 * it is sufficient to simply update the task's cpu field.
 	 */
-	if (!p->array && !task_running(rq, p)) {
+	if (!p->array && !task_running(p)) {
 		set_task_cpu(p, __ffs(p->cpus_allowed));
 		task_rq_unlock(rq, &flags);
 		return;
 	}
 	init_completion(&req.done);
 	req.task = p;
-	list_add(&req.list, &rq->migration_queue);
+	cpu = task_cpu(p);
+	list_add(&req.list, migration_queue(cpu));
 	task_rq_unlock(rq, &flags);
-
-	wake_up_process(rq->migration_thread);
+	wake_up_process(migration_thread(cpu));
 
 	wait_for_completion(&req.done);
 }
 
 /*
- * migration_thread - this is a highprio system thread that performs
+ * migration_task - this is a highprio system thread that performs
  * thread migration by 'pulling' threads into the target runqueue.
  */
-static int migration_thread(void * data)
+static int migration_task(void * data)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	int cpu = (long) data;
 	runqueue_t *rq;
-	int ret;
+	int ret, idx;
 
 	daemonize("migration/%d", cpu);
 	set_fs(KERNEL_DS);
@@ -2275,7 +2543,8 @@ static int migration_thread(void * data)
 	ret = setscheduler(0, SCHED_FIFO, &param);
 
 	rq = this_rq();
-	rq->migration_thread = current;
+	migration_thread(cpu) = current;
+	idx = cpu_idx(cpu);
 
 	for (;;) {
 		runqueue_t *rq_src, *rq_dest;
@@ -2286,8 +2555,10 @@ static int migration_thread(void * data)
 		task_t *p;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		head = &rq->migration_queue;
-		current->state = TASK_INTERRUPTIBLE;
+		if (cpu_active_balance(cpu))
+			do_active_balance(rq, cpu);
+		head = migration_queue(cpu);
+		current->state = TASK_UNINTERRUPTIBLE;
 		if (list_empty(head)) {
 			spin_unlock_irqrestore(&rq->lock, flags);
 			schedule();
@@ -2315,9 +2586,8 @@ repeat:
 			set_task_cpu(p, cpu_dest);
 			if (p->array) {
 				deactivate_task(p, rq_src);
-				activate_task(p, rq_dest);
-				if (p->prio < rq_dest->curr->prio)
-					resched_task(rq_dest->curr);
+				__activate_task(p, rq_dest);
+				wake_up_cpu(rq_dest, cpu_dest, p);
 			}
 		}
 		double_rq_unlock(rq_src, rq_dest);
@@ -2335,12 +2605,13 @@ static int migration_call(struct notifie
 			  unsigned long action,
 			  void *hcpu)
 {
+	long cpu = (long) hcpu;
+
 	switch (action) {
 	case CPU_ONLINE:
-		printk("Starting migration thread for cpu %li\n",
-		       (long)hcpu);
-		kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
-		while (!cpu_rq((long)hcpu)->migration_thread)
+		printk("Starting migration thread for cpu %li\n", cpu);
+		kernel_thread(migration_task, hcpu, CLONE_KERNEL);
+		while (!migration_thread(cpu))
 			yield();
 		break;
 	}
@@ -2415,11 +2686,20 @@ void __init sched_init(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		prio_array_t *array;
 
+		/*
+		 * Start with a 1:1 mapping between CPUs and runqueues:
+		 */
+#if CONFIG_SHARE_RUNQUEUE
+		rq_idx(i) = i;
+		cpu_idx(i) = 0;
+#endif
 		rq = cpu_rq(i);
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		spin_lock_init(&rq->lock);
-		INIT_LIST_HEAD(&rq->migration_queue);
+		INIT_LIST_HEAD(migration_queue(i));
+		rq->nr_cpus = 1;
+		rq->cpu[cpu_idx(i)].cpu = i;
 		atomic_set(&rq->nr_iowait, 0);
 		nr_running_init(rq);
 
@@ -2437,9 +2717,9 @@ void __init sched_init(void)
 	 * We have to do a little magic to get the first
 	 * thread right in SMP mode.
 	 */
-	rq = this_rq();
-	rq->curr = current;
-	rq->idle = current;
+	cpu_curr_ptr(smp_processor_id()) = current;
+	cpu_idle_ptr(smp_processor_id()) = current;
+
 	set_task_cpu(current, smp_processor_id());
 	wake_up_forked_process(current);
 
--- linux/init/main.c.orig	
+++ linux/init/main.c	
@@ -354,7 +354,14 @@ static void __init smp_init(void)
 
 static void rest_init(void)
 {
+	/* 
+	 *	We count on the initial thread going ok 
+	 *	Like idlers init is an unlocked kernel thread, which will
+	 *	make syscalls (and thus be locked).
+	 */
+	init_idle(current, smp_processor_id());
 	kernel_thread(init, NULL, CLONE_KERNEL);
+
 	unlock_kernel();
  	cpu_idle();
 } 
@@ -438,13 +445,6 @@ asmlinkage void __init start_kernel(void
 	check_bugs();
 	printk("POSIX conformance testing by UNIFIX\n");
 
-	/* 
-	 *	We count on the initial thread going ok 
-	 *	Like idlers init is an unlocked kernel thread, which will
-	 *	make syscalls (and thus be locked).
-	 */
-	init_idle(current, smp_processor_id());
-
 	/* Do the rest non-__init'ed, we're now alive */
 	rest_init();
 }



* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-02-28  9:50 [patch] "HT scheduler", sched-2.5.63-B3 Ingo Molnar
@ 2003-02-28 21:12 ` Andrew Morton
  2003-02-28 21:23   ` Robert Love
  2003-03-06 10:16 ` [patch] "HT scheduler", sched-2.5.63-B3 Chris Wedgwood
  1 sibling, 1 reply; 100+ messages in thread
From: Andrew Morton @ 2003-02-28 21:12 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, torvalds

Ingo Molnar <mingo@elte.hu> wrote:
>
> 
> this is the latest HT scheduler patch, against 2.5.63-BK.

It's looking pretty good, Ingo.

- The "tbench starves everything else" problem is fixed.

- The mm_struct leak is fixed

- As far as I can tell, the scheduler update no longer causes the
  large "contest io_load" changes, wherein the streaming write made
  lots more progress at the expense of the kernel compile.

- The longstanding problem wherein a kernel build makes my X desktop
  unusable is 90% fixed - it is still possible to trigger stalls, but
  they are less severe, and you actually have to work at it a bit to
  make them happen.

Thanks.


* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-02-28 21:12 ` Andrew Morton
@ 2003-02-28 21:23   ` Robert Love
  2003-03-01  4:25     ` Andrew Morton
  0 siblings, 1 reply; 100+ messages in thread
From: Robert Love @ 2003-02-28 21:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ingo Molnar, linux-kernel, torvalds

On Fri, 2003-02-28 at 16:12, Andrew Morton wrote:

> - The longstanding problem wherein a kernel build makes my X desktop
>   unusable is 90% fixed - it is still possible to trigger stalls, but
>   they are less severe, and you actually have to work at it a bit to
>   make them happen.

That is odd, because I do not see any changes in here that would improve
interactivity.  This looks to be pretty much purely the HT stuff.

Andrew, if you drop this patch, your X desktop usability drops?

Ingo, is there something in here that I am missing?

	Robert Love



* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-02-28 21:23   ` Robert Love
@ 2003-03-01  4:25     ` Andrew Morton
  2003-03-06  3:20       ` Linus Torvalds
  2003-03-06  6:58       ` [patch] "interactivity changes", sched-2.5.64-A4 Ingo Molnar
  0 siblings, 2 replies; 100+ messages in thread
From: Andrew Morton @ 2003-03-01  4:25 UTC (permalink / raw)
  To: Robert Love; +Cc: mingo, linux-kernel, torvalds

Robert Love <rml@tech9.net> wrote:
>
> On Fri, 2003-02-28 at 16:12, Andrew Morton wrote:
> 
> > - The longstanding problem wherein a kernel build makes my X desktop
> >   unusable is 90% fixed - it is still possible to trigger stalls, but
> >   they are less severe, and you actually have to work at it a bit to
> >   make them happen.
> 
> That is odd, because I do not see any changes in here that would improve
> interactivity.  This looks to be pretty much purely the HT stuff.
> 
> Andrew, if you drop this patch, your X desktop usability drops?

hm, you're right.  It's still really bad.  I forgot that I was using distcc.

And I also forgot that tbench starves everything else only on CONFIG_SMP=n.
That problem remains with us as well.




* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-01  4:25     ` Andrew Morton
@ 2003-03-06  3:20       ` Linus Torvalds
  2003-03-06  7:45         ` Andrew Morton
                           ` (3 more replies)
  2003-03-06  6:58       ` [patch] "interactivity changes", sched-2.5.64-A4 Ingo Molnar
  1 sibling, 4 replies; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06  3:20 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Robert Love, mingo, linux-kernel


On Fri, 28 Feb 2003, Andrew Morton wrote:
> > 
> > Andrew, if you drop this patch, your X desktop usability drops?
> 
> hm, you're right.  It's still really bad.  I forgot that I was using distcc.
> 
> And I also forgot that tbench starves everything else only on CONFIG_SMP=n.
> That problem remains with us as well.

Andrew, I always thought that the scheduler interactivity was bogus, since
it didn't give any bonus to processes that _help_ interactive users
(notably the X server, but it could be other things).

To fix that, some people nice up their X servers, which has its own set of 
problems.

How about something more like this (yeah, untested, but you get the idea): 
the person who wakes up an interactive task gets the interactivity bonus 
if the interactive task is already maxed out. I dunno how well this plays 
with the X server, but assuming most clients use UNIX domain sockets, the 
wake-ups _should_ be synchronous, so it should work well to say "waker 
gets bonus".

This should result in:

 - if X ends up using all of its time to handle clients, obviously X will 
   not count as interactive on its own. HOWEVER, if an xterm or something 
   gets an X event, the fact that the xterm has been idle means that _it_ 
   gets a interactivity boost at wakeup.

 - after a few such boosts (or assuming lots of idleness of xterm), the 
   process that caused the wakeup (ie the X server) will get the 
   "extraneous interactivity".

This all depends on whether the unix domain socket code runs in bottom 
half or process context. If it runs in bottom half context we're screwed, 
I haven't checked.

Does this make any difference for you? I don't know what your load test 
is, and considering that my regular desktop has 4 fast CPUs I doubt I can 
see the effect very clearly anyway ("Awww, poor Linus!")

NOTE! This doesn't help a "chain" of interactive helpers. It could be 
extended to that, by just allowing the waker to "steal" interactivity 
points from a sleeping process, but then we'd need to start being careful 
about fairness and in particular we'd have to disallow this for signal 
handling.

		Linus

----
===== kernel/sched.c 1.161 vs edited =====
--- 1.161/kernel/sched.c	Thu Feb 20 20:33:52 2003
+++ edited/kernel/sched.c	Wed Mar  5 19:09:45 2003
@@ -337,8 +337,15 @@
 		 * boost gets as well.
 		 */
 		p->sleep_avg += sleep_time;
-		if (p->sleep_avg > MAX_SLEEP_AVG)
+		if (p->sleep_avg > MAX_SLEEP_AVG) {
+			int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
 			p->sleep_avg = MAX_SLEEP_AVG;
+			if (ticks > MAX_SLEEP_AVG)
+				ticks = MAX_SLEEP_AVG;
+			if (!in_interrupt())
+				current->sleep_avg = ticks;
+		}
+			
 		p->prio = effective_prio(p);
 	}
 	enqueue_task(p, array);

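To make the transfer concrete, here is a worked user-space restatement of
the hunk above - a sketch under assumed numbers (HZ=1000, so MAX_SLEEP_AVG
is 2*HZ = 2000 ticks; the xterm/X figures are hypothetical):

#include <stdio.h>

#define MAX_SLEEP_AVG 2000	/* 2*HZ, assuming HZ=1000 */

/*
 * The sleep-time surplus that the woken task cannot absorb is
 * credited to the waker (in the kernel: only if !in_interrupt()).
 */
static void waker_bonus(int *woken_avg, int sleep_time, int *waker_avg)
{
	*woken_avg += sleep_time;
	if (*woken_avg > MAX_SLEEP_AVG) {
		int ticks = *woken_avg - MAX_SLEEP_AVG + *waker_avg;

		*woken_avg = MAX_SLEEP_AVG;
		if (ticks > MAX_SLEEP_AVG)
			ticks = MAX_SLEEP_AVG;
		*waker_avg = ticks;
	}
}

int main(void)
{
	int xterm = 1900, x_server = 500;

	/* xterm slept 300 ticks but can only absorb 100 of them ... */
	waker_bonus(&xterm, 300, &x_server);
	/* ... so the X server inherits the other 200: 500 + 200 = 700 */
	printf("xterm=%d X=%d\n", xterm, x_server);	/* xterm=2000 X=700 */
	return 0;
}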


* [patch] "interactivity changes", sched-2.5.64-A4
  2003-03-01  4:25     ` Andrew Morton
  2003-03-06  3:20       ` Linus Torvalds
@ 2003-03-06  6:58       ` Ingo Molnar
  2003-03-06 10:00         ` Andrew Morton
  1 sibling, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06  6:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Robert Love, linux-kernel, Linus Torvalds


Andrew,

> hm, you're right.  It's still really bad.  I forgot that I was using
> distcc.
> 
> And I also forgot that tbench starves everything else only on
> CONFIG_SMP=n. That problem remains with us as well.

well, i took out the interactivity improvements from the 2.5.59-E6
scheduler patch, to keep the pure HT-scheduler bits only, and haven't added
them back since. The two patch-sets (interactivity and HT scheduler)
interact heavily, so i did not post the patch separately, but here it goes
as-is, against 2.5.64 - does it help your interactivity problems on UP
setups?

The patch includes the following items:

 - a new wakeup feature: SMART_WAKE_CHILD (default:1) [this is a reworked 
   version of the wakeup changes from Andrea's tree.]

 - scheduler parameter tunings (CHILD_PENALTY, MAX_TIMESLICE,
   STARVATION_LIMIT, MAX_SLEEP_AVG)

 - activate_task() splitup to separate the interactivity-active variant 
   from the task-queue variant => this fixes our statistics in some cases.

 - finer-grained preemption: if priority is equal then the task with the 
   longer pending timeslice wins.

 - make it possible to disable sync-wakeups via SYNC_WAKEUPS (default:1)

 - a naming cleanup (sleep_timestamp => last_run)

 - fix up STARVATION_LIMIT usage so that a value of 0 is recognized as 'no
   starvation limit'. (a standalone restatement follows below.)
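
On the last item: with the old macro, a STARVATION_LIMIT of 0 would have
meant "force an array switch after a single jiffy"; now it short-circuits to
"no starvation limit at all". Here is a standalone restatement of the new
check - a sketch only, with the rq fields turned into plain parameters:

#include <stdbool.h>

/*
 * expired_starving(): true when tasks in the expired array have waited
 * longer than starvation_limit scaled by the number of runnable tasks.
 * A limit of 0 now disables the forced array switch entirely.
 */
bool expired_starving(unsigned long starvation_limit,
		      unsigned long expired_timestamp,
		      unsigned long now, unsigned long nr_running)
{
	return starvation_limit && expired_timestamp &&
		now - expired_timestamp >= starvation_limit * nr_running + 1;
}

(with HZ=1000, STARVATION_LIMIT = 10*HZ and three runnable tasks, the switch
triggers after 10000*3 + 1 jiffies, i.e. roughly 30 seconds.)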

ideally we'd like to have these piece by piece, but could you just give
this patchset a go, and if it makes a difference then we can separate out
the bits that matter, to pinpoint your problems as exactly as possible.

i've test-booted the patch on UP and SMP as well.

	Ingo

--- linux/include/linux/sched.h.orig	
+++ linux/include/linux/sched.h	
@@ -328,7 +328,7 @@ struct task_struct {
 	prio_array_t *array;
 
 	unsigned long sleep_avg;
-	unsigned long sleep_timestamp;
+	unsigned long last_run;
 
 	unsigned long policy;
 	unsigned long cpus_allowed;
--- linux/kernel/sched.c.orig	
+++ linux/kernel/sched.c	
@@ -54,20 +54,21 @@
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
- * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
- * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
+ * maximum timeslice is 200 msecs. Timeslices get refilled after
  * they expire.
  */
 #define MIN_TIMESLICE		( 10 * HZ / 1000)
-#define MAX_TIMESLICE		(300 * HZ / 1000)
-#define CHILD_PENALTY		95
+#define MAX_TIMESLICE		(200 * HZ / 1000)
+#define CHILD_PENALTY		50
 #define PARENT_PENALTY		100
 #define EXIT_WEIGHT		3
 #define PRIO_BONUS_RATIO	25
 #define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(2*HZ)
-#define STARVATION_LIMIT	(2*HZ)
-#define NODE_THRESHOLD          125
+#define MAX_SLEEP_AVG		(10*HZ)
+#define STARVATION_LIMIT	(10*HZ)
+#define SYNC_WAKEUPS		1
+#define SMART_WAKE_CHILD	1
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -323,16 +324,21 @@ static inline int effective_prio(task_t 
  * Also update all the scheduling statistics stuff. (sleep average
  * calculation, priority modifiers, etc.)
  */
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task(p, rq->active);
+	rq->nr_running++;
+}
+
 static inline void activate_task(task_t *p, runqueue_t *rq)
 {
-	unsigned long sleep_time = jiffies - p->sleep_timestamp;
-	prio_array_t *array = rq->active;
+	unsigned long sleep_time = jiffies - p->last_run;
 
 	if (!rt_task(p) && sleep_time) {
 		/*
 		 * This code gives a bonus to interactive tasks. We update
 		 * an 'average sleep time' value here, based on
-		 * sleep_timestamp. The more time a task spends sleeping,
+		 * ->last_run. The more time a task spends sleeping,
 		 * the higher the average gets - and the higher the priority
 		 * boost gets as well.
 		 */
@@ -341,8 +347,7 @@ static inline void activate_task(task_t 
 			p->sleep_avg = MAX_SLEEP_AVG;
 		p->prio = effective_prio(p);
 	}
-	enqueue_task(p, array);
-	nr_running_inc(rq);
+	__activate_task(p, rq);
 }
 
 /*
@@ -460,6 +465,7 @@ static int try_to_wake_up(task_t * p, un
 	long old_state;
 	runqueue_t *rq;
 
+	sync &= SYNC_WAKEUPS;
 repeat_lock_task:
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
@@ -479,12 +485,17 @@ repeat_lock_task:
 			}
 			if (old_state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible--;
-			activate_task(p, rq);
-	
-			if (p->prio < rq->curr->prio)
-				resched_task(rq->curr);
+			if (sync)
+				__activate_task(p, rq);
+			else {
+				activate_task(p, rq);
+				if (p->prio < rq->curr->prio)
+					resched_task(rq->curr);
+			}
 			success = 1;
 		}
+		if (p->state >= TASK_ZOMBIE)
+			BUG();
 		p->state = TASK_RUNNING;
 	}
 	task_rq_unlock(rq, &flags);
@@ -525,8 +536,19 @@ void wake_up_forked_process(task_t * p)
 		p->prio = effective_prio(p);
 	}
 	set_task_cpu(p, smp_processor_id());
-	activate_task(p, rq);
 
+	if (SMART_WAKE_CHILD) {
+		if (unlikely(!current->array))
+			__activate_task(p, rq);
+		else {
+			p->prio = current->prio;
+			list_add_tail(&p->run_list, &current->run_list);
+			p->array = current->array;
+			p->array->nr_active++;
+			rq->nr_running++;
+		}
+	} else
+		activate_task(p, rq);
 	task_rq_unlock(rq, &flags);
 }
 
@@ -953,6 +975,11 @@ static inline void pull_task(runqueue_t 
 	 */
 	if (p->prio < this_rq->curr->prio)
 		set_need_resched();
+	else {
+		if (p->prio == this_rq->curr->prio &&
+				p->time_slice > this_rq->curr->time_slice)
+			set_need_resched();
+	}
 }
 
 /*
@@ -1016,7 +1043,7 @@ skip_queue:
 	 */
 
 #define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
-	((jiffies - (p)->sleep_timestamp > cache_decay_ticks) &&	\
+	((jiffies - (p)->last_run > cache_decay_ticks) &&	\
 		!task_running(rq, p) &&					\
 			((p)->cpus_allowed & (1UL << (this_cpu))))
 
@@ -1076,9 +1103,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
  * increasing number of running tasks:
  */
 #define EXPIRED_STARVING(rq) \
-		((rq)->expired_timestamp && \
+		(STARVATION_LIMIT && ((rq)->expired_timestamp && \
 		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1))
+			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
 
 /*
  * This function gets called by the timer code, with HZ frequency.
@@ -1201,7 +1228,7 @@ need_resched:
 	rq = this_rq();
 
 	release_kernel_lock(prev);
-	prev->sleep_timestamp = jiffies;
+	prev->last_run = jiffies;
 	spin_lock_irq(&rq->lock);
 
 	/*
@@ -1701,7 +1728,7 @@ static int setscheduler(pid_t pid, int p
 	else
 		p->prio = p->static_prio;
 	if (array)
-		activate_task(p, task_rq(p));
+		__activate_task(p, task_rq(p));
 
 out_unlock:
 	task_rq_unlock(rq, &flags);
--- linux/kernel/fork.c.orig	
+++ linux/kernel/fork.c	
@@ -916,7 +916,7 @@ static struct task_struct *copy_process(
 	 */
 	p->first_time_slice = 1;
 	current->time_slice >>= 1;
-	p->sleep_timestamp = jiffies;
+	p->last_run = jiffies;
 	if (!current->time_slice) {
 		/*
 	 	 * This case is rare, it happens when the parent has only



* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06  3:20       ` Linus Torvalds
@ 2003-03-06  7:45         ` Andrew Morton
  2003-03-06  7:47           ` Ingo Molnar
  2003-03-06 15:37           ` Linus Torvalds
  2003-03-06 17:13         ` Ingo Molnar
                           ` (2 subsequent siblings)
  3 siblings, 2 replies; 100+ messages in thread
From: Andrew Morton @ 2003-03-06  7:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: rml, mingo, linux-kernel

Linus Torvalds <torvalds@transmeta.com> wrote:
>
> 
> On Fri, 28 Feb 2003, Andrew Morton wrote:
> > > 
> > > Andrew, if you drop this patch, your X desktop usability drops?
> > 
> > hm, you're right.  It's still really bad.  I forgot that I was using distcc.
> > 
> > And I also forgot that tbench starves everything else only on CONFIG_SMP=n.
> > That problem remains with us as well.
> 
> Andrew, I always thought that the scheduler interactivity was bogus, since
> it didn't give any bonus to processes that _help_ interactive users
> (notably the X server, but it could be other things).

The current interactivity booster heuristic does appear to work very well - I
did an A/B comparison with 2.4.x a while back, and 2.5 is distinctly better.

But it is a heuristic, and it will inevitably make mistakes.  The problem
which I am observing is that the cost of those mistakes is very high.

I believe that we should recognise that no heuristic will be 100% accurate,
and that we should seek to minimise the impact of those 0.1%-of-the-time
mistakes which it will make.  Perhaps by just dropping the max timeslice??

Let me redescribe the problem:

- Dual 850MHz PIII, 900M of RAM.
- Start a `make -j3 vmlinux', everything in pagecache
- Start using X applications.  Moving a window about is the usual trigger.

Everything goes happily for a while, and then blam.  Windows get stuck for
0.5-1.0 seconds, the mouse pointer gets so laggy that it is uncontrollable,
etc.  The fix is to put your hands in your pockets for 5-10 seconds and wait
for the scheduler to notice that the X server is idle.

Interestingly, this does not happen if the background load is a bunch of
busywaits.  It seems to need the fork&exit load of a compilation to trigger.

Robert was able to reproduce this, and noticed that the scheduler had niced
the X server down as far as it would go.

I'm a pretty pathological case, because I was driving two 1600x1200x24
displays with a crufty old AGP voodoo3 and a $2 PCI nvidia.  Am now using a
presumably less crufty radeon and the problem persists.

But last time I tested Ingo's interactivity patch things were a lot better. 
I shall retest his latest now.



* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06  7:45         ` Andrew Morton
@ 2003-03-06  7:47           ` Ingo Molnar
  2003-03-06 15:07             ` Linus Torvalds
  2003-03-06 15:37           ` Linus Torvalds
  1 sibling, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06  7:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linus Torvalds, rml, linux-kernel


On Wed, 5 Mar 2003, Andrew Morton wrote:

> The current interactivity booster heuristic does appear to work very
> well - I did an A/B comparison with 2.4.x a while back, and 2.5 is
> distinctly better.

> Let me redescribe the problem:
> 
> - Dual 850MHz PIII, 900M of RAM.
> - Start a `make -j3 vmlinux', everything in pagecache
> - Start using X applications.  Moving a window about is the usual trigger.

please do another thing as well: in addition to applying the -A4 patch,
also renice X to -10.
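
(for example, something like: renice -10 -p `pidof X` - assuming the server
process is simply named X; on XFree86 setups the binary may be called
XFree86 instead.)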

> But last time I tested Ingo's interactivity patch things were a lot
> better.  I shall retest his latest now.

please do.

	Ingo



* Re: [patch] "interactivity changes", sched-2.5.64-A4
  2003-03-06  6:58       ` [patch] "interactivity changes", sched-2.5.64-A4 Ingo Molnar
@ 2003-03-06 10:00         ` Andrew Morton
  2003-03-06 10:07           ` [patch] "interactivity changes", sched-2.5.64-A5 Ingo Molnar
  0 siblings, 1 reply; 100+ messages in thread
From: Andrew Morton @ 2003-03-06 10:00 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: rml, linux-kernel, torvalds

Ingo Molnar <mingo@elte.hu> wrote:
>
> 
> well, i took out the interactivity improvements from the 2.5.59-E6
> scheduler patch, to keep the pure HT-scheduler bits only, and havent added
> them back since. The two patch-sets (interactivity, and HT scheduler)
> interact heavily, so i did not post the patch separately, but here it goes
> as-is, against 2.5.64 - does it help your interactivity problems on UP
> setups?

Ah, this is the patch I thought I was testing last time ;) A bit more careful
this time:

- The tbench-starves-everything-on-uniprocessor problem (well, I'd have to
  say it is a bug) is fixed.

  When running a compilation against a `tbench 4' one would expect the
  compilation to be slowed down by maybe 4x.  It seems a little slower than
  that, but it is in that ballpark.  At least the compilation makes _some_
  progress now.

- The large performance shift with `contest io_load' is still there.

  This test times how long it takes to compile a kernel in the presence of
  massive streaming writeout to the same filesystem.

  2.5.64, UP, !preempt, ext3:
	Finished compiling kernel: elapsed: 409 user: 107 system: 11
	Finished io_load: elapsed: 409 user: 2 system: 37 loads: 16810

	Finished compiling kernel: elapsed: 283 user: 107 system: 10
	Finished io_load: elapsed: 286 user: 1 system: 17 loads: 7990

  2.5.66+sched-2.5.64-A4, UP, !preempt, ext3:
	Finished compiling kernel: elapsed: 910 user: 108 system: 12
	Finished io_load: elapsed: 912 user: 4 system: 75 loads: 35210

	Finished compiling kernel: elapsed: 940 user: 108 system: 12
	Finished io_load: elapsed: 940 user: 4 system: 78 loads: 36510
       
  The compilation took twice as long, and the streaming write made much
  more progress.

  Given that a monster `dd if=/dev/zero' takes only a few percent CPU
  anyway, it is quite odd that this is happening.

  Regardless of the fairness issue we want to maximise CPU utilisation in
  this workload.  The above figures show that the total CPU utilisation has
  fallen from

	(107+11+4+75) / 409 = 48% CPU
  and	(107+10+1+17) / 283 = 48% CPU

  down to

	(108+12+4+75) / 910 = 22% CPU
  and	(108+12+4+78) / 940 = 21% CPU

  which is quite bad.

  I cannot explain this - why so much idle time?  It seems to happen with
  ext2 as well, so it may not be the weird interaction between kjournald, the
  CPU scheduler and the IO scheduler which I initially suspected.  Poor me.

- On the X server problem: The patch pretty much fixes it up.  I have to
  work pretty hard to make the UI flip into sucky mode, and it is much less
  severe.  I'd say it is acceptable.

  Renicing X to -10 makes it significantly better.  Text under moved
  windows gets redrawn promptly.  But renicing X is not a very adequate
  solution in my experience - I've found that when the email client goes off
  to parse a 1000-message folder the scheduler can decide to penalise it, and
  the application freezes up for some time.

Overall, yep, good patch and we should run with it.  I need to work out what
on earth has happened to the IO load balancing.  We only got that working
properly a few days back.




* [patch] "interactivity changes", sched-2.5.64-A5
  2003-03-06 10:00         ` Andrew Morton
@ 2003-03-06 10:07           ` Ingo Molnar
  0 siblings, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 10:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Robert Love, linux-kernel, Linus Torvalds


On Thu, 6 Mar 2003, Andrew Morton wrote:

> Overall, yep, good patch and we should run with it. [...]

great - here is -A5, which is -A4 in a cleaned-up form:

 - make SMART_WAKE_CHILD and SYNC_WAKEUPS unconditional.
 - remove a mis-merge of a TASK_ZOMBIE assert.

(without the last_run rename the patch becomes even smaller, do a
':%s/last_run/sleep_timestamp/g' on it in vi.)

	Ingo

--- linux/include/linux/sched.h.orig	
+++ linux/include/linux/sched.h	
@@ -328,7 +328,7 @@ struct task_struct {
 	prio_array_t *array;
 
 	unsigned long sleep_avg;
-	unsigned long sleep_timestamp;
+	unsigned long last_run;
 
 	unsigned long policy;
 	unsigned long cpus_allowed;
--- linux/kernel/sched.c.orig	
+++ linux/kernel/sched.c	
@@ -54,20 +54,19 @@
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
- * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
- * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
+ * maximum timeslice is 200 msecs. Timeslices get refilled after
  * they expire.
  */
 #define MIN_TIMESLICE		( 10 * HZ / 1000)
-#define MAX_TIMESLICE		(300 * HZ / 1000)
-#define CHILD_PENALTY		95
+#define MAX_TIMESLICE		(200 * HZ / 1000)
+#define CHILD_PENALTY		50
 #define PARENT_PENALTY		100
 #define EXIT_WEIGHT		3
 #define PRIO_BONUS_RATIO	25
 #define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(2*HZ)
-#define STARVATION_LIMIT	(2*HZ)
-#define NODE_THRESHOLD          125
+#define MAX_SLEEP_AVG		(10*HZ)
+#define STARVATION_LIMIT	(10*HZ)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -323,16 +322,21 @@ static inline int effective_prio(task_t 
  * Also update all the scheduling statistics stuff. (sleep average
  * calculation, priority modifiers, etc.)
  */
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task(p, rq->active);
+	rq->nr_running++;
+}
+
 static inline void activate_task(task_t *p, runqueue_t *rq)
 {
-	unsigned long sleep_time = jiffies - p->sleep_timestamp;
-	prio_array_t *array = rq->active;
+	unsigned long sleep_time = jiffies - p->last_run;
 
 	if (!rt_task(p) && sleep_time) {
 		/*
 		 * This code gives a bonus to interactive tasks. We update
 		 * an 'average sleep time' value here, based on
-		 * sleep_timestamp. The more time a task spends sleeping,
+		 * ->last_run. The more time a task spends sleeping,
 		 * the higher the average gets - and the higher the priority
 		 * boost gets as well.
 		 */
@@ -341,8 +345,7 @@ static inline void activate_task(task_t 
 			p->sleep_avg = MAX_SLEEP_AVG;
 		p->prio = effective_prio(p);
 	}
-	enqueue_task(p, array);
-	nr_running_inc(rq);
+	__activate_task(p, rq);
 }
 
 /*
@@ -479,10 +482,13 @@ repeat_lock_task:
 			}
 			if (old_state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible--;
-			activate_task(p, rq);
-	
-			if (p->prio < rq->curr->prio)
-				resched_task(rq->curr);
+			if (sync)
+				__activate_task(p, rq);
+			else {
+				activate_task(p, rq);
+				if (p->prio < rq->curr->prio)
+					resched_task(rq->curr);
+			}
 			success = 1;
 		}
 		p->state = TASK_RUNNING;
@@ -525,8 +531,16 @@ void wake_up_forked_process(task_t * p)
 		p->prio = effective_prio(p);
 	}
 	set_task_cpu(p, smp_processor_id());
-	activate_task(p, rq);
 
+	if (unlikely(!current->array))
+		__activate_task(p, rq);
+	else {
+		p->prio = current->prio;
+		list_add_tail(&p->run_list, &current->run_list);
+		p->array = current->array;
+		p->array->nr_active++;
+		rq->nr_running++;
+	}
 	task_rq_unlock(rq, &flags);
 }
 
@@ -953,6 +967,11 @@ static inline void pull_task(runqueue_t 
 	 */
 	if (p->prio < this_rq->curr->prio)
 		set_need_resched();
+	else {
+		if (p->prio == this_rq->curr->prio &&
+				p->time_slice > this_rq->curr->time_slice)
+			set_need_resched();
+	}
 }
 
 /*
@@ -1016,7 +1035,7 @@ skip_queue:
 	 */
 
 #define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
-	((jiffies - (p)->sleep_timestamp > cache_decay_ticks) &&	\
+	((jiffies - (p)->last_run > cache_decay_ticks) &&	\
 		!task_running(rq, p) &&					\
 			((p)->cpus_allowed & (1UL << (this_cpu))))
 
@@ -1076,9 +1095,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
  * increasing number of running tasks:
  */
 #define EXPIRED_STARVING(rq) \
-		((rq)->expired_timestamp && \
+		(STARVATION_LIMIT && ((rq)->expired_timestamp && \
 		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1))
+			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
 
 /*
  * This function gets called by the timer code, with HZ frequency.
@@ -1201,7 +1220,7 @@ need_resched:
 	rq = this_rq();
 
 	release_kernel_lock(prev);
-	prev->sleep_timestamp = jiffies;
+	prev->last_run = jiffies;
 	spin_lock_irq(&rq->lock);
 
 	/*
@@ -1701,7 +1720,7 @@ static int setscheduler(pid_t pid, int p
 	else
 		p->prio = p->static_prio;
 	if (array)
-		activate_task(p, task_rq(p));
+		__activate_task(p, task_rq(p));
 
 out_unlock:
 	task_rq_unlock(rq, &flags);
--- linux/kernel/fork.c.orig	
+++ linux/kernel/fork.c	
@@ -916,7 +916,7 @@ static struct task_struct *copy_process(
 	 */
 	p->first_time_slice = 1;
 	current->time_slice >>= 1;
-	p->sleep_timestamp = jiffies;
+	p->last_run = jiffies;
 	if (!current->time_slice) {
 		/*
 	 	 * This case is rare, it happens when the parent has only


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-02-28  9:50 [patch] "HT scheduler", sched-2.5.63-B3 Ingo Molnar
  2003-02-28 21:12 ` Andrew Morton
@ 2003-03-06 10:16 ` Chris Wedgwood
  2003-03-06 10:18   ` Ingo Molnar
  1 sibling, 1 reply; 100+ messages in thread
From: Chris Wedgwood @ 2003-03-06 10:16 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Linus Torvalds, Andrew Morton

On Fri, Feb 28, 2003 at 10:50:03AM +0100, Ingo Molnar wrote:

> +#elif CONFIG_NR_SIBLINGS_4
> +# define CONFIG_NR_SIBLINGS 4

There are HT chips which have 4 siblings?


  --cw

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 10:16 ` [patch] "HT scheduler", sched-2.5.63-B3 Chris Wedgwood
@ 2003-03-06 10:18   ` Ingo Molnar
  0 siblings, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 10:18 UTC (permalink / raw)
  To: Chris Wedgwood; +Cc: linux-kernel, Linus Torvalds, Andrew Morton


On Thu, 6 Mar 2003, Chris Wedgwood wrote:

> > +#elif CONFIG_NR_SIBLINGS_4
> > +# define CONFIG_NR_SIBLINGS 4
> 
> There are HT chips which have 4 siblings?

not that i'm aware of - this was just there because i'm sure this thing
won't stop at 2 logical CPUs, and i wanted to add a Kconfig structure that
deals with that too. We can comment out the "CONFIG_NR_SIBLINGS 4" line
in the Kconfig, for now.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06  7:47           ` Ingo Molnar
@ 2003-03-06 15:07             ` Linus Torvalds
  2003-03-06 16:54               ` Ingo Molnar
  0 siblings, 1 reply; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 15:07 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, rml, linux-kernel


On Thu, 6 Mar 2003, Ingo Molnar wrote:
> 
> please do another thing as well: in addition to applying the -A4 patch,
> also renice X to -10.

NO.

This is a BUG IN THE SCHEDULER!

We should notice automatically that X is "interactive", thanks to the 
fact that interactive tasks wait for it. That's the whole point of my very 
simple patch..

So don't argue about things that are obviously broken. Renicing X is not 
the answer, and in fact there have been multiple reports that it makes 
XMMS skip sound because it just _hides_ the problem and moves it 
elsewhere.

Please test my simple patch that should at least attack the real issue 
instead of waffling and hiding it.

			Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06  7:45         ` Andrew Morton
  2003-03-06  7:47           ` Ingo Molnar
@ 2003-03-06 15:37           ` Linus Torvalds
  2003-03-06 16:01             ` Jeff Garzik
  1 sibling, 1 reply; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 15:37 UTC (permalink / raw)
  To: Andrew Morton; +Cc: rml, mingo, linux-kernel


On Wed, 5 Mar 2003, Andrew Morton wrote:
> 
> But it is a heuristic, and it will inevitably make mistakes.  The problem
> which I am observing is that the cost of those mistakes is very high.

The problem I used to see quite clearly is that X gets turned into a
non-interactive task, because with a complex X desktop X can easily use
5-15% of CPU even when everything is perfectly normal. That's simply
because it ends up having to _do_ a lot: anti-aliased text, shifting
logos, xdaliclock in the corner, blinking cursors, annoying moving
advertising on web-sites, small animations in the task-bar etc.

And the current heuristic SUCKS for this. And it's quite fundamental: the
heuristic is _wrong_. The heuristic breaks down _completely_ when there
are ten CPU hogs that peg the CPU, and now the one process that wants 10% 
of the CPU suddenly doesn't sleep any more, since it ends up needing all 
the timeslices it could get.

At this point the current heuristic totally breaks down, and doesn't have 
any way out. The _only_ part of Ingo's patch that may help is the fact 
that he made the maximum timeslice be smaller, but realize that since X 
is now considered to be a background task, that doesn't help very much 
_either_, since it's going to be scheduled round-robin with those other 
ten background tasks, and changing the max timeslice from 300 to 200ms 
only means that the pauses will be "only" two seconds instead of being 
three seconds.

See? 
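To put numbers on that (a back-of-the-envelope sketch, assuming ten
CPU hogs plus X all round-robining at the same priority with full
timeslices):

	pause between X's turns ~= nr_hogs * MAX_TIMESLICE
	                         = 10 * 300 msec = 3 seconds  (old maximum)
	                         = 10 * 200 msec = 2 seconds  (new maximum)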

I told Ingo about it half a year ago, and he ignored it then, saying it 
can't be solved. Re-nicing X doesn't really help: it only hides the 
problem because it means that X won't be scheduled round-robin with the 
background load any more, but because it makes X more important than user 
processes it then has other fundamental flaws.

So with "nice -10", when X gets busy, it gets _too_ busy and locks out
other processes - it fixes the interactivity, but it causes latency
problems because X is still considered a _background_ job, so it gets a 
200ms timeslice at high priority and guess what? That kind of sucks for 
regular programs that want low latency.

> Let me redescribe the problem:
> 
> - Dual 850MHz PIII, 900M of RAM.
> - Start a `make -j3 vmlinux', everything in pagecache
> - Start using X applications.  Moving a window about is the usual trigger.

Yeah. Guess what? Moving a window makes X suddenly use a lot more CPU (try 
turning off acceleration if you _really_ want to see it), so you see the 
fundamental problem with less load. Add some load (and some X eye-candy) 
and you will see it much more easily. 

And read the above description of what happens, and realize that it is 
_fundamental_.

And guess what my 5-liner patch tries to address? The one that you didn't
bother testing.. The one that actually tries to tackle the problem head-on 
instead of hiding it. 

> Interestingly, this does not happen if the background load is a bunch of
> busywaits.  It seems to need the fork&exit load of a compilation to trigger.

It happened with busy-waits too last time I checked (which is about half a
year ago); it's just that a normal busy-wait tends to _do_ a lot less than
a fork()/wait()/busy-child does. When the load on the CPU becomes
"constrained" (ie when the CPU idle time goes down to zero), a
fork/wait/busy-child suddenly isn't a load unit of "1" any more, since the
fork/wait/exit itself actually takes time.

Which means that you may need 6 busy-wait tasks to get the same kind of 
real load that you get with just 3 fork/wait/exit things.

> Robert was able to reproduce this, and noticed that the scheduler had niced
> the X server down as far as it would go.

Right. It's totally starved for CPU, so it never sleeps, so it gets turned 
into non-interactive, so it gets round-robined-with-the-background-tasks.

> I'm a pretty pathological case, because I was driving two 1600x1200x24
> displays with a crufty old AGP voodoo3 and a $2 PCI nvidia.  Am now using a
> presumably less crufty radeon and the problem persists.

I saw the problem on this 4-CPU machine with my old graphics card, which 
was something perfectly modern (ATI Radeon 7000). I can still trigger it 
even with my current Radeon 8500, but I need to basically make the build 
be "make -j15" or something like that to trigger it, at which point I'm 
not sure it's relevant any more..

And yes, I think my five-liner makes a difference to me, but since I need
to push the machine so far "off" a reasonable load to trigger it, I don't
think my machine is a good test.

A good test is a fairly slow graphics card, with some stuff happening all 
the time on the screen (ie xdaliclock or a noisy web-site). And then 
seeing what moving a window does when doing a compile.

The window moving itself is not helped by my patch, but my patch _should_
mean that the _users_ of X (which are less starved for CPU than X is,
since they don't actually have to do the heavy lifting) will be giving X
"life points" as X wakes them up.

And that's the whole point of the patch. Any synchronous wake-up that 
wakes up an interactive task is a _good_ thing. Not just for the 
interactive task itself, but also for the thing that causes it to wake up. 
It makes X itself more interactive - it still goes into "background mode" 
for short stretches, but it seems to need more pushing, and it seems 
to come out of its funk faster.

Note the "seems". This is why I'd like others to test it - since it's very 
much a perception issue.

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 15:37           ` Linus Torvalds
@ 2003-03-06 16:01             ` Jeff Garzik
  2003-03-06 16:51               ` Linus Torvalds
                                 ` (3 more replies)
  0 siblings, 4 replies; 100+ messages in thread
From: Jeff Garzik @ 2003-03-06 16:01 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, rml, mingo, linux-kernel

Pardon the suggestion of a dumb heuristic, feel free to ignore me: 
would it work to run-first processes that have modified their iopl() 
level?  i.e. "if you access hardware directly, we'll treat you specially 
in the scheduler"?

An alternative is to encourage distros to set some sort of flag for 
processes like the X server, when it is run.  This sounds suspiciously 
like the existing "renice X server" hack, but it could be something like 
changing the X server from SCHED_OTHER to SCHED_HW_ACCESS instead.

Just throwing out some things, because I care about apps which access 
hardware from userspace :)


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 16:01             ` Jeff Garzik
@ 2003-03-06 16:51               ` Linus Torvalds
  2003-03-06 17:11                 ` Ingo Molnar
  2003-03-06 17:17               ` Ingo Molnar
                                 ` (2 subsequent siblings)
  3 siblings, 1 reply; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 16:51 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Andrew Morton, rml, mingo, linux-kernel


On Thu, 6 Mar 2003, Jeff Garzik wrote:
>
> Pardon the suggestion of a dumb heuristic, feel free to ignore me: 
> would it work to run-first processes that have modified their iopl() 
> level?  i.e. "if you access hardware directly, we'll treat you specially 
> in the scheduler"?

See, the thing is, I don't actually believe that X is _special_ in this 
regard.

If we have an interactive process that has been waiting nicely for
somebody else (ie it is "interactive" in that it tends to run quickly and 
then wait again, without blowing away the cache), then I think that 
"somebody else" should be automatically getting some of the extra 
attention - just for latency reasons.

It's not an X special-case issue. It's a completely _generic_ "low-latency
service to interactive tasks" issue. 

Put another way:

Imagine other servers running on the same machine. Imagine, if you will, a
logging service (or IM server, or whatever) that gets overwhelmed when it
has thousands of interactive clients on an overloaded machine. We know the
clients care about latency (clearly true since they sleep a lot - they
can't be about bandwidth: the logic only triggers if the clients have lots
of interactivity bonus points), so if we notice that this server is waking
up a lot of these latency-critical clients, doesn't it make sense to treat
the _server_ as latency-critical too?

See? My small patch hits _exactly_ that case. It misses a lot of 
opportunities (it really only works with synchronous wakeups, so in 
practice it probably ends up working mainly for things like UNIX domain 
sockets and regular pipes), but that's an implementation issue, not a 
"special case" thing.

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 15:07             ` Linus Torvalds
@ 2003-03-06 16:54               ` Ingo Molnar
  2003-03-06 17:03                 ` Linus Torvalds
  0 siblings, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 16:54 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, rml, linux-kernel


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> We should notice automatically that X is "interactive", thanks to the
> fact that interactive tasks wait for it. That's the whole point of my
> very simple patch..

the whole compilation (gcc tasks) will be rated 'interactive' as well,
because an 'interactive' make process and/or shell process is waiting on
it. I tried something like this before, and it didn't work.

> So don't argue about things that are obviously broken. Renicing X is not
> the answer, and in fact there have been multiple reports that it makes
> XMMS skip sound because it just _hides_ the problem and moves it
> elsewhere.

the xine case has been analyzed quite well (it is analogous to the XMMS
problem): it's not X that makes XMMS skip, it's the other CPU-bound tasks
on the desktop that cause it to skip occasionally. Increasing the
priority of xine to just -1 or -2 solves the skipping problem.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 16:54               ` Ingo Molnar
@ 2003-03-06 17:03                 ` Linus Torvalds
  2003-03-06 17:24                   ` Ingo Molnar
  2003-03-07  3:19                   ` Horst von Brand
  0 siblings, 2 replies; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 17:03 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, rml, linux-kernel


On Thu, 6 Mar 2003, Ingo Molnar wrote:
> 
> the whole compilation (gcc tasks) will be rated 'interactive' as well,
> because an 'interactive' make process and/or shell process is waiting on
> it.

No. The make that is waiting for it will be woken up _once_ - when the 
thing dies. Marking it interactive at that point is absolutely fine.

> I tried something like this before, and it didn't work.

You can't have tried it very hard.

In fact, you haven't apparently tried it hard enough to even bother giving
my patch a look, much less apply it and try it out.

> the xine case has been analyzed quite well (it is analogous to the XMMS
> problem): it's not X that makes XMMS skip, it's the other CPU-bound tasks
> on the desktop that cause it to skip occasionally. Increasing the
> priority of xine to just -1 or -2 solves the skipping problem.

Are you _crazy_?

Normal users can't "just increase the priority". You have to be root to do 
so. And I already told you why it's only hiding the problem.

In short, you're taking a very NT'ish approach - make certain programs run 
in the "foreground", and give them a static boost because they are 
magically more important. And you're ignoring the fact that the heuristics 
we have now are clearly fundamentally broken in certain circumstances.

I've pointed out the circumstances, I've told you why it happens and when 
it happens, and you've not actually even answered that part. You've only 
gone "it's not a problem, you can fix it up by renicing every time you 
find a problem".

Get your head out of the sand, and stop this "nice" blathering.

			Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 16:51               ` Linus Torvalds
@ 2003-03-06 17:11                 ` Ingo Molnar
  2003-03-06 17:52                   ` jvlists
  2003-03-06 18:49                   ` Alan Cox
  0 siblings, 2 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 17:11 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jeff Garzik, Andrew Morton, rml, linux-kernel


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> See, the thing is, I don't actually believe that X is _special_ in this
> regard.

X is special. Especially in Andrew's wild-window-dragging experiment X is
a pure CPU-bound task that just eats CPU cycles no end. The _only_ thing
that makes it special is that there's a human looking at the output of the
X client. This is information that is simply not available to the kernel.

Windows solves this problem by giving the application that owns 'the
foreground window' a special boost - they have the window manager in the
kernel after all. I do not like this approach; one could very well depend
on the latency of an application that is not directly connected to the
foreground task.

the only thing that is more or less a good sign of humans being involved
is delay. It's not always the case, but _most_ of the time, when a human
is involved, there's lots of sleep time. The 2.4 scheduler kind of
rewarded tasks based on this; the 2.5 scheduler does it more
prominently.

now which are those tasks that are CPU-bound but still have a human eye
looking at them most of the time? The two main ones are games and video
playback. Games tend to be CPU hogs for natural reasons, and video
playback is something that is CPU-intensive, but non-smooth output is still
easily noticed by humans.

note that there isn't all that much of a problem with anything but these
two categories.

and for those two categories, we have to give up and just help the kernel
a bit. We have to tell the kernel that there's a human looking at the
output of these applications. One plan that i think might work would be to
enable users to raise the priority of applications by 2 or 3 priority levels,
which the kernel would allow. If the interface only allows such priority
setting for the current process, and if it's not inherited across fork(),
then at least initially, it would be harder to abuse this mechanism for a
'boost all of my processes' tool in .bashrc.
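concretely, such a call would behave like a bounded version of today's
root-only renice - a sketch (the actual interface is an open question):

	#include <sys/resource.h>

	/* today this needs root; the idea is to let the kernel allow a
	 * bounded, non-inheritable version of it for ordinary users. */
	setpriority(PRIO_PROCESS, 0, -2);	/* raise own priority 2 levels */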

another experiment (a really bad hack) was to do the boost in iopl() -
recognizing the fact that if an application uses iopl(), it's privileged
and it's special. This is of course unacceptable, but it isolated X and a
couple of other 'important' apps. It doesn't solve the 'xine problem'.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06  3:20       ` Linus Torvalds
  2003-03-06  7:45         ` Andrew Morton
@ 2003-03-06 17:13         ` Ingo Molnar
  2003-03-06 17:30           ` Linus Torvalds
  2003-03-06 20:42         ` Andrew Morton
  2003-03-06 22:03         ` Martin Waitz
  3 siblings, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 17:13 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Robert Love, linux-kernel


On Wed, 5 Mar 2003, Linus Torvalds wrote:

>  		p->sleep_avg += sleep_time;
> -		if (p->sleep_avg > MAX_SLEEP_AVG)
> +		if (p->sleep_avg > MAX_SLEEP_AVG) {
> +			int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
>  			p->sleep_avg = MAX_SLEEP_AVG;
> +			if (ticks > MAX_SLEEP_AVG)
> +				ticks = MAX_SLEEP_AVG;
> +			if (!in_interrupt())
> +				current->sleep_avg = ticks;
> +		}
> +			
>  		p->prio = effective_prio(p);

interesting approach, but it has one problem which so far i have tried to
avoid: it makes it too easy for a process to gain a bonus. Until now
pretty much the only way to get an interactive bonus was to actually
sleep. Another rule was that interactivity is kept constant, ie. the only
'source' of interactivity was passing time, not some artificial activity
performed by any process. Even the timeslice passing across fork()/exit()  
is controlled carefully to maintain the total sum of timeslices.

With the above code it's enough to keep a single helper thread around
which blocks on a pipe, to create an almost endless source of
interactivity bonus. And it does not even have to be 'deliberate' - there's
tons of code that just waits for a CPU-bound task to finish (eg. 'make'
waiting for gcc to finish), and such processes/threads have a maximum
boost already, in which case the interactivity boost is not justified.
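A minimal userspace sketch of that helper pattern (purely illustrative -
the loop count is arbitrary, and whether this actually farms a bonus
under the patch is exactly what is being debated here):

	#include <unistd.h>

	static void burn_cpu(void)
	{
		volatile unsigned long i;

		for (i = 0; i < 100000000UL; i++)
			;			/* the CPU-bound work */
	}

	int main(void)
	{
		int fd[2];
		char c = 0;

		pipe(fd);
		if (fork() == 0)
			for (;;)		/* helper: almost always blocked, */
				read(fd[0], &c, 1); /* so its sleep_avg pins at MAX */
		for (;;) {
			burn_cpu();		/* the real CPU hog */
			write(fd[1], &c, 1);	/* each synchronous wakeup hands the
						   helper's overflow bonus back */
		}
	}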

Anyway, Andrew, could you give Linus' patch a go as well?

	Ingo



^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 16:01             ` Jeff Garzik
  2003-03-06 16:51               ` Linus Torvalds
@ 2003-03-06 17:17               ` Ingo Molnar
  2003-03-06 18:06                 ` Dimitrie O. Paun
  2003-03-06 18:25               ` Alan Cox
  2003-03-06 22:00               ` Bill Davidsen
  3 siblings, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 17:17 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Linus Torvalds, Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Jeff Garzik wrote:

> Pardon the suggestion of a dumb heuristic, feel free to ignore me:  
> would it work to run-first processes that have modified their iopl()  
> level?  i.e. "if you access hardware directly, we'll treat you specially
> in the scheduler"?

we did this in the Red Hat kernel, just as a quick hack to boost X without
having to modify X (bleh). While it worked very well in practice, it's a
really bad idea from an interface design POV, because it singles out X
mostly based on a random characteristic of X, but leaves out some other
tasks that are 'suffering' from not being recognized as interactive, but
still having a user watching them (and expecting keypresses to react
fast): games, xine, XMMS, etc.

> An alternative is to encourage distros to set some sort of flag for
> processes like the X server, when it is run.  This sounds suspiciously
> like the existing "renice X server" hack, but it could be something like
> changing the X server from SCHED_OTHER to SCHED_HW_ACCESS instead.

yes, an ELF flag might work, or my suggestion to allow applications to
increase their priority (up to a certain degree).

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:03                 ` Linus Torvalds
@ 2003-03-06 17:24                   ` Ingo Molnar
  2003-03-06 17:42                     ` Linus Torvalds
  2003-03-07  3:19                   ` Horst von Brand
  1 sibling, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 17:24 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Robert Love, linux-kernel


another thing. What really happens in the 'recompile job' thing is not
that X gets non-interactive. Most of the time it _is_ interactive. The
problem is that the gcc processes and make processes themselves, which do
use up a considerable portion of CPU time (generate a load of 4-5 with
make -j2), get rated as interactive as well. Shells, gnome-terminal, and X
itself will often be preempted by these gcc and make processes. Making X
more interactive does not help in this case.

so my patchset attempts to tune things in the direction of making the
scheduler recognize the compile job as truly CPU-bound. It was a bad
interaction of child-inherits-priority plus child-wakeup priorities.

My suggestion to increase X's priority was just an unrelated suggestion -
if Andrew is dragging windows around then X does get 100% CPU-bound, and
the only good way to get out of that situation is to tell the system that
it's rightfully monopolizing the CPU, and that the user has no problem
with that.

I also had feedback from people who reniced X in the _other_ direction,
because they did not want their rare administration work done under X
to impact the generic performance of the server.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:13         ` Ingo Molnar
@ 2003-03-06 17:30           ` Linus Torvalds
  2003-03-06 17:33             ` Ingo Molnar
  2003-03-06 17:50             ` Ingo Molnar
  0 siblings, 2 replies; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 17:30 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Ingo Molnar wrote:
> 
> interesting approach,

Ahh.. After five emails on the subject, you finally took a look at the 
patch? Good.

>		 but it has one problem which so far i have tried to
> avoid: it makes it too easy for a process to gain a bonus.

Yes and no. It actually follows your rules:

>							 Until now
> pretty much the only way to get an interactive bonus was to actually
> sleep.

And that is still true. _Somebody_ has to sleep, and the bonus is still 
entirely based on how long that somebody slept.


>  Another rule was that interactivity is kept constant, ie. the only
> 'source' of interactivity was passing time, not some artificial activity
> performed by any process.

And this is also still true. Again, the only _source_ of interactivity is 
the fact that the woken-up process slept for a noticeable time. There is 
no "artificial source" there.

Do the math: the patch uses the exact same source as the current algorithm 
does - it only changes the _cutoff_.

The current algorithm just throws away "interactivity bonus" points when 
the sleeper has gotten to some maximum value. 

The new one doesn't throw them away when it hits the maximum - it gives 
them to the person who woke it up.

>		 Even the timeslice passing across fork()/exit()  
> is controlled carefully to maintain the total sum of timeslices.

But my patch DID NOT CHANGE THE TOTAL SUM! Read the patch again. 

See:

+                       int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
			p->sleep_avg = MAX_SLEEP_AVG;

Note how this breaks down:

	"p->sleep_avg - MAX_SLEEP_AVG"

this is the part we're "throwing away", because the sleeper had already 
accumulated enough interactivity points.

	" + current->sleep_avg"

this is the part that the waker _already_had_.

	if (ticks > MAX_SLEEP_AVG)
		ticks = MAX_SLEEP_AVG;

This just says that the waker, too, will be limited by the "maximum 
interactivity" thing.

	if (!in_interrupt())
		current->sleep_avg = ticks;

and this part says "we only give the potential boost to a _synchronous_ 
waker" (ie a network packet that comes in and wakes somebody up in 
bottom half processing will _not_ cause an interactivity boost to a random 
process).

See? The patch maintains the rule that "interactivity" only gets created 
by sleeping. The only thing it really does is to change how we 
_distribute_ the interactivity we get. It gives some of the interactivity 
to the waker.
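Put together, the modified overflow branch of activate_task() reads
roughly like this once the five-liner is applied (reconstructed from the
quoted hunks, so treat the surrounding context as a sketch):

	p->sleep_avg += sleep_time;
	if (p->sleep_avg > MAX_SLEEP_AVG) {
		int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;

		p->sleep_avg = MAX_SLEEP_AVG;	/* sleeper saturates, as before */
		if (ticks > MAX_SLEEP_AVG)
			ticks = MAX_SLEEP_AVG;	/* waker is capped the same way */
		if (!in_interrupt())
			current->sleep_avg = ticks; /* synchronous wakers only */
	}
	p->prio = effective_prio(p);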

> With the above code it's enough to keep a single helper thread around
> which blocks on a pipe, to create an almost endless source of
> interactivity bonus.

No. You do not get to "create" interactivity. All the CPU cycles you win 
from interactivity you must have lost by having one of your processes wait 
for it.

Sure - it changes the balance of who gets to be considered interactive. 
But that's the whole _point_ of it.

>		 And it does not even have to be 'deliberate' - there's
> tons of code that just waits for a CPU-bound task to finish (eg. 'make'
> waiting for gcc to finish), and such processes/threads have a maximum
> boost already, in which case the interactivity boost is not justified.

And if they already have the maximum boost, they won't get any more. Look 
at the five lines of actual code added.

Also, note how your "waiting for gcc to finish" is still not true. Sure, 
that "make" will be considered interactive, but it's not going to help the 
waker (gcc) any, since it will be interactive waiting for gcc to _die_.

I can come up with better (ie "real") examples of where this changes
behaviour:

 - "cc1" (slow) writes to a pipe to "as" (fast)

   "as" is fast, so as ends up waiting most of the time. Thus it ends up 
   being marked interactive.

   When cc1 wakes up as, assuming as has been marked "maximally 
   interactive", cc1 will get an interactivity boost too.

Is this "wrong"?  Maybe, if you see it from a pure "interactivity"  
standpoint. But another way of seeing it is to say that it automatically
tries to _balance_ this kind of pipeline - since "cc1" is much slower and
actually _wants_ the interactivity that "as" is clearly not ever going to
actually get any real advantage from, it is actually likely to be
perfectly fine to give "cc1" a priority boost.

Also, note that if cc1 is _really_ slow (which it is) and really doesn't 
wake up "as" very often, you won't see this effect, since the 
"anti-interactive" code will balance things out.

In short, it's all about balancing. There are things that are
"pro-interactive" (real sleeping), and there are things that are
"anti-interactive" (using up your timeslice). The question is how you 
spread out the bonus points (or the negative points).

The current scheduler doesn't spread them out at all. I think that's a
bug, since pipelines of multiple processes are actually perfectly common,
and X is only one example of this.

And my patch may spread it out _too_ much. Maybe we shouldn't give _all_ 
of the left-over interactivity to the waker. Maybe we should give just 
half of it away..

See?

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:30           ` Linus Torvalds
@ 2003-03-06 17:33             ` Ingo Molnar
  2003-03-06 17:50             ` Ingo Molnar
  1 sibling, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 17:33 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> > interesting approach,
> 
> Ahh.. After five emails on the subject, you finally took a look at the 
> patch? Good.

no, it was the first email i wrote; it was in my postponed queue.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:25               ` Alan Cox
@ 2003-03-06 17:35                 ` Linus Torvalds
  2003-03-06 22:30                   ` Eric Northup
  0 siblings, 1 reply; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 17:35 UTC (permalink / raw)
  To: Alan Cox
  Cc: Jeff Garzik, Andrew Morton, rml, mingo, Linux Kernel Mailing List


On 6 Mar 2003, Alan Cox wrote:
> 
> Not all X servers do that. X is not special in any way. It's just a
> daemon. It has the same timing properties as many other daemons doing
> time-critical operations for many clients.

I really think that this is the most important observation about the 
behaviour. X really isn't doing anything that other daemons don't do. It 
so happens that what X is doing is a lot more complex than most daemons 
do, so it's fairly easy to trigger X into using a fair amount of CPU time, 
but that only makes the breakdown case so much easier to see.

In all honesty, though, it's also obviously the case that the breakdown
case with X is _really_ visible in a very concrete way,
since X is the _only_ thing really "visible" unless you have Direct
Rendering clients. So even if the problem were to happen with another
daemon, it probably wouldn't stand out quite as badly. So in that sense X 
ends up having some special characteristics - "visibility".

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:24                   ` Ingo Molnar
@ 2003-03-06 17:42                     ` Linus Torvalds
  2003-03-06 17:54                       ` Ingo Molnar
                                         ` (3 more replies)
  0 siblings, 4 replies; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 17:42 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Ingo Molnar wrote:
> 
> another thing. What really happens in the 'recompile job' thing is not
> that X gets non-interactive. Most of the time it _is_ interactive.

This is not what I've seen.

When X is interactive, and is competing against other interactive jobs, 
you don't get multi-second slowdowns. X still gets 10% of the CPU, it just 
gets it in smaller chunks.

The multi-second "freezes" are the thing that bothered me, and those were
definitely due to the fact that X was competing as a _non_interactive
member against other non-interactive members, causing it to still get 10%
of the CPU, but only every few seconds. So you'd get a very bursty
behaviour with very visible pauses.

It's ok to slow X down. Nobody in their right mind would expect X to track 
the mouse 100% when scrolling and the machine load is 15+. I certainly 
don't.

But having X just _pause_ for several seconds gets to me. I can't easily 
make it happen any more thanks to having ridiculous hardware, and I think 
X itself has gotten better thanks to more optimizations in both clients 
and X itself (ie if the CPU requirements of X go down from 5% to 3%, it 
gets a _lot_ harder to trigger).

But it was definitely there. 3-5 second _pauses_. Not slowdowns.

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:30           ` Linus Torvalds
  2003-03-06 17:33             ` Ingo Molnar
@ 2003-03-06 17:50             ` Ingo Molnar
  2003-03-06 18:00               ` Linus Torvalds
  1 sibling, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 17:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> this is the part we're "throwing away", because the sleeper had already
> accumulated enough interactivity points.
> 
> 	" + current->sleep_avg"
> 
> this is the part that the waker _already_had_.
> 
> 	if (ticks > MAX_SLEEP_AVG)
> 		ticks = MAX_SLEEP_AVG;
> 
> This just says that the waker, too, will be limited by the "maximum 
> interactivity" thing.

ok. I misread this part. It's actually a 'super boost' for interactive
tasks and tasks related to them.

> 	if (!in_interrupt())
> 		current->sleep_avg = ticks;

this (making a difference in characteristics based on in_interrupt()) was
something i tried before, but it got killed due to generic wackiness :-)  
Also, it's not really justified to isolate a process just because it got
woken up by a hw event.

> and this part says "we only give the potential boost to a _synchronous_
> waker" (ie a network packet that comes in and wakes somebody up in
> bottom half processing will _not_ cause an interactivity boost to a
> random process).

how about a keyboard interrupt?

> See? The patch maintains the rule that "interactivity" only gets created
> by sleeping. The only thing it really does is to change how we
> _distribute_ the interactivity we get. It gives some of the
> interactivity to the waker.

yes - i misunderstood this property of it - and this removed most of my
objections.

> Also, note how your "waiting for gcc to finish" is still not true. Sure,
> that "make" will be considered interactive, but it's not going to help
> the waker (gcc) any, since it will be interactive waiting for gcc to
> _die_.

there's also another phenomenon in the 'make -j5 hell': gas getting
boosted due to its waiting on the gcc pipeline. Now gcc will be 'back
boosted'. But we might be lucky and get away with it - testing will show.

>  - "cc1" (slow) writes to a pipe to "as" (fast)
> 
>    "as" is fast, so as ends up waiting most of the time. Thus it ends up 
>    being marked interactive.
> 
>    When cc1 wakes up as, assuming as has been marked "maximally 
>    interactive", cc1 will get an interactivity boost too.
> 
> Is this "wrong"?  Maybe, if you see it from a pure "interactivity"  
> standpoint. But another way of seeing it is to say that it automatically
> tries to _balance_ this kind of pipeline - since "cc1" is much slower
> and actually _wants_ the interactivity that "as" is clearly not ever
> going to actually get any real advantage from, it is actually likely to
> be perfectly fine to give "cc1" a priority boost.

okay.

> In short, it's all about balancing. There are things that are
> "pro-interactive" (real sleeping), and there are things that are
> "anti-interactive" (using up your timeslice). The question is how you
> spread out the bonus points (or the negative points).

yes.

> The current scheduler doesn't spread them out at all. I think that's a
> bug, since pipelines of multiple processes are actually perfectly
> common, and X is only one example of this.

i have tried multiple schemes before to spread out interactivity, none
worked so far - but i have not tried any 'back-boosting' towards a CPU-hog
before, so it's an interesting experiment. If you look at the
child-timeslice thing, that is a common vector for interactivity to spread.

> And my patch may spread it out _too_ much. Maybe we shouldn't give _all_
> of the left-over interactivity to the waker. Maybe we should give just
> half of it away..

yes, not spreading it out could also make it possible to give it back via
multiple wakeup links; interactivity will 'diffuse' along wakeups.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:11                 ` Ingo Molnar
@ 2003-03-06 17:52                   ` jvlists
  2003-03-06 18:27                     ` Ingo Molnar
  2003-03-06 23:18                     ` Robert Love
  2003-03-06 18:49                   ` Alan Cox
  1 sibling, 2 replies; 100+ messages in thread
From: jvlists @ 2003-03-06 17:52 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linus Torvalds, Jeff Garzik, Andrew Morton, rml, linux-kernel


mingo@elte.hu (Ingo Molnar) writes:

> X is special. Especially in Andrew's wild-window-dragging experiment X is
> a pure CPU-bound task that just eats CPU cycles no end. There _only_ thing
> that makes it special is that there's a human looking at the output of the
> X client. This is information that is simply not available to the kernel.

Isn't it special because the window manager and other interactive
tasks are blocked waiting for it? Presumably during a wild-window drag
there will be loads of block-wakeup sequences between X and the window
manager. If every such event gave a little boost to the X server, that
would mean the X server would get loads of interactivity brownie
points.

This may be a stupid idea (feel free to ignore, it may even be what
the kernel already does), but if you had a separate input server and
display server instead of one X server you could follow the trail all
the way from the real user.

/dev/input/mice is special because we know it interfaces to the user.
The X input server gains points for waiting on /dev/input/mice;
an interactive X program gains points for waiting on the X input server;
the X output server is special because it blocks waiting on the
interactive X program (and vice versa!).

The hard part would be detecting interactivity for stuff waiting on IP
sockets.

If this information was available to user programs then the X display
server could even prioritize rendering a new character in bash over
displaying the fishbowl animation running in the background.

Jan

P.S. IMVHO the xine problem is completely different, as it has nothing to
do with interactivity but with the fact that it is soft
real-time; i.e. you need to distinguish xine from, say, a gimp filter or
a 3D renderer with incremental live updates of the scene it is
creating.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:42                     ` Linus Torvalds
@ 2003-03-06 17:54                       ` Ingo Molnar
  2003-03-06 18:02                         ` Linus Torvalds
  2003-03-06 17:55                       ` John Levon
                                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 17:54 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Robert Love, linux-kernel


okay, here's a patch that merges Linus' "priority boost backmerging" and
my CPU-hog-detection-improvement patches. Andrew, just in case you were
about to test various scheduler-patch combinations on your box, this might
be something worth trying.

	Ingo

--- linux/include/linux/sched.h.orig	
+++ linux/include/linux/sched.h	
@@ -328,7 +328,7 @@ struct task_struct {
 	prio_array_t *array;
 
 	unsigned long sleep_avg;
-	unsigned long sleep_timestamp;
+	unsigned long last_run;
 
 	unsigned long policy;
 	unsigned long cpus_allowed;
--- linux/kernel/sched.c.orig	
+++ linux/kernel/sched.c	
@@ -54,20 +54,19 @@
 /*
  * These are the 'tuning knobs' of the scheduler:
  *
- * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
- * maximum timeslice is 300 msecs. Timeslices get refilled after
+ * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
+ * maximum timeslice is 200 msecs. Timeslices get refilled after
  * they expire.
  */
 #define MIN_TIMESLICE		( 10 * HZ / 1000)
-#define MAX_TIMESLICE		(300 * HZ / 1000)
-#define CHILD_PENALTY		95
+#define MAX_TIMESLICE		(200 * HZ / 1000)
+#define CHILD_PENALTY		50
 #define PARENT_PENALTY		100
 #define EXIT_WEIGHT		3
 #define PRIO_BONUS_RATIO	25
 #define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(2*HZ)
-#define STARVATION_LIMIT	(2*HZ)
-#define NODE_THRESHOLD          125
+#define MAX_SLEEP_AVG		(10*HZ)
+#define STARVATION_LIMIT	(10*HZ)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -323,26 +322,36 @@ static inline int effective_prio(task_t 
  * Also update all the scheduling statistics stuff. (sleep average
  * calculation, priority modifiers, etc.)
  */
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task(p, rq->active);
+	rq->nr_running++;
+}
+
 static inline void activate_task(task_t *p, runqueue_t *rq)
 {
-	unsigned long sleep_time = jiffies - p->sleep_timestamp;
-	prio_array_t *array = rq->active;
+	unsigned long sleep_time = jiffies - p->last_run;
 
 	if (!rt_task(p) && sleep_time) {
 		/*
 		 * This code gives a bonus to interactive tasks. We update
 		 * an 'average sleep time' value here, based on
-		 * sleep_timestamp. The more time a task spends sleeping,
+		 * ->last_run. The more time a task spends sleeping,
 		 * the higher the average gets - and the higher the priority
 		 * boost gets as well.
 		 */
 		p->sleep_avg += sleep_time;
-		if (p->sleep_avg > MAX_SLEEP_AVG)
+		if (p->sleep_avg > MAX_SLEEP_AVG) {
+			int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
 			p->sleep_avg = MAX_SLEEP_AVG;
-		p->prio = effective_prio(p);
+			if (ticks > MAX_SLEEP_AVG)
+				ticks = MAX_SLEEP_AVG;
+			if (!in_interrupt())
+				current->sleep_avg = ticks;
+		}
+ 		p->prio = effective_prio(p);
 	}
-	enqueue_task(p, array);
-	nr_running_inc(rq);
+	__activate_task(p, rq);
 }
 
 /*
@@ -479,10 +488,13 @@ repeat_lock_task:
 			}
 			if (old_state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible--;
-			activate_task(p, rq);
-	
-			if (p->prio < rq->curr->prio)
-				resched_task(rq->curr);
+			if (sync)
+				__activate_task(p, rq);
+			else {
+				activate_task(p, rq);
+				if (p->prio < rq->curr->prio)
+					resched_task(rq->curr);
+			}
 			success = 1;
 		}
 		p->state = TASK_RUNNING;
@@ -525,8 +537,16 @@ void wake_up_forked_process(task_t * p)
 		p->prio = effective_prio(p);
 	}
 	set_task_cpu(p, smp_processor_id());
-	activate_task(p, rq);
 
+	if (unlikely(!current->array))
+		__activate_task(p, rq);
+	else {
+		p->prio = current->prio;
+		list_add_tail(&p->run_list, &current->run_list);
+		p->array = current->array;
+		p->array->nr_active++;
+		rq->nr_running++;
+	}
 	task_rq_unlock(rq, &flags);
 }
 
@@ -953,6 +973,11 @@ static inline void pull_task(runqueue_t 
 	 */
 	if (p->prio < this_rq->curr->prio)
 		set_need_resched();
+	else {
+		if (p->prio == this_rq->curr->prio &&
+				p->time_slice > this_rq->curr->time_slice)
+			set_need_resched();
+	}
 }
 
 /*
@@ -1016,7 +1041,7 @@ skip_queue:
 	 */
 
 #define CAN_MIGRATE_TASK(p,rq,this_cpu)					\
-	((jiffies - (p)->sleep_timestamp > cache_decay_ticks) &&	\
+	((jiffies - (p)->last_run > cache_decay_ticks) &&	\
 		!task_running(rq, p) &&					\
 			((p)->cpus_allowed & (1UL << (this_cpu))))
 
@@ -1076,9 +1101,9 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
  * increasing number of running tasks:
  */
 #define EXPIRED_STARVING(rq) \
-		((rq)->expired_timestamp && \
+		(STARVATION_LIMIT && ((rq)->expired_timestamp && \
 		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1))
+			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
 
 /*
  * This function gets called by the timer code, with HZ frequency.
@@ -1201,7 +1226,7 @@ need_resched:
 	rq = this_rq();
 
 	release_kernel_lock(prev);
-	prev->sleep_timestamp = jiffies;
+	prev->last_run = jiffies;
 	spin_lock_irq(&rq->lock);
 
 	/*
@@ -1701,7 +1726,7 @@ static int setscheduler(pid_t pid, int p
 	else
 		p->prio = p->static_prio;
 	if (array)
-		activate_task(p, task_rq(p));
+		__activate_task(p, task_rq(p));
 
 out_unlock:
 	task_rq_unlock(rq, &flags);
--- linux/kernel/fork.c.orig	
+++ linux/kernel/fork.c	
@@ -916,7 +917,7 @@ static struct task_struct *copy_process(
 	 */
 	p->first_time_slice = 1;
 	current->time_slice >>= 1;
-	p->sleep_timestamp = jiffies;
+	p->last_run = jiffies;
 	if (!current->time_slice) {
 		/*
 	 	 * This case is rare, it happens when the parent has only


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:49                   ` Alan Cox
@ 2003-03-06 17:55                     ` Linus Torvalds
  2003-03-06 18:04                       ` Ingo Molnar
  2003-03-08  0:05                       ` Daniel Phillips
  2003-03-06 17:58                     ` Ingo Molnar
  1 sibling, 2 replies; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 17:55 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ingo Molnar, Jeff Garzik, Andrew Morton, rml, Linux Kernel Mailing List


On 6 Mar 2003, Alan Cox wrote:
> 
> X isn't special at all. Research OS people have done stuff like time transfers
> but I've not seen that in a production OS. In that situation an interactive
> task blocking on a server hands over some of its interactivity to whoever
> services the request.

This is what I really wanted to do - give the interactivity away at 
blocking time. However, there's usually no sane way to do that, since we 
don't know a priori who we are blocking _for_. We could try to transfer it 
in the unix domain socket sleep case to whoever is waiting at the other 
end, but that's actually quite complex and ends up having each sleep entry 
somehow inform the subsystem it is sleeping on that it wants to give 
interactivity to the other end.

The thing about wakeups is that it's an "illogical" place to give the 
bonus ("it's too late to give the server a bonus _now_, I wanted it when I 
went to sleep"), but it's the _trivial_ place to give it.

It also (I'm convinced) is actually in the end exactly equivalent to 
giving the bonus at sleep time - in the steady state picture. In the 
single-transfer case it is wrong, but the single transfer case doesn't 
matter - the interactivity bonus isn't an "immediate shot of caffeine into 
the bloodstream" anyway. The interactivity bonus is supposed to help over 
time, so the only thing that matters is really the steady state.

But the proof is in the pudding. Does this actually make things appear 
"nicer" to people? Or is it just another wanking session?

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:42                     ` Linus Torvalds
  2003-03-06 17:54                       ` Ingo Molnar
@ 2003-03-06 17:55                       ` John Levon
  2003-03-06 18:07                         ` Linus Torvalds
  2003-03-07  5:57                       ` Ingo Molnar
  2003-03-07  6:45                       ` Aaron Lehmann
  3 siblings, 1 reply; 100+ messages in thread
From: John Levon @ 2003-03-06 17:55 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Ingo Molnar, Andrew Morton, Robert Love, linux-kernel

On Thu, Mar 06, 2003 at 09:42:03AM -0800, Linus Torvalds wrote:

> But it was definitely there. 3-5 second _pauses_. Not slowdowns.

It's still there. Red Hat 8.0, 2.5.63. The thing can pause for 15+
seconds (and during this time madplay quite happily trundled on playing
an mp3). Workload was KDE, gcc, nothing exciting...

john

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:49                   ` Alan Cox
  2003-03-06 17:55                     ` Linus Torvalds
@ 2003-03-06 17:58                     ` Ingo Molnar
  1 sibling, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 17:58 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linus Torvalds, Jeff Garzik, Andrew Morton, rml,
	Linux Kernel Mailing List


On 6 Mar 2003, Alan Cox wrote:

> On Thu, 2003-03-06 at 17:11, Ingo Molnar wrote:
> > X is special. Especially in Andrew's wild-window-dragging experiment X is
> > a pure CPU-bound task that just eats CPU cycles no end. The _only_ thing
> > that makes it special is that there's a human looking at the output of the
> > X client. This is information that is simply not available to the kernel.
> 
> Just like a streaming video server
> Just like a 3D game using DRI
> 
> X isn't special at all. [...]

i'm convinced we are talking about the same thing. My argument was that X
is special _only_ because humans are looking at it. Ie. it's not special
at all to the kernel.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:50             ` Ingo Molnar
@ 2003-03-06 18:00               ` Linus Torvalds
  0 siblings, 0 replies; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 18:00 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Ingo Molnar wrote:
> 
> > And my patch may spread it out _too_ much. Maybe we shouldn't give _all_
> > of the left-over interactivity to the waker. Maybe we should give just
> > half of it away..
> 
> yes, not spreading out could also make it possible to give it back via
> multiple wakeup links, interactivity will 'diffuse' along wakeups.

Yes, I think this could work well. What we could do is basically just
always spread the interactivity evenly between the waker and the sleeper,
instead of my current "give the sleeper as much as we can, and if
anything is left over give some to the waker" approach.

So my current patch is extreme in that it tries to use up as much as 
possible of the "interactivity bonus points", but it is _not_ extreme in 
the sense that it doesn't inherit interactivity both ways. 

So my patch will _not_ try to balance a series of three or more processes, 
where only one is interactive. Because it will trigger only for the 
process _directly_ connected to the interactive one.

If we spread it out instead by always trying to balance the interactivity, 
it would spread out from the interactive one along any synchronous wakeup 
chain. Which is more extreme than I was willing to try with a first 
example..
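As a thought experiment, that "balance evenly" variant might look
something like this (hypothetical code, not from any posted patch):

	/*
	 * Split the combined bonus evenly between sleeper and waker,
	 * so interactivity can diffuse along synchronous wakeup chains.
	 */
	if (!in_interrupt()) {
		int share = (p->sleep_avg + current->sleep_avg) / 2;

		if (share > MAX_SLEEP_AVG)
			share = MAX_SLEEP_AVG;
		p->sleep_avg = share;
		current->sleep_avg = share;
	}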

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:54                       ` Ingo Molnar
@ 2003-03-06 18:02                         ` Linus Torvalds
  2003-03-06 18:11                           ` Ingo Molnar
  0 siblings, 1 reply; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 18:02 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Ingo Molnar wrote:
> 
> okay, here's a patch that merges Linus' "priority boost backmerging" and
> my CPU-hog-detection-improvement patches.

I was actually going to suggest making the CPU-hog detector _worse_, to 
see what the extreme behaviour of the "boost balancing" is. One of the 
things I would hope for is that the interactivity balancing would act as a 
damper on the CPU hog detector, and make it less important to get that one 
right in the first place.

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:55                     ` Linus Torvalds
@ 2003-03-06 18:04                       ` Ingo Molnar
  2003-03-06 18:13                         ` Martin J. Bligh
  2003-03-08  0:05                       ` Daniel Phillips
  1 sibling, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 18:04 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Jeff Garzik, Andrew Morton, rml, Linux Kernel Mailing List


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> But the proof is in the pudding. Does this actually make things appear
> "nicer" to people? Or is it just another wanking session?

yes, it would be interesting to see Andrew's experiments redone for the 
following combinations:

   - your patch
   - my patch
   - both patches

in fact my patch was tested and it mostly solved the problem for Andrew,
but i'm now convinced that the combination of patches will be the real
solution for this class of problems - as that will attack _both_ ends:
CPU hogs are recognized better, and interactivity detection and
distribution are improved.

but neither patch solves all problems. The typical DRI game just does
nothing but call the DRI ioctl() and burn CPU time. So i'm convinced we
need one more way to help the kernel identify tasks that are important to
people - increasing the priority of those tasks programmatically is
certainly a workable solution, but there might be other solutions.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:17               ` Ingo Molnar
@ 2003-03-06 18:06                 ` Dimitrie O. Paun
  2003-03-06 22:14                   ` Martin J. Bligh
  0 siblings, 1 reply; 100+ messages in thread
From: Dimitrie O. Paun @ 2003-03-06 18:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jeff Garzik, Linus Torvalds, Andrew Morton, Robert Love, linux-kernel

On Thu, 6 Mar 2003, Ingo Molnar wrote:

> yes, an ELF flag might work, or my suggestion to allow applications to
> increase their priority (up to a certain degree).

An ELF flag might be better, as it's declarative -- it allows the kernel
to implement 'interactivity' in various ways, so we can keep tweaking it.
Priority might prove to be a bit different from interactivity, so we'd
better not conflate the two just yet.

-- 
Dimi.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:55                       ` John Levon
@ 2003-03-06 18:07                         ` Linus Torvalds
  2003-03-06 18:11                           ` John Levon
                                             ` (2 more replies)
  0 siblings, 3 replies; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 18:07 UTC (permalink / raw)
  To: John Levon; +Cc: Ingo Molnar, Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, John Levon wrote:
> On Thu, Mar 06, 2003 at 09:42:03AM -0800, Linus Torvalds wrote:
> 
> > But it was definitely there. 3-5 second _pauses_. Not slowdowns.
> 
> It's still there. Red Hat 8.0, 2.5.63. The  thing can pause for 15+
> seconds (and during this time madplay quite happily trundled on playing
> an mp3). Workload was KDE, gcc, nothing exciting...

Oh, well. I didn't actually even verify that UNIX domain sockets will
cause synchronous wakeups, so the patch may literally be doing nothing at
all. You can try that theory out by just removing the test for 
"in_interrupt()".

It will also almost certainly be totally ineffective if you use TCP 
sockets for the same reason. So if your DISPLAY variable is set to 
"localhost:0" instead of ":0.0", you'd definitely have to remove the 
"in_interrupt()" test.

But it may also just be that the theory is just broken.

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:20                             ` Linus Torvalds
@ 2003-03-06 18:10                               ` David S. Miller
  0 siblings, 0 replies; 100+ messages in thread
From: David S. Miller @ 2003-03-06 18:10 UTC (permalink / raw)
  To: torvalds; +Cc: mingo, levon, akpm, rml, linux-kernel

   From: Linus Torvalds <torvalds@transmeta.com>
   Date: Thu, 6 Mar 2003 10:20:42 -0800 (PST)

   Note that "in_interrupt()" will also trigger for callers that call from
   bh-atomic regions as well as actual BH handlers. Which is correct - they 
   are both "interrupt contexts" as far as most users should be concerned.
   
   The unix domain case may well be bh-atomic, I haven't looked at the code. 
   I'm pretty much certain that the TCP case _will_ be BH-atomic, even for 
   loopback.
   
   David?

Unix sockets use non-BH locks, there are no software interrupts
involved in AF_UNIX processing so no need to protect against them.

The actual wakeup comes from the socket callbacks, we use the
default data_ready() which is:

void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->callback_lock);
        if (sk->sleep && waitqueue_active(sk->sleep))
		wake_up_interruptible(sk->sleep);
	sk_wake_async(sk,1,POLL_IN);
	read_unlock(&sk->callback_lock);
}

And for write wakeups Unix uses its own, which is:

static void unix_write_space(struct sock *sk)
{
        read_lock(&sk->callback_lock);
        if (unix_writable(sk)) {
		if (sk->sleep && waitqueue_active(sk->sleep))
                        wake_up_interruptible(sk->sleep);
                sk_wake_async(sk, 2, POLL_OUT);
        }
        read_unlock(&sk->callback_lock);
}

So, to reiterate, no BH locking is used by AF_UNIX.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:02                         ` Linus Torvalds
@ 2003-03-06 18:11                           ` Ingo Molnar
  0 siblings, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 18:11 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> > okay, here's a patch that merges Linus' "priority boost backmerging" and
> > my CPU-hog-detection-improvement patches.
> 
> I was actually going to suggest making the CPU-hog detector _worse_, to
> see what the extreme behaviour of the "boost balancing" is. One of the
> things I would hope for is that the interactivity balancing would act as
> a damper on the CPU hog detector, and make it less important to get that
> one right in the first place.

this does not appear to happen, at least with the current incarnation:
make -j5 jobs of the kernel tree (on UP) with your patch alone are
hovering at a considerably higher dynamic priority than they are with my
patch. But interactivity itself is good as far as i can tell, in both
cases - though i think Andrew would be a better tester in this regard.

plus my patch doesn't really change the core CPU-hog detector; what it
does is change the way children/parents inherit priorities, plus change
the order in which children get their first timeslice. The end effect is
that the 'CPU-hog' make job is better recognized as such - but X itself
will not be recognized in any different way.
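
the fork-time change, very roughly (illustrative sketch only, not the
actual patch - the CHILD_PENALTY name is made up here):

	/* split the parent's remaining timeslice with the child, and damp
	 * the sleep_avg the child inherits, so that the children of a
	 * compile job do not start out looking 'interactive': */
	p->time_slice = (current->time_slice + 1) >> 1;
	current->time_slice >>= 1;
	p->sleep_avg = current->sleep_avg * CHILD_PENALTY / 100;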

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:07                         ` Linus Torvalds
@ 2003-03-06 18:11                           ` John Levon
  2003-03-06 18:16                             ` Linus Torvalds
  2003-03-06 18:15                           ` Ingo Molnar
  2003-03-06 18:18                           ` Ingo Molnar
  2 siblings, 1 reply; 100+ messages in thread
From: John Levon @ 2003-03-06 18:11 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Ingo Molnar, Andrew Morton, Robert Love, linux-kernel

On Thu, Mar 06, 2003 at 10:07:17AM -0800, Linus Torvalds wrote:

> > > But it was definitely there. 3-5 second _pauses_. Not slowdowns.
> > 
> > It's still there. Red Hat 8.0, 2.5.63. The  thing can pause for 15+
> 
> Oh, well. I didn't actually even verify that UNIX domain sockets will
> cause synchronous wakeups, so the patch may literally be doing nothing at
> all. You can try that theory out by just removing the test for 
> "in_interrupt()".

Sorry, I think I misunderstood what you meant in your comment above. I
haven't actually tried these patches yet, worry not.

regards,
john

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:04                       ` Ingo Molnar
@ 2003-03-06 18:13                         ` Martin J. Bligh
  2003-03-06 18:26                           ` Linus Torvalds
  0 siblings, 1 reply; 100+ messages in thread
From: Martin J. Bligh @ 2003-03-06 18:13 UTC (permalink / raw)
  To: Ingo Molnar, Linus Torvalds
  Cc: Alan Cox, Jeff Garzik, Andrew Morton, rml, Linux Kernel Mailing List

>> But the proof is in the pudding. Does this actually make things appear
>> "nicer" to people? Or is it just another wanking session?
> 
> yes, it would be interesting to see Andrew's experiments redone for the 
> following combinations:
> 
>    - your patch
>    - my patch
>    - both patches
> 
> in fact my patch was tested and it mostly solved the problem for Andrew,
> but i'm now convinced that the combination of patches will be the real
> solution for this class of problems - as it attacks _both_ ends: CPU hogs
> are recognized better, and interactivity detection and interactivity
> distribution are improved.

It would be nice if we had a "batch-job-able" simulation of this situation,
to do more accurate testing with ... can anyone think of an easy way to
do such a thing? "waggle a few windows around on X and see if it feels 
better to you or not" is kind of hard to do accurate testing with.
Of course, the simulation has to be accurate too, or it gets stupid ;-)

M.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:07                         ` Linus Torvalds
  2003-03-06 18:11                           ` John Levon
@ 2003-03-06 18:15                           ` Ingo Molnar
  2003-03-06 18:20                             ` Linus Torvalds
  2003-03-06 18:18                           ` Ingo Molnar
  2 siblings, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 18:15 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: John Levon, Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> > It's still there. Red Hat 8.0, 2.5.63. The  thing can pause for 15+
> > seconds (and during this time madplay quite happily trundled on playing
> > an mp3). Workload was KDE, gcc, nothing exciting...
> 
> Oh, well. I didn't actually even verify that UNIX domain sockets will
> cause synchronous wakeups, so the patch may literally be doing nothing
> at all. You can try that theory out by just removing the test for
> "in_interrupt()".

you are not referring to the 'synchronous wakeups' as used by fs/pipe.c,
right? in_interrupt() separates interrupt-context wakeups (asynchronous
wakeups) from process-context wakeups - which can also be called
synchronous, in a way.

so i think your current patch should cover unix domain sockets just as
well, they certainly don't use IRQ-context wakeups.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:11                           ` John Levon
@ 2003-03-06 18:16                             ` Linus Torvalds
  0 siblings, 0 replies; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 18:16 UTC (permalink / raw)
  To: John Levon; +Cc: Ingo Molnar, Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, John Levon wrote:
> 
> Sorry, I think I misunderstood what you meant in your comment above. I
> haven't actually tried these patches yet, worry not.

Oh, ok. There is still hope. And since you can apparently see the problem 
easily, maybe you can be one of the poor guinea-pigs for the patches 
floating around.

(Damn! 15 second lockups? That's just not much fun any more.)

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:07                         ` Linus Torvalds
  2003-03-06 18:11                           ` John Levon
  2003-03-06 18:15                           ` Ingo Molnar
@ 2003-03-06 18:18                           ` Ingo Molnar
  2 siblings, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 18:18 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: John Levon, Andrew Morton, Robert Love, linux-kernel


in any case, i'm quite comfortable with your patch now, because it just
fits into the thinking that is behind the current boosting concept. So
it's not a random tweak added here or there (like the iopl() boost), it's
an extension of the core interactivity concept, and as such it cannot have
bad effects, unless the core concept is faulty. (which i don't think it
is.)

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:15                           ` Ingo Molnar
@ 2003-03-06 18:20                             ` Linus Torvalds
  2003-03-06 18:10                               ` David S. Miller
  0 siblings, 1 reply; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 18:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: John Levon, Andrew Morton, Robert Love, linux-kernel, David S. Miller


On Thu, 6 Mar 2003, Ingo Molnar wrote:
> > Oh, well. I didn't actually even verify that UNIX domain sockets will
> > cause synchronous wakeups, so the patch may literally be doing nothing
> > at all. You can try that theory out by just removing the test for
> > "in_interrupt()".
> 
> you are not referring to the 'synchronous wakeups' as used by fs/pipe.c,
> right?

No, sorry. Bad choice of words.

The traditional "synchronous wakeups" as used by fs/pipe.c is a hint to 
the scheduler that the waker will go to sleep.

And no, that's not the hint I'm using at all. I'm only interested in 
"process-synchronous", since if the wakeup isn't process-synchronous then 
"current" doesn't make much sense to me.

> so i think your current patch should cover unix domain sockets just as
> well, they certainly don't use IRQ-context wakeups.

Note that "in_interrupt()" will also trigger for callers that call from
bh-atomic regions as well as actual BH handlers. Which is correct - they 
are both "interrupt contexts" as far as most users should be concerned.

The unix domain case may well be bh-atomic, I haven't looked at the code. 
I'm pretty much certain that the TCP case _will_ be BH-atomic, even for 
loopback.

David?

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 16:01             ` Jeff Garzik
  2003-03-06 16:51               ` Linus Torvalds
  2003-03-06 17:17               ` Ingo Molnar
@ 2003-03-06 18:25               ` Alan Cox
  2003-03-06 17:35                 ` Linus Torvalds
  2003-03-06 22:00               ` Bill Davidsen
  3 siblings, 1 reply; 100+ messages in thread
From: Alan Cox @ 2003-03-06 18:25 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Linus Torvalds, Andrew Morton, rml, mingo, Linux Kernel Mailing List

On Thu, 2003-03-06 at 16:01, Jeff Garzik wrote:
> Pardon the suggestion of a dumb heuristic, feel free to ignore me: 
> would it work to run-first processes that have modified their iopl() 
> level?  i.e. "if you access hardware directly, we'll treat you specially 
> in the scheduler"?

Not all X servers do that. X is not special in any way. It's just a
daemon. It has the same timing properties as many other daemons doing
time-critical operations for many clients.



^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:13                         ` Martin J. Bligh
@ 2003-03-06 18:26                           ` Linus Torvalds
  0 siblings, 0 replies; 100+ messages in thread
From: Linus Torvalds @ 2003-03-06 18:26 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Ingo Molnar, Alan Cox, Jeff Garzik, Andrew Morton, rml,
	Linux Kernel Mailing List


On Thu, 6 Mar 2003, Martin J. Bligh wrote:
> 
> It would be nice if we had a "batch-job-able" simulation of this situation,
> to do more accurate testing with ... can anyone think of an easy way to
> do such a thing? "waggle a few windows around on X and see if it feels 
> better to you or not" is kind of hard to do accurate testing with.
> Of course, the simulation has to be accurate too, or it gets stupid ;-)

It should be possible to use the same approach that the other latency
testers use (ie contest&co), by just generating some specific background
load (say, different mixtures of X clients and plain compute-bound things 
like kernel compiles).

Ie measure directly exactly what we're interested in: the latency of some
general X request, under different loads.
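
Something as dumb as timing a no-op round trip would probably do
(hypothetical sketch, not a real tool):

	/* measure the latency of a trivial X server round trip */
	#include <X11/Xlib.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/time.h>

	int main(void)
	{
		Display *dpy = XOpenDisplay(NULL);
		struct timeval t0, t1;

		if (!dpy)
			return 1;
		for (;;) {
			gettimeofday(&t0, NULL);
			XSync(dpy, False);	/* forces a server round trip */
			gettimeofday(&t1, NULL);
			printf("%ld usec\n",
			       (t1.tv_sec - t0.tv_sec) * 1000000L +
			       (t1.tv_usec - t0.tv_usec));
			usleep(100000);		/* sample ten times a second */
		}
	}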

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:52                   ` jvlists
@ 2003-03-06 18:27                     ` Ingo Molnar
  2003-03-07  8:15                       ` Xavier Bestel
  2003-03-06 23:18                     ` Robert Love
  1 sibling, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-06 18:27 UTC (permalink / raw)
  To: jvlists; +Cc: Linus Torvalds, Jeff Garzik, Andrew Morton, rml, linux-kernel


On Thu, 6 Mar 2003 jvlists@ntlworld.com wrote:

> P.S. IMVHO the xine problem is completely different as it has nothing to
> do with interactivity but with the fact that it is soft real-time. i.e.
> you need to distinguish xine from say a gimp filter or a 3D renderer with
> incremental live updates of the scene it is creating.

it is the same category of problems: xine and X are both applications
which, if lagged, are noticed by users. xine can be a perfectly fine CPU
hog when playing back DVDs. It can also be a mostly interactive task when
just playing back music. For xine it's not just the audio skipping that
gets noticed, it's also the video-playback jerkiness that users can
notice.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:11                 ` Ingo Molnar
  2003-03-06 17:52                   ` jvlists
@ 2003-03-06 18:49                   ` Alan Cox
  2003-03-06 17:55                     ` Linus Torvalds
  2003-03-06 17:58                     ` Ingo Molnar
  1 sibling, 2 replies; 100+ messages in thread
From: Alan Cox @ 2003-03-06 18:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Jeff Garzik, Andrew Morton, rml,
	Linux Kernel Mailing List

On Thu, 2003-03-06 at 17:11, Ingo Molnar wrote:
> X is special. Especially in Andrew's wild-window-dragging experiment X is
> a pure CPU-bound task that just eats CPU cycles no end. The _only_ thing
> that makes it special is that there's a human looking at the output of the
> X client. This is information that is simply not available to the kernel.

Just like a streaming video server.
Just like a 3D game using DRI.

X isn't special at all. Research OS people have done stuff like time transfers
but I've not seen that in a production OS. In that situation an interactive
task blocking on a server hands some of its interactivity on to whoever
services the request.

That's not a trivial Linux addition however.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06  3:20       ` Linus Torvalds
  2003-03-06  7:45         ` Andrew Morton
  2003-03-06 17:13         ` Ingo Molnar
@ 2003-03-06 20:42         ` Andrew Morton
  2003-03-06 22:18           ` Robert Love
                             ` (3 more replies)
  2003-03-06 22:03         ` Martin Waitz
  3 siblings, 4 replies; 100+ messages in thread
From: Andrew Morton @ 2003-03-06 20:42 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: rml, mingo, linux-kernel

Linus Torvalds <torvalds@transmeta.com> wrote:
>
> 
> ===== kernel/sched.c 1.161 vs edited =====
> --- 1.161/kernel/sched.c	Thu Feb 20 20:33:52 2003
> +++ edited/kernel/sched.c	Wed Mar  5 19:09:45 2003
> @@ -337,8 +337,15 @@
>  		 * boost gets as well.
>  		 */
>  		p->sleep_avg += sleep_time;
> -		if (p->sleep_avg > MAX_SLEEP_AVG)
> +		if (p->sleep_avg > MAX_SLEEP_AVG) {
> +			int ticks = p->sleep_avg - MAX_SLEEP_AVG + current->sleep_avg;
>  			p->sleep_avg = MAX_SLEEP_AVG;
> +			if (ticks > MAX_SLEEP_AVG)
> +				ticks = MAX_SLEEP_AVG;
> +			if (!in_interrupt())
> +				current->sleep_avg = ticks;
> +		}
> +			
>  		p->prio = effective_prio(p);
>  	}
>  	enqueue_task(p, array);

This improves the X interactivity tremendously.  I went back to 2.5.64 base
just to verify, and the difference was very noticeable.

The test involved doing the big kernel compile while moving large xterm,
mozilla and sylpheed windows about.  With this patch the mouse cursor was
sometimes a little jerky (0.1 seconds, perhaps) and mozilla redraws were
maybe 0.5 seconds laggy.

So.  A big thumbs up on that one.  It appears to be approximately as
successful as sched-2.5.64-a5.

Ingo's combo patch is better still - that is sched-2.5.64-a5 and your patch
combined (yes?).  The slight mouse jerkiness is gone and even when doing
really silly things I cannot make it misbehave at all.  I'd handwavingly
describe both your patch and sched-2.5.64-a5 as 80% solutions, and the combo
95%.


So I'm a happy camper, and will be using Ingo's combo patch.  But I do not
use XMMS and xine and things like that - they may be running like crap with
these patches.  I do not know, and I do not have a base to compare against
even if I could work out how to get them going.


I did spend a bit of time a while back trying to come up with some
combination of tests which would exhibit this problem, and which would allow
it to be measured and tweaked.  It was unsuccessful - the peculiar
combination of a compilation and X seems to bring it out.



^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 16:01             ` Jeff Garzik
                                 ` (2 preceding siblings ...)
  2003-03-06 18:25               ` Alan Cox
@ 2003-03-06 22:00               ` Bill Davidsen
  3 siblings, 0 replies; 100+ messages in thread
From: Bill Davidsen @ 2003-03-06 22:00 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Linux Kernel Mailing List

On Thu, 6 Mar 2003, Jeff Garzik wrote:

> Pardon the suggestion of a dumb heuristic, feel free to ignore me: 
> would it work to run-first processes that have modified their iopl() 
> level?  i.e. "if you access hardware directly, we'll treat you specially 
> in the scheduler"?
> 
> An alternative is to encourage distros to set some sort of flag for 
> processes like the X server, when it is run.  This sounds suspiciously 
> like the existing "renice X server" hack, but it could be something like 
> changing the X server from SCHED_OTHER  to SCHED_HW_ACCESS instead.
> 
> Just throwing out some things, because I care about apps which access 
> hardware from userspace :)

Well, close. But any low-latency access, even if somewhat buffered,
is likely to be user visible if unresponsive. Clearly that means video
memory like X and DRI, and sound, and mouse. If you define this generally
it would include serial as well, probably not a bad thing.

The proposal to feed priority back from interactive processes to the
processes waking them sounds useful, though perhaps it should be limited a
bit. Someone (Ingo? Linus?) proposed limiting this to a fraction of the
priority of the interactive process, but it might be more useful to
simply limit how much can be added at one time, perhaps as a fraction of
the delta between the max and the current interactive bonus, which would
have the waker increase asymptotically toward the max. Hysteresis is nice.
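
Something like this, purely as an illustration of the asymptotic idea
(invented, not an actual patch):

	/* each wakeup closes a fixed fraction of the gap to the max, so
	 * the waker's bonus approaches the max but never jumps there: */
	waker->sleep_avg += (MAX_SLEEP_AVG - waker->sleep_avg) / 4;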

I do agree that it would be better to identify processes using physical
devices, far better than trying to identify some subset because it's
obvious.

I think this would include the parallel port, although other than PLIP I
can't think of a case where it matters much.

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06  3:20       ` Linus Torvalds
                           ` (2 preceding siblings ...)
  2003-03-06 20:42         ` Andrew Morton
@ 2003-03-06 22:03         ` Martin Waitz
  2003-03-06 22:07           ` Robert Love
  3 siblings, 1 reply; 100+ messages in thread
From: Martin Waitz @ 2003-03-06 22:03 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Robert Love, mingo, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1635 bytes --]

hi :)

On Wed, Mar 05, 2003 at 07:20:34PM -0800, Linus Torvalds wrote:
> How about something more like this (yeah, untested, but you get the idea): 
> the person who wakes up an interactive task gets the interactivity bonus 
> if the interactive task is already maxed out. I dunno how well this plays 
> with the X server, but assuming most clients use UNIX domain sockets, the 
> wake-ups _should_ be synchronous, so it should work well to say "waker 
> gets bonus".

i used a similar method to correctly account resource usage
(cpu,energy,..) of processes in my diploma thesis:
work done by a server (e.g. X) is accounted to the current client,
giving more resources to the server
http://admingilde.org/~martin/papers/

implementation is working but far from being mergeable...




RE: the patch, i think using sleep_avg is a wrong metric from the
beginning.

in addition, timeslices should be shortest for high priority processes
(depending on dynamic prio, not static)

but these are of course just simple statements and i don't have
a patch that makes a really good scheduler :(

-- 
CU,		  / Friedrich-Alexander University Erlangen, Germany
Martin Waitz	//  [Tali on IRCnet]  [tali.home.pages.de] _________
______________/// - - - - - - - - - - - - - - - - - - - - ///
this is a manually generated mail; it contains typos    //
and is valid even without capital letters.             /
			    -
Those who would give up essential liberty to purchase a little 
temporary safety deserve neither 
liberty nor safety.                  Benjamin Franklin (1706 - 1790)

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 22:03         ` Martin Waitz
@ 2003-03-06 22:07           ` Robert Love
  2003-03-06 22:35             ` Martin Waitz
  2003-03-07 10:27             ` Ingo Molnar
  0 siblings, 2 replies; 100+ messages in thread
From: Robert Love @ 2003-03-06 22:07 UTC (permalink / raw)
  To: Martin Waitz; +Cc: Linus Torvalds, Andrew Morton, mingo, linux-kernel

On Thu, 2003-03-06 at 17:03, Martin Waitz wrote:

> RE: the patch, i think using sleep_avg is a wrong metric from the
> beginning.

Eh?  It is as close to a heuristic of interactivity as I can think of.

> in addition, timeslices should be shortest for high priority processes
> (depending on dynamic prio, not static)

No, they should be longer.  In some cases they should be nearly
infinitely long (which is sort of what we do with the reinsertion into
the active array for highly interactive tasks).  We want interactive
tasks to always be able to run.

You may think they need a shorter timeslice because they are I/O-bound. 
Keep in mind they need not _use_ all their timeslice in one go, and
having a large timeslice ensures they have timeslice available when they
wake up and need to run.
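
For reference, the expiry logic in scheduler_tick() looks roughly like
this (simplified):

	/* on timeslice expiry, an interactive task goes straight back into
	 * the active array (unless the expired array is starving), so its
	 * effective timeslice is nearly infinite: */
	if (!--p->time_slice) {
		dequeue_task(p, rq->active);
		set_tsk_need_resched(p);
		p->prio = effective_prio(p);
		p->time_slice = task_timeslice(p);

		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
			if (!rq->expired_timestamp)
				rq->expired_timestamp = jiffies;
			enqueue_task(p, rq->expired);
		} else
			enqueue_task(p, rq->active);
	}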

	Robert Love


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:06                 ` Dimitrie O. Paun
@ 2003-03-06 22:14                   ` Martin J. Bligh
  2003-03-06 22:31                     ` Dimitrie O. Paun
  0 siblings, 1 reply; 100+ messages in thread
From: Martin J. Bligh @ 2003-03-06 22:14 UTC (permalink / raw)
  To: Dimitrie O. Paun, Ingo Molnar
  Cc: Jeff Garzik, Linus Torvalds, Andrew Morton, Robert Love, linux-kernel

>> yes, an ELF flag might work, or my suggestion to allow applications to
>> increase their priority (up until a certain degree).
> 
> An ELF flag might be better, as it's declarative -- it allows the kernel
> to implement 'interactivity' in various ways, so we can keep tweaking it.
> Priority might prove to be a bit different from interactivity, so we'd
> better not overload the two just yet.

Difficult to see how this would work. For instance, is bash interactive 
or a batch job?

M.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 20:42         ` Andrew Morton
@ 2003-03-06 22:18           ` Robert Love
  2003-03-07  5:52           ` Ingo Molnar
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 100+ messages in thread
From: Robert Love @ 2003-03-06 22:18 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, mingo, linux-kernel

On Thu, 2003-03-06 at 15:42, Andrew Morton wrote:

> So I'm a happy camper, and will be using Ingo's combo patch.  But I do not
> use XMMS and xine and things like that - they may be running like crap with
> these patches.  I do not know, and I do not have a base to compare against
> even if I could work out how to get them going.

Linus,

This is great for me, too.  I played around with some mp3 playing and
did the akpm-window-wiggle test.  It is definitely the smoothest.

I think we definitely need Ingo's tweaked scheduler parameters - I have
been running a similar set of values myself for some time.  But your
patch seems to make the difference.

This is the most subjective stuff on the planet, but here is a rough
ranking of interactivity performance in the bad cases, on a scale of 1
(worst) to 5 (best):

linus-patch + tweaked-parameters:	5
linus-patch:				4
tweaked-parameters + reniced X:		3.5
tweaked-parameters:			2.5
stock:					1

Sorry, did not test Ingo's full patch.  It is basically the tweaked
parameters plus the sync wakeup which looks correct.

In the average case, the O(1) scheduler does fine without any changes. 
The heuristic works.  It is just the worst cases where we need help, and 
from above I think we have that.

	Robert Love


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:35                 ` Linus Torvalds
@ 2003-03-06 22:30                   ` Eric Northup
  2003-03-06 23:40                     ` Martin J. Bligh
  0 siblings, 1 reply; 100+ messages in thread
From: Eric Northup @ 2003-03-06 22:30 UTC (permalink / raw)
  To: Linus Torvalds, Alan Cox
  Cc: Jeff Garzik, Andrew Morton, rml, mingo, Linux Kernel Mailing List

On Thursday 06 March 2003 12:35 pm, Linus Torvalds wrote:
> On 6 Mar 2003, Alan Cox wrote:
> > Not all X servers do that. X is not special in any way. Its just a
> > daemon. It has the same timing properties as many other daemons doing
> > time critical operations for many clients
>
> I really think that this is the most important observation about the
> behaviour. X really isn't doing anything that other daemons don't do. It
> so happens that what X is doing is a lot more complex than most daemons
> do, so it's fairly easy to trigger X into using a fair amount of CPU time,
> but that only makes the breakdown case so much easier to see.
>
> In all honesty, though, it's also obviously the case that the breakdown
> case in the case of X is _really_ visible in a very very concrete way,
> since X is the _only_ thing really "visible" unless you have Direct
> Rendering clients. So even if the problem were to happen with another
> daemon, it probably wouldn't stand out quite as badly. So in that sense X
> ends up having some special characteristics - "visibility".

Yes.  The special thing about X (and XMMS, XINE, et al) is that the 
interactivity heuristic fails in a user-noticeable way.  The heuristic must 
also be guessing incorrectly in other situations, but nobody seems to have 
noticed when it did (or, at least, complained about it).

Another noteworthy attribute of these programs is that they _know_ they will 
be used interactively.  And, if there were a way for them to convey that 
information to the kernel (not by using 'nice'), I bet all of the above 
projects would take advantage of that mechanism.

At first I strongly disapproved of the kernel's timeslice adjustment by 
interactivity estimation; policy belongs in userland, I thought.  But the 
algorithm is actually quite effective -- a balance that gives high-HZ 
responsiveness to real-time-ish applications while avoiding cache-thrashing 
in the places it would hurt.  This policy works well most of the time, so 
the kernel includes it as a sensible default.  But X demonstrates that 
sometimes we want to set a different policy.  After all, the heuristic will 
not become perfect, and its worst-case behavior is pretty bad.  (If your 
computers are too fast to notice, try turning off the cache with /proc/mtrr.  
Not a great idea on an important system though...)

The question is: what interface would be used?  A new syscall seems excessive; 
but something in /proc (/proc/xyz/preferred_timeslice ?) is non-optimal, 
because it cannot be used from within a chroot.  I am not qualified to suggest 
the Proper mechanism, but I strongly believe there should be one.

--Eric


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 22:14                   ` Martin J. Bligh
@ 2003-03-06 22:31                     ` Dimitrie O. Paun
  2003-03-06 23:05                       ` Robert Love
  0 siblings, 1 reply; 100+ messages in thread
From: Dimitrie O. Paun @ 2003-03-06 22:31 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Ingo Molnar, Jeff Garzik, Linus Torvalds, Andrew Morton,
	Robert Love, linux-kernel

On Thu, 6 Mar 2003, Martin J. Bligh wrote:

> Difficult to see how this would work. For instance, is bash interactive 
> or a batch job?

Right, being able to control this interactivity knob programmatically
seems like a useful thing. That way, the window manager can boost the
interactivity of the foreground window, for example. It does seem that
figuring out that something is interactive in the scheduler is tough
(there is just not enough information), whereas a higher layer may know
this for a fact. I guess this reduces my argument to just keeping the
interactivity setting separate from priority.

-- 
Dimi.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 22:07           ` Robert Love
@ 2003-03-06 22:35             ` Martin Waitz
  2003-03-06 22:56               ` Robert Love
  2003-03-07 10:27             ` Ingo Molnar
  1 sibling, 1 reply; 100+ messages in thread
From: Martin Waitz @ 2003-03-06 22:35 UTC (permalink / raw)
  To: Robert Love; +Cc: Linus Torvalds, Andrew Morton, mingo, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2455 bytes --]

On Thu, Mar 06, 2003 at 05:07:37PM -0500, Robert Love wrote:
> On Thu, 2003-03-06 at 17:03, Martin Waitz wrote:
> 
> > RE: the patch, i think using sleep_avg is a wrong metric from the
> > beginning.
> Eh?  It is as close to a heuristic of interactivity as I can think of.

processes tend to max out at one extreme or the other

processes that get stalled by a huge overall load of the machine
are forced to sleep, too; yet they are not interactive.

priority should be based on things the processes did, not on what
they haven't done.

> > in addition, timeslices should be shortest for high priority processes
> > (depending on dynamic prio, not static)
> 
> No, they should be longer.  In some cases they should be nearly
> infinitely long (which is sort of what we do with the reinsertion into
> the active array for highly interactive tasks).  We want interactive
> tasks to always be able to run.

but for interactive tasks, latency is all-important: if the task can't
provide a result in a short timeslice, it has already failed to provide
low latency and should not be considered interactive any more.

when the dynamic prio of the currently running process is high,
then only interactive processes should run.
but these processes should be rescheduled fast, to
provide low-latency equally to all interactive processes

when the dynamic prio of the currently running process is low
then no interactive process runs and the scheduler can resort
to a more batch-job mode: increase timeslices to reduce
scheduling overhead.

> You may think they need shorter timeslice because they are I/O-bound. 
> Keep in mind they need not _use_ all their timeslice in one go, and
> having a large timeslice ensures they have timeslice available when they
> wake up and need to run.
but the time slice should not be large enough to stall other
processes, which is extremely important for interactivity
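
as a sketch of what i mean (invented scaling, not a real patch):

	/* short slices for high dynamic priority (interactive) tasks, for
	 * low latency; long slices for low priority (batch) tasks, for low
	 * scheduling overhead: */
	static inline unsigned int task_timeslice(task_t *p)
	{
		return MIN_TIMESLICE + (MAX_TIMESLICE - MIN_TIMESLICE) *
			(p->prio - MAX_RT_PRIO) / (MAX_PRIO - MAX_RT_PRIO - 1);
	}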

-- 
CU,		  / Friedrich-Alexander University Erlangen, Germany
Martin Waitz	//  [Tali on IRCnet]  [tali.home.pages.de] _________
______________/// - - - - - - - - - - - - - - - - - - - - ///
this is a manually generated mail; it contains typos    //
and is valid even without capital letters.             /
			    -
Those who would give up essential liberty to purchase a little 
temporary safety deserve neither 
liberty nor safety.                  Benjamin Franklin (1706 - 1790)

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 22:35             ` Martin Waitz
@ 2003-03-06 22:56               ` Robert Love
  2003-03-06 23:27                 ` Martin Waitz
  0 siblings, 1 reply; 100+ messages in thread
From: Robert Love @ 2003-03-06 22:56 UTC (permalink / raw)
  To: Martin Waitz; +Cc: Linus Torvalds, Andrew Morton, mingo, linux-kernel

On Thu, 2003-03-06 at 17:35, Martin Waitz wrote:

> processes tend to max out at one extreme or the other
> 
> processes that get stalled by a huge overall load of the machine
> are forced to sleep, too; yet they are not interactive.

Not running != Sleeping

> priority should be based on things the processes did, not on what
> they haven't done.

It _is_ based on something the process has done: sleeping in the kernel

> but for interactive tasks, latency is all-important
> if the task can't provide a result in a short timeslice it already
> failed to provide low latency and should not be considered interactive
> any more.

Ahh, but you are confusing timeslice with the time a process runs for.

This is the flaw I pointed out in my initial email :)

A process may have a 100ms timeslice, but only run for 1ms at a time
(100x before recalculating its quanta).  This is the intention of giving
large timeslices to interactive tasks: not that they need all 100ms at
_once_ but they may need some of it soon, and when they need it they
really NEED it, so make sure they have enough timeslice such that there
is plenty available when they wake up (since the latency is important,
as you said).

> but the time slice should not be large enough to stall other
> processes, which is extremely important for interactivity

Once a process stalls other processes (i.e. by running a long time i.e.
by being a CPU hog) then it loses its interactive bonus.

The heuristic works.  No one doubts that, modulo the corner cases we are
working on fixing.  I suggest you read the code.

	Robert Love


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 22:31                     ` Dimitrie O. Paun
@ 2003-03-06 23:05                       ` Robert Love
  2003-03-06 23:33                         ` Martin J. Bligh
  0 siblings, 1 reply; 100+ messages in thread
From: Robert Love @ 2003-03-06 23:05 UTC (permalink / raw)
  To: Dimitrie O. Paun
  Cc: Martin J. Bligh, Ingo Molnar, Jeff Garzik, Linus Torvalds,
	Andrew Morton, linux-kernel

On Thu, 2003-03-06 at 17:31, Dimitrie O. Paun wrote:

> Right, being able to control this interactivity knob programmatically
> seems like a useful thing. That way, the window manager can boost the
> interactivity of the foreground window for example. It does seem that
> figuring out that something is interactive in the scheduler is tough,
> there is just not enough information, whereas a higher layer may know
> this for a fact. I guess this reduces my argument to just keeping the 
> interactivity setting separate from priority.

No no no.  Martin's point shows exactly that nothing but the kernel can
ever know whether a task is I/O or CPU bound.  What is bash?  Is it
interactive (when you are typing into it) or CPU-bound (when it's running
a script or doing other junk)?

Only the kernel knows exactly the sleep patterns of tasks, which is
essentially whether or not a task is interactive.

Finally, the windows manager idea is bad.  The foreground window may
have dependencies elsewhere, and giving it a boost only partially solves
the problem.

I think with Linus's patch, the problem is solved, because we boost both
I/O-bound tasks and tasks that help I/O bound tasks.

	Robert Love


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:52                   ` jvlists
  2003-03-06 18:27                     ` Ingo Molnar
@ 2003-03-06 23:18                     ` Robert Love
  1 sibling, 0 replies; 100+ messages in thread
From: Robert Love @ 2003-03-06 23:18 UTC (permalink / raw)
  To: jvlists
  Cc: Ingo Molnar, Linus Torvalds, Jeff Garzik, Andrew Morton, linux-kernel

On Thu, 2003-03-06 at 12:52, jvlists@ntlworld.com wrote:

> P.S. IMVHO the xine problem is completely different as it has nothing to
> do with interactivity but with the fact that it is soft
> real-time. i.e. you need to distinguish xine from say a gimp filter or
> a 3D renderer with incremental live updates of the scene it is
> creating.

Xine is the same as X or vi or xmms, for this problem.

They are all, in fact, soft real-time issues where the latency limit we
want is whatever is the threshold of human perception.  This limit may
be more or less for DVD playback vs. typing vs. music playback... but
ultimately there is some latency threshold you want maintained.  If Xine
is starved too long, your video skips.  If vi is starved too long, you
can perceive a lag in your keystrokes showing up.  It is all the same.

Maybe we do not care about the latency of the gimp filter.  Those are
indeed different.  But the interactivity of other things - say, gimp's
menus in response to you clicking them - are all the same sort of thing.

	Robert Love


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 22:56               ` Robert Love
@ 2003-03-06 23:27                 ` Martin Waitz
  2003-03-06 23:36                   ` Robert Love
  0 siblings, 1 reply; 100+ messages in thread
From: Martin Waitz @ 2003-03-06 23:27 UTC (permalink / raw)
  To: Robert Love; +Cc: Linus Torvalds, Andrew Morton, mingo, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2528 bytes --]

hi :)

On Thu, Mar 06, 2003 at 05:56:06PM -0500, Robert Love wrote:
> On Thu, 2003-03-06 at 17:35, Martin Waitz wrote:
> 
> > processes tend to max out at one extreme or the other
> > 
> > processes that get stalled by a huge overall load of the machine
> > are forced to sleep, too; yet they are not interactive.
> 
> Not running != Sleeping

schedule() does prev->sleep_timestamp = jiffies; just before
deactivating prev.
so i guess inactivity is counted towards sleep_avg, too

> A process may have a 100ms timeslice, but only run for 1ms at a time
> (100x before recalculating its quanta).
but it _can_ use the 100ms at a time if it is cpu-bound
what's so bad about recalculating the quantum?

> This is the intention of giving large timeslices to interactive tasks:
> not that they need all 100ms at _once_ but they may need some of it
> soon, and when they need it they really NEED it, so make sure they
> have enough timeslice such that there is plenty available when they
> wake up (since the latency is important, as you said).
but most of the time, not _one_ process is woken up, but several at once

if it happens that the first one to run is cpu-bound,
then all other interactive processes have to wait a long time, even
if they would only need 1ms to finish their work.

scheduling overhead was successfully brought down to a minimum
thanks to the great work of a lot of people.
i think we should use that work to improve latency by reducing
the available timeslice for interactive processes.

if the process is still considered interactive after the timeslice has run
out, nothing is lost; it simply gets another one.

but the kernel should get the chance to frequently reschedule
when interactivity is needed.

> Once a process stalls other processes (i.e. by running a long time i.e.
> by being a CPU hog) then it loses its interactive bonus.
but it takes too long. 100ms is noticeable

> I suggest you read the code.
i've read it ;)

-- 
CU,		  / Friedrich-Alexander University Erlangen, Germany
Martin Waitz	//  [Tali on IRCnet]  [tali.home.pages.de] _________
______________/// - - - - - - - - - - - - - - - - - - - - ///
this is a manually generated mail; it contains typos    //
and is valid even without capital letters.             /
			    -
Those who would give up essential liberty to purchase a little 
temporary safety deserve neither 
liberty nor safety.                  Benjamin Franklin (1706 - 1790)

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 23:05                       ` Robert Love
@ 2003-03-06 23:33                         ` Martin J. Bligh
  0 siblings, 0 replies; 100+ messages in thread
From: Martin J. Bligh @ 2003-03-06 23:33 UTC (permalink / raw)
  To: Robert Love, Dimitrie O. Paun
  Cc: Ingo Molnar, Jeff Garzik, Linus Torvalds, Andrew Morton, linux-kernel

>> Right, being able to control this interactivity knob programmatically
>> seems like a useful thing. That way, the window manager can boost the
>> interactivity of the foreground window for example. It does seem that
>> figuring out that something is interactive in the scheduler is tough,
>> there is just not enough information, whereas a higher layer may know
>> this for a fact. I guess this reduces my argument to just keeping the 
>> interactivity setting separate from priority.
> 
> No no no.  Martin's point shows exactly that nothing but the kernel can
> ever know whether a task is I/O or CPU bound.  What is bash?  Is it
> interactive (when you are typing into it) or CPU bound (when its running
> a script or doing other junk)?
> 
> Only the kernel knows exactly the sleep patterns of tasks, which is
> essentially whether or not a task is interactive.

Exactly ... all this tweaking, and setting up every app individually is bad.
It should "just frigging work" ;-) We seem to be pretty close to that
at the moment - 2.5 feels *so* much better than 2.4 already (2.4 degenerates
into a total slug overnight, presumably when things like man page reindexes
thrash the page cache).

The fact that the Debian renice of the X server actually breaks things is
probably good news ... we're actually paying real attention to the nice
value ;-)

M.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 23:27                 ` Martin Waitz
@ 2003-03-06 23:36                   ` Robert Love
  2003-03-07 10:39                     ` Ingo Molnar
  0 siblings, 1 reply; 100+ messages in thread
From: Robert Love @ 2003-03-06 23:36 UTC (permalink / raw)
  To: Martin Waitz; +Cc: Linus Torvalds, Andrew Morton, mingo, linux-kernel

On Thu, 2003-03-06 at 18:27, Martin Waitz wrote:

> schedule() does prev->sleep_timestamp = jiffies; just before
> deactivating prev.
> so i guess inactivity is counted towards sleep_avg, too

That is just the initial value.  See activate_task() which actually sets
the sleep_avg value.  If the task is never woken up, sleep_timestamp is
never used and sleep_avg is not touched.

sleep_avg is the important value.

sleep_timestamp is misnamed; it's really just the jiffies value at
which the task last ran.  Ingo renamed it "last_run" in his latest
patch.
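
The relevant logic is roughly this (simplified from kernel/sched.c, using
the new field name):

	static inline void activate_task(task_t *p, runqueue_t *rq)
	{
		/* 'last_run' is the old sleep_timestamp */
		unsigned long sleep_time = jiffies - p->last_run;

		if (!rt_task(p) && sleep_time) {
			/* sleep_avg only grows here, at wakeup: time spent
			 * runnable but not running does not count as sleep */
			p->sleep_avg += sleep_time;
			if (p->sleep_avg > MAX_SLEEP_AVG)
				p->sleep_avg = MAX_SLEEP_AVG;
			p->prio = effective_prio(p);
		}
		enqueue_task(p, rq->active);
		rq->nr_running++;
	}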

> but most of the time, not _one_ process is waken up, but several at once
> 
> if it happens that the first who gets to run is cpu-bound,
> then all other interactive processes have to wait a long time, even
> if they would only need 1ms to finish their work.

Interactive tasks also have a higher dynamic priority.  So they will be
the ones to run.

> scheduling overhead was successfully brought down to a minimum
> thanks to the great work of a lot of people.
> i think we should use that work to improve latency by reducing
> the available timeslice for interactive processes.
>
> if the process is still considered interactive after the time slice had run
> out, nothing is lost; it simply gets another one.
> 
> but the kernel should get the chance to frequently reschedule
> when interactivity is needed.

I understand your point, but we get that now without using super small
timeslices.

Giving interactive tasks a larger timeslice ensures they can always run
when they need to.  It also lets us set an upper bound and not have to
recalculate timeslices constantly.

If an interactive task _does_ use all its timeslice, then it is no
longer interactive and that will be noted and it will lose its bonus.

	Robert Love


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 22:30                   ` Eric Northup
@ 2003-03-06 23:40                     ` Martin J. Bligh
  2003-03-07  0:02                       ` Linus Torvalds
  0 siblings, 1 reply; 100+ messages in thread
From: Martin J. Bligh @ 2003-03-06 23:40 UTC (permalink / raw)
  To: digitale, Linus Torvalds, Alan Cox
  Cc: Jeff Garzik, Andrew Morton, rml, mingo, Linux Kernel Mailing List

> At first I strongly dissaproved of the kernel's timeslice adjustment by 
> interactivity estimation; policy belongs in userland, I thought.  But the 

That seems to be one of the most dramatically overused misguided statements
bandied about in Linux at the moment, with respect to just about every 
subsystem, and it's really starting to get annoying.

Yes, you should be able to frig with policy decisions from userspace if
you really want to, BUT the kernel should do something pretty sane by 
default without user intervention. It's exactly the same mistake apps
keep making, and why half of them are unusable.

M.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 23:40                     ` Martin J. Bligh
@ 2003-03-07  0:02                       ` Linus Torvalds
  2003-03-07  0:09                         ` Dimitrie O. Paun
  0 siblings, 1 reply; 100+ messages in thread
From: Linus Torvalds @ 2003-03-07  0:02 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: digitale, Alan Cox, Jeff Garzik, Andrew Morton, rml, mingo,
	Linux Kernel Mailing List


On Thu, 6 Mar 2003, Martin J. Bligh wrote:
> 
> Yes, you should be able to frig with policy decisions from userspace, if
> you really want to BUT the kernel should do something pretty sane by 
> default without user intervention.

Amen, brother!

One thing that I've often disliked about "tuning parameters" is that many 
people take them to be valid excuses for bad behaviour under some load. 
For example, when some benchmark does badly, or something stutters, you'll 
always find somebody pointing out that it can be fixed by doing a

	echo 55 > /proc/sys/mm/dirty_ratio

or similar.  And the fact that you can tune the thing for certain loads 
somehow thus makes the bug ok..

IT AIN'T SO! Tuning constants are wonderfully useful for developers tuning 
their guesses a bit, and to give more information about what knobs make a 
difference.

In other words, it's ok for a developer to say: "Ok, what happens if you
do the above echo?" to figure out whether maybe the problem is due to
excessive spurgel-production in the VM frobnicator, but it is NOT OK to 
say "you can tweak it for your use, so don't complain about the bad
behaviour".

This is the same thing as "'nice -10' the X-server". Something is wrong, 
and we couldn't fix it, so here's the band-aid to avoid that problem for 
that particular case. It's acceptable as a band-aid, but if you don't 
realize that it's indicative of a problem, then you're just kidding 
yourself.

Can we do perfectly all the time? No. Sometimes best performance _will_
require tuning for the load. But it should be seen as a failure of the
generic algorithm, not as a success.

Btw, "success" is often "being good enough". We shouldn't _suck_, but you 
should always remember the old "perfect is the enemy of good" thing. 
Trying to get perfect behaviour under some circumstances often means that 
you suck under others, and then the right answer is usually not to try to 
be so damn perfect, but "just being good".

		Linus


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  0:02                       ` Linus Torvalds
@ 2003-03-07  0:09                         ` Dimitrie O. Paun
  0 siblings, 0 replies; 100+ messages in thread
From: Dimitrie O. Paun @ 2003-03-07  0:09 UTC (permalink / raw)
  To: Linus Torvalds, Martin J. Bligh
  Cc: digitale, Alan Cox, Jeff Garzik, Andrew Morton, rml, mingo,
	Linux Kernel Mailing List

On March 6, 2003 07:02 pm, Linus Torvalds wrote:
> Btw, "success" is often "being good enough". We shouldn't _suck_, but you
> should always remember the old "perfect is the enemy of good" thing.
> Trying to get perfect behaviour under some circumstances often means that
> you suck under others, and then the right answer is usually not to try to
> be so damn perfect, but "just being good".

Very much so! I cannot agree more with you that the lack of a good default
policy is stupid -- this is why X was in such a bad state for years!
I could write pages on this subject, but I'm glad other people share my
ideas in this matter :)

That being said, I have nothing against an automatic algorithm. If we can
get that one to do what we want, there's obviously no need for knobs. All
I'm saying is that *if* there are cases where we need help, we should not
have to tweak the priority of a process, but create a different knob.
Priority is a different beast from interactivity. For example, it's clear
that good interactive behaviour _requires_ information only available in
the kernel. Which makes the interactivity knob (if required at all) more
of a hint than a set-in-stone thing like the priority.

But I guess all this is academic. If the automatic stuff just works, 
we're better off anyway.

-- 
Dimi.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:03                 ` Linus Torvalds
  2003-03-06 17:24                   ` Ingo Molnar
@ 2003-03-07  3:19                   ` Horst von Brand
  1 sibling, 0 replies; 100+ messages in thread
From: Horst von Brand @ 2003-03-07  3:19 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Ingo Molnar, Andrew Morton, rml, linux-kernel

Linus Torvalds <torvalds@transmeta.com> said:
> On Thu, 6 Mar 2003, Ingo Molnar wrote:
> > the whole compilation (gcc tasks) will be rated 'interactive' as well,
> > because an 'interactive' make process and/or shell process is waiting on
> > it.
> 
> No. The make that is waiting for it will be woken up _once_ - when the 
> thing dies. Marking it interactive at that point is absolutely fine.

It is woken up each time one Makefile line has been processed, to call the
next one.
-- 
Dr. Horst H. von Brand                   User #22616 counter.li.org
Departamento de Informatica                     Fono: +56 32 654431
Universidad Tecnica Federico Santa Maria              +56 32 654239
Casilla 110-V, Valparaiso, Chile                Fax:  +56 32 797513

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  5:52           ` Ingo Molnar
@ 2003-03-07  5:51             ` Shawn
  2003-03-07  6:01               ` Ingo Molnar
  2003-03-07  6:07               ` Andrew Morton
  0 siblings, 2 replies; 100+ messages in thread
From: Shawn @ 2003-03-07  5:51 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel

On Thu, 2003-03-06 at 23:52, Ingo Molnar wrote:
> On Thu, 6 Mar 2003, Andrew Morton wrote:
> 
> > This improves the X interactivity tremendously.  I went back to 2.5.64
> > base just to verify, and the difference was very noticeable.
> > 
> > The test involved doing the big kernel compile while moving large xterm,
> > mozilla and sylpheed windows about.  With this patch the mouse cursor
> > was sometimes a little jerky (0.1 seconds, perhaps) and mozilla redraws
> > were maybe 0.5 seconds laggy.
> > 
> > So.  A big thumbs up on that one.  It appears to be approximately as
> > successful as sched-2.5.64-a5.

I'm sorry, I'm an idiot. Where's sched-2.5.64-a5 available for download?
The latest I found was sched-2.5.63-B3 inside Andrew's patchset.

I'm kinda drooling over this thread.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 20:42         ` Andrew Morton
  2003-03-06 22:18           ` Robert Love
@ 2003-03-07  5:52           ` Ingo Molnar
  2003-03-07  5:51             ` Shawn
  2003-03-07  6:15           ` Ingo Molnar
  2003-03-07  7:45           ` Mike Galbraith
  3 siblings, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  5:52 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linus Torvalds, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Andrew Morton wrote:

> This improves the X interactivity tremendously.  I went back to 2.5.64
> base just to verify, and the difference was very noticeable.
> 
> The test involved doing the big kernel compile while moving large xterm,
> mozilla and sylpheed windows about.  With this patch the mouse cursor
> was sometimes a little jerky (0.1 seconds, perhaps) and mozilla redraws
> were maybe 0.5 seconds laggy.
> 
> So.  A big thumbs up on that one.  It appears to be approximately as
> successful as sched-2.5.64-a5.
> 
> Ingo's combo patch is better still - that is sched-2.5.64-a5 and your
> patch combined (yes?).  The slight mouse jerkiness is gone and even when
> doing really silly things I cannot make it misbehave at all.  I'd
> handwavingly describe both your patch and sched-2.5.64-a5 as 80%
> solutions, and the combo 95%.

great! I.e. both approaches go in the same (good) direction, and the fact
that the combined effect is even better than either patch alone (although
it's hard to improve something that is 'good' already) shows that there's
no conflict between the two concepts.

> So I'm a happy camper, and will be using Ingo's combo patch.  But I do
> not use XMMS and xine and things like that - they may be running like
> crap with these patches.  I do not know, and I do not have a base to
> compare against even if I could work out how to get them going.

these always used to be problematic to a certain degree, so they are
next on the list.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  5:57                       ` Ingo Molnar
@ 2003-03-07  5:54                         ` Shawn
  2003-03-07  6:04                           ` Ingo Molnar
  2003-03-07 14:28                         ` jlnance
  1 sibling, 1 reply; 100+ messages in thread
From: Shawn @ 2003-03-07  5:54 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel

On Thu, 2003-03-06 at 23:57, Ingo Molnar wrote:
> so the correct approach is both to make X more interactive (your patch),
> _and_ to make the compilation jobs less interactive (my patch). This is
> what explains why Andrew saw roughly similar interactivity with your patch
> and my patch applied separately, but the best result came when the combo
> patch was applied. Agreed?

Even better, can you point me to where this "combo" patch can be found?


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:42                     ` Linus Torvalds
  2003-03-06 17:54                       ` Ingo Molnar
  2003-03-06 17:55                       ` John Levon
@ 2003-03-07  5:57                       ` Ingo Molnar
  2003-03-07  5:54                         ` Shawn
  2003-03-07 14:28                         ` jlnance
  2003-03-07  6:45                       ` Aaron Lehmann
  3 siblings, 2 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  5:57 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Linus Torvalds wrote:

> The multi-second "freezes" are the thing that bothered me, and those
> were definitely due to the fact that X was competing as a
> _non_interactive member against other non-interactive members, causing
> it to still get 10% of the CPU, but only every few seconds. So you'd get
> a very bursty behaviour with very visible pauses.

this is only part of what happens in the 'X freeze' case. A CPU hog
competing against another CPU hog can never result in a multi-second
freeze, unless the system is hopelessly overloaded.

What happens is that the CPU-hog estimation is off for compile jobs, and
that _they_ end up being partly interactive, and starve X which does
happen to fall into the CPU-hog category briefly. The 10-15 second
timeout is the starvation limit kicking in.

so the correct approach is both to make X more interactive (your patch),
_and_ to make the compilation jobs less interactive (my patch). This is
what explains why Andrew saw roughly similar interactivity with your patch
and my patch applied separately, but the best result came when the combo
patch was applied. Agreed?
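
In sketch form, the estimator being described works roughly like this
(illustrative constants and names, not the actual -B3 code):

/* Illustrative sketch of a sleep_avg-style estimator; constants and
 * names are hypothetical, not the exact 2.5.63-B3 code. */
#define MAX_SLEEP_AVG	(2 * HZ)	/* cap on accumulated sleep credit */
#define MAX_BONUS	10		/* max dynamic-priority boost */

static int interactivity_bonus(unsigned long sleep_avg)
{
	/* tasks that sleep a lot look "interactive" and get a bigger
	 * boost - which is how a compile job that spends most of its
	 * time waiting on pipes can be misclassified as interactive */
	return sleep_avg * MAX_BONUS / MAX_SLEEP_AVG;
}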

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  5:51             ` Shawn
@ 2003-03-07  6:01               ` Ingo Molnar
  2003-03-07  6:07               ` Andrew Morton
  1 sibling, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  6:01 UTC (permalink / raw)
  To: Shawn; +Cc: linux-kernel


On 6 Mar 2003, Shawn wrote:

> > > So.  A big thumbs up on that one.  It appears to be approximately as
> > > successful as sched-2.5.64-a5.
> 
> I'm sorry, I'm an idiot. Where's sched-2.5.64-a5 available for download?
> Latest I found was sched-2.5.63-B3 inside Andrew's patchset.

-A5 was the last update of my patch, which I sent before the 'combo' patch,
attached to the end of the mail, with the subject line:

   Subject: [patch] "interactivity changes", sched-2.5.64-A5

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  5:54                         ` Shawn
@ 2003-03-07  6:04                           ` Ingo Molnar
  0 siblings, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  6:04 UTC (permalink / raw)
  To: Shawn; +Cc: linux-kernel


On 6 Mar 2003, Shawn wrote:

> > so the correct approach is both to make X more interactive (your patch),
> > _and_ to make the compilation jobs less interactive (my patch). This is
> > what explains why Andrew saw roughly similar interactivity with your patch
> > and my patch applied separately, but the best result came when the combo
> > patch was applied. Agreed?
> 
> Even better, can you point me to where this "combo" patch can be found?

in the same thread, a few mails later, sent by me. But better yet, try the
latest BK kernel - it has the 'combo' patch merged.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  5:51             ` Shawn
  2003-03-07  6:01               ` Ingo Molnar
@ 2003-03-07  6:07               ` Andrew Morton
  1 sibling, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2003-03-07  6:07 UTC (permalink / raw)
  To: Shawn; +Cc: mingo, linux-kernel

Shawn <core@enodev.com> wrote:
>
> On Thu, 2003-03-06 at 23:52, Ingo Molnar wrote:
> > On Thu, 6 Mar 2003, Andrew Morton wrote:
> > 
> > > This improves the X interactivity tremendously.  I went back to 2.5.64
> > > base just to verify, and the difference was very noticeable.
> > > 
> > > The test involved doing the big kernel compile while moving large xterm,
> > > mozilla and sylpheed windows about.  With this patch the mouse cursor
> > > was sometimes a little jerky (0.1 seconds, perhaps) and mozilla redraws
> > > were maybe 0.5 seconds laggy.
> > > 
> > > So.  A big thumbs up on that one.  It appears to be approximately as
> > > successful as sched-2.5.64-a5.
> 
> I'm sorry, I'm an idiot. Where's sched-2.5.64-a5 available for download?
> Latest I found was sched-2.5.63-B3 inside Andrew's patchset.

It all got merged.

http://www.kernel.org/pub/linux/kernel/v2.5/testing/cset/cset-1.1068.1.17-to-1.1107.txt.gz


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 20:42         ` Andrew Morton
  2003-03-06 22:18           ` Robert Love
  2003-03-07  5:52           ` Ingo Molnar
@ 2003-03-07  6:15           ` Ingo Molnar
  2003-03-07 10:19             ` Helge Hafting
  2003-03-07  7:45           ` Mike Galbraith
  3 siblings, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  6:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linus Torvalds, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Andrew Morton wrote:

> So I'm a happy camper, and will be using Ingo's combo patch.  But I do
> not use XMMS and xine and things like that - they may be running like
> crap with these patches. [...]

one thing the -A5 patch does is decrease the default timeslice value
from 150 msecs to 100 msecs. While increasing the timeslice seemed
like a good idea originally (and even with -A5 it's still higher than in
2.4), 150 msecs was too much: it ended up adding just that little bit
of extra latency to audio apps that made them more likely to skip. So audio
apps may well perform a bit better. Certain gaming-related problems, I
suspect, are related to the overly long timeslice as well.

one of the problems with XMMS is that it's often not just an audio app -
it has plugins with fancy graphics, which take up much more CPU time than
the audio decoding/feeding itself, and it's not as threaded as e.g. xine.
It was _this_ interaction that was the root of many XMMS complaints - not
the fact that XMMS does audio.

the xine situation I don't think will change with these patches: xine
simply uses up 90% of CPU time soft-playing a DVD from the DVD player, on
a 500 MHz x86 CPU, and that makes it qualify as a CPU hog no matter what.
Furthermore, it uses the Xv extension, which makes it communicate much less
with X itself. But xine's problems are not due to interactive tasks; xine
is hurting due to other CPU hogs (stuff that triggers in desktops
regularly) taking away timeslices. I believe we should still enable
application programmers to give certain apps _some_ minor priority boost,
so that other CPU hogs cannot starve xine. The fact that xine played
back perfectly with a +2 boost shows that this could be quite a powerful
tool.
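
To illustrate what such a minor boost looks like from the application
side (a userspace sketch, not part of any patch in this thread), a
player could request a small nice boost at startup:

/* sketch: a media player giving itself a small priority boost.
 * Negative nice values need root (or CAP_SYS_NICE). */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	if (setpriority(PRIO_PROCESS, 0, -2) < 0)	/* the +2 boost above */
		perror("setpriority");

	/* ... decode and play audio ... */
	return 0;
}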

But indeed this all needs to be re-checked with the latest scheduler.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:42                     ` Linus Torvalds
                                         ` (2 preceding siblings ...)
  2003-03-07  5:57                       ` Ingo Molnar
@ 2003-03-07  6:45                       ` Aaron Lehmann
  2003-03-07  6:50                         ` Ingo Molnar
  2003-03-07  7:00                         ` Andrew Morton
  3 siblings, 2 replies; 100+ messages in thread
From: Aaron Lehmann @ 2003-03-07  6:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Ingo Molnar, Andrew Morton, Robert Love, linux-kernel

On Thu, Mar 06, 2003 at 09:42:03AM -0800, Linus Torvalds wrote:
> But it was definitely there. 3-5 second _pauses_. Not slowdowns.

I can second this. Using Linux 2.5.5x, untarring a file while
compiling could cause X to freeze for several seconds at a time.
I haven't seen this problem recently, though I do experience my share
of XMMS skips.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  6:45                       ` Aaron Lehmann
@ 2003-03-07  6:50                         ` Ingo Molnar
  2003-03-07  7:00                           ` Aaron Lehmann
  2003-03-07  7:00                         ` Andrew Morton
  1 sibling, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  6:50 UTC (permalink / raw)
  To: Aaron Lehmann; +Cc: Linus Torvalds, Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Aaron Lehmann wrote:

> > But it was definitely there. 3-5 second _pauses_. Not slowdowns.
> 
> I can second this. Using Linux 2.5.5x, untarring a file while
> compiling could cause X to freeze for several seconds at a time.
> I haven't seen this problem recently, [...]

I believe this is rather due to IO scheduling / VM throttling. Andrew
added some nice improvements lately, so this should really not happen with
2.5.64 kernels.

> [...] though I do experience my share of XMMS skips.

okay, could you please test BK-curr, or 2.5.64+combo-patch? Do the skips
still persist? Did they perhaps get worse? I guess it might take a few
days of music listening while doing normal desktop activity to get a good
feel for it, though.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  6:50                         ` Ingo Molnar
@ 2003-03-07  7:00                           ` Aaron Lehmann
  2003-03-07  7:36                             ` Ingo Molnar
  0 siblings, 1 reply; 100+ messages in thread
From: Aaron Lehmann @ 2003-03-07  7:00 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linus Torvalds, Andrew Morton, Robert Love, linux-kernel

On Fri, Mar 07, 2003 at 07:50:59AM +0100, Ingo Molnar wrote:
> > [...] though I do experience my share of XMMS skips.
> 
> okay, could you please test BK-curr, or 2.5.64+combo-patch? Do the skips
> still persist? Did they perhaps get worse? I guess it might take a few
> days of music listening while doing normal desktop activity to get a good
> feel for it, though.

I was able to reproduce them by selecting text in Mathematica (ugh,
not a very helpful example). The skips were shorter and about three
times as hard to trigger as on 2.5.63.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  6:45                       ` Aaron Lehmann
  2003-03-07  6:50                         ` Ingo Molnar
@ 2003-03-07  7:00                         ` Andrew Morton
  1 sibling, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2003-03-07  7:00 UTC (permalink / raw)
  To: Aaron Lehmann; +Cc: torvalds, mingo, rml, linux-kernel

Aaron Lehmann <aaronl@vitelus.com> wrote:
>
> On Thu, Mar 06, 2003 at 09:42:03AM -0800, Linus Torvalds wrote:
> > But it was definitely there. 3-5 second _pauses_. Not slowdowns.
> 
> I can second this. Using Linux 2.5.5x, untarring a file while
> compiling could cause X to freeze for several seconds at a time.

This may be a VM/block/IO scheduler thing.  If the X server is trying to page
some text in and the disk is busy writing some stuff out then yes, the X
server can freeze for some time.

Of course, a ton of work has gone into this problem, and more work continues
to be done.

If your machine has a small amount of memory, or is heavily overcommitted,
then it is more likely to happen.

Using the CFQ or AS IO schedulers will help a bit.  And decreasing
/proc/sys/vm/swappiness may help too.

I'd be interested in feedback on the latter.  The default policy of 60% seems
to be about right for the 128M to 256M desktops, but I am running 85% on
larger memory machines so that a bit more stuff gets paged out.
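
For reference, the knob can be changed at runtime; a minimal sketch in C
(equivalent to echoing a value into the proc file, and needing root):

/* sketch: set vm.swappiness at runtime, equivalent to
 * `echo 85 > /proc/sys/vm/swappiness` */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/swappiness", "w");

	if (!f) {
		perror("/proc/sys/vm/swappiness");
		return 1;
	}
	fprintf(f, "85\n");	/* higher = page out more aggressively */
	fclose(f);
	return 0;
}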



^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  7:00                           ` Aaron Lehmann
@ 2003-03-07  7:36                             ` Ingo Molnar
  2003-03-08 18:28                               ` Aaron Lehmann
  0 siblings, 1 reply; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  7:36 UTC (permalink / raw)
  To: Aaron Lehmann; +Cc: Linus Torvalds, Andrew Morton, Robert Love, linux-kernel


On Thu, 6 Mar 2003, Aaron Lehmann wrote:

> > okay, could you please test BK-curr, or 2.5.64+combo-patch? Do the skips
> > still persist? Did they perhaps get worse? I guess it might take a few
> > days of music listening while doing normal desktop activity to get a good
> > feel for it, though.
> 
> I was able to reproduce them by selecting text in Mathematica (ugh, not
> a very helpful example). The skips were shorter and about three times as
> hard to trigger as on 2.5.63.

okay, just as a data point, could you try to renice the player
process/thread to -2? Does it make the skipping harder to trigger?
How about -5, or -10?

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 20:42         ` Andrew Morton
                             ` (2 preceding siblings ...)
  2003-03-07  6:15           ` Ingo Molnar
@ 2003-03-07  7:45           ` Mike Galbraith
  2003-03-07  7:45             ` Ingo Molnar
       [not found]             ` <Pine.LNX.4.44.0303070842420.4572-100000@localhost.localdomain>
  3 siblings, 2 replies; 100+ messages in thread
From: Mike Galbraith @ 2003-03-07  7:45 UTC (permalink / raw)
  To: Andrew Morton, Linus Torvalds; +Cc: rml, mingo, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1937 bytes --]

At 12:42 PM 3/6/2003 -0800, Andrew Morton wrote:
>Ingo's combo patch is better still - that is sched-2.5.64-a5 and your patch
>combined (yes?).  The slight mouse jerkiness is gone and even when doing
>really silly things I cannot make it misbehave at all.  I'd handwavingly
>describe both your patch and sched-2.5.64-a5 as 80% solutions, and the combo
>95%.

Weeeeell, FWIW my box (p3-500/128mb/rage128) disagrees.

I can still barely control the box when a make -j5 bzImage is running under 
X/KDE in one terminal and a vmstat (SCHED_RR) in another. I'm not swapping, 
though a bit of idle junk does page out.  IOW, I think I'm seeing serious 
cpu starvation.

For reference, I used to be able to easily control the same box under X 
while swapping moderately at make -j15 (during page aging vm days), and 
with a heavily twiddled VM at even -j30 (_way_ overloaded with X oinkers 
running).  At -j5, gui/icky was quite usable.

With the make -j30 bzImage (a heavy load which does _hit_ VM saturation, but 
not constantly), I see what I can only describe as starvation.  vmstat 
output for this load is definitely abby-normal, and there's no way 
/proc/vmstat should look like the attached.  It takes forever to build 
enough vm pressure that swap is touched (bad bad bad), and the volume 
reached is way low.  My hungry little cc1's are not sharing resources the 
way they should be (methinks).

With earlier versions of Ingo's patch, I was only able to get memory 
pressure up to where it should be by drastically reducing timeslice (at the 
expense of turning throughput to crud).  Take this with a giant economy 
sized grain of salt, but what I _think_ (no, I haven't instrumented it) is 
happening is that chunks of the total job are being rescheduled in front of 
the rest of the job such that they alloc/use/exit before the other guys can 
alloc.  (geez that's a nebulous WAG;)

         -Mike  (please torch me if I'm being stupid;) 

[-- Attachment #2: data.txt --]
[-- Type: text/plain, Size: 1275 bytes --]

box = p3/500/128MB
fresh boot
time make -j30 bzImage
cat /proc/vmstat

                  2.5.64-B3  2.5.64-virgin
------------------------------------------
nr_dirty                 64           2448
nr_writeback              0              0
nr_pagecache           7432           7562
nr_page_table_pages      85             85
nr_reverse_maps        3348           3177
nr_mapped               634            579
nr_slab                1239           1235
pgpgin               288560         692725
pgpgout              228803         587572
pswpin                30734         116097
pswpout               47347         138888
pgalloc              918016        1046369
pgfree               940744        1068983
pgactivate            28577          45566
pgdeactivate          55265         172833
pgfault             1579517        1682466
pgmajfault             8463          28137
pgscan               556041        1678315
pgrefill             211542         643564
pgsteal               77311         194216
pginodesteal              0              0
kswapd_steal          73454         173378
kswapd_inodesteal     11069          11317
pageoutrun              222            509
allocstall               85            480
pgrotated             49798         140956

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  7:45           ` Mike Galbraith
@ 2003-03-07  7:45             ` Ingo Molnar
       [not found]             ` <Pine.LNX.4.44.0303070842420.4572-100000@localhost.localdomain>
  1 sibling, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  7:45 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Andrew Morton, Linus Torvalds, rml, linux-kernel


On Fri, 7 Mar 2003, Mike Galbraith wrote:

> Weeeeell, FWIW my box (p3-500/128mb/rage128) disagrees.

UP box, right?

> I can still barely control the box when a make -j5 bzImage is running
> under X/KDE in one terminal and a vmstat (SCHED_RR) in another. I'm not
> swapping, though a bit of idle junk does page out.  IOW, I think I'm
> seeing serious cpu starvation.

which precise scheduler are you using, BK-curr? And to see whether the
problem is related to scheduling at all, could you try to renice X to -10?
[this is not a solution, this is just a test to see whether the problem is
scheduling-related.]

is there any way we could exclude/isolate the VM as the source of
interactivity problems? E.g. any chance to get more RAM into that box?

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
       [not found]             ` <Pine.LNX.4.44.0303070842420.4572-100000@localhost.localdomain>
@ 2003-03-07  8:09               ` Mike Galbraith
  2003-03-07  8:10                 ` Andrew Morton
                                   ` (2 more replies)
  0 siblings, 3 replies; 100+ messages in thread
From: Mike Galbraith @ 2003-03-07  8:09 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Linus Torvalds, rml, linux-kernel

At 08:45 AM 3/7/2003 +0100, Ingo Molnar wrote:

>On Fri, 7 Mar 2003, Mike Galbraith wrote:
>
> > Weeeeell, FWIW my box (p3-500/128mb/rage128) disagrees.
>
>UP box, right?

Yes, p3/500 w. 128mb ram and rage128.

> > I can still barely control the box when a make -j5 bzImage is running
> > under X/KDE in one terminal and a vmstat (SCHED_RR) in another. I'm not
> > swapping, though a bit of idle junk does page out.  IOW, I think I'm
> > seeing serious cpu starvation.
>
>which precise scheduler are you using, BK-curr? And to see whether the
>problem is related to scheduling at all, could you try to renice X to -10?
>[this is not a solution, this is just a test to see whether the problem is
>scheduling-related.]

I plugged your combo patch into virgin .64.

>is there any way we could exclude/isolate the VM as the source of
>interactivity problems? E.g. any chance to get more RAM into that box?

I can (could with your earlier patches anyway) eliminate the X stalls by 
setting X junk to SCHED_FIFO.  I don't have ram to plug in, but I'm as 
certain as I can be without actually doing so that it's not a ram shortage.

Best would be for other testers to run some tests.  With the make -j30 
weirdness, I _suspect_ that other oddities (hmm... multi-client db load... 
query service time) will show.

         -Mike 


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  8:09               ` Mike Galbraith
@ 2003-03-07  8:10                 ` Andrew Morton
  2003-03-07  8:26                   ` Mike Galbraith
  2003-03-07  8:15                 ` Ingo Molnar
       [not found]                 ` <Pine.LNX.4.44.0303070913370.5173-100000@localhost.localdomain>
  2 siblings, 1 reply; 100+ messages in thread
From: Andrew Morton @ 2003-03-07  8:10 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: mingo, torvalds, rml, linux-kernel

Mike Galbraith <efault@gmx.de> wrote:
>
> ...
> Best would be for other testers to run some tests.  With the make -j30 
> weirdness, I _suspect_ that other oddities (hmm... multi-client db load... 
> query service time) will show.
> 

Yes, this is the second surprise interaction between the CPU scheduler
and the IO system.

Perhaps.  In your case it seems that you're simply unable to generate
the amount of concurrency which you used to.  Which would make it purely
a CPU scheduler thing.

It is not necessarily a bad thing (unless your total runtime has increased?)
but we need to understand what has happened.

What filesystem are you using?

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  8:09               ` Mike Galbraith
  2003-03-07  8:10                 ` Andrew Morton
@ 2003-03-07  8:15                 ` Ingo Molnar
       [not found]                 ` <Pine.LNX.4.44.0303070913370.5173-100000@localhost.localdomain>
  2 siblings, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  8:15 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Andrew Morton, Linus Torvalds, rml, linux-kernel


On Fri, 7 Mar 2003, Mike Galbraith wrote:

> I can (could with your earlier patches anyway) eliminate the X stalls by
> setting X junk to SCHED_FIFO.  I don't have ram to plug in, but I'm as
> certain as I can be without actually doing so that it's not ram
> shortage.

okay. Can you eliminate the X stalls by setting X priority to -10 or so
(or SCHED_FIFO - although SCHED_FIFO is much more dangerous)? And how does
interactivity under the same load look with vanilla .64, as compared
to .64+combo? The first step is to ensure that the new changes did not
actually hurt interactivity.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 18:27                     ` Ingo Molnar
@ 2003-03-07  8:15                       ` Xavier Bestel
  0 siblings, 0 replies; 100+ messages in thread
From: Xavier Bestel @ 2003-03-07  8:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: jvlists, Linus Torvalds, Jeff Garzik, Andrew Morton, rml,
	Linux Kernel Mailing List

Le jeu 06/03/2003 à 19:27, Ingo Molnar a écrit:
> On Thu, 6 Mar 2003 jvlists@ntlworld.com wrote:
> 
> > P.S. IMVHO the xine problem is completely different as has nothing to
> > with interactivity but with the fact that it is soft real-time. i.e. you
> > need to distingish xine from say a gimp filter or a 3D renderer with
> > incremental live updates of the scene it is creating.
> 
> it is the same category of problems: xine and X are both applications,
> which, if lagged, are noticed by users.

Actually I don't think so:
- X and games need hard interactivity: they have to quickly compute a
response to an input.
- xine can precompute a few frames if it wants (I dunno if it does) and
just needs precise timing to display them and sync the audio buffers.
It could make do with CPU slices if X does the right 'display this frame at
this time' thing.

	Xav


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  8:10                 ` Andrew Morton
@ 2003-03-07  8:26                   ` Mike Galbraith
  0 siblings, 0 replies; 100+ messages in thread
From: Mike Galbraith @ 2003-03-07  8:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: mingo, torvalds, rml, linux-kernel

At 12:10 AM 3/7/2003 -0800, Andrew Morton wrote:
>Mike Galbraith <efault@gmx.de> wrote:
> >
> > ...
> > Best would be for other testers to run some tests.  With the make -j30
> > weirdness, I _suspect_ that other oddities (hmm... multi-client db load...
> > query service time) will show.
> >
>
>Yes, this is the second surprise interaction between the CPU scheduler
>and the IO system.
>
>Perhaps.  In your case it seems that you're simply unable to generate
>the amount of concurrency which you used to.  Which would make it purely
>a CPU scheduler thing.

Yes, of this I'm sure.

>It is not necessarily a bad thing (unless your total runtime has increased?)
>but we need to understand what has happened.

Total system throughput is fine, and as expected, the only difference is 
the modest overhead of paging heftily or lightly and/or not at all.  The 
realtime throughput difference between kernels is ~10 seconds... very 
definitely a concurrency issue imvho.  And I agree 100% that this is not 
_necessarily_ a bad thing.  The time it takes for _any_ pressure to build 
looks decidedly bad though.  With the combo patch it does look better than 
earlier patches... sudden bursts of paging do not occur, it's..... 
smoother once it starts acting something resembling normal.

>What filesystem are you using?

The build is running in a small ext2 partition.

         -Mike 


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  8:34                   ` Mike Galbraith
@ 2003-03-07  8:31                     ` Ingo Molnar
  0 siblings, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  8:31 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Andrew Morton, Linus Torvalds, rml, linux-kernel


On Fri, 7 Mar 2003, Mike Galbraith wrote:

> Oh, it's definitely _much_ better than .virgin.  I didn't mean to imply
> that damage was done ;-)  With .virgin, I have a _heck_ of a time
> grabbing the build window to kill it.  With combo, rodent jerkiness
> makes it hard to find/grab, and multisecond stalls are still there, but
> much improved.

ok! [breathing lighter].

could you still try the renice -10 thing with the combo patch, to see how
much of this thing is X actually not getting enough timeslices, vs. other
effects (such as X applications not getting enough timeslices)?

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
       [not found]                 ` <Pine.LNX.4.44.0303070913370.5173-100000@localhost.localdomain>
@ 2003-03-07  8:34                   ` Mike Galbraith
  2003-03-07  8:31                     ` Ingo Molnar
  0 siblings, 1 reply; 100+ messages in thread
From: Mike Galbraith @ 2003-03-07  8:34 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Linus Torvalds, rml, linux-kernel

At 09:15 AM 3/7/2003 +0100, Ingo Molnar wrote:

>On Fri, 7 Mar 2003, Mike Galbraith wrote:
>
> > I can (could with your earlier patches anyway) eliminate the X stalls by
> > setting X junk to SCHED_FIFO.  I don't have ram to plug in, but I'm as
> > certain as I can be without actually doing so that it's not ram
> > shortage.
>
>okay. Can you eliminate the X stalls by setting X priority to -10 or so
>(or SCHED_FIFO - although SCHED_FIFO is much more dangerous)? And how does
>interactivity under the same load look with vanilla .64, as compared
>to .64+combo? The first step is to ensure that the new changes did not
>actually hurt interactivity.

Oh, it's definitely _much_ better than .virgin.  I didn't mean to imply 
that damage was done ;-)  With .virgin, I have a _heck_ of a time grabbing 
the build window to kill it.  With combo, rodent jerkiness makes it hard to 
find/grab, and multisecond stalls are still there, but much improved.

         -Mike 


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  6:15           ` Ingo Molnar
@ 2003-03-07 10:19             ` Helge Hafting
  2003-03-07 11:06               ` Ingo Molnar
  0 siblings, 1 reply; 100+ messages in thread
From: Helge Hafting @ 2003-03-07 10:19 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel

Ingo Molnar wrote:
 > I believe we should still enable
> application programmers to give certain apps _some_ minor priority boost,
> so that other CPU hogs cannot starve xine.

But we don't really need further kernel support for that, do we?
I know a user currently cannot raise priority, but the user can
run all his normal apps at slightly lower priority, except for xine.
And the admin/distributor can set everything up for using the slightly
lower priority by default.  Well, perhaps all this involves so
much use of "nice" that kernel support is a good idea anyway...

Helge Hafting


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 22:07           ` Robert Love
  2003-03-06 22:35             ` Martin Waitz
@ 2003-03-07 10:27             ` Ingo Molnar
  1 sibling, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07 10:27 UTC (permalink / raw)
  To: Robert Love; +Cc: Martin Waitz, Linus Torvalds, Andrew Morton, linux-kernel


On 6 Mar 2003, Robert Love wrote:

> > in addition, timeslices should be shortest for high priority processes
> > (depending on dynamic prio, not static)
> 
> No, they should be longer.  In some cases they should be nearly
> infinitely long (which is sort of what we do with the reinsertion into
> the active array for highly interactive tasks). [...]

yes, and in fact tasks with 'infinite timeslice' do exist: RT tasks with 
SCHED_FIFO/SCHED_RR.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 23:36                   ` Robert Love
@ 2003-03-07 10:39                     ` Ingo Molnar
  0 siblings, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07 10:39 UTC (permalink / raw)
  To: Robert Love; +Cc: Martin Waitz, Linus Torvalds, Andrew Morton, linux-kernel


On 6 Mar 2003, Robert Love wrote:

> > but the kernel should get the chance to frequently reschedule
> > when interactivity is needed.
> 
> I understand your point, but we get that now without using super small
> timeslices.

well, the -A6 patch does reduce the timeslice length; it's certainly one
tool to improve "interactivity" of CPU-bound tasks. At the moment we
cannot reduce it arbitrarily though, because right now the way to
implement different CPU-sharing priorities is via the timeslice
calculator. With 100 msec default timeslices, the biggest difference one
can get is a 10 msec timeslice for a nice +19 task, and a 100 msec
timeslice for a nice 0 task. I.e. a nice +19 CPU-bound task uses up ~10% of
CPU time while a nice 0 task uses up ~90% of CPU time, if both are running
at once - roughly.
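
In sketch form, that coupling looks like this (a simplification matching
the numbers above, not the literal kernel source):

/* simplified sketch of the nice -> timeslice coupling; not the
 * literal timeslice calculator */
#define MIN_TIMESLICE	( 10 * HZ / 1000)	/* ~10 msecs, nice +19 */
#define DEF_TIMESLICE	(100 * HZ / 1000)	/* ~100 msecs, nice 0 */

static unsigned int nice_to_timeslice(int nice)
{
	if (nice <= 0)
		return DEF_TIMESLICE;
	/* scale linearly down to MIN_TIMESLICE at nice +19; CPU share
	 * between competing tasks follows the same ratio, which is why
	 * timeslice length and CPU-sharing priority cannot yet be
	 * tuned separately */
	return DEF_TIMESLICE -
		(DEF_TIMESLICE - MIN_TIMESLICE) * nice / 19;
}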

I plan to add a new interface to solve this generic problem,
basically a syscall that enables the setting of the default timeslice
length - which gives one more degree of freedom to applications. Games
could set this to a super-low value to get really good 'keypress latency'
even in the presence of other CPU hogs. Compilation jobs would use the
default long timeslices, to maximize caching benefits. And we could
lengthen the timeslice of nice +19 tasks as well. The complexity here is
to decouple the timeslice length from CPU-sharing priorities. I've got
this on my list ...

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07 10:19             ` Helge Hafting
@ 2003-03-07 11:06               ` Ingo Molnar
  0 siblings, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07 11:06 UTC (permalink / raw)
  To: Helge Hafting; +Cc: linux-kernel


On Fri, 7 Mar 2003, Helge Hafting wrote:

> But we don't really need further kernel support for that, do we? I know
> a user currently cannot raise priority, but the user can run all his
> normal apps at slightly lower priority, except for xine. And the
> admin/distrubutor can set everything up for using the slightly lower
> priority by default.  Well, perhaps all this involves so much use of
> "nice" that kernel support is a good idea anyway...

true, but I think we could help by doing this automatically. This will
also make it easier to tune things as part of the scheduler proper.

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  5:57                       ` Ingo Molnar
  2003-03-07  5:54                         ` Shawn
@ 2003-03-07 14:28                         ` jlnance
  1 sibling, 0 replies; 100+ messages in thread
From: jlnance @ 2003-03-07 14:28 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linus Torvalds, Andrew Morton, Robert Love, linux-kernel

On Fri, Mar 07, 2003 at 06:57:58AM +0100, Ingo Molnar wrote:

> so the correct approach is both to make X more interactive (your patch),
> _and_ to make the compilation jobs less interactive (my patch). This is
> what explains why Andrew saw roughly similar interactivity with your patch
> and my patch applied separately, but the best result came when the combo
> patch was applied. Agreed?

Ingo,
    Forgive me if this is what your patch does; I have only looked at
Linus's.  But I don't think this is what you are doing.
    Linus's patch gives a boost to non-interactive processes that wake
up interactive ones.  It seems that we could solve the make problem by
applying the same idea in the reverse direction: let's unboost interactive
processes that are woken up by non-interactive ones.  So if make gets
woken up by gcc (which is hopefully non-interactive), make does not
get "credit" for having been asleep.
    I am trying to think if this would break anything.  The only thing
I can think of is that if the X server gets marked as non-interactive,
then it is going to cause its clients to get marked as non-interactive
as well when it sends them data.  But this may be a good thing.  If
X is marked as non-interactive it means that it is busy, so this may
have the effect of preventing the X clients from sending it a lot of
extra work while it's already busy.

Thanks,

Jim

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-06 17:55                     ` Linus Torvalds
  2003-03-06 18:04                       ` Ingo Molnar
@ 2003-03-08  0:05                       ` Daniel Phillips
  1 sibling, 0 replies; 100+ messages in thread
From: Daniel Phillips @ 2003-03-08  0:05 UTC (permalink / raw)
  To: Linus Torvalds, Alan Cox
  Cc: Ingo Molnar, Jeff Garzik, Andrew Morton, rml, Linux Kernel Mailing List

On Thu 06 Mar 03 18:55, Linus Torvalds wrote:
> The thing about wakeups is that it's an "illogical" place to give the
> bonus ("it's too late to give the server a bonus _now_, I wanted it when I
> went to sleep"), but it's the _trivial_ place to give it.
>
> It also (I'm convinced) is actually in the end exactly equivalent to
> giving the bonus at sleep time - in the steady state picture.

And anyway, on the first sleep there was no chance to measure the sleep_time, 
so there was no bonus to give.

Regards,

Daniel

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
  2003-03-07  7:36                             ` Ingo Molnar
@ 2003-03-08 18:28                               ` Aaron Lehmann
  0 siblings, 0 replies; 100+ messages in thread
From: Aaron Lehmann @ 2003-03-08 18:28 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Robert Love, linux-kernel

On Fri, Mar 07, 2003 at 08:36:12AM +0100, Ingo Molnar wrote:
> > I was able to reproduce them by selecting text in Mathematica (ugh, not
> > a very helpful example). The skips were shorter and about three times as
> > hard to trigger as on 2.5.63.
> 
> okay, just as a data point, could you try to renice the player
> process/thread to -2? Does it make the skipping harder to trigger?
> How about -5, or -10?

Renicing all xmms threads to -2 makes the problem impossible to
trigger in this way. I'm sorry for having such a stupid testcase;
Mathematica is proprietary software, and judging by the way it makes
XMMS skip, it's probably doing something stupid. XMMS also has a "Use
realtime priority" option, which makes it do:

	struct sched_param sparam;

	sparam.sched_priority = sched_get_priority_max(SCHED_RR);
	sched_setscheduler(0, SCHED_RR, &sparam);

Obviously, this requires root privileges; however, it also prevents
the skipping. I wonder if setting CAP_SYS_NICE on xmms and hacking it
to ignore geteuid() would be safe.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [patch] "HT scheduler", sched-2.5.63-B3
@ 2003-03-10 19:53 Perez-Gonzalez, Inaky
  0 siblings, 0 replies; 100+ messages in thread
From: Perez-Gonzalez, Inaky @ 2003-03-10 19:53 UTC (permalink / raw)
  To: 'Ingo Molnar'; +Cc: 'linux-kernel@vger.kernel.org'

> On Fri, 7 Mar 2003, Perez-Gonzalez, Inaky wrote:
> 
> > Question: does this apply also to tasks that are current on another CPU
> > other than the local?
> >
> > Or, can I change the prio of a task that is current on another CPU
> > directly and then set_tsk_need_resched()? Or I also have to unqueue it
> > first?
> 
> any task must be unqueued before changing its priority - the runqueues are
> indexed by the priority field. The special case in wakeup is that 'p' is
> not on any runqueues (hence the wakeup ...), so we have a cheap
> opportunity there to fix up priority. But it's not a problem for an
> already running process either - it needs to be requeued.

Thanks, I was suspecting that [when something makes full sense, it is
easy to suspect it :)].

How friendly would you be to a hook into setscheduler()? I have this 
same problem with real-time futexes, especially wrt effective priority 
in priority inheritance & priority protection. Basically, I need to know 
right away when a waiting task's priority has been changed so I can
properly reposition it in the waiting list.

As well, whenever I change the priority of an active/running task, it needs
to activate the hooks (for my proof-of-concept I did that manually; however,
that is limited and non-extensible), and I am looking into unfolding
setscheduler() into setscheduler() and do_setscheduler(), the latter taking
the task struct, policy and priority to set, so that it can be called
from inside the kernel. setscheduler() would then only do the user-space
handling.

Would that be ok with you?
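
For concreteness, the proposed split might look something like this (a
sketch; signatures and locking details are hypothetical):

/* sketch of the proposed unfolding; signatures are hypothetical and
 * locking (tasklist_lock, runqueue locks) is omitted */
static int do_setscheduler(task_t *p, int policy,
			   const struct sched_param *param)
{
	/* validate policy/priority, dequeue p if queued, update the
	 * priority fields, requeue and resched as needed ... */
	return 0;
}

asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
				       struct sched_param *param)
{
	struct sched_param lp;
	task_t *p;

	if (copy_from_user(&lp, param, sizeof(lp)))
		return -EFAULT;
	p = find_process_by_pid(pid);	/* lookup as in kernel/sched.c */
	if (!p)
		return -ESRCH;
	return do_setscheduler(p, policy, &lp);
}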

Iñaky Pérez-González -- Not speaking for Intel -- all opinions are my own
(and my fault)



^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
       [not found] ` <Pine.LNX.4.44.0303071003060.6318-100000@localhost.localdomain>
@ 2003-03-07  9:38   ` Mike Galbraith
  0 siblings, 0 replies; 100+ messages in thread
From: Mike Galbraith @ 2003-03-07  9:38 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andrew Morton, Linus Torvalds, Robert Love, linux-kernel

At 10:03 AM 3/7/2003 +0100, Ingo Molnar wrote:

>here's sched-2.5.64-B0:

KABOOM!  Major explosions on boot.  First time I booted, I got an instant 
reboot.  Second time, I got a couple of double faults and then an explosion 
during fsck (uhoh;).  PID is 1684368482 :-/

Time to reboot into .virgin for (hopefully) a couple of fretful 
moments.  (rebooting into .virgin after the first explosion would have been 
a bit smarter;)

         -Mike  


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [patch] "HT scheduler", sched-2.5.63-B3
       [not found] <5.2.0.9.2.20030307093435.01a8fe88@pop.gmx.net>
@ 2003-03-07  9:30 ` Ingo Molnar
       [not found] ` <Pine.LNX.4.44.0303071003060.6318-100000@localhost.localdomain>
  1 sibling, 0 replies; 100+ messages in thread
From: Ingo Molnar @ 2003-03-07  9:30 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Andrew Morton, Linus Torvalds, rml, linux-kernel


ok, forget the -B0 patch, it's broken - we cannot change the current
task's priority without unqueueing it first. -B1 will have the right code.
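
The pattern the fix needs is roughly the following (a sketch using the
O(1) scheduler's helper names; the priority arrays are indexed by ->prio,
so the dequeue has to come first):

/* sketch: changing a queued task's priority safely.  dequeue_task()/
 * enqueue_task() as in kernel/sched.c; locking omitted. */
static void change_prio(task_t *p, int new_prio)
{
	prio_array_t *array = p->array;

	if (array)
		dequeue_task(p, array);	/* arrays are indexed by p->prio */
	p->prio = new_prio;
	if (array) {
		enqueue_task(p, array);
		/* may need to resched the CPU that p is running on */
	}
}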

	Ingo


^ permalink raw reply	[flat|nested] 100+ messages in thread

end of thread, other threads:[~2003-03-11  1:16 UTC | newest]

Thread overview: 100+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-02-28  9:50 [patch] "HT scheduler", sched-2.5.63-B3 Ingo Molnar
2003-02-28 21:12 ` Andrew Morton
2003-02-28 21:23   ` Robert Love
2003-03-01  4:25     ` Andrew Morton
2003-03-06  3:20       ` Linus Torvalds
2003-03-06  7:45         ` Andrew Morton
2003-03-06  7:47           ` Ingo Molnar
2003-03-06 15:07             ` Linus Torvalds
2003-03-06 16:54               ` Ingo Molnar
2003-03-06 17:03                 ` Linus Torvalds
2003-03-06 17:24                   ` Ingo Molnar
2003-03-06 17:42                     ` Linus Torvalds
2003-03-06 17:54                       ` Ingo Molnar
2003-03-06 18:02                         ` Linus Torvalds
2003-03-06 18:11                           ` Ingo Molnar
2003-03-06 17:55                       ` John Levon
2003-03-06 18:07                         ` Linus Torvalds
2003-03-06 18:11                           ` John Levon
2003-03-06 18:16                             ` Linus Torvalds
2003-03-06 18:15                           ` Ingo Molnar
2003-03-06 18:20                             ` Linus Torvalds
2003-03-06 18:10                               ` David S. Miller
2003-03-06 18:18                           ` Ingo Molnar
2003-03-07  5:57                       ` Ingo Molnar
2003-03-07  5:54                         ` Shawn
2003-03-07  6:04                           ` Ingo Molnar
2003-03-07 14:28                         ` jlnance
2003-03-07  6:45                       ` Aaron Lehmann
2003-03-07  6:50                         ` Ingo Molnar
2003-03-07  7:00                           ` Aaron Lehmann
2003-03-07  7:36                             ` Ingo Molnar
2003-03-08 18:28                               ` Aaron Lehmann
2003-03-07  7:00                         ` Andrew Morton
2003-03-07  3:19                   ` Horst von Brand
2003-03-06 15:37           ` Linus Torvalds
2003-03-06 16:01             ` Jeff Garzik
2003-03-06 16:51               ` Linus Torvalds
2003-03-06 17:11                 ` Ingo Molnar
2003-03-06 17:52                   ` jvlists
2003-03-06 18:27                     ` Ingo Molnar
2003-03-07  8:15                       ` Xavier Bestel
2003-03-06 23:18                     ` Robert Love
2003-03-06 18:49                   ` Alan Cox
2003-03-06 17:55                     ` Linus Torvalds
2003-03-06 18:04                       ` Ingo Molnar
2003-03-06 18:13                         ` Martin J. Bligh
2003-03-06 18:26                           ` Linus Torvalds
2003-03-08  0:05                       ` Daniel Phillips
2003-03-06 17:58                     ` Ingo Molnar
2003-03-06 17:17               ` Ingo Molnar
2003-03-06 18:06                 ` Dimitrie O. Paun
2003-03-06 22:14                   ` Martin J. Bligh
2003-03-06 22:31                     ` Dimitrie O. Paun
2003-03-06 23:05                       ` Robert Love
2003-03-06 23:33                         ` Martin J. Bligh
2003-03-06 18:25               ` Alan Cox
2003-03-06 17:35                 ` Linus Torvalds
2003-03-06 22:30                   ` Eric Northup
2003-03-06 23:40                     ` Martin J. Bligh
2003-03-07  0:02                       ` Linus Torvalds
2003-03-07  0:09                         ` Dimitrie O. Paun
2003-03-06 22:00               ` Bill Davidsen
2003-03-06 17:13         ` Ingo Molnar
2003-03-06 17:30           ` Linus Torvalds
2003-03-06 17:33             ` Ingo Molnar
2003-03-06 17:50             ` Ingo Molnar
2003-03-06 18:00               ` Linus Torvalds
2003-03-06 20:42         ` Andrew Morton
2003-03-06 22:18           ` Robert Love
2003-03-07  5:52           ` Ingo Molnar
2003-03-07  5:51             ` Shawn
2003-03-07  6:01               ` Ingo Molnar
2003-03-07  6:07               ` Andrew Morton
2003-03-07  6:15           ` Ingo Molnar
2003-03-07 10:19             ` Helge Hafting
2003-03-07 11:06               ` Ingo Molnar
2003-03-07  7:45           ` Mike Galbraith
2003-03-07  7:45             ` Ingo Molnar
     [not found]             ` <Pine.LNX.4.44.0303070842420.4572-100000@localhost.localdomain>
2003-03-07  8:09               ` Mike Galbraith
2003-03-07  8:10                 ` Andrew Morton
2003-03-07  8:26                   ` Mike Galbraith
2003-03-07  8:15                 ` Ingo Molnar
     [not found]                 ` <Pine.LNX.4.44.0303070913370.5173-100000@localhost.localdomain>
2003-03-07  8:34                   ` Mike Galbraith
2003-03-07  8:31                     ` Ingo Molnar
2003-03-06 22:03         ` Martin Waitz
2003-03-06 22:07           ` Robert Love
2003-03-06 22:35             ` Martin Waitz
2003-03-06 22:56               ` Robert Love
2003-03-06 23:27                 ` Martin Waitz
2003-03-06 23:36                   ` Robert Love
2003-03-07 10:39                     ` Ingo Molnar
2003-03-07 10:27             ` Ingo Molnar
2003-03-06  6:58       ` [patch] "interactivity changes", sched-2.5.64-A4 Ingo Molnar
2003-03-06 10:00         ` Andrew Morton
2003-03-06 10:07           ` [patch] "interactivity changes", sched-2.5.64-A5 Ingo Molnar
2003-03-06 10:16 ` [patch] "HT scheduler", sched-2.5.63-B3 Chris Wedgwood
2003-03-06 10:18   ` Ingo Molnar
     [not found] <5.2.0.9.2.20030307093435.01a8fe88@pop.gmx.net>
2003-03-07  9:30 ` Ingo Molnar
     [not found] ` <Pine.LNX.4.44.0303071003060.6318-100000@localhost.localdomain>
2003-03-07  9:38   ` Mike Galbraith
2003-03-10 19:53 Perez-Gonzalez, Inaky
