All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sched: high-res preemption tick
@ 2007-10-12 20:51 Peter Zijlstra
  2007-10-13  7:18 ` Mike Galbraith
  2007-10-13 22:51 ` [RFC][PATCH] sched: SCHED_FIFO watchdog timer Peter Zijlstra
  0 siblings, 2 replies; 13+ messages in thread
From: Peter Zijlstra @ 2007-10-12 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ingo Molnar, Thomas Gleixner, Mike Galbraith

Subject: sched: high-res preemption tick

Use HR-timers (when available) to deliver an accurate preemption tick.

The regular scheduler tick that runs at 1/HZ can be too coarse when nice
levels are used. The fairness system will still keep the cpu utilisation 'fair'
by then delaying the task that got an excessive amount of CPU time but tries to
minimize this by delivering preemption points spot-on.

The average frequency of this extra interrupt is sched_latency / nr_latency.
This need not be higher than 1/HZ; it's just that the distribution within the
sched_latency period is important.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---

For now HRTICK and PREEMPT_RESTRICT are not compatible.

 arch/x86/kernel/entry_64.S       |    6 -
 arch/x86/kernel/signal_32.c      |    3 
 arch/x86/kernel/signal_64.c      |    3 
 include/asm-x86/thread_info_32.h |    2 
 include/asm-x86/thread_info_64.h |    5 
 include/linux/hrtimer.h          |    9 +
 include/linux/sched.h            |    3 
 kernel/Kconfig.hz                |    4 
 kernel/sched.c                   |  205 ++++++++++++++++++++++++++++++++++++---
 kernel/sched_fair.c              |   67 +++++++++++-
 kernel/sched_idletask.c          |    2 
 kernel/sched_rt.c                |    2 
 12 files changed, 290 insertions(+), 21 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -62,6 +62,7 @@
 #include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
+#include <linux/hrtimer.h>
 
 #include <asm/tlb.h>
 
@@ -323,6 +324,12 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+#ifdef CONFIG_SCHED_HRTICK
+	unsigned long hrtick_flags;
+	ktime_t hrtick_expire;
+	struct hrtimer hrtick_timer;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
@@ -446,6 +453,8 @@ enum {
 	SCHED_FEAT_APPROX_AVG           = 8,
 	SCHED_FEAT_WAKEUP_PREEMPT	= 16,
 	SCHED_FEAT_PREEMPT_RESTRICT	= 32,
+	SCHED_FEAT_HRTICK		= 64,
+	SCHED_FEAT_DOUBLE_TICK		= 128,
 };
 
 const_debug unsigned int sysctl_sched_features =
@@ -454,7 +463,9 @@ const_debug unsigned int sysctl_sched_fe
 		SCHED_FEAT_TREE_AVG		*0 |
 		SCHED_FEAT_APPROX_AVG		*0 |
 		SCHED_FEAT_WAKEUP_PREEMPT	*1 |
-		SCHED_FEAT_PREEMPT_RESTRICT	*1;
+		SCHED_FEAT_PREEMPT_RESTRICT	*0 |
+		SCHED_FEAT_HRTICK		*1 |
+		SCHED_FEAT_DOUBLE_TICK		*0;
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
@@ -654,6 +665,168 @@ void sched_clock_idle_wakeup_event(u64 d
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+static void __resched_task(struct task_struct *p, int tif_bit);
+
+static inline void resched_task(struct task_struct *p)
+{
+	__resched_task(p, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ *
+ * It's all a bit involved since we cannot program an hrt while holding the
+ * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
+ * reschedule event.
+ *
+ * When we get rescheduled we reprogram the hrtick_timer outside of the
+ * rq->lock.
+ */
+static inline void resched_hrt(struct task_struct *p)
+{
+	__resched_task(p, TIF_HRTICK_RESCHED);
+}
+
+static inline void resched_rq(struct rq *rq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	resched_task(rq->curr);
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+enum {
+	HRTICK_SET,		/* re-program hrtick_timer */
+	HRTICK_RESET,		/* not a new slice */
+};
+
+/*
+ * Use hrtick when:
+ *  - enabled by features
+ *  - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+	if (!sched_feat(HRTICK))
+		return 0;
+	return hrtimer_is_hres_active(&rq->hrtick_timer);
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay, int reset)
+{
+	assert_spin_locked(&rq->lock);
+
+	/*
+	 * preempt at: now + delay
+	 */
+	rq->hrtick_expire =
+		ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
+	/*
+	 * indicate we need to program the timer
+	 */
+	__set_bit(HRTICK_SET, &rq->hrtick_flags);
+	if (reset)
+		__set_bit(HRTICK_RESET, &rq->hrtick_flags);
+
+	/*
+	 * New slices are called from the schedule path and don't need a
+	 * forced reschedule.
+	 */
+	if (reset)
+		resched_hrt(rq->curr);
+}
+
+static void hrtick_clear(struct rq *rq)
+{
+	if (hrtimer_active(&rq->hrtick_timer))
+		hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * Update the timer from the possible pending state.
+ */
+static void hrtick_set(struct rq *rq)
+{
+	ktime_t time;
+	int set, reset;
+	unsigned long flags;
+
+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+	spin_lock_irqsave(&rq->lock, flags);
+	set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
+	reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
+	time = rq->hrtick_expire;
+	clear_thread_flag(TIF_HRTICK_RESCHED);
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	if (set) {
+		hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
+		if (reset && !hrtimer_active(&rq->hrtick_timer))
+			resched_rq(rq);
+	} else hrtick_clear(rq);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+
+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+	spin_unlock(&rq->lock);
+
+	return HRTIMER_NORESTART;
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+	rq->hrtick_flags = 0;
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrtick_timer.function = hrtick;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+}
+#else
+static inline void hrtick_clear(struct rq *rq)
+{
+}
+
+static inline void hrtick_set(struct rq *rq)
+{
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+}
+#endif
+
+void hrtick_resched(void)
+{
+	struct rq *rq;
+	unsigned long flags;
+
+	if (!test_thread_flag(TIF_HRTICK_RESCHED))
+		return;
+
+	local_irq_save(flags);
+	rq = cpu_rq(smp_processor_id());
+	hrtick_set(rq);
+	local_irq_restore(flags);
+}
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -667,16 +840,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeu
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 
-static void resched_task(struct task_struct *p)
+static void __resched_task(struct task_struct *p, int tif_bit)
 {
 	int cpu;
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+	if (unlikely(test_tsk_thread_flag(p, tif_bit)))
 		return;
 
-	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+	set_tsk_thread_flag(p, tif_bit);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -699,10 +872,10 @@ static void resched_cpu(int cpu)
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 #else
-static inline void resched_task(struct task_struct *p)
+static inline void __resched_task(struct task_struct *p, int tif_bit)
 {
 	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_need_resched(p);
+	set_tsk_thread_flag(p, tif_bit);
 }
 #endif
 
@@ -3366,7 +3539,7 @@ void scheduler_tick(void)
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	if (curr != rq->idle) /* FIXME: needed? */
-		curr->sched_class->task_tick(rq, curr);
+		curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -3505,6 +3678,8 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
+	hrtick_clear(rq);
+
 	/*
 	 * Do the rq-clock update outside the rq lock:
 	 */
@@ -3537,14 +3712,20 @@ need_resched_nonpreemptible:
 		++*switch_count;
 
 		context_switch(rq, prev, next); /* unlocks the rq */
+		/*
+		 * the context switch might have flipped the stack from under
+		 * us, hence refresh the local variables.
+		 */
+		cpu = smp_processor_id();
+		rq = cpu_rq(cpu);
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	if (unlikely(reacquire_kernel_lock(current) < 0)) {
-		cpu = smp_processor_id();
-		rq = cpu_rq(cpu);
+	hrtick_set(rq);
+
+	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-	}
+
 	preempt_enable_no_resched();
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 		goto need_resched;
@@ -6518,6 +6699,8 @@ void __init sched_init(void)
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
 #endif
+		init_rq_hrtick(rq);
+
 		atomic_set(&rq->nr_iowait, 0);
 
 		array = &rq->rt.active;
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -611,13 +611,29 @@ static void put_prev_entity(struct cfs_r
 	cfs_rq->curr = NULL;
 }
 
-static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 {
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
 
+#ifdef CONFIG_SCHED_HRTICK
+	/*
+	 * queued ticks are scheduled to match the slice, so don't bother
+	 * validating it and just reschedule.
+	 */
+	if (queued)
+		return resched_task(rq_of(cfs_rq)->curr);
+	/*
+	 * don't let the period tick interfere with the hrtick preemption
+	 */
+	if (!sched_feat(DOUBLE_TICK) &&
+			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+		return;
+#endif
+
 	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
 		check_preempt_tick(cfs_rq, curr);
 }
@@ -721,6 +737,43 @@ static inline struct sched_entity *paren
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_SCHED_HRTICK
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+	int requeue = rq->curr == p;
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	WARN_ON(task_rq(p) != rq);
+
+	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
+		u64 slice = sched_slice(cfs_rq, se);
+		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+		s64 delta = slice - ran;
+
+		if (delta < 0) {
+			if (rq->curr == p)
+				resched_task(p);
+			return;
+		}
+
+		/*
+		 * Don't schedule slices shorter than 10000ns, that just
+		 * doesn't make sense. Rely on vruntime for fairness.
+		 */
+		if (!requeue)
+			delta = max(10000LL, delta);
+
+		hrtick_start(rq, delta, requeue);
+	}
+}
+#else
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -738,6 +791,7 @@ static void enqueue_task_fair(struct rq 
 		enqueue_entity(cfs_rq, se, wakeup);
 		wakeup = 1;
 	}
+	hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -758,6 +812,7 @@ static void dequeue_task_fair(struct rq 
 			break;
 		sleep = 1;
 	}
+	hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -848,6 +903,7 @@ static void check_preempt_wakeup(struct 
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
 {
+	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
@@ -859,7 +915,10 @@ static struct task_struct *pick_next_tas
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
-	return task_of(se);
+	p = task_of(se);
+	hrtick_start_fair(rq, p);
+
+	return p;
 }
 
 /*
@@ -995,14 +1054,14 @@ load_balance_fair(struct rq *this_rq, in
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
-static void task_tick_fair(struct rq *rq, struct task_struct *curr)
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		entity_tick(cfs_rq, se);
+		entity_tick(cfs_rq, se, queued);
 	}
 }
 
Index: linux-2.6/kernel/Kconfig.hz
===================================================================
--- linux-2.6.orig/kernel/Kconfig.hz
+++ linux-2.6/kernel/Kconfig.hz
@@ -54,3 +54,7 @@ config HZ
 	default 300 if HZ_300
 	default 1000 if HZ_1000
 
+config SCHED_HRTICK
+	bool "High Resolution Timer preemption tick"
+	depends on HIGH_RES_TIMERS && X86
+	default y
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -256,6 +256,7 @@ extern void cpu_init (void);
 extern void trap_init(void);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
+extern void hrtick_resched(void);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern void softlockup_tick(void);
@@ -890,7 +891,7 @@ struct sched_class {
 			int *all_pinned, int *this_best_prio);
 
 	void (*set_curr_task) (struct rq *rq);
-	void (*task_tick) (struct rq *rq, struct task_struct *p);
+	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
 };
 
Index: linux-2.6/include/linux/hrtimer.h
===================================================================
--- linux-2.6.orig/include/linux/hrtimer.h
+++ linux-2.6/include/linux/hrtimer.h
@@ -217,6 +217,11 @@ static inline ktime_t hrtimer_cb_get_tim
 	return timer->base->get_time();
 }
 
+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+	return timer->base->cpu_base->hres_active;
+}
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
@@ -248,6 +253,10 @@ static inline ktime_t hrtimer_cb_get_tim
 	return timer->base->softirq_time;
 }
 
+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+	return 0;
+}
 #endif
 
 extern ktime_t ktime_get(void);
Index: linux-2.6/kernel/sched_idletask.c
===================================================================
--- linux-2.6.orig/kernel/sched_idletask.c
+++ linux-2.6/kernel/sched_idletask.c
@@ -46,7 +46,7 @@ load_balance_idle(struct rq *this_rq, in
 	return 0;
 }
 
-static void task_tick_idle(struct rq *rq, struct task_struct *curr)
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
 {
 }
 
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -194,7 +194,7 @@ load_balance_rt(struct rq *this_rq, int 
 	return load_moved;
 }
 
-static void task_tick_rt(struct rq *rq, struct task_struct *p)
+static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
 	/*
 	 * RR tasks need a special form of timeslice management.
Index: linux-2.6/arch/x86/kernel/signal_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/signal_32.c
+++ linux-2.6/arch/x86/kernel/signal_32.c
@@ -662,6 +662,9 @@ void do_notify_resume(struct pt_regs *re
 	/* deal with pending signal delivery */
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
 		do_signal(regs);
+
+	if (thread_info_flags & _TIF_HRTICK_RESCHED)
+		hrtick_resched();
 	
 	clear_thread_flag(TIF_IRET);
 }
Index: linux-2.6/include/asm-x86/thread_info_32.h
===================================================================
--- linux-2.6.orig/include/asm-x86/thread_info_32.h
+++ linux-2.6/include/asm-x86/thread_info_32.h
@@ -132,6 +132,7 @@ static inline struct thread_info *curren
 #define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
 #define TIF_SECCOMP		7	/* secure computing */
 #define TIF_RESTORE_SIGMASK	8	/* restore signal mask in do_signal() */
+#define TIF_HRTICK_RESCHED	9	/* reprogram hrtick timer */
 #define TIF_MEMDIE		16
 #define TIF_DEBUG		17	/* uses debug registers */
 #define TIF_IO_BITMAP		18	/* uses I/O bitmap */
@@ -147,6 +148,7 @@ static inline struct thread_info *curren
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
+#define _TIF_HRTICK_RESCHED	(1<<TIF_HRTICK_RESCHED)
 #define _TIF_DEBUG		(1<<TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1<<TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1<<TIF_FREEZE)
Index: linux-2.6/arch/x86/kernel/signal_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/signal_64.c
+++ linux-2.6/arch/x86/kernel/signal_64.c
@@ -482,6 +482,9 @@ do_notify_resume(struct pt_regs *regs, v
 	/* deal with pending signal delivery */
 	if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
 		do_signal(regs);
+
+	if (thread_info_flags & _TIF_HRTICK_RESCHED)
+		hrtick_resched();
 }
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
Index: linux-2.6/include/asm-x86/thread_info_64.h
===================================================================
--- linux-2.6.orig/include/asm-x86/thread_info_64.h
+++ linux-2.6/include/asm-x86/thread_info_64.h
@@ -115,6 +115,7 @@ static inline struct thread_info *stack_
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_HRTICK_RESCHED	11	/* reprogram hrtick timer */
 /* 16 free */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
@@ -133,6 +134,7 @@ static inline struct thread_info *stack_
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_MCE_NOTIFY		(1<<TIF_MCE_NOTIFY)
+#define _TIF_HRTICK_RESCHED	(1<<TIF_HRTICK_RESCHED)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
@@ -146,6 +148,9 @@ static inline struct thread_info *stack_
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
 
+#define _TIF_DO_NOTIFY_MASK \
+	(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
+
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP)
 
Index: linux-2.6/arch/x86/kernel/entry_64.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/entry_64.S
+++ linux-2.6/arch/x86/kernel/entry_64.S
@@ -283,7 +283,7 @@ sysret_careful:
 sysret_signal:
 	TRACE_IRQS_ON
 	sti
-	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+	testl $_TIF_DO_NOTIFY_MASK,%edx
 	jz    1f
 
 	/* Really a signal */
@@ -377,7 +377,7 @@ int_very_careful:
 	jmp int_restore_rest
 	
 int_signal:
-	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+	testl $_TIF_DO_NOTIFY_MASK,%edx
 	jz 1f
 	movq %rsp,%rdi		# &ptregs -> arg1
 	xorl %esi,%esi		# oldset -> arg2
@@ -603,7 +603,7 @@ retint_careful:
 	jmp retint_check
 	
 retint_signal:
-	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+	testl $_TIF_DO_NOTIFY_MASK,%edx
 	jz    retint_swapgs
 	TRACE_IRQS_ON
 	sti



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: high-res preemption tick
  2007-10-12 20:51 [PATCH] sched: high-res preemption tick Peter Zijlstra
@ 2007-10-13  7:18 ` Mike Galbraith
  2007-10-13  8:55   ` Peter Zijlstra
  2007-10-13 22:51 ` [RFC][PATCH] sched: SCHED_FIFO watchdog timer Peter Zijlstra
  1 sibling, 1 reply; 13+ messages in thread
From: Mike Galbraith @ 2007-10-13  7:18 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, Ingo Molnar, Thomas Gleixner

On Fri, 2007-10-12 at 22:51 +0200, Peter Zijlstra wrote:
> Subject: sched: high-res preemption tick
> 
> Use HR-timers (when available) to deliver an accurate preemption tick.

This patch further reduced iperf context switching, and boosted
throughput.

iperf -c localhost -P 10 -t 300

Previously reported numbers

2.6.23-smp
[SUM]  0.0-300.0 sec    153 GBytes  4.39 Gbits/sec
[SUM]  0.0-300.1 sec    148 GBytes  4.23 Gbits/sec
[SUM]  0.0-300.0 sec    152 GBytes  4.36 Gbits/sec

2.6.23-smp-d (sched-devel)
[SUM]  0.0-300.0 sec    173 GBytes  4.96 Gbits/sec
[SUM]  0.0-300.1 sec    173 GBytes  4.96 Gbits/sec
[SUM]  0.0-300.0 sec    172 GBytes  4.93 Gbits/sec

Numbers from fresh pull today

2.6.23-smp-d-hrt
(re-enable PREEMPT_RESTRICT)
[SUM]  0.0-300.1 sec    181 GBytes  5.19 Gbits/sec
[SUM]  0.0-300.0 sec    182 GBytes  5.22 Gbits/sec
[SUM]  0.0-300.1 sec    182 GBytes  5.22 Gbits/sec

2.6.23-smp-d
[SUM]  0.0-300.1 sec    174 GBytes  4.97 Gbits/sec
[SUM]  0.0-300.1 sec    173 GBytes  4.95 Gbits/sec
[SUM]  0.0-300.1 sec    173 GBytes  4.96 Gbits/sec



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: high-res preemption tick
  2007-10-13  7:18 ` Mike Galbraith
@ 2007-10-13  8:55   ` Peter Zijlstra
  2007-10-13  9:17     ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2007-10-13  8:55 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: linux-kernel, Ingo Molnar, Thomas Gleixner

[-- Attachment #1: Type: text/plain, Size: 1328 bytes --]

On Sat, 2007-10-13 at 09:18 +0200, Mike Galbraith wrote:
> On Fri, 2007-10-12 at 22:51 +0200, Peter Zijlstra wrote:
> > Subject: sched: high-res preemption tick
> > 
> > Use HR-timers (when available) to deliver an accurate preemption tick.
> 
> This patch further reduced iperf context switching, and boosted
> throughput.
> 
> iperf -c localhost -P 10 -t 300
> 
> Previously reported numbers
> 
> 2.6.23-smp
> [SUM]  0.0-300.0 sec    153 GBytes  4.39 Gbits/sec
> [SUM]  0.0-300.1 sec    148 GBytes  4.23 Gbits/sec
> [SUM]  0.0-300.0 sec    152 GBytes  4.36 Gbits/sec
> 
> 2.6.23-smp-d (sched-devel)
> [SUM]  0.0-300.0 sec    173 GBytes  4.96 Gbits/sec
> [SUM]  0.0-300.1 sec    173 GBytes  4.96 Gbits/sec
> [SUM]  0.0-300.0 sec    172 GBytes  4.93 Gbits/sec
> 
> Numbers from fresh pull today
> 
> 2.6.23-smp-d-hrt
> (re-enable PREEMPT_RESTRICT)

Ah, but HRTICK is not compatible with PREEMPT_RESTRICT, it will be
similar to !WAKEUP_PREEMPT.

> [SUM]  0.0-300.1 sec    181 GBytes  5.19 Gbits/sec
> [SUM]  0.0-300.0 sec    182 GBytes  5.22 Gbits/sec
> [SUM]  0.0-300.1 sec    182 GBytes  5.22 Gbits/sec
> 
> 2.6.23-smp-d
> [SUM]  0.0-300.1 sec    174 GBytes  4.97 Gbits/sec
> [SUM]  0.0-300.1 sec    173 GBytes  4.95 Gbits/sec
> [SUM]  0.0-300.1 sec    173 GBytes  4.96 Gbits/sec
> 
> 

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: high-res preemption tick
  2007-10-13  8:55   ` Peter Zijlstra
@ 2007-10-13  9:17     ` Peter Zijlstra
  2007-10-13 10:11       ` Mike Galbraith
  2007-10-13 23:13       ` Peter Zijlstra
  0 siblings, 2 replies; 13+ messages in thread
From: Peter Zijlstra @ 2007-10-13  9:17 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: linux-kernel, Ingo Molnar, Thomas Gleixner


On Sat, 2007-10-13 at 10:55 +0200, Peter Zijlstra wrote:
> On Sat, 2007-10-13 at 09:18 +0200, Mike Galbraith wrote:
> > On Fri, 2007-10-12 at 22:51 +0200, Peter Zijlstra wrote:
> > > Subject: sched: high-res preemption tick
> > > 
> > > Use HR-timers (when available) to deliver an accurate preemption tick.
> > 
> > This patch further reduced iperf context switching, and boosted
> > throughput.
> > 
> > iperf -c localhost -P 10 -t 300
> > 
> > Previously reported numbers
> > 
> > 2.6.23-smp
> > [SUM]  0.0-300.0 sec    153 GBytes  4.39 Gbits/sec
> > [SUM]  0.0-300.1 sec    148 GBytes  4.23 Gbits/sec
> > [SUM]  0.0-300.0 sec    152 GBytes  4.36 Gbits/sec
> > 
> > 2.6.23-smp-d (sched-devel)
> > [SUM]  0.0-300.0 sec    173 GBytes  4.96 Gbits/sec
> > [SUM]  0.0-300.1 sec    173 GBytes  4.96 Gbits/sec
> > [SUM]  0.0-300.0 sec    172 GBytes  4.93 Gbits/sec
> > 
> > Numbers from fresh pull today
> > 
> > 2.6.23-smp-d-hrt
> > (re-enable PREEMPT_RESTRICT)
> 
> Ah, but HRTICK is not compatible with PREEMPT_RESTRICT, it will be
> similar to !WAKEUP_PREEMPT.

(I do plan to fix that eventually, just need to do it)

Also, this seems to suggest iperf would like SCHED_BATCH.

> > [SUM]  0.0-300.1 sec    181 GBytes  5.19 Gbits/sec
> > [SUM]  0.0-300.0 sec    182 GBytes  5.22 Gbits/sec
> > [SUM]  0.0-300.1 sec    182 GBytes  5.22 Gbits/sec
> > 
> > 2.6.23-smp-d
> > [SUM]  0.0-300.1 sec    174 GBytes  4.97 Gbits/sec
> > [SUM]  0.0-300.1 sec    173 GBytes  4.95 Gbits/sec
> > [SUM]  0.0-300.1 sec    173 GBytes  4.96 Gbits/sec
> > 
> > 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: high-res preemption tick
  2007-10-13  9:17     ` Peter Zijlstra
@ 2007-10-13 10:11       ` Mike Galbraith
  2007-10-13 23:13       ` Peter Zijlstra
  1 sibling, 0 replies; 13+ messages in thread
From: Mike Galbraith @ 2007-10-13 10:11 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, Ingo Molnar, Thomas Gleixner

On Sat, 2007-10-13 at 11:17 +0200, Peter Zijlstra wrote:

> Also, this seems to suggest iperf would like SCHED_BATCH.

Yes.  Throughput falls as preemption climbs.

	-Mike


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC][PATCH] sched: SCHED_FIFO watchdog timer
  2007-10-12 20:51 [PATCH] sched: high-res preemption tick Peter Zijlstra
  2007-10-13  7:18 ` Mike Galbraith
@ 2007-10-13 22:51 ` Peter Zijlstra
  2007-10-15 13:26   ` Dmitry Adamushko
                     ` (2 more replies)
  1 sibling, 3 replies; 13+ messages in thread
From: Peter Zijlstra @ 2007-10-13 22:51 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Thomas Gleixner, Mike Galbraith, Lennart Poettering

The below patch is an idea proposed by tglx and depends on sched-devel +
the hrtick patch previously posted.

The current watchdog action is to demote the task to SCHED_NORMAL,
however it might be wanted to deliver a signal instead (or have more per
task configuration state). Which is why I added Lennart to the CC list
as I gathered he would like something like this for PulseAudio.

---
Subject: sched: SCHED_FIFO watchdog timer

Set a per task (rlimit based) limit on SCHED_FIFO runtime. When the
limit is exceeded the task is demoted back to SCHED_NORMAL.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/asm-generic/resource.h |    5 +++--
 kernel/sched.c                 |    5 +++++
 kernel/sched_rt.c              |   36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 2 deletions(-)

Index: linux-2.6/include/asm-generic/resource.h
===================================================================
--- linux-2.6.orig/include/asm-generic/resource.h
+++ linux-2.6/include/asm-generic/resource.h
@@ -44,8 +44,8 @@
 #define RLIMIT_NICE		13	/* max nice prio allowed to raise to
 					   0-39 for nice level 19 .. -20 */
 #define RLIMIT_RTPRIO		14	/* maximum realtime priority */
-
-#define RLIM_NLIMITS		15
+#define RLIMIT_FIFOTIME		15	/* timeout for fifo slices in us */
+#define RLIM_NLIMITS		16
 
 /*
  * SuS says limits have to be unsigned.
@@ -86,6 +86,7 @@
 	[RLIMIT_MSGQUEUE]	= {   MQ_BYTES_MAX,   MQ_BYTES_MAX },	\
 	[RLIMIT_NICE]		= { 0, 0 },				\
 	[RLIMIT_RTPRIO]		= { 0, 0 },				\
+	[RLIMIT_FIFOTIME]	= {  RLIM_INFINITY,  RLIM_INFINITY },	\
 }
 
 #endif	/* __KERNEL__ */
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -89,6 +89,14 @@ static struct task_struct *pick_next_tas
 
 	next->se.exec_start = rq->clock;
 
+	if (next->policy == SCHED_FIFO) {
+		unsigned long fifotime;
+
+		fifotime = next->signal->rlim[RLIMIT_FIFOTIME].rlim_cur;
+		if (fifotime != RLIM_INFINITY)
+			hrtick_start(rq, (u64)fifotime * 1000, 0);
+	}
+
 	return next;
 }
 
@@ -194,8 +202,36 @@ load_balance_rt(struct rq *this_rq, int 
 	return load_moved;
 }
 
+#ifdef CONFIG_SCHED_HRTICK
+static int fifo_watchdog(struct rq *rq, struct task_struct *p, int queued)
+{
+	if (likely(!queued || p->policy != SCHED_FIFO))
+		return 0;
+
+	/*
+	 * task has been naughty, turn into SCHED_NORMAL
+	 */
+	printk(KERN_INFO "SCHED_FIFO task %s/%d exceeded his runtime quota,"
+			" demoting to regular task\n", p->comm, task_pid_nr(p));
+	deactivate_task(rq, p, 0);
+	__setscheduler(rq, p, SCHED_NORMAL, 0);
+	activate_task(rq, p, 0);
+	resched_task(p);
+
+	return 1;
+}
+#else
+static inline int fifo_watchdog(struct rq *rq, struct task_struct *p, int queued)
+{
+	return 0;
+}
+#endif
+
 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
+	if (fifo_watchdog(rq, p, queued))
+		return;
+
 	/*
 	 * RR tasks need a special form of timeslice management.
 	 * FIFO tasks have no timeslices.
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -132,6 +132,11 @@ static inline void sg_inc_cpu_power(stru
 }
 #endif
 
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+static void
+__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio);
+
 static inline int rt_policy(int policy)
 {
 	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: high-res preemption tick
  2007-10-13  9:17     ` Peter Zijlstra
  2007-10-13 10:11       ` Mike Galbraith
@ 2007-10-13 23:13       ` Peter Zijlstra
  2007-10-13 23:16         ` Peter Zijlstra
  2007-10-14  6:34         ` Mike Galbraith
  1 sibling, 2 replies; 13+ messages in thread
From: Peter Zijlstra @ 2007-10-13 23:13 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: linux-kernel, Ingo Molnar, Thomas Gleixner


On Sat, 2007-10-13 at 11:17 +0200, Peter Zijlstra wrote:

> > Ah, but HRTICK is not compatible with PREEMPT_RESTRICT, it will be
> > similar to !WAKEUP_PREEMPT.
> 
> (I do plan to fix that eventually, just need to do it)

I guess something like this ought to do, but its a tad late so I'm quite
sure :-)

---
 kernel/sched_fair.c |   44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -737,6 +737,24 @@ static inline struct sched_entity *paren
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+/*
+ * does pse (newly woken task) preempt se (current task)
+ */
+static int wakeup_preempt(struct sched_entity *se, struct sched_entity *pse)
+{
+	s64 delta, gran;
+
+	delta = se->vruntime - pse->vruntime;
+	gran = sysctl_sched_wakeup_granularity;
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		gran = calc_delta_fair(gran, &se->load);
+
+	if (delta > gran)
+		return 1;
+
+	return 0;
+}
+
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
@@ -764,6 +782,24 @@ static void hrtick_start_fair(struct rq 
 		if (!requeue)
 			delta = max(10000LL, delta);
 
+		/*
+		 * if we delayed wakeup preemption, shorten the slice to at most
+		 * 1 jiffy (does this call for yet another sysctl_sched_?)
+		 */
+		if (sched_feat(PREEMPT_RESTRICT) && first_fair(cfs_rq)) {
+			struct sched_entity *next = __pick_next_entity(cfs_rq);
+
+			if (wakeup_preempt(se, next)) {
+				u64 wakeup = NSEC_PER_SEC / HZ;
+				s64 delta2 = wakeup - ran;
+
+				if (delta2 < 0)
+					resched_task(rq->curr);
+				else
+					delta = min(delta, delta2);
+			}
+		}
+
 		hrtick_start(rq, delta, requeue);
 	}
 }
@@ -866,7 +902,6 @@ static void check_preempt_wakeup(struct 
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	s64 delta, gran;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -887,12 +922,7 @@ static void check_preempt_wakeup(struct 
 			pse = parent_entity(pse);
 		}
 
-		delta = se->vruntime - pse->vruntime;
-		gran = sysctl_sched_wakeup_granularity;
-		if (unlikely(se->load.weight != NICE_0_LOAD))
-			gran = calc_delta_fair(gran, &se->load);
-
-		if (delta > gran) {
+		if (wakeup_preempt(se, pse)) {
 			int now = !sched_feat(PREEMPT_RESTRICT);
 
 			if (now || p->prio < curr->prio || !se->peer_preempt++)



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: high-res preemption tick
  2007-10-13 23:13       ` Peter Zijlstra
@ 2007-10-13 23:16         ` Peter Zijlstra
  2007-10-14  6:34         ` Mike Galbraith
  1 sibling, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2007-10-13 23:16 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: linux-kernel, Ingo Molnar, Thomas Gleixner


On Sun, 2007-10-14 at 01:13 +0200, Peter Zijlstra wrote:
> On Sat, 2007-10-13 at 11:17 +0200, Peter Zijlstra wrote:
> 
> > > Ah, but HRTICK is not compatible with PREEMPT_RESTRICT, it will be
> > > similar to !WAKEUP_PREEMPT.
> > 
> > (I do plan to fix that eventually, just need to do it)
> 
> I guess something like this ought to do, but its a tad late so I'm quite
> sure :-)

I guess that proves it.... that should have read: 
  ... so I'm _not_ quite sure.



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] sched: high-res preemption tick
  2007-10-13 23:13       ` Peter Zijlstra
  2007-10-13 23:16         ` Peter Zijlstra
@ 2007-10-14  6:34         ` Mike Galbraith
  1 sibling, 0 replies; 13+ messages in thread
From: Mike Galbraith @ 2007-10-14  6:34 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, Ingo Molnar, Thomas Gleixner

On Sun, 2007-10-14 at 01:13 +0200, Peter Zijlstra wrote:
> On Sat, 2007-10-13 at 11:17 +0200, Peter Zijlstra wrote:
> 
> > > Ah, but HRTICK is not compatible with PREEMPT_RESTRICT, it will be
> > > similar to !WAKEUP_PREEMPT.
> > 
> > (I do plan to fix that eventually, just need to do it)
> 
> I guess something like this ought to do, but its a tad late so I'm quite
> sure :-)

2.6.23-smp-d-hrt + restrict fix patch
[SUM]  0.0-300.1 sec    176 GBytes  5.03 Gbits/sec
[SUM]  0.0-300.1 sec    175 GBytes  5.02 Gbits/sec
[SUM]  0.0-300.1 sec    176 GBytes  5.05 Gbits/sec

Context switches are further reduced (across the board) over
PREEMPT_RESTRICT, dropping from ~7-8k to ~2.5k with this test, vs ~950
for SCHED_BATCH and ~50k with this tree and no restriction.  Throughput
is ~96% of SCHED_BATCH, vs ~55% with no restriction.  I see no
interactivity regressions.

	-Mike


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] sched: SCHED_FIFO watchdog timer
  2007-10-13 22:51 ` [RFC][PATCH] sched: SCHED_FIFO watchdog timer Peter Zijlstra
@ 2007-10-15 13:26   ` Dmitry Adamushko
  2007-10-15 13:57     ` Peter Zijlstra
  2007-10-15 14:25   ` Lennart Poettering
  2007-10-15 21:32   ` Kay Sievers
  2 siblings, 1 reply; 13+ messages in thread
From: Dmitry Adamushko @ 2007-10-15 13:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Lennart Poettering

On 14/10/2007, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> The below patch is an idea proposed by tglx and depends on sched-devel +
> the hrtick patch previously posted.
>
> The current watchdog action is to demote the task to SCHED_NORMAL,
> however it might be wanted to deliver a signal instead (or have more per
> task configuration state). Which is why I added Lennart to the CC list
> as I gathered he would like something like this for PulseAudio.
>
> ---
> Subject: sched: SCHED_FIFO watchdog timer

Why only SCHED_FIFO and not SCHED_RR?
Their (mis)behavior is similar wrt SCHED_NORMAL tasks.


> +#ifdef CONFIG_SCHED_HRT_TICK
> +static int fifo_watchdog(struct rq *rq, struct task_struct *p, int queued)
> +{
> +       if (likely(!queued || p->policy != SCHED_FIFO))
> +               return 0;
> +
> +       /*
> +        * task has been naughty, turn into SCHED_NORMAL
> +        */
> +       printk(KERN_INFO "SCHED_FIFO task %s/%d exceeded his runtime quota,"
> +                       " demoting to regular task\n", p->comm, task_pid_nr(p));
> +       deactivate_task(rq, p, 0);
> +       __setscheduler(rq, p, SCHED_NORMAL, 0);
> +       activate_task(rq, p, 0);
> +       resched_task(p);

I guess, put_prev_task() / set_curr_task() should be called (for the
case of task_running(p)) to make it group-scheduler-friendly (as it's
done e.g. in sched_setscheduler()).

(normalize_task() should probably do the same)


-- 
Best regards,
Dmitry Adamushko

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] sched: SCHED_FIFO watchdog timer
  2007-10-15 13:26   ` Dmitry Adamushko
@ 2007-10-15 13:57     ` Peter Zijlstra
  0 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2007-10-15 13:57 UTC (permalink / raw)
  To: Dmitry Adamushko
  Cc: linux-kernel, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Lennart Poettering

[-- Attachment #1: Type: text/plain, Size: 1866 bytes --]

On Mon, 2007-10-15 at 15:26 +0200, Dmitry Adamushko wrote:
> On 14/10/2007, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > The below patch is an idea proposed by tglx and depends on sched-devel +
> > the hrtick patch previously posted.
> >
> > The current watchdog action is to demote the task to SCHED_NORMAL,
> > however it might be wanted to deliver a signal instead (or have more per
> > task configuration state). Which is why I added Lennart to the CC list
> > as I gathered he would like something like this for PulseAudio.
> >
> > ---
> > Subject: sched: SCHED_FIFO watchdog timer
> 
> Why only SHCED_FIFO and not SCHED_RR?
> Their (mis)behavior is similar wrt SCHED_NORMAL tasks.

Because SCHED_FIFO is easier, _RR is for later. It was mostly an RFC to
request behavioural wishes from the users.

> 
> > +#ifdef CONFIG_SCHED_HRT_TICK
> > +static int fifo_watchdog(struct rq *rq, struct task_struct *p, int queued)
> > +{
> > +       if (likely(!queued || p->policy != SCHED_FIFO))
> > +               return 0;
> > +
> > +       /*
> > +        * task has been naughty, turn into SCHED_NORMAL
> > +        */
> > +       printk(KERN_INFO "SCHED_FIFO task %s/%d exceeded his runtime quota,"
> > +                       " demoting to regular task\n", p->comm, task_pid_nr(p));
> > +       deactivate_task(rq, p, 0);
> > +       __setscheduler(rq, p, SCHED_NORMAL, 0);
> > +       activate_task(rq, p, 0);
> > +       resched_task(p);
> 
> I guess, put_prev_task() / set_curr_task() should be called (for the
> case of task_running(p)) to make it group-scheduler-friendly (as it's
> done e.g. in sched_setscheduler()).
> 
> (normilize_task() should probably do the same)

Right, that is where I copied from, I'll pull the functionality into a
single function and make this and the sysrq stuff use it.

Thanks!

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] sched: SCHED_FIFO watchdog timer
  2007-10-13 22:51 ` [RFC][PATCH] sched: SCHED_FIFO watchdog timer Peter Zijlstra
  2007-10-15 13:26   ` Dmitry Adamushko
@ 2007-10-15 14:25   ` Lennart Poettering
  2007-10-15 21:32   ` Kay Sievers
  2 siblings, 0 replies; 13+ messages in thread
From: Lennart Poettering @ 2007-10-15 14:25 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, Ingo Molnar, Thomas Gleixner, Mike Galbraith

On Sun, 14.10.07 00:51, Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:

> The below patch is an idea proposed by tglx and depends on sched-devel +
> the hrtick patch previously posted.
> 
> The current watchdog action is to demote the task to SCHED_NORMAL,
> however it might be wanted to deliver a signal instead (or have more per
> task configuration state). Which is why I added Lennart to the CC list
> as I gathered he would like something like this for PulseAudio.

Indeed! Having this in the kernel would allow us to enable RT
scheduling for PulseAudio by default without bad effects. I was thinking about
adding some kind of babysitting process to userspace -- but doing this as
an RLIMIT in the kernel strikes me a much better idea!

I think it would make a lot of sense to make the API very similar to
RLIMIT_CPU, i.e. also send out SIGXCPU and SIGKILL, with the single
difference that RLIMIT_CPU sends out a signal depending on the total
CPU time used for the process and the new RLIMIT based on the time the
process spent without sleeping. That would be a very reasonable
extension to the current RLIMIT_CPU model.

Thank you very much for doing this patch!

Lennart

-- 
Lennart Poettering                        Red Hat, Inc.
lennart [at] poettering [dot] net         ICQ# 11060553
http://0pointer.net/lennart/           GnuPG 0x1A015CC4

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] sched: SCHED_FIFO watchdog timer
  2007-10-13 22:51 ` [RFC][PATCH] sched: SCHED_FIFO watchdog timer Peter Zijlstra
  2007-10-15 13:26   ` Dmitry Adamushko
  2007-10-15 14:25   ` Lennart Poettering
@ 2007-10-15 21:32   ` Kay Sievers
  2 siblings, 0 replies; 13+ messages in thread
From: Kay Sievers @ 2007-10-15 21:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Thomas Gleixner, Mike Galbraith,
	Lennart Poettering

On 10/14/07, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> The below patch is an idea proposed by tglx and depends on sched-devel +
> the hrtick patch previously posted.
>
> The current watchdog action is to demote the task to SCHED_NORMAL,
> however it might be wanted to deliver a signal instead (or have more per
> task configuration state). Which is why I added Lennart to the CC list
> as I gathered he would like something like this for PulseAudio.

Great, this looks very promising. Thanks for doing this.

Kay

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2007-10-15 21:33 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-10-12 20:51 [PATCH] sched: high-res preemption tick Peter Zijlstra
2007-10-13  7:18 ` Mike Galbraith
2007-10-13  8:55   ` Peter Zijlstra
2007-10-13  9:17     ` Peter Zijlstra
2007-10-13 10:11       ` Mike Galbraith
2007-10-13 23:13       ` Peter Zijlstra
2007-10-13 23:16         ` Peter Zijlstra
2007-10-14  6:34         ` Mike Galbraith
2007-10-13 22:51 ` [RFC][PATCH] sched: SCHED_FIFO watchdog timer Peter Zijlstra
2007-10-15 13:26   ` Dmitry Adamushko
2007-10-15 13:57     ` Peter Zijlstra
2007-10-15 14:25   ` Lennart Poettering
2007-10-15 21:32   ` Kay Sievers

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.