linux-kernel.vger.kernel.org archive mirror
* [PATCH 00/11] another rt group sched update
@ 2008-01-06 16:11 Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 01/11] sched: rt throttling vs no_hz Peter Zijlstra
                   ` (12 more replies)
  0 siblings, 13 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

this time compile-tested on all 16 combinations of:

  CONFIG_SMP
  CONFIG_FAIR_GROUP_SCHED
  CONFIG_HIGH_RES_TIMERS
  CONFIG_NO_HZ

ran some, but not all, of the combinations
--



* [PATCH 01/11] sched: rt throttling vs no_hz
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 02/11] sched: load_balance_monitor rename Peter Zijlstra
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-update.patch --]
[-- Type: text/plain, Size: 4621 bytes --]

We need to teach no_hz about rt throttling because it is tick driven.
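
As an aside, here is a standalone model of the arithmetic involved (not kernel
code; HZ and the example values are assumptions for illustration): the new
rt_needs_cpu() hook converts the time left in the current rt period into
jiffies, so the nohz code can bound how long it stops the tick.

  #include <stdio.h>
  #include <stdint.h>

  #define NSEC_PER_SEC 1000000000ULL
  #define HZ           1000             /* assumed tick rate for the example */

  /* userspace model of rt_needs_cpu(): jiffies until the rt period expires */
  static unsigned long rt_needs_cpu_model(uint64_t clock, uint64_t expire,
                                          int throttled)
  {
          if (!throttled)
                  return 0;             /* nothing throttled: no constraint */
          if (clock > expire)
                  return 1;             /* overdue: need the very next tick */
          return (unsigned long)((expire - clock) / (NSEC_PER_SEC / HZ));
  }

  int main(void)
  {
          /* throttled with 2.5ms of the period left -> wake within 2 jiffies */
          printf("%lu\n", rt_needs_cpu_model(1000000000ULL, 1002500000ULL, 1));
          /* not throttled -> 0, the tick may stay stopped */
          printf("%lu\n", rt_needs_cpu_model(1000000000ULL, 1002500000ULL, 0));
          return 0;
  }

In the patch, tick_nohz_stop_sched_tick() then clamps delta_jiffies to this
value, so a throttled runqueue still gets its replenishment tick.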

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h    |    2 ++
 kernel/sched.c           |   23 ++++++++++++++++++++++-
 kernel/sched_rt.c        |   30 ++++++++++++++++--------------
 kernel/time/tick-sched.c |    5 +++++
 4 files changed, 45 insertions(+), 15 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,6 +230,8 @@ static inline int select_nohz_load_balan
 }
 #endif
 
+extern unsigned long rt_needs_cpu(int cpu);
+
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -442,6 +442,7 @@ struct rq {
 	struct cfs_rq cfs;
 	struct rt_rq rt;
 	u64 rt_period_expire;
+	int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+unsigned long rt_needs_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 delta;
+
+	if (!rq->rt_throttled)
+		return 0;
+
+	if (rq->clock > rq->rt_period_expire)
+		return 1;
+
+	delta = rq->rt_period_expire - rq->clock;
+	do_div(delta, NSEC_PER_SEC / HZ);
+
+	return (unsigned long)delta;
+}
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -7099,9 +7117,11 @@ static void init_rt_rq(struct rt_rq *rt_
 	/* delimiter for bitsearch: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
-	rt_rq->highest_prio = MAX_RT_PRIO;
 	rt_rq->overloaded = 0;
 #endif
 
@@ -7186,6 +7206,7 @@ void __init sched_init(void)
 		list_add(&init_task_group.list, &task_groups);
 #endif
 		rq->rt_period_expire = 0;
+		rq->rt_throttled = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struc
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+
+		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
+
 		sched_rt_ratio_dequeue(rt_rq);
 		return 1;
 	}
@@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struc
 	return 0;
 }
 
-static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
-{
-	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-	if (rt_rq->rt_throttled) {
-		rt_rq->rt_throttled = 0;
-		sched_rt_ratio_enqueue(rt_rq);
-	}
-}
-
 static void update_sched_rt_period(struct rq *rq)
 {
 	struct rt_rq *rt_rq;
@@ -204,8 +196,18 @@ static void update_sched_rt_period(struc
 		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
 		rq->rt_period_expire += period;
 
-		for_each_leaf_rt_rq(rt_rq, rq)
-			__update_sched_rt_period(rt_rq, period);
+		for_each_leaf_rt_rq(rt_rq, rq) {
+			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+			if (rt_rq->rt_throttled) {
+				rt_rq->rt_throttled = 0;
+				sched_rt_ratio_enqueue(rt_rq);
+			}
+		}
+
+		rq->rt_throttled = 0;
 	}
 }
 
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+	unsigned long rt_jiffies;
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now, delta;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	delta_jiffies = next_jiffies - last_jiffies;
 
+	rt_jiffies = rt_needs_cpu(cpu);
+	if (rt_jiffies && rt_jiffies < delta_jiffies)
+		delta_jiffies = rt_jiffies;
+
 	if (rcu_needs_cpu(cpu))
 		delta_jiffies = 1;
 	/*

--



* [PATCH 02/11] sched: load_balance_monitor rename
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 01/11] sched: rt throttling vs no_hz Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 03/11] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-group-fixes.patch --]
[-- Type: text/plain, Size: 830 bytes --]

Don't start the load_balance_monitor when there is only a single cpu.
Rename the kthread because its name is currently longer than TASK_COMM_LEN.
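
For reference, TASK_COMM_LEN is 16 in this tree, so the old name cannot fit in
a task comm. A standalone illustration of the truncation (not kernel code):

  #include <stdio.h>
  #include <string.h>

  #define TASK_COMM_LEN 16      /* size of the kernel's fixed comm buffer */

  int main(void)
  {
          char comm[TASK_COMM_LEN];

          /* "load_balance_monitor" is 20 chars and gets silently cut off */
          strncpy(comm, "load_balance_monitor", sizeof(comm) - 1);
          comm[sizeof(comm) - 1] = '\0';
          printf("%s\n", comm);         /* -> "load_balance_mo" */

          /* "group_balance" (13 chars) fits with room to spare */
          strncpy(comm, "group_balance", sizeof(comm) - 1);
          comm[sizeof(comm) - 1] = '\0';
          printf("%s\n", comm);         /* -> "group_balance" */
          return 0;
  }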

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7070,8 +7070,11 @@ void __init sched_init_smp(void)
 	sched_init_granularity();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	if (nr_cpu_ids == 1)
+		return;
+
 	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-					 "load_balance_monitor");
+					 "group_balance");
 	if (!IS_ERR(lb_monitor_task)) {
 		lb_monitor_task->flags |= PF_NOFREEZE;
 		wake_up_process(lb_monitor_task);

--



* [PATCH 03/11] hrtimer: clean up cpu->base locking tricks
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 01/11] sched: rt throttling vs no_hz Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 02/11] sched: load_balance_monitor rename Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: hrtimer-unlocked-callback.patch --]
[-- Type: text/plain, Size: 2786 bytes --]

In order to more easily allow the scheduler to use timers, clean up
the locking a bit.
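
Roughly, the pattern being introduced looks like this (a standalone sketch
using a pthread mutex in place of cpu_base->lock; the names mirror the patch
but none of this is kernel code): scheduler-class timers have their callbacks
run with the base lock dropped, so they may take rq->lock without creating a
lock inversion.

  #include <pthread.h>
  #include <stdio.h>

  enum cb_mode { CB_SOFTIRQ, CB_IRQSAFE_NO_SOFTIRQ };

  struct timer {
          enum cb_mode cb_mode;
          int (*function)(struct timer *);
  };

  static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;

  /* expiry loop body: drop base_lock around callbacks that take other locks */
  static void run_expired(struct timer *timer)
  {
          int restart;

          pthread_mutex_lock(&base_lock);
          if (timer->cb_mode == CB_IRQSAFE_NO_SOFTIRQ) {
                  pthread_mutex_unlock(&base_lock);
                  restart = timer->function(timer); /* may take rq->lock etc. */
                  pthread_mutex_lock(&base_lock);
          } else {
                  restart = timer->function(timer);
          }
          /* a restarting timer would be re-enqueued here, under base_lock */
          (void)restart;
          pthread_mutex_unlock(&base_lock);
  }

  static int wakeup_cb(struct timer *t)
  {
          (void)t;
          printf("callback ran without base_lock held\n");
          return 0;
  }

  int main(void)
  {
          struct timer t = { CB_IRQSAFE_NO_SOFTIRQ, wakeup_cb };

          run_expired(&t);
          return 0;
  }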

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/hrtimer.c         |  109 +++++++++++++++++++++++++++++++++++++++++++----
 kernel/time/tick-sched.c |    8 ---
 2 files changed, 102 insertions(+), 15 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1063,7 +1063,9 @@ void hrtimer_interrupt(struct clock_even
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
+			enum hrtimer_restart (*fn)(struct hrtimer *);
 			struct hrtimer *timer;
+			int restart;
 
 			timer = rb_entry(node, struct hrtimer, node);
 
@@ -1091,13 +1093,29 @@ void hrtimer_interrupt(struct clock_even
 					 HRTIMER_STATE_CALLBACK, 0);
 			timer_stats_account_hrtimer(timer);
 
+			fn = timer->function;
+			if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+				/*
+				 * Used for scheduler timers, avoid lock
+				 * inversion with rq->lock and tasklist_lock.
+				 *
+				 * These timers are required to deal with
+				 * enqueue expiry themselves and are not
+				 * allowed to migrate.
+				 */
+				spin_unlock(&cpu_base->lock);
+				restart = fn(timer);
+				spin_lock(&cpu_base->lock);
+			} else
+				restart = fn(timer);
+
 			/*
 			 * Note: We clear the CALLBACK bit after
 			 * enqueue_hrtimer to avoid reprogramming of
 			 * the event hardware. This happens at the end
 			 * of this function anyway.
 			 */
-			if (timer->function(timer) != HRTIMER_NORESTART) {
+			if (restart != HRTIMER_NORESTART) {
 				BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
 				enqueue_hrtimer(timer, base, 0);
 			}
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -514,7 +514,6 @@ static enum hrtimer_restart tick_sched_t
 {
 	struct tick_sched *ts =
 		container_of(timer, struct tick_sched, sched_timer);
-	struct hrtimer_cpu_base *base = timer->base->cpu_base;
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 	int cpu = smp_processor_id();
@@ -552,15 +551,8 @@ static enum hrtimer_restart tick_sched_t
 			touch_softlockup_watchdog();
 			ts->idle_jiffies++;
 		}
-		/*
-		 * update_process_times() might take tasklist_lock, hence
-		 * drop the base lock. sched-tick hrtimers are per-CPU and
-		 * never accessible by userspace APIs, so this is safe to do.
-		 */
-		spin_unlock(&base->lock);
 		update_process_times(user_mode(regs));
 		profile_tick(CPU_PROFILING);
-		spin_lock(&base->lock);
 	}
 
 	/* Do not restart, when we are in the idle loop */

--



* [PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (2 preceding siblings ...)
  2008-01-06 16:11 ` [PATCH 03/11] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-07 11:56   ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 05/11] hrtimer: unlock hrtimer_wakeup Peter Zijlstra
                   ` (8 subsequent siblings)
  12 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: hrtimer-fallback.patch --]
[-- Type: text/plain, Size: 11137 bytes --]

Currently all highres=off timers are run from softirq context, but
HRTIMER_CB_IRQSAFE_NO_SOFTIRQ timers expect to run from irq context.

Fix this up by splitting it, similarly to the highres=on case.
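
Schematically (a standalone sketch, not kernel code; the list handling is
trimmed down), the highres=off path now mirrors the highres=on one: the
per-jiffy queue run, now called from hardirq context, defers ordinary
callbacks to a pending list for the timer softirq and only runs the
IRQSAFE_NO_SOFTIRQ ones inline.

  #include <stdio.h>

  enum cb_mode { CB_SOFTIRQ, CB_IRQSAFE_NO_SOFTIRQ };

  struct timer {
          enum cb_mode cb_mode;
          void (*fn)(void);
  };

  static struct timer *cb_pending[16];
  static int nr_pending;

  /* model of the per-jiffy queue run (hardirq context) */
  static void run_queue(struct timer *t)
  {
          if (t->cb_mode == CB_SOFTIRQ)
                  cb_pending[nr_pending++] = t;   /* defer to the softirq */
          else
                  t->fn();                /* scheduler timers run inline */
  }

  /* model of run_hrtimer_pending() (timer softirq context) */
  static void run_pending(void)
  {
          while (nr_pending)
                  cb_pending[--nr_pending]->fn();
  }

  static void sched_cb(void) { printf("ran from the hardirq path\n"); }
  static void plain_cb(void) { printf("ran from the softirq path\n"); }

  int main(void)
  {
          struct timer a = { CB_IRQSAFE_NO_SOFTIRQ, sched_cb };
          struct timer b = { CB_SOFTIRQ, plain_cb };

          run_queue(&a);
          run_queue(&b);
          run_pending();
          return 0;
  }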

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/hrtimer.h |    5 -
 kernel/hrtimer.c        |  232 +++++++++++++++++++++++++-----------------------
 kernel/timer.c          |    3 
 3 files changed, 125 insertions(+), 115 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -622,6 +622,11 @@ static inline int hrtimer_cb_pending(str
 static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+				    struct hrtimer_clock_base *base)
+{
+	return 0;
+}
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
@@ -1030,6 +1035,85 @@ int hrtimer_get_res(const clockid_t whic
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
+static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
+{
+	spin_lock_irq(&cpu_base->lock);
+
+	while (!list_empty(&cpu_base->cb_pending)) {
+		enum hrtimer_restart (*fn)(struct hrtimer *);
+		struct hrtimer *timer;
+		int restart;
+
+		timer = list_entry(cpu_base->cb_pending.next,
+				   struct hrtimer, cb_entry);
+
+		timer_stats_account_hrtimer(timer);
+
+		fn = timer->function;
+		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
+		spin_unlock_irq(&cpu_base->lock);
+
+		restart = fn(timer);
+
+		spin_lock_irq(&cpu_base->lock);
+
+		timer->state &= ~HRTIMER_STATE_CALLBACK;
+		if (restart == HRTIMER_RESTART) {
+			BUG_ON(hrtimer_active(timer));
+			/*
+			 * Enqueue the timer, allow reprogramming of the event
+			 * device
+			 */
+			enqueue_hrtimer(timer, timer->base, 1);
+		} else if (hrtimer_active(timer)) {
+			/*
+			 * If the timer was rearmed on another CPU, reprogram
+			 * the event device.
+			 */
+			if (timer->base->first == &timer->node)
+				hrtimer_reprogram(timer, timer->base);
+		}
+	}
+	spin_unlock_irq(&cpu_base->lock);
+}
+
+static void __run_hrtimer(struct hrtimer *timer)
+{
+	struct hrtimer_clock_base *base = timer->base;
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+	enum hrtimer_restart (*fn)(struct hrtimer *);
+	int restart;
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	timer_stats_account_hrtimer(timer);
+
+	fn = timer->function;
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+		/*
+		 * Used for scheduler timers, avoid lock inversion with
+		 * rq->lock and tasklist_lock.
+		 *
+		 * These timers are required to deal with enqueue expiry
+		 * themselves and are not allowed to migrate.
+		 */
+		spin_unlock(&cpu_base->lock);
+		restart = fn(timer);
+		spin_lock(&cpu_base->lock);
+	} else
+		restart = fn(timer);
+
+	/*
+	 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
+	 * reprogramming of the event hardware. This happens at the end of this
+	 * function anyway.
+	 */
+	if (restart != HRTIMER_NORESTART) {
+		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+		enqueue_hrtimer(timer, base, 0);
+	}
+	timer->state &= ~HRTIMER_STATE_CALLBACK;
+}
+
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 /*
@@ -1063,9 +1147,7 @@ void hrtimer_interrupt(struct clock_even
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
-			enum hrtimer_restart (*fn)(struct hrtimer *);
 			struct hrtimer *timer;
-			int restart;
 
 			timer = rb_entry(node, struct hrtimer, node);
 
@@ -1089,37 +1171,7 @@ void hrtimer_interrupt(struct clock_even
 				continue;
 			}
 
-			__remove_hrtimer(timer, base,
-					 HRTIMER_STATE_CALLBACK, 0);
-			timer_stats_account_hrtimer(timer);
-
-			fn = timer->function;
-			if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
-				/*
-				 * Used for scheduler timers, avoid lock
-				 * inversion with rq->lock and tasklist_lock.
-				 *
-				 * These timers are required to deal with
-				 * enqueue expiry themselves and are not
-				 * allowed to migrate.
-				 */
-				spin_unlock(&cpu_base->lock);
-				restart = fn(timer);
-				spin_lock(&cpu_base->lock);
-			} else
-				restart = fn(timer);
-
-			/*
-			 * Note: We clear the CALLBACK bit after
-			 * enqueue_hrtimer to avoid reprogramming of
-			 * the event hardware. This happens at the end
-			 * of this function anyway.
-			 */
-			if (restart != HRTIMER_NORESTART) {
-				BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
-				enqueue_hrtimer(timer, base, 0);
-			}
-			timer->state &= ~HRTIMER_STATE_CALLBACK;
+			__run_hrtimer(timer);
 		}
 		spin_unlock(&cpu_base->lock);
 		base++;
@@ -1140,52 +1192,41 @@ void hrtimer_interrupt(struct clock_even
 
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
-	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
-	spin_lock_irq(&cpu_base->lock);
-
-	while (!list_empty(&cpu_base->cb_pending)) {
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		struct hrtimer *timer;
-		int restart;
-
-		timer = list_entry(cpu_base->cb_pending.next,
-				   struct hrtimer, cb_entry);
+	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
+}
 
-		timer_stats_account_hrtimer(timer);
+#endif	/* CONFIG_HIGH_RES_TIMERS */
 
-		fn = timer->function;
-		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
+/*
+ * Called from timer softirq every jiffy, expire hrtimers:
+ *
+ * For HRT its the fall back code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
+ */
+void hrtimer_run_pending(void)
+{
+	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 
-		restart = fn(timer);
+	if (hrtimer_hres_active())
+		return;
 
-		spin_lock_irq(&cpu_base->lock);
+	/*
+	 * This _is_ ugly: We have to check in the softirq context,
+	 * whether we can switch to highres and / or nohz mode. The
+	 * clocksource switch happens in the timer interrupt with
+	 * xtime_lock held. Notification from there only sets the
+	 * check bit in the tick_oneshot code, otherwise we might
+	 * deadlock vs. xtime_lock.
+	 */
+	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+		hrtimer_switch_to_hres();
 
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart == HRTIMER_RESTART) {
-			BUG_ON(hrtimer_active(timer));
-			/*
-			 * Enqueue the timer, allow reprogramming of the event
-			 * device
-			 */
-			enqueue_hrtimer(timer, timer->base, 1);
-		} else if (hrtimer_active(timer)) {
-			/*
-			 * If the timer was rearmed on another CPU, reprogram
-			 * the event device.
-			 */
-			if (timer->base->first == &timer->node)
-				hrtimer_reprogram(timer, timer->base);
-		}
-	}
-	spin_unlock_irq(&cpu_base->lock);
+	run_hrtimer_pending(cpu_base);
 }
 
-#endif	/* CONFIG_HIGH_RES_TIMERS */
-
 /*
- * Expire the per base hrtimer-queue:
+ * Called from hardirq context every jiffy
  */
 static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
 				     int index)
@@ -1199,46 +1240,27 @@ static inline void run_hrtimer_queue(str
 	if (base->get_softirq_time)
 		base->softirq_time = base->get_softirq_time();
 
-	spin_lock_irq(&cpu_base->lock);
+	spin_lock(&cpu_base->lock);
 
 	while ((node = base->first)) {
 		struct hrtimer *timer;
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		int restart;
 
 		timer = rb_entry(node, struct hrtimer, node);
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-		WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ);
-#endif
-		timer_stats_account_hrtimer(timer);
-
-		fn = timer->function;
-		__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
-
-		restart = fn(timer);
-
-		spin_lock_irq(&cpu_base->lock);
-
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart != HRTIMER_NORESTART) {
-			BUG_ON(hrtimer_active(timer));
-			enqueue_hrtimer(timer, base, 0);
+		if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
+			__remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
+			list_add_tail(&timer->cb_entry,
+					&base->cpu_base->cb_pending);
+			continue;
 		}
+
+		__run_hrtimer(timer);
 	}
-	spin_unlock_irq(&cpu_base->lock);
+	spin_unlock(&cpu_base->lock);
 }
 
-/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
- */
 void hrtimer_run_queues(void)
 {
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1247,18 +1269,6 @@ void hrtimer_run_queues(void)
 	if (hrtimer_hres_active())
 		return;
 
-	/*
-	 * This _is_ ugly: We have to check in the softirq context,
-	 * whether we can switch to highres and / or nohz mode. The
-	 * clocksource switch happens in the timer interrupt with
-	 * xtime_lock held. Notification from there only sets the
-	 * check bit in the tick_oneshot code, otherwise we might
-	 * deadlock vs. xtime_lock.
-	 */
-	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
-		if (hrtimer_switch_to_hres())
-			return;
-
 	hrtimer_get_softirq_time(cpu_base);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -896,7 +896,7 @@ static void run_timer_softirq(struct sof
 {
 	tvec_base_t *base = __get_cpu_var(tvec_bases);
 
-	hrtimer_run_queues();
+	hrtimer_run_pending();
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
@@ -907,6 +907,7 @@ static void run_timer_softirq(struct sof
  */
 void run_local_timers(void)
 {
+	hrtimer_run_queues();
 	raise_softirq(TIMER_SOFTIRQ);
 	softlockup_tick();
 }
Index: linux-2.6/include/linux/hrtimer.h
===================================================================
--- linux-2.6.orig/include/linux/hrtimer.h
+++ linux-2.6/include/linux/hrtimer.h
@@ -115,10 +115,8 @@ struct hrtimer {
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
-#ifdef CONFIG_HIGH_RES_TIMERS
 	enum hrtimer_cb_mode		cb_mode;
 	struct list_head		cb_entry;
-#endif
 #ifdef CONFIG_TIMER_STATS
 	void				*start_site;
 	char				start_comm[16];
@@ -194,10 +192,10 @@ struct hrtimer_cpu_base {
 	spinlock_t			lock;
 	struct lock_class_key		lock_key;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
+	struct list_head		cb_pending;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t				expires_next;
 	int				hres_active;
-	struct list_head		cb_pending;
 	unsigned long			nr_events;
 #endif
 };
@@ -319,6 +317,7 @@ extern void hrtimer_init_sleeper(struct 
 
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
+extern void hrtimer_run_pending(void);
 
 /* Bootup initialization: */
 extern void __init hrtimers_init(void);

--



* [PATCH 05/11] hrtimer: unlock hrtimer_wakeup
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (3 preceding siblings ...)
  2008-01-06 16:11 ` [PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 06/11] sched: rt-group: reduce rescheduling Peter Zijlstra
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: hrtimer-do_nanosleep.patch --]
[-- Type: text/plain, Size: 1116 bytes --]

hrtimer_wakeup creates a

  base->lock
    rq->lock

lock dependency. Avoid this by switching to HRTIMER_CB_IRQSAFE_NO_SOFTIRQ,
which doesn't hold base->lock.

This fully untangles hrtimer locks from the scheduler locks, and allows
hrtimer usage in the scheduler proper.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/hrtimer.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1296,7 +1296,7 @@ void hrtimer_init_sleeper(struct hrtimer
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 #endif
 }
 
@@ -1307,6 +1307,8 @@ static int __sched do_nanosleep(struct h
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
 		hrtimer_start(&t->timer, t->timer.expires, mode);
+		if (!hrtimer_active(&t->timer))
+			t->task = NULL;
 
 		if (likely(t->task))
 			schedule();

--



* [PATCH 06/11] sched: rt-group: reduce rescheduling
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (4 preceding siblings ...)
  2008-01-06 16:11 ` [PATCH 05/11] hrtimer: unlock hrtimer_wakeup Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 07/11] sched: rt-group: per group period Peter Zijlstra
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-fix-enqueue.patch --]
[-- Type: text/plain, Size: 748 bytes --]

Only reschedule if the new group has a higher prio task.
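
Keep in mind that kernel priorities are inverted (a lower number means a
higher priority), so the added check reads "the group's best task beats the
currently running one". A minimal standalone illustration, not kernel code:

  #include <stdio.h>

  /* kernel prio scale: lower number == higher priority */
  static int should_resched(int group_highest_prio, int curr_prio)
  {
          return group_highest_prio < curr_prio;
  }

  int main(void)
  {
          /* an unthrottled group with a prio-10 task preempts a prio-20 curr */
          printf("%d\n", should_resched(10, 20));   /* 1 */
          /* ...but not a prio-5 curr, avoiding a pointless reschedule */
          printf("%d\n", should_resched(10, 5));    /* 0 */
          return 0;
  }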

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched_rt.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -94,8 +94,11 @@ static void sched_rt_ratio_enqueue(struc
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
 	if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
+		struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+
 		enqueue_rt_entity(rt_se);
-		resched_task(rq_of_rt_rq(rt_rq)->curr);
+		if (rt_rq->highest_prio < curr->prio)
+			resched_task(curr);
 	}
 }
 

--



* [PATCH 07/11] sched: rt-group: per group period
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (5 preceding siblings ...)
  2008-01-06 16:11 ` [PATCH 06/11] sched: rt-group: reduce rescheduling Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 08/11] sched: rt-group: deal with PI Peter Zijlstra
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-rq-hrtimer.patch --]
[-- Type: text/plain, Size: 14233 bytes --]

Steven asked for per-group periods in order to get closer to RMA or EDF
scheduling.

Use the fancy new hrtimers to provide a per-group period.
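
To make the unit change concrete (a standalone model of the arithmetic; the
95% ratio is an assumed example value, the real default lives in
sysctl_sched_rt_ratio): sysctl_sched_rt_period is now expressed in
microseconds, and the per-period budget is still
period * rt_ratio >> SCHED_RT_FRAC_SHIFT.

  #include <stdio.h>
  #include <stdint.h>

  #define NSEC_PER_USEC        1000ULL
  #define SCHED_RT_FRAC_SHIFT  16

  int main(void)
  {
          uint64_t period_us = 1000000;           /* new default: 1s, in us */
          uint64_t period_ns = period_us * NSEC_PER_USEC;

          /* example ratio: 95% of the frac scale (assumed value) */
          uint64_t rt_ratio   = (95 * (1ULL << SCHED_RT_FRAC_SHIFT)) / 100;
          uint64_t budget_ns  = (period_ns * rt_ratio) >> SCHED_RT_FRAC_SHIFT;

          printf("period %llu ns, rt budget %llu ns per period\n",
                 (unsigned long long)period_ns,
                 (unsigned long long)budget_ns);
          return 0;
  }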

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h    |    2 
 kernel/sched.c           |  229 ++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sched_rt.c        |   61 ++++++------
 kernel/sysctl.c          |    2 
 kernel/time/tick-sched.c |    5 -
 5 files changed, 237 insertions(+), 62 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -177,6 +177,7 @@ struct task_group {
 	struct rt_rq **rt_rq;
 
 	unsigned int rt_ratio;
+	ktime_t rt_period;
 
 	/*
 	 * shares assigned to a task group governs how much of cpu bandwidth
@@ -372,6 +373,7 @@ struct rt_rq {
 #endif
 	int rt_throttled;
 	u64 rt_time;
+	struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;
@@ -441,8 +443,6 @@ struct rq {
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
-	u64 rt_period_expire;
-	int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-unsigned long rt_needs_cpu(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	u64 delta;
-
-	if (!rq->rt_throttled)
-		return 0;
-
-	if (rq->clock > rq->rt_period_expire)
-		return 1;
-
-	delta = rq->rt_period_expire - rq->clock;
-	do_div(delta, NSEC_PER_SEC / HZ);
-
-	return (unsigned long)delta;
-}
-
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+const_debug unsigned int sysctl_sched_rt_period = 1000000;
 
 #define SCHED_RT_FRAC_SHIFT	16
 #define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)
@@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #endif /* CONFIG_SMP */
 
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+	static const ktime_t ktime_zero = { .tv64 = 0 };
+	return ktime_add_ns(ktime_zero, ns);
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -3741,7 +3730,6 @@ void scheduler_tick(void)
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
-	update_sched_rt_period(rq);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -5287,6 +5275,158 @@ static inline void sched_init_granularit
 	sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+	struct rt_rq *rt_rq =
+		container_of(timer, struct rt_rq, rt_period_timer);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
+	ktime_t now = ktime_get();
+
+	WARN_ON(smp_processor_id() != cpu_of(rq));
+	WARN_ON(!in_irq());
+
+	spin_lock(&rq->lock);
+	update_sched_rt_period(rt_rq);
+	spin_unlock(&rq->lock);
+
+	hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+	return HRTIMER_RESTART;
+}
+
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+	ktime_t period = sched_rt_period(rt_rq);
+
+	WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+	for (;;) {
+		ktime_t now = ktime_get();
+		hrtimer_forward(&rt_rq->rt_period_timer, now, period);
+		hrtimer_start(&rt_rq->rt_period_timer,
+				rt_rq->rt_period_timer.expires,
+				HRTIMER_MODE_ABS);
+		if (hrtimer_active(&rt_rq->rt_period_timer))
+			break;
+	}
+}
+
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+static void sched_rt_period_stop(struct rt_rq *rt_rq)
+{
+	hrtimer_cancel(&rt_rq->rt_period_timer);
+}
+#endif
+
+static void sched_rt_period_start_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rt_rq *rt_rq;
+
+	for_each_leaf_rt_rq(rt_rq, rq)
+		sched_rt_period_start(rt_rq);
+}
+
+#ifdef CONFIG_SMP
+static void sched_rt_period_stop_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rt_rq *rt_rq;
+
+	for_each_leaf_rt_rq(rt_rq, rq)
+		sched_rt_period_stop(rt_rq);
+}
+
+static int sched_rt_period_hotplug(struct notifier_block *nfb,
+		unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		sched_rt_period_start_cpu(cpu);
+		return NOTIFY_OK;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		sched_rt_period_stop_cpu(cpu);
+		return NOTIFY_OK;
+
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static void __init __sched_rt_period_init(void *arg)
+{
+	int cpu = smp_processor_id();
+	sched_rt_period_start_cpu(cpu);
+}
+
+static void __init sched_rt_period_init(void)
+{
+	on_each_cpu(__sched_rt_period_init, NULL, 0, 1);
+	hotcpu_notifier(sched_rt_period_hotplug, 0);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void __sched_rt_period_init_tg(void *arg)
+{
+	struct task_group *tg = arg;
+	int cpu = smp_processor_id();
+
+	sched_rt_period_start(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+	on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1);
+}
+
+static void __sched_rt_period_destroy_tg(void *arg)
+{
+	struct task_group *tg = arg;
+	int cpu = smp_processor_id();
+
+	sched_rt_period_stop(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+	on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#else /* CONFIG_SMP */
+static void __init sched_rt_period_init(void)
+{
+	sched_rt_period_start_cpu(0);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+	sched_rt_period_start(tg->rt_rq[0]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+	sched_rt_period_stop(tg->rt_rq[0]);
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -7068,6 +7208,7 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
+	sched_rt_period_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (nr_cpu_ids == 1)
@@ -7088,6 +7229,7 @@ void __init sched_init_smp(void)
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
+	sched_rt_period_init();
 }
 #endif /* CONFIG_SMP */
 
@@ -7131,6 +7273,11 @@ static void init_rt_rq(struct rt_rq *rt_
 	rt_rq->rt_time = 0;
 	rt_rq->rt_throttled = 0;
 
+	hrtimer_init(&rt_rq->rt_period_timer,
+			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rt_rq->rt_period_timer.function = sched_rt_period_timer;
+	rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	rt_rq->rq = rq;
 #endif
@@ -7201,6 +7348,8 @@ void __init sched_init(void)
 				&per_cpu(init_sched_entity, i), i, 1);
 
 		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+		init_task_group.rt_period =
+			ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
@@ -7208,8 +7357,6 @@ void __init sched_init(void)
 
 		list_add(&init_task_group.list, &task_groups);
 #endif
-		rq->rt_period_expire = 0;
-		rq->rt_throttled = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
@@ -7598,6 +7745,7 @@ struct task_group *sched_create_group(vo
 
 	tg->shares = NICE_0_LOAD;
 	tg->rt_ratio = 0; /* XXX */
+	tg->rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
@@ -7637,6 +7785,8 @@ struct task_group *sched_create_group(vo
 	list_add_rcu(&tg->list, &task_groups);
 	unlock_task_group_list();
 
+	sched_rt_period_init_tg(tg);
+
 	return tg;
 
 err:
@@ -7658,6 +7808,8 @@ void sched_destroy_group(struct task_gro
 	struct rt_rq *rt_rq = NULL;
 	int i;
 
+	sched_rt_period_destroy_tg(tg);
+
 	lock_task_group_list();
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
@@ -7815,6 +7967,19 @@ unsigned long sched_group_rt_ratio(struc
 	return tg->rt_ratio;
 }
 
+int sched_group_set_rt_period(struct task_group *tg, unsigned long rt_period)
+{
+	tg->rt_period = ns_to_ktime((u64)rt_period * NSEC_PER_USEC);
+	return 0;
+}
+
+unsigned long sched_group_rt_period(struct task_group *tg)
+{
+	u64 ns = ktime_to_ns(tg->rt_period);
+	do_div(ns, NSEC_PER_USEC);
+	return ns;
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7903,6 +8068,17 @@ static u64 cpu_rt_ratio_read_uint(struct
 	return (u64) tg->rt_ratio;
 }
 
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+		u64 rt_period_val)
+{
+	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_val);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+	return (u64) sched_group_rt_period(cgroup_tg(cgrp));
+}
+
 static struct cftype cpu_files[] = {
 	{
 		.name = "shares",
@@ -7914,6 +8090,11 @@ static struct cftype cpu_files[] = {
 		.read_uint = cpu_rt_ratio_read_uint,
 		.write_uint = cpu_rt_ratio_write_uint,
 	},
+	{
+		.name = "rt_period_us",
+		.read_uint = cpu_rt_period_read_uint,
+		.write_uint = cpu_rt_period_write_uint,
+	},
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,8 +230,6 @@ static inline int select_nohz_load_balan
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -65,6 +65,17 @@ static inline unsigned int sched_rt_rati
 	return rt_rq->tg->rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	BUG_ON(!rt_rq->tg);
+	return rt_rq->tg->rt_period;
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	return ktime_to_ns(sched_rt_period(rt_rq));
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
@@ -117,6 +128,16 @@ static inline unsigned int sched_rt_rati
 	return sysctl_sched_rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	return ns_to_ktime((u64)sysctl_sched_rt_period * NSEC_PER_USEC);
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 
@@ -174,15 +195,11 @@ static int sched_rt_ratio_exceeded(struc
 	if (rt_rq->rt_throttled)
 		return 1;
 
-	period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+	period = sched_rt_period_ns(rt_rq);
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
-		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
-
 		sched_rt_ratio_dequeue(rt_rq);
 		return 1;
 	}
@@ -190,27 +207,16 @@ static int sched_rt_ratio_exceeded(struc
 	return 0;
 }
 
-static void update_sched_rt_period(struct rq *rq)
+static void update_sched_rt_period(struct rt_rq *rt_rq)
 {
-	struct rt_rq *rt_rq;
-	u64 period;
-
-	while (rq->clock > rq->rt_period_expire) {
-		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
-		rq->rt_period_expire += period;
-
-		for_each_leaf_rt_rq(rt_rq, rq) {
-			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-			if (rt_rq->rt_throttled) {
-				rt_rq->rt_throttled = 0;
-				sched_rt_ratio_enqueue(rt_rq);
-			}
-		}
-
-		rq->rt_throttled = 0;
+	u64 period = sched_rt_period_ns(rt_rq);
+	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+	if (rt_rq->rt_throttled) {
+		rt_rq->rt_throttled = 0;
+		sched_rt_ratio_enqueue(rt_rq);
 	}
 }
 
@@ -238,11 +244,6 @@ static void update_curr_rt(struct rq *rq
 	cpuacct_charge(curr, delta_exec);
 
 	rt_rq->rt_time += delta_exec;
-	/*
-	 * might make it a tad more accurate:
-	 *
-	 * update_sched_rt_period(rq);
-	 */
 	if (sched_rt_ratio_exceeded(rt_rq))
 		resched_task(curr);
 }
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -311,7 +311,7 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_rt_period_ms",
+		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,7 +153,6 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
-	unsigned long rt_jiffies;
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now, delta;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -217,10 +216,6 @@ void tick_nohz_stop_sched_tick(void)
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	delta_jiffies = next_jiffies - last_jiffies;
 
-	rt_jiffies = rt_needs_cpu(cpu);
-	if (rt_jiffies && rt_jiffies < delta_jiffies)
-		delta_jiffies = rt_jiffies;
-
 	if (rcu_needs_cpu(cpu))
 		delta_jiffies = 1;
 	/*

--



* [PATCH 08/11] sched: rt-group: deal with PI
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (6 preceding siblings ...)
  2008-01-06 16:11 ` [PATCH 07/11] sched: rt-group: per group period Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 09/11] sched: rt-group: dynamic period ticks Peter Zijlstra
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-pi.patch --]
[-- Type: text/plain, Size: 3892 bytes --]

Steven mentioned the fun case where a lock-holding task gets throttled.

Simple fix: allow groups that have boosted tasks to run anyway.
This is of course not quite correct. It needs more tricks.
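
The effective rule, restated as a standalone model (not kernel code): a group
only counts as throttled while it has no priority-boosted tasks, so a group
that contains a PI-boosted lock owner keeps running even when over its budget.

  #include <stdio.h>

  struct rt_rq_model {
          int rt_throttled;
          unsigned long rt_nr_boosted;
  };

  /* mirrors the rt_rq_throttled() helper the patch introduces */
  static int rt_rq_throttled(const struct rt_rq_model *rt_rq)
  {
          return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
  }

  int main(void)
  {
          struct rt_rq_model over_budget  = { 1, 0 };
          struct rt_rq_model holding_lock = { 1, 1 };   /* boosted task inside */

          printf("%d %d\n",
                 rt_rq_throttled(&over_budget),   /* 1: stays off the cpu */
                 rt_rq_throttled(&holding_lock)); /* 0: keeps running */
          return 0;
  }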

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c    |    3 +++
 kernel/sched_rt.c |   48 ++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 43 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -376,6 +376,8 @@ struct rt_rq {
 	struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	unsigned long rt_nr_boosted;
+
 	struct rq *rq;
 	struct list_head leaf_rt_rq_list;
 	struct task_group *tg;
@@ -7279,6 +7281,7 @@ static void init_rt_rq(struct rt_rq *rt_
 	rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	rt_rq->rt_nr_boosted = 0;
 	rt_rq->rq = rq;
 #endif
 }
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -121,6 +121,23 @@ static void sched_rt_ratio_dequeue(struc
 		dequeue_rt_entity(rt_se);
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+	struct rt_rq *rt_rq = group_rt_rq(rt_se);
+	struct task_struct *p;
+
+	if (rt_rq)
+		return !!rt_rq->rt_nr_boosted;
+
+	p = rt_task_of(rt_se);
+	return p->prio != p->normal_prio;
+}
+
 #else
 
 static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
@@ -170,6 +187,10 @@ static inline void sched_rt_ratio_dequeu
 {
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+	return rt_rq->rt_throttled;
+}
 #endif
 
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -190,21 +211,22 @@ static int sched_rt_ratio_exceeded(struc
 	u64 period, ratio;
 
 	if (rt_ratio == SCHED_RT_FRAC)
-		return 0;
+		goto out;
 
 	if (rt_rq->rt_throttled)
-		return 1;
+		goto out;
 
 	period = sched_rt_period_ns(rt_rq);
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
 		rt_rq->rt_throttled = 1;
-		sched_rt_ratio_dequeue(rt_rq);
-		return 1;
+		if (rt_rq_throttled(rt_rq))
+			sched_rt_ratio_dequeue(rt_rq);
 	}
 
-	return 0;
+out:
+	return rt_rq_throttled(rt_rq);
 }
 
 static void update_sched_rt_period(struct rt_rq *rt_rq)
@@ -265,6 +287,10 @@ void inc_rt_tasks(struct sched_rt_entity
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted++;
+#endif
 }
 
 static inline
@@ -295,6 +321,12 @@ void dec_rt_tasks(struct sched_rt_entity
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted--;
+
+	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +335,7 @@ static void enqueue_rt_entity(struct sch
 	struct rt_prio_array *array = &rt_rq->active;
 	struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	if (group_rq && group_rq->rt_throttled)
+	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
 	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -476,7 +508,7 @@ static struct sched_rt_entity *pick_next
 	struct list_head *queue;
 	int idx;
 
-	if (sched_rt_ratio_exceeded(rt_rq))
+	if (rt_rq_throttled(rt_rq))
 		goto out;
 
 	idx = sched_find_first_bit(array->bitmap);
@@ -500,7 +532,7 @@ static struct task_struct *pick_next_tas
 	if (unlikely(!rt_rq->rt_nr_running))
 		return NULL;
 
-	if (sched_rt_ratio_exceeded(rt_rq))
+	if (rt_rq_throttled(rt_rq))
 		return NULL;
 
 	do {

--



* [PATCH 09/11] sched: rt-group: dynamic period ticks
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (7 preceding siblings ...)
  2008-01-06 16:11 ` [PATCH 08/11] sched: rt-group: deal with PI Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 10/11] sched: rt-group: EDF Peter Zijlstra
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-rq-dyn.patch --]
[-- Type: text/plain, Size: 7384 bytes --]

Disable the period updates for inactive groups.
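
In outline (a standalone sketch; the group_rt_rq() details are left out), the
per-rt_rq period timer is now armed when the first task is enqueued and
cancelled when the last one leaves, so idle groups cause no period ticks.

  #include <stdio.h>

  struct rt_rq_model {
          unsigned long rt_nr_running;
          int timer_active;
  };

  static void period_start(struct rt_rq_model *rq) { rq->timer_active = 1; }
  static void period_stop(struct rt_rq_model *rq)  { rq->timer_active = 0; }

  static void inc_rt_tasks(struct rt_rq_model *rq)
  {
          if (!rq->rt_nr_running)
                  period_start(rq);     /* first task: arm the period timer */
          rq->rt_nr_running++;
  }

  static void dec_rt_tasks(struct rt_rq_model *rq)
  {
          rq->rt_nr_running--;
          if (!rq->rt_nr_running)
                  period_stop(rq);      /* queue idle: no more period ticks */
  }

  int main(void)
  {
          struct rt_rq_model rq = { 0, 0 };

          inc_rt_tasks(&rq);
          printf("running=%lu timer=%d\n", rq.rt_nr_running, rq.timer_active);
          dec_rt_tasks(&rq);
          printf("running=%lu timer=%d\n", rq.rt_nr_running, rq.timer_active);
          return 0;
  }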

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/sched.c    |  158 ------------------------------------------------------
 kernel/sched_rt.c |   54 ++++++++++++++++++
 2 files changed, 53 insertions(+), 159 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -5277,158 +5277,6 @@ static inline void sched_init_granularit
 	sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
-static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
-{
-	struct rt_rq *rt_rq =
-		container_of(timer, struct rt_rq, rt_period_timer);
-	struct rq *rq = rq_of_rt_rq(rt_rq);
-	ktime_t now = ktime_get();
-
-	WARN_ON(smp_processor_id() != cpu_of(rq));
-	WARN_ON(!in_irq());
-
-	spin_lock(&rq->lock);
-	update_sched_rt_period(rt_rq);
-	spin_unlock(&rq->lock);
-
-	hrtimer_forward(timer, now, sched_rt_period(rt_rq));
-	return HRTIMER_RESTART;
-}
-
-static void sched_rt_period_start(struct rt_rq *rt_rq)
-{
-	ktime_t period = sched_rt_period(rt_rq);
-
-	WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
-
-	for (;;) {
-		ktime_t now = ktime_get();
-		hrtimer_forward(&rt_rq->rt_period_timer, now, period);
-		hrtimer_start(&rt_rq->rt_period_timer,
-				rt_rq->rt_period_timer.expires,
-				HRTIMER_MODE_ABS);
-		if (hrtimer_active(&rt_rq->rt_period_timer))
-			break;
-	}
-}
-
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
-static void sched_rt_period_stop(struct rt_rq *rt_rq)
-{
-	hrtimer_cancel(&rt_rq->rt_period_timer);
-}
-#endif
-
-static void sched_rt_period_start_cpu(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	struct rt_rq *rt_rq;
-
-	for_each_leaf_rt_rq(rt_rq, rq)
-		sched_rt_period_start(rt_rq);
-}
-
-#ifdef CONFIG_SMP
-static void sched_rt_period_stop_cpu(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	struct rt_rq *rt_rq;
-
-	for_each_leaf_rt_rq(rt_rq, rq)
-		sched_rt_period_stop(rt_rq);
-}
-
-static int sched_rt_period_hotplug(struct notifier_block *nfb,
-		unsigned long action, void *hcpu)
-{
-	int cpu = (unsigned long)hcpu;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		sched_rt_period_start_cpu(cpu);
-		return NOTIFY_OK;
-
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		sched_rt_period_stop_cpu(cpu);
-		return NOTIFY_OK;
-
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		return NOTIFY_OK;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
-	return NOTIFY_OK;
-}
-
-static void __init __sched_rt_period_init(void *arg)
-{
-	int cpu = smp_processor_id();
-	sched_rt_period_start_cpu(cpu);
-}
-
-static void __init sched_rt_period_init(void)
-{
-	on_each_cpu(__sched_rt_period_init, NULL, 0, 1);
-	hotcpu_notifier(sched_rt_period_hotplug, 0);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void __sched_rt_period_init_tg(void *arg)
-{
-	struct task_group *tg = arg;
-	int cpu = smp_processor_id();
-
-	sched_rt_period_start(tg->rt_rq[cpu]);
-}
-
-static void sched_rt_period_init_tg(struct task_group *tg)
-{
-	on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1);
-}
-
-static void __sched_rt_period_destroy_tg(void *arg)
-{
-	struct task_group *tg = arg;
-	int cpu = smp_processor_id();
-
-	sched_rt_period_stop(tg->rt_rq[cpu]);
-}
-
-static void sched_rt_period_destroy_tg(struct task_group *tg)
-{
-	on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
-}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#else /* CONFIG_SMP */
-static void __init sched_rt_period_init(void)
-{
-	sched_rt_period_start_cpu(0);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void sched_rt_period_init_tg(struct task_group *tg)
-{
-	sched_rt_period_start(tg->rt_rq[0]);
-}
-
-static void sched_rt_period_destroy_tg(struct task_group *tg)
-{
-	sched_rt_period_stop(tg->rt_rq[0]);
-}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#endif /* CONFIG_SMP */
-
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -7210,7 +7058,6 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
-	sched_rt_period_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (nr_cpu_ids == 1)
@@ -7231,7 +7078,6 @@ void __init sched_init_smp(void)
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
-	sched_rt_period_init();
 }
 #endif /* CONFIG_SMP */
 
@@ -7788,8 +7634,6 @@ struct task_group *sched_create_group(vo
 	list_add_rcu(&tg->list, &task_groups);
 	unlock_task_group_list();
 
-	sched_rt_period_init_tg(tg);
-
 	return tg;
 
 err:
@@ -7811,8 +7655,6 @@ void sched_destroy_group(struct task_gro
 	struct rt_rq *rt_rq = NULL;
 	int i;
 
-	sched_rt_period_destroy_tg(tg);
-
 	lock_task_group_list();
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -221,8 +221,10 @@ static int sched_rt_ratio_exceeded(struc
 
 	if (rt_rq->rt_time > ratio) {
 		rt_rq->rt_throttled = 1;
-		if (rt_rq_throttled(rt_rq))
+		if (rt_rq_throttled(rt_rq)) {
+			WARN_ON(!hrtimer_active(&rt_rq->rt_period_timer));
 			sched_rt_ratio_dequeue(rt_rq);
+		}
 	}
 
 out:
@@ -242,6 +244,52 @@ static void update_sched_rt_period(struc
 	}
 }
 
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+	struct rt_rq *rt_rq =
+		container_of(timer, struct rt_rq, rt_period_timer);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
+	ktime_t now = ktime_get();
+
+	WARN_ON(smp_processor_id() != cpu_of(rq));
+	WARN_ON(!in_irq());
+
+	spin_lock(&rq->lock);
+	update_sched_rt_period(rt_rq);
+	spin_unlock(&rq->lock);
+
+	hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+
+	return HRTIMER_RESTART;
+}
+
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+	ktime_t period;
+
+	WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+	if (hrtimer_active(&rt_rq->rt_period_timer))
+		return;
+
+	period = sched_rt_period(rt_rq);
+
+	for (;;) {
+		ktime_t now = ktime_get();
+		hrtimer_forward(&rt_rq->rt_period_timer, now, period);
+		hrtimer_start(&rt_rq->rt_period_timer,
+				rt_rq->rt_period_timer.expires,
+				HRTIMER_MODE_ABS);
+		if (hrtimer_active(&rt_rq->rt_period_timer))
+			break;
+	}
+}
+
+static void sched_rt_period_stop(struct rt_rq *rt_rq)
+{
+	hrtimer_cancel(&rt_rq->rt_period_timer);
+}
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -274,6 +322,8 @@ static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+	if (!rt_rq->rt_nr_running && !group_rt_rq(rt_se))
+		sched_rt_period_start(rt_rq);
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
 	if (rt_se_prio(rt_se) < rt_rq->highest_prio)
@@ -299,6 +349,8 @@ void dec_rt_tasks(struct sched_rt_entity
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running--;
+	if (!rt_rq->rt_nr_running && !group_rt_rq(rt_se))
+		sched_rt_period_stop(rt_rq);
 #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
 	if (rt_rq->rt_nr_running) {
 		struct rt_prio_array *array;

--



* [PATCH 10/11] sched: rt-group: EDF
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (8 preceding siblings ...)
  2008-01-06 16:11 ` [PATCH 09/11] sched: rt-group: dynamic period ticks Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-06 16:11 ` [PATCH 11/11] sched: rt-group: interface Peter Zijlstra
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-edf.patch --]
[-- Type: text/plain, Size: 6548 bytes --]

Use a simple Earliest Deadline First (EDF) implementation to schedule the
realtime groups.
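
For the top-level rt runqueue the pick becomes plain EDF (a standalone model
using a linear scan where the patch uses an rbtree; the deadline of a group is
the expiry of its period timer, and the example numbers are invented):

  #include <stdio.h>
  #include <stdint.h>

  struct group {
          const char *name;
          uint64_t deadline_ns;   /* = expiry of the group's rt period timer */
  };

  /* EDF: among runnable groups, run the one whose deadline is nearest */
  static const struct group *pick_edf(const struct group *g, int n)
  {
          const struct group *best = NULL;
          int i;

          for (i = 0; i < n; i++)
                  if (!best || g[i].deadline_ns < best->deadline_ns)
                          best = &g[i];
          return best;
  }

  int main(void)
  {
          struct group groups[] = {
                  { "audio", 2000000 },   /* 2ms of its period left  */
                  { "video", 8000000 },   /* 8ms of its period left  */
                  { "batch", 50000000 },  /* 50ms of its period left */
          };

          printf("run: %s\n", pick_edf(groups, 3)->name);   /* -> audio */
          return 0;
  }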

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    1 
 kernel/sched.c        |   13 +++++
 kernel/sched_rt.c     |  115 +++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 124 insertions(+), 5 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -942,6 +942,7 @@ struct sched_rt_entity {
 	int nr_cpus_allowed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rb_node		run_node;
 	struct sched_rt_entity	*parent;
 	/* rq on which this entity is (to be) queued: */
 	struct rt_rq		*rt_rq;
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -360,6 +360,11 @@ struct cfs_rq {
 #endif
 };
 
+enum rt_rq_type {
+	RT_RQ_PRIO,
+	RT_RQ_EDF,
+};
+
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
 	struct rt_prio_array active;
@@ -376,6 +381,10 @@ struct rt_rq {
 	struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	enum rt_rq_type rt_rq_type;
+	struct rb_root deadlines;
+	struct rb_node *rb_leftmost;
+
 	unsigned long rt_nr_boosted;
 
 	struct rq *rq;
@@ -7127,6 +7136,9 @@ static void init_rt_rq(struct rt_rq *rt_
 	rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	rt_rq->rt_rq_type = RT_RQ_PRIO;
+	rt_rq->deadlines = RB_ROOT;
+	rt_rq->rb_leftmost = NULL;
 	rt_rq->rt_nr_boosted = 0;
 	rt_rq->rq = rq;
 #endif
@@ -7196,6 +7208,7 @@ void __init sched_init(void)
 				&per_cpu(init_cfs_rq, i),
 				&per_cpu(init_sched_entity, i), i, 1);
 
+		rq->rt.rt_rq_type = RT_RQ_EDF;
 		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
 		init_task_group.rt_period =
 			ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -138,6 +138,84 @@ static int rt_se_boosted(struct sched_rt
 	return p->prio != p->normal_prio;
 }
 
+static inline u64 rt_deadline(struct sched_rt_entity *rt_se)
+{
+	struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+	BUG_ON(!group_rq);
+	return ktime_to_ns(group_rq->rt_period_timer.expires);
+}
+
+static void enqueue_rt_deadline(struct sched_rt_entity *rt_se)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+	struct rb_node **link;
+	struct rb_node *parent;
+	struct sched_rt_entity *entry;
+	u64 deadline;
+	int leftmost = 1;
+
+	if (rt_rq->rt_rq_type != RT_RQ_EDF)
+		return;
+
+	link = &rt_rq->deadlines.rb_node;
+	parent = NULL;
+	deadline = rt_deadline(rt_se);
+
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct sched_rt_entity, run_node);
+
+		if (deadline < rt_deadline(entry)) {
+			link = &parent->rb_left;
+		} else {
+			link = &parent->rb_right;
+			leftmost = 0;
+		}
+	}
+
+	if (leftmost)
+		rt_rq->rb_leftmost = &rt_se->run_node;
+
+	rb_link_node(&rt_se->run_node, parent, link);
+	rb_insert_color(&rt_se->run_node, &rt_rq->deadlines);
+}
+
+static void dequeue_rt_deadline(struct sched_rt_entity *rt_se)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+	if (rt_rq->rt_rq_type != RT_RQ_EDF)
+		return;
+
+	if (rt_rq->rb_leftmost == &rt_se->run_node)
+		rt_rq->rb_leftmost = rb_next(&rt_se->run_node);
+
+	rb_erase(&rt_se->run_node, &rt_rq->deadlines);
+}
+
+static void requeue_rt_deadline(struct rt_rq *rt_rq)
+{
+	struct sched_rt_entity *rt_se = rt_rq->rt_se;
+
+	BUG_ON(!rt_se);
+	if (on_rt_rq(rt_se)) {
+		dequeue_rt_deadline(rt_se);
+		enqueue_rt_deadline(rt_se);
+	}
+}
+
+static struct sched_rt_entity *next_rt_deadline(struct rt_rq *rt_rq)
+{
+	if (rt_rq->rt_rq_type != RT_RQ_EDF)
+		return NULL;
+
+	if (!rt_rq->rb_leftmost)
+		return NULL;
+
+	return rb_entry(rt_rq->rb_leftmost, struct sched_rt_entity, run_node);
+}
+
 #else
 
 static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
@@ -191,6 +269,23 @@ static inline int rt_rq_throttled(struct
 {
 	return rt_rq->rt_throttled;
 }
+
+static inline void enqueue_rt_deadline(struct sched_rt_entity *rt_se)
+{
+}
+
+static inline void dequeue_rt_deadline(struct sched_rt_entity *rt_se)
+{
+}
+
+static inline void requeue_rt_deadline(struct rt_rq *rt_rq)
+{
+}
+
+static inline struct sched_rt_entity *next_rt_deadline(struct rt_rq *rt_rq)
+{
+	return NULL;
+}
 #endif
 
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -254,12 +349,13 @@ static enum hrtimer_restart sched_rt_per
 	WARN_ON(smp_processor_id() != cpu_of(rq));
 	WARN_ON(!in_irq());
 
+	hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+
 	spin_lock(&rq->lock);
+	requeue_rt_deadline(rt_rq);
 	update_sched_rt_period(rt_rq);
 	spin_unlock(&rq->lock);
 
-	hrtimer_forward(timer, now, sched_rt_period(rt_rq));
-
 	return HRTIMER_RESTART;
 }
 
@@ -283,6 +379,8 @@ static void sched_rt_period_start(struct
 		if (hrtimer_active(&rt_rq->rt_period_timer))
 			break;
 	}
+
+	requeue_rt_deadline(rt_rq);
 }
 
 static void sched_rt_period_stop(struct rt_rq *rt_rq)
@@ -393,6 +491,8 @@ static void enqueue_rt_entity(struct sch
 	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
 	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
+	enqueue_rt_deadline(rt_se);
+
 	inc_rt_tasks(rt_se, rt_rq);
 }
 
@@ -405,6 +505,8 @@ static void dequeue_rt_entity(struct sch
 	if (list_empty(array->queue + rt_se_prio(rt_se)))
 		__clear_bit(rt_se_prio(rt_se), array->bitmap);
 
+	dequeue_rt_deadline(rt_se);
+
 	dec_rt_tasks(rt_se, rt_rq);
 }
 
@@ -552,8 +654,7 @@ static void check_preempt_curr_rt(struct
 		resched_task(rq->curr);
 }
 
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
-						   struct rt_rq *rt_rq)
+static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
 {
 	struct rt_prio_array *array = &rt_rq->active;
 	struct sched_rt_entity *next = NULL;
@@ -563,6 +664,10 @@ static struct sched_rt_entity *pick_next
 	if (rt_rq_throttled(rt_rq))
 		goto out;
 
+	next = next_rt_deadline(rt_rq);
+	if (next)
+		goto out;
+
 	idx = sched_find_first_bit(array->bitmap);
 	BUG_ON(idx >= MAX_RT_PRIO);
 
@@ -588,7 +693,7 @@ static struct task_struct *pick_next_tas
 		return NULL;
 
 	do {
-		rt_se = pick_next_rt_entity(rq, rt_rq);
+		rt_se = pick_next_rt_entity(rt_rq);
 		if (unlikely(!rt_se))
 			goto retry;
 		rt_rq = group_rt_rq(rt_se);

--


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 11/11] sched: rt-group: interface
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (9 preceding siblings ...)
  2008-01-06 16:11 ` [PATCH 10/11] sched: rt-group: EDF Peter Zijlstra
@ 2008-01-06 16:11 ` Peter Zijlstra
  2008-01-07 10:51 ` [PATCH 00/11] another rt group sched update Peter Zijlstra
  2008-01-07 11:17 ` [PATCH 00/11] another rt group sched update Ingo Molnar
  12 siblings, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-06 16:11 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: sched-rt-group-interface.patch --]
[-- Type: text/plain, Size: 11501 bytes --]

Change the rt_ratio interface to rt_runtime_us, to match rt_period_us.
This avoids picking a granularity for the ratio.
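
[ For illustration only, not part of the patch: a small user-space
  sketch of how the old fixed-point ratio relates to the new runtime
  value, using the defaults visible in the diff below (1s period,
  ratio 62259/2^16 vs. a 950000us runtime). ]

	#include <stdio.h>

	int main(void)
	{
		unsigned long long period_us = 1000000;	/* sysctl_sched_rt_period */
		unsigned long long old_ratio = 62259;	/* old sysctl_sched_rt_ratio */
		unsigned long long runtime_us;

		/* old interface: runtime = period * ratio / 2^16 */
		runtime_us = (period_us * old_ratio) >> 16;

		/* prints 949996, i.e. ~95% of the period; the new default
		 * names this value directly: 950000us */
		printf("%llu\n", runtime_us);
		return 0;
	}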

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    8 +++
 kernel/sched.c        |  116 ++++++++++++++++++++++++++++++++++----------------
 kernel/sched_rt.c     |   42 +++++++-----------
 kernel/sysctl.c       |    4 -
 4 files changed, 106 insertions(+), 64 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1518,7 +1518,7 @@ extern unsigned int sysctl_sched_feature
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_rt_period;
-extern unsigned int sysctl_sched_rt_ratio;
+extern unsigned int sysctl_sched_rt_runtime;
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 extern unsigned int sysctl_sched_min_bal_int_shares;
 extern unsigned int sysctl_sched_max_bal_int_shares;
@@ -2014,6 +2014,12 @@ extern void sched_destroy_group(struct t
 extern void sched_move_task(struct task_struct *tsk);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 extern unsigned long sched_group_shares(struct task_group *tg);
+extern int sched_group_set_rt_runtime(struct task_group *tg,
+				      unsigned long rt_runtime_us);
+extern unsigned long sched_group_rt_runtime(struct task_group *tg);
+extern int sched_group_set_rt_period(struct task_group *tg,
+				     unsigned long rt_runtime_us);
+extern unsigned long sched_group_rt_period(struct task_group *tg);
 
 #endif
 
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -176,7 +176,7 @@ struct task_group {
 	struct sched_rt_entity **rt_se;
 	struct rt_rq **rt_rq;
 
-	unsigned int rt_ratio;
+	u64 rt_runtime;
 	ktime_t rt_period;
 
 	/*
@@ -646,19 +646,16 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in us.
+ * period over which we measure rt task cpu usage in us.
  * default: 1s
  */
 const_debug unsigned int sysctl_sched_rt_period = 1000000;
 
-#define SCHED_RT_FRAC_SHIFT	16
-#define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)
-
 /*
- * ratio of time -rt tasks may consume.
- * default: 95%
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+const_debug unsigned int sysctl_sched_rt_runtime = 950000;
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -7209,7 +7206,8 @@ void __init sched_init(void)
 				&per_cpu(init_sched_entity, i), i, 1);
 
 		rq->rt.rt_rq_type = RT_RQ_EDF;
-		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+		init_task_group.rt_runtime =
+			sysctl_sched_rt_runtime * NSEC_PER_USEC;
 		init_task_group.rt_period =
 			ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
@@ -7606,7 +7604,7 @@ struct task_group *sched_create_group(vo
 		goto err;
 
 	tg->shares = NICE_0_LOAD;
-	tg->rt_ratio = 0; /* XXX */
+	tg->rt_runtime = 0; /* XXX */
 	tg->rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 
 	for_each_possible_cpu(i) {
@@ -7801,41 +7799,87 @@ unsigned long sched_group_shares(struct 
 }
 
 /*
- * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
+ * Ensure that the real time constraints are schedulable.
  */
-int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+static unsigned long to_ratio(u64 period, u64 runtime)
+{
+	u64 r = runtime * (1ULL << 16);
+	do_div(r, period);
+	return r;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
 	struct task_group *tgi;
 	unsigned long total = 0;
+	unsigned long global_ratio =
+		to_ratio(sysctl_sched_rt_period, sysctl_sched_rt_runtime);
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &task_groups, list)
-		total += tgi->rt_ratio;
+	list_for_each_entry_rcu(tgi, &task_groups, list) {
+		if (tgi == tg)
+			continue;
+
+		total += to_ratio(ktime_to_ns(tgi->rt_period), tgi->rt_runtime);
+	}
 	rcu_read_unlock();
 
-	if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
-		return -EINVAL;
+	return total + to_ratio(period, runtime) < global_ratio;
+}
 
-	tg->rt_ratio = rt_ratio;
-	return 0;
+int sched_group_set_rt_runtime(struct task_group *tg,
+			       unsigned long rt_runtime_us)
+{
+	u64 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+	int err = 0;
+
+	mutex_lock(&rt_constraints_mutex);
+	if (!__rt_schedulable(tg, ktime_to_ns(tg->rt_period), rt_runtime)) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	tg->rt_runtime = rt_runtime;
+ unlock:
+	mutex_unlock(&rt_constraints_mutex);
+
+	return err;
 }
 
-unsigned long sched_group_rt_ratio(struct task_group *tg)
+unsigned long sched_group_rt_runtime(struct task_group *tg)
 {
-	return tg->rt_ratio;
+	u64 rt_runtime_us = tg->rt_runtime;
+
+	do_div(rt_runtime_us, NSEC_PER_USEC);
+	return rt_runtime_us;
 }
 
-int sched_group_set_rt_period(struct task_group *tg, unsigned long rt_period)
+int sched_group_set_rt_period(struct task_group *tg, unsigned long rt_period_us)
 {
-	tg->rt_period = ns_to_ktime((u64)rt_period * NSEC_PER_USEC);
-	return 0;
+	u64 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+	int err = 0;
+
+	mutex_lock(&rt_constraints_mutex);
+	if (!__rt_schedulable(tg, rt_period, tg->rt_runtime)) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	tg->rt_period = ns_to_ktime(rt_period);
+ unlock:
+	mutex_unlock(&rt_constraints_mutex);
+
+	return err;
 }
 
 unsigned long sched_group_rt_period(struct task_group *tg)
 {
-	u64 ns = ktime_to_ns(tg->rt_period);
-	do_div(ns, NSEC_PER_USEC);
-	return ns;
+	u64 rt_period_us = ktime_to_ns(tg->rt_period);
+
+	do_div(rt_period_us, NSEC_PER_USEC);
+	return rt_period_us;
 }
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
@@ -7913,17 +7957,15 @@ static u64 cpu_shares_read_uint(struct c
 	return (u64) tg->shares;
 }
 
-static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
-		u64 rt_ratio_val)
+static int cpu_rt_runtime_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+		u64 rt_runtime_val)
 {
-	return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
+	return sched_group_set_rt_runtime(cgroup_tg(cgrp), rt_runtime_val);
 }
 
-static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_rt_runtime_read_uint(struct cgroup *cgrp, struct cftype *cft)
 {
-	struct task_group *tg = cgroup_tg(cgrp);
-
-	return (u64) tg->rt_ratio;
+	return sched_group_rt_runtime(cgroup_tg(cgrp));
 }
 
 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
@@ -7934,7 +7976,7 @@ static int cpu_rt_period_write_uint(stru
 
 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
 {
-	return (u64) sched_group_rt_period(cgroup_tg(cgrp));
+	return sched_group_rt_period(cgroup_tg(cgrp));
 }
 
 static struct cftype cpu_files[] = {
@@ -7944,9 +7986,9 @@ static struct cftype cpu_files[] = {
 		.write_uint = cpu_shares_write_uint,
 	},
 	{
-		.name = "rt_ratio",
-		.read_uint = cpu_rt_ratio_read_uint,
-		.write_uint = cpu_rt_ratio_write_uint,
+		.name = "rt_runtime_us",
+		.read_uint = cpu_rt_runtime_read_uint,
+		.write_uint = cpu_rt_runtime_write_uint,
 	},
 	{
 		.name = "rt_period_us",
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -57,12 +57,12 @@ static inline int on_rt_rq(struct sched_
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
 	if (!rt_rq->tg)
-		return SCHED_RT_FRAC;
+		return 0;
 
-	return rt_rq->tg->rt_ratio;
+	return rt_rq->tg->rt_runtime;
 }
 
 static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
@@ -100,7 +100,7 @@ static inline struct rt_rq *group_rt_rq(
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 
-static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
@@ -113,7 +113,7 @@ static void sched_rt_ratio_enqueue(struc
 	}
 }
 
-static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
@@ -218,9 +218,9 @@ static struct sched_rt_entity *next_rt_d
 
 #else
 
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
-	return sysctl_sched_rt_ratio;
+	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
 static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
@@ -257,11 +257,11 @@ static inline struct rt_rq *group_rt_rq(
 	return NULL;
 }
 
-static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
 }
 
-static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
 }
 
@@ -300,25 +300,21 @@ static inline int rt_se_prio(struct sche
 	return rt_task_of(rt_se)->prio;
 }
 
-static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 {
-	unsigned int rt_ratio = sched_rt_ratio(rt_rq);
-	u64 period, ratio;
+	u64 runtime = sched_rt_runtime(rt_rq);
 
-	if (rt_ratio == SCHED_RT_FRAC)
+	if (!runtime)
 		goto out;
 
 	if (rt_rq->rt_throttled)
 		goto out;
 
-	period = sched_rt_period_ns(rt_rq);
-	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-	if (rt_rq->rt_time > ratio) {
+	if (rt_rq->rt_time > runtime) {
 		rt_rq->rt_throttled = 1;
 		if (rt_rq_throttled(rt_rq)) {
 			WARN_ON(!hrtimer_active(&rt_rq->rt_period_timer));
-			sched_rt_ratio_dequeue(rt_rq);
+			sched_rt_rq_dequeue(rt_rq);
 		}
 	}
 
@@ -328,14 +324,12 @@ out:
 
 static void update_sched_rt_period(struct rt_rq *rt_rq)
 {
-	u64 period = sched_rt_period_ns(rt_rq);
-	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+	u64 runtime = sched_rt_runtime(rt_rq);
 
-	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+	rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
 	if (rt_rq->rt_throttled) {
 		rt_rq->rt_throttled = 0;
-		sched_rt_ratio_enqueue(rt_rq);
+		sched_rt_rq_enqueue(rt_rq);
 	}
 }
 
@@ -412,7 +406,7 @@ static void update_curr_rt(struct rq *rq
 	cpuacct_charge(curr, delta_exec);
 
 	rt_rq->rt_time += delta_exec;
-	if (sched_rt_ratio_exceeded(rt_rq))
+	if (sched_rt_runtime_exceeded(rt_rq))
 		resched_task(curr);
 }
 
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -319,8 +319,8 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_rt_ratio",
-		.data		= &sysctl_sched_rt_ratio,
+		.procname	= "sched_rt_runtime_us",
+		.data		= &sysctl_sched_rt_runtime,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,

--


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 00/11] another rt group sched update
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (10 preceding siblings ...)
  2008-01-06 16:11 ` [PATCH 11/11] sched: rt-group: interface Peter Zijlstra
@ 2008-01-07 10:51 ` Peter Zijlstra
  2008-01-07 11:24   ` Peter Zijlstra
  2008-01-07 12:23   ` Srivatsa Vaddagiri
  2008-01-07 11:17 ` [PATCH 00/11] another rt group sched update Ingo Molnar
  12 siblings, 2 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-07 10:51 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner

[-- Attachment #1: Type: text/plain, Size: 1037 bytes --]


The list of open points and issues for this work:

 - review/testing

 - handle the PI case better

   The only thing I can come up with currently is to basically have two
   priority arrays one for boosted and one for non boosted tasks, and
   normally run the highest of either array, but in the case of a
   throttled group, only pick from the boosted array.

   Not sure I like that for its space overhead, Steven?

 - I occasionally see a weird lockup on iterating the task_groups list
   on smp machines. - I failed to see anything wrong, but hey, this
   stack of used brown paper bags is steadily growing.

 - figure out what to do for UID based group scheduling, the current
   implementation leaves it impossible for !root users to execute
   real time tasks by setting rt_runtime_us to 0, and it has no way
   to change it.

   Srivatsa, what happened to the per uid weight patches?, Perhaps we
   can extend that interface to allow changing this.

 - I guess documentation needs to be written ;-)

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 00/11] another rt group sched update
  2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
                   ` (11 preceding siblings ...)
  2008-01-07 10:51 ` [PATCH 00/11] another rt group sched update Peter Zijlstra
@ 2008-01-07 11:17 ` Ingo Molnar
  12 siblings, 0 replies; 31+ messages in thread
From: Ingo Molnar @ 2008-01-07 11:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> this time compile tested on all 16 combinations of:
> 
>   CONFIG_SMP
>   CONFIG_FAIR_GROUP_SCHED
>   CONFIG_HIGH_RES_TIMERS
>   CONFIG_NO_HZ
> 
> ran some but not all combinations

thanks, applied. This is a really big step forwards in terms of making 
RT task CPU usage more flexible and more manageable.

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 00/11] another rt group sched update
  2008-01-07 10:51 ` [PATCH 00/11] another rt group sched update Peter Zijlstra
@ 2008-01-07 11:24   ` Peter Zijlstra
  2008-01-07 12:23   ` Srivatsa Vaddagiri
  1 sibling, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-07 11:24 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner


On Mon, 2008-01-07 at 11:51 +0100, Peter Zijlstra wrote:

>  - I occasionally see a weird lockup on iterating the task_groups list
>    on smp machines. - I failed to see anything wrong, but hey, this
>    stack of used brown paper bags is steadily growing.

D'oh

---

Don't add a task_group to the task_groups list for each cpu, there is
only a single task_group.


Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7175,6 +7175,10 @@ void __init sched_init(void)
 	init_defrootdomain();
 #endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	list_add(&init_task_group.list, &task_groups);
+#endif
+
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -7201,8 +7205,6 @@ void __init sched_init(void)
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
 				&per_cpu(init_sched_rt_entity, i), i, 1);
-
-		list_add(&init_task_group.list, &task_groups);
 #endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)




^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback
  2008-01-06 16:11 ` [PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
@ 2008-01-07 11:56   ` Peter Zijlstra
  2008-01-08 11:16     ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-07 11:56 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner

[-- Attachment #1: Type: text/plain, Size: 2970 bytes --]


On Sun, 2008-01-06 at 17:11 +0100, Peter Zijlstra wrote:
> plain text document attachment (hrtimer-fallback.patch)


I know I should have boot tested more combinations :-(
Please fold this into the patch.


Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/hrtimer.c |   38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===================================================================
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t 
 }
 #endif /* BITS_PER_LONG >= 64 */
 
+/*
+ * Check, whether the timer is on the callback pending list
+ */
+static inline int hrtimer_cb_pending(const struct hrtimer *timer)
+{
+	return timer->state & HRTIMER_STATE_PENDING;
+}
+
+/*
+ * Remove a timer from the callback pending list
+ */
+static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
+{
+	list_del_init(&timer->cb_entry);
+}
+
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -494,29 +510,12 @@ void hres_timers_resume(void)
 }
 
 /*
- * Check, whether the timer is on the callback pending list
- */
-static inline int hrtimer_cb_pending(const struct hrtimer *timer)
-{
-	return timer->state & HRTIMER_STATE_PENDING;
-}
-
-/*
- * Remove a timer from the callback pending list
- */
-static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
-{
-	list_del_init(&timer->cb_entry);
-}
-
-/*
  * Initialize the high resolution related parts of cpu_base
  */
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 {
 	base->expires_next.tv64 = KTIME_MAX;
 	base->hres_active = 0;
-	INIT_LIST_HEAD(&base->cb_pending);
 }
 
 /*
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(str
  */
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
 {
-	INIT_LIST_HEAD(&timer->cb_entry);
 }
 
 /*
@@ -618,8 +616,6 @@ static inline int hrtimer_enqueue_reprog
 {
 	return 0;
 }
-static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
-static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
 static inline int hrtimer_reprogram(struct hrtimer *timer,
@@ -1006,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer,
 		clock_id = CLOCK_MONOTONIC;
 
 	timer->base = &cpu_base->clock_base[clock_id];
+	INIT_LIST_HEAD(&timer->cb_entry);
 	hrtimer_init_timer_hres(timer);
 
 #ifdef CONFIG_TIMER_STATS
@@ -1419,6 +1416,7 @@ static void __devinit init_hrtimers_cpu(
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
 		cpu_base->clock_base[i].cpu_base = cpu_base;
 
+	INIT_LIST_HEAD(&cpu_base->cb_pending);
 	hrtimer_init_hres(cpu_base);
 }
 



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 00/11] another rt group sched update
  2008-01-07 12:23   ` Srivatsa Vaddagiri
@ 2008-01-07 12:12     ` Peter Zijlstra
  2008-01-07 16:57     ` [PATCH 12/11] sched: rt-group: uid-group interface Peter Zijlstra
  1 sibling, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-07 12:12 UTC (permalink / raw)
  To: vatsa
  Cc: LKML, Ingo Molnar, Balbir Singh, dmitry.adamushko,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner

[-- Attachment #1: Type: text/plain, Size: 841 bytes --]


On Mon, 2008-01-07 at 17:53 +0530, Srivatsa Vaddagiri wrote:
> On Mon, Jan 07, 2008 at 11:51:20AM +0100, Peter Zijlstra wrote:
> >  - figure out what to do for UID based group scheduling, the current
> >    implementation leaves it impossible for !root users to execute
> >    real time tasks by setting rt_runtime_us to 0, and it has no way
> >    to change it.
> > 
> >    Srivatsa, what happened to the per uid weight patches?, Perhaps we
> >    can extend that interface to allow changing this.
> 
> Hi Peter,
> 	The sysfs interface for tweaking each user's share should be in
> mainline already (sysfs_create_file() in user_kobject_create()). This
> could be extended for your purpose, hopefully in a straightforward
> manner (you never know that with sysfs :(

Ah, I missed that going in. Thanks, I'll give it a go.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 00/11] another rt group sched update
  2008-01-07 10:51 ` [PATCH 00/11] another rt group sched update Peter Zijlstra
  2008-01-07 11:24   ` Peter Zijlstra
@ 2008-01-07 12:23   ` Srivatsa Vaddagiri
  2008-01-07 12:12     ` Peter Zijlstra
  2008-01-07 16:57     ` [PATCH 12/11] sched: rt-group: uid-group interface Peter Zijlstra
  1 sibling, 2 replies; 31+ messages in thread
From: Srivatsa Vaddagiri @ 2008-01-07 12:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Balbir Singh, dmitry.adamushko,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner

On Mon, Jan 07, 2008 at 11:51:20AM +0100, Peter Zijlstra wrote:
>  - figure out what to do for UID based group scheduling, the current
>    implementation leaves it impossible for !root users to execute
>    real time tasks by setting rt_runtime_us to 0, and it has no way
>    to change it.
> 
>    Srivatsa, what happened to the per uid weight patches?, Perhaps we
>    can extend that interface to allow changing this.

Hi Peter,
	The sysfs interface for tweaking each user's share should be in
mainline already (sysfs_create_file() in user_kobject_create()). This
could be extended for your purpose, hopefully in a straightforward
manner (you never know that with sysfs :(

-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-07 12:23   ` Srivatsa Vaddagiri
  2008-01-07 12:12     ` Peter Zijlstra
@ 2008-01-07 16:57     ` Peter Zijlstra
  2008-01-08 10:33       ` Ingo Molnar
  2008-01-08 10:57       ` Dhaval Giani
  1 sibling, 2 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-07 16:57 UTC (permalink / raw)
  To: vatsa
  Cc: LKML, Ingo Molnar, Balbir Singh, dmitry.adamushko,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner


Subject: sched: rt-group: add uid-group interface

Extend the /sys/kernel/uids/<uid>/ interface to allow setting
the group's rt_period and rt_runtime.
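
[ For illustration only, not part of the patch: a rough user-space
  sketch of the new files in use -- it writes a runtime of 0.5s per
  period for uid 1000's group.  The uid and the value are arbitrary
  examples; error handling is minimal. ]

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/sys/kernel/uids/1000/cpu_rt_runtime_us";
		const char *val = "500000\n";	/* us of runtime per period */
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, val, strlen(val)) < 0)
			perror("write");
		close(fd);
		return 0;
	}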

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    4 +-
 kernel/user.c         |   93 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 84 insertions(+), 13 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -559,7 +559,9 @@ struct user_struct {
 	struct task_group *tg;
 #ifdef CONFIG_SYSFS
 	struct kset kset;
-	struct subsys_attribute user_attr;
+	struct subsys_attribute share_attr;
+	struct subsys_attribute rt_period_attr;
+	struct subsys_attribute rt_runtime_attr;
 	struct work_struct work;
 #endif
 #endif
Index: linux-2.6/kernel/user.c
===================================================================
--- linux-2.6.orig/kernel/user.c
+++ linux-2.6/kernel/user.c
@@ -129,7 +129,7 @@ static inline void uids_mutex_unlock(voi
 }
 
 /* return cpu shares held by the user */
-static ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+static ssize_t cpu_share_show(struct kset *kset, char *buffer)
 {
 	struct user_struct *up = container_of(kset, struct user_struct, kset);
 
@@ -137,8 +137,8 @@ static ssize_t cpu_shares_show(struct ks
 }
 
 /* modify cpu shares held by the user */
-static ssize_t cpu_shares_store(struct kset *kset, const char *buffer,
-				size_t size)
+static ssize_t cpu_share_store(struct kset *kset, const char *buffer,
+			       size_t size)
 {
 	struct user_struct *up = container_of(kset, struct user_struct, kset);
 	unsigned long shares;
@@ -151,12 +151,67 @@ static ssize_t cpu_shares_store(struct k
 	return (rc ? rc : size);
 }
 
-static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
+static ssize_t cpu_rt_period_show(struct kset *kset, char *buffer)
 {
-	sa->attr.name = name;
-	sa->attr.mode = mode;
-	sa->show = cpu_shares_show;
-	sa->store = cpu_shares_store;
+	struct user_struct *up = container_of(kset, struct user_struct, kset);
+
+	return sprintf(buffer, "%lu\n", sched_group_rt_period(up->tg));
+}
+
+static ssize_t cpu_rt_period_store(struct kset *kset, const char *buffer,
+				   size_t size)
+{
+	struct user_struct *up = container_of(kset, struct user_struct, kset);
+	unsigned long rt_period_us;
+	int rc;
+
+	sscanf(buffer, "%lu", &rt_period_us);
+	rc = sched_group_set_rt_period(up->tg, rt_period_us);
+
+	return (rc ?: size);
+}
+
+static ssize_t cpu_rt_runtime_show(struct kset *kset, char *buffer)
+{
+	struct user_struct *up = container_of(kset, struct user_struct, kset);
+
+	return sprintf(buffer, "%lu\n", sched_group_rt_runtime(up->tg));
+}
+
+static ssize_t cpu_rt_runtime_store(struct kset *kset, const char *buffer,
+				    size_t size)
+{
+	struct user_struct *up = container_of(kset, struct user_struct, kset);
+	unsigned long rt_runtime_us;
+	int rc;
+
+	sscanf(buffer, "%lu", &rt_runtime_us);
+	rc = sched_group_set_rt_runtime(up->tg, rt_runtime_us);
+
+	return (rc ?: size);
+}
+
+static void user_attr_init(struct user_struct *up)
+{
+	struct subsys_attribute *sa;
+
+	sa = &up->share_attr;
+	sa->attr.name = "cpu_share";
+	sa->attr.mode = 0644;
+	sa->show = cpu_share_show;
+	sa->store = cpu_share_store;
+
+	sa = &up->rt_period_attr;
+	sa->attr.name = "cpu_rt_period_us";
+	sa->attr.mode = 0644;
+	sa->show = cpu_rt_period_show;
+	sa->store = cpu_rt_period_store;
+
+	sa = &up->rt_runtime_attr;
+	sa->attr.name = "cpu_rt_runtime_us";
+	sa->attr.mode = 0644;
+	sa->show = cpu_rt_runtime_show;
+	sa->store = cpu_rt_runtime_store;
 }
 
 /* Create "/sys/kernel/uids/<uid>" directory and
@@ -172,15 +227,27 @@ static int user_kobject_create(struct us
 	kobj->parent = &uids_kobject;	/* create under /sys/kernel/uids dir */
 	kobject_set_name(kobj, "%d", up->uid);
 	kset_init(kset);
-	user_attr_init(&up->user_attr, "cpu_share", 0644);
+	user_attr_init(up);
 
 	error = kobject_add(kobj);
 	if (error)
 		goto done;
 
-	error = sysfs_create_file(kobj, &up->user_attr.attr);
+	error = sysfs_create_file(kobj, &up->share_attr.attr);
+	if (error)
+		goto error1;
+	error = sysfs_create_file(kobj, &up->rt_period_attr.attr);
 	if (error)
-		kobject_del(kobj);
+		goto error2;
+	error = sysfs_create_file(kobj, &up->rt_runtime_attr.attr);
+	if (error)
+		goto error3;
+
+	if (0) {
+error3:		sysfs_remove_file(kobj, &up->rt_period_attr.attr);
+error2: 	sysfs_remove_file(kobj, &up->share_attr.attr);
+error1: 	kobject_del(kobj);
+	}
 
 	kobject_uevent(kobj, KOBJ_ADD);
 
@@ -238,7 +305,9 @@ static void remove_user_sysfs_dir(struct
 	if (!remove_user)
 		goto done;
 
-	sysfs_remove_file(kobj, &up->user_attr.attr);
+	sysfs_remove_file(kobj, &up->share_attr.attr);
+	sysfs_remove_file(kobj, &up->rt_period_attr.attr);
+	sysfs_remove_file(kobj, &up->rt_runtime_attr.attr);
 	kobject_uevent(kobj, KOBJ_REMOVE);
 	kobject_del(kobj);
 



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-07 16:57     ` [PATCH 12/11] sched: rt-group: uid-group interface Peter Zijlstra
@ 2008-01-08 10:33       ` Ingo Molnar
  2008-01-08 10:57       ` Dhaval Giani
  1 sibling, 0 replies; 31+ messages in thread
From: Ingo Molnar @ 2008-01-08 10:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: vatsa, LKML, Balbir Singh, dmitry.adamushko, Steven Rostedt,
	Gregory Haskins, Thomas Gleixner


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Subject: sched: rt-group: add uid-group interface
> 
> Extend the /sys/kernel/uids/<uid>/ interface to allow setting the 
> group's rt_period and rt_runtime.

thanks, applied.

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-07 16:57     ` [PATCH 12/11] sched: rt-group: uid-group interface Peter Zijlstra
  2008-01-08 10:33       ` Ingo Molnar
@ 2008-01-08 10:57       ` Dhaval Giani
  2008-01-08 11:02         ` Peter Zijlstra
  2008-01-08 23:26         ` Peter Zijlstra
  1 sibling, 2 replies; 31+ messages in thread
From: Dhaval Giani @ 2008-01-08 10:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: vatsa, LKML, Ingo Molnar, Balbir Singh, dmitry.adamushko,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner

On Mon, Jan 07, 2008 at 05:57:42PM +0100, Peter Zijlstra wrote:
> 
> Subject: sched: rt-group: add uid-group interface
> 
> Extend the /sys/kernel/uids/<uid>/ interface to allow setting
> the group's rt_period and rt_runtime.
> 

Hi Peter,

Cool stuff! I will try out these patches and try to give you some
feedback.

One request though, could you please add some documentation to
Documentation/ABI/testing/sysfs-kernel-uids?

Thanks,
-- 
regards,
Dhaval

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-08 10:57       ` Dhaval Giani
@ 2008-01-08 11:02         ` Peter Zijlstra
  2008-01-08 14:31           ` Kay Sievers
  2008-01-08 23:26         ` Peter Zijlstra
  1 sibling, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-08 11:02 UTC (permalink / raw)
  To: Dhaval Giani
  Cc: vatsa, LKML, Ingo Molnar, Balbir Singh, dmitry.adamushko,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner

[-- Attachment #1: Type: text/plain, Size: 643 bytes --]


On Tue, 2008-01-08 at 16:27 +0530, Dhaval Giani wrote:
> On Mon, Jan 07, 2008 at 05:57:42PM +0100, Peter Zijlstra wrote:
> > 
> > Subject: sched: rt-group: add uid-group interface
> > 
> > Extend the /sys/kernel/uids/<uid>/ interface to allow setting
> > the group's rt_period and rt_runtime.
> > 
> 
> Hi Peter,
> 
> Cool stuff! I will try out these patches and try to give you some
> feedback.

Thanks, much appreciated!

> One request though, could you please add some documentation to
> Documentation/ABI/testing/sysfs-kernel-uids?

I already have documentation on the todo list, I'll add this file to
that list :-)

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback
  2008-01-07 11:56   ` Peter Zijlstra
@ 2008-01-08 11:16     ` Ingo Molnar
  0 siblings, 0 replies; 31+ messages in thread
From: Ingo Molnar @ 2008-01-08 11:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Balbir Singh, dmitry.adamushko, Srivatsa Vaddagiri,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner


* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Sun, 2008-01-06 at 17:11 +0100, Peter Zijlstra wrote:
> > plain text document attachment (hrtimer-fallback.patch)
> 
> I know I should have boot tested more combinations :-(
> Please fold this into the patch.

done.

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-08 11:02         ` Peter Zijlstra
@ 2008-01-08 14:31           ` Kay Sievers
  2008-01-08 23:35             ` Peter Zijlstra
  0 siblings, 1 reply; 31+ messages in thread
From: Kay Sievers @ 2008-01-08 14:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dhaval Giani, vatsa, LKML, Ingo Molnar, Balbir Singh,
	dmitry.adamushko, Steven Rostedt, Gregory Haskins,
	Thomas Gleixner

On Jan 8, 2008 12:02 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> On Tue, 2008-01-08 at 16:27 +0530, Dhaval Giani wrote:
> > On Mon, Jan 07, 2008 at 05:57:42PM +0100, Peter Zijlstra wrote:
> > >
> > > Subject: sched: rt-group: add uid-group interface
> > >
> > > Extend the /sys/kernel/uids/<uid>/ interface to allow setting
> > > the group's rt_period and rt_runtime.
> > >
> >
> > Hi Peter,
> >
> > Cool stuff! I will try out these patches and try to give you some
> > feedback.
>
> Thanks, much appreciated!
>
> > One request though, could you please add some documentation to
> > Documentation/ABI/testing/sysfs-kernel-uids?
>
> I already have documentation on the todo list, I'll add this file to
> that list :-)

Care to rebase the patch against -mm, we fixed the mixed-up usage
of ksets and kobjects, and this can not apply anymore:
  http://git.kernel.org/?p=linux/kernel/git/gregkh/patches.git;a=blob;f=driver/struct-user_info-sysfs.patch;hb=HEAD

There is also an attribute group now which makes it much easier to add
new files.

Thanks,
Kay

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-08 10:57       ` Dhaval Giani
  2008-01-08 11:02         ` Peter Zijlstra
@ 2008-01-08 23:26         ` Peter Zijlstra
  1 sibling, 0 replies; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-08 23:26 UTC (permalink / raw)
  To: Dhaval Giani
  Cc: vatsa, LKML, Ingo Molnar, Balbir Singh, dmitry.adamushko,
	Steven Rostedt, Gregory Haskins, Thomas Gleixner


On Tue, 2008-01-08 at 16:27 +0530, Dhaval Giani wrote:
> On Mon, Jan 07, 2008 at 05:57:42PM +0100, Peter Zijlstra wrote:
> > 
> > Subject: sched: rt-group: add uid-group interface
> > 
> > Extend the /sys/kernel/uids/<uid>/ interface to allow setting
> > the group's rt_period and rt_runtime.
> > 
> 
> Hi Peter,
> 
> Cool stuff! I will try out these patches and try to give you some
> feedback.
> 
> One request though, could you please add some documentation to
> Documentation/ABI/testing/sysfs-kernel-uids?

compile tested only attempt at finalizing the interface

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1519,8 +1519,6 @@ extern unsigned int sysctl_sched_child_r
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_rt_period;
-extern unsigned int sysctl_sched_rt_runtime;
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 extern unsigned int sysctl_sched_min_bal_int_shares;
 extern unsigned int sysctl_sched_max_bal_int_shares;
@@ -1530,6 +1528,8 @@ int sched_nr_latency_handler(struct ctl_
 		struct file *file, void __user *buffer, size_t *length,
 		loff_t *ppos);
 #endif
+extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
 
 extern unsigned int sysctl_sched_compat_yield;
 
@@ -2017,8 +2017,8 @@ extern void sched_move_task(struct task_
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 extern unsigned long sched_group_shares(struct task_group *tg);
 extern int sched_group_set_rt_runtime(struct task_group *tg,
-				      unsigned long rt_runtime_us);
-extern unsigned long sched_group_rt_runtime(struct task_group *tg);
+				      long rt_runtime_us);
+extern long sched_group_rt_runtime(struct task_group *tg);
 extern int sched_group_set_rt_period(struct task_group *tg,
 				     unsigned long rt_runtime_us);
 extern unsigned long sched_group_rt_period(struct task_group *tg);
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -649,13 +649,18 @@ const_debug unsigned int sysctl_sched_nr
  * period over which we measure rt task cpu usage in us.
  * default: 1s
  */
-const_debug unsigned int sysctl_sched_rt_period = 1000000;
+unsigned int sysctl_sched_rt_period = 1000000;
 
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
  */
-const_debug unsigned int sysctl_sched_rt_runtime = 950000;
+int sysctl_sched_rt_runtime = 950000;
+
+/*
+ * single value that denotes runtime == period, ie unlimited time.
+ */
+#define RUNTIME_INF	((u64)~0ULL)
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -7751,7 +7756,7 @@ struct task_group *sched_create_group(vo
 		goto err;
 
 	tg->shares = NICE_0_LOAD;
-	tg->rt_runtime = 0; /* XXX */
+	tg->rt_runtime = 0;
 	tg->rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 
 	for_each_possible_cpu(i) {
@@ -7956,9 +7961,12 @@ static DEFINE_MUTEX(rt_constraints_mutex
 
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
-	u64 r = runtime * (1ULL << 16);
-	do_div(r, period);
-	return r;
+	if (runtime == RUNTIME_INF)
+		return 1ULL << 16;
+
+	runtime *= (1ULL << 16);
+	do_div(runtime, period);
+	return runtime;
 }
 
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
@@ -7980,12 +7988,15 @@ static int __rt_schedulable(struct task_
 	return total + to_ratio(period, runtime) < global_ratio;
 }
 
-int sched_group_set_rt_runtime(struct task_group *tg,
-			       unsigned long rt_runtime_us)
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
-	u64 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+	u64 rt_runtime;
 	int err = 0;
 
+	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+	if (rt_runtime_us == -1)
+		rt_runtime = RUNTIME_INF;
+
 	mutex_lock(&rt_constraints_mutex);
 	if (!__rt_schedulable(tg, ktime_to_ns(tg->rt_period), rt_runtime)) {
 		err = -EINVAL;
@@ -7999,10 +8010,14 @@ int sched_group_set_rt_runtime(struct ta
 	return err;
 }
 
-unsigned long sched_group_rt_runtime(struct task_group *tg)
+long sched_group_rt_runtime(struct task_group *tg)
 {
-	u64 rt_runtime_us = tg->rt_runtime;
+	u64 rt_runtime_us;
 
+	if (tg->rt_runtime == RUNTIME_INF)
+		return -1;
+
+	rt_runtime_us = tg->rt_runtime;
 	do_div(rt_runtime_us, NSEC_PER_USEC);
 	return rt_runtime_us;
 }
@@ -8108,15 +8123,49 @@ static u64 cpu_shares_read_uint(struct c
 	return (u64) tg->shares;
 }
 
-static int cpu_rt_runtime_write_uint(struct cgroup *cgrp, struct cftype *cftype,
-		u64 rt_runtime_val)
-{
-	return sched_group_set_rt_runtime(cgroup_tg(cgrp), rt_runtime_val);
+static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+				struct file *file,
+				const char __user *userbuf,
+				size_t nbytes, loff_t *unused_ppos)
+{
+	char buffer[64];
+	int retval = 0;
+	s64 val;
+	char *end;
+
+	if (!nbytes)
+		return -EINVAL;
+	if (nbytes >= sizeof(buffer))
+		return -E2BIG;
+	if (copy_from_user(buffer, userbuf, nbytes))
+		return -EFAULT;
+
+	buffer[nbytes] = 0;     /* nul-terminate */
+
+	/* strip newline if necessary */
+	if (nbytes && (buffer[nbytes-1] == '\n'))
+		buffer[nbytes-1] = 0;
+	val = simple_strtoll(buffer, &end, 0);
+	if (*end)
+		return -EINVAL;
+
+	/* Pass to subsystem */
+	retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+	if (!retval)
+		retval = nbytes;
+	return retval;
 }
 
-static u64 cpu_rt_runtime_read_uint(struct cgroup *cgrp, struct cftype *cft)
-{
-	return sched_group_rt_runtime(cgroup_tg(cgrp));
+static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
+				   struct file *file,
+				   char __user *buf, size_t nbytes,
+				   loff_t *ppos)
+{
+	char tmp[64];
+	long val = sched_group_rt_runtime(cgroup_tg(cgrp));
+	int len = sprintf(tmp, "%ld\n", val);
+
+	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
@@ -8138,8 +8187,8 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "rt_runtime_us",
-		.read_uint = cpu_rt_runtime_read_uint,
-		.write_uint = cpu_rt_runtime_write_uint,
+		.read = cpu_rt_runtime_read,
+		.write = cpu_rt_runtime_write,
 	},
 	{
 		.name = "rt_period_us",
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -60,7 +60,7 @@ static inline int on_rt_rq(struct sched_
 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
 	if (!rt_rq->tg)
-		return 0;
+		return RUNTIME_INF;
 
 	return rt_rq->tg->rt_runtime;
 }
@@ -220,6 +220,9 @@ static struct sched_rt_entity *next_rt_d
 
 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
+	if (sysctl_sched_rt_runtime == -1)
+		return RUNTIME_INF;
+
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
@@ -304,7 +307,7 @@ static int sched_rt_runtime_exceeded(str
 {
 	u64 runtime = sched_rt_runtime(rt_rq);
 
-	if (!runtime)
+	if (runtime == RUNTIME_INF)
 		goto out;
 
 	if (rt_rq->rt_throttled)
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -309,22 +309,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_rt_period_us",
-		.data		= &sysctl_sched_rt_period,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_rt_runtime_us",
-		.data		= &sysctl_sched_rt_runtime,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 	{
 		.ctl_name       = CTL_UNNUMBERED,
@@ -346,6 +330,22 @@ static struct ctl_table kern_table[] = {
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_rt_period_us",
+		.data		= &sysctl_sched_rt_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_rt_runtime_us",
+		.data		= &sysctl_sched_rt_runtime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_compat_yield",
 		.data		= &sysctl_sched_compat_yield,
 		.maxlen		= sizeof(unsigned int),
Index: linux-2.6/kernel/user.c
===================================================================
--- linux-2.6.orig/kernel/user.c
+++ linux-2.6/kernel/user.c
@@ -175,17 +175,17 @@ static ssize_t cpu_rt_runtime_show(struc
 {
 	struct user_struct *up = container_of(kset, struct user_struct, kset);
 
-	return sprintf(buffer, "%lu\n", sched_group_rt_runtime(up->tg));
+	return sprintf(buffer, "%ld\n", sched_group_rt_runtime(up->tg));
 }
 
 static ssize_t cpu_rt_runtime_store(struct kset *kset, const char *buffer,
 				    size_t size)
 {
 	struct user_struct *up = container_of(kset, struct user_struct, kset);
-	unsigned long rt_runtime_us;
+	long rt_runtime_us;
 	int rc;
 
-	sscanf(buffer, "%lu", &rt_runtime_us);
+	sscanf(buffer, "%ld", &rt_runtime_us);
 	rc = sched_group_set_rt_runtime(up->tg, rt_runtime_us);
 
 	return (rc ?: size);
Index: linux-2.6/Documentation/ABI/testing/sysfs-kernel-uids
===================================================================
--- linux-2.6.orig/Documentation/ABI/testing/sysfs-kernel-uids
+++ linux-2.6/Documentation/ABI/testing/sysfs-kernel-uids
@@ -12,3 +12,14 @@ Description:
 		B has shares = 2048, User B will get twice the CPU
 		bandwidth user A will. For more details refer
 		Documentation/sched-design-CFS.txt
+
+What:		/sys/kernel/uids/<uid>/cpu_rt_period_us
+Date:		January 2008
+Contact:	Peter Zijlstra <a.p.zijlstra@chello.nl>
+Description:	See Documentation/sched-rt-group.txt
+
+What:		/sys/kernel/uids/<uid>/cpu_rt_runtime_us
+Date:		January 2008
+Contact:	Peter Zijlstra <a.p.zijlstra@chello.nl>
+Description:	See Documentation/sched-rt-group.txt
+
Index: linux-2.6/Documentation/sched-rt-group.txt
===================================================================
--- /dev/null
+++ linux-2.6/Documentation/sched-rt-group.txt
@@ -0,0 +1,69 @@
+
+
+Real-Time group scheduling.
+
+The problem space:
+
+In order to schedule multiple groups of realtime tasks, each group must
+be assigned a fixed portion of the available cpu time. Without a minimum
+guarantee a realtime group can obviously fall short. A fuzzy upper limit
+is of no use since it cannot be relied upon, which leaves us with just
+the single fixed portion.
+
+CPU time is divided by means of specifying how much time can be spent
+running in a given period. Say a fixed-framerate realtime renderer must
+deliver 25 frames a second, which yields a period of 0.04s. Now say
+it will also have to play some music and respond to input, leaving it
+with around 80% for the graphics. We can then give this group a runtime
+of 0.8 * 0.04s = 0.032s.
+
+This way the graphics group will have a 0.04s period with a 0.032s runtime
+limit.
+
+Now if the audio thread needs to refill the dma buffer every 0.005s, but
+needs only about 3% cpu time to do so, it can do with a runtime of
+0.03 * 0.005s = 0.00015s.
+
+If it so happens that the graphics group runs at a higher priority than
+the audio group, it might be that the audio group will not get CPU time
+in time to meet its deadline, whereas the graphics group would still
+easily make its deadline even if it were delayed for the amount of time
+the audio group needs.
+
+This problem is solved using Earliest Deadline First (EDF) scheduling of the
+realtime groups.
+
+The Interface:
+
+system wide:
+
+/proc/sys/kernel/sched_rt_period_us
+/proc/sys/kernel/sched_rt_runtime_us
+
+CONFIG_FAIR_USER_SCHED
+
+/sys/kernel/uids/<uid>/cpu_rt_period_us
+/sys/kernel/uids/<uid>/cpu_rt_runtime_us
+
+or
+
+CONFIG_FAIR_CGROUP_SCHED
+
+/cgroup/<cgroup>/cpu.rt_period_us
+/cgroup/<cgroup>/cpu.rt_runtime_us
+
+[ time is specified in us because the interface is s32; this gives an
+  operating range of ~35 minutes down to 1us ]
+
+The period takes values in [ 1, INT_MAX ], runtime in [ -1, INT_MAX - 1 ].
+
+A runtime of -1 specifies runtime == period, ie. no limit.
+
+New groups get the period from /proc/sys/kernel/sched_rt_period_us and
+a runtime of 0.
+
+Settings are constrained to:
+
+   \Sum_{i} runtime_{i} / period_{i} <= global_runtime / global_period
+
+in order to keep the configuration schedulable.
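
[ For illustration only, not part of the patch: a small user-space
  check of the constraint above, using the same 16-bit fixed-point
  ratio as the kernel's to_ratio() and the numbers from the
  renderer/audio example.  The group values are made up. ]

	#include <stdio.h>

	static unsigned long to_ratio(unsigned long long period_us,
				      unsigned long long runtime_us)
	{
		return (unsigned long)((runtime_us << 16) / period_us);
	}

	int main(void)
	{
		/* graphics: 0.04s period, 0.032s runtime */
		/* audio:    0.005s period, 0.00015s runtime */
		unsigned long total = to_ratio(40000, 32000) +
				      to_ratio(5000, 150);
		/* global: 1s period, 0.95s runtime (the defaults) */
		unsigned long global = to_ratio(1000000, 950000);

		printf("total=%lu global=%lu schedulable=%d\n",
		       total, global, total <= global);
		return 0;
	}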



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-08 14:31           ` Kay Sievers
@ 2008-01-08 23:35             ` Peter Zijlstra
  2008-01-08 23:58               ` Greg KH
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2008-01-08 23:35 UTC (permalink / raw)
  To: Kay Sievers
  Cc: Dhaval Giani, vatsa, LKML, Ingo Molnar, Balbir Singh,
	dmitry.adamushko, Steven Rostedt, Gregory Haskins,
	Thomas Gleixner, Greg Kroah-Hartman, Andrew Morton


On Tue, 2008-01-08 at 15:31 +0100, Kay Sievers wrote:
> On Jan 8, 2008 12:02 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> >
> > On Tue, 2008-01-08 at 16:27 +0530, Dhaval Giani wrote:
> > > On Mon, Jan 07, 2008 at 05:57:42PM +0100, Peter Zijlstra wrote:
> > > >
> > > > Subject: sched: rt-group: add uid-group interface
> > > >
> > > > Extend the /sys/kernel/uids/<uid>/ interface to allow setting
> > > > the group's rt_period and rt_runtime.
> > > >
> > >
> > > Hi Peter,
> > >
> > > Cool stuff! I will try out these patches and try to give you some
> > > feedback.
> >
> > Thanks, much appreciated!
> >
> > > One request though, could you please add some documentation to
> > > Documentation/ABI/testing/sysfs-kernel-uids?
> >
> > I already have documentation on the todo list, I'll add this file to
> > that list :-)
> 
> Care to rebase the patch against -mm, we fixed the mixed-up usage
> of ksets and kobjects, and this can not apply anymore:
>   http://git.kernel.org/?p=linux/kernel/git/gregkh/patches.git;a=blob;f=driver/struct-user_info-sysfs.patch;hb=HEAD
> 
> There is also an attribute group now which makes it much easier to add
> new files.

Ingo, Greg,

What would be the easiest way to carry this forward? sched-devel and
greg's tree would intersect at this point and leave poor akpm with the
resulting mess. Should I just make an incremental patch akpm can carry
and push? Or can we base one tree off the other?




^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-08 23:58               ` Greg KH
@ 2008-01-08 23:57                 ` Ingo Molnar
  2008-01-10  0:05                   ` Greg KH
  0 siblings, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2008-01-08 23:57 UTC (permalink / raw)
  To: Greg KH
  Cc: Peter Zijlstra, Kay Sievers, Dhaval Giani, vatsa, LKML,
	Balbir Singh, dmitry.adamushko, Steven Rostedt, Gregory Haskins,
	Thomas Gleixner, Andrew Morton


* Greg KH <gregkh@suse.de> wrote:

> On Wed, Jan 09, 2008 at 12:35:32AM +0100, Peter Zijlstra wrote:
> > 
> > On Tue, 2008-01-08 at 15:31 +0100, Kay Sievers wrote:
> > > On Jan 8, 2008 12:02 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > > >
> > > > On Tue, 2008-01-08 at 16:27 +0530, Dhaval Giani wrote:
> > > > > On Mon, Jan 07, 2008 at 05:57:42PM +0100, Peter Zijlstra wrote:
> > > > > >
> > > > > > Subject: sched: rt-group: add uid-group interface
> > > > > >
> > > > > > Extend the /sys/kernel/uids/<uid>/ interface to allow setting
> > > > > > the group's rt_period and rt_runtime.
> > > > > >
> > > > >
> > > > > Hi Peter,
> > > > >
> > > > > Cool stuff! I will try out these patches and try to give you some
> > > > > feedback.
> > > >
> > > > Thanks, much appreciated!
> > > >
> > > > > One request though, could you please add some documentation to
> > > > > Documentation/ABI/testing/sysfs-kernel-uids?
> > > >
> > > > I already have documentation on the todo list, I'll add this file to
> > > > that list :-)
> > > 
> > > Care to rebase the patch against -mm, we fixed the mixed-up usage
> > > of ksets and kobjects, and this can not apply anymore:
> > >   http://git.kernel.org/?p=linux/kernel/git/gregkh/patches.git;a=blob;f=driver/struct-user_info-sysfs.patch;hb=HEAD
> > > 
> > > There is also an attribute group now which makes it much easier to add
> > > new files.
> > 
> > Ingo, Greg,
> > 
> > What would be the easiest way to carry this forward? sched-devel and
> > greg's tree would intersect at this point and leave poor akpm with the
> > resulting mess. Should I just make an incremental patch akpm can carry
> > and push? Or can we base one tree off the other?
> 
> If it's just a single patch for this, I'd be glad to take it.  But by 
> looking at the [12/11] above, I doubt this is so...
> 
> If it's not that rough (12 patches is not a big deal), I'd be glad to 
> take these through my tree, after you fix up Kay's requests above :)

hm, i'd really like to see this tested and go through sched.git. It's 
only the few sysfs bits which interfere, right?

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-08 23:35             ` Peter Zijlstra
@ 2008-01-08 23:58               ` Greg KH
  2008-01-08 23:57                 ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: Greg KH @ 2008-01-08 23:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Kay Sievers, Dhaval Giani, vatsa, LKML, Ingo Molnar,
	Balbir Singh, dmitry.adamushko, Steven Rostedt, Gregory Haskins,
	Thomas Gleixner, Andrew Morton

On Wed, Jan 09, 2008 at 12:35:32AM +0100, Peter Zijlstra wrote:
> 
> On Tue, 2008-01-08 at 15:31 +0100, Kay Sievers wrote:
> > On Jan 8, 2008 12:02 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > >
> > > On Tue, 2008-01-08 at 16:27 +0530, Dhaval Giani wrote:
> > > > On Mon, Jan 07, 2008 at 05:57:42PM +0100, Peter Zijlstra wrote:
> > > > >
> > > > > Subject: sched: rt-group: add uid-group interface
> > > > >
> > > > > Extend the /sys/kernel/uids/<uid>/ interface to allow setting
> > > > > the group's rt_period and rt_runtime.
> > > > >
> > > >
> > > > Hi Peter,
> > > >
> > > > Cool stuff! I will try out these patches and try to give you some
> > > > feedback.
> > >
> > > Thanks, much appreciated!
> > >
> > > > One request though, could you please add some documentation to
> > > > Documentation/ABI/testing/sysfs-kernel-uids?
> > >
> > > I already have documentation on the todo list; I'll add this file to
> > > that list :-)
> > 
> > Care to rebase the patch against -mm? We fixed the mixed-up usage
> > of ksets and kobjects, and this cannot apply anymore:
> >   http://git.kernel.org/?p=linux/kernel/git/gregkh/patches.git;a=blob;f=driver/struct-user_info-sysfs.patch;hb=HEAD
> > 
> > There is also an attribute group now which makes it much easier to add
> > new files.
> 
> Ingo, Greg,
> 
> What would be the easiest way to carry this forward? sched-devel and
> greg's tree would intersect at this point and leave poor akpm with the
> resulting mess. Should I just make an incremental patch akpm can carry
> and push? Or can we base one tree off the other?

If it's just a single patch for this, I'd be glad to take it.  But by
looking at the [12/11] above, I doubt this is so...

If it's not that rough (12 patches is not a big deal), I'd be glad to
take these through my tree, after you fix up Kay's requests above :)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 31+ messages in thread
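
Kay's remark about attribute groups (quoted above) refers to the generic sysfs mechanism for registering several files against a kobject in one call. A minimal sketch, assuming a hypothetical cpu_rt_runtime_us attribute and a helper invoked from wherever the per-uid kobject is set up (none of these names come from the actual patches):

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

/* illustrative backing store; the real code would live in struct user_struct */
static unsigned long rt_runtime_us = 950000;

static ssize_t rt_runtime_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", rt_runtime_us);
}

static ssize_t rt_runtime_store(struct kobject *kobj,
				struct kobj_attribute *attr,
				const char *buf, size_t count)
{
	unsigned long val;

	if (sscanf(buf, "%lu", &val) != 1)
		return -EINVAL;
	rt_runtime_us = val;
	return count;
}

static struct kobj_attribute rt_runtime_attr =
	__ATTR(cpu_rt_runtime_us, 0644, rt_runtime_show, rt_runtime_store);

static struct attribute *uid_rt_attrs[] = {
	&rt_runtime_attr.attr,
	NULL,
};

static struct attribute_group uid_rt_group = {
	.attrs = uid_rt_attrs,
};

/* hypothetical hook: called with the /sys/kernel/uids/<uid> kobject */
static int uid_rt_sysfs_add(struct kobject *kobj)
{
	return sysfs_create_group(kobj, &uid_rt_group);
}

Adding an rt_period file later would then be just one more kobj_attribute in uid_rt_attrs[], which is presumably the ease of extension Kay is pointing at.
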

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-08 23:57                 ` Ingo Molnar
@ 2008-01-10  0:05                   ` Greg KH
  2008-02-07  4:17                     ` Dhaval Giani
  0 siblings, 1 reply; 31+ messages in thread
From: Greg KH @ 2008-01-10  0:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Kay Sievers, Dhaval Giani, vatsa, LKML,
	Balbir Singh, dmitry.adamushko, Steven Rostedt, Gregory Haskins,
	Thomas Gleixner, Andrew Morton

On Wed, Jan 09, 2008 at 12:57:50AM +0100, Ingo Molnar wrote:
> 
> * Greg KH <gregkh@suse.de> wrote:
> 
> > On Wed, Jan 09, 2008 at 12:35:32AM +0100, Peter Zijlstra wrote:
> > > 
> > > On Tue, 2008-01-08 at 15:31 +0100, Kay Sievers wrote:
> > > > On Jan 8, 2008 12:02 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > > > >
> > > > > On Tue, 2008-01-08 at 16:27 +0530, Dhaval Giani wrote:
> > > > > > On Mon, Jan 07, 2008 at 05:57:42PM +0100, Peter Zijlstra wrote:
> > > > > > >
> > > > > > > Subject: sched: rt-group: add uid-group interface
> > > > > > >
> > > > > > > Extend the /sys/kernel/uids/<uid>/ interface to allow setting
> > > > > > > the group's rt_period and rt_runtime.
> > > > > > >
> > > > > >
> > > > > > Hi Peter,
> > > > > >
> > > > > > Cool stuff! I will try out these patches and try to give you some
> > > > > > feedback.
> > > > >
> > > > > Thanks, much appreciated!
> > > > >
> > > > > > One request though, could you please add some documentation to
> > > > > > Documentation/ABI/testing/sysfs-kernel-uids?
> > > > >
> > > > > I already have documentation on the todo list; I'll add this file to
> > > > > that list :-)
> > > > 
> > > > Care to rebase the patch against -mm? We fixed the mixed-up usage
> > > > of ksets and kobjects, and this cannot apply anymore:
> > > >   http://git.kernel.org/?p=linux/kernel/git/gregkh/patches.git;a=blob;f=driver/struct-user_info-sysfs.patch;hb=HEAD
> > > > 
> > > > There is also an attribute group now which makes it much easier to add
> > > > new files.
> > > 
> > > Ingo, Greg,
> > > 
> > > What would be the easiest way to carry this forward? sched-devel and
> > > greg's tree would intersect at this point and leave poor akpm with the
> > > resulting mess. Should I just make an incremental patch akpm can carry
> > > and push? Or can we base one tree off the other?
> > 
> > If it's just a single patch for this, I'd be glad to take it.  But by 
> > looking at the [12/11] above, I doubt this is so...
> > 
> > If it's not that rough (12 patches is not a big deal), I'd be glad to 
> > take these through my tree, after you fix up Kay's requests above :)
> 
> hm, i'd really like to see this tested and go through sched.git. It's 
> only the few sysfs bits which interfere, right?

Yes, that should be it.

So why not put the majority of this through sched.git? Then, when my
sysfs changes go in at the beginning of the .25 merge cycle, you can
add the sysfs changes through your tree or anywhere else.

Unless you are relying on the sysfs changes for this whole feature, and
without them it just doesn't make any sense at all?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-01-10  0:05                   ` Greg KH
@ 2008-02-07  4:17                     ` Dhaval Giani
  2008-02-07  5:42                       ` Greg KH
  0 siblings, 1 reply; 31+ messages in thread
From: Dhaval Giani @ 2008-02-07  4:17 UTC (permalink / raw)
  To: Greg KH
  Cc: Ingo Molnar, Peter Zijlstra, Kay Sievers, vatsa, LKML,
	Balbir Singh, dmitry.adamushko, Steven Rostedt, Gregory Haskins,
	Thomas Gleixner, Andrew Morton

On Wed, Jan 09, 2008 at 04:05:31PM -0800, Greg KH wrote:
> > > > Ingo, Greg,
> > > > 
> > > > What would be the easiest way to carry this forward? sched-devel and
> > > > greg's tree would intersect at this point and leave poor akpm with the
> > > > resulting mess. Should I just make an incremental patch akpm can carry
> > > > and push? Or can we base one tree off the other?
> > > 
> > > If it's just a single patch for this, I'd be glad to take it.  But by 
> > > looking at the [12/11] above, I doubt this is so...
> > > 
> > > If it's not that rough (12 patches is not a big deal), I'd be glad to 
> > > take these through my tree, after you fix up Kay's requests above :)
> > 
> > hm, i'd really like to see this tested and go through sched.git. It's 
> > only the few sysfs bits which interfere, right?
> 
> Yes, that should be it.
> 
> So why not put the majority of this through sched.git? Then, when my
> sysfs changes go in at the beginning of the .25 merge cycle, you can
> add the sysfs changes through your tree or anywhere else.
> 

Hi,

I was wondering where these changes stand. I don't see the sysfs
interface for rt-group-sched in mainline right now.

Thanks,
-- 
regards,
Dhaval

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 12/11] sched: rt-group: uid-group interface
  2008-02-07  4:17                     ` Dhaval Giani
@ 2008-02-07  5:42                       ` Greg KH
  0 siblings, 0 replies; 31+ messages in thread
From: Greg KH @ 2008-02-07  5:42 UTC (permalink / raw)
  To: Dhaval Giani
  Cc: Ingo Molnar, Peter Zijlstra, Kay Sievers, vatsa, LKML,
	Balbir Singh, dmitry.adamushko, Steven Rostedt, Gregory Haskins,
	Thomas Gleixner, Andrew Morton

On Thu, Feb 07, 2008 at 09:47:22AM +0530, Dhaval Giani wrote:
> On Wed, Jan 09, 2008 at 04:05:31PM -0800, Greg KH wrote:
> > > > > Ingo, Greg,
> > > > > 
> > > > > What would be the easiest way to carry this forward? sched-devel and
> > > > > greg's tree would intersect at this point and leave poor akpm with the
> > > > > resulting mess. Should I just make an incremental patch akpm can carry
> > > > > and push? Or can we base one tree off the other?
> > > > 
> > > > If it's just a single patch for this, I'd be glad to take it.  But by 
> > > > looking at the [12/11] above, I doubt this is so...
> > > > 
> > > > If it's not that rough (12 patches is not a big deal), I'd be glad to 
> > > > take these through my tree, after you fix up Kay's requests above :)
> > > 
> > > hm, i'd really like to see this tested and go through sched.git. It's 
> > > only the few sysfs bits which interfere, right?
> > 
> > Yes, that should be it.
> > 
> > So why not put the majority of this through sched.git? Then, when my
> > sysfs changes go in at the beginning of the .25 merge cycle, you can
> > add the sysfs changes through your tree or anywhere else.
> > 
> 
> Hi,
> 
> I was wondering where these changes stand. I don't see the sysfs
> interface for rt-group-sched in mainline right now.

All of the sysfs changes I had are in Linus's tree, so you don't need me
anymore :)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2008-02-07  5:43 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-01-06 16:11 [PATCH 00/11] another rt group sched update Peter Zijlstra
2008-01-06 16:11 ` [PATCH 01/11] sched: rt throttling vs no_hz Peter Zijlstra
2008-01-06 16:11 ` [PATCH 02/11] sched: load_balance_monitor rename Peter Zijlstra
2008-01-06 16:11 ` [PATCH 03/11] hrtimer: clean up cpu->base locking tricks Peter Zijlstra
2008-01-06 16:11 ` [PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Peter Zijlstra
2008-01-07 11:56   ` Peter Zijlstra
2008-01-08 11:16     ` Ingo Molnar
2008-01-06 16:11 ` [PATCH 05/11] hrtimer: unlock hrtimer_wakeup Peter Zijlstra
2008-01-06 16:11 ` [PATCH 06/11] sched: rt-group: reduce rescheduling Peter Zijlstra
2008-01-06 16:11 ` [PATCH 07/11] sched: rt-group: per group period Peter Zijlstra
2008-01-06 16:11 ` [PATCH 08/11] sched: rt-group: deal with PI Peter Zijlstra
2008-01-06 16:11 ` [PATCH 09/11] sched: rt-group: dynamic period ticks Peter Zijlstra
2008-01-06 16:11 ` [PATCH 10/11] sched: rt-group: EDF Peter Zijlstra
2008-01-06 16:11 ` [PATCH 11/11] sched: rt-group: interface Peter Zijlstra
2008-01-07 10:51 ` [PATCH 00/11] another rt group sched update Peter Zijlstra
2008-01-07 11:24   ` Peter Zijlstra
2008-01-07 12:23   ` Srivatsa Vaddagiri
2008-01-07 12:12     ` Peter Zijlstra
2008-01-07 16:57     ` [PATCH 12/11] sched: rt-group: uid-group interface Peter Zijlstra
2008-01-08 10:33       ` Ingo Molnar
2008-01-08 10:57       ` Dhaval Giani
2008-01-08 11:02         ` Peter Zijlstra
2008-01-08 14:31           ` Kay Sievers
2008-01-08 23:35             ` Peter Zijlstra
2008-01-08 23:58               ` Greg KH
2008-01-08 23:57                 ` Ingo Molnar
2008-01-10  0:05                   ` Greg KH
2008-02-07  4:17                     ` Dhaval Giani
2008-02-07  5:42                       ` Greg KH
2008-01-08 23:26         ` Peter Zijlstra
2008-01-07 11:17 ` [PATCH 00/11] another rt group sched update Ingo Molnar
