[PATCH RFC] sched: boost throttled entities on wakeups

* [PATCH RFC] sched: boost throttled entities on wakeups
@ 2012-10-18  7:32 Vladimir Davydov
  2012-10-18 10:39 ` [Devel] " Vladimir Davydov
  2012-10-19 14:24 ` Peter Zijlstra
  0 siblings, 2 replies; 4+ messages in thread
From: Vladimir Davydov @ 2012-10-18  7:32 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Paul Turner
  Cc: Kirill Korotaev, linux-kernel, devel

[-- Attachment #1: Type: text/plain, Size: 2497 bytes --]

If several tasks in different cpu cgroups are contending for the same resource
(e.g. a semaphore) and one of those task groups is cpu limited (using cfs
bandwidth control), the priority inversion problem is likely to arise: if a cpu
limited task goes to sleep holding the resource (e.g. trying to take another
semaphore), it can be throttled (i.e.  removed from the runqueue), which will
result in other, perhaps high-priority, tasks waiting until the low-priority
task continues its execution.

The patch tries to solve this problem by boosting tasks in throttled groups on
wakeups, i.e.  temporarily unthrottling the groups a woken task belongs to in
order to let the task finish its execution in kernel space.  This obviously
should eliminate the priority inversion problem on voluntary preemptable
kernels.  However, it does not solve the problem for fully preemptable kernels,
although I guess the patch can be extended to handle those kernels too (e.g. by
boosting forcibly preempted tasks thus not allowing to throttle).

I wrote a simple test that demonstrates the problem (the test is attached). It
creates two cgroups each of which is bound to exactly one cpu using cpusets,
sets the limit of the first group to 10% and leaves the second group unlimited.
Then in both groups it starts processes reading the same (big enough) file
along with a couple of busyloops in the limited groups, and measures the read
time.

I've run the test 10 times for a 1 Gb file on a server with > 10 Gb of RAM and
4 cores x 2 hyperthreads (the kernel was with CONFIG_PREEMPT_VOLUNTARY=y). Here
are the results:

without the patch	40.03 +- 7.04 s
with the patch		 8.42 +- 0.48 s

(Since the server's RAM can accommodate the whole file, the read time was the
same for both groups)

I would appreciate if you could answer the following questions regarding the
priority inversion problem and the proposed approach:

1) Do you agree that the problem exists and should be sorted out?

2) If so, does the general approach proposed (unthrottling on wakeups) suits
you? Why or why not?

3) If you think that the approach proposed is sane, what you dislike about the
patch?

Thank you!

---
 include/linux/sched.h   |    8 ++
 kernel/sched/core.c     |    8 ++
 kernel/sched/fair.c     |  182 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/features.h |    2 +
 kernel/sched/sched.h    |    6 ++
 5 files changed, 204 insertions(+), 2 deletions(-)


[-- Attachment #2: sched-boost-throttled-entities-on-wakeups.patch --]
[-- Type: application/octet-stream, Size: 10765 bytes --]

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a0..5b2ea5a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1043,6 +1043,7 @@ struct sched_domain;
 #else
 #define ENQUEUE_WAKING		0
 #endif
+#define ENQUEUE_BOOST		8
 
 #define DEQUEUE_SLEEP		1
 
@@ -1089,6 +1090,8 @@ struct sched_class {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*task_move_group) (struct task_struct *p, int on_rq);
 #endif
+
+	void (*task_scheduled) (struct rq *rq, struct task_struct *p);
 };
 
 struct load_weight {
@@ -1137,6 +1140,11 @@ struct sched_entity {
 	struct list_head	group_node;
 	unsigned int		on_rq;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	unsigned int		boosted;
+	struct list_head	boost_node;
+#endif
+
 	u64			exec_start;
 	u64			sum_exec_runtime;
 	u64			vruntime;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927f..5fe4fd5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1783,6 +1783,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 }
 
+static inline void task_scheduled(struct rq *rq, struct task_struct *p)
+{
+	if (p->sched_class->task_scheduled)
+		p->sched_class->task_scheduled(rq, p);
+}
+
 #ifdef CONFIG_SMP
 
 /* assumes rq->lock is held */
@@ -2888,6 +2894,8 @@ need_resched:
 	sched_preempt_enable_no_resched();
 	if (need_resched())
 		goto need_resched;
+
+	task_scheduled(rq, current);
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a1..a723a7e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -772,6 +772,110 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->exec_start = rq_of(cfs_rq)->clock_task;
 }
 
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
+{
+	return !list_empty(&cfs_rq->boosted_entities);
+}
+
+static inline int entity_boosted(struct sched_entity *se)
+{
+	return se->boosted;
+}
+
+static inline void update_entity_boost(struct sched_entity *se)
+{
+	if (!entity_is_task(se))
+		se->boosted = cfs_rq_has_boosted_entities(group_cfs_rq(se));
+}
+
+static int check_enqueue_boost(struct rq *rq, struct task_struct *p, int flags)
+{
+	if (sched_feat(BOOST_WAKEUPS) && (flags & ENQUEUE_WAKEUP))
+		p->se.boosted = 1;
+	return p->se.boosted;
+}
+
+static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se)
+{
+	list_add(&se->boost_node, &cfs_rq->boosted_entities);
+}
+
+static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se)
+{
+	list_del(&se->boost_node);
+}
+
+static int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+				  struct sched_entity *se)
+{
+	if (se == cfs_rq->curr)
+		return 0;
+
+	if (entity_is_task(se) || !entity_boosted(se)) {
+		__enqueue_boosted_entity(cfs_rq, se);
+		se->boosted = 1;
+		return 1;
+	}
+
+	return 0;
+}
+
+static int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+				  struct sched_entity *se)
+{
+	if (se == cfs_rq->curr)
+		return 0;
+
+	if (entity_is_task(se) ||
+	    !cfs_rq_has_boosted_entities(group_cfs_rq(se))) {
+		__dequeue_boosted_entity(cfs_rq, se);
+		if (!entity_is_task(se))
+			se->boosted = 0;
+		return 1;
+	}
+
+	return 0;
+}
+#else
+static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int entity_boosted(struct sched_entity *se)
+{
+	return 0;
+}
+
+static inline void update_entity_boost(struct sched_entity *se) {}
+
+static inline int check_enqueue_boost(struct rq *rq,
+				      struct task_struct *p, int flags)
+{
+	return 0;
+}
+
+static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se) {}
+static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se) {}
+
+static inline int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					 struct sched_entity *se)
+{
+	return 0;
+}
+
+static inline int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					 struct sched_entity *se)
+{
+	return 0;
+}
+#endif
+
 /**************************************************
  * Scheduling class queueing methods:
  */
@@ -1113,7 +1217,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_running == 1) {
 		list_add_leaf_cfs_rq(cfs_rq);
-		check_enqueue_throttle(cfs_rq);
+		if (!(flags & ENQUEUE_BOOST))
+			check_enqueue_throttle(cfs_rq);
 	}
 }
 
@@ -1261,6 +1366,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
+		if (entity_boosted(se))
+			__dequeue_boosted_entity(cfs_rq, se);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
@@ -1289,7 +1396,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  * 3) pick the "last" process, for cache locality
  * 4) do not run the "skip" process, if something else is available
  */
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *do_pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *left = se;
@@ -1321,6 +1428,42 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	return se;
 }
 
+#ifdef CONFIG_CFS_BANDWIDTH
+static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = NULL;
+
+	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) {
+		rq->cfs_quota_exceeded = 1;
+		if (cfs_rq_has_boosted_entities(cfs_rq)) {
+			se = list_first_entry(&cfs_rq->boosted_entities,
+					      struct sched_entity, boost_node);
+			clear_buddies(cfs_rq, se);
+		}
+	}
+
+	if (!se)
+		se = do_pick_next_entity(cfs_rq);
+
+	if (entity_is_task(se) &&
+	    !entity_boosted(se) && rq->cfs_quota_exceeded)
+		resched_task(task_of(se));
+
+	return se;
+}
+
+static void task_scheduled_fair(struct rq *rq, struct task_struct *p)
+{
+	if (rq->cfs_quota_exceeded)
+		set_tsk_need_resched(p);
+	rq->cfs_quota_exceeded = 0;
+	p->se.boosted = 0;
+}
+#else
+# define pick_next_entity do_pick_next_entity
+#endif
+
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
@@ -1332,6 +1475,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
+	update_entity_boost(prev);
+	if (entity_boosted(prev) && prev->on_rq)
+		__enqueue_boosted_entity(cfs_rq, prev);
+
 	/* throttle cfs_rqs exceeding runtime */
 	check_cfs_rq_runtime(cfs_rq);
 
@@ -1965,6 +2112,9 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_rq_throttled(cfs_rq))
 		return;
 
+	if (cfs_rq_has_boosted_entities(cfs_rq))
+		return;
+
 	throttle_cfs_rq(cfs_rq);
 }
 
@@ -2020,6 +2170,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+	INIT_LIST_HEAD(&cfs_rq->boosted_entities);
 }
 
 /* requires cfs_b->lock, may release to reprogram timer */
@@ -2180,11 +2331,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int boost = check_enqueue_boost(rq, p, flags);
 
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
 		cfs_rq = cfs_rq_of(se);
+		if (boost)
+			flags |= ENQUEUE_BOOST;
 		enqueue_entity(cfs_rq, se, flags);
 
 		/*
@@ -2197,6 +2351,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq->h_nr_running++;
 
+		if (boost)
+			boost = enqueue_boosted_entity(cfs_rq, se);
+
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -2207,12 +2364,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
+		if (boost)
+			boost = enqueue_boosted_entity(cfs_rq, se);
+
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
 
 	if (!se)
 		inc_nr_running(rq);
+	else if (boost)
+		for_each_sched_entity(se) {
+			cfs_rq = cfs_rq_of(se);
+			if (!enqueue_boosted_entity(cfs_rq, se))
+				break;
+			if (cfs_rq_throttled(cfs_rq))
+				unthrottle_cfs_rq(cfs_rq);
+		}
 	hrtick_update(rq);
 }
 
@@ -2227,6 +2395,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int boosted = entity_boosted(se);
 	int task_sleep = flags & DEQUEUE_SLEEP;
 
 	for_each_sched_entity(se) {
@@ -2243,6 +2412,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq->h_nr_running--;
 
+		if (boosted)
+			boosted = dequeue_boosted_entity(cfs_rq, se);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			/*
@@ -2266,6 +2438,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
+		if (boosted)
+			boosted = dequeue_boosted_entity(cfs_rq, se);
+
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
@@ -5339,6 +5514,9 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.task_move_group	= task_move_group_fair,
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	.task_scheduled		= task_scheduled_fair,
+#endif
 };
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefca..fbc59cd 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,5 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+SCHED_FEAT(BOOST_WAKEUPS, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09..681be1d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -267,6 +267,8 @@ struct cfs_rq {
 	u64 throttled_timestamp;
 	int throttled, throttle_count;
 	struct list_head throttled_list;
+
+	struct list_head boosted_entities;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
@@ -446,6 +448,10 @@ struct rq {
 	struct hrtimer hrtick_timer;
 #endif
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	int cfs_quota_exceeded;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;

[-- Attachment #3: ioprio_inv_test.sh --]
[-- Type: application/octet-stream, Size: 1795 bytes --]

#! /bin/bash
#

PAGE_SIZE=4096

RESULTS_DIR=ioprio_inv_test_results

TEST_FILE=/root/dummy
TEST_FILE_SIZE=$[256*1024] # in pages

DROP_CACHES=1
PREP_TEST_FILE=1

CGROUP_MNT=/mnt/cgroup
NR_CGROUPS=2

CGROUP[0]=test1
CGROUP_NR_BUSYLOOPS[0]=2
CGROUP_CFS_PERIOD[0]=100000
CGROUP_CFS_QUOTA[0]=10000
CGROUP_CPUMASK[0]=1

CGROUP[1]=test2
CGROUP_NR_BUSYLOOPS[1]=0
CGROUP_CFS_PERIOD[1]=100000
CGROUP_CFS_QUOTA[1]=-1
CGROUP_CPUMASK[1]=2

function do_test()
{
	local cgrp=$1
	sleep 3
	local pidlist=
	local i=
	for i in `seq 1 ${CGROUP_NR_BUSYLOOPS[$cgrp]}`; do
		while :; do :; done &
		pidlist="$pidlist $!"
	done
	dd if=$TEST_FILE of=/dev/null bs=$PAGE_SIZE count=$TEST_FILE_SIZE &> $RESULTS_DIR/$cgrp.log
	local pid
	for pid in $pidlist; do
		kill $pid
	done
}

rm -rf $RESULTS_DIR
mkdir -p $RESULTS_DIR

if [ $PREP_TEST_FILE -ne 0 ]; then
	echo "Creating test file ($TEST_FILE_SIZE pages)..."
	rm -f $TEST_FILE_SIZE
	dd if=/dev/zero of=$TEST_FILE bs=$PAGE_SIZE count=$TEST_FILE_SIZE &>/dev/null
	echo "Syncing..."
	sync; sync
fi

if [ $DROP_CACHES -ne 0 ]; then
	echo "Dropping caches..."
	echo 3 > /proc/sys/vm/drop_caches
fi

echo "Mounting cgroup..."
mkdir -p $CGROUP_MNT
mount -t cgroup -o cpu,cpuset none $CGROUP_MNT

pidlist=
echo "Preparing test..."
for i in `seq 0 $[NR_CGROUPS-1]`; do
	cgrp=$CGROUP_MNT/${CGROUP[$i]}
	mkdir -p $cgrp
	echo ${CGROUP_CFS_PERIOD[$i]} > $cgrp/cpu.cfs_period_us
	echo ${CGROUP_CFS_QUOTA[$i]} > $cgrp/cpu.cfs_quota_us
	echo ${CGROUP_CPUMASK[$i]} > $cgrp/cpuset.cpus
	echo $$ > $cgrp/tasks
	do_test $i &
	pidlist="$pidlist $!"
done

echo $$ > $CGROUP_MNT/tasks

echo "Testing..."
for pid in $pidlist; do
	wait $pid
done

echo
echo "Results:"
echo
tail -n1 $RESULTS_DIR/*
echo

^ permalink raw reply related	[flat|nested] 4+ messages in thread