All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Nikunj A. Dadhania" <nikunj@linux.vnet.ibm.com>
To: peterz@infradead.org, mingo@elte.hu, linux-kernel@vger.kernel.org
Cc: nikunj@linux.vnet.ibm.com, vatsa@linux.vnet.ibm.com,
	bharata@linux.vnet.ibm.com
Subject: [RFC PATCH 2/4] sched: Adding gang scheduling infrastructure
Date: Mon, 19 Dec 2011 14:04:38 +0530	[thread overview]
Message-ID: <20111219083424.32311.23559.stgit@abhimanyu.in.ibm.com> (raw)
In-Reply-To: <20111219083141.32311.9429.stgit@abhimanyu.in.ibm.com>

The patch introduces the concept of a gang_leader and a gang_cpumask.  On the
first invocation, when no gang_leader is yet set, a gang leader is elected. The
election depends on the number of cpus to be ganged together, aka the gang
granularity. At the moment the gang granularity is fixed at 8 cpus; it could be
made configurable through a sysctl if required.

TODO: This still does not take care of cpu-offlining and re-electing the
gang-leader

Signed-off-by: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
---

 kernel/sched/core.c  |    9 +++++
 kernel/sched/fair.c  |   91 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |    4 ++
 3 files changed, 104 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e96f861..f3ae29c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1968,6 +1968,12 @@ static inline void post_schedule(struct rq *rq)
 
 		rq->post_schedule = 0;
 	}
+	if (rq->gang_schedule == 1) {
+		struct task_group *tg = task_group(rq->curr);
+
+		gang_sched(tg, rq);
+	}
+
 }
 
 #else
@@ -6903,6 +6909,9 @@ void __init sched_init(void)
 		rq->rd = NULL;
 		rq->cpu_power = SCHED_POWER_SCALE;
 		rq->post_schedule = 0;
+		rq->gang_schedule = 0;
+		rq->gang_leader = -1;
+		rq->gang_cpumask = NULL;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b95575f..c03efd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3020,6 +3020,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
+	struct task_group *tg;
 
 	if (!cfs_rq->nr_running)
 		return NULL;
@@ -3030,6 +3031,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
+	tg = se->cfs_rq->tg;
+
+	if (tg->gang) {
+		if (!rq->gang_schedule && rq->gang_leader)
+			rq->gang_schedule = tg->gang;
+	}
+
 	p = task_of(se);
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
@@ -3533,6 +3541,15 @@ struct sg_lb_stats {
 };
 
 /**
+ * domain_first_cpu - Returns the first cpu in the cpumask of a sched_domain.
+ * @sd: The domain whose first cpu is to be returned.
+ */
+static inline unsigned int domain_first_cpu(struct sched_domain *sd)
+{
+	return cpumask_first(sched_domain_span(sd));
+}
+
+/**
  * get_sd_load_idx - Obtain the load index for a given sched domain.
  * @sd: The sched_domain whose load_idx is to be obtained.
  * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
@@ -5485,6 +5502,80 @@ done:
 	return 0;
 }
 
+/*
+ * gang_sched_member - cross-cpu callback run on each gang member.
+ * @info: the task_group being gang scheduled (cast back below).
+ *
+ * Invoked on fellow cpus via smp_call_function_many() from the gang
+ * leader (see gang_sched()).  Looks up this cpu's cfs_rq for the task
+ * group and, under rq->lock, checks whether it has runnable tasks.
+ */
+static void gang_sched_member(void *info)
+{
+	struct task_group *tg = (struct task_group *) info;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
+	int cpu;
+	unsigned long flags;
+
+	cpu  = smp_processor_id();
+	cfs_rq = tg->cfs_rq[cpu];
+	rq = cfs_rq->rq;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	/* Check if the runqueue has runnable tasks */
+	if (cfs_rq->nr_running) {
+		/*
+		 * Favour this task group and set the need_resched flag;
+		 * added by following patches.
+		 */
+	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+#define GANG_SCHED_GRANULARITY 8
+
+/**
+ * gang_sched - initiate gang scheduling for a task group.
+ * @tg: the task group to gang schedule.
+ * @rq: this cpu's runqueue.
+ *
+ * Called from post_schedule() when rq->gang_schedule is set.
+ * rq->gang_leader encodes a small state machine:
+ *   -1 - undecided (boot value): run the leader election below;
+ *    1 - this cpu leads the gang: broadcast to the fellow cpus;
+ *    0 - fellow cpu: nothing to do, initiations arrive via IPI.
+ */
+void gang_sched(struct task_group *tg, struct rq *rq)
+{
+	/* We do not gang sched here */
+	if (rq->gang_leader == 0 || !tg || tg->gang == 0)
+		return;
+
+	/* Yes, that's the leader */
+	if (rq->gang_leader == 1) {
+
+		/*
+		 * smp_call_function_many() must not be called from
+		 * interrupt context or with interrupts disabled;
+		 * wait == 0 makes the broadcast asynchronous.
+		 */
+		if (!in_interrupt() && !irqs_disabled()) {
+			smp_call_function_many(rq->gang_cpumask,
+					gang_sched_member, tg, 0);
+
+			rq->gang_schedule = 0;
+		}
+
+	} else {
+		/*
+		 * Leader election (rq->gang_leader == -1): find the
+		 * gang leader according to the span, i.e. walk up the
+		 * domain hierarchy to the smallest domain covering at
+		 * least GANG_SCHED_GRANULARITY cpus.  Currently we have
+		 * it as 8 cpus; this can be made dynamic.
+		 *
+		 * NOTE(review): for_each_domain() traversal normally
+		 * requires rcu_read_lock() - confirm the post_schedule()
+		 * calling context provides it.
+		 */
+		struct sched_domain *sd;
+		unsigned int count;
+		int i;
+
+		for_each_domain(cpu_of(rq), sd) {
+			/* count == cpumask_weight(sched_domain_span(sd)) */
+			count = 0;
+			for_each_cpu(i, sched_domain_span(sd))
+				count++;
+
+			if (count >= GANG_SCHED_GRANULARITY)
+				break;
+		}
+
+		/* The first cpu of the chosen span leads the gang. */
+		if (sd && cpu_of(rq) == domain_first_cpu(sd)) {
+			printk(KERN_INFO "Selected CPU %d as gang leader\n",
+				cpu_of(rq));
+			rq->gang_leader = 1;
+			rq->gang_cpumask = sched_domain_span(sd);
+		} else if (sd) {
+			/*
+			 * A fellow cpu, it will receive gang
+			 * initiations from the gang leader now
+			 */
+			rq->gang_leader = 0;
+		}
+	}
+}
+
 static DEFINE_MUTEX(gang_mutex);
 
 int sched_group_set_gang(struct task_group *tg, unsigned long gang)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f1a85e3..db8369f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -187,6 +187,7 @@ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 extern int sched_group_set_gang(struct task_group *tg, unsigned long gang);
+extern void gang_sched(struct task_group *tg, struct rq *rq);
 
 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
 extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@@ -419,6 +420,9 @@ struct rq {
 	unsigned char idle_balance;
 	/* For active balancing */
 	int post_schedule;
+	int gang_schedule;
+	int gang_leader;
+	struct cpumask *gang_cpumask;
 	int active_balance;
 	int push_cpu;
 	struct cpu_stop_work active_balance_work;


  parent reply	other threads:[~2011-12-19  8:33 UTC|newest]

Thread overview: 95+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-12-19  8:33 [RFC PATCH 0/4] Gang scheduling in CFS Nikunj A. Dadhania
2011-12-19  8:34 ` [RFC PATCH 1/4] sched: Adding cpu.gang file to cpu cgroup Nikunj A. Dadhania
2011-12-19  8:34 ` Nikunj A. Dadhania [this message]
2011-12-19 15:51   ` [RFC PATCH 2/4] sched: Adding gang scheduling infrastructure Peter Zijlstra
2011-12-19 16:51     ` Peter Zijlstra
2011-12-20  1:43       ` Nikunj A Dadhania
2011-12-20  1:39     ` Nikunj A Dadhania
2011-12-19  8:34 ` [RFC PATCH 3/4] sched: Gang using set_next_buddy Nikunj A. Dadhania
2011-12-19  8:35 ` [RFC PATCH 4/4] sched:Implement set_gang_buddy Nikunj A. Dadhania
2011-12-19 15:51   ` Peter Zijlstra
2011-12-20  1:43     ` Nikunj A Dadhania
2011-12-26  2:30     ` Nikunj A Dadhania
2011-12-19 11:23 ` [RFC PATCH 0/4] Gang scheduling in CFS Ingo Molnar
2011-12-19 11:44   ` Avi Kivity
2011-12-19 11:50     ` Nikunj A Dadhania
2011-12-19 11:59       ` Avi Kivity
2011-12-19 12:06         ` Nikunj A Dadhania
2011-12-19 12:50           ` Avi Kivity
2011-12-19 13:09             ` Nikunj A Dadhania
2011-12-19 11:45   ` Nikunj A Dadhania
2011-12-19 13:22     ` Nikunj A Dadhania
2011-12-19 16:28       ` Ingo Molnar
2011-12-21 10:39   ` Nikunj A Dadhania
2011-12-21 10:43     ` Avi Kivity
2011-12-23  3:20       ` Nikunj A Dadhania
2011-12-23 10:36         ` Ingo Molnar
2011-12-25 10:58           ` Avi Kivity
2011-12-25 15:45             ` Avi Kivity
2011-12-26  3:14             ` Nikunj A Dadhania
2011-12-26  9:05               ` Avi Kivity
2011-12-26 11:33                 ` Nikunj A Dadhania
2011-12-26 11:41                   ` Avi Kivity
2011-12-27  1:47                     ` Nikunj A Dadhania
2011-12-27  9:15                       ` Avi Kivity
2011-12-27 10:24                         ` Nikunj A Dadhania
2011-12-29 16:07                 ` Better qemu/kvm defaults (was Re: [RFC PATCH 0/4] Gang scheduling in CFS) Dor Laor
2011-12-29 16:07                   ` [Qemu-devel] " Dor Laor
2011-12-29 16:13                   ` Avi Kivity
2011-12-29 16:13                     ` [Qemu-devel] " Avi Kivity
2011-12-29 16:16                   ` Anthony Liguori
2011-12-29 16:16                     ` Anthony Liguori
2012-01-01 10:16                     ` Dor Laor
2012-01-01 10:16                       ` [Qemu-devel] " Dor Laor
2012-01-01 14:01                       ` Ronen Hod
2012-01-01 14:01                         ` Ronen Hod
2012-01-02  9:37                         ` Dor Laor
2012-01-02  9:37                           ` [Qemu-devel] " Dor Laor
2012-01-03 15:48                       ` Anthony Liguori
2012-01-03 15:48                         ` Anthony Liguori
2012-01-03 22:31                         ` Dor Laor
2012-01-03 22:31                           ` Dor Laor
2012-01-03 22:45                           ` Anthony Liguori
2012-01-03 22:45                             ` [Qemu-devel] " Anthony Liguori
2012-01-03 22:59                             ` Dor Laor
2012-01-03 22:59                               ` Dor Laor
2011-12-27  3:15               ` [RFC PATCH 0/4] Gang scheduling in CFS Nikunj A Dadhania
2011-12-27  9:17                 ` Avi Kivity
2011-12-27  9:44                   ` Nikunj A Dadhania
2011-12-27  9:51                     ` Avi Kivity
2011-12-27 10:10                       ` Nikunj A Dadhania
2011-12-27 10:34                         ` Avi Kivity
2011-12-27 10:43                           ` Nikunj A Dadhania
2011-12-27 10:53                             ` Avi Kivity
2011-12-30  9:51             ` Ingo Molnar
2011-12-30 10:10               ` Nikunj A Dadhania
2011-12-31  2:21                 ` Nikunj A Dadhania
2012-01-02  4:20                   ` Nikunj A Dadhania
2012-01-02  9:39                     ` Avi Kivity
2012-01-02 10:22                       ` Nikunj A Dadhania
2012-01-02  9:37                   ` Avi Kivity
2012-01-02 10:30                     ` Nikunj A Dadhania
2012-01-02 13:33                       ` Avi Kivity
2012-01-04 10:52                     ` Nikunj A Dadhania
2012-01-04 14:41                       ` Avi Kivity
2012-01-04 14:56                         ` Srivatsa Vaddagiri
2012-01-04 17:13                           ` Avi Kivity
2012-01-05  6:57                             ` Nikunj A Dadhania
2012-01-04 16:47                         ` Rik van Riel
2012-01-04 17:16                           ` Avi Kivity
2012-01-04 20:56                             ` Rik van Riel
2012-01-04 21:31                             ` Peter Zijlstra
2012-01-04 21:41                               ` Avi Kivity
2012-01-05  9:10                                 ` Ingo Molnar
2012-02-20  8:08                                   ` Nikunj A Dadhania
2012-02-20  8:14                                     ` Ingo Molnar
2012-02-20 10:51                                     ` Peter Zijlstra
2012-02-20 11:53                                       ` Nikunj A Dadhania
2012-02-20 12:02                                         ` Srivatsa Vaddagiri
2012-02-20 12:14                                           ` Peter Zijlstra
2012-01-05  2:10                         ` Nikunj A Dadhania
2011-12-19 15:51 ` Peter Zijlstra
2011-12-19 16:09   ` Alan Cox
2011-12-19 22:10   ` Benjamin Herrenschmidt
2011-12-20  1:56   ` Nikunj A Dadhania
2011-12-20  8:52   ` Jeremy Fitzhardinge

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20111219083424.32311.23559.stgit@abhimanyu.in.ibm.com \
    --to=nikunj@linux.vnet.ibm.com \
    --cc=bharata@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=peterz@infradead.org \
    --cc=vatsa@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.