* [ANNOUNCE] BLD-3.17 release.
@ 2014-10-11  6:20 Rakib Mullick
  2014-10-13  8:01 ` Mike Galbraith
  0 siblings, 1 reply; 5+ messages in thread
From: Rakib Mullick @ 2014-10-11  6:20 UTC (permalink / raw)
  To: linux-kernel

BLD (The Barbershop Load Distribution algorithm) patch for Linux 3.17.
This version contains a build fix for the CONFIG_BLD=n case. I have
recently done some netperf benchmarking (I did previously too, but
never published the results). Below are the stats of a default netperf
run with client and server on the same local system (Core i3, 2 GB
RAM; higher is better).

            tcp_stream   tcp_rr      udp_stream   udp_rr

mainline     9343.54     20812.03    18231.74     24396.074
                                     18210.71

bld         14738.35     29224.54    26475.75     34910.08
                                     26462.53

These are averages of 5 runs of each test. BLD performs better,
showing a ~35-40% improvement. The config used for this benchmark
can be found at the following link:

https://raw.githubusercontent.com/rmullick/bld-patches/master/config.benchmark-3.17

Also, Luis Cruz recently backported BLD's previous release, BLD-3.16,
to Android and experimentally ran it on his Galaxy S III. His work can
be found at the following link:

https://github.com/SyNtheticNightmar3/bld-patches

If you are interested in running it on Android, take a look at the
above link.
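
For context, the core idea, stripped of kernel details, is to hand a
waking task to the least loaded runqueue. A toy userspace sketch of
that selection follows (illustrative only; the actual code in the
patch below tracks load at enqueue/dequeue time and walks ordered
runqueue lists under locks):

/*
 * Toy userspace model of BLD's CFS wakeup choice: pick the CPU whose
 * runqueue currently carries the least load, then account the waking
 * task's weight there.  Loads are made-up numbers for illustration.
 */
#include <limits.h>
#include <stdio.h>

#define NR_CPUS 4

static unsigned long cpu_load[NR_CPUS] = { 300, 120, 450, 180 };

static int pick_least_loaded(void)
{
	unsigned long min_load = ULONG_MAX;
	int cpu, target = 0;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu_load[cpu] < min_load) {
			min_load = cpu_load[cpu];
			target = cpu;
		}
	}
	return target;
}

int main(void)
{
	int cpu = pick_least_loaded();

	cpu_load[cpu] += 100;	/* enqueue the waking task's weight */
	printf("task woken on cpu %d\n", cpu);
	return 0;
}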

Thanks,
Rakib

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
---

diff --git a/init/Kconfig b/init/Kconfig
index 80a6907..65319c6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -36,6 +36,15 @@ config BROKEN_ON_SMP
 	depends on BROKEN || !SMP
 	default y
 
+config BLD
+	bool "An alternate CPU load distribution technique for task scheduler"
+	depends on SMP
+	default y
+	help
+	  This is an alternate CPU load distribution technique for the task
+	  scheduler, based on The Barbershop Load Distribution algorithm. Not
+	  suitable for NUMA; should work well on SMP.
+
 config INIT_ENV_ARG_LIMIT
 	int
 	default 32 if !UML
diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h
new file mode 100644
index 0000000..097dd23
--- /dev/null
+++ b/kernel/sched/bld.h
@@ -0,0 +1,207 @@
+#ifdef CONFIG_BLD
+
+static DEFINE_RWLOCK(rt_list_lock);
+static LIST_HEAD(rt_rq_head);
+static LIST_HEAD(cfs_rq_head);
+static DEFINE_RWLOCK(cfs_list_lock);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->rq;
+}
+#else
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+	return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+	return rt_rq->rq;
+}
+#else
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+	return container_of(rt_rq, struct rq, rt);
+}
+#endif
+
+static int select_cpu_for_wakeup(int task_type, struct cpumask *mask)
+{
+	int cpu = smp_processor_id(), i;
+	unsigned long load, min_load = ULONG_MAX;
+	struct rq *rq;
+
+	if (task_type) {
+		for_each_cpu(i, mask) {
+			rq = cpu_rq(i);
+			load = rq->cfs.load.weight;
+			if (load < min_load) {
+				min_load = load;
+				cpu = i;
+			}
+		}
+	} else {
+		min_load = -1;
+
+		for_each_cpu(i, mask) {
+			rq = cpu_rq(i);
+			load = rq->rt.lowbit;
+			if (load > min_load) {
+				min_load = load;
+				cpu = i;
+			}
+		}
+	}
+
+	return cpu;
+}
+
+static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	struct cfs_rq *cfs;
+	unsigned long flags;
+	unsigned int cpu = smp_processor_id();
+
+	read_lock_irqsave(&cfs_list_lock, flags);
+	list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) {
+		cpu = cpu_of(rq_of_cfs(cfs));
+		if (cpu_online(cpu))
+			break;
+	}
+	read_unlock_irqrestore(&cfs_list_lock, flags);
+	return cpu;
+}
+
+static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	struct rt_rq *rt;
+	unsigned long flags;
+	unsigned int cpu = smp_processor_id();
+
+	read_lock_irqsave(&rt_list_lock, flags);
+	list_for_each_entry(rt, &rt_rq_head, bld_rt_list) {
+		cpu = cpu_of(rq_of_rt(rt));
+		if (cpu_online(cpu))
+			break;
+	}
+	read_unlock_irqrestore(&rt_list_lock, flags);
+	return cpu;
+}
+
+static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	unsigned int cpu = smp_processor_id(), want_affine = 0;
+	struct cpumask *tmpmask;
+
+	if (p->nr_cpus_allowed == 1)
+		return task_cpu(p);
+
+	if (sd_flags & SD_BALANCE_WAKE) {
+		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+			want_affine = 1;
+		}
+	}
+
+	if (want_affine)
+		tmpmask = tsk_cpus_allowed(p);
+	else
+		tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd);
+
+	if (rt_task(p))
+		cpu = select_cpu_for_wakeup(0, tmpmask);
+	else
+		cpu = select_cpu_for_wakeup(1, tmpmask);
+
+	return cpu;
+}
+
+static void track_load_rt(struct rq *rq, struct task_struct *p)
+{
+	unsigned long flag;
+	int firstbit;
+	struct rt_rq *first;
+	struct rt_prio_array *array = &rq->rt.active;
+
+	first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list);
+	firstbit = sched_find_first_bit(array->bitmap);
+
+	/* Maintaining rt.lowbit */
+	if (firstbit <= rq->rt.lowbit)
+		rq->rt.lowbit = p->prio;
+
+	if (rq->rt.lowbit < first->lowbit) {
+		write_lock_irqsave(&rt_list_lock, flag);
+		list_del(&rq->rt.bld_rt_list);
+		list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head);
+		write_unlock_irqrestore(&rt_list_lock, flag);
+	}
+}
+
+static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	unsigned int cpu;
+
+	if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && (get_nr_threads(p) > 1)))
+		cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags);
+	else {
+		if (rt_task(p))
+			cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags);
+		else
+			cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags);
+	}
+
+	return cpu;
+}
+
+static void bld_track_load_activate(struct rq *rq, struct task_struct *p)
+{
+	unsigned long flag;
+	if (rt_task(p)) {
+		track_load_rt(rq, p);
+	} else {
+		if (rq->cfs.pos != 2) {
+			struct cfs_rq *last;
+			last = list_entry(cfs_rq_head.prev, struct cfs_rq, bld_cfs_list);
+			if (rq->cfs.load.weight >= last->load.weight) {
+				write_lock_irqsave(&cfs_list_lock, flag);
+				list_del(&rq->cfs.bld_cfs_list);
+				list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+				rq->cfs.pos = 2; last->pos = 1;
+				write_unlock_irqrestore(&cfs_list_lock, flag);
+			}
+		}
+	}
+}
+
+static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p)
+{
+	unsigned long flag;
+	if (rt_task(p)) {
+		track_load_rt(rq, p);
+	} else {
+		if (rq->cfs.pos != 0) {
+			struct cfs_rq *first;
+			first = list_entry(cfs_rq_head.next, struct cfs_rq, bld_cfs_list);
+			if (rq->cfs.load.weight <= first->load.weight) {
+				write_lock_irqsave(&cfs_list_lock, flag);
+				list_del(&rq->cfs.bld_cfs_list);
+				list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+				rq->cfs.pos = 0; first->pos = 1;
+				write_unlock_irqrestore(&cfs_list_lock, flag);
+			}
+		}
+	}
+}
+#else
+static inline void bld_track_load_activate(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct *p)
+{
+}
+#endif /* CONFIG_BLD */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec1a286..e2c3cef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -24,6 +24,8 @@
  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  *              Thomas Gleixner, Mike Kravetz
+ *  2012-Feb	The Barbershop Load Distribution (BLD) algorithm - an alternate
+ *		CPU load distribution technique for kernel scheduler by Rakib Mullick.
  */
 
 #include <linux/mm.h>
@@ -86,6 +88,7 @@
 #include "sched.h"
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
+#include "bld.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -842,6 +845,8 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
+	if (!dl_task(p))
+		bld_track_load_activate(rq, p);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -849,6 +854,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
+	if (!dl_task(p))
+		bld_track_load_deactivate(rq, p);
 }
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1409,7 +1416,14 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
+#ifndef	CONFIG_BLD
 	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+#else
+	if (dl_task(p))
+		cpu = dl_sched_class.select_task_rq(p, cpu, sd_flags, wake_flags);
+	else
+		cpu = bld_get_cpu(p, sd_flags, wake_flags);
+#endif
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -1579,7 +1593,11 @@ void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();
 
+#ifndef	CONFIG_BLD
 	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+#else
+	if (llist_empty(&this_rq()->wake_list))
+#endif
 		return;
 
 	/*
@@ -1601,13 +1619,16 @@ void scheduler_ipi(void)
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
+#ifndef	CONFIG_BLD
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+#endif
 	irq_exit();
 }
 
+#ifndef	CONFIG_BLD
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -1619,6 +1640,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
 			trace_sched_wake_idle_without_ipi(cpu);
 	}
 }
+#endif
 
 bool cpus_share_cache(int this_cpu, int that_cpu)
 {
@@ -1630,7 +1652,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 
-#if defined(CONFIG_SMP)
+#if defined(CONFIG_SMP) && !defined(CONFIG_BLD)
 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
 		ttwu_queue_remote(p, cpu);
@@ -1938,7 +1960,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 * Silence PROVE_RCU.
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	set_task_cpu(p, cpu);
+	__set_task_cpu(p, cpu);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2413,7 +2435,14 @@ void sched_exec(void)
 	int dest_cpu;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifndef	CONFIG_BLD
 	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+#else
+	if (dl_task(p))
+		dest_cpu = task_cpu(p);
+	else
+		dest_cpu = bld_get_cpu(p, SD_BALANCE_EXEC, 0);
+#endif
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
@@ -2530,8 +2559,10 @@ void scheduler_tick(void)
 
 #ifdef CONFIG_SMP
 	rq->idle_balance = idle_cpu(cpu);
+#ifndef	CONFIG_BLD
 	trigger_load_balance(rq);
 #endif
+#endif
 	rq_last_tick_reset(rq);
 }
 
@@ -7030,6 +7061,15 @@ void __init sched_init(void)
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
+#ifdef CONFIG_BLD
+		INIT_LIST_HEAD(&rq->cfs.bld_cfs_list);
+		list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+		rq->cfs.pos = 0;
+
+		INIT_LIST_HEAD(&rq->rt.bld_rt_list);
+		list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head);
+		rq->rt.lowbit = INT_MAX;
+#endif
 	}
 
 	set_load_weight(&init_task);
@@ -7070,6 +7110,9 @@ void __init sched_init(void)
 	init_sched_fair_class();
 
 	scheduler_running = 1;
+#ifdef	CONFIG_BLD
+	printk(KERN_INFO "BLD: An Alternate CPU load distributor activated.\n");
+#endif
 }
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86..20fc00c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4136,6 +4136,7 @@ static void task_waking_fair(struct task_struct *p)
 	record_wakee(p);
 }
 
+#ifndef	CONFIG_BLD
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * effective_load() calculates the load change as seen from the root_task_group
@@ -4585,6 +4586,7 @@ unlock:
 
 	return new_cpu;
 }
+#endif	/* CONFIG_BLD */
 
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
@@ -4880,6 +4882,7 @@ simple:
 	return p;
 
 idle:
+#ifndef	CONFIG_BLD
 	new_tasks = idle_balance(rq);
 	/*
 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
@@ -4891,7 +4894,7 @@ idle:
 
 	if (new_tasks > 0)
 		goto again;
-
+#endif
 	return NULL;
 }
 
@@ -6981,12 +6984,39 @@ static inline int on_null_domain(struct rq *rq)
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
  */
+#ifndef	CONFIG_BLD
 static struct {
 	cpumask_var_t idle_cpus_mask;
 	atomic_t nr_cpus;
 	unsigned long next_balance;     /* in jiffy units */
 } nohz ____cacheline_aligned;
 
+static inline void nohz_balance_exit_idle(int cpu)
+{
+	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+		/*
+		 * Completely isolated CPUs don't ever set, so we must test.
+		 */
+		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+			atomic_dec(&nohz.nr_cpus);
+		}
+		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+	}
+}
+
+static int sched_ilb_notifier(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DYING:
+		nohz_balance_exit_idle(smp_processor_id());
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static inline int find_new_ilb(void)
 {
 	int ilb = cpumask_first(nohz.idle_cpus_mask);
@@ -7024,20 +7054,7 @@ static void nohz_balancer_kick(void)
 	smp_send_reschedule(ilb_cpu);
 	return;
 }
-
-static inline void nohz_balance_exit_idle(int cpu)
-{
-	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
-		/*
-		 * Completely isolated CPUs don't ever set, so we must test.
-		 */
-		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
-			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-			atomic_dec(&nohz.nr_cpus);
-		}
-		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
-	}
-}
+#endif	/* CONFIG_BLD */
 
 static inline void set_cpu_sd_state_busy(void)
 {
@@ -7079,6 +7096,7 @@ unlock:
  */
 void nohz_balance_enter_idle(int cpu)
 {
+#ifndef	CONFIG_BLD
 	/*
 	 * If this cpu is going down, then nothing needs to be done.
 	 */
@@ -7097,23 +7115,10 @@ void nohz_balance_enter_idle(int cpu)
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 	atomic_inc(&nohz.nr_cpus);
 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
-}
-
-static int sched_ilb_notifier(struct notifier_block *nfb,
-					unsigned long action, void *hcpu)
-{
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DYING:
-		nohz_balance_exit_idle(smp_processor_id());
-		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
-	}
+#endif
 }
 #endif
 
-static DEFINE_SPINLOCK(balancing);
-
 /*
  * Scale the max load_balance interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
@@ -7123,6 +7128,9 @@ void update_max_interval(void)
 	max_load_balance_interval = HZ*num_online_cpus()/10;
 }
 
+#ifndef	CONFIG_BLD
+static DEFINE_SPINLOCK(balancing);
+
 /*
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
@@ -7371,6 +7379,7 @@ void trigger_load_balance(struct rq *rq)
 		nohz_balancer_kick();
 #endif
 }
+#endif	/* CONFIG_BLD */
 
 static void rq_online_fair(struct rq *rq)
 {
@@ -7816,7 +7825,9 @@ const struct sched_class fair_sched_class = {
 	.put_prev_task		= put_prev_task_fair,
 
 #ifdef CONFIG_SMP
+#ifndef	CONFIG_BLD
 	.select_task_rq		= select_task_rq_fair,
+#endif
 	.migrate_task_rq	= migrate_task_rq_fair,
 
 	.rq_online		= rq_online_fair,
@@ -7854,6 +7865,7 @@ void print_cfs_stats(struct seq_file *m, int cpu)
 
 __init void init_sched_fair_class(void)
 {
+#ifndef	CONFIG_BLD
 #ifdef CONFIG_SMP
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
@@ -7863,5 +7875,5 @@ __init void init_sched_fair_class(void)
 	cpu_notifier(sched_ilb_notifier, 0);
 #endif
 #endif /* SMP */
-
+#endif /* BLD */
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca..ea0946c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1295,6 +1295,7 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
+#ifndef	CONFIG_BLD
 static int
 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
@@ -1348,6 +1349,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 out:
 	return cpu;
 }
+#endif	/* CONFIG_BLD */
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
@@ -2112,7 +2114,9 @@ const struct sched_class rt_sched_class = {
 	.put_prev_task		= put_prev_task_rt,
 
 #ifdef CONFIG_SMP
+#ifndef	CONFIG_BLD
 	.select_task_rq		= select_task_rq_rt,
+#endif
 
 	.set_cpus_allowed       = set_cpus_allowed_rt,
 	.rq_online              = rq_online_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f..a00914d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -358,9 +358,8 @@ struct cfs_rq {
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
-
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
@@ -384,6 +383,11 @@ struct cfs_rq {
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_BLD
+	struct list_head bld_cfs_list;
+	char pos;
+#endif
 };
 
 static inline int rt_bandwidth_enabled(void)
@@ -417,12 +421,16 @@ struct rt_rq {
 	/* Nests inside the rq lock: */
 	raw_spinlock_t rt_runtime_lock;
 
+	struct rq *rq;
 #ifdef CONFIG_RT_GROUP_SCHED
 	unsigned long rt_nr_boosted;
 
-	struct rq *rq;
 	struct task_group *tg;
 #endif
+#ifdef CONFIG_BLD
+	struct list_head bld_rt_list;
+	int lowbit;
+#endif
 };
 
 /* Deadline class' related fields in a runqueue */




* Re: [ANNOUNCE] BLD-3.17 release.
  2014-10-11  6:20 [ANNOUNCE] BLD-3.17 release Rakib Mullick
@ 2014-10-13  8:01 ` Mike Galbraith
  2014-10-13 15:14   ` Rakib Mullick
  0 siblings, 1 reply; 5+ messages in thread
From: Mike Galbraith @ 2014-10-13  8:01 UTC (permalink / raw)
  To: Rakib Mullick; +Cc: linux-kernel

On Sat, 2014-10-11 at 12:20 +0600, Rakib Mullick wrote: 
> BLD (The Barbershop Load Distribution Algorithm) patch for Linux 3.17

I had a curiosity attack, played with it a little.

My little Q6600 box could be described as being "micro-numa", with two
pathetic little "nodes" connected by the worst interconnect this side of
tin cans and string.  Communicating tasks sorely missed sharing cache.

tbench
3.18.0-master
Throughput 287.411 MB/sec  1 clients  1 procs  max_latency=1.614 ms     1.000
Throughput 568.631 MB/sec  2 clients  2 procs  max_latency=1.942 ms     1.000
Throughput 1069.75 MB/sec  4 clients  4 procs  max_latency=18.494 ms    1.000
Throughput 1040.99 MB/sec  8 clients  8 procs  max_latency=17.364 ms    1.000

3.18.0-master-BLD                                                        vs master
Throughput 261.986 MB/sec  1 clients  1 procs  max_latency=11.943 ms     .911
Throughput 264.461 MB/sec  2 clients  2 procs  max_latency=11.884 ms     .465
Throughput 476.191 MB/sec  4 clients  4 procs  max_latency=11.497 ms     .445
Throughput 558.236 MB/sec  8 clients  8 procs  max_latency=9.008 ms      .536


TCP_RR 4 unbound clients
3.18.0-master
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.1 (127.0.0.1) port 0 AF_INET
Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate         
bytes  Bytes  bytes    bytes   secs.    per sec   

16384  87380  1        1       30.00    72436.65 
16384  87380  1        1       30.00    72438.55
16384  87380  1        1       30.00    72213.18
16384  87380  1        1       30.00    72493.48
                               sum     289581.86     1.000

3.18.0-master-BLD
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.1 (127.0.0.1) port 0 AF_INET
Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate         
bytes  Bytes  bytes    bytes   secs.    per sec   

16384  87380  1        1       30.00    29014.09
16384  87380  1        1       30.00    28804.53
16384  87380  1        1       30.00    28999.40
16384  87380  1        1       30.00    28901.84
                               sum     115719.86      .399 vs master




* Re: [ANNOUNCE] BLD-3.17 release.
  2014-10-13  8:01 ` Mike Galbraith
@ 2014-10-13 15:14   ` Rakib Mullick
  2014-10-13 18:07     ` Mike Galbraith
  0 siblings, 1 reply; 5+ messages in thread
From: Rakib Mullick @ 2014-10-13 15:14 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: linux-kernel

On 10/13/14, Mike Galbraith <umgwanakikbuti@gmail.com> wrote:
> On Sat, 2014-10-11 at 12:20 +0600, Rakib Mullick wrote:
>> BLD (The Barbershop Load Distribution Algorithm) patch for Linux 3.17
>
> I had a curiosity attack, played with it a little.
>
Thanks for showing your interest!

> My little Q6600 box could be described as being "micro-numa", with two
> pathetic little "nodes" connected by the worst interconnect this side of
> tin cans and string.  Communicating tasks sorely missed sharing cache.
>
> tbench
> 3.18.0-master
> Throughput 287.411 MB/sec  1 clients  1 procs  max_latency=1.614 ms     1.000
> Throughput 568.631 MB/sec  2 clients  2 procs  max_latency=1.942 ms     1.000
> Throughput 1069.75 MB/sec  4 clients  4 procs  max_latency=18.494 ms    1.000
> Throughput 1040.99 MB/sec  8 clients  8 procs  max_latency=17.364 ms    1.000
>
> 3.18.0-master-BLD                                                        vs master
> Throughput 261.986 MB/sec  1 clients  1 procs  max_latency=11.943 ms     .911
> Throughput 264.461 MB/sec  2 clients  2 procs  max_latency=11.884 ms     .465
> Throughput 476.191 MB/sec  4 clients  4 procs  max_latency=11.497 ms     .445
> Throughput 558.236 MB/sec  8 clients  8 procs  max_latency=9.008 ms      .536
>
>
> TCP_RR 4 unbound clients
> 3.18.0-master
> TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.1 (127.0.0.1) port 0 AF_INET
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       30.00    72436.65
> 16384  87380  1        1       30.00    72438.55
> 16384  87380  1        1       30.00    72213.18
> 16384  87380  1        1       30.00    72493.48
>                                sum     289581.86     1.000
>
> 3.18.0-master-BLD
> TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.1 (127.0.0.1) port 0 AF_INET
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       30.00    29014.09
> 16384  87380  1        1       30.00    28804.53
> 16384  87380  1        1       30.00    28999.40
> 16384  87380  1        1       30.00    28901.84
>                                sum     115719.86      .399 vs master
>
>
Okay. From the numbers above it's apparent that BLD isn't doing well,
at least for the kind of system you have been using. I didn't have a
chance to run it on any kind of NUMA system; for that reason I've
marked it as "Not suitable for NUMA" in Kconfig, yet. Part of the
reason is that I didn't manage to try it out myself, and the other
reason is that it's easy to get things wrong if sched domains are
built improperly. I'm not sure what the sched configuration was in
your case. BLD assumes (or kind of blindly believes the system's
default sched domain topology) that tasks being woken up are cache
hot, and so it doesn't put those tasks on other sched domains; but if
that isn't the case then perhaps it misses out on balancing
opportunities, in which case CPU utilization will be improper.
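
In simplified form, the policy described above is roughly the
following (illustrative C, paraphrasing bld_pick_cpu_domain() from the
patch; not a literal copy):

static int bld_wakeup_cpu(struct task_struct *p, int waking_cpu, int sd_flags)
{
	struct cpumask *mask;

	/* Waking CPU is allowed for the task: treat the wakeup as
	 * affine and search only the task's allowed CPUs. */
	if ((sd_flags & SD_BALANCE_WAKE) &&
	    cpumask_test_cpu(waking_cpu, tsk_cpus_allowed(p)))
		mask = tsk_cpus_allowed(p);
	else	/* otherwise fall back to the previous CPU's domain span */
		mask = sched_domain_span(cpu_rq(task_cpu(p))->sd);

	return select_cpu_for_wakeup(rt_task(p) ? 0 : 1, mask);
}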

Can you please share the perf stat of the netperf runs? So far I have
seen reduced context-switch numbers with -BLD, with the drawback of a
huge increase in CPU-migration numbers. But on the kinds of systems
I've run so far, it seemed that too much CPU movement didn't cost
much. It could be wrong for NUMA systems, though.

Thanks,
Rakib


* Re: [ANNOUNCE] BLD-3.17 release.
  2014-10-13 15:14   ` Rakib Mullick
@ 2014-10-13 18:07     ` Mike Galbraith
  2014-10-14 15:06       ` Rakib Mullick
  0 siblings, 1 reply; 5+ messages in thread
From: Mike Galbraith @ 2014-10-13 18:07 UTC (permalink / raw)
  To: Rakib Mullick; +Cc: linux-kernel

On Mon, 2014-10-13 at 21:14 +0600, Rakib Mullick wrote:

> Okay. From the numbers above it's apparent that BLD isn't doing well,
> at least for the kind of system you have been using. I didn't have a
> chance to run it on any kind of NUMA system; for that reason I've
> marked it as "Not suitable for NUMA" in Kconfig, yet.

(yeah, a NUMA box would rip itself to shreds)

> Part of the reason is that I didn't manage to try it out myself, and
> the other reason is that it's easy to get things wrong if sched
> domains are built improperly. I'm not sure what the sched
> configuration was in your case. BLD assumes (or kind of blindly
> believes the system's default sched domain topology) that tasks being
> woken up are cache hot, and so it doesn't put those tasks on other
> sched domains; but if that isn't the case then perhaps it misses out
> on balancing opportunities, in which case CPU utilization will be
> improper.

Even when you have only one socket with a big L3, you don't really want
to bounce fast/light tasks around too frequently, L2 misses still hurt.

> Can you please share the perf stat of the netperf runs? So far I have
> seen reduced context-switch numbers with -BLD, with the drawback of a
> huge increase in CPU-migration numbers.

No need, it's L2 misses.  Q6600 has no L3 to mitigate the miss pain.

> But on the kinds of systems I've run so far, it seemed that too much
> CPU movement didn't cost much. It could be wrong for NUMA systems,
> though.

You can most definitely move even very fast/light tasks too much within
an L3, L2 misses can demolish throughput.  We had that problem.

-Mike



* Re: [ANNOUNCE] BLD-3.17 release.
  2014-10-13 18:07     ` Mike Galbraith
@ 2014-10-14 15:06       ` Rakib Mullick
  0 siblings, 0 replies; 5+ messages in thread
From: Rakib Mullick @ 2014-10-14 15:06 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: linux-kernel

On 10/14/14, Mike Galbraith <umgwanakikbuti@gmail.com> wrote:
> On Mon, 2014-10-13 at 21:14 +0600, Rakib Mullick wrote:
>
>> Okay. From the numbers above it's apparent that BLD isn't doing well,
>> at least for the kind of system you have been using. I didn't have a
>> chance to run it on any kind of NUMA system; for that reason I've
>> marked it as "Not suitable for NUMA" in Kconfig, yet.
>
> (yeah, a NUMA box would rip itself to shreds)
>
>> Part of the reason is that I didn't manage to try it out myself, and
>> the other reason is that it's easy to get things wrong if sched
>> domains are built improperly. I'm not sure what the sched
>> configuration was in your case. BLD assumes (or kind of blindly
>> believes the system's default sched domain topology) that tasks being
>> woken up are cache hot, and so it doesn't put those tasks on other
>> sched domains; but if that isn't the case then perhaps it misses out
>> on balancing opportunities, in which case CPU utilization will be
>> improper.
>
> Even when you have only one socket with a big L3, you don't really want
> to bounce fast/light tasks around too frequently, L2 misses still hurt.
>
>> Can you please share the perf stat of the netperf runs? So far I have
>> seen reduced context-switch numbers with -BLD, with the drawback of a
>> huge increase in CPU-migration numbers.
>
> No need, it's L2 misses.  Q6600 has no L3 to mitigate the miss pain.
>
I see.

>> But on the kinds of systems I've run so far, it seemed that too much
>> CPU movement didn't cost much. It could be wrong for NUMA systems,
>> though.
>
> You can most definitely move even very fast/light tasks too much within
> an L3, L2 misses can demolish throughput.  We had that problem.
>
So, L2 misses are the key here. So far I haven't tried to utilize the
various sched domain flags, like SD_SHARE_PKG_RESOURCES /
SD_SHARE_CPUPOWER, etc. Perhaps integrating those might help? I'll
take a look at those and try to see what happens, although testing
will be problematic.
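
A rough sketch of that kind of integration (illustrative only,
untested, and not part of any posted patch; it assumes the usual
domain walk is done under rcu_read_lock(), as in
select_task_rq_fair()):

/*
 * Restrict the wakeup search to CPUs that share package resources
 * (i.e. the LLC) with the task's previous CPU; fall back to the
 * task's allowed mask when no such domain level exists.
 */
static struct cpumask *bld_llc_span(struct task_struct *p, int prev_cpu)
{
	struct sched_domain *sd, *llc = NULL;

	for_each_domain(prev_cpu, sd) {
		if (sd->flags & SD_SHARE_PKG_RESOURCES)
			llc = sd;	/* remember the widest cache-sharing level */
	}

	return llc ? sched_domain_span(llc) : tsk_cpus_allowed(p);
}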

Thanks,
Rakib.

