From: KUROSAWA Takahiro <kurosawa@valinux.co.jp>
To: Paul Jackson <pj@sgi.com>
Cc: taka@valinux.co.jp, magnus.damm@gmail.com, dino@in.ibm.com,
	linux-kernel@vger.kernel.org
Subject: [PATCH 2/3] CPUMETER: CPU resource controller
Date: Mon, 26 Sep 2005 18:34:09 +0900
Message-ID: <20050926093432.E082F70044@sv1.valinux.co.jp>
In-Reply-To: <20050910015209.4f581b8a.pj@sgi.com>

This patch adds a CPU resource controller.  It lets us control the
percentage of CPU time consumed by tasks grouped under a cpu_rc
structure.  The controller adjusts each task's time_slice based on
feedback from the difference between the target value and the current
usage, steering the group's CPU usage toward the target.
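
To sketch the feedback loop (an illustration condensed from
cpu_rc_recalc_tsfactor() and cpu_rc_scale_timeslice() below, not part
of the patch itself):

	/* roughly once per CPU_RC_RECALC_INTERVAL, per group: */
	if (no group is hungry)
		ts_factor += CPU_RC_TSFACTOR_INC;
	else if (this group is hungry)
		ts_factor++;
	else
		ts_factor = ts_factor * guarantee * CPU_RC_LOAD_SCALE
				/ (load * CPU_RC_GUAR_SCALE);
	/* ... then clamp ts_factor to [1, CPU_RC_TSFACTOR_MAX] */

	/* at each time_slice allocation: */
	slice = slice * ts_factor / CPU_RC_TSFACTOR_MAX;

For example, a group guaranteed 30% of the CPU but measured at 60%
(guarantee = 30 with CPU_RC_GUAR_SCALE = 100, load = 600 with
CPU_RC_LOAD_SCALE = 1000) has its ts_factor halved, since
30 * 1000 / (600 * 100) = 1/2, so its tasks receive half-size
timeslices until their usage converges toward the target.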

Signed-off-by: KUROSAWA Takahiro <kurosawa@valinux.co.jp>

--- /dev/null
+++ to-work/include/linux/cpu_rc.h	2005-09-26 17:26:19.234918633 +0900
@@ -0,0 +1,67 @@
+#ifndef _LINUX_CPU_RC_H_
+#define _LINUX_CPU_RC_H_
+/*
+ *  CPU resource controller interface
+ *
+ *  Copyright 2005 FUJITSU LIMITED
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/cpuset.h>
+
+#ifdef CONFIG_CPU_RC
+
+#ifdef __KERNEL__
+void cpu_rc_init(void);
+unsigned int cpu_rc_scale_timeslice(task_t *tsk, unsigned int slice);
+void cpu_rc_account(task_t *tsk, unsigned long now);
+void cpu_rc_collect_hunger(task_t *tsk);
+
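+/* Record when the task became runnable; used for hunger detection. */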
+static inline void cpu_rc_record_activated(task_t *tsk, unsigned long now)
+{
+	tsk->last_activated = now;
+}
+
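+/* Record the size and timestamp of the task's time_slice allocation. */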
+static inline void cpu_rc_record_allocation(task_t *tsk,
+					    unsigned int slice,
+					    unsigned long now)
+{
+	if (slice == 0) {
+		/* minimal allocated time_slice is 1 (see sched_fork()). */
+		slice = 1;
+	}
+
+	tsk->last_slice = slice;
+	tsk->ts_alloced = now;
+}
+#endif /* __KERNEL__ */
+
+#else /* CONFIG_CPU_RC */
+
+#ifdef __KERNEL__
+static inline void cpu_rc_init(void) {}
+static inline void cpu_rc_account(task_t *tsk, unsigned long now) {}
+static inline void cpu_rc_collect_hunger(task_t *tsk) {}
+static inline void cpu_rc_record_activated(task_t *tsk, unsigned long now) {}
+static inline void cpu_rc_record_allocation(task_t *tsk,
+					    unsigned int slice,
+					    unsigned long now) {}
+
+static inline unsigned int cpu_rc_scale_timeslice(task_t *tsk,
+						  unsigned int slice)
+{
+	return slice;
+}
+#endif /* __KERNEL__ */
+
+#endif /* CONFIG_CPU_RC */
+
+#endif /* _LINUX_CPU_RC_H_ */
+
--- from-0001/include/linux/sched.h
+++ to-work/include/linux/sched.h	2005-09-26 17:26:19.236918355 +0900
@@ -769,6 +769,11 @@ struct task_struct {
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
 #endif
+#ifdef CONFIG_CPU_RC
+	unsigned int last_slice;
+	unsigned long ts_alloced;
+	unsigned long last_activated;
+#endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 };
 
--- from-0002/init/Kconfig
+++ to-work/init/Kconfig	2005-09-26 17:28:43.911747746 +0900
@@ -247,6 +247,14 @@ config CPUMETER
 
 	  Say N if unsure.
 
+config CPU_RC
+	bool "CPU resource controller"
+	help
+	  This option lets you control the CPU resource by scaling
+	  the timeslice allocated to each task.
+
+	  Say N if unsure.
+
 menuconfig EMBEDDED
 	bool "Configure standard kernel features (for small systems)"
 	help
--- from-0001/init/main.c
+++ to-work/init/main.c	2005-09-26 17:26:19.238918078 +0900
@@ -42,6 +42,7 @@
 #include <linux/writeback.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/cpu_rc.h>
 #include <linux/efi.h>
 #include <linux/unistd.h>
 #include <linux/rmap.h>
@@ -524,7 +525,7 @@ asmlinkage void __init start_kernel(void
 	proc_root_init();
 #endif
 	cpuset_init();
-
+	cpu_rc_init();
 	check_bugs();
 
 	acpi_early_init(); /* before LAPIC and SMP init */
--- from-0001/kernel/Makefile
+++ to-work/kernel/Makefile	2005-09-26 17:26:19.233918772 +0900
@@ -20,6 +20,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CPU_RC) += cpu_rc.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
--- /dev/null
+++ to-work/kernel/cpu_rc.c	2005-09-26 17:28:09.131607286 +0900
@@ -0,0 +1,245 @@
+/*
+ *  kernel/cpu_rc.c
+ *
+ *  CPU resource controller by scaling time_slice of the task.
+ *
+ *  Copyright 2005 FUJITSU LIMITED
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/cpu_rc.h>
+
+/* local macros */
+#define CPU_RC_SPREAD_PERIOD	(5 * HZ)
+#define CPU_RC_LOAD_SCALE	1000
+#define CPU_RC_GUAR_SCALE	100
+#define CPU_RC_TSFACTOR_MAX	CPU_RC_GUAR_SCALE
+#define CPU_RC_TSFACTOR_INC	5
+#define CPU_RC_RECALC_INTERVAL	HZ
+
+struct cpu_rc_domain {
+	spinlock_t lock;
+	unsigned int hungry_groups;
+	cpumask_t cpus;
+	int numcpus;
+};
+
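+/* Per-group controller state: the target CPU share (guarantee, in
+ * percent) and the current time_slice scaling factor (ts_factor). */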
+struct cpu_rc {
+	int guarantee;
+	int is_hungry;
+	unsigned int ts_factor;
+	unsigned long last_recalc;
+	struct cpu_rc_domain *rcd;
+	struct {
+		unsigned long timestamp;
+		unsigned int load;
+		int maybe_hungry;
+	} stat[NR_CPUS];	/* XXX  need alignment */
+};
+
+static struct cpu_rc *cpu_rc_get(task_t *tsk);
+
+static inline void cpu_rc_lock(struct cpu_rc *cr)
+{
+	spin_lock(&cr->rcd->lock);
+}
+
+static inline void cpu_rc_unlock(struct cpu_rc *cr)
+{
+	spin_unlock(&cr->rcd->lock);
+}
+
+static inline int cpu_rc_is_hungry(struct cpu_rc *cr)
+{
+	return cr->is_hungry;
+}
+
+static inline void cpu_rc_set_hungry(struct cpu_rc *cr)
+{
+	if (!cr->is_hungry) {
+		cr->rcd->hungry_groups++;
+		cr->is_hungry = !cr->is_hungry;
+	}
+}
+
+static inline void cpu_rc_set_satisfied(struct cpu_rc *cr)
+{
+	if (cr->is_hungry) {
+		cr->rcd->hungry_groups--;
+		cr->is_hungry = !cr->is_hungry;
+	}
+}
+
+static inline int cpu_rc_is_anyone_hungry(struct cpu_rc *cr)
+{
+	return cr->rcd->hungry_groups > 0;
+}
+
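+/* Recompute ts_factor: average the group's per-CPU load, classify the
+ * group as hungry or satisfied against its guarantee, then grow the
+ * factor or scale it down toward the guaranteed share. */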
+static inline void cpu_rc_recalc_tsfactor(struct cpu_rc *cr)
+{
+	unsigned int load;
+	int maybe_hungry;
+	int i, n;
+
+	n = 0;
+	load = 0;
+	maybe_hungry = 0;
+
+	cpu_rc_lock(cr);
+	for_each_cpu_mask(i, cr->rcd->cpus) {
+		load += cr->stat[i].load;
+		maybe_hungry += cr->stat[i].maybe_hungry;
+		cr->stat[i].maybe_hungry = 0;
+		n++;
+	}
+	load = load / n;
+
+	if (load * CPU_RC_GUAR_SCALE >= cr->guarantee * CPU_RC_LOAD_SCALE) {
+		cpu_rc_set_satisfied(cr);
+	} else if (maybe_hungry > 0) {
+		cpu_rc_set_hungry(cr);
+	} else {
+		cpu_rc_set_satisfied(cr);
+	}
+
+	if (!cpu_rc_is_anyone_hungry(cr)) {
+		/* Everyone satisfied.  Extend time_slice. */
+		cr->ts_factor += CPU_RC_TSFACTOR_INC;
+	} else {
+		if (cpu_rc_is_hungry(cr)) {
+			/* Extend time_slice a little. */
+			cr->ts_factor++;
+		} else {
+			/* time_slice should be scaled. */
+			cr->ts_factor = cr->ts_factor * cr->guarantee
+				* CPU_RC_LOAD_SCALE
+				/ (load * CPU_RC_GUAR_SCALE);
+		}
+	}
+
+	if (cr->ts_factor == 0) {
+		cr->ts_factor = 1;
+	} else if (cr->ts_factor > CPU_RC_TSFACTOR_MAX) {
+		cr->ts_factor = CPU_RC_TSFACTOR_MAX;
+	}
+
+	cr->last_recalc = jiffies;
+
+	cpu_rc_unlock(cr);
+}
+
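+/* Scale a raw time_slice by the group's ts_factor, recalculating the
+ * factor at most once per CPU_RC_RECALC_INTERVAL.  Returns at least 1. */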
+unsigned int cpu_rc_scale_timeslice(task_t *tsk, unsigned int slice)
+{
+	struct cpu_rc *cr;
+	unsigned int scaled;
+
+	cr = cpu_rc_get(tsk);
+	if (cr == NULL) {
+		return slice;
+	}
+
+	if (jiffies - cr->last_recalc > CPU_RC_RECALC_INTERVAL) {
+		cpu_rc_recalc_tsfactor(cr);
+	}
+
+	scaled = slice * cr->ts_factor / CPU_RC_TSFACTOR_MAX;
+	if (scaled == 0) {
+		scaled = 1;
+	}
+
+	return scaled;
+}
+
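+/* Fold the CPU time consumed since the last time_slice allocation into
+ * the group's per-CPU load average, decayed over CPU_RC_SPREAD_PERIOD. */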
+void cpu_rc_account(task_t *tsk, unsigned long now)
+{
+	struct cpu_rc *cr;
+	int cpu = smp_processor_id();
+	unsigned long last;
+	unsigned int load, tsk_load;
+	unsigned long base, update;
+
+	if (tsk == idle_task(task_cpu(tsk))) {
+		return;
+	}
+
+	cr = cpu_rc_get(tsk);
+	if (cr == NULL) {
+		return;
+	}
+
+	base = now - tsk->ts_alloced;
+	if (base == 0) {
+		/* Duration too small; cannot collect statistics. */
+		return;
+	}
+
+	tsk_load = CPU_RC_LOAD_SCALE * (tsk->last_slice - tsk->time_slice)
+		+ (CPU_RC_LOAD_SCALE - 1);
+	if (base > CPU_RC_SPREAD_PERIOD) {
+		tsk_load = CPU_RC_SPREAD_PERIOD * tsk_load / base;
+	}
+
+	last = cr->stat[cpu].timestamp;
+	update = now - last;
+	if (update > CPU_RC_SPREAD_PERIOD) {
+		/* statistics data obsolete. */
+		load = 0;
+		update = CPU_RC_SPREAD_PERIOD;
+	} else {
+		load = cr->stat[cpu].load * (CPU_RC_SPREAD_PERIOD - update);
+	}
+
+	cr->stat[cpu].timestamp = now;
+	cr->stat[cpu].load = (load + tsk_load) / CPU_RC_SPREAD_PERIOD;
+}
+
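+/* If the task waited on a runqueue long enough that its effective CPU
+ * share fell below the group's per-CPU guarantee, count the group as
+ * possibly hungry on this CPU. */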
+void cpu_rc_collect_hunger(task_t *tsk)
+{
+	struct cpu_rc *cr;
+	unsigned long wait;
+	int cpu = smp_processor_id();
+
+	if (tsk == idle_task(task_cpu(tsk))) {
+		return;
+	}
+
+	if (tsk->last_activated == 0) {
+		return;
+	}
+
+	cr = cpu_rc_get(tsk);
+	if (cr == NULL) {
+		tsk->last_activated = 0;
+		return;
+	}
+
+	wait = jiffies - tsk->last_activated;
+	if (CPU_RC_GUAR_SCALE * tsk->last_slice
+	    / (wait + tsk->last_slice) < cr->guarantee / cr->rcd->numcpus) {
+		cr->stat[cpu].maybe_hungry++;
+	}
+
+	tsk->last_activated = 0;
+}
+
+void cpu_rc_init(void)
+{
+}
--- from-0001/kernel/sched.c
+++ to-work/kernel/sched.c	2005-09-26 17:26:19.231919049 +0900
@@ -41,6 +41,7 @@
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/cpu_rc.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
 #include <linux/seq_file.h>
@@ -168,10 +169,17 @@
 
 static unsigned int task_timeslice(task_t *p)
 {
+	unsigned int timeslice;
+
 	if (p->static_prio < NICE_TO_PRIO(0))
-		return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
+		timeslice = SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
 	else
-		return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
+		timeslice = SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
+
+	if (!TASK_INTERACTIVE(p))
+		timeslice = cpu_rc_scale_timeslice(p, timeslice);
+
+	return timeslice;
 }
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
@@ -660,6 +668,7 @@ static int effective_prio(task_t *p)
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
+	cpu_rc_record_activated(p, jiffies);
 	enqueue_task(p, rq->active);
 	rq->nr_running++;
 }
@@ -1294,6 +1303,7 @@ int fastcall wake_up_state(task_t *p, un
 void fastcall sched_fork(task_t *p, int clone_flags)
 {
 	int cpu = get_cpu();
+	unsigned long now = jiffies;
 
 #ifdef CONFIG_SMP
 	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
@@ -1330,9 +1340,12 @@ void fastcall sched_fork(task_t *p, int 
 	 * The remainder of the first timeslice might be recovered by
 	 * the parent if the child exits early enough.
 	 */
+	cpu_rc_account(current, now);
 	p->first_time_slice = 1;
 	current->time_slice >>= 1;
 	p->timestamp = sched_clock();
+	cpu_rc_record_allocation(current, current->time_slice, now);
+	cpu_rc_record_allocation(p, p->time_slice, now);
 	if (unlikely(!current->time_slice)) {
 		/*
 		 * This case is rare, it happens when the parent has only
@@ -1390,6 +1403,7 @@ void fastcall wake_up_new_task(task_t * 
 				p->array = current->array;
 				p->array->nr_active++;
 				rq->nr_running++;
+				cpu_rc_record_activated(p, jiffies);
 			}
 			set_need_resched();
 		} else
@@ -1440,16 +1454,21 @@ void fastcall sched_exit(task_t * p)
 {
 	unsigned long flags;
 	runqueue_t *rq;
+	unsigned long now = jiffies;
 
 	/*
 	 * If the child was a (relative-) CPU hog then decrease
 	 * the sleep_avg of the parent as well.
 	 */
 	rq = task_rq_lock(p->parent, &flags);
+	cpu_rc_account(p, now);
 	if (p->first_time_slice) {
+		cpu_rc_account(p->parent, now);
 		p->parent->time_slice += p->time_slice;
 		if (unlikely(p->parent->time_slice > task_timeslice(p)))
 			p->parent->time_slice = task_timeslice(p);
+		cpu_rc_record_allocation(p->parent,
+					 p->parent->time_slice, now);
 	}
 	if (p->sleep_avg < p->parent->sleep_avg)
 		p->parent->sleep_avg = p->parent->sleep_avg /
@@ -2487,6 +2506,7 @@ void scheduler_tick(void)
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
 	unsigned long long now = sched_clock();
+	unsigned long jnow = jiffies;
 
 	update_cpu_clock(p, rq, now);
 
@@ -2521,6 +2541,9 @@ void scheduler_tick(void)
 			p->time_slice = task_timeslice(p);
 			p->first_time_slice = 0;
 			set_tsk_need_resched(p);
+#ifdef CONFIG_CPU_RC
+			/* XXX  need accounting even for rt_task? */
+#endif
 
 			/* put it at the end of the queue: */
 			requeue_task(p, rq->active);
@@ -2530,9 +2553,12 @@ void scheduler_tick(void)
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
+		cpu_rc_account(p, jnow);
 		p->prio = effective_prio(p);
 		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
+		cpu_rc_record_allocation(p, p->time_slice, jnow);
+		cpu_rc_record_activated(p, jnow);
 
 		if (!rq->expired_timestamp)
 			rq->expired_timestamp = jiffies;
@@ -2891,6 +2917,7 @@ switch_tasks:
 	rcu_qsctr_inc(task_cpu(prev));
 
 	update_cpu_clock(prev, rq, now);
+	cpu_rc_collect_hunger(next);
 
 	prev->sleep_avg -= run_time;
 	if ((long)prev->sleep_avg <= 0)

