All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
@ 2012-06-16 17:18 Chen
  2012-06-17  6:24 ` Mike Galbraith
  0 siblings, 1 reply; 10+ messages in thread
From: Chen @ 2012-06-16 17:18 UTC (permalink / raw)
  To: linux-kernel; +Cc: mou Chen

[-- Attachment #1: Type: text/plain, Size: 951 bytes --]

This is the Googlecode link of RIFS series.
http://code.google.com/p/rifs-scheduler/

Here is the download link for RIFS.ES-v1-kernel3.4.x:
http://rifs-scheduler.googlecode.com/files/RIFS.ES-v1-kernel3.4.x

The design of RIFS/ES:

First let me list out the 3 equations of SystemV scheduling.

  0.cpu = cpu +1;
  1.cpu = cpu/2;
  2.priority = (cpu/2) + base level priority;

These 3 equations are used to determine the scheduling priority if a
task. Here there is no any complex detection of interactive task(O(1)
failed on desktop because of the complex detection). O(1) did a
hard-coding and kerp focusing on how to detect a whether the task is
an interactive task or not.
CFS is good with its beautiful design. But in my opinion i think the
sleeper feature will destroy the beautiful design.

RIFS-ES adopt the design of System V scheduler.
It can be interactive. On my box everything is very smopth
                  Chen

[-- Attachment #2: RIFS.ES-v1-kernel3.4.x --]
[-- Type: application/octet-stream, Size: 289856 bytes --]

diff -ruN linux-3.4.1/fs/proc/base.c linux-3.4.1-RIFS/fs/proc/base.c
--- linux-3.4.1/fs/proc/base.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/fs/proc/base.c	2012-06-06 15:02:18.000000000 +0800
@@ -342,7 +342,7 @@
 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 {
 	return sprintf(buffer, "%llu %llu %lu\n",
-			(unsigned long long)task->se.sum_exec_runtime,
+			(unsigned long long)tsk_seruntime(task),
 			(unsigned long long)task->sched_info.run_delay,
 			task->sched_info.pcount);
 }
diff -ruN linux-3.4.1/include/linux/init_task.h linux-3.4.1-RIFS/include/linux/init_task.h
--- linux-3.4.1/include/linux/init_task.h	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/include/linux/init_task.h	2012-06-06 14:50:29.000000000 +0800
@@ -132,12 +132,68 @@
 # define INIT_PERF_EVENTS(tsk)
 #endif
 
-#define INIT_TASK_COMM "swapper"
-
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
  */
+#ifdef CONFIG_SCHED_RIFS
+#define INIT_TASK_COMM "RIFS"
+#define INIT_TASK(tsk)	\
+{									\
+	.state		= 0,						\
+	.stack		= &init_thread_info,				\
+	.usage		= ATOMIC_INIT(2),				\
+	.flags		= PF_KTHREAD,					\
+	.prio		= NORMAL_PRIO,					\
+	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= NORMAL_PRIO,					\
+	.policy		= SCHED_NORMAL,					\
+	.cpus_allowed	= CPU_MASK_ALL,					\
+	.mm		= NULL,						\
+	.active_mm	= &init_mm,					\
+	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.time_slice	= HZ,					\
+	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
+	INIT_PUSHABLE_TASKS(tsk)					\
+	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
+	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
+	.real_parent	= &tsk,						\
+	.parent		= &tsk,						\
+	.children	= LIST_HEAD_INIT(tsk.children),			\
+	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.group_leader	= &tsk,						\
+	RCU_INIT_POINTER(.real_cred, &init_cred),			\
+	RCU_INIT_POINTER(.cred, &init_cred),				\
+	.comm		= INIT_TASK_COMM,				\
+	.thread		= INIT_THREAD,					\
+	.fs		= &init_fs,					\
+	.files		= &init_files,					\
+	.signal		= &init_signals,				\
+	.sighand	= &init_sighand,				\
+	.nsproxy	= &init_nsproxy,				\
+	.pending	= {						\
+		.list = LIST_HEAD_INIT(tsk.pending.list),		\
+		.signal = {{0}}},					\
+	.blocked	= {{0}},					\
+	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
+	.journal_info	= NULL,						\
+	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
+	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	.pids = {							\
+		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
+		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
+		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
+	},								\
+	INIT_IDS							\
+	INIT_PERF_EVENTS(tsk)						\
+	INIT_TRACE_IRQFLAGS						\
+	INIT_LOCKDEP							\
+	INIT_FTRACE_GRAPH						\
+	INIT_TRACE_RECURSION						\
+	INIT_TASK_RCU_PREEMPT(tsk)					\
+}
+#else /* CONFIG_SCHED_RIFS */
 #define INIT_TASK(tsk)	\
 {									\
 	.state		= 0,						\
@@ -201,7 +257,7 @@
 	INIT_TASK_RCU_PREEMPT(tsk)					\
 	INIT_CPUSET_SEQ							\
 }
-
+#endif
 
 #define INIT_CPU_TIMERS(cpu_timers)					\
 {									\
diff -ruN linux-3.4.1/include/linux/init_task (复件).h linux-3.4.1-RIFS/include/linux/init_task (复件).h
--- linux-3.4.1/include/linux/init_task (复件).h	1970-01-01 08:00:00.000000000 +0800
+++ linux-3.4.1-RIFS/include/linux/init_task (复件).h	2012-06-01 15:18:44.000000000 +0800
@@ -0,0 +1,217 @@
+#ifndef _LINUX__INIT_TASK_H
+#define _LINUX__INIT_TASK_H
+
+#include <linux/rcupdate.h>
+#include <linux/irqflags.h>
+#include <linux/utsname.h>
+#include <linux/lockdep.h>
+#include <linux/ftrace.h>
+#include <linux/ipc.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/securebits.h>
+#include <net/net_namespace.h>
+
+#ifdef CONFIG_SMP
+# define INIT_PUSHABLE_TASKS(tsk)					\
+	.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
+#else
+# define INIT_PUSHABLE_TASKS(tsk)
+#endif
+
+extern struct files_struct init_files;
+extern struct fs_struct init_fs;
+
+#ifdef CONFIG_CGROUPS
+#define INIT_GROUP_RWSEM(sig)						\
+	.group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
+#else
+#define INIT_GROUP_RWSEM(sig)
+#endif
+
+#ifdef CONFIG_CPUSETS
+#define INIT_CPUSET_SEQ							\
+	.mems_allowed_seq = SEQCNT_ZERO,
+#else
+#define INIT_CPUSET_SEQ
+#endif
+
+#define INIT_SIGNALS(sig) {						\
+	.nr_threads	= 1,						\
+	.wait_chldexit	= __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
+	.shared_pending	= { 						\
+		.list = LIST_HEAD_INIT(sig.shared_pending.list),	\
+		.signal =  {{0}}},					\
+	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
+	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
+	.rlim		= INIT_RLIMITS,					\
+	.cputimer	= { 						\
+		.cputime = INIT_CPUTIME,				\
+		.running = 0,						\
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
+	},								\
+	.cred_guard_mutex =						\
+		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
+	INIT_GROUP_RWSEM(sig)						\
+}
+
+extern struct nsproxy init_nsproxy;
+
+#define INIT_SIGHAND(sighand) {						\
+	.count		= ATOMIC_INIT(1), 				\
+	.action		= { { { .sa_handler = SIG_DFL, } }, },		\
+	.siglock	= __SPIN_LOCK_UNLOCKED(sighand.siglock),	\
+	.signalfd_wqh	= __WAIT_QUEUE_HEAD_INITIALIZER(sighand.signalfd_wqh),	\
+}
+
+extern struct group_info init_groups;
+
+#define INIT_STRUCT_PID {						\
+	.count 		= ATOMIC_INIT(1),				\
+	.tasks		= {						\
+		{ .first = NULL },					\
+		{ .first = NULL },					\
+		{ .first = NULL },					\
+	},								\
+	.level		= 0,						\
+	.numbers	= { {						\
+		.nr		= 0,					\
+		.ns		= &init_pid_ns,				\
+		.pid_chain	= { .next = NULL, .pprev = NULL },	\
+	}, }								\
+}
+
+#define INIT_PID_LINK(type) 					\
+{								\
+	.node = {						\
+		.next = NULL,					\
+		.pprev = NULL,					\
+	},							\
+	.pid = &init_struct_pid,				\
+}
+
+#ifdef CONFIG_AUDITSYSCALL
+#define INIT_IDS \
+	.loginuid = -1, \
+	.sessionid = -1,
+#else
+#define INIT_IDS
+#endif
+
+#ifdef CONFIG_RCU_BOOST
+#define INIT_TASK_RCU_BOOST()						\
+	.rcu_boost_mutex = NULL,
+#else
+#define INIT_TASK_RCU_BOOST()
+#endif
+#ifdef CONFIG_TREE_PREEMPT_RCU
+#define INIT_TASK_RCU_TREE_PREEMPT()					\
+	.rcu_blocked_node = NULL,
+#else
+#define INIT_TASK_RCU_TREE_PREEMPT(tsk)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
+#define INIT_TASK_RCU_PREEMPT(tsk)					\
+	.rcu_read_lock_nesting = 0,					\
+	.rcu_read_unlock_special = 0,					\
+	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),		\
+	INIT_TASK_RCU_TREE_PREEMPT()					\
+	INIT_TASK_RCU_BOOST()
+#else
+#define INIT_TASK_RCU_PREEMPT(tsk)
+#endif
+
+extern struct cred init_cred;
+
+#ifdef CONFIG_PERF_EVENTS
+# define INIT_PERF_EVENTS(tsk)					\
+	.perf_event_mutex = 						\
+		 __MUTEX_INITIALIZER(tsk.perf_event_mutex),		\
+	.perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
+#else
+# define INIT_PERF_EVENTS(tsk)
+#endif
+
+#define INIT_TASK_COMM "swapper"
+
+/*
+ *  INIT_TASK is used to set up the first task table, touch at
+ * your own risk!. Base=0, limit=0x1fffff (=2MB)
+ */
+#define INIT_TASK(tsk)	\
+{									\
+	.state		= 0,						\
+	.stack		= &init_thread_info,				\
+	.usage		= ATOMIC_INIT(2),				\
+	.flags		= PF_KTHREAD,					\
+	.prio		= MAX_PRIO-20,					\
+	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= MAX_PRIO-20,					\
+	.policy		= SCHED_NORMAL,					\
+	.cpus_allowed	= CPU_MASK_ALL,					\
+	.mm		= NULL,						\
+	.active_mm	= &init_mm,					\
+	.se		= {						\
+		.group_node 	= LIST_HEAD_INIT(tsk.se.group_node),	\
+	},								\
+	.rt		= {						\
+		.run_list	= LIST_HEAD_INIT(tsk.rt.run_list),	\
+		.time_slice	= RR_TIMESLICE,				\
+		.nr_cpus_allowed = NR_CPUS,				\
+	},								\
+	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
+	INIT_PUSHABLE_TASKS(tsk)					\
+	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
+	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
+	.real_parent	= &tsk,						\
+	.parent		= &tsk,						\
+	.children	= LIST_HEAD_INIT(tsk.children),			\
+	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.group_leader	= &tsk,						\
+	RCU_INIT_POINTER(.real_cred, &init_cred),			\
+	RCU_INIT_POINTER(.cred, &init_cred),				\
+	.comm		= INIT_TASK_COMM,				\
+	.thread		= INIT_THREAD,					\
+	.fs		= &init_fs,					\
+	.files		= &init_files,					\
+	.signal		= &init_signals,				\
+	.sighand	= &init_sighand,				\
+	.nsproxy	= &init_nsproxy,				\
+	.pending	= {						\
+		.list = LIST_HEAD_INIT(tsk.pending.list),		\
+		.signal = {{0}}},					\
+	.blocked	= {{0}},					\
+	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
+	.journal_info	= NULL,						\
+	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),	\
+	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	.pids = {							\
+		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
+		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
+		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
+	},								\
+	.thread_group	= LIST_HEAD_INIT(tsk.thread_group),		\
+	INIT_IDS							\
+	INIT_PERF_EVENTS(tsk)						\
+	INIT_TRACE_IRQFLAGS						\
+	INIT_LOCKDEP							\
+	INIT_FTRACE_GRAPH						\
+	INIT_TRACE_RECURSION						\
+	INIT_TASK_RCU_PREEMPT(tsk)					\
+	INIT_CPUSET_SEQ							\
+}
+
+
+#define INIT_CPU_TIMERS(cpu_timers)					\
+{									\
+	LIST_HEAD_INIT(cpu_timers[0]),					\
+	LIST_HEAD_INIT(cpu_timers[1]),					\
+	LIST_HEAD_INIT(cpu_timers[2]),					\
+}
+
+/* Attach to the init_task data structure for proper alignment */
+#define __init_task_data __attribute__((__section__(".data..init_task")))
+
+
+#endif
diff -ruN linux-3.4.1/include/linux/sched.h linux-3.4.1-RIFS/include/linux/sched.h
--- linux-3.4.1/include/linux/sched.h	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/include/linux/sched.h	2012-06-06 14:57:06.000000000 +0800
@@ -37,9 +37,13 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
-/* SCHED_ISO: reserved but not implemented yet */
 #define SCHED_IDLE		5
-/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
+#define SCHED_IDLEPRIO		SCHED_IDLE
+#ifdef CONFIG_SCHED_RIFS
+#define SCHED_MAX		(SCHED_IDLEPRIO)
+#define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
+#endif
+
 #define SCHED_RESET_ON_FORK     0x40000000
 
 #ifdef __KERNEL__
@@ -268,8 +272,6 @@
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
-extern int runqueue_is_locked(int cpu);
-
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern void select_nohz_load_balancer(int stop_tick);
 extern void set_cpu_sd_state_idle(void);
@@ -1270,15 +1272,33 @@
 
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
-	int on_cpu;
 #endif
+#ifndef CONFIG_SCHED_RIFS
+	int on_cpu;
 	int on_rq;
-
+#else
+	bool on_cpu;
+	bool on_rq;
+#endif
 	int prio, static_prio, normal_prio;
 	unsigned int rt_priority;
+#ifdef CONFIG_SCHED_RIFS
+	int time_slice;
+	u64 crt_time;
+	u64 run_time;
+	u64 run_scale;
+	struct list_head run_list;
+	u64 last_ran;
+	u64 sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_SMP
+	bool sticky; /* Soft affined flag */
+#endif
+	unsigned long rt_timeout;
+#else /* CONFIG_SCHED_RIFS */
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* list of struct preempt_notifier: */
@@ -1391,6 +1411,9 @@
 
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
+#ifdef CONFIG_SCHED_RIFS
+	unsigned long utime_pc, stime_pc;
+#endif
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	cputime_t prev_utime, prev_stime;
 #endif
@@ -1619,6 +1642,55 @@
 #endif
 };
 
+#ifdef CONFIG_SCHED_RIFS
+bool grunqueue_is_locked(void);
+void grq_unlock_wait(void);
+void cpu_scaling(int cpu);
+void cpu_nonscaling(int cpu);
+bool above_background_load(void);
+#define tsk_seruntime(t)		((t)->sched_time)
+#define tsk_rttimeout(t)		((t)->rt_timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+}
+
+static inline int runqueue_is_locked(int cpu)
+{
+	return grunqueue_is_locked();
+}
+
+void print_scheduler_version(void);
+
+#else /* CFS */
+extern int runqueue_is_locked(int cpu);
+static inline void cpu_scaling(int cpu)
+{
+}
+
+static inline void cpu_nonscaling(int cpu)
+{
+}
+#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)
+#define tsk_rttimeout(t)	((t)->rt.timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
+}
+
+static inline void print_scheduler_version(void)
+{
+	printk(KERN_INFO"CFS CPU scheduler.\n");
+}
+
+/* Anyone feel like implementing this? */
+static inline bool above_background_load(void)
+{
+	return false;
+}
+#endif /* CONFIG_SCHED_RIFS */
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
@@ -1636,10 +1708,20 @@
  */
 
 #define MAX_USER_RT_PRIO	100
-#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#define MAX_RT_PRIO		(MAX_USER_RT_PRIO)
+#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
 
+#ifdef CONFIG_SCHED_RIFS
+#define PRIO_RANGE		(40)
+#define MAX_PRIO		(MAX_RT_PRIO + PRIO_RANGE)
+//#define ISO_PRIO		(MAX_RT_PRIO) 已经被我干掉,哈哈
+#define NORMAL_PRIO		(MAX_RT_PRIO + 1)
+#define IDLE_PRIO		(MAX_PRIO + 1)
+#define PRIO_LIMIT		((IDLE_PRIO) + 1)
+#else /* CONFIG_SCHED_RIFS */
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
-#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+#define NORMAL_PRIO		(DEFAULT_PRIO)
+#endif /* CONFIG_SCHED_RIFS */
 
 static inline int rt_prio(int prio)
 {
@@ -2002,7 +2084,7 @@
 task_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_RIFS)
 extern void sched_exec(void);
 #else
 #define sched_exec()   {}
diff -ruN linux-3.4.1/include/linux/sched (复件).h linux-3.4.1-RIFS/include/linux/sched (复件).h
--- linux-3.4.1/include/linux/sched (复件).h	1970-01-01 08:00:00.000000000 +0800
+++ linux-3.4.1-RIFS/include/linux/sched (复件).h	2012-06-01 15:18:44.000000000 +0800
@@ -0,0 +1,2823 @@
+#ifndef _LINUX_SCHED_H
+#define _LINUX_SCHED_H
+
+/*
+ * cloning flags:
+ */
+#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
+#define CLONE_VM	0x00000100	/* set if VM shared between processes */
+#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
+#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
+#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
+#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
+#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
+#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
+#define CLONE_THREAD	0x00010000	/* Same thread group? */
+#define CLONE_NEWNS	0x00020000	/* New namespace group? */
+#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
+#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
+#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
+#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
+#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
+#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
+#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
+/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
+   and is now available for re-use. */
+#define CLONE_NEWUTS		0x04000000	/* New utsname group? */
+#define CLONE_NEWIPC		0x08000000	/* New ipcs */
+#define CLONE_NEWUSER		0x10000000	/* New user namespace */
+#define CLONE_NEWPID		0x20000000	/* New pid namespace */
+#define CLONE_NEWNET		0x40000000	/* New network namespace */
+#define CLONE_IO		0x80000000	/* Clone io context */
+
+/*
+ * Scheduling policies
+ */
+#define SCHED_NORMAL		0
+#define SCHED_FIFO		1
+#define SCHED_RR		2
+#define SCHED_BATCH		3
+/* SCHED_ISO: reserved but not implemented yet */
+#define SCHED_IDLE		5
+/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
+#define SCHED_RESET_ON_FORK     0x40000000
+
+#ifdef __KERNEL__
+
+struct sched_param {
+	int sched_priority;
+};
+
+#include <asm/param.h>	/* for HZ */
+
+#include <linux/capability.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/thread_info.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/nodemask.h>
+#include <linux/mm_types.h>
+
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/cputime.h>
+
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/signal.h>
+#include <linux/compiler.h>
+#include <linux/completion.h>
+#include <linux/pid.h>
+#include <linux/percpu.h>
+#include <linux/topology.h>
+#include <linux/proportions.h>
+#include <linux/seccomp.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/rtmutex.h>
+
+#include <linux/time.h>
+#include <linux/param.h>
+#include <linux/resource.h>
+#include <linux/timer.h>
+#include <linux/hrtimer.h>
+#include <linux/task_io_accounting.h>
+#include <linux/latencytop.h>
+#include <linux/cred.h>
+#include <linux/llist.h>
+
+#include <asm/processor.h>
+
+struct exec_domain;
+struct futex_pi_state;
+struct robust_list_head;
+struct bio_list;
+struct fs_struct;
+struct perf_event_context;
+struct blk_plug;
+
+/*
+ * List of flags we want to share for kernel threads,
+ * if only because they are not used by them anyway.
+ */
+#define CLONE_KERNEL	(CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
+
+/*
+ * These are the constant used to fake the fixed-point load-average
+ * counting. Some notes:
+ *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
+ *    a load-average precision of 10 bits integer + 11 bits fractional
+ *  - if you want to count load-averages more often, you need more
+ *    precision, or rounding will get you. With 2-second counting freq,
+ *    the EXP_n values would be 1981, 2034 and 2043 if still using only
+ *    11 bit fractions.
+ */
+extern unsigned long avenrun[];		/* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+
+#define FSHIFT		11		/* nr of bits of precision */
+#define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
+#define LOAD_FREQ	(5*HZ+1)	/* 5 sec intervals */
+#define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
+#define EXP_5		2014		/* 1/exp(5sec/5min) */
+#define EXP_15		2037		/* 1/exp(5sec/15min) */
+
+#define CALC_LOAD(load,exp,n) \
+	load *= exp; \
+	load += n*(FIXED_1-exp); \
+	load >>= FSHIFT;
+
+extern unsigned long total_forks;
+extern int nr_threads;
+DECLARE_PER_CPU(unsigned long, process_counts);
+extern int nr_processes(void);
+extern unsigned long nr_running(void);
+extern unsigned long nr_uninterruptible(void);
+extern unsigned long nr_iowait(void);
+extern unsigned long nr_iowait_cpu(int cpu);
+extern unsigned long this_cpu_load(void);
+
+
+extern void calc_global_load(unsigned long ticks);
+
+extern unsigned long get_parent_ip(unsigned long addr);
+
+struct seq_file;
+struct cfs_rq;
+struct task_group;
+#ifdef CONFIG_SCHED_DEBUG
+extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
+extern void proc_sched_set_task(struct task_struct *p);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+#else
+static inline void
+proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{
+}
+static inline void proc_sched_set_task(struct task_struct *p)
+{
+}
+static inline void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+{
+}
+#endif
+
+/*
+ * Task state bitmask. NOTE! These bits are also
+ * encoded in fs/proc/array.c: get_task_state().
+ *
+ * We have two separate sets of flags: task->state
+ * is about runnability, while task->exit_state are
+ * about the task exiting. Confusing, but this way
+ * modifying one set can't modify the other one by
+ * mistake.
+ */
+#define TASK_RUNNING		0
+#define TASK_INTERRUPTIBLE	1
+#define TASK_UNINTERRUPTIBLE	2
+#define __TASK_STOPPED		4
+#define __TASK_TRACED		8
+/* in tsk->exit_state */
+#define EXIT_ZOMBIE		16
+#define EXIT_DEAD		32
+/* in tsk->state again */
+#define TASK_DEAD		64
+#define TASK_WAKEKILL		128
+#define TASK_WAKING		256
+#define TASK_STATE_MAX		512
+
+#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
+
+extern char ___assert_task_state[1 - 2*!!(
+		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
+
+/* Convenience macros for the sake of set_task_state */
+#define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
+#define TASK_STOPPED		(TASK_WAKEKILL | __TASK_STOPPED)
+#define TASK_TRACED		(TASK_WAKEKILL | __TASK_TRACED)
+
+/* Convenience macros for the sake of wake_up */
+#define TASK_NORMAL		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
+#define TASK_ALL		(TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
+
+/* get_task_state() */
+#define TASK_REPORT		(TASK_RUNNING | TASK_INTERRUPTIBLE | \
+				 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
+				 __TASK_TRACED)
+
+#define task_is_traced(task)	((task->state & __TASK_TRACED) != 0)
+#define task_is_stopped(task)	((task->state & __TASK_STOPPED) != 0)
+#define task_is_dead(task)	((task)->exit_state != 0)
+#define task_is_stopped_or_traced(task)	\
+			((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_contributes_to_load(task)	\
+				((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
+				 (task->flags & PF_FROZEN) == 0)
+
+#define __set_task_state(tsk, state_value)		\
+	do { (tsk)->state = (state_value); } while (0)
+#define set_task_state(tsk, state_value)		\
+	set_mb((tsk)->state, (state_value))
+
+/*
+ * set_current_state() includes a barrier so that the write of current->state
+ * is correctly serialised wrt the caller's subsequent test of whether to
+ * actually sleep:
+ *
+ *	set_current_state(TASK_UNINTERRUPTIBLE);
+ *	if (do_i_need_to_sleep())
+ *		schedule();
+ *
+ * If the caller does not need such serialisation then use __set_current_state()
+ */
+#define __set_current_state(state_value)			\
+	do { current->state = (state_value); } while (0)
+#define set_current_state(state_value)		\
+	set_mb(current->state, (state_value))
+
+/* Task command name length */
+#define TASK_COMM_LEN 16
+
+#include <linux/spinlock.h>
+
+/*
+ * This serializes "schedule()" and also protects
+ * the run-queue from deletions/modifications (but
+ * _adding_ to the beginning of the run-queue has
+ * a separate lock).
+ */
+extern rwlock_t tasklist_lock;
+extern spinlock_t mmlist_lock;
+
+struct task_struct;
+
+#ifdef CONFIG_PROVE_RCU
+extern int lockdep_tasklist_lock_is_held(void);
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+extern void sched_init(void);
+extern void sched_init_smp(void);
+extern asmlinkage void schedule_tail(struct task_struct *prev);
+extern void init_idle(struct task_struct *idle, int cpu);
+extern void init_idle_bootup_task(struct task_struct *idle);
+
+extern int runqueue_is_locked(int cpu);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
+extern void select_nohz_load_balancer(int stop_tick);
+extern void set_cpu_sd_state_idle(void);
+extern int get_nohz_timer_target(void);
+#else
+static inline void select_nohz_load_balancer(int stop_tick) { }
+static inline void set_cpu_sd_state_idle(void) { }
+#endif
+
+/*
+ * Only dump TASK_* tasks. (0 for all tasks)
+ */
+extern void show_state_filter(unsigned long state_filter);
+
+static inline void show_state(void)
+{
+	show_state_filter(0);
+}
+
+extern void show_regs(struct pt_regs *);
+
+/*
+ * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
+ * task), SP is the stack pointer of the first frame that should be shown in the back
+ * trace (or NULL if the entire call-chain of the task should be shown).
+ */
+extern void show_stack(struct task_struct *task, unsigned long *sp);
+
+void io_schedule(void);
+long io_schedule_timeout(long timeout);
+
+extern void cpu_init (void);
+extern void trap_init(void);
+extern void update_process_times(int user);
+extern void scheduler_tick(void);
+
+extern void sched_show_task(struct task_struct *p);
+
+#ifdef CONFIG_LOCKUP_DETECTOR
+extern void touch_softlockup_watchdog(void);
+extern void touch_softlockup_watchdog_sync(void);
+extern void touch_all_softlockup_watchdogs(void);
+extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
+				  void __user *buffer,
+				  size_t *lenp, loff_t *ppos);
+extern unsigned int  softlockup_panic;
+void lockup_detector_init(void);
+#else
+static inline void touch_softlockup_watchdog(void)
+{
+}
+static inline void touch_softlockup_watchdog_sync(void)
+{
+}
+static inline void touch_all_softlockup_watchdogs(void)
+{
+}
+static inline void lockup_detector_init(void)
+{
+}
+#endif
+
+#ifdef CONFIG_DETECT_HUNG_TASK
+extern unsigned int  sysctl_hung_task_panic;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_warnings;
+extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+					 void __user *buffer,
+					 size_t *lenp, loff_t *ppos);
+#else
+/* Avoid need for ifdefs elsewhere in the code */
+enum { sysctl_hung_task_timeout_secs = 0 };
+#endif
+
+/* Attach to any functions which should be ignored in wchan output. */
+#define __sched		__attribute__((__section__(".sched.text")))
+
+/* Linker adds these: start and end of __sched functions */
+extern char __sched_text_start[], __sched_text_end[];
+
+/* Is this address in the __sched functions? */
+extern int in_sched_functions(unsigned long addr);
+
+#define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
+extern signed long schedule_timeout(signed long timeout);
+extern signed long schedule_timeout_interruptible(signed long timeout);
+extern signed long schedule_timeout_killable(signed long timeout);
+extern signed long schedule_timeout_uninterruptible(signed long timeout);
+asmlinkage void schedule(void);
+extern void schedule_preempt_disabled(void);
+extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
+
+struct nsproxy;
+struct user_namespace;
+
+/*
+ * Default maximum number of active map areas, this limits the number of vmas
+ * per mm struct. Users can overwrite this number by sysctl but there is a
+ * problem.
+ *
+ * When a program's coredump is generated as ELF format, a section is created
+ * per a vma. In ELF, the number of sections is represented in unsigned short.
+ * This means the number of sections should be smaller than 65535 at coredump.
+ * Because the kernel adds some informative sections to a image of program at
+ * generating coredump, we need some margin. The number of extra sections is
+ * 1-3 now and depends on arch. We use "5" as safe margin, here.
+ */
+#define MAPCOUNT_ELF_CORE_MARGIN	(5)
+#define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
+
+extern int sysctl_max_map_count;
+
+#include <linux/aio.h>
+
+#ifdef CONFIG_MMU
+extern void arch_pick_mmap_layout(struct mm_struct *mm);
+extern unsigned long
+arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
+		       unsigned long, unsigned long);
+extern unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+			  unsigned long len, unsigned long pgoff,
+			  unsigned long flags);
+extern void arch_unmap_area(struct mm_struct *, unsigned long);
+extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
+#else
+static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
+#endif
+
+
+extern void set_dumpable(struct mm_struct *mm, int value);
+extern int get_dumpable(struct mm_struct *mm);
+
+/* mm flags */
+/* dumpable bits */
+#define MMF_DUMPABLE      0  /* core dump is permitted */
+#define MMF_DUMP_SECURELY 1  /* core file is readable only by root */
+
+#define MMF_DUMPABLE_BITS 2
+#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
+
+/* coredump filter bits */
+#define MMF_DUMP_ANON_PRIVATE	2
+#define MMF_DUMP_ANON_SHARED	3
+#define MMF_DUMP_MAPPED_PRIVATE	4
+#define MMF_DUMP_MAPPED_SHARED	5
+#define MMF_DUMP_ELF_HEADERS	6
+#define MMF_DUMP_HUGETLB_PRIVATE 7
+#define MMF_DUMP_HUGETLB_SHARED  8
+
+#define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
+#define MMF_DUMP_FILTER_BITS	7
+#define MMF_DUMP_FILTER_MASK \
+	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
+#define MMF_DUMP_FILTER_DEFAULT \
+	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+
+#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
+# define MMF_DUMP_MASK_DEFAULT_ELF	(1 << MMF_DUMP_ELF_HEADERS)
+#else
+# define MMF_DUMP_MASK_DEFAULT_ELF	0
+#endif
+					/* leave room for more dump flags */
+#define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
+#define MMF_VM_HUGEPAGE		17	/* set when VM_HUGEPAGE is set on vma */
+
+#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
+
+struct sighand_struct {
+	atomic_t		count;
+	struct k_sigaction	action[_NSIG];
+	spinlock_t		siglock;
+	wait_queue_head_t	signalfd_wqh;
+};
+
+struct pacct_struct {
+	int			ac_flag;
+	long			ac_exitcode;
+	unsigned long		ac_mem;
+	cputime_t		ac_utime, ac_stime;
+	unsigned long		ac_minflt, ac_majflt;
+};
+
+struct cpu_itimer {
+	cputime_t expires;
+	cputime_t incr;
+	u32 error;
+	u32 incr_error;
+};
+
+/**
+ * struct task_cputime - collected CPU time counts
+ * @utime:		time spent in user mode, in &cputime_t units
+ * @stime:		time spent in kernel mode, in &cputime_t units
+ * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
+ *
+ * This structure groups together three kinds of CPU time that are
+ * tracked for threads and thread groups.  Most things considering
+ * CPU time want to group these counts together and treat all three
+ * of them in parallel.
+ */
+struct task_cputime {
+	cputime_t utime;
+	cputime_t stime;
+	unsigned long long sum_exec_runtime;
+};
+/* Alternate field names when used to cache expirations. */
+#define prof_exp	stime
+#define virt_exp	utime
+#define sched_exp	sum_exec_runtime
+
+#define INIT_CPUTIME	\
+	(struct task_cputime) {					\
+		.utime = 0,					\
+		.stime = 0,					\
+		.sum_exec_runtime = 0,				\
+	}
+
+/*
+ * Disable preemption until the scheduler is running.
+ * Reset by start_kernel()->sched_init()->init_idle().
+ *
+ * We include PREEMPT_ACTIVE to avoid cond_resched() from working
+ * before the scheduler is active -- see should_resched().
+ */
+#define INIT_PREEMPT_COUNT	(1 + PREEMPT_ACTIVE)
+
+/**
+ * struct thread_group_cputimer - thread group interval timer counts
+ * @cputime:		thread group interval timers.
+ * @running:		non-zero when there are timers running and
+ * 			@cputime receives updates.
+ * @lock:		lock for fields in this struct.
+ *
+ * This structure contains the version of task_cputime, above, that is
+ * used for thread group CPU timer calculations.
+ */
+struct thread_group_cputimer {
+	struct task_cputime cputime;
+	int running;
+	raw_spinlock_t lock;
+};
+
+#include <linux/rwsem.h>
+struct autogroup;
+
+/*
+ * NOTE! "signal_struct" does not have its own
+ * locking, because a shared signal_struct always
+ * implies a shared sighand_struct, so locking
+ * sighand_struct is always a proper superset of
+ * the locking of signal_struct.
+ */
+struct signal_struct {
+	atomic_t		sigcnt;
+	atomic_t		live;
+	int			nr_threads;
+
+	wait_queue_head_t	wait_chldexit;	/* for wait4() */
+
+	/* current thread group signal load-balancing target: */
+	struct task_struct	*curr_target;
+
+	/* shared signal handling: */
+	struct sigpending	shared_pending;
+
+	/* thread group exit support */
+	int			group_exit_code;
+	/* overloaded:
+	 * - notify group_exit_task when ->count is equal to notify_count
+	 * - everyone except group_exit_task is stopped during signal delivery
+	 *   of fatal signals, group_exit_task processes the signal.
+	 */
+	int			notify_count;
+	struct task_struct	*group_exit_task;
+
+	/* thread group stop support, overloads group_exit_code too */
+	int			group_stop_count;
+	unsigned int		flags; /* see SIGNAL_* flags below */
+
+	/*
+	 * PR_SET_CHILD_SUBREAPER marks a process, like a service
+	 * manager, to re-parent orphan (double-forking) child processes
+	 * to this process instead of 'init'. The service manager is
+	 * able to receive SIGCHLD signals and is able to investigate
+	 * the process until it calls wait(). All children of this
+	 * process will inherit a flag if they should look for a
+	 * child_subreaper process at exit.
+	 */
+	unsigned int		is_child_subreaper:1;
+	unsigned int		has_child_subreaper:1;
+
+	/* POSIX.1b Interval Timers */
+	struct list_head posix_timers;
+
+	/* ITIMER_REAL timer for the process */
+	struct hrtimer real_timer;
+	struct pid *leader_pid;
+	ktime_t it_real_incr;
+
+	/*
+	 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
+	 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
+	 * values are defined to 0 and 1 respectively
+	 */
+	struct cpu_itimer it[2];
+
+	/*
+	 * Thread group totals for process CPU timers.
+	 * See thread_group_cputimer(), et al, for details.
+	 */
+	struct thread_group_cputimer cputimer;
+
+	/* Earliest-expiration cache. */
+	struct task_cputime cputime_expires;
+
+	struct list_head cpu_timers[3];
+
+	struct pid *tty_old_pgrp;
+
+	/* boolean value for session group leader */
+	int leader;
+
+	struct tty_struct *tty; /* NULL if no tty */
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+	struct autogroup *autogroup;
+#endif
+	/*
+	 * Cumulative resource counters for dead threads in the group,
+	 * and for reaped dead child processes forked by this group.
+	 * Live threads maintain their own counters and add to these
+	 * in __exit_signal, except for the group leader.
+	 */
+	cputime_t utime, stime, cutime, cstime;
+	cputime_t gtime;
+	cputime_t cgtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	cputime_t prev_utime, prev_stime;
+#endif
+	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
+	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
+	unsigned long inblock, oublock, cinblock, coublock;
+	unsigned long maxrss, cmaxrss;
+	struct task_io_accounting ioac;
+
+	/*
+	 * Cumulative ns of schedule CPU time fo dead threads in the
+	 * group, not including a zombie group leader, (This only differs
+	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
+	 * other than jiffies.)
+	 */
+	unsigned long long sum_sched_runtime;
+
+	/*
+	 * We don't bother to synchronize most readers of this at all,
+	 * because there is no reader checking a limit that actually needs
+	 * to get both rlim_cur and rlim_max atomically, and either one
+	 * alone is a single word that can safely be read normally.
+	 * getrlimit/setrlimit use task_lock(current->group_leader) to
+	 * protect this instead of the siglock, because they really
+	 * have no need to disable irqs.
+	 */
+	struct rlimit rlim[RLIM_NLIMITS];
+
+#ifdef CONFIG_BSD_PROCESS_ACCT
+	struct pacct_struct pacct;	/* per-process accounting information */
+#endif
+#ifdef CONFIG_TASKSTATS
+	struct taskstats *stats;
+#endif
+#ifdef CONFIG_AUDIT
+	unsigned audit_tty;
+	struct tty_audit_buf *tty_audit_buf;
+#endif
+#ifdef CONFIG_CGROUPS
+	/*
+	 * group_rwsem prevents new tasks from entering the threadgroup and
+	 * member tasks from exiting,a more specifically, setting of
+	 * PF_EXITING.  fork and exit paths are protected with this rwsem
+	 * using threadgroup_change_begin/end().  Users which require
+	 * threadgroup to remain stable should use threadgroup_[un]lock()
+	 * which also takes care of exec path.  Currently, cgroup is the
+	 * only user.
+	 */
+	struct rw_semaphore group_rwsem;
+#endif
+
+	int oom_adj;		/* OOM kill score adjustment (bit shift) */
+	int oom_score_adj;	/* OOM kill score adjustment */
+	int oom_score_adj_min;	/* OOM kill score adjustment minimum value.
+				 * Only settable by CAP_SYS_RESOURCE. */
+
+	struct mutex cred_guard_mutex;	/* guard against foreign influences on
+					 * credential calculations
+					 * (notably. ptrace) */
+};
+
+/* Context switch must be unlocked if interrupts are to be enabled */
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+# define __ARCH_WANT_UNLOCKED_CTXSW
+#endif
+
+/*
+ * Bits in flags field of signal_struct.
+ */
+#define SIGNAL_STOP_STOPPED	0x00000001 /* job control stop in effect */
+#define SIGNAL_STOP_CONTINUED	0x00000002 /* SIGCONT since WCONTINUED reap */
+#define SIGNAL_GROUP_EXIT	0x00000004 /* group exit in progress */
+/*
+ * Pending notifications to parent.
+ */
+#define SIGNAL_CLD_STOPPED	0x00000010
+#define SIGNAL_CLD_CONTINUED	0x00000020
+#define SIGNAL_CLD_MASK		(SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
+
+#define SIGNAL_UNKILLABLE	0x00000040 /* for init: ignore fatal signals */
+
+/* If true, all threads except ->group_exit_task have pending SIGKILL */
+static inline int signal_group_exit(const struct signal_struct *sig)
+{
+	return	(sig->flags & SIGNAL_GROUP_EXIT) ||
+		(sig->group_exit_task != NULL);
+}
+
+/*
+ * Some day this will be a full-fledged user tracking system..
+ */
+struct user_struct {
+	atomic_t __count;	/* reference count */
+	atomic_t processes;	/* How many processes does this user have? */
+	atomic_t files;		/* How many open files does this user have? */
+	atomic_t sigpending;	/* How many pending signals does this user have? */
+#ifdef CONFIG_INOTIFY_USER
+	atomic_t inotify_watches; /* How many inotify watches does this user have? */
+	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
+#endif
+#ifdef CONFIG_FANOTIFY
+	atomic_t fanotify_listeners;
+#endif
+#ifdef CONFIG_EPOLL
+	atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
+#endif
+#ifdef CONFIG_POSIX_MQUEUE
+	/* protected by mq_lock	*/
+	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
+#endif
+	unsigned long locked_shm; /* How many pages of mlocked shm ? */
+
+#ifdef CONFIG_KEYS
+	struct key *uid_keyring;	/* UID specific keyring */
+	struct key *session_keyring;	/* UID's default session keyring */
+#endif
+
+	/* Hash table maintenance information */
+	struct hlist_node uidhash_node;
+	uid_t uid;
+	struct user_namespace *user_ns;
+
+#ifdef CONFIG_PERF_EVENTS
+	atomic_long_t locked_vm;
+#endif
+};
+
+extern int uids_sysfs_init(void);
+
+extern struct user_struct *find_user(uid_t);
+
+extern struct user_struct root_user;
+#define INIT_USER (&root_user)
+
+
+struct backing_dev_info;
+struct reclaim_state;
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+struct sched_info {
+	/* cumulative counters */
+	unsigned long pcount;	      /* # of times run on this cpu */
+	unsigned long long run_delay; /* time spent waiting on a runqueue */
+
+	/* timestamps */
+	unsigned long long last_arrival,/* when we last ran on a cpu */
+			   last_queued;	/* when we were last queued to run */
+};
+#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
+
+#ifdef CONFIG_TASK_DELAY_ACCT
+struct task_delay_info {
+	spinlock_t	lock;
+	unsigned int	flags;	/* Private per-task flags */
+
+	/* For each stat XXX, add following, aligned appropriately
+	 *
+	 * struct timespec XXX_start, XXX_end;
+	 * u64 XXX_delay;
+	 * u32 XXX_count;
+	 *
+	 * Atomicity of updates to XXX_delay, XXX_count protected by
+	 * single lock above (split into XXX_lock if contention is an issue).
+	 */
+
+	/*
+	 * XXX_count is incremented on every XXX operation, the delay
+	 * associated with the operation is added to XXX_delay.
+	 * XXX_delay contains the accumulated delay time in nanoseconds.
+	 */
+	struct timespec blkio_start, blkio_end;	/* Shared by blkio, swapin */
+	u64 blkio_delay;	/* wait for sync block io completion */
+	u64 swapin_delay;	/* wait for swapin block io completion */
+	u32 blkio_count;	/* total count of the number of sync block */
+				/* io operations performed */
+	u32 swapin_count;	/* total count of the number of swapin block */
+				/* io operations performed */
+
+	struct timespec freepages_start, freepages_end;
+	u64 freepages_delay;	/* wait for memory reclaim */
+	u32 freepages_count;	/* total count of memory reclaim */
+};
+#endif	/* CONFIG_TASK_DELAY_ACCT */
+
+static inline int sched_info_on(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+	return 1;
+#elif defined(CONFIG_TASK_DELAY_ACCT)
+	extern int delayacct_on;
+	return delayacct_on;
+#else
+	return 0;
+#endif
+}
+
+enum cpu_idle_type {
+	CPU_IDLE,
+	CPU_NOT_IDLE,
+	CPU_NEWLY_IDLE,
+	CPU_MAX_IDLE_TYPES
+};
+
+/*
+ * Increase resolution of nice-level calculations for 64-bit architectures.
+ * The extra resolution improves shares distribution and load balancing of
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * hierarchies, especially on larger systems. This is not a user-visible change
+ * and does not change the user-interface for setting shares/weights.
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
+ * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
+ * increased costs.
+ */
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
+# define SCHED_LOAD_RESOLUTION	10
+# define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
+# define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
+#else
+# define SCHED_LOAD_RESOLUTION	0
+# define scale_load(w)		(w)
+# define scale_load_down(w)	(w)
+#endif
+
+#define SCHED_LOAD_SHIFT	(10 + SCHED_LOAD_RESOLUTION)
+#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
+
+/*
+ * Increase resolution of cpu_power calculations
+ */
+#define SCHED_POWER_SHIFT	10
+#define SCHED_POWER_SCALE	(1L << SCHED_POWER_SHIFT)
+
+/*
+ * sched-domains (multiprocessor balancing) declarations:
+ */
+#ifdef CONFIG_SMP
+#define SD_LOAD_BALANCE		0x0001	/* Do load balancing on this domain. */
+#define SD_BALANCE_NEWIDLE	0x0002	/* Balance when about to become idle */
+#define SD_BALANCE_EXEC		0x0004	/* Balance on exec */
+#define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
+#define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
+#define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
+#define SD_PREFER_LOCAL		0x0040  /* Prefer to keep tasks local to this domain */
+#define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
+#define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
+#define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
+#define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
+#define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
+#define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
+#define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
+
+enum powersavings_balance_level {
+	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
+	POWERSAVINGS_BALANCE_BASIC,	/* Fill one thread/core/package
+					 * first for long running threads
+					 */
+	POWERSAVINGS_BALANCE_WAKEUP,	/* Also bias task wakeups to semi-idle
+					 * cpu package for power savings
+					 */
+	MAX_POWERSAVINGS_BALANCE_LEVELS
+};
+
+extern int sched_mc_power_savings, sched_smt_power_savings;
+
+static inline int sd_balance_for_mc_power(void)
+{
+	if (sched_smt_power_savings)
+		return SD_POWERSAVINGS_BALANCE;
+
+	if (!sched_mc_power_savings)
+		return SD_PREFER_SIBLING;
+
+	return 0;
+}
+
+static inline int sd_balance_for_package_power(void)
+{
+	if (sched_mc_power_savings | sched_smt_power_savings)
+		return SD_POWERSAVINGS_BALANCE;
+
+	return SD_PREFER_SIBLING;
+}
+
+extern int __weak arch_sd_sibiling_asym_packing(void);
+
+/*
+ * Optimise SD flags for power savings:
+ * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
+ * Keep default SD flags if sched_{smt,mc}_power_saving=0
+ */
+
+static inline int sd_power_saving_flags(void)
+{
+	if (sched_mc_power_savings | sched_smt_power_savings)
+		return SD_BALANCE_NEWIDLE;
+
+	return 0;
+}
+
+struct sched_group_power {
+	atomic_t ref;
+	/*
+	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
+	 * single CPU.
+	 */
+	unsigned int power, power_orig;
+	unsigned long next_update;
+	/*
+	 * Number of busy cpus in this group.
+	 */
+	atomic_t nr_busy_cpus;
+};
+
+struct sched_group {
+	struct sched_group *next;	/* Must be a circular list */
+	atomic_t ref;
+
+	unsigned int group_weight;
+	struct sched_group_power *sgp;
+
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 */
+	unsigned long cpumask[0];
+};
+
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+	return to_cpumask(sg->cpumask);
+}
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+	return cpumask_first(sched_group_cpus(group));
+}
+
+struct sched_domain_attr {
+	int relax_domain_level;
+};
+
+#define SD_ATTR_INIT	(struct sched_domain_attr) {	\
+	.relax_domain_level = -1,			\
+}
+
+extern int sched_domain_level_max;
+
+struct sched_domain {
+	/* These fields must be setup */
+	struct sched_domain *parent;	/* top domain must be null terminated */
+	struct sched_domain *child;	/* bottom domain must be null terminated */
+	struct sched_group *groups;	/* the balancing groups of the domain */
+	unsigned long min_interval;	/* Minimum balance interval ms */
+	unsigned long max_interval;	/* Maximum balance interval ms */
+	unsigned int busy_factor;	/* less balancing by factor if busy */
+	unsigned int imbalance_pct;	/* No balance until over watermark */
+	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
+	unsigned int busy_idx;
+	unsigned int idle_idx;
+	unsigned int newidle_idx;
+	unsigned int wake_idx;
+	unsigned int forkexec_idx;
+	unsigned int smt_gain;
+	int flags;			/* See SD_* */
+	int level;
+
+	/* Runtime fields. */
+	unsigned long last_balance;	/* init to jiffies. units in jiffies */
+	unsigned int balance_interval;	/* initialise to 1. units in ms. */
+	unsigned int nr_balance_failed; /* initialise to 0 */
+
+	u64 last_update;
+
+#ifdef CONFIG_SCHEDSTATS
+	/* load_balance() stats */
+	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
+	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
+
+	/* Active load balancing */
+	unsigned int alb_count;
+	unsigned int alb_failed;
+	unsigned int alb_pushed;
+
+	/* SD_BALANCE_EXEC stats */
+	unsigned int sbe_count;
+	unsigned int sbe_balanced;
+	unsigned int sbe_pushed;
+
+	/* SD_BALANCE_FORK stats */
+	unsigned int sbf_count;
+	unsigned int sbf_balanced;
+	unsigned int sbf_pushed;
+
+	/* try_to_wake_up() stats */
+	unsigned int ttwu_wake_remote;
+	unsigned int ttwu_move_affine;
+	unsigned int ttwu_move_balance;
+#endif
+#ifdef CONFIG_SCHED_DEBUG
+	char *name;
+#endif
+	union {
+		void *private;		/* used during construction */
+		struct rcu_head rcu;	/* used during destruction */
+	};
+
+	unsigned int span_weight;
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 */
+	unsigned long span[0];
+};
+
+static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+{
+	return to_cpumask(sd->span);
+}
+
+extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+				    struct sched_domain_attr *dattr_new);
+
+/* Allocate an array of sched domains, for partition_sched_domains(). */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+
+/* Test a flag in parent sched domain */
+static inline int test_sd_parent(struct sched_domain *sd, int flag)
+{
+	if (sd->parent && (sd->parent->flags & flag))
+		return 1;
+
+	return 0;
+}
+
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
+
+bool cpus_share_cache(int this_cpu, int that_cpu);
+
+#else /* CONFIG_SMP */
+
+struct sched_domain_attr;
+
+static inline void
+partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			struct sched_domain_attr *dattr_new)
+{
+}
+
+static inline bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
+#endif	/* !CONFIG_SMP */
+
+
+struct io_context;			/* See blkdev.h */
+
+
+#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
+extern void prefetch_stack(struct task_struct *t);
+#else
+static inline void prefetch_stack(struct task_struct *t) { }
+#endif
+
+struct audit_context;		/* See audit.c */
+struct mempolicy;
+struct pipe_inode_info;
+struct uts_namespace;
+
+struct rq;
+struct sched_domain;
+
+/*
+ * wake flags
+ */
+#define WF_SYNC		0x01		/* waker goes to sleep after wakup */
+#define WF_FORK		0x02		/* child wakeup after fork */
+#define WF_MIGRATED	0x04		/* internal use, task got migrated */
+
+#define ENQUEUE_WAKEUP		1
+#define ENQUEUE_HEAD		2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING		0
+#endif
+
+#define DEQUEUE_SLEEP		1
+
+struct sched_class {
+	const struct sched_class *next;
+
+	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+	void (*yield_task) (struct rq *rq);
+	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+
+	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+
+	struct task_struct * (*pick_next_task) (struct rq *rq);
+	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
+#ifdef CONFIG_SMP
+	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+
+	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+	void (*post_schedule) (struct rq *this_rq);
+	void (*task_waking) (struct task_struct *task);
+	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
+
+	void (*set_cpus_allowed)(struct task_struct *p,
+				 const struct cpumask *newmask);
+
+	void (*rq_online)(struct rq *rq);
+	void (*rq_offline)(struct rq *rq);
+#endif
+
+	void (*set_curr_task) (struct rq *rq);
+	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+	void (*task_fork) (struct task_struct *p);
+
+	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+			     int oldprio);
+
+	unsigned int (*get_rr_interval) (struct rq *rq,
+					 struct task_struct *task);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	void (*task_move_group) (struct task_struct *p, int on_rq);
+#endif
+};
+
+struct load_weight {
+	unsigned long weight, inv_weight;
+};
+
+#ifdef CONFIG_SCHEDSTATS
+struct sched_statistics {
+	u64			wait_start;
+	u64			wait_max;
+	u64			wait_count;
+	u64			wait_sum;
+	u64			iowait_count;
+	u64			iowait_sum;
+
+	u64			sleep_start;
+	u64			sleep_max;
+	s64			sum_sleep_runtime;
+
+	u64			block_start;
+	u64			block_max;
+	u64			exec_max;
+	u64			slice_max;
+
+	u64			nr_migrations_cold;
+	u64			nr_failed_migrations_affine;
+	u64			nr_failed_migrations_running;
+	u64			nr_failed_migrations_hot;
+	u64			nr_forced_migrations;
+
+	u64			nr_wakeups;
+	u64			nr_wakeups_sync;
+	u64			nr_wakeups_migrate;
+	u64			nr_wakeups_local;
+	u64			nr_wakeups_remote;
+	u64			nr_wakeups_affine;
+	u64			nr_wakeups_affine_attempts;
+	u64			nr_wakeups_passive;
+	u64			nr_wakeups_idle;
+};
+#endif
+
+struct sched_entity {
+	struct load_weight	load;		/* for load-balancing */
+	struct rb_node		run_node;
+	struct list_head	group_node;
+	unsigned int		on_rq;
+
+	u64			exec_start;
+	u64			sum_exec_runtime;
+	u64			vruntime;
+	u64			prev_sum_exec_runtime;
+
+	u64			nr_migrations;
+
+#ifdef CONFIG_SCHEDSTATS
+	struct sched_statistics statistics;
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity	*parent;
+	/* rq on which this entity is (to be) queued: */
+	struct cfs_rq		*cfs_rq;
+	/* rq "owned" by this entity/group: */
+	struct cfs_rq		*my_q;
+#endif
+};
+
+struct sched_rt_entity {
+	struct list_head run_list;
+	unsigned long timeout;
+	unsigned int time_slice;
+	int nr_cpus_allowed;
+
+	struct sched_rt_entity *back;
+#ifdef CONFIG_RT_GROUP_SCHED
+	struct sched_rt_entity	*parent;
+	/* rq on which this entity is (to be) queued: */
+	struct rt_rq		*rt_rq;
+	/* rq "owned" by this entity/group: */
+	struct rt_rq		*my_q;
+#endif
+};
+
+/*
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+ * Timeslices get refilled after they expire.
+ */
+#define RR_TIMESLICE		(100 * HZ / 1000)
+
+struct rcu_node;
+
+enum perf_event_task_context {
+	perf_invalid_context = -1,
+	perf_hw_context = 0,
+	perf_sw_context,
+	perf_nr_task_contexts,
+};
+
+struct task_struct {
+	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
+	void *stack;
+	atomic_t usage;
+	unsigned int flags;	/* per process flags, defined below */
+	unsigned int ptrace;
+
+#ifdef CONFIG_SMP
+	struct llist_node wake_entry;
+	int on_cpu;
+#endif
+	int on_rq;
+
+	int prio, static_prio, normal_prio;
+	unsigned int rt_priority;
+	const struct sched_class *sched_class;
+	struct sched_entity se;
+	struct sched_rt_entity rt;
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+	/* list of struct preempt_notifier: */
+	struct hlist_head preempt_notifiers;
+#endif
+
+	/*
+	 * fpu_counter contains the number of consecutive context switches
+	 * that the FPU is used. If this is over a threshold, the lazy fpu
+	 * saving becomes unlazy to save the trap. This is an unsigned char
+	 * so that after 256 times the counter wraps and the behavior turns
+	 * lazy again; this to deal with bursty apps that only use FPU for
+	 * a short time
+	 */
+	unsigned char fpu_counter;
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+	unsigned int btrace_seq;
+#endif
+
+	unsigned int policy;
+	cpumask_t cpus_allowed;
+
+#ifdef CONFIG_PREEMPT_RCU
+	int rcu_read_lock_nesting;
+	char rcu_read_unlock_special;
+	struct list_head rcu_node_entry;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TREE_PREEMPT_RCU
+	struct rcu_node *rcu_blocked_node;
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+	struct rt_mutex *rcu_boost_mutex;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+	struct sched_info sched_info;
+#endif
+
+	struct list_head tasks;
+#ifdef CONFIG_SMP
+	struct plist_node pushable_tasks;
+#endif
+
+	struct mm_struct *mm, *active_mm;
+#ifdef CONFIG_COMPAT_BRK
+	unsigned brk_randomized:1;
+#endif
+#if defined(SPLIT_RSS_COUNTING)
+	struct task_rss_stat	rss_stat;
+#endif
+/* task state */
+	int exit_state;
+	int exit_code, exit_signal;
+	int pdeath_signal;  /*  The signal sent when the parent dies  */
+	unsigned int jobctl;	/* JOBCTL_*, siglock protected */
+	/* ??? */
+	unsigned int personality;
+	unsigned did_exec:1;
+	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
+				 * execve */
+	unsigned in_iowait:1;
+
+
+	/* Revert to default priority/policy when forking */
+	unsigned sched_reset_on_fork:1;
+	unsigned sched_contributes_to_load:1;
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+	/* IRQ handler threads */
+	unsigned irq_thread:1;
+#endif
+
+	pid_t pid;
+	pid_t tgid;
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+	/* Canary value for the -fstack-protector gcc feature */
+	unsigned long stack_canary;
+#endif
+
+	/* 
+	 * pointers to (original) parent process, youngest child, younger sibling,
+	 * older sibling, respectively.  (p->father can be replaced with 
+	 * p->real_parent->pid)
+	 */
+	struct task_struct __rcu *real_parent; /* real parent process */
+	struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
+	/*
+	 * children/sibling forms the list of my natural children
+	 */
+	struct list_head children;	/* list of my children */
+	struct list_head sibling;	/* linkage in my parent's children list */
+	struct task_struct *group_leader;	/* threadgroup leader */
+
+	/*
+	 * ptraced is the list of tasks this task is using ptrace on.
+	 * This includes both natural children and PTRACE_ATTACH targets.
+	 * p->ptrace_entry is p's link on the p->parent->ptraced list.
+	 */
+	struct list_head ptraced;
+	struct list_head ptrace_entry;
+
+	/* PID/PID hash table linkage. */
+	struct pid_link pids[PIDTYPE_MAX];
+	struct list_head thread_group;
+
+	struct completion *vfork_done;		/* for vfork() */
+	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
+	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
+
+	cputime_t utime, stime, utimescaled, stimescaled;
+	cputime_t gtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	cputime_t prev_utime, prev_stime;
+#endif
+	unsigned long nvcsw, nivcsw; /* context switch counts */
+	struct timespec start_time; 		/* monotonic time */
+	struct timespec real_start_time;	/* boot based time */
+/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+	unsigned long min_flt, maj_flt;
+
+	struct task_cputime cputime_expires;
+	struct list_head cpu_timers[3];
+
+/* process credentials */
+	const struct cred __rcu *real_cred; /* objective and real subjective task
+					 * credentials (COW) */
+	const struct cred __rcu *cred;	/* effective (overridable) subjective task
+					 * credentials (COW) */
+	struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */
+
+	char comm[TASK_COMM_LEN]; /* executable name excluding path
+				     - access with [gs]et_task_comm (which lock
+				       it with task_lock())
+				     - initialized normally by setup_new_exec */
+/* file system info */
+	int link_count, total_link_count;
+#ifdef CONFIG_SYSVIPC
+/* ipc stuff */
+	struct sysv_sem sysvsem;
+#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+/* hung task detection */
+	unsigned long last_switch_count;
+#endif
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+/* filesystem information */
+	struct fs_struct *fs;
+/* open file information */
+	struct files_struct *files;
+/* namespaces */
+	struct nsproxy *nsproxy;
+/* signal handlers */
+	struct signal_struct *signal;
+	struct sighand_struct *sighand;
+
+	sigset_t blocked, real_blocked;
+	sigset_t saved_sigmask;	/* restored if set_restore_sigmask() was used */
+	struct sigpending pending;
+
+	unsigned long sas_ss_sp;
+	size_t sas_ss_size;
+	int (*notifier)(void *priv);
+	void *notifier_data;
+	sigset_t *notifier_mask;
+	struct audit_context *audit_context;
+#ifdef CONFIG_AUDITSYSCALL
+	uid_t loginuid;
+	unsigned int sessionid;
+#endif
+	seccomp_t seccomp;
+
+/* Thread group tracking */
+   	u32 parent_exec_id;
+   	u32 self_exec_id;
+/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
+ * mempolicy */
+	spinlock_t alloc_lock;
+
+	/* Protection of the PI data structures: */
+	raw_spinlock_t pi_lock;
+
+#ifdef CONFIG_RT_MUTEXES
+	/* PI waiters blocked on a rt_mutex held by this task */
+	struct plist_head pi_waiters;
+	/* Deadlock detection and priority inheritance handling */
+	struct rt_mutex_waiter *pi_blocked_on;
+#endif
+
+#ifdef CONFIG_DEBUG_MUTEXES
+	/* mutex deadlock detection */
+	struct mutex_waiter *blocked_on;
+#endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+	unsigned int irq_events;
+	unsigned long hardirq_enable_ip;
+	unsigned long hardirq_disable_ip;
+	unsigned int hardirq_enable_event;
+	unsigned int hardirq_disable_event;
+	int hardirqs_enabled;
+	int hardirq_context;
+	unsigned long softirq_disable_ip;
+	unsigned long softirq_enable_ip;
+	unsigned int softirq_disable_event;
+	unsigned int softirq_enable_event;
+	int softirqs_enabled;
+	int softirq_context;
+#endif
+#ifdef CONFIG_LOCKDEP
+# define MAX_LOCK_DEPTH 48UL
+	u64 curr_chain_key;
+	int lockdep_depth;
+	unsigned int lockdep_recursion;
+	struct held_lock held_locks[MAX_LOCK_DEPTH];
+	gfp_t lockdep_reclaim_gfp;
+#endif
+
+/* journalling filesystem info */
+	void *journal_info;
+
+/* stacked block device info */
+	struct bio_list *bio_list;
+
+#ifdef CONFIG_BLOCK
+/* stack plugging */
+	struct blk_plug *plug;
+#endif
+
+/* VM state */
+	struct reclaim_state *reclaim_state;
+
+	struct backing_dev_info *backing_dev_info;
+
+	struct io_context *io_context;
+
+	unsigned long ptrace_message;
+	siginfo_t *last_siginfo; /* For ptrace use.  */
+	struct task_io_accounting ioac;
+#if defined(CONFIG_TASK_XACCT)
+	u64 acct_rss_mem1;	/* accumulated rss usage */
+	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
+	cputime_t acct_timexpd;	/* stime + utime since last update */
+#endif
+#ifdef CONFIG_CPUSETS
+	nodemask_t mems_allowed;	/* Protected by alloc_lock */
+	seqcount_t mems_allowed_seq;	/* Seqence no to catch updates */
+	int cpuset_mem_spread_rotor;
+	int cpuset_slab_spread_rotor;
+#endif
+#ifdef CONFIG_CGROUPS
+	/* Control Group info protected by css_set_lock */
+	struct css_set __rcu *cgroups;
+	/* cg_list protected by css_set_lock and tsk->alloc_lock */
+	struct list_head cg_list;
+#endif
+#ifdef CONFIG_FUTEX
+	struct robust_list_head __user *robust_list;
+#ifdef CONFIG_COMPAT
+	struct compat_robust_list_head __user *compat_robust_list;
+#endif
+	struct list_head pi_state_list;
+	struct futex_pi_state *pi_state_cache;
+#endif
+#ifdef CONFIG_PERF_EVENTS
+	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
+	struct mutex perf_event_mutex;
+	struct list_head perf_event_list;
+#endif
+#ifdef CONFIG_NUMA
+	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
+	short il_next;
+	short pref_node_fork;
+#endif
+	struct rcu_head rcu;
+
+	/*
+	 * cache last used pipe for splice
+	 */
+	struct pipe_inode_info *splice_pipe;
+#ifdef	CONFIG_TASK_DELAY_ACCT
+	struct task_delay_info *delays;
+#endif
+#ifdef CONFIG_FAULT_INJECTION
+	int make_it_fail;
+#endif
+	/*
+	 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
+	 * balance_dirty_pages() for some dirty throttling pause
+	 */
+	int nr_dirtied;
+	int nr_dirtied_pause;
+	unsigned long dirty_paused_when; /* start of a write-and-pause period */
+
+#ifdef CONFIG_LATENCYTOP
+	int latency_record_count;
+	struct latency_record latency_record[LT_SAVECOUNT];
+#endif
+	/*
+	 * time slack values; these are used to round up poll() and
+	 * select() etc timeout values. These are in nanoseconds.
+	 */
+	unsigned long timer_slack_ns;
+	unsigned long default_timer_slack_ns;
+
+	struct list_head	*scm_work_list;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	/* Index of current stored address in ret_stack */
+	int curr_ret_stack;
+	/* Stack of return addresses for return function tracing */
+	struct ftrace_ret_stack	*ret_stack;
+	/* time stamp for last schedule */
+	unsigned long long ftrace_timestamp;
+	/*
+	 * Number of functions that haven't been traced
+	 * because of depth overrun.
+	 */
+	atomic_t trace_overrun;
+	/* Pause for the tracing */
+	atomic_t tracing_graph_pause;
+#endif
+#ifdef CONFIG_TRACING
+	/* state flags for use by tracers */
+	unsigned long trace;
+	/* bitmask and counter of trace recursion */
+	unsigned long trace_recursion;
+#endif /* CONFIG_TRACING */
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
+	struct memcg_batch_info {
+		int do_batch;	/* incremented when batch uncharge started */
+		struct mem_cgroup *memcg; /* target memcg of uncharge */
+		unsigned long nr_pages;	/* uncharged usage */
+		unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
+	} memcg_batch;
+#endif
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	atomic_t ptrace_bp_refcnt;
+#endif
+};
+
+/* Future-safe accessor for struct task_struct's cpus_allowed. */
+#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space.  This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO	100
+#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+
+static inline int rt_prio(int prio)
+{
+	if (unlikely(prio < MAX_RT_PRIO))
+		return 1;
+	return 0;
+}
+
+static inline int rt_task(struct task_struct *p)
+{
+	return rt_prio(p->prio);
+}
+
+static inline struct pid *task_pid(struct task_struct *task)
+{
+	return task->pids[PIDTYPE_PID].pid;
+}
+
+static inline struct pid *task_tgid(struct task_struct *task)
+{
+	return task->group_leader->pids[PIDTYPE_PID].pid;
+}
+
+/*
+ * Without tasklist or rcu lock it is not safe to dereference
+ * the result of task_pgrp/task_session even if task == current,
+ * we can race with another thread doing sys_setsid/sys_setpgid.
+ */
+static inline struct pid *task_pgrp(struct task_struct *task)
+{
+	return task->group_leader->pids[PIDTYPE_PGID].pid;
+}
+
+static inline struct pid *task_session(struct task_struct *task)
+{
+	return task->group_leader->pids[PIDTYPE_SID].pid;
+}
+
+struct pid_namespace;
+
+/*
+ * the helpers to get the task's different pids as they are seen
+ * from various namespaces
+ *
+ * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
+ * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
+ *                     current.
+ * task_xid_nr_ns()  : id seen from the ns specified;
+ *
+ * set_task_vxid()   : assigns a virtual id to a task;
+ *
+ * see also pid_nr() etc in include/linux/pid.h
+ */
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
+			struct pid_namespace *ns);
+
+static inline pid_t task_pid_nr(struct task_struct *tsk)
+{
+	return tsk->pid;
+}
+
+static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
+					struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
+}
+
+static inline pid_t task_pid_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
+}
+
+
+static inline pid_t task_tgid_nr(struct task_struct *tsk)
+{
+	return tsk->tgid;
+}
+
+pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+
+static inline pid_t task_tgid_vnr(struct task_struct *tsk)
+{
+	return pid_vnr(task_tgid(tsk));
+}
+
+
+static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
+					struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
+}
+
+static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
+}
+
+
+static inline pid_t task_session_nr_ns(struct task_struct *tsk,
+					struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
+}
+
+static inline pid_t task_session_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
+}
+
+/* obsolete, do not use */
+static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+{
+	return task_pgrp_nr_ns(tsk, &init_pid_ns);
+}
+
+/**
+ * pid_alive - check that a task structure is not stale
+ * @p: Task structure to be checked.
+ *
+ * Test if a process is not yet dead (at most zombie state)
+ * If pid_alive fails, then pointers within the task structure
+ * can be stale and must not be dereferenced.
+ */
+static inline int pid_alive(struct task_struct *p)
+{
+	return p->pids[PIDTYPE_PID].pid != NULL;
+}
+
+/**
+ * is_global_init - check if a task structure is init
+ * @tsk: Task structure to be checked.
+ *
+ * Check if a task structure is the first user space task the kernel created.
+ */
+static inline int is_global_init(struct task_struct *tsk)
+{
+	return tsk->pid == 1;
+}
+
+/*
+ * is_container_init:
+ * check whether in the task is init in its own pid namespace.
+ */
+extern int is_container_init(struct task_struct *tsk);
+
+extern struct pid *cad_pid;
+
+extern void free_task(struct task_struct *tsk);
+#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+
+extern void __put_task_struct(struct task_struct *t);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+	if (atomic_dec_and_test(&t->usage))
+		__put_task_struct(t);
+}
+
+extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+
+/*
+ * Per process flags
+ */
+#define PF_EXITING	0x00000004	/* getting shut down */
+#define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
+#define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+#define PF_WQ_WORKER	0x00000020	/* I'm a workqueue worker */
+#define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
+#define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
+#define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
+#define PF_DUMPCORE	0x00000200	/* dumped core */
+#define PF_SIGNALED	0x00000400	/* killed by a signal */
+#define PF_MEMALLOC	0x00000800	/* Allocating memory */
+#define PF_NPROC_EXCEEDED 0x00001000	/* set_user noticed that RLIMIT_NPROC was exceeded */
+#define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
+#define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
+#define PF_FROZEN	0x00010000	/* frozen for system suspend */
+#define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
+#define PF_KSWAPD	0x00040000	/* I am kswapd */
+#define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
+#define PF_KTHREAD	0x00200000	/* I am a kernel thread */
+#define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
+#define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_THREAD_BOUND	0x04000000	/* Thread bound to specific cpu */
+#define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
+#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
+#define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
+#define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
+
+/*
+ * Only the _current_ task can read/write to tsk->flags, but other
+ * tasks can access tsk->flags in readonly mode for example
+ * with tsk_used_math (like during threaded core dumping).
+ * There is however an exception to this rule during ptrace
+ * or during fork: the ptracer task is allowed to write to the
+ * child->flags of its traced child (same goes for fork, the parent
+ * can write to the child->flags), because we're guaranteed the
+ * child is not running and in turn not changing child->flags
+ * at the same time the parent does it.
+ */
+#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
+#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
+#define clear_used_math() clear_stopped_child_used_math(current)
+#define set_used_math() set_stopped_child_used_math(current)
+#define conditional_stopped_child_used_math(condition, child) \
+	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
+#define conditional_used_math(condition) \
+	conditional_stopped_child_used_math(condition, current)
+#define copy_to_stopped_child_used_math(child) \
+	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
+/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
+#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
+#define used_math() tsk_used_math(current)
+
+/*
+ * task->jobctl flags
+ */
+#define JOBCTL_STOP_SIGMASK	0xffff	/* signr of the last group stop */
+
+#define JOBCTL_STOP_DEQUEUED_BIT 16	/* stop signal dequeued */
+#define JOBCTL_STOP_PENDING_BIT	17	/* task should stop for group stop */
+#define JOBCTL_STOP_CONSUME_BIT	18	/* consume group stop count */
+#define JOBCTL_TRAP_STOP_BIT	19	/* trap for STOP */
+#define JOBCTL_TRAP_NOTIFY_BIT	20	/* trap for NOTIFY */
+#define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
+#define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
+
+#define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
+#define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
+#define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
+#define JOBCTL_TRAP_STOP	(1 << JOBCTL_TRAP_STOP_BIT)
+#define JOBCTL_TRAP_NOTIFY	(1 << JOBCTL_TRAP_NOTIFY_BIT)
+#define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
+#define JOBCTL_LISTENING	(1 << JOBCTL_LISTENING_BIT)
+
+#define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
+#define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
+
+extern bool task_set_jobctl_pending(struct task_struct *task,
+				    unsigned int mask);
+extern void task_clear_jobctl_trapping(struct task_struct *task);
+extern void task_clear_jobctl_pending(struct task_struct *task,
+				      unsigned int mask);
+
+#ifdef CONFIG_PREEMPT_RCU
+
+#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+
+static inline void rcu_copy_process(struct task_struct *p)
+{
+	p->rcu_read_lock_nesting = 0;
+	p->rcu_read_unlock_special = 0;
+#ifdef CONFIG_TREE_PREEMPT_RCU
+	p->rcu_blocked_node = NULL;
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+	p->rcu_boost_mutex = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+	INIT_LIST_HEAD(&p->rcu_node_entry);
+}
+
+#else
+
+static inline void rcu_copy_process(struct task_struct *p)
+{
+}
+
+#endif
+
+#ifdef CONFIG_SMP
+extern void do_set_cpus_allowed(struct task_struct *p,
+			       const struct cpumask *new_mask);
+
+extern int set_cpus_allowed_ptr(struct task_struct *p,
+				const struct cpumask *new_mask);
+#else
+static inline void do_set_cpus_allowed(struct task_struct *p,
+				      const struct cpumask *new_mask)
+{
+}
+static inline int set_cpus_allowed_ptr(struct task_struct *p,
+				       const struct cpumask *new_mask)
+{
+	if (!cpumask_test_cpu(0, new_mask))
+		return -EINVAL;
+	return 0;
+}
+#endif
+
+#ifndef CONFIG_CPUMASK_OFFSTACK
+static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
+{
+	return set_cpus_allowed_ptr(p, &new_mask);
+}
+#endif
+
+/*
+ * Do not use outside of architecture code which knows its limitations.
+ *
+ * sched_clock() has no promise of monotonicity or bounded drift between
+ * CPUs, use (which you should not) requires disabling IRQs.
+ *
+ * Please use one of the three interfaces below.
+ */
+extern unsigned long long notrace sched_clock(void);
+/*
+ * See the comment in kernel/sched_clock.c
+ */
+extern u64 cpu_clock(int cpu);
+extern u64 local_clock(void);
+extern u64 sched_clock_cpu(int cpu);
+
+
+extern void sched_clock_init(void);
+
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline void sched_clock_tick(void)
+{
+}
+
+static inline void sched_clock_idle_sleep_event(void)
+{
+}
+
+static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+}
+#else
+/*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+extern int sched_clock_stable;
+
+extern void sched_clock_tick(void);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
+ * The reason for this explicit opt-in is not to have perf penalty with
+ * slow sched_clocks.
+ */
+extern void enable_sched_clock_irqtime(void);
+extern void disable_sched_clock_irqtime(void);
+#else
+static inline void enable_sched_clock_irqtime(void) {}
+static inline void disable_sched_clock_irqtime(void) {}
+#endif
+
+extern unsigned long long
+task_sched_runtime(struct task_struct *task);
+
+/* sched_exec is called by processes performing an exec */
+#ifdef CONFIG_SMP
+extern void sched_exec(void);
+#else
+#define sched_exec()   {}
+#endif
+
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern void idle_task_exit(void);
+#else
+static inline void idle_task_exit(void) {}
+#endif
+
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+extern void wake_up_idle_cpu(int cpu);
+#else
+static inline void wake_up_idle_cpu(int cpu) { }
+#endif
+
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_wakeup_granularity;
+extern unsigned int sysctl_sched_child_runs_first;
+
+enum sched_tunable_scaling {
+	SCHED_TUNABLESCALING_NONE,
+	SCHED_TUNABLESCALING_LOG,
+	SCHED_TUNABLESCALING_LINEAR,
+	SCHED_TUNABLESCALING_END,
+};
+extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+
+#ifdef CONFIG_SCHED_DEBUG
+extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_sched_time_avg;
+extern unsigned int sysctl_timer_migration;
+extern unsigned int sysctl_sched_shares_window;
+
+int sched_proc_update_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *length,
+		loff_t *ppos);
+#endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+	return sysctl_timer_migration;
+}
+#else
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+	return 1;
+}
+#endif
+extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
+
+int sched_rt_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+extern unsigned int sysctl_sched_autogroup_enabled;
+
+extern void sched_autogroup_create_attach(struct task_struct *p);
+extern void sched_autogroup_detach(struct task_struct *p);
+extern void sched_autogroup_fork(struct signal_struct *sig);
+extern void sched_autogroup_exit(struct signal_struct *sig);
+#ifdef CONFIG_PROC_FS
+extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
+extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
+#endif
+#else
+static inline void sched_autogroup_create_attach(struct task_struct *p) { }
+static inline void sched_autogroup_detach(struct task_struct *p) { }
+static inline void sched_autogroup_fork(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+#endif
+
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(struct task_struct *p);
+extern void rt_mutex_setprio(struct task_struct *p, int prio);
+extern void rt_mutex_adjust_pi(struct task_struct *p);
+static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
+{
+	return tsk->pi_blocked_on != NULL;
+}
+#else
+static inline int rt_mutex_getprio(struct task_struct *p)
+{
+	return p->normal_prio;
+}
+# define rt_mutex_adjust_pi(p)		do { } while (0)
+static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
+{
+	return false;
+}
+#endif
+
+extern bool yield_to(struct task_struct *p, bool preempt);
+extern void set_user_nice(struct task_struct *p, long nice);
+extern int task_prio(const struct task_struct *p);
+extern int task_nice(const struct task_struct *p);
+extern int can_nice(const struct task_struct *p, const int nice);
+extern int task_curr(const struct task_struct *p);
+extern int idle_cpu(int cpu);
+extern int sched_setscheduler(struct task_struct *, int,
+			      const struct sched_param *);
+extern int sched_setscheduler_nocheck(struct task_struct *, int,
+				      const struct sched_param *);
+extern struct task_struct *idle_task(int cpu);
+/**
+ * is_idle_task - is the specified task an idle task?
+ * @p: the task in question.
+ */
+static inline bool is_idle_task(const struct task_struct *p)
+{
+	return p->pid == 0;
+}
+extern struct task_struct *curr_task(int cpu);
+extern void set_curr_task(int cpu, struct task_struct *p);
+
+void yield(void);
+
+/*
+ * The default (Linux) execution domain.
+ */
+extern struct exec_domain	default_exec_domain;
+
+union thread_union {
+	struct thread_info thread_info;
+	unsigned long stack[THREAD_SIZE/sizeof(long)];
+};
+
+#ifndef __HAVE_ARCH_KSTACK_END
+static inline int kstack_end(void *addr)
+{
+	/* Reliable end of stack detection:
+	 * Some APM bios versions misalign the stack
+	 */
+	return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
+}
+#endif
+
+extern union thread_union init_thread_union;
+extern struct task_struct init_task;
+
+extern struct   mm_struct init_mm;
+
+extern struct pid_namespace init_pid_ns;
+
+/*
+ * find a task by one of its numerical ids
+ *
+ * find_task_by_pid_ns():
+ *      finds a task by its pid in the specified namespace
+ * find_task_by_vpid():
+ *      finds a task by its virtual pid
+ *
+ * see also find_vpid() etc in include/linux/pid.h
+ */
+
+extern struct task_struct *find_task_by_vpid(pid_t nr);
+extern struct task_struct *find_task_by_pid_ns(pid_t nr,
+		struct pid_namespace *ns);
+
+extern void __set_special_pids(struct pid *pid);
+
+/* per-UID process charging. */
+extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
+static inline struct user_struct *get_uid(struct user_struct *u)
+{
+	atomic_inc(&u->__count);
+	return u;
+}
+extern void free_uid(struct user_struct *);
+extern void release_uids(struct user_namespace *ns);
+
+#include <asm/current.h>
+
+extern void xtime_update(unsigned long ticks);
+
+extern int wake_up_state(struct task_struct *tsk, unsigned int state);
+extern int wake_up_process(struct task_struct *tsk);
+extern void wake_up_new_task(struct task_struct *tsk);
+#ifdef CONFIG_SMP
+ extern void kick_process(struct task_struct *tsk);
+#else
+ static inline void kick_process(struct task_struct *tsk) { }
+#endif
+extern void sched_fork(struct task_struct *p);
+extern void sched_dead(struct task_struct *p);
+
+extern void proc_caches_init(void);
+extern void flush_signals(struct task_struct *);
+extern void __flush_signals(struct task_struct *);
+extern void ignore_signals(struct task_struct *);
+extern void flush_signal_handlers(struct task_struct *, int force_default);
+extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
+
+static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
+	ret = dequeue_signal(tsk, mask, info);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+
+	return ret;
+}
+
+extern void block_all_signals(int (*notifier)(void *priv), void *priv,
+			      sigset_t *mask);
+extern void unblock_all_signals(void);
+extern void release_task(struct task_struct * p);
+extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+extern int force_sigsegv(int, struct task_struct *);
+extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
+extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
+extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
+				const struct cred *, u32);
+extern int kill_pgrp(struct pid *pid, int sig, int priv);
+extern int kill_pid(struct pid *pid, int sig, int priv);
+extern int kill_proc_info(int, struct siginfo *, pid_t);
+extern __must_check bool do_notify_parent(struct task_struct *, int);
+extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
+extern void force_sig(int, struct task_struct *);
+extern int send_sig(int, struct task_struct *, int);
+extern int zap_other_threads(struct task_struct *p);
+extern struct sigqueue *sigqueue_alloc(void);
+extern void sigqueue_free(struct sigqueue *);
+extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
+extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
+extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
+
+static inline int kill_cad_pid(int sig, int priv)
+{
+	return kill_pid(cad_pid, sig, priv);
+}
+
+/* These can be the second arg to send_sig_info/send_group_sig_info.  */
+#define SEND_SIG_NOINFO ((struct siginfo *) 0)
+#define SEND_SIG_PRIV	((struct siginfo *) 1)
+#define SEND_SIG_FORCED	((struct siginfo *) 2)
+
+/*
+ * True if we are on the alternate signal stack.
+ */
+static inline int on_sig_stack(unsigned long sp)
+{
+#ifdef CONFIG_STACK_GROWSUP
+	return sp >= current->sas_ss_sp &&
+		sp - current->sas_ss_sp < current->sas_ss_size;
+#else
+	return sp > current->sas_ss_sp &&
+		sp - current->sas_ss_sp <= current->sas_ss_size;
+#endif
+}
+
+static inline int sas_ss_flags(unsigned long sp)
+{
+	return (current->sas_ss_size == 0 ? SS_DISABLE
+		: on_sig_stack(sp) ? SS_ONSTACK : 0);
+}
+
+/*
+ * Routines for handling mm_structs
+ */
+extern struct mm_struct * mm_alloc(void);
+
+/* mmdrop drops the mm and the page tables */
+extern void __mmdrop(struct mm_struct *);
+static inline void mmdrop(struct mm_struct * mm)
+{
+	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
+		__mmdrop(mm);
+}
+
+/* mmput gets rid of the mappings and all user-space */
+extern void mmput(struct mm_struct *);
+/* Grab a reference to a task's mm, if it is not already going away */
+extern struct mm_struct *get_task_mm(struct task_struct *task);
+/*
+ * Grab a reference to a task's mm, if it is not already going away
+ * and ptrace_may_access with the mode parameter passed to it
+ * succeeds.
+ */
+extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
+/* Remove the current tasks stale references to the old mm_struct */
+extern void mm_release(struct task_struct *, struct mm_struct *);
+/* Allocate a new mm structure and copy contents from tsk->mm */
+extern struct mm_struct *dup_mm(struct task_struct *tsk);
+
+extern int copy_thread(unsigned long, unsigned long, unsigned long,
+			struct task_struct *, struct pt_regs *);
+extern void flush_thread(void);
+extern void exit_thread(void);
+
+extern void exit_files(struct task_struct *);
+extern void __cleanup_sighand(struct sighand_struct *);
+
+extern void exit_itimers(struct signal_struct *);
+extern void flush_itimer_signals(void);
+
+extern void do_group_exit(int);
+
+extern void daemonize(const char *, ...);
+extern int allow_signal(int);
+extern int disallow_signal(int);
+
+extern int do_execve(const char *,
+		     const char __user * const __user *,
+		     const char __user * const __user *, struct pt_regs *);
+extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+struct task_struct *fork_idle(int);
+
+extern void set_task_comm(struct task_struct *tsk, char *from);
+extern char *get_task_comm(char *to, struct task_struct *tsk);
+
+#ifdef CONFIG_SMP
+void scheduler_ipi(void);
+extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
+#else
+static inline void scheduler_ipi(void) { }
+static inline unsigned long wait_task_inactive(struct task_struct *p,
+					       long match_state)
+{
+	return 1;
+}
+#endif
+
+#define next_task(p) \
+	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
+
+#define for_each_process(p) \
+	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
+
+extern bool current_is_single_threaded(void);
+
+/*
+ * Careful: do_each_thread/while_each_thread is a double loop so
+ *          'break' will not work as expected - use goto instead.
+ */
+#define do_each_thread(g, t) \
+	for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
+
+#define while_each_thread(g, t) \
+	while ((t = next_thread(t)) != g)
+
+static inline int get_nr_threads(struct task_struct *tsk)
+{
+	return tsk->signal->nr_threads;
+}
+
+static inline bool thread_group_leader(struct task_struct *p)
+{
+	return p->exit_signal >= 0;
+}
+
+/* Do to the insanities of de_thread it is possible for a process
+ * to have the pid of the thread group leader without actually being
+ * the thread group leader.  For iteration through the pids in proc
+ * all we care about is that we have a task with the appropriate
+ * pid, we don't actually care if we have the right task.
+ */
+static inline int has_group_leader_pid(struct task_struct *p)
+{
+	return p->pid == p->tgid;
+}
+
+static inline
+int same_thread_group(struct task_struct *p1, struct task_struct *p2)
+{
+	return p1->tgid == p2->tgid;
+}
+
+static inline struct task_struct *next_thread(const struct task_struct *p)
+{
+	return list_entry_rcu(p->thread_group.next,
+			      struct task_struct, thread_group);
+}
+
+static inline int thread_group_empty(struct task_struct *p)
+{
+	return list_empty(&p->thread_group);
+}
+
+#define delay_group_leader(p) \
+		(thread_group_leader(p) && !thread_group_empty(p))
+
+/*
+ * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
+ * subscriptions and synchronises with wait4().  Also used in procfs.  Also
+ * pins the final release of task.io_context.  Also protects ->cpuset and
+ * ->cgroup.subsys[]. And ->vfork_done.
+ *
+ * Nests both inside and outside of read_lock(&tasklist_lock).
+ * It must not be nested with write_lock_irq(&tasklist_lock),
+ * neither inside nor outside.
+ */
+static inline void task_lock(struct task_struct *p)
+{
+	spin_lock(&p->alloc_lock);
+}
+
+static inline void task_unlock(struct task_struct *p)
+{
+	spin_unlock(&p->alloc_lock);
+}
+
+extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
+							unsigned long *flags);
+
+static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+						       unsigned long *flags)
+{
+	struct sighand_struct *ret;
+
+	ret = __lock_task_sighand(tsk, flags);
+	(void)__cond_lock(&tsk->sighand->siglock, ret);
+	return ret;
+}
+
+static inline void unlock_task_sighand(struct task_struct *tsk,
+						unsigned long *flags)
+{
+	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
+}
+
+#ifdef CONFIG_CGROUPS
+static inline void threadgroup_change_begin(struct task_struct *tsk)
+{
+	down_read(&tsk->signal->group_rwsem);
+}
+static inline void threadgroup_change_end(struct task_struct *tsk)
+{
+	up_read(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_lock - lock threadgroup
+ * @tsk: member task of the threadgroup to lock
+ *
+ * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
+ * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
+ * perform exec.  This is useful for cases where the threadgroup needs to
+ * stay stable across blockable operations.
+ *
+ * fork and exit paths explicitly call threadgroup_change_{begin|end}() for
+ * synchronization.  While held, no new task will be added to threadgroup
+ * and no existing live task will have its PF_EXITING set.
+ *
+ * During exec, a task goes and puts its thread group through unusual
+ * changes.  After de-threading, exclusive access is assumed to resources
+ * which are usually shared by tasks in the same group - e.g. sighand may
+ * be replaced with a new one.  Also, the exec'ing task takes over group
+ * leader role including its pid.  Exclude these changes while locked by
+ * grabbing cred_guard_mutex which is used to synchronize exec path.
+ */
+static inline void threadgroup_lock(struct task_struct *tsk)
+{
+	/*
+	 * exec uses exit for de-threading nesting group_rwsem inside
+	 * cred_guard_mutex. Grab cred_guard_mutex first.
+	 */
+	mutex_lock(&tsk->signal->cred_guard_mutex);
+	down_write(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_unlock - unlock threadgroup
+ * @tsk: member task of the threadgroup to unlock
+ *
+ * Reverse threadgroup_lock().
+ */
+static inline void threadgroup_unlock(struct task_struct *tsk)
+{
+	up_write(&tsk->signal->group_rwsem);
+	mutex_unlock(&tsk->signal->cred_guard_mutex);
+}
+#else
+static inline void threadgroup_change_begin(struct task_struct *tsk) {}
+static inline void threadgroup_change_end(struct task_struct *tsk) {}
+static inline void threadgroup_lock(struct task_struct *tsk) {}
+static inline void threadgroup_unlock(struct task_struct *tsk) {}
+#endif
+
+#ifndef __HAVE_THREAD_FUNCTIONS
+
+#define task_thread_info(task)	((struct thread_info *)(task)->stack)
+#define task_stack_page(task)	((task)->stack)
+
+static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
+{
+	*task_thread_info(p) = *task_thread_info(org);
+	task_thread_info(p)->task = p;
+}
+
+static inline unsigned long *end_of_stack(struct task_struct *p)
+{
+	return (unsigned long *)(task_thread_info(p) + 1);
+}
+
+#endif
+
+static inline int object_is_on_stack(void *obj)
+{
+	void *stack = task_stack_page(current);
+
+	return (obj >= stack) && (obj < (stack + THREAD_SIZE));
+}
+
+extern void thread_info_cache_init(void);
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static inline unsigned long stack_not_used(struct task_struct *p)
+{
+	unsigned long *n = end_of_stack(p);
+
+	do { 	/* Skip over canary */
+		n++;
+	} while (!*n);
+
+	return (unsigned long)n - (unsigned long)end_of_stack(p);
+}
+#endif
+
+/* set thread flags in other task's structures
+ * - see asm/thread_info.h for TIF_xxxx flags available
+ */
+static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	set_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	clear_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	return test_ti_thread_flag(task_thread_info(tsk), flag);
+}
+
+static inline void set_tsk_need_resched(struct task_struct *tsk)
+{
+	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+}
+
+static inline void clear_tsk_need_resched(struct task_struct *tsk)
+{
+	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+}
+
+static inline int test_tsk_need_resched(struct task_struct *tsk)
+{
+	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
+}
+
+static inline int restart_syscall(void)
+{
+	set_tsk_thread_flag(current, TIF_SIGPENDING);
+	return -ERESTARTNOINTR;
+}
+
+static inline int signal_pending(struct task_struct *p)
+{
+	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
+}
+
+static inline int __fatal_signal_pending(struct task_struct *p)
+{
+	return unlikely(sigismember(&p->pending.signal, SIGKILL));
+}
+
+static inline int fatal_signal_pending(struct task_struct *p)
+{
+	return signal_pending(p) && __fatal_signal_pending(p);
+}
+
+static inline int signal_pending_state(long state, struct task_struct *p)
+{
+	if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
+		return 0;
+	if (!signal_pending(p))
+		return 0;
+
+	return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
+}
+
+static inline int need_resched(void)
+{
+	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
+}
+
+/*
+ * cond_resched() and cond_resched_lock(): latency reduction via
+ * explicit rescheduling in places that are safe. The return
+ * value indicates whether a reschedule was done in fact.
+ * cond_resched_lock() will drop the spinlock before scheduling,
+ * cond_resched_softirq() will enable bhs before scheduling.
+ */
+extern int _cond_resched(void);
+
+#define cond_resched() ({			\
+	__might_sleep(__FILE__, __LINE__, 0);	\
+	_cond_resched();			\
+})
+
+extern int __cond_resched_lock(spinlock_t *lock);
+
+#ifdef CONFIG_PREEMPT_COUNT
+#define PREEMPT_LOCK_OFFSET	PREEMPT_OFFSET
+#else
+#define PREEMPT_LOCK_OFFSET	0
+#endif
+
+#define cond_resched_lock(lock) ({				\
+	__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
+	__cond_resched_lock(lock);				\
+})
+
+extern int __cond_resched_softirq(void);
+
+#define cond_resched_softirq() ({					\
+	__might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);	\
+	__cond_resched_softirq();					\
+})
+
+/*
+ * Does a critical section need to be broken due to another
+ * task waiting?: (technically does not depend on CONFIG_PREEMPT,
+ * but a general need for low latency)
+ */
+static inline int spin_needbreak(spinlock_t *lock)
+{
+#ifdef CONFIG_PREEMPT
+	return spin_is_contended(lock);
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Thread group CPU time accounting.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+	raw_spin_lock_init(&sig->cputimer.lock);
+}
+
+/*
+ * Reevaluate whether the task has signals pending delivery.
+ * Wake the task if so.
+ * This is required every time the blocked sigset_t changes.
+ * callers must hold sighand->siglock.
+ */
+extern void recalc_sigpending_and_wake(struct task_struct *t);
+extern void recalc_sigpending(void);
+
+extern void signal_wake_up(struct task_struct *t, int resume_stopped);
+
+/*
+ * Wrappers for p->thread_info->cpu access. No-op on UP.
+ */
+#ifdef CONFIG_SMP
+
+static inline unsigned int task_cpu(const struct task_struct *p)
+{
+	return task_thread_info(p)->cpu;
+}
+
+extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
+
+#else
+
+static inline unsigned int task_cpu(const struct task_struct *p)
+{
+	return 0;
+}
+
+static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+}
+
+#endif /* CONFIG_SMP */
+
+extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
+
+extern void normalize_rt_tasks(void);
+
+#ifdef CONFIG_CGROUP_SCHED
+
+extern struct task_group root_task_group;
+
+extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_move_task(struct task_struct *tsk);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern unsigned long sched_group_shares(struct task_group *tg);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+extern int sched_group_set_rt_runtime(struct task_group *tg,
+				      long rt_runtime_us);
+extern long sched_group_rt_runtime(struct task_group *tg);
+extern int sched_group_set_rt_period(struct task_group *tg,
+				      long rt_period_us);
+extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
+#endif
+#endif
+
+extern int task_can_switch_user(struct user_struct *up,
+					struct task_struct *tsk);
+
+#ifdef CONFIG_TASK_XACCT
+static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
+{
+	tsk->ioac.rchar += amt;
+}
+
+static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
+{
+	tsk->ioac.wchar += amt;
+}
+
+static inline void inc_syscr(struct task_struct *tsk)
+{
+	tsk->ioac.syscr++;
+}
+
+static inline void inc_syscw(struct task_struct *tsk)
+{
+	tsk->ioac.syscw++;
+}
+#else
+static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
+{
+}
+
+static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
+{
+}
+
+static inline void inc_syscr(struct task_struct *tsk)
+{
+}
+
+static inline void inc_syscw(struct task_struct *tsk)
+{
+}
+#endif
+
+#ifndef TASK_SIZE_OF
+#define TASK_SIZE_OF(tsk)	TASK_SIZE
+#endif
+
+#ifdef CONFIG_MM_OWNER
+extern void mm_update_next_owner(struct mm_struct *mm);
+extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
+#else
+static inline void mm_update_next_owner(struct mm_struct *mm)
+{
+}
+
+static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+}
+#endif /* CONFIG_MM_OWNER */
+
+static inline unsigned long task_rlimit(const struct task_struct *tsk,
+		unsigned int limit)
+{
+	return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
+}
+
+static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
+		unsigned int limit)
+{
+	return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
+}
+
+static inline unsigned long rlimit(unsigned int limit)
+{
+	return task_rlimit(current, limit);
+}
+
+static inline unsigned long rlimit_max(unsigned int limit)
+{
+	return task_rlimit_max(current, limit);
+}
+
+#endif /* __KERNEL__ */
+
+#endif
diff -ruN linux-3.4.1/init/Kconfig linux-3.4.1-RIFS/init/Kconfig
--- linux-3.4.1/init/Kconfig	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/init/Kconfig	2012-06-08 22:15:30.000000000 +0800
@@ -29,6 +29,18 @@
 
 menu "General setup"
 
+config SCHED_RIFS
+	bool "RIFS cpu scheduler"
+	---help---
+	  The RIFS cpu scheduler is designed for excellent interactivity and
+	  responsiveness.
+
+	  Currently incompatible with the Group CPU scheduler, and RCU TORTURE
+          TEST so these options are disabled.
+
+          Say Y here.
+	default y
+
 config EXPERIMENTAL
 	bool "Prompt for development and/or incomplete code/drivers"
 	---help---
@@ -631,6 +643,7 @@
 
 config CGROUP_CPUACCT
 	bool "Simple CPU accounting cgroup subsystem"
+	depends on !SCHED_RIFS
 	help
 	  Provides a simple Resource Controller for monitoring the
 	  total CPU consumed by the tasks in a cgroup.
@@ -718,6 +731,7 @@
 
 menuconfig CGROUP_SCHED
 	bool "Group CPU scheduler"
+	depends on !SCHED_RIFS
 	default n
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
@@ -854,6 +868,7 @@
 
 config SCHED_AUTOGROUP
 	bool "Automatic process group scheduling"
+	depends on !SCHED_RIFS
 	select EVENTFD
 	select CGROUPS
 	select CGROUP_SCHED
diff -ruN linux-3.4.1/kernel/delayacct.c linux-3.4.1-RIFS/kernel/delayacct.c
--- linux-3.4.1/kernel/delayacct.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/delayacct.c	2012-06-06 15:03:30.000000000 +0800
@@ -130,7 +130,7 @@
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->se.sum_exec_runtime;
+	t3 = tsk_seruntime(tsk);
 
 	d->cpu_count += t1;
 
diff -ruN linux-3.4.1/kernel/exit.c linux-3.4.1-RIFS/kernel/exit.c
--- linux-3.4.1/kernel/exit.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/exit.c	2012-06-05 20:46:45.000000000 +0800
@@ -133,7 +133,7 @@
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
-		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+		sig->sum_sched_runtime += tsk_seruntime(tsk);
 	}
 
 	sig->nr_threads--;
diff -ruN linux-3.4.1/kernel/posix-cpu-timers.c linux-3.4.1-RIFS/kernel/posix-cpu-timers.c
--- linux-3.4.1/kernel/posix-cpu-timers.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/posix-cpu-timers.c	2012-06-06 14:37:35.000000000 +0800
@@ -495,7 +495,7 @@
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
 	cleanup_timers(tsk->cpu_timers,
-		       tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+		       tsk->utime, tsk->stime, tsk_seruntime(tsk));
 
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -504,7 +504,7 @@
 
 	cleanup_timers(tsk->signal->cpu_timers,
 		       tsk->utime + sig->utime, tsk->stime + sig->stime,
-		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
+		       tsk_seruntime(tsk) + sig->sum_sched_runtime);
 }
 
 static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -934,7 +934,7 @@
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
+		if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) {
 			tsk->cputime_expires.sched_exp = t->expires.sched;
 			break;
 		}
@@ -951,7 +951,7 @@
 			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
-		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+		    tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the hard limit, we just die.
 			 * No need to calculate anything else now.
@@ -959,7 +959,7 @@
 			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
 			return;
 		}
-		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+		if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the soft limit, send a SIGXCPU every second.
 			 */
@@ -1252,7 +1252,7 @@
 		struct task_cputime task_sample = {
 			.utime = tsk->utime,
 			.stime = tsk->stime,
-			.sum_exec_runtime = tsk->se.sum_exec_runtime
+			.sum_exec_runtime = tsk_seruntime(tsk)
 		};
 
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
diff -ruN linux-3.4.1/kernel/sched/Makefile linux-3.4.1-RIFS/kernel/sched/Makefile
--- linux-3.4.1/kernel/sched/Makefile	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sched/Makefile	2012-06-05 20:42:22.000000000 +0800
@@ -11,10 +11,13 @@
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+ifdef CONFIG_SCHED_RIFS
+obj-y += rifs.o clock.o
+else
 obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
-obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+endif
+obj-$(CONFIG_SMP) += cpupri.o
 
 
diff -ruN linux-3.4.1/kernel/sched/rifs.c linux-3.4.1-RIFS/kernel/sched/rifs.c
--- linux-3.4.1/kernel/sched/rifs.c	1970-01-01 08:00:00.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sched/rifs.c	2012-06-16 22:57:50.000000000 +0800
@@ -0,0 +1,6692 @@
+/*
+ *  kernel/sched/rifs.c
+ *
+ *  Kernel scheduler and related syscalls
+ *
+ *  Copyright (C) 1991-2002  Linus Torvalds
+ *
+ *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
+ *		make semaphores SMP safe
+ *  1998-11-19	Implemented schedule_timeout() and related stuff
+ *		by Andrea Arcangeli
+ *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Cleanups and useful suggestions
+ *		by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03	Interactivity tuning by Con Kolivas.
+ *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2007-04-15  Work begun on replacing all interactivity tuning with a
+ *			  fair scheduling design by Con Kolivas.
+ *  2007-05-05  Load balancing (smp-nice) and other improvements
+ *			  by Peter Williams
+ *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
+ *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ *			  Thomas Gleixner, Mike Kravetz
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+#include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
+#include <linux/perf_event.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/freezer.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kprobes.h>
+#include <linux/delayacct.h>
+#include <linux/log2.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/init_task.h>
+#include <linux/math64.h>
+
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+#include <asm/unistd.h>
+#include <asm/mutex.h>
+
+#include "cpupri.h"
+#include "../workqueue_sched.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
+#define MAX_TICK_USED		(rr_interval * 3)
+#define MAX_SLEEP_COUNT		(rr_interval * 3)
+#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)		rt_prio((p)->prio)
+#define rt_queue(rq)		rt_prio((rq)->rq_prio)
+#define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
+#define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
+					(policy) == SCHED_RR)
+#define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
+#define idleprio_task(p)	unlikely((p)->policy == SCHED_IDLEPRIO)
+#define reverse(max, val)	(max - val)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+#define SCHED_PRIO(p)		((p) + MAX_RT_PRIO)
+#define STOP_PRIO		(MAX_RT_PRIO - 1)
+
+/*
+ * Some helpers for converting to/from various scales. Use shifts to get
+ * approximate multiples of ten for less overhead.
+ */
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define JIFFY_NS		(1000000000 / HZ)
+#define HALF_JIFFY_NS		(1000000000 / HZ / 2)
+#define HALF_JIFFY_US		(1000000 / HZ / 2)
+#define MS_TO_NS(TIME)		((TIME) << 20)
+#define MS_TO_US(TIME)		((TIME) << 10)
+#define NS_TO_MS(TIME)		((TIME) >> 20)
+#define NS_TO_US(TIME)		((TIME) >> 10)
+
+#define RESCHED_US	(100) /* Reschedule if less than this many μs left */
+
+void print_scheduler_version(void)
+{
+	printk(KERN_INFO "Rotary Interactivity Favor Scheduler - RIFS By QQ:3766691.\n");
+}
+
+/*
+ * This is the time all tasks within the same priority round robin.
+ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
+ * Tunable via /proc interface.
+ */
+int rr_interval __read_mostly = 6;
+
+/*
+ * 兼容。
+ */
+int sched_iso_cpu __read_mostly = 0;
+
+/*
+ * time_slice for each process
+ */
+#define timeslice() MS_TO_US(rr_interval)
+
+#define get_time_slice(p) { \
+	p->time_slice = timeslice();	\
+}
+
+/*
+ * The global runqueue data that all CPUs work off. Data is protected either
+ * by the global grq lock, or the discrete lock that precedes the data in this
+ * struct.
+ */
+struct global_rq {
+	raw_spinlock_t lock;
+	unsigned long nr_running;
+	unsigned long nr_uninterruptible;
+	unsigned long long nr_switches;
+	struct list_head queue[PRIO_LIMIT];
+	DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
+#ifdef CONFIG_SMP
+	unsigned long qnr; /* queued not running */
+	cpumask_t cpu_idle_map;
+	bool idle_cpus;
+#endif
+	int noc; /* num_online_cpus stored and updated when it changes */
+};
+
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+	atomic_t refcount;
+	atomic_t rto_count;
+	struct rcu_head rcu;
+	cpumask_var_t span;
+	cpumask_var_t online;
+
+	/*
+	 * The "RT overload" flag: it gets set if a CPU has more than
+	 * one runnable RT task.
+	 */
+	cpumask_var_t rto_mask;
+	struct cpupri cpupri;
+};
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/* There can be only one */
+static struct global_rq grq;
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ * This data should only be modified by the local cpu.
+ */
+struct rq {
+	struct task_struct *curr, *idle, *stop;
+	struct mm_struct *prev_mm;
+	u64 last_run_scale;
+
+	unsigned int rq_policy;
+	u64 rq_last_ran;
+	int rq_prio;
+	bool rq_running; /* There is a task running */
+
+	/* Accurate timekeeping data */
+	u64 timekeep_clock;
+	unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
+		iowait_pc, idle_pc;
+	long account_pc;
+	atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+	int cpu;		/* cpu of this runqueue */
+	bool online;
+	bool scaling; /* This CPU is managed by a scaling CPU freq governor */
+	struct task_struct *sticky_task;
+
+	struct root_domain *rd;
+	struct sched_domain *sd;
+	int *cpu_locality; /* CPU relative cache distance */
+#ifdef CONFIG_SCHED_SMT
+	bool (*siblings_idle)(int cpu);
+	/* See if all smt siblings are idle */
+	cpumask_t smt_siblings;
+#endif
+#ifdef CONFIG_SCHED_MC
+	bool (*cache_idle)(int cpu);
+	/* See if all cache siblings are idle */
+	cpumask_t cache_siblings;
+#endif
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	u64 prev_irq_time;
+#endif
+
+	u64 clock;
+	u64 clock_task;
+};
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+static DEFINE_MUTEX(sched_hotcpu_mutex);
+
+#ifdef CONFIG_SMP
+/*
+ * sched_domains_mutex serialises calls to init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+static DEFINE_MUTEX(sched_domains_mutex);
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+	   return 0*SD_ASYM_PACKING;
+}
+#endif
+
+#define rcu_dereference_check_sched_domain(p) \
+	rcu_dereference_check((p), \
+				  lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+
+static inline void update_rq_clock(struct rq *rq);
+
+/*
+ * Sanity check should sched_clock return bogus values. We make sure it does
+ * not appear to go backwards, and use jiffies to determine the maximum and
+ * minimum it could possibly have increased, and round down to the nearest
+ * jiffy when it falls outside this.
+ */
+static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
+{
+	unsigned long min_diff, max_diff;
+
+	if (jiff_diff > 1)
+		min_diff = JIFFIES_TO_NS(jiff_diff - 1);
+	else
+		min_diff = 1;
+	/*  Round up to the nearest tick for maximum */
+	max_diff = JIFFIES_TO_NS(jiff_diff + 1);
+
+	if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
+		*niff_diff = min_diff;
+}
+
+#ifdef CONFIG_SMP
+#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
+#define this_rq()		(&__get_cpu_var(runqueues))
+#define task_rq(p)		cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+static inline int cpu_of(struct rq *rq)
+{
+	return rq->cpu;
+}
+
+#else /* CONFIG_SMP */
+static struct rq *uprq;
+#define cpu_rq(cpu)	(uprq)
+#define this_rq()	(uprq)
+#define task_rq(p)	(uprq)
+#define cpu_curr(cpu)	((uprq)->curr)
+static inline int cpu_of(struct rq *rq)
+{
+	return 0;
+}
+
+#endif
+#define raw_rq()	(&__raw_get_cpu_var(runqueues))
+
+#include "stats.h"
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)	do { } while (0)
+#endif
+
+/*
+ * All common locking functions performed on grq.lock. rq->clock is local to
+ * the CPU accessing it so it can be modified just with interrupts disabled
+ * when we're not updating the time.
+ * Looking up task_rq must be done under grq.lock to be safe.
+ */
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
+static inline void update_rq_clock(struct rq *rq)
+{
+	s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+
+	rq->clock += delta;
+	update_rq_clock_task(rq, delta);
+}
+
+static inline bool task_running(struct task_struct *p)
+{
+	return p->on_cpu;
+}
+
+static inline void grq_lock(void)
+	__acquires(grq.lock)
+{
+	raw_spin_lock(&grq.lock);
+}
+
+static inline void grq_unlock(void)
+	__releases(grq.lock)
+{
+	raw_spin_unlock(&grq.lock);
+}
+
+static inline void grq_lock_irq(void)
+	__acquires(grq.lock)
+{
+	raw_spin_lock_irq(&grq.lock);
+}
+
+static inline void time_lock_grq(struct rq *rq)
+	__acquires(grq.lock)
+{
+	grq_lock();
+}
+
+static inline void grq_unlock_irq(void)
+	__releases(grq.lock)
+{
+	raw_spin_unlock_irq(&grq.lock);
+}
+
+static inline void grq_lock_irqsave(unsigned long *flags)
+	__acquires(grq.lock)
+{
+	raw_spin_lock_irqsave(&grq.lock, *flags);
+}
+
+static inline void grq_unlock_irqrestore(unsigned long *flags)
+	__releases(grq.lock)
+{
+	raw_spin_unlock_irqrestore(&grq.lock, *flags);
+}
+
+static inline struct rq
+*task_grq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(grq.lock)
+{
+	grq_lock_irqsave(flags);
+	return task_rq(p);
+}
+
+static inline struct rq
+*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(grq.lock)
+{
+	struct rq *rq = task_grq_lock(p, flags);
+	return rq;
+}
+
+static inline struct rq *task_grq_lock_irq(struct task_struct *p)
+	__acquires(grq.lock)
+{
+	grq_lock_irq();
+	return task_rq(p);
+}
+
+static inline void task_grq_unlock_irq(void)
+	__releases(grq.lock)
+{
+	grq_unlock_irq();
+}
+
+static inline void task_grq_unlock(unsigned long *flags)
+	__releases(grq.lock)
+{
+	grq_unlock_irqrestore(flags);
+}
+
+/**
+ * grunqueue_is_locked
+ *
+ * Returns true if the global runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+bool grunqueue_is_locked(void)
+{
+	return raw_spin_is_locked(&grq.lock);
+}
+
+void grq_unlock_wait(void)
+	__releases(grq.lock)
+{
+	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+	raw_spin_unlock_wait(&grq.lock);
+}
+
+static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
+	__acquires(grq.lock)
+{
+	local_irq_save(*flags);
+	time_lock_grq(rq);
+}
+
+static inline struct rq *__task_grq_lock(struct task_struct *p)
+	__acquires(grq.lock)
+{
+	grq_lock();
+	return task_rq(p);
+}
+
+static inline void __task_grq_unlock(void)
+	__releases(grq.lock)
+{
+	grq_unlock();
+}
+
+/*
+ * Look for any tasks *anywhere* that are running nice 0 or better. We do
+ * this lockless for overhead reasons since the occasional wrong result
+ * is harmless.
+ */
+bool above_background_load(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct task_struct *cpu_curr = cpu_rq(cpu)->curr;
+
+		if (unlikely(!cpu_curr))
+			continue;
+		if (PRIO_TO_NICE(cpu_curr->static_prio) < 1) {
+			return true;
+		}
+	}
+	return false;
+}
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+	/* this is a valid case when another task releases the spinlock */
+	grq.lock.owner = current;
+#endif
+	/*
+	 * If we are tracking spinlock dependencies then we have to
+	 * fix up the runqueue lock - which gets 'carried over' from
+	 * prev into current:
+	 */
+	spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_);
+
+	grq_unlock_irq();
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	grq_unlock_irq();
+#else
+	grq_unlock();
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+	smp_wmb();
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+/*
+ * A task that is queued but not running will be on the grq run list.
+ * A task that is not running or queued will not be on the grq run list.
+ * A task that is currently running will have ->on_cpu set but not on the
+ * grq run list.
+ */
+static inline bool task_queued(struct task_struct *p)
+{
+	return (!list_empty(&p->run_list));
+}
+
+/*
+ * Removing from the global runqueue. Enter with grq locked.
+ */
+static void dequeue_task(struct task_struct *p)
+{
+	int prio = p->prio;
+	list_del_init(&p->run_list);
+	if (list_empty(grq.queue + prio))
+		__clear_bit(prio, grq.prio_bitmap);
+}
+
+/*
+ * Adding to the global runqueue. Enter with grq locked.
+ */
+static void enqueue_task(struct task_struct *p)
+{
+	int prio = p->prio;
+	__set_bit(prio, grq.prio_bitmap);
+	list_add_tail(&p->run_list, grq.queue + prio);
+}
+
+/* Only idle task does this as a real time task*/
+static inline void enqueue_task_head(struct task_struct *p)
+{
+	int prio = p->prio;
+	__set_bit(prio, grq.prio_bitmap);
+	list_add(&p->run_list, grq.queue + prio);
+}
+
+static inline void requeue_task(struct task_struct *p)
+{
+}
+
+#ifdef CONFIG_SMP
+/*
+ * qnr is the "queued but not running" count which is the total number of
+ * tasks on the global runqueue list waiting for cpu time but not actually
+ * currently running on a cpu.
+ */
+static inline void inc_qnr(void)
+{
+	grq.qnr++;
+}
+
+static inline void dec_qnr(void)
+{
+	grq.qnr--;
+}
+
+static inline int queued_notrunning(void)
+{
+	return grq.qnr;
+}
+
+/*
+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
+ * allow easy lookup of whether any suitable idle CPUs are available.
+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
+ * idle_cpus variable than to do a full bitmask check when we are busy.
+ */
+static inline void set_cpuidle_map(int cpu)
+{
+	if (likely(cpu_online(cpu))) {
+		cpu_set(cpu, grq.cpu_idle_map);
+		grq.idle_cpus = true;
+	}
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+	cpu_clear(cpu, grq.cpu_idle_map);
+	if (cpus_empty(grq.cpu_idle_map))
+		grq.idle_cpus = false;
+}
+
+static bool suitable_idle_cpus(struct task_struct *p)
+{
+	if (!grq.idle_cpus)
+		return false;
+	return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map));
+}
+
+#define CPUIDLE_DIFF_THREAD	(1)
+#define CPUIDLE_DIFF_CORE	(2)
+#define CPUIDLE_CACHE_BUSY	(4)
+#define CPUIDLE_DIFF_CPU	(8)
+#define CPUIDLE_THREAD_BUSY	(16)
+#define CPUIDLE_DIFF_NODE	(32)
+
+static void resched_task(struct task_struct *p);
+
+/*
+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the
+ * lowest value would give the most suitable CPU to schedule p onto next. The
+ * order works out to be the following:
+ *
+ * Same core, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+ */
+static void
+resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+	unsigned int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
+		CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
+		CPUIDLE_DIFF_THREAD;
+	int cpu_tmp;
+
+	if (cpu_isset(best_cpu, *tmpmask))
+		goto out;
+
+	for_each_cpu_mask(cpu_tmp, *tmpmask) {
+		unsigned int ranking;
+		struct rq *tmp_rq;
+
+		ranking = 0;
+		tmp_rq = cpu_rq(cpu_tmp);
+
+#ifdef CONFIG_NUMA
+		if (rq->cpu_locality[cpu_tmp] > 3)
+			ranking |= CPUIDLE_DIFF_NODE;
+		else
+#endif
+		if (rq->cpu_locality[cpu_tmp] > 2)
+			ranking |= CPUIDLE_DIFF_CPU;
+#ifdef CONFIG_SCHED_MC
+		if (rq->cpu_locality[cpu_tmp] == 2)
+			ranking |= CPUIDLE_DIFF_CORE;
+		if (!(tmp_rq->cache_idle(cpu_tmp)))
+			ranking |= CPUIDLE_CACHE_BUSY;
+#endif
+#ifdef CONFIG_SCHED_SMT
+		if (rq->cpu_locality[cpu_tmp] == 1)
+			ranking |= CPUIDLE_DIFF_THREAD;
+		if (!(tmp_rq->siblings_idle(cpu_tmp)))
+			ranking |= CPUIDLE_THREAD_BUSY;
+#endif
+		if (ranking < best_ranking) {
+			best_cpu = cpu_tmp;
+			best_ranking = ranking;
+		}
+	}
+out:
+	resched_task(cpu_rq(best_cpu)->curr);
+}
+
+static void resched_best_idle(struct task_struct *p)
+{
+	cpumask_t tmpmask;
+
+	cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
+	resched_best_mask(task_cpu(p), task_rq(p), &tmpmask);
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+	if (suitable_idle_cpus(p))
+		resched_best_idle(p);
+}
+/*
+ * Flags to tell us whether this CPU is running a CPU frequency governor that
+ * has slowed its speed or not. No locking required as the very rare wrongly
+ * read value would be harmless.
+ */
+void cpu_scaling(int cpu)
+{
+	cpu_rq(cpu)->scaling = true;
+}
+
+void cpu_nonscaling(int cpu)
+{
+	cpu_rq(cpu)->scaling = false;
+}
+
+static inline bool scaling_rq(struct rq *rq)
+{
+	return rq->scaling;
+}
+
+static inline int locality_diff(struct task_struct *p, struct rq *rq)
+{
+	return rq->cpu_locality[task_cpu(p)];
+}
+#else /* CONFIG_SMP */
+static inline void inc_qnr(void)
+{
+}
+
+static inline void dec_qnr(void)
+{
+}
+
+static inline int queued_notrunning(void)
+{
+	return grq.nr_running;
+}
+
+static inline void set_cpuidle_map(int cpu)
+{
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+}
+
+static inline bool suitable_idle_cpus(struct task_struct *p)
+{
+	return current == uprq->idle;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+}
+
+void cpu_scaling(int __unused)
+{
+}
+
+void cpu_nonscaling(int __unused)
+{
+}
+
+/*
+ * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
+ * always returns 0.
+ */
+static inline bool scaling_rq(struct rq *rq)
+{
+	return false;
+}
+
+static inline int locality_diff(struct task_struct *p, struct rq *rq)
+{
+	return 0;
+}
+#endif /* CONFIG_SMP */
+EXPORT_SYMBOL_GPL(cpu_scaling);
+EXPORT_SYMBOL_GPL(cpu_nonscaling);
+
+/*
+ * activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void activate_idle_task(struct task_struct *p)
+{
+	enqueue_task_head(p);
+	grq.nr_running++;
+	inc_qnr();
+}
+
+/*
+ * activate_task - move a task to the runqueue. Enter with grq locked.
+ */
+static void activate_task(struct task_struct *p, struct rq *rq)
+{
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+					 (rq->clock - p->last_ran) >> 20);
+		
+	}
+
+	if (task_contributes_to_load(p))
+		grq.nr_uninterruptible--;
+	enqueue_task_head(p);
+	grq.nr_running++;
+	inc_qnr();
+}
+
+static inline void clear_sticky(struct task_struct *p);
+
+/*
+ * deactivate_task - If it's running, it's not on the grq and we can just
+ * decrement the nr_running. Enter with grq locked.
+ */
+static inline void deactivate_task(struct task_struct *p)
+{
+	if (task_contributes_to_load(p))
+		grq.nr_uninterruptible++;
+	grq.nr_running--;
+	clear_sticky(p);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * The caller should hold grq lock.
+	 */
+	WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
+#endif
+	trace_sched_migrate_task(p, cpu);
+	if (task_cpu(p) != cpu)
+		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+
+	/*
+	 * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
+	 * successfully executed on another CPU. We must ensure that updates of
+	 * per-task data have been completed by this moment.
+	 */
+	smp_wmb();
+	task_thread_info(p)->cpu = cpu;
+}
+
+static inline void clear_sticky(struct task_struct *p)
+{
+	p->sticky = false;
+}
+
+static inline bool task_sticky(struct task_struct *p)
+{
+	return p->sticky;
+}
+
+/* Reschedule the best idle CPU that is not this one. */
+static void
+resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p)
+{
+	cpumask_t tmpmask;
+
+	cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
+	cpu_clear(cpu, tmpmask);
+	if (cpus_empty(tmpmask))
+		return;
+	resched_best_mask(cpu, rq, &tmpmask);
+}
+
+/*
+ * We set the sticky flag on a task that is descheduled involuntarily meaning
+ * it is awaiting further CPU time. If the last sticky task is still sticky
+ * but unlucky enough to not be the next task scheduled, we unstick it and try
+ * to find it an idle CPU. Realtime tasks do not stick to minimise their
+ * latency at all times.
+ */
+static inline void
+swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
+{
+	if (rq->sticky_task) {
+		if (rq->sticky_task == p) {
+			p->sticky = true;
+			return;
+		}
+		if (task_sticky(rq->sticky_task)) {
+			clear_sticky(rq->sticky_task);
+			resched_closest_idle(rq, cpu, rq->sticky_task);
+		}
+	}
+	if (!rt_task(p)) {
+		p->sticky = true;
+		rq->sticky_task = p;
+	} else {
+		resched_closest_idle(rq, cpu, p);
+		rq->sticky_task = NULL;
+	}
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+	rq->sticky_task = NULL;
+	clear_sticky(p);
+}
+#else
+static inline void clear_sticky(struct task_struct *p)
+{
+}
+
+static inline bool task_sticky(struct task_struct *p)
+{
+	return false;
+}
+
+static inline void
+swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
+{
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+#ifdef CONFIG_SMP
+
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
+
+static void resched_task(struct task_struct *p)
+{
+	int cpu;
+
+	assert_raw_spin_locked(&grq.lock);
+
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+		return;
+
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+	cpu = task_cpu(p);
+	if (cpu == smp_processor_id())
+		return;
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(p))
+		smp_send_reschedule(cpu);
+}
+
+#else
+static inline void resched_task(struct task_struct *p)
+{
+	assert_raw_spin_locked(&grq.lock);
+	set_tsk_need_resched(p);
+}
+#endif
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+	return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+struct migration_req {
+	struct task_struct *task;
+	int dest_cpu;
+};
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+	unsigned long flags;
+	bool running, on_rq;
+	unsigned long ncsw;
+	struct rq *rq;
+
+	for (;;) {
+		/*
+		 * We do the initial early heuristics without holding
+		 * any task-queue locks at all. We'll only try to get
+		 * the runqueue lock when things look like they will
+		 * work out! In the unlikely event rq is dereferenced
+		 * since we're lockless, grab it again.
+		 */
+#ifdef CONFIG_SMP
+retry_rq:
+		rq = task_rq(p);
+		if (unlikely(!rq))
+			goto retry_rq;
+#else /* CONFIG_SMP */
+		rq = task_rq(p);
+#endif
+		/*
+		 * If the task is actively running on another CPU
+		 * still, just relax and busy-wait without holding
+		 * any locks.
+		 *
+		 * NOTE! Since we don't hold any locks, it's not
+		 * even sure that "rq" stays as the right runqueue!
+		 * But we don't care, since this will return false
+		 * if the runqueue has changed and p is actually now
+		 * running somewhere else!
+		 */
+		while (task_running(p) && p == rq->curr) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
+			cpu_relax();
+		}
+
+		/*
+		 * Ok, time to look more closely! We need the grq
+		 * lock now, to be *sure*. If we're wrong, we'll
+		 * just go back and repeat.
+		 */
+		rq = task_grq_lock(p, &flags);
+		trace_sched_wait_task(p);
+		running = task_running(p);
+		on_rq = task_queued(p);
+		ncsw = 0;
+		if (!match_state || p->state == match_state)
+			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+		task_grq_unlock(&flags);
+
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
+		/*
+		 * Was it really running after all now that we
+		 * checked with the proper locks actually held?
+		 *
+		 * Oops. Go back and try again..
+		 */
+		if (unlikely(running)) {
+			cpu_relax();
+			continue;
+		}
+
+		/*
+		 * It's not enough that it's not actively running,
+		 * it must be off the runqueue _entirely_, and not
+		 * preempted!
+		 *
+		 * So if it was still runnable (but just not actively
+		 * running right now), it's preempted, and we should
+		 * yield - it could be a while.
+		 */
+		if (unlikely(on_rq)) {
+			ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+			continue;
+		}
+
+		/*
+		 * Ahh, all good. It wasn't running, and it wasn't
+		 * runnable, which means that it will never become
+		 * running in the future either. We're all done!
+		 */
+		break;
+	}
+
+	return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if ((cpu != smp_processor_id()) && task_curr(p))
+		smp_send_reschedule(cpu);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif
+
+#define rq_idle(rq)	((rq)->rq_prio == PRIO_LIMIT)
+
+/*
+ * RT tasks preempt on priority.
+ * NORMAL tasks preempt on sleep time length.
+ * SCHED_IDLEPRIO don't preempt anything else or
+ * between themselves, they cooperatively multitask. An idle rq scores as
+ * prio PRIO_LIMIT so it is always preempted.
+ */
+static inline bool
+can_preempt(struct task_struct *p, int prio)
+{
+	if (p->prio <= prio)
+		return true;
+	return false;
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Check to see if there is a task that is affined only to offline CPUs but
+ * still wants runtime. This happens to kernel threads during suspend/halt and
+ * disabling of CPUs.
+ */
+#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
+static inline bool online_cpus(struct task_struct *p)
+{
+	return (likely(cpus_intersects(cpu_online_map, p->cpus_allowed)));
+}
+#else /* CONFIG_HOTPLUG_CPU */
+/* All available CPUs are always online without hotplug. */
+static inline bool online_cpus(struct task_struct *p)
+{
+	return true;
+}
+#endif
+
+/*
+ * Check to see if p can run on cpu, and if not, whether there are any online
+ * CPUs it can run on instead.
+ */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+	if (unlikely(!cpu_isset(cpu, p->cpus_allowed)))
+		return true;
+	return false;
+}
+
+/*
+ * When all else is equal, still prefer this_rq.
+ */
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+	struct rq *highest_prio_rq = NULL;
+	int cpu, highest_prio = 0;
+	cpumask_t tmp;
+
+	/*
+	 * We clear the sticky flag here because for a task to have called
+	 * try_preempt with the sticky flag enabled means some complicated
+	 * re-scheduling has occurred and we should ignore the sticky flag.
+	 */
+	clear_sticky(p);
+
+	if (suitable_idle_cpus(p)) {
+		resched_best_idle(p);
+		return;
+	}
+
+	/* IDLEPRIO tasks never preempt anything but idle */
+	if (p->policy == SCHED_IDLEPRIO)
+		return;
+
+	if (likely(online_cpus(p)))
+		cpus_and(tmp, cpu_online_map, p->cpus_allowed);
+	else
+		return;
+
+	for_each_cpu_mask(cpu, tmp) {
+		struct rq *rq;
+		int rq_prio;
+
+		rq = cpu_rq(cpu);
+		rq_prio = rq->rq_prio;
+		if (rq_prio < highest_prio)
+			continue;
+		
+		if (rq_prio > highest_prio) {
+			highest_prio = rq_prio;
+			highest_prio_rq = rq;
+		}
+	}
+
+	if (likely(highest_prio_rq)) {
+		if (can_preempt(p, highest_prio))
+			resched_task(highest_prio_rq->curr);
+	}
+}
+#else /* CONFIG_SMP */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+	return false;
+}
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+	if (p->policy == SCHED_IDLEPRIO)
+		return;
+	if (can_preempt(p, uprq->rq_prio)) 
+		resched_task(current);
+}
+#endif /* CONFIG_SMP */
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+}
+
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+				 bool is_sync)
+{
+	activate_task(p, rq);
+
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption if there are no idle cpus,
+	 * instead waiting for current to deschedule.
+	 */
+	if (!is_sync || suitable_idle_cpus(p)) {
+		try_preempt(p, rq);
+	}
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+					bool success)
+{
+	trace_sched_wakeup(p, success);
+	p->state = TASK_RUNNING;
+
+	/*
+	 * if a worker is waking up, notify workqueue. Note that on BFS, we
+	 * don't really know what cpu it will be, so we fake it for
+	 * wq_worker_waking_up :/
+	 */
+	if ((p->flags & PF_WQ_WORKER) && success)
+		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+#ifdef CONFIG_SMP
+void scheduler_ipi(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
+ */
+static bool try_to_wake_up(struct task_struct *p, unsigned int state,
+			  int wake_flags)
+{
+	bool success = false;
+	unsigned long flags;
+	struct rq *rq;
+	int cpu;
+
+	get_cpu();
+
+	/* This barrier is undocumented, probably for p->state? くそ */
+	smp_wmb();
+
+	/*
+	 * No need to do time_lock_grq as we only need to update the rq clock
+	 * if we activate the task
+	 */
+	rq = task_grq_lock(p, &flags);
+	cpu = task_cpu(p);
+
+	/* state is a volatile long, どうして、分からない */
+	if (!((unsigned int)p->state & state))
+		goto out_unlock;
+
+	if (task_queued(p) || task_running(p))
+		goto out_running;
+
+	ttwu_activate(p, rq, wake_flags & WF_SYNC);
+	success = true;
+
+out_running:
+	ttwu_post_activation(p, rq, success);
+out_unlock:
+	task_grq_unlock(&flags);
+
+	ttwu_stat(p, cpu, wake_flags);
+
+	put_cpu();
+
+	return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with grq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that grq is locked and, @p is not the current task.
+ * grq stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	bool success = false;
+
+	lockdep_assert_held(&grq.lock);
+
+	if (!(p->state & TASK_NORMAL))
+		return;
+
+	if (!task_queued(p)) {
+		ttwu_activate(p, rq, false);
+		ttwu_stat(p, smp_processor_id(), 0);
+		success = true;
+	}
+	ttwu_post_activation(p, rq, success);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process(struct task_struct *p)
+{
+	return try_to_wake_up(p, TASK_ALL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+	return try_to_wake_up(p, state, 0);
+}
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+void sched_fork(struct task_struct *p)
+{
+	struct task_struct *curr;
+	int cpu = get_cpu();
+	struct rq *rq;
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+	INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+	/*
+	 * We mark the process as running here. This guarantees that
+	 * nobody will actually run it, and a signal or other external
+	 * event cannot wake it up and insert it on the runqueue either.
+	 */
+	p->state = TASK_RUNNING;
+	set_task_cpu(p, cpu);
+
+	p->sched_time = p->stime_pc = p->utime_pc = 0;
+
+	/*
+	 * Revert to default priority/policy on fork if requested.
+	 */
+	if (unlikely(p->sched_reset_on_fork)) {
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+			p->policy = SCHED_NORMAL;
+		}
+
+		if (PRIO_TO_NICE(p->static_prio) < 0) {
+			p->static_prio = NICE_TO_PRIO(0);
+		}
+
+		/*
+		 * We don't need the reset flag anymore after the fork. It has
+		 * fulfilled its duty:
+		 */
+		p->sched_reset_on_fork = 0;
+	}
+
+	curr = current;
+	/*
+	 * Make sure we do not leak PI boosting priority to the child.
+	 */
+	p->prio = curr->static_prio;
+	p->tick_used = curr->tick_used;
+
+	INIT_LIST_HEAD(&p->run_list);
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+	if (unlikely(sched_info_on()))
+		memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+
+	p->on_cpu = false;
+	clear_sticky(p);
+
+#ifdef CONFIG_PREEMPT_COUNT
+	/* Want to start with kernel preemption disabled. */
+	task_thread_info(p)->preempt_count = 1;
+#endif
+	if (unlikely(p->policy == SCHED_FIFO))
+		goto out;
+	/*
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness. If it's negative, it won't
+	 * matter since that's the same as being 0. current's time_slice is
+	 * actually in timeslice when it's running, as is its last_ran
+	 * value. 
+	 */
+	rq = task_grq_lock_irq(curr);
+	if (likely(curr->time_slice >= RESCHED_US * 2)) {
+		curr->time_slice /= 2;
+		p->time_slice = curr->time_slice;
+	} else {
+		/*
+		 * Forking task has run out of timeslice. Reschedule it.
+		 */
+		curr->time_slice = 0;
+		set_tsk_need_resched(curr);
+		get_time_slice(p);
+	}
+	p->last_ran = rq->rq_last_ran;
+	task_grq_unlock_irq();
+out:
+	put_cpu();
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+	struct task_struct *parent;
+	unsigned long flags;
+	struct rq *rq;
+
+	rq = task_grq_lock(p, &flags);
+	p->state = TASK_RUNNING;
+	p->tick_used = 0;
+	parent = p->parent;
+	/* Unnecessary but small chance that the parent changed CPU */
+	set_task_cpu(p, task_cpu(parent));
+	activate_task(p, rq);
+	trace_sched_wakeup_new(p, 1);
+	if (rq->curr == parent && !suitable_idle_cpus(p)) {
+		/*
+		 * The VM isn't cloned, so we're in a good position to
+		 * do child-runs-first in anticipation of an exec. This
+		 * usually avoids a lot of COW overhead.
+		 */
+		resched_task(parent);
+	} else
+		try_preempt(p, rq);
+	task_grq_unlock(&flags);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+	hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+	hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+	struct preempt_notifier *notifier;
+	struct hlist_node *node;
+
+	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+		notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+	struct preempt_notifier *notifier;
+	struct hlist_node *node;
+
+	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+		notifier->ops->sched_out(notifier, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+			struct task_struct *next)
+{
+	perf_event_task_sched_out(prev, next);
+	fire_sched_out_preempt_notifiers(prev, next);
+	prepare_lock_switch(rq, next);
+	prepare_arch_switch(next);
+	trace_sched_switch(prev, next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock.  (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
+	__releases(grq.lock)
+{
+	struct mm_struct *mm = rq->prev_mm;
+	long prev_state;
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+	 * schedule one last time. The schedule call will never return, and
+	 * the scheduled task must drop that reference.
+	 * The test for TASK_DEAD must occur while the runqueue locks are
+	 * still held, otherwise prev could be scheduled on another cpu, die
+	 * there before we look at prev->state, and then the reference would
+	 * be dropped twice.
+	 *		Manfred Spraul <manfred@colorfullife.com>
+	 */
+	prev_state = prev->state;
+	finish_arch_switch(prev);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+	perf_event_task_sched_in(prev, current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+	finish_lock_switch(rq, prev);
+
+	fire_sched_in_preempt_notifiers(current);
+	if (mm)
+		mmdrop(mm);
+	if (unlikely(prev_state == TASK_DEAD)) {
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+		 */
+		kprobe_flush_task(prev);
+		put_task_struct(prev);
+	}
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage void schedule_tail(struct task_struct *prev)
+	__releases(grq.lock)
+{
+	struct rq *rq = this_rq();
+
+	finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+	/* In this case, finish_task_switch does not reenable preemption */
+	preempt_enable();
+#endif
+	if (current->set_child_tid)
+		put_user(current->pid, current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new
+ * thread's register state.
+ */
+static inline void
+context_switch(struct rq *rq, struct task_struct *prev,
+		   struct task_struct *next)
+{
+	struct mm_struct *mm, *oldmm;
+	
+	prepare_task_switch(rq, prev, next);
+
+	mm = next->mm;
+	oldmm = prev->active_mm;
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (!mm) {
+		next->active_mm = oldmm;
+		atomic_inc(&oldmm->mm_count);
+		enter_lazy_tlb(oldmm, next);
+	} else
+		switch_mm(oldmm, mm, next);
+
+	if (!prev->mm) {
+		prev->active_mm = NULL;
+		rq->prev_mm = oldmm;
+	}
+	/*
+	 * Since the runqueue lock will be released by the next
+	 * task (which is an invalid locking op but in the case
+	 * of the scheduler it's an obvious special-case), so we
+	 * do an early lockdep release here:
+	 */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
+#endif
+
+	/* Here we just switch the register state and the stack. */
+	switch_to(prev, next, prev);
+
+	barrier();
+	/*
+	 * this_rq must be evaluated again because prev may have moved
+	 * CPUs since it called schedule(), thus the 'rq' on its stack
+	 * frame will be invalid.
+	 */
+	finish_task_switch(this_rq(), prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, current number of uninterruptible-sleeping threads, total
+ * number of context switches performed since bootup. All are measured
+ * without grabbing the grq lock but the occasional inaccurate result
+ * doesn't matter so long as it's positive.
+ */
+unsigned long nr_running(void)
+{
+	long nr = grq.nr_running;
+
+	if (unlikely(nr < 0))
+		nr = 0;
+	return (unsigned long)nr;
+}
+
+unsigned long nr_uninterruptible(void)
+{
+	long nu = grq.nr_uninterruptible;
+
+	if (unlikely(nu < 0))
+		nu = 0;
+	return nu;
+}
+
+unsigned long long nr_context_switches(void)
+{
+	long long ns = grq.nr_switches;
+
+	/* This is of course impossible */
+	if (unlikely(ns < 0))
+		ns = 1;
+	return (unsigned long long)ns;
+}
+
+unsigned long nr_iowait(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+	return sum;
+}
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+	struct rq *this = cpu_rq(cpu);
+	return atomic_read(&this->nr_iowait);
+}
+
+unsigned long nr_active(void)
+{
+	return nr_running() + nr_uninterruptible();
+}
+
+/* Beyond a task running on this CPU, load is equal everywhere on BFS */
+unsigned long this_cpu_load(void)
+{
+	return this_rq()->rq_running +
+		((queued_notrunning() + nr_uninterruptible()) / grq.noc);
+}
+
+/* Variables and functions for calc_load */
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds.
+ */
+void calc_global_load(unsigned long ticks)
+{
+	long active;
+
+	if (time_before(jiffies, calc_load_update))
+		return;
+	active = nr_active() * FIXED_1;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update = jiffies + LOAD_FREQ;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+	__this_cpu_inc(irq_time_seq.sequence);
+	smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+	smp_wmb();
+	__this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	u64 irq_time;
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+		irq_time = per_cpu(cpu_softirq_time, cpu) +
+			   per_cpu(cpu_hardirq_time, cpu);
+	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+	return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+	struct rq *this_rq = cpu_rq(this_cpu);
+
+	return (this_rq->cpu_locality[that_cpu] < 2);
+}
+EXPORT_SYMBOL(cpus_share_cache);
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void account_system_vtime(struct task_struct *curr)
+{
+	unsigned long flags;
+	s64 delta;
+	int cpu;
+
+	if (!sched_clock_irqtime)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+	__this_cpu_add(irq_start_time, delta);
+
+	irq_time_write_begin();
+	/*
+	 * We do not account for softirq time from ksoftirqd here.
+	 * We want to continue accounting softirq time to ksoftirqd thread
+	 * in that case, so as not to confuse scheduler with a special task
+	 * that do not consume any time, but still wants to run.
+	 */
+	if (hardirq_count())
+		__this_cpu_add(cpu_hardirq_time, delta);
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+		__this_cpu_add(cpu_softirq_time, delta);
+
+	irq_time_write_end();
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+	/*
+	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
+	 * this case when a previous update_rq_clock() happened inside a
+	 * {soft,}irq region.
+	 *
+	 * When this happens, we stop ->clock_task and only update the
+	 * prev_irq_time stamp to account for the part that fit, so that a next
+	 * update will consume the rest. This ensures ->clock_task is
+	 * monotonic.
+	 *
+	 * It does however cause some slight miss-attribution of {soft,}irq
+	 * time, a more accurate solution would be to update the irq_time using
+	 * the current rq->clock timestamp, except that would require using
+	 * atomic ops.
+	 */
+	if (irq_delta > delta)
+		irq_delta = delta;
+
+	rq->prev_irq_time += irq_delta;
+	delta -= irq_delta;
+#endif
+	rq->clock_task += delta;
+}
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static void irqtime_account_hi_si(void)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	u64 latest_ns;
+
+	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
+	if (latest_ns > cpustat[CPUTIME_IRQ])
+		cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
+
+	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
+	if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
+		cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime	(0)
+
+static inline void irqtime_account_hi_si(void)
+{
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/*
+ * On each tick, see what percentage of that tick was attributed to each
+ * component and add the percentage to the _pc values. Once a _pc value has
+ * accumulated one tick's worth, account for that. This means the total
+ * percentage of load components will always be 128 (pseudo 100) per tick.
+ */
+static void pc_idle_time(struct rq *rq, unsigned long pc)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	if (atomic_read(&rq->nr_iowait) > 0) {
+		rq->iowait_pc += pc;
+		if (rq->iowait_pc >= 128) {
+			rq->iowait_pc %= 128;
+			cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy;
+		}
+	} else {
+		rq->idle_pc += pc;
+		if (rq->idle_pc >= 128) {
+			rq->idle_pc %= 128;
+			cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy;
+		}
+	}
+}
+
+static void
+pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
+		   unsigned long pc, unsigned long ns)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+
+	p->stime_pc += pc;
+	if (p->stime_pc >= 128) {
+		p->stime_pc %= 128;
+		p->stime += (__force u64)cputime_one_jiffy;
+		p->stimescaled += one_jiffy_scaled;
+		acct_update_integrals(p);
+	}
+	p->sched_time += ns;
+
+	if (hardirq_count() - hardirq_offset) {
+		rq->irq_pc += pc;
+		if (rq->irq_pc >= 128) {
+			rq->irq_pc %= 128;
+			cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
+		}
+	} else if (in_serving_softirq()) {
+		rq->softirq_pc += pc;
+		if (rq->softirq_pc >= 128) {
+			rq->softirq_pc %= 128;
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
+		}
+	} else {
+		rq->system_pc += pc;
+		if (rq->system_pc >= 128) {
+			rq->system_pc %= 128;
+			cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy;
+		}
+	}
+}
+
+static void pc_user_time(struct rq *rq, struct task_struct *p,
+			 unsigned long pc, unsigned long ns)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+
+	p->utime_pc += pc;
+	if (p->utime_pc >= 128) {
+		p->utime_pc %= 128;
+		p->utime += (__force u64)cputime_one_jiffy;
+		p->utimescaled += one_jiffy_scaled;
+		acct_update_integrals(p);
+	}
+	p->sched_time += ns;
+
+	if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 */
+		rq->softirq_pc += pc;
+		if (rq->softirq_pc >= 128) {
+			rq->softirq_pc %= 128;
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
+		}
+	}
+
+	if (TASK_NICE(p) > 0 || idleprio_task(p)) {
+		rq->nice_pc += pc;
+		if (rq->nice_pc >= 128) {
+			rq->nice_pc %= 128;
+			cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy;
+		}
+	} else {
+		rq->user_pc += pc;
+		if (rq->user_pc >= 128) {
+			rq->user_pc %= 128;
+			cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy;
+		}
+	}
+}
+
+/*
+ * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
+ * shifts instead of 100
+ */
+#define NS_TO_PC(NS)	(NS * 128 / JIFFY_NS)
+
+/*
+ * This is called on clock ticks and on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void
+update_cpu_clock(struct rq *rq, struct task_struct *p)
+{
+	long account_ns = rq->clock - rq->timekeep_clock;
+	struct task_struct *idle = rq->idle;
+	unsigned long account_pc;
+	int user_tick;
+
+	p->last_ran = rq->clock;
+
+	if (unlikely(account_ns < 0))
+		account_ns = 0;
+
+	account_pc = NS_TO_PC(account_ns);
+
+	/* Accurate tick timekeeping */
+	rq->account_pc += account_pc - 128;
+	if (rq->account_pc < 0) {
+		/*
+		 * Small errors in micro accounting may not make the
+		 * accounting add up to 128 each tick so we keep track
+		 * of the percentage and round it up when less than 128
+		 */
+		account_pc += -rq->account_pc;
+		rq->account_pc = 0;
+	}
+
+	user_tick = user_mode(get_irq_regs());
+
+	if (user_tick)
+		pc_user_time(rq, p, account_pc, account_ns);
+	else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
+		pc_system_time(rq, p, HARDIRQ_OFFSET,
+				   account_pc, account_ns);
+	else
+		pc_idle_time(rq, account_pc);
+
+	if (sched_clock_irqtime)
+		irqtime_account_hi_si();
+
+	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
+	if (rq->rq_policy != SCHED_FIFO && p != idle) {
+		s64 time_diff = rq->clock - rq->rq_last_ran;
+
+		niffy_diff(&time_diff, 1);
+		p->time_slice -= NS_TO_US(time_diff);
+	}
+	rq->rq_last_ran = rq->timekeep_clock = rq->clock;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been accounted in
+ * @p in case that task is currently running.
+ *
+ * Called with task_grq_lock() held.
+ */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+	u64 ns = 0;
+
+	if (p == rq->curr) {
+		ns = rq->clock_task - rq->rq_last_ran;
+		if (unlikely((s64)ns < 0))
+			ns = 0;
+	}
+
+	return ns;
+}
+
+unsigned long long task_delta_exec(struct task_struct *p)
+{
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns;
+
+	rq = task_grq_lock(p, &flags);
+	ns = do_task_delta_exec(p, rq);
+	task_grq_unlock(&flags);
+
+	return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns;
+
+	rq = task_grq_lock(p, &flags);
+	ns = p->sched_time + do_task_delta_exec(p, rq);
+	task_grq_unlock(&flags);
+
+	return ns;
+}
+
+/* Compatibility crap for removal */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+			   cputime_t cputime_scaled)
+{
+}
+
+void account_idle_time(cputime_t cputime)
+{
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+				   cputime_t cputime_scaled)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	/* Add guest time to process. */
+	p->utime += (__force u64)cputime;
+	p->utimescaled += (__force u64)cputime_scaled;
+	p->gtime += (__force u64)cputime;
+
+	/* Add guest time to cpustat. */
+	if (TASK_NICE(p) > 0) {
+		cpustat[CPUTIME_NICE] += (__force u64)cputime;
+		cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
+	} else {
+		cpustat[CPUTIME_USER] += (__force u64)cputime;
+		cpustat[CPUTIME_GUEST] += (__force u64)cputime;
+	}
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	/* Add system time to process. */
+	p->stime += (__force u64)cputime;
+	p->stimescaled += (__force u64)cputime_scaled;
+
+	/* Add system time to cpustat. */
+	*target_cputime64 += (__force u64)cputime;
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * This is for guest only now.
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+			 cputime_t cputime, cputime_t cputime_scaled)
+{
+
+	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
+		account_guest_time(p, cputime, cputime_scaled);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @steal: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	cpustat[CPUTIME_STEAL] += (__force u64)cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+static void account_idle_times(cputime_t cputime)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	struct rq *rq = this_rq();
+
+	if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
+	else
+		cpustat[CPUTIME_IDLE] += (__force u64)cputime;
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+	account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+	account_idle_times(jiffies_to_cputime(ticks));
+}
+#endif
+
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
+/* 当前队列时钟的控制 */
+static void task_running_tick(struct rq *rq)
+{
+	struct task_struct *p;
+	p = rq->curr;
+
+	/* SCHED_FIFO tasks never run out of timeslice. */
+	if (rq->rq_policy == SCHED_FIFO)
+		return;
+	if(p->tick_used <= MAX_TICK_USED)
+		p->tick_used++;
+	if (p->time_slice > RESCHED_US)
+		return;
+
+	/* time_slice expired. Grq locked */
+	grq_lock();
+	requeue_task(p);
+	set_tsk_need_resched(p);
+	grq_unlock();
+}
+
+void wake_up_idle_cpu(int cpu);
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled. The data modified is all
+ * local to struct rq so we don't need to grab grq lock.
+ */
+void scheduler_tick(void)
+{
+	int cpu __maybe_unused = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+
+	sched_clock_tick();
+	/* grq lock not grabbed, so only update rq clock */
+	update_rq_clock(rq);
+	update_cpu_clock(rq, rq->curr);
+	if (!rq_idle(rq))
+		task_running_tick(rq);
+	perf_event_task_tick();
+}
+
+notrace unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+void __kprobes add_preempt_count(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+		return;
+#endif
+	preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void __kprobes sub_preempt_count(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+		return;
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+			!(preempt_count() & PREEMPT_MASK)))
+		return;
+#endif
+
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+	preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+#endif
+
+#define prio_raisable(p) (	\
+	p->prio > ((p->static_prio - reverse(MAX_TICK_USED, p->tick_used)) + \
+			(p->sleep_cnt / 3)))
+
+/*
+ * Timeslices below RESCHED_US are considered as good as expired as there's no
+ * point rescheduling when there's so little time left. SCHED_BATCH tasks
+ * have been flagged be not latency sensitive and likely to be fully CPU
+ * bound so every time they're rescheduled they have their time_slice
+ * refilled.
+ */
+static inline bool check_timeslice_end(struct rq *rq, struct task_struct *p)
+{
+	u64 sleep_jiffy = jiffies_64;
+	bool preempt = false;
+	if (p->policy == SCHED_FIFO)
+		goto out;
+	if (p->time_slice < RESCHED_US || batch_task(p)) {
+		if(p->prio > MAX_RT_PRIO) {
+			p->prio++;
+			if(p->prio >= IDLE_PRIO) {
+				p->prio = p->static_prio + 5;
+				if(p->prio >= IDLE_PRIO)
+					p->prio = p->static_prio;
+			}
+		}
+		get_time_slice(p);
+	} else if(p->policy != SCHED_RR) {
+		if(p->state != TASK_RUNNING) {
+			if(p->prio > NORMAL_PRIO) {
+				if(p->sleep_cnt < MAX_SLEEP_COUNT) 
+					p->sleep_cnt++;
+				if(prio_raisable(p)) 
+					p->prio--;
+				if(sleep_jiffy != p->sleep_jiffy) 
+					p->sleep_cnt /= 2;
+				p->sleep_jiffy = sleep_jiffy;
+			}
+		} else {
+			preempt = true;
+		}
+	}
+	if((p->state != TASK_RUNNING) || (p->tick_used >= MAX_TICK_USED)) {
+		p->tick_used /= 2;
+	}
+out:
+	return preempt;
+}
+
+#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
+
+
+/*
+ * 最低位查找,查找最高优先级开始。
+ * Find the lowest bit set in the bitmap.We would find the highest priority first/
+ */
+static inline unsigned long
+get_prio_bit(unsigned long *addr, unsigned long offset)
+{
+	unsigned long *from = addr + (offset / BITS_PER_LONG);
+	unsigned long *limit = addr + PRIO_LIMIT / BITS_PER_LONG;
+	int i = offset % BITS_PER_LONG;
+	
+	if (offset >=  PRIO_LIMIT)
+		return PRIO_LIMIT;
+
+	for(;from != (limit);from++) {
+		for(;i < BITS_PER_LONG;i++, offset++) {
+			if(((*from >> i) & 0x1)) {
+				goto out;
+			}
+		}
+		
+		/*
+		 *  This can make sure to generate the best machine code.
+		 */
+		i = 0;
+	}
+out:
+	return offset;
+}
+
+/*
+ * The currently running task's information is all stored in rq local data
+ * which is only modified by the local CPU, thereby allowing the data to be
+ * changed without grabbing the grq lock.
+ */
+static inline void set_rq_task(struct rq *rq, struct task_struct *p)
+{
+	rq->rq_last_ran = p->last_ran = rq->clock;
+	rq->rq_policy = p->policy;
+	rq->rq_prio = p->prio;
+	if (p != rq->idle)
+		rq->rq_running = true;
+	else
+		rq->rq_running = false;
+}
+
+static void reset_rq_task(struct rq *rq, struct task_struct *p)
+{
+	rq->rq_policy = p->policy;
+	rq->rq_prio = p->prio;
+}
+
+static inline void operate_blk_needs_flush_plug(struct task_struct *p)
+{
+	grq_unlock_irq();
+	sched_preempt_enable_no_resched();
+	blk_schedule_flush_plug(p);
+}
+
+static inline void task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+{
+	/*
+	 * Don't stick tasks when a real time task is going to run as
+	 * they may literally get stuck.
+	 */
+	if (rt_task(next))
+		unstick_task(rq, prev);
+	set_rq_task(rq, next);
+	grq.nr_switches++;
+	prev->on_cpu = false;
+	next->on_cpu = true;
+	rq->curr = next;
+
+	/*
+	 * The context switch have flipped the stack from under us
+	 * and restored the local variables which were saved when
+	 * this task called schedule() in the past. prev == current
+	 * is still correct, but it can be moved to another cpu/rq.
+	 */
+	context_switch(rq, prev, next); /* unlocks the grq */
+}
+
+
+/*
+ * Move a task off the global queue and take it to a cpu for it will
+ * become the running task.
+ */
+static inline void take_task(int cpu, struct task_struct *p)
+{
+	set_task_cpu(p, cpu);
+	dequeue_task(p);
+	clear_sticky(p);
+	dec_qnr();
+}
+
+/*
+ * Put the descheduling task back to grq.
+ */
+static inline void put_prev_task(struct rq *rq, struct task_struct *p, bool deactivate, bool preempt)
+{
+	if(deactivate)
+		deactivate_task(p);
+	else {
+		inc_qnr();
+		if(preempt)
+			enqueue_task_head(p);
+		else
+			enqueue_task(p);
+	}
+}
+
+/*
+ * Task picking for next time to run.
+ */
+static inline struct
+task_struct *get_runnable_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+	struct task_struct *edt = NULL;
+	unsigned long idx = -1;
+
+	do {
+		struct list_head *queue;
+		struct task_struct *p;
+
+		idx = get_prio_bit(grq.prio_bitmap, ++idx);
+		if (idx >= PRIO_LIMIT)
+			return idle;
+		queue = grq.queue + idx;
+
+		list_for_each_entry(p, queue, run_list) {
+			/* Make sure cpu affinity is ok */
+			if (needs_other_cpu(p, cpu))
+				continue;
+			edt = p;
+			goto out_take;
+		}
+	} while (!edt);
+
+out_take:
+	take_task(cpu, edt);
+	return edt;
+}
+
+#define SCHED_RESCHED -1
+
+/*
+ * schedule() is the main scheduler function.
+ */
+static inline int check_sleep_on_wq(int cpu, struct task_struct *p)
+{
+	int deactivate;
+	deactivate = 0;
+	if (p->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+		if (unlikely(signal_pending_state(p->state, p))) {
+			p->state = TASK_RUNNING;
+		} else {
+			deactivate = 1;
+			/*
+			 * If a worker is going to sleep, notify and
+			 * ask workqueue whether it wants to wake up a
+			 * task to maintain concurrency.  If so, wake
+			 * up the task.
+			 */
+			if (p->flags & PF_WQ_WORKER) {
+				struct task_struct *to_wakeup;
+
+				to_wakeup = wq_worker_sleeping(p, cpu);
+				if (to_wakeup) {
+					/* This shouldn't happen, but does */
+					if (unlikely(to_wakeup == p))
+						deactivate = 0;
+					else
+						try_to_wake_up_local(to_wakeup);
+				}
+			}
+
+			/*
+			 * If we are going to sleep and we have plugged IO queued, make
+			 * sure to submit it to avoid deadlocks.
+			 */
+			if (unlikely(deactivate && blk_needs_flush_plug(p))) {
+				operate_blk_needs_flush_plug(p);
+				deactivate = SCHED_RESCHED;
+				goto out;
+			}
+		}
+	}
+out:
+	return deactivate;
+}
+
+static inline int do_schedule(void)
+{
+	struct task_struct *prev, *next, *idle;
+	struct rq *rq;
+	int cpu;
+	int deactivate;
+
+	cpu = smp_processor_id();
+	rq = cpu_rq(cpu);
+	rcu_note_context_switch(cpu);
+	prev = rq->curr;
+
+	grq_lock_irq();
+
+	if((deactivate = check_sleep_on_wq(cpu, prev)) == SCHED_RESCHED) {
+		goto out;
+	}
+
+	clear_tsk_need_resched(prev);
+
+	idle = rq->idle;
+	if (idle != prev) {
+		bool preempt = check_timeslice_end(rq, prev);
+
+		/* Task changed affinity off this CPU */
+		if (needs_other_cpu(prev, cpu))
+			resched_suitable_idle(prev);
+		else if (!deactivate) {
+			if (!queued_notrunning()) {
+				/*
+				* Rerun the prev task again.
+				*/
+				set_rq_task(rq, prev);
+				grq_unlock_irq();
+				goto out;
+			} else
+				swap_sticky(rq, cpu, prev);
+		}
+
+		put_prev_task(rq, prev, deactivate, preempt);
+	}
+
+	if (unlikely(!queued_notrunning())) {
+		/*
+		 * This CPU is now truly idle as opposed to when idle is
+		 * scheduled as a high priority task in its own right.
+		 */
+		next = idle;
+		set_cpuidle_map(cpu);
+	} else {
+		next = get_runnable_task(rq, cpu, idle);
+	}
+
+	if (likely(prev != next)) {
+		prev->nvcsw++;
+		grq.nr_switches++;
+
+		task_switch(rq, prev, next);
+		idle = rq->idle;
+	} else
+		grq_unlock_irq();
+
+out:
+	return deactivate;
+}
+
+asmlinkage void __sched schedule(void)
+{
+reschedule:
+		preempt_disable();
+		
+		if(do_schedule() == SCHED_RESCHED)
+			goto reschedule;
+		
+		sched_preempt_enable_no_resched();
+		if(unlikely(need_resched()))
+			goto reschedule;
+}
+EXPORT_SYMBOL(schedule);
+
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+	sched_preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+}
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+	if (lock->owner != owner)
+		return false;
+
+	/*
+	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
+	 * lock->owner still matches owner, if that fails, owner might
+	 * point to free()d memory, if it still matches, the rcu_read_lock()
+	 * ensures the memory stays valid.
+	 */
+	barrier();
+
+	return owner->on_cpu;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+	rcu_read_lock();
+	while (owner_running(lock, owner)) {
+		if (need_resched())
+			break;
+
+		arch_mutex_cpu_relax();
+	}
+	rcu_read_unlock();
+
+	/*
+	 * We break out the loop above on need_resched() and when the
+	 * owner changed, which is a sign for heavy contention. Return
+	 * success only when lock->owner is NULL.
+	 */
+	return lock->owner == NULL;
+}
+#endif
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage void __sched notrace preempt_schedule(void)
+{
+	struct thread_info *ti = current_thread_info();
+
+	/*
+	 * If there is a non-zero preempt_count or interrupts are disabled,
+	 * we do not want to preempt the current task. Just return..
+	 */
+	if (likely(ti->preempt_count || irqs_disabled()))
+		return;
+
+	do {
+		add_preempt_count_notrace(PREEMPT_ACTIVE);
+		schedule();
+		sub_preempt_count_notrace(PREEMPT_ACTIVE);
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+		barrier();
+	} while (need_resched());
+}
+EXPORT_SYMBOL(preempt_schedule);
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and return with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage void __sched preempt_schedule_irq(void)
+{
+	struct thread_info *ti = current_thread_info();
+
+	/* Catch callers which need to be fixed */
+	BUG_ON(ti->preempt_count || !irqs_disabled());
+
+	do {
+		add_preempt_count(PREEMPT_ACTIVE);
+		local_irq_enable();
+		schedule();
+		local_irq_disable();
+		sub_preempt_count(PREEMPT_ACTIVE);
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+		barrier();
+	} while (need_resched());
+}
+
+#endif /* CONFIG_PREEMPT */
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+			  void *key)
+{
+	return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+/*
+ * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key)
+{
+	struct list_head *tmp, *next;
+
+	list_for_each_safe(tmp, next, &q->task_list) {
+		wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+		unsigned int flags = curr->flags;
+
+		if (curr->func(curr, mode, wake_flags, key) &&
+				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+			break;
+	}
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, void *key)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, 0, key);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+{
+	__wake_up_common(q, mode, nr, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+	__wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronised'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, void *key)
+{
+	unsigned long flags;
+	int wake_flags = WF_SYNC;
+
+	if (unlikely(!q))
+		return;
+
+	if (unlikely(!nr_exclusive))
+		wake_flags = 0;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/**
+ * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronised'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	unsigned long flags;
+	int sync = 1;
+
+	if (unlikely(!q))
+		return;
+
+	if (unlikely(!nr_exclusive))
+		sync = 0;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, sync, NULL);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done++;
+	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done += UINT_MAX/2;
+	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x, long timeout, int state)
+{
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		__add_wait_queue_tail_exclusive(&x->wait, &wait);
+		do {
+			if (signal_pending_state(state, current)) {
+				timeout = -ERESTARTSYS;
+				break;
+			}
+			__set_current_state(state);
+			spin_unlock_irq(&x->wait.lock);
+			timeout = schedule_timeout(timeout);
+			spin_lock_irq(&x->wait.lock);
+		} while (!x->done && timeout);
+		__remove_wait_queue(&x->wait, &wait);
+		if (!x->done)
+			return timeout;
+	}
+	x->done--;
+	return timeout ?: 1;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	timeout = do_wait_for_common(x, timeout, state);
+	spin_unlock_irq(&x->wait.lock);
+	return timeout;
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+	if (t == -ERESTARTSYS)
+		return t;
+	return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+					  unsigned long timeout)
+{
+	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+	if (t == -ERESTARTSYS)
+		return t;
+	return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+					 unsigned long timeout)
+{
+	return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
+ *	try_wait_for_completion - try to decrement a completion without blocking
+ *	@x:	completion structure
+ *
+ *	Returns: 0 if a decrement cannot be done without blocking
+ *		 1 if a decrement succeeded.
+ *
+ *	If a completion is being used as a counting completion,
+ *	attempt to decrement the counter without blocking. This
+ *	enables us to avoid waiting if the resource the completion
+ *	is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+	unsigned long flags;
+	int ret = 1;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	if (!x->done)
+		ret = 0;
+	else
+		x->done--;
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ *	completion_done - Test to see if a completion has any waiters
+ *	@x:	completion structure
+ *
+ *	Returns: 0 if there are waiters (wait_for_completion() in progress)
+ *		 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+	unsigned long flags;
+	int ret = 1;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	if (!x->done)
+		ret = 0;
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(completion_done);
+
+static long __sched
+sleep_on_common(wait_queue_head_t *q, int state, long timeout)
+{
+	unsigned long flags;
+	wait_queue_t wait;
+
+	init_waitqueue_entry(&wait, current);
+
+	__set_current_state(state);
+
+	spin_lock_irqsave(&q->lock, flags);
+	__add_wait_queue(q, &wait);
+	spin_unlock(&q->lock);
+	timeout = schedule_timeout(timeout);
+	spin_lock_irq(&q->lock);
+	__remove_wait_queue(q, &wait);
+	spin_unlock_irqrestore(&q->lock, flags);
+
+	return timeout;
+}
+
+void __sched interruptible_sleep_on(wait_queue_head_t *q)
+{
+	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(interruptible_sleep_on);
+
+long __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+
+void __sched sleep_on(wait_queue_head_t *q)
+{
+	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(sleep_on);
+
+long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(sleep_on_timeout);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+	unsigned long flags;
+	int queued, oldprio;
+	struct rq *rq;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_grq_lock(p, &flags);
+
+	trace_sched_pi_setprio(p, prio);
+	oldprio = p->prio;
+	queued = task_queued(p);
+	p->prio = prio;
+	if (task_running(p) && prio > oldprio)
+		resched_task(p);
+	if (queued) {
+		try_preempt(p, rq);
+	}
+
+	task_grq_unlock(&flags);
+}
+
+#endif
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+	int queued, new_static, old_static;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+		return;
+	new_static = NICE_TO_PRIO(nice);
+	/*
+	 * We have to be careful, if called from sys_setpriority(),
+	 * the task might be in the middle of scheduling on another CPU.
+	 */
+	rq = time_task_grq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via sched_setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it wont have any effect on scheduling until the task is
+	 * not SCHED_NORMAL/SCHED_BATCH:
+	 */
+	if (has_rt_policy(p)) {
+		p->static_prio = new_static;
+		goto out_unlock;
+	}
+	queued = task_queued(p);
+
+	old_static = p->static_prio;
+	p->static_prio = new_static;
+	p->prio = p->static_prio;
+
+	if (queued) {
+		if (new_static < old_static)
+			try_preempt(p, rq);
+	} else if (task_running(p)) {
+		reset_rq_task(rq, p);
+		if (old_static < new_static)
+			resched_task(p);
+	}
+out_unlock:
+	task_grq_unlock(&flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+	/* convert nice value [19,-20] to rlimit style value [1,40] */
+	int nice_rlim = 20 - nice;
+
+	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+		capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+	long nice, retval;
+
+	/*
+	 * Setpriority might change our priority at the same moment.
+	 * We don't have to worry. Conceptually one call occurs first
+	 * and we have a single winner.
+	 */
+	if (increment < -40)
+		increment = -40;
+	if (increment > 40)
+		increment = 40;
+
+	nice = TASK_NICE(current) + increment;
+	if (nice < -20)
+		nice = -20;
+	if (nice > 19)
+		nice = 19;
+
+	if (increment < 0 && !can_nice(current, nice))
+		return -EPERM;
+
+	retval = security_task_setnice(current, nice);
+	if (retval)
+		return retval;
+
+	set_user_nice(current, nice);
+	return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks are offset by -100. Normal tasks are centered around 1.
+ */
+int task_prio(const struct task_struct *p)
+{
+	return p->prio - NORMAL_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(const struct task_struct *p)
+{
+	return TASK_NICE(p);
+}
+EXPORT_SYMBOL_GPL(task_nice);
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ */
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ */
+struct task_struct *idle_task(int cpu)
+{
+	return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ */
+static inline struct task_struct *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold grq lock. */
+static void
+__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio)
+{
+	int oldrtprio, oldprio;
+
+	p->policy = policy;
+	oldrtprio = p->rt_priority;
+	p->rt_priority = prio;
+	oldprio = p->prio;
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	if (task_running(p)) {
+		reset_rq_task(rq, p);
+		/* Resched only if we might now be preempted */
+		if (p->prio > oldprio || p->rt_priority > oldrtprio)
+			resched_task(p);
+	}
+}
+
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred;
+	bool match;
+
+	rcu_read_lock();
+	pcred = __task_cred(p);
+	if (cred->user->user_ns == pcred->user->user_ns)
+		match = (cred->euid == pcred->euid ||
+			 cred->euid == pcred->uid);
+	else
+		match = false;
+	rcu_read_unlock();
+	return match;
+}
+
+static int __sched_setscheduler(struct task_struct *p, int policy,
+				const struct sched_param *param, bool user)
+{
+	struct sched_param zero_param = { .sched_priority = 0 };
+	int queued, retval, oldpolicy = -1;
+	unsigned long flags, rlim_rtprio = 0;
+	int reset_on_fork;
+	struct rq *rq;
+
+	/* may grab non-irq protected spin_locks */
+	BUG_ON(in_interrupt());
+
+	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+		unsigned long lflags;
+
+		if (!lock_task_sighand(p, &lflags))
+			return -ESRCH;
+		rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+		unlock_task_sighand(p, &lflags);
+		if (rlim_rtprio)
+			goto recheck;
+		param = &zero_param;
+	}
+recheck:
+	/* double check policy once rq lock held */
+	if (policy < 0) {
+		reset_on_fork = p->sched_reset_on_fork;
+		policy = oldpolicy = p->policy;
+	} else {
+		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+		policy &= ~SCHED_RESET_ON_FORK;
+
+		if (!SCHED_RANGE(policy))
+			return -EINVAL;
+	}
+
+	/*
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+	 * SCHED_BATCH is 0.
+	 */
+	if (param->sched_priority < 0 ||
+		(p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
+		(!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
+		return -EINVAL;
+	if (is_rt_policy(policy) != (param->sched_priority != 0))
+		return -EINVAL;
+
+	/*
+	 * Allow unprivileged RT tasks to decrease priority:
+	 */
+	if (user && !capable(CAP_SYS_NICE)) {
+		if (is_rt_policy(policy)) {
+			unsigned long rlim_rtprio =
+					task_rlimit(p, RLIMIT_RTPRIO);
+
+			/* can't set/change the rt policy */
+			if (policy != p->policy && !rlim_rtprio)
+				return -EPERM;
+
+			/* can't increase priority */
+			if (param->sched_priority > p->rt_priority &&
+				param->sched_priority > rlim_rtprio)
+				return -EPERM;
+		} else {
+			switch (p->policy) {
+				case SCHED_BATCH:
+					if (policy == SCHED_BATCH)
+						goto out;
+					if (policy != SCHED_IDLEPRIO)
+						return -EPERM;
+					break;
+				case SCHED_IDLEPRIO:
+					if (policy == SCHED_IDLEPRIO)
+						goto out;
+					return -EPERM;
+				default:
+					break;
+			}
+		}
+
+		/* can't change other user's priorities */
+		if (!check_same_owner(p))
+			return -EPERM;
+
+		/* Normal users shall not reset the sched_reset_on_fork flag */
+		if (p->sched_reset_on_fork && !reset_on_fork)
+			return -EPERM;
+	}
+
+	if (user) {
+		retval = security_task_setscheduler(p);
+		if (retval)
+			return retval;
+	}
+
+	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	/*
+	 * To be able to change p->policy safely, the grunqueue lock must be
+	 * held.
+	 */
+	rq = __task_grq_lock(p);
+
+	/*
+	 * Changing the policy of the stop threads its a very bad idea
+	 */
+	if (p == rq->stop) {
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return -EINVAL;
+	}
+
+	/*
+	 * If not changing anything there's no need to proceed further:
+	 */
+	if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
+			param->sched_priority == p->rt_priority))) {
+
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return 0;
+	}
+
+	/* recheck policy now with rq lock held */
+	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+		policy = oldpolicy = -1;
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		goto recheck;
+	}
+	p->sched_reset_on_fork = reset_on_fork;
+
+	queued = task_queued(p);
+	__setscheduler(p, rq, policy, param->sched_priority);
+	if (queued) {
+		try_preempt(p, rq);
+	}
+	__task_grq_unlock();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	rt_mutex_adjust_pi(p);
+out:
+	return 0;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+			   const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true);
+}
+
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+				   const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false);
+}
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	struct sched_param lparam;
+	struct task_struct *p;
+	int retval;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setscheduler(p, policy, &lparam);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+					   struct sched_param __user *param)
+{
+	/* negative values for policy are not valid */
+	if (policy < 0)
+		return -EINVAL;
+
+	return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+	return do_sched_setscheduler(pid, -1, param);
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy;
+	}
+	rcu_read_unlock();
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+	struct sched_param lp;
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (!param || pid < 0)
+		goto out_nounlock;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	lp.sched_priority = p->rt_priority;
+	rcu_read_unlock();
+
+	/*
+	 * This one might sleep, we cannot do it with a spinlock held ...
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+	cpumask_var_t cpus_allowed, new_mask;
+	struct task_struct *p;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		rcu_read_unlock();
+		put_online_cpus();
+		return -ESRCH;
+	}
+
+	/* Prevent p going away */
+	get_task_struct(p);
+	rcu_read_unlock();
+
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_put_task;
+	}
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_free_cpus_allowed;
+	}
+	retval = -EPERM;
+	if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
+		goto out_unlock;
+
+	retval = security_task_setscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	cpumask_and(new_mask, in_mask, cpus_allowed);
+again:
+	retval = set_cpus_allowed_ptr(p, new_mask);
+
+	if (!retval) {
+		cpuset_cpus_allowed(p, cpus_allowed);
+		if (!cpumask_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			cpumask_copy(new_mask, cpus_allowed);
+			goto again;
+		}
+	}
+out_unlock:
+	free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+	free_cpumask_var(cpus_allowed);
+out_put_task:
+	put_task_struct(p);
+	put_online_cpus();
+	return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+				 cpumask_t *new_mask)
+{
+	if (len < sizeof(cpumask_t)) {
+		memset(new_mask, 0, sizeof(cpumask_t));
+	} else if (len > sizeof(cpumask_t)) {
+		len = sizeof(cpumask_t);
+	}
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	cpumask_var_t new_mask;
+	int retval;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+	if (retval == 0)
+		retval = sched_setaffinity(pid, new_mask);
+	free_cpumask_var(new_mask);
+	return retval;
+}
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+	struct task_struct *p;
+	unsigned long flags;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	grq_lock_irqsave(&flags);
+	cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask);
+	grq_unlock_irqrestore(&flags);
+
+out_unlock:
+	rcu_read_unlock();
+	put_online_cpus();
+
+	return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	int ret;
+	cpumask_var_t mask;
+
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(unsigned long)-1))
+		return -EINVAL;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = sched_getaffinity(pid, mask);
+	if (ret == 0) {
+		size_t retlen = min_t(size_t, len, cpumask_size());
+
+		if (copy_to_user(user_mask_ptr, mask, retlen))
+			ret = -EFAULT;
+		else
+			ret = retlen;
+	}
+	free_cpumask_var(mask);
+
+	return ret;
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. It does this by
+ * scheduling away the current task. 
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+	struct task_struct *p;
+
+	p = current;
+	grq_lock_irq();
+	requeue_task(p);
+
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt or enable interrupts:
+	 */
+	__release(grq.lock);
+	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
+	do_raw_spin_unlock(&grq.lock);
+	sched_preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+
+static inline bool should_resched(void)
+{
+	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
+static void __cond_resched(void)
+{
+	/* NOT a real fix but will make voluntary preempt work. 馬鹿な事 */
+	if (unlikely(system_state != SYSTEM_RUNNING))
+		return;
+
+	add_preempt_count(PREEMPT_ACTIVE);
+	schedule();
+	sub_preempt_count(PREEMPT_ACTIVE);
+}
+
+int __sched _cond_resched(void)
+{
+	if (should_resched()) {
+		__cond_resched();
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+	int resched = should_resched();
+	int ret = 0;
+
+	lockdep_assert_held(lock);
+
+	if (spin_needbreak(lock) || resched) {
+		spin_unlock(lock);
+		if (resched)
+			__cond_resched();
+		else
+			cpu_relax();
+		ret = 1;
+		spin_lock(lock);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
+
+int __sched __cond_resched_softirq(void)
+{
+	BUG_ON(!in_softirq());
+
+	if (should_resched()) {
+		local_bh_enable();
+		__cond_resched();
+		local_bh_disable();
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * This is a shortcut for kernel-space yielding - it marks the
+ * thread runnable and calls sys_sched_yield().
+ */
+void __sched yield(void)
+{
+	set_current_state(TASK_RUNNING);
+	sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+	unsigned long flags;
+	bool yielded = 0;
+	struct rq *rq;
+	struct task_struct *curr;
+
+	rq = this_rq();
+	grq_lock_irqsave(&flags);
+	if (task_running(p) || p->state)
+		goto out_unlock;
+	yielded = 1;
+	curr = rq->curr;
+	p->time_slice += curr->time_slice;
+	curr->time_slice = 0;
+	if (p->time_slice > timeslice())
+		p->time_slice = timeslice();
+	set_tsk_need_resched(curr);
+out_unlock:
+	grq_unlock_irqrestore(&flags);
+
+	if (yielded)
+		schedule();
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/*
+ * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+void __sched io_schedule(void)
+{
+	struct rq *rq = raw_rq();
+
+	delayacct_blkio_start();
+	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
+	current->in_iowait = 1;
+	schedule();
+	current->in_iowait = 0;
+	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
+}
+EXPORT_SYMBOL(io_schedule);
+
+long __sched io_schedule_timeout(long timeout)
+{
+	struct rq *rq = raw_rq();
+	long ret;
+
+	delayacct_blkio_start();
+	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
+	current->in_iowait = 1;
+	ret = schedule_timeout(timeout);
+	current->in_iowait = 0;
+	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the maximum rt_priority that can be used
+ * by a given scheduling class.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = MAX_USER_RT_PRIO-1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_IDLEPRIO:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the minimum rt_priority that can be used
+ * by a given scheduling class.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = 1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_IDLEPRIO:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
+{
+	struct task_struct *p;
+	unsigned int time_slice;
+	unsigned long flags;
+	int retval;
+	struct timespec t;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	grq_lock_irqsave(&flags);
+	time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(rr_interval);
+	grq_unlock_irqrestore(&flags);
+
+	rcu_read_unlock();
+	t = ns_to_timespec(time_slice);
+	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
+void sched_show_task(struct task_struct *p)
+{
+	unsigned long free = 0;
+	unsigned state;
+
+	state = p->state ? __ffs(p->state) + 1 : 0;
+	printk(KERN_INFO "%-15.15s %c", p->comm,
+		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if BITS_PER_LONG == 32
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT " running  ");
+	else
+		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+#else
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT "  running task	");
+	else
+		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	free = stack_not_used(p);
+#endif
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
+
+	show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+	struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+	printk(KERN_INFO
+		"  task				PC stack   pid father\n");
+#else
+	printk(KERN_INFO
+		"  task						PC stack   pid father\n");
+#endif
+	rcu_read_lock();
+	do_each_thread(g, p) {
+		/*
+		 * reset the NMI-timeout, listing all files on a slow
+		 * console might take a lot of time:
+		 */
+		touch_nmi_watchdog();
+		if (!state_filter || (p->state & state_filter))
+			sched_show_task(p);
+	} while_each_thread(g, p);
+
+	touch_all_softlockup_watchdogs();
+
+	rcu_read_unlock();
+	/*
+	 * Only show locks if all tasks are dumped:
+	 */
+	if (!state_filter)
+		debug_show_all_locks();
+}
+
+#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	cpumask_copy(tsk_cpus_allowed(p), new_mask);
+}
+#endif
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void init_idle(struct task_struct *idle, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	time_grq_lock(rq, &flags);
+	idle->last_ran = rq->clock;
+	idle->state = TASK_RUNNING;
+	/* Setting prio to illegal value shouldn't matter when never queued */
+	idle->prio = PRIO_LIMIT;
+	idle->policy = SCHED_IDLE;
+	set_rq_task(rq, idle);
+	do_set_cpus_allowed(idle, &cpumask_of_cpu(cpu));
+	/* Silence PROVE_RCU */
+	rcu_read_lock();
+	set_task_cpu(idle, cpu);
+	rcu_read_unlock();
+	rq->curr = rq->idle = idle;
+	idle->on_cpu = 1;
+	grq_unlock_irqrestore(&flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+	task_thread_info(idle)->preempt_count = 0;
+
+	ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ
+void select_nohz_load_balancer(int stop_tick)
+{
+}
+
+void set_cpu_sd_state_idle(void) {}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+#endif /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+
+static inline void resched_cpu(int cpu)
+{
+	unsigned long flags;
+
+	grq_lock_irqsave(&flags);
+	resched_task(cpu_curr(cpu));
+	grq_unlock_irqrestore(&flags);
+}
+
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu.  This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+	struct sched_domain *sd;
+
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (!idle_cpu(i))
+				cpu = i;
+			goto unlock;
+		}
+	}
+unlock:
+	rcu_read_unlock();
+	return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+	struct task_struct *idle;
+	struct rq *rq;
+
+	if (cpu == smp_processor_id())
+		return;
+
+	rq = cpu_rq(cpu);
+	idle = rq->idle;
+
+	/*
+	 * This is safe, as this function is called with the timer
+	 * wheel base lock of (cpu) held. When the CPU is on the way
+	 * to idle and has not yet set rq->curr to idle then it will
+	 * be serialised on the timer wheel base lock and take the new
+	 * timer into account automatically.
+	 */
+	if (unlikely(rq->curr != idle))
+		return;
+
+	/*
+	 * We can set TIF_RESCHED on the idle task of the other CPU
+	 * lockless. The worst case is that the other CPU runs the
+	 * idle task through an additional NOOP schedule()
+	 */
+	set_tsk_need_resched(idle);
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(idle))
+		smp_send_reschedule(cpu);
+}
+
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	bool running_wrong = false;
+	bool queued = false;
+	unsigned long flags;
+	struct rq *rq;
+	int ret = 0;
+
+	rq = task_grq_lock(p, &flags);
+
+	if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	queued = task_queued(p);
+
+	do_set_cpus_allowed(p, new_mask);
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	if (task_running(p)) {
+		/* Task is running on the wrong cpu now, reschedule it. */
+		if (rq == this_rq()) {
+			set_tsk_need_resched(p);
+			running_wrong = true;
+		} else
+			resched_task(p);
+	} else
+		set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask));
+
+out:
+	if (queued)
+		try_preempt(p, rq);
+	task_grq_unlock(&flags);
+
+	if (running_wrong)
+		_cond_resched();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* Run through task list and find tasks affined to just the dead cpu, then
+ * allocate a new affinity */
+static void break_sole_affinity(int src_cpu, struct task_struct *idle)
+{
+	struct task_struct *p, *t;
+
+	do_each_thread(t, p) {
+		if (p != idle && !online_cpus(p)) {
+			cpumask_copy(tsk_cpus_allowed(p), cpu_possible_mask);
+			/*
+			 * Don't tell them about moving exiting tasks or
+			 * kernel threads (both mm NULL), since they never
+			 * leave kernel.
+			 */
+			if (p->mm && printk_ratelimit()) {
+				printk(KERN_INFO "process %d (%s) no "
+					   "longer affine to cpu %d\n",
+					   task_pid_nr(p), p->comm, src_cpu);
+			}
+		}
+		clear_sticky(p);
+	} while_each_thread(t, p);
+}
+
+/*
+ * Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible.
+ * Used by CPU offline code.
+ */
+void sched_idle_next(struct rq *rq, int this_cpu, struct task_struct *idle)
+{
+	/* cpu has to be offline */
+	BUG_ON(cpu_online(this_cpu));
+
+	__setscheduler(idle, rq, SCHED_FIFO, STOP_PRIO);
+
+	activate_idle_task(idle);
+	set_tsk_need_resched(rq->curr);
+}
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+	struct mm_struct *mm = current->active_mm;
+
+	BUG_ON(cpu_online(smp_processor_id()));
+
+	if (mm != &init_mm)
+		switch_mm(mm, &init_mm, current);
+	mmdrop(mm);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+	struct sched_param stop_param = { .sched_priority = STOP_PRIO };
+	struct sched_param start_param = { .sched_priority = MAX_USER_RT_PRIO - 1 };
+	struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+	if (stop) {
+		/*
+		 * Make it appear like a SCHED_FIFO task, its something
+		 * userspace knows about and won't get confused about.
+		 *
+		 * Also, it will make PI more or less work without too
+		 * much confusion -- but then, stop work should not
+		 * rely on PI working anyway.
+		 */
+		sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
+	}
+
+	cpu_rq(cpu)->stop = stop;
+
+	if (old_stop) {
+		/*
+		 * Reset it back to a normal rt scheduling prio so that
+		 * it can die in pieces.
+		 */
+		sched_setscheduler_nocheck(old_stop, SCHED_FIFO, &start_param);
+	}
+}
+
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+	{
+		.procname	= "sched_domain",
+		.mode		= 0555,
+	},
+	{}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= sd_ctl_dir,
+	},
+	{}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+	struct ctl_table *entry =
+		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+	return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+	struct ctl_table *entry;
+
+	/*
+	 * In the intermediate directories, both the child directory and
+	 * procname are dynamically allocated and could fail but the mode
+	 * will always be set. In the lowest directory the names are
+	 * static strings and all have proc handlers.
+	 */
+	for (entry = *tablep; entry->mode; entry++) {
+		if (entry->child)
+			sd_free_ctl_entry(&entry->child);
+		if (entry->proc_handler == NULL)
+			kfree(entry->procname);
+	}
+
+	kfree(*tablep);
+	*tablep = NULL;
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+		const char *procname, void *data, int maxlen,
+		mode_t mode, proc_handler *proc_handler)
+{
+	entry->procname = procname;
+	entry->data = data;
+	entry->maxlen = maxlen;
+	entry->mode = mode;
+	entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+	struct ctl_table *table = sd_alloc_ctl_entry(13);
+
+	if (table == NULL)
+		return NULL;
+
+	set_table_entry(&table[0], "min_interval", &sd->min_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[1], "max_interval", &sd->max_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[9], "cache_nice_tries",
+		&sd->cache_nice_tries,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[10], "flags", &sd->flags,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[11], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[12] is terminator */
+
+	return table;
+}
+
+static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+	struct ctl_table *entry, *table;
+	struct sched_domain *sd;
+	int domain_num = 0, i;
+	char buf[32];
+
+	for_each_domain(cpu, sd)
+		domain_num++;
+	entry = table = sd_alloc_ctl_entry(domain_num + 1);
+	if (table == NULL)
+		return NULL;
+
+	i = 0;
+	for_each_domain(cpu, sd) {
+		snprintf(buf, 32, "domain%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_domain_table(sd);
+		entry++;
+		i++;
+	}
+	return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void register_sched_domain_sysctl(void)
+{
+	int i, cpu_num = num_possible_cpus();
+	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+	char buf[32];
+
+	WARN_ON(sd_ctl_dir[0].child);
+	sd_ctl_dir[0].child = entry;
+
+	if (entry == NULL)
+		return;
+
+	for_each_possible_cpu(i) {
+		snprintf(buf, 32, "cpu%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_cpu_table(i);
+		entry++;
+	}
+
+	WARN_ON(sd_sysctl_header);
+	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+static void unregister_sched_domain_sysctl(void)
+{
+	if (sd_sysctl_header)
+		unregister_sysctl_table(sd_sysctl_header);
+	sd_sysctl_header = NULL;
+	if (sd_ctl_dir[0].child)
+		sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#else
+static void register_sched_domain_sysctl(void)
+{
+}
+static void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
+static void set_rq_online(struct rq *rq)
+{
+	if (!rq->online) {
+		cpumask_set_cpu(cpu_of(rq), rq->rd->online);
+		rq->online = true;
+	}
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+	if (rq->online) {
+		cpumask_clear_cpu(cpu_of(rq), rq->rd->online);
+		rq->online = false;
+	}
+}
+
+/*
+ * migration_call - callback that gets triggered when a CPU is added.
+ */
+static int __cpuinit
+migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+	unsigned long flags;
+	struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	struct task_struct *idle = rq->idle;
+#endif
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+
+	case CPU_UP_PREPARE:
+		break;
+
+	case CPU_ONLINE:
+		/* Update our root-domain */
+		grq_lock_irqsave(&flags);
+		if (rq->rd) {
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+
+			set_rq_online(rq);
+		}
+		grq.noc = num_online_cpus();
+		grq_unlock_irqrestore(&flags);
+		break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		/* Idle task back to normal (off runqueue, low prio) */
+		grq_lock_irq();
+		put_prev_task(rq, idle, true, false);
+		idle->static_prio = MAX_PRIO;
+		__setscheduler(idle, rq, SCHED_NORMAL, 0);
+		idle->prio = PRIO_LIMIT;
+		set_rq_task(rq, idle);
+		grq_unlock_irq();
+		break;
+
+	case CPU_DYING:
+		/* Update our root-domain */
+		grq_lock_irqsave(&flags);
+		sched_idle_next(rq, cpu, idle);
+		if (rq->rd) {
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+			set_rq_offline(rq);
+		}
+		break_sole_affinity(cpu, idle);
+		grq.noc = num_online_cpus();
+		grq_unlock_irqrestore(&flags);
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
+ */
+static struct notifier_block __cpuinitdata migration_notifier = {
+	.notifier_call = migration_call,
+	.priority = CPU_PRI_MIGRATION,
+};
+
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+					  unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		set_cpu_active((long)hcpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+int __init migration_init(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	int err;
+
+	/* Initialise migration for the boot CPU */
+	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	BUG_ON(err == NOTIFY_BAD);
+	migration_call(&migration_notifier, CPU_ONLINE, cpu);
+	register_cpu_notifier(&migration_notifier);
+
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
+	return 0;
+}
+early_initcall(migration_init);
+#endif
+
+#ifdef CONFIG_SMP
+
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_domain_debug_enabled;
+
+static int __init sched_domain_debug_setup(char *str)
+{
+	sched_domain_debug_enabled = 1;
+
+	return 0;
+}
+early_param("sched_debug", sched_domain_debug_setup);
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+				  struct cpumask *groupmask)
+{
+	struct sched_group *group = sd->groups;
+	char str[256];
+
+	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+	cpumask_clear(groupmask);
+
+	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+	if (!(sd->flags & SD_LOAD_BALANCE)) {
+		printk("does not load-balance\n");
+		if (sd->parent)
+			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+					" has parent");
+		return -1;
+	}
+
+	printk(KERN_CONT "span %s level %s\n", str, sd->name);
+
+	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+		printk(KERN_ERR "ERROR: domain->span does not contain "
+				"CPU%d\n", cpu);
+	}
+	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+		printk(KERN_ERR "ERROR: domain->groups does not contain"
+				" CPU%d\n", cpu);
+	}
+
+	printk(KERN_DEBUG "%*s groups:", level + 1, "");
+	do {
+		if (!group) {
+			printk("\n");
+			printk(KERN_ERR "ERROR: group is NULL\n");
+			break;
+		}
+
+		if (!group->sgp->power) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: domain->cpu_power not "
+					"set\n");
+			break;
+		}
+
+		if (!cpumask_weight(sched_group_cpus(group))) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: empty group\n");
+			break;
+		}
+
+		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: repeated CPUs\n");
+			break;
+		}
+
+		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+
+		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
+
+		printk(KERN_CONT " %s", str);
+		if (group->sgp->power != SCHED_POWER_SCALE) {
+			printk(KERN_CONT " (cpu_power = %d)",
+				group->sgp->power);
+		}
+
+		group = group->next;
+	} while (group != sd->groups);
+	printk(KERN_CONT "\n");
+
+	if (!cpumask_equal(sched_domain_span(sd), groupmask))
+		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+	if (sd->parent &&
+		!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+		printk(KERN_ERR "ERROR: parent span is not a superset "
+			"of domain->span\n");
+	return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+	int level = 0;
+
+	if (!sched_domain_debug_enabled)
+		return;
+
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+
+	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+	for (;;) {
+		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+			break;
+		level++;
+		sd = sd->parent;
+		if (!sd)
+			break;
+	}
+}
+#else /* !CONFIG_SCHED_DEBUG */
+# define sched_domain_debug(sd, cpu) do { } while (0)
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+	if (cpumask_weight(sched_domain_span(sd)) == 1)
+		return 1;
+
+	/* Following flags need at least 2 groups */
+	if (sd->flags & (SD_LOAD_BALANCE |
+			 SD_BALANCE_NEWIDLE |
+			 SD_BALANCE_FORK |
+			 SD_BALANCE_EXEC |
+			 SD_SHARE_CPUPOWER |
+			 SD_SHARE_PKG_RESOURCES)) {
+		if (sd->groups != sd->groups->next)
+			return 0;
+	}
+
+	/* Following flags don't use groups */
+	if (sd->flags & (SD_WAKE_AFFINE))
+		return 0;
+
+	return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+	unsigned long cflags = sd->flags, pflags = parent->flags;
+
+	if (sd_degenerate(parent))
+		return 1;
+
+	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+		return 0;
+
+	/* Flags needing groups don't count if only 1 group in parent */
+	if (parent->groups == parent->groups->next) {
+		pflags &= ~(SD_LOAD_BALANCE |
+				SD_BALANCE_NEWIDLE |
+				SD_BALANCE_FORK |
+				SD_BALANCE_EXEC |
+				SD_SHARE_CPUPOWER |
+				SD_SHARE_PKG_RESOURCES);
+		if (nr_node_ids == 1)
+			pflags &= ~SD_SERIALIZE;
+	}
+	if (~cflags & pflags)
+		return 0;
+
+	return 1;
+}
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+	cpupri_cleanup(&rd->cpupri);
+	free_cpumask_var(rd->rto_mask);
+	free_cpumask_var(rd->online);
+	free_cpumask_var(rd->span);
+	kfree(rd);
+}
+
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+	struct root_domain *old_rd = NULL;
+	unsigned long flags;
+
+	grq_lock_irqsave(&flags);
+
+	if (rq->rd) {
+		old_rd = rq->rd;
+
+		if (cpumask_test_cpu(rq->cpu, old_rd->online))
+			set_rq_offline(rq);
+
+		cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+		/*
+		 * If we dont want to free the old_rt yet then
+		 * set old_rd to NULL to skip the freeing later
+		 * in this function:
+		 */
+		if (!atomic_dec_and_test(&old_rd->refcount))
+			old_rd = NULL;
+	}
+
+	atomic_inc(&rd->refcount);
+	rq->rd = rd;
+
+	cpumask_set_cpu(rq->cpu, rd->span);
+	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+		set_rq_online(rq);
+
+	grq_unlock_irqrestore(&flags);
+
+	if (old_rd)
+		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+	memset(rd, 0, sizeof(*rd));
+
+	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+		goto out;
+	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+		goto free_span;
+	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+		goto free_online;
+
+	if (cpupri_init(&rd->cpupri) != 0)
+		goto free_rto_mask;
+	return 0;
+
+free_rto_mask:
+	free_cpumask_var(rd->rto_mask);
+free_online:
+	free_cpumask_var(rd->online);
+free_span:
+	free_cpumask_var(rd->span);
+out:
+	return -ENOMEM;
+}
+
+static void init_defrootdomain(void)
+{
+	init_rootdomain(&def_root_domain);
+
+	atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+	struct root_domain *rd;
+
+	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return NULL;
+
+	if (init_rootdomain(rd) != 0) {
+		kfree(rd);
+		return NULL;
+	}
+
+	return rd;
+}
+
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+	struct sched_group *tmp, *first;
+
+	if (!sg)
+		return;
+
+	first = sg;
+	do {
+		tmp = sg->next;
+
+		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+			kfree(sg->sgp);
+
+		kfree(sg);
+		sg = tmp;
+	} while (sg != first);
+}
+
+static void free_sched_domain(struct rcu_head *rcu)
+{
+	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+	/*
+	 * If its an overlapping domain it has private groups, iterate and
+	 * nuke them all.
+	 */
+	if (sd->flags & SD_OVERLAP) {
+		free_sched_groups(sd->groups, 1);
+	} else if (atomic_dec_and_test(&sd->groups->ref)) {
+		kfree(sd->groups->sgp);
+		kfree(sd->groups);
+	}
+	kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+	call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+	for (; sd; sd = sd->parent)
+		destroy_sched_domain(sd, cpu);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct sched_domain *tmp;
+
+	/* Remove the sched domains which do not contribute to scheduling. */
+	for (tmp = sd; tmp; ) {
+		struct sched_domain *parent = tmp->parent;
+		if (!parent)
+			break;
+
+		if (sd_parent_degenerate(tmp, parent)) {
+			tmp->parent = parent->parent;
+			if (parent->parent)
+				parent->parent->child = tmp;
+			destroy_sched_domain(parent, cpu);
+		} else
+			tmp = tmp->parent;
+	}
+
+	if (sd && sd_degenerate(sd)) {
+		tmp = sd;
+		sd = sd->parent;
+		destroy_sched_domain(tmp, cpu);
+		if (sd)
+			sd->child = NULL;
+	}
+
+	sched_domain_debug(sd, cpu);
+
+	rq_attach_root(rq, rd);
+	tmp = rq->sd;
+	rcu_assign_pointer(rq->sd, sd);
+	destroy_sched_domains(tmp, cpu);
+}
+
+/* cpus with isolated domains */
+static cpumask_var_t cpu_isolated_map;
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	cpulist_parse(str, cpu_isolated_map);
+	return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
+
+#define SD_NODES_PER_DOMAIN 16
+
+#ifdef CONFIG_NUMA
+
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, nodemask_t *used_nodes)
+{
+	int i, n, val, min_val, best_node = -1;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		/* Start at @node */
+		n = (node + i) % nr_node_ids;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (node_isset(n, *used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	if (best_node != -1)
+		node_set(best_node, *used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @span: resulting cpumask
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static void sched_domain_node_span(int node, struct cpumask *span)
+{
+	nodemask_t used_nodes;
+	int i;
+
+	cpumask_clear(span);
+	nodes_clear(used_nodes);
+
+	cpumask_or(span, span, cpumask_of_node(node));
+	node_set(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, &used_nodes);
+		if (next_node < 0)
+			break;
+		cpumask_or(span, span, cpumask_of_node(next_node));
+	}
+}
+
+static const struct cpumask *cpu_node_mask(int cpu)
+{
+	lockdep_assert_held(&sched_domains_mutex);
+
+	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+
+	return sched_domains_tmpmask;
+}
+
+static const struct cpumask *cpu_allnodes_mask(int cpu)
+{
+	return cpu_possible_mask;
+}
+#endif /* CONFIG_NUMA */
+
+static const struct cpumask *cpu_cpu_mask(int cpu)
+{
+	return cpumask_of_node(cpu_to_node(cpu));
+}
+
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+
+struct sd_data {
+	struct sched_domain **__percpu sd;
+	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
+};
+
+struct s_data {
+	struct sched_domain ** __percpu sd;
+	struct root_domain	*rd;
+};
+
+enum s_alloc {
+	sa_rootdomain,
+	sa_sd,
+	sa_sd_storage,
+	sa_none,
+};
+
+struct sched_domain_topology_level;
+
+typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+
+#define SDTL_OVERLAP	0x01
+
+struct sched_domain_topology_level {
+	sched_domain_init_f init;
+	sched_domain_mask_f mask;
+	int			flags;
+	struct sd_data	  data;
+};
+
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered = sched_domains_tmpmask;
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *child;
+	int i;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct cpumask *sg_span;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				GFP_KERNEL, cpu_to_node(i));
+
+		if (!sg)
+			goto fail;
+
+		sg_span = sched_group_cpus(sg);
+
+		child = *per_cpu_ptr(sdd->sd, i);
+		if (child->child) {
+			child = child->child;
+			cpumask_copy(sg_span, sched_domain_span(child));
+		} else
+			cpumask_set_cpu(i, sg_span);
+
+		cpumask_or(covered, covered, sg_span);
+
+		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		atomic_inc(&sg->sgp->ref);
+
+		if (cpumask_test_cpu(cpu, sg_span))
+			groups = sg;
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+		last->next = first;
+	}
+	sd->groups = groups;
+
+	return 0;
+
+fail:
+	free_sched_groups(first, 0);
+
+	return -ENOMEM;
+}
+
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+{
+	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+	struct sched_domain *child = sd->child;
+
+	if (child)
+		cpu = cpumask_first(sched_domain_span(child));
+
+	if (sg) {
+		*sg = *per_cpu_ptr(sdd->sg, cpu);
+		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+	}
+
+	return cpu;
+}
+
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL;
+	struct sd_data *sdd = sd->private;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered;
+	int i;
+
+	get_group(cpu, sdd, &sd->groups);
+	atomic_inc(&sd->groups->ref);
+
+	if (cpu != cpumask_first(sched_domain_span(sd)))
+		return 0;
+
+	lockdep_assert_held(&sched_domains_mutex);
+	covered = sched_domains_tmpmask;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct sched_group *sg;
+		int group = get_group(i, sdd, &sg);
+		int j;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		cpumask_clear(sched_group_cpus(sg));
+		sg->sgp->power = 0;
+
+		for_each_cpu(j, span) {
+			if (get_group(j, sdd, NULL) != group)
+				continue;
+
+			cpumask_set_cpu(j, covered);
+			cpumask_set_cpu(j, sched_group_cpus(sg));
+		}
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+	}
+	last->next = first;
+
+	return 0;
+}
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)		sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)		do { } while (0)
+#endif
+
+#define SD_INIT_FUNC(type)						\
+static noinline struct sched_domain *					\
+sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
+{									\
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
+	*sd = SD_##type##_INIT;						\
+	SD_INIT_NAME(sd, type);						\
+	sd->private = &tl->data;					\
+	return sd;							\
+}
+
+SD_INIT_FUNC(CPU)
+#ifdef CONFIG_NUMA
+ SD_INIT_FUNC(ALLNODES)
+ SD_INIT_FUNC(NODE)
+#endif
+#ifdef CONFIG_SCHED_SMT
+ SD_INIT_FUNC(SIBLING)
+#endif
+#ifdef CONFIG_SCHED_MC
+ SD_INIT_FUNC(MC)
+#endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+	unsigned long val;
+
+	val = simple_strtoul(str, NULL, 0);
+	if (val < sched_domain_level_max)
+		default_relax_domain_level = val;
+
+	return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+				 struct sched_domain_attr *attr)
+{
+	int request;
+
+	if (!attr || attr->relax_domain_level < 0) {
+		if (default_relax_domain_level < 0)
+			return;
+		else
+			request = default_relax_domain_level;
+	} else
+		request = attr->relax_domain_level;
+	if (request < sd->level) {
+		/* turn off idle balance on this domain */
+		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+	} else {
+		/* turn on idle balance on this domain */
+		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+	}
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+				 const struct cpumask *cpu_map)
+{
+	switch (what) {
+	case sa_rootdomain:
+		if (!atomic_read(&d->rd->refcount))
+			free_rootdomain(&d->rd->rcu); /* fall through */
+	case sa_sd:
+		free_percpu(d->sd); /* fall through */
+	case sa_sd_storage:
+		__sdt_free(cpu_map); /* fall through */
+	case sa_none:
+		break;
+	}
+}
+
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+						   const struct cpumask *cpu_map)
+{
+	memset(d, 0, sizeof(*d));
+
+	if (__sdt_alloc(cpu_map))
+		return sa_sd_storage;
+	d->sd = alloc_percpu(struct sched_domain *);
+	if (!d->sd)
+		return sa_sd_storage;
+	d->rd = alloc_rootdomain();
+	if (!d->rd)
+		return sa_sd;
+	return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+	struct sd_data *sdd = sd->private;
+
+	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+	*per_cpu_ptr(sdd->sd, cpu) = NULL;
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *cpu_smt_mask(int cpu)
+{
+	return topology_thread_cpumask(cpu);
+}
+#endif
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ sd_init_SIBLING, cpu_smt_mask, },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ sd_init_MC, cpu_coregroup_mask, },
+#endif
+#ifdef CONFIG_SCHED_BOOK
+	{ sd_init_BOOK, cpu_book_mask, },
+#endif
+	{ sd_init_CPU, cpu_cpu_mask, },
+#ifdef CONFIG_NUMA
+	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
+	{ sd_init_ALLNODES, cpu_allnodes_mask, },
+#endif
+	{ NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	int j;
+
+	for (tl = sched_domain_topology; tl->init; tl++) {
+		struct sd_data *sdd = &tl->data;
+
+		sdd->sd = alloc_percpu(struct sched_domain *);
+		if (!sdd->sd)
+			return -ENOMEM;
+
+		sdd->sg = alloc_percpu(struct sched_group *);
+		if (!sdd->sg)
+			return -ENOMEM;
+
+		sdd->sgp = alloc_percpu(struct sched_group_power *);
+		if (!sdd->sgp)
+			return -ENOMEM;
+
+		for_each_cpu(j, cpu_map) {
+			struct sched_domain *sd;
+			struct sched_group *sg;
+			struct sched_group_power *sgp;
+
+			   	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sd)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sd, j) = sd;
+
+			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sg)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sg, j) = sg;
+
+			sgp = kzalloc_node(sizeof(struct sched_group_power),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sgp)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sgp, j) = sgp;
+		}
+	}
+
+	return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	int j;
+
+	for (tl = sched_domain_topology; tl->init; tl++) {
+		struct sd_data *sdd = &tl->data;
+
+		for_each_cpu(j, cpu_map) {
+			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd && (sd->flags & SD_OVERLAP))
+				free_sched_groups(sd->groups, 0);
+			kfree(*per_cpu_ptr(sdd->sd, j));
+			kfree(*per_cpu_ptr(sdd->sg, j));
+			kfree(*per_cpu_ptr(sdd->sgp, j));
+		}
+		free_percpu(sdd->sd);
+		free_percpu(sdd->sg);
+		free_percpu(sdd->sgp);
+	}
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+		struct s_data *d, const struct cpumask *cpu_map,
+		struct sched_domain_attr *attr, struct sched_domain *child,
+		int cpu)
+{
+	struct sched_domain *sd = tl->init(tl, cpu);
+	if (!sd)
+		return child;
+
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+	if (child) {
+		sd->level = child->level + 1;
+		sched_domain_level_max = max(sched_domain_level_max, sd->level);
+		child->parent = sd;
+	}
+	sd->child = child;
+
+	return sd;
+}
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int build_sched_domains(const struct cpumask *cpu_map,
+				   struct sched_domain_attr *attr)
+{
+	enum s_alloc alloc_state = sa_none;
+	struct sched_domain *sd;
+	struct s_data d;
+	int i, ret = -ENOMEM;
+
+	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+	if (alloc_state != sa_rootdomain)
+		goto error;
+
+	/* Set up domains for cpus specified by the cpu_map. */
+	for_each_cpu(i, cpu_map) {
+		struct sched_domain_topology_level *tl;
+
+		sd = NULL;
+		for (tl = sched_domain_topology; tl->init; tl++) {
+			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+			if (tl->flags & SDTL_OVERLAP)
+				sd->flags |= SD_OVERLAP;
+			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+				break;
+		}
+
+		while (sd->child)
+			sd = sd->child;
+
+		*per_cpu_ptr(d.sd, i) = sd;
+	}
+
+	/* Build the groups for the domains */
+	for_each_cpu(i, cpu_map) {
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			sd->span_weight = cpumask_weight(sched_domain_span(sd));
+			if (sd->flags & SD_OVERLAP) {
+				if (build_overlap_sched_groups(sd, i))
+					goto error;
+			} else {
+				if (build_sched_groups(sd, i))
+					goto error;
+			}
+		}
+	}
+
+	/* Calculate CPU power for physical packages and nodes */
+	for (i = nr_cpumask_bits-1; i >= 0; i--) {
+		if (!cpumask_test_cpu(i, cpu_map))
+			continue;
+
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			claim_allocations(i, sd);
+		}
+	}
+
+	/* Attach the domains */
+	rcu_read_lock();
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d.sd, i);
+		cpu_attach_domain(sd, d.rd, i);
+	}
+	rcu_read_unlock();
+
+	ret = 0;
+error:
+	__free_domain_allocs(&d, alloc_state, cpu_map);
+	return ret;
+}
+
+static cpumask_var_t *doms_cur;	/* current sched domains */
+static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+				/* attribues of custom domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
+{
+	return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int init_sched_domains(const struct cpumask *cpu_map)
+{
+	int err;
+
+	arch_update_cpu_topology();
+	ndoms_cur = 1;
+	doms_cur = alloc_sched_domains(ndoms_cur);
+	if (!doms_cur)
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+	dattr_cur = NULL;
+	err = build_sched_domains(doms_cur[0], NULL);
+	register_sched_domain_sysctl();
+
+	return err;
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+	int i;
+
+	rcu_read_lock();
+	for_each_cpu(i, cpu_map)
+		cpu_attach_domain(NULL, &def_root_domain, i);
+	rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+			struct sched_domain_attr *new, int idx_new)
+{
+	struct sched_domain_attr tmp;
+
+	/* fast path */
+	if (!new && !cur)
+		return 1;
+
+	tmp = SD_ATTR_INIT;
+	return !memcmp(cur ? (cur + idx_cur) : &tmp,
+			new ? (new + idx_new) : &tmp,
+			sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+				 struct sched_domain_attr *dattr_new)
+{
+	int i, j, n;
+	int new_topology;
+
+	mutex_lock(&sched_domains_mutex);
+
+	/* always unregister in case we don't destroy any domains */
+	unregister_sched_domain_sysctl();
+
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
+	n = doms_new ? ndoms_new : 0;
+
+	/* Destroy deleted domains */
+	for (i = 0; i < ndoms_cur; i++) {
+		for (j = 0; j < n && !new_topology; j++) {
+			if (cpumask_equal(doms_cur[i], doms_new[j])
+				&& dattrs_equal(dattr_cur, i, dattr_new, j))
+				goto match1;
+		}
+		/* no match - a current sched domain not in new doms_new[] */
+		detach_destroy_domains(doms_cur[i]);
+match1:
+		;
+	}
+
+	if (doms_new == NULL) {
+		ndoms_cur = 0;
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+		WARN_ON_ONCE(dattr_new);
+	}
+
+	/* Build new domains */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < ndoms_cur && !new_topology; j++) {
+			if (cpumask_equal(doms_new[i], doms_cur[j])
+				&& dattrs_equal(dattr_new, i, dattr_cur, j))
+				goto match2;
+		}
+		/* no match - add a new doms_new */
+		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+		;
+	}
+
+	/* Remember the new sched domains */
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
+	kfree(dattr_cur);	/* kfree(NULL) is safe */
+	doms_cur = doms_new;
+	dattr_cur = dattr_new;
+	ndoms_cur = ndoms_new;
+
+	register_sched_domain_sysctl();
+
+	mutex_unlock(&sched_domains_mutex);
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+static void reinit_sched_domains(void)
+{
+	get_online_cpus();
+
+	/* Destroy domains first to force the rebuild */
+	partition_sched_domains(0, NULL, NULL);
+
+	rebuild_sched_domains();
+	put_online_cpus();
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+	unsigned int level = 0;
+
+	if (sscanf(buf, "%u", &level) != 1)
+		return -EINVAL;
+
+	/*
+	 * level is always be positive so don't check for
+	 * level < POWERSAVINGS_BALANCE_NONE which is 0
+	 * What happens on 0 or 1 byte write,
+	 * need to check for count as well?
+	 */
+
+	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
+		return -EINVAL;
+
+	if (smt)
+		sched_smt_power_savings = level;
+	else
+		sched_mc_power_savings = level;
+
+	reinit_sched_domains();
+
+	return count;
+}
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	return sprintf(buf, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct device *dev,
+						struct device_attribute *attr,
+						const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 0);
+}
+static DEVICE_ATTR(sched_mc_power_savings, 0644,
+		   sched_mc_power_savings_show,
+		   sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	return sprintf(buf, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct device *dev,
+						struct device_attribute *attr,
+						 const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 1);
+}
+static DEVICE_ATTR(sched_smt_power_savings, 0644,
+		   sched_smt_power_savings_show,
+		   sched_smt_power_savings_store);
+#endif
+
+int __init sched_create_sysfs_power_savings_entries(struct device *dev)
+{
+	int err = 0;
+
+#ifdef CONFIG_SCHED_SMT
+	if (smt_capable())
+		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
+#endif
+#ifdef CONFIG_SCHED_MC
+	if (!err && mc_capable())
+		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
+#endif
+	return err;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+/*
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ */
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+				 void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_STARTING:
+	case CPU_DOWN_FAILED:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+				   void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+/*
+ * Cheaper version of the below functions in case support for SMT and MC is
+ * compiled in but CPUs have no siblings.
+ */
+static bool sole_cpu_idle(int cpu)
+{
+	return rq_idle(cpu_rq(cpu));
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+/* All this CPU's SMT siblings are idle */
+static bool siblings_cpu_idle(int cpu)
+{
+	return cpumask_subset(&(cpu_rq(cpu)->smt_siblings),
+				  &grq.cpu_idle_map);
+}
+#endif
+#ifdef CONFIG_SCHED_MC
+/* All this CPU's shared cache siblings are idle */
+static bool cache_cpu_idle(int cpu)
+{
+	return cpumask_subset(&(cpu_rq(cpu)->cache_siblings),
+				  &grq.cpu_idle_map);
+}
+#endif
+
+enum sched_domain_level {
+	SD_LV_NONE = 0,
+	SD_LV_SIBLING,
+	SD_LV_MC,
+	SD_LV_BOOK,
+	SD_LV_CPU,
+	SD_LV_NODE,
+	SD_LV_ALLNODES,
+	SD_LV_MAX
+};
+
+void __init sched_init_smp(void)
+{
+	struct sched_domain *sd;
+	int cpu;
+
+	cpumask_var_t non_isolated_cpus;
+
+	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+	get_online_cpus();
+	mutex_lock(&sched_domains_mutex);
+	init_sched_domains(cpu_active_mask);
+	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+	if (cpumask_empty(non_isolated_cpus))
+		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
+	mutex_unlock(&sched_domains_mutex);
+	put_online_cpus();
+
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+
+	/* Move init over to a non-isolated CPU */
+	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+		BUG();
+	free_cpumask_var(non_isolated_cpus);
+
+	grq_lock_irq();
+	/*
+	 * Set up the relative cache distance of each online cpu from each
+	 * other in a simple array for quick lookup. Locality is determined
+	 * by the closest sched_domain that CPUs are separated by. CPUs with
+	 * shared cache in SMT and MC are treated as local. Separate CPUs
+	 * (within the same package or physically) within the same node are
+	 * treated as not local. CPUs not even in the same domain (different
+	 * nodes) are treated as very distant.
+	 */
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		for_each_domain(cpu, sd) {
+			int locality, other_cpu;
+
+#ifdef CONFIG_SCHED_SMT
+			if (sd->level == SD_LV_SIBLING) {
+				for_each_cpu_mask(other_cpu, *sched_domain_span(sd))
+					cpumask_set_cpu(other_cpu, &rq->smt_siblings);
+			}
+#endif
+#ifdef CONFIG_SCHED_MC
+			if (sd->level == SD_LV_MC) {
+				for_each_cpu_mask(other_cpu, *sched_domain_span(sd))
+					cpumask_set_cpu(other_cpu, &rq->cache_siblings);
+			}
+#endif
+			if (sd->level <= SD_LV_SIBLING)
+				locality = 1;
+			else if (sd->level <= SD_LV_MC)
+				locality = 2;
+			else if (sd->level <= SD_LV_NODE)
+				locality = 3;
+			else
+				continue;
+
+			for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) {
+				if (locality < rq->cpu_locality[other_cpu])
+					rq->cpu_locality[other_cpu] = locality;
+			}
+		}
+
+/*
+		 * Each runqueue has its own function in case it doesn't have
+		 * siblings of its own allowing mixed topologies.
+		 */
+#ifdef CONFIG_SCHED_SMT
+		if (cpus_weight(rq->smt_siblings) > 1)
+			rq->siblings_idle = siblings_cpu_idle;
+#endif
+#ifdef CONFIG_SCHED_MC
+		if (cpus_weight(rq->cache_siblings) > 1)
+			rq->cache_idle = cache_cpu_idle;
+#endif
+	}
+	grq_unlock_irq();
+}
+#else
+void __init sched_init_smp(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+unsigned int sysctl_timer_migration = 1;
+
+int in_sched_functions(unsigned long addr)
+{
+	return in_lock_functions(addr) ||
+		(addr >= (unsigned long)__sched_text_start
+		&& addr < (unsigned long)__sched_text_end);
+}
+
+void __init sched_init(void)
+{
+	int i;
+	struct rq *rq;
+
+	print_scheduler_version();
+
+	raw_spin_lock_init(&grq.lock);
+	grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
+	grq.noc = 1;
+#ifdef CONFIG_SMP
+	init_defrootdomain();
+	grq.qnr = grq.idle_cpus = 0;
+	cpumask_clear(&grq.cpu_idle_map);
+
+#else
+	uprq = &per_cpu(runqueues, 0);
+#endif
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+		rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
+				  rq->iowait_pc = rq->idle_pc = 0;
+		rq->last_run_scale = 0;
+#ifdef CONFIG_SMP
+		rq->sticky_task = NULL;
+		rq->sd = NULL;
+		rq->rd = NULL;
+		rq->online = false;
+		rq->cpu = i;
+		rq_attach_root(rq, &def_root_domain);
+#endif
+		atomic_set(&rq->nr_iowait, 0);
+	}
+
+#ifdef CONFIG_SMP
+	nr_cpu_ids = i;
+	/*
+	 * Set the base locality for cpu cache distance calculation to
+	 * "distant" (3). Make sure the distance from a CPU to itself is 0.
+	 */
+	for_each_possible_cpu(i) {
+		int j;
+
+		rq = cpu_rq(i);
+#ifdef CONFIG_SCHED_SMT
+		cpumask_clear(&rq->smt_siblings);
+		cpumask_set_cpu(i, &rq->smt_siblings);
+		rq->siblings_idle = sole_cpu_idle;
+		cpumask_set_cpu(i, &rq->smt_siblings);
+#endif
+#ifdef CONFIG_SCHED_MC
+		cpumask_clear(&rq->cache_siblings);
+		cpumask_set_cpu(i, &rq->cache_siblings);
+		rq->cache_idle = sole_cpu_idle;
+		cpumask_set_cpu(i, &rq->cache_siblings);
+#endif
+		rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(int *), GFP_ATOMIC);
+		for_each_possible_cpu(j) {
+			if (i == j)
+				rq->cpu_locality[j] = 0;
+			else
+				rq->cpu_locality[j] = 4;
+		}
+	}
+#endif
+
+	for (i = 0; i < PRIO_LIMIT; i++)
+		INIT_LIST_HEAD(grq.queue + i);
+	/* delimiter for bitsearch */
+	__set_bit(PRIO_LIMIT, grq.prio_bitmap);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+	plist_head_init(&init_task.pi_waiters);
+#endif
+
+	/*
+	 * The boot idle thread does lazy MMU switching as well:
+	 */
+	atomic_inc(&init_mm.mm_count);
+	enter_lazy_tlb(&init_mm, current);
+
+	/*
+	 * Make us the idle thread. Technically, schedule() should not be
+	 * called from this thread, however somewhere below it might be,
+	 * but because we are the idle thread, we just pick up running again
+	 * when this runqueue becomes "idle".
+	 */
+	init_idle(current, smp_processor_id());
+
+#ifdef CONFIG_SMP
+	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
+	/* May be allocated at isolcpus cmdline parse time */
+	if (cpu_isolated_map == NULL)
+		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+#endif /* SMP */
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+
+	return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+	static unsigned long prev_jiffy;	/* ratelimiting */
+
+	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+		system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+	dump_stack();
+}
+EXPORT_SYMBOL(__might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+void normalize_rt_tasks(void)
+{
+	struct task_struct *g, *p;
+	unsigned long flags;
+	struct rq *rq;
+	int queued;
+
+	read_lock_irq(&tasklist_lock);
+
+	do_each_thread(g, p) {
+		if (!rt_task(p))
+			continue;
+
+		raw_spin_lock_irqsave(&p->pi_lock, flags);
+		rq = __task_grq_lock(p);
+
+		queued = task_queued(p);
+		__setscheduler(p, rq, SCHED_NORMAL, 0);
+		if (queued) {
+			try_preempt(p, rq);
+		}
+
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	} while_each_thread(g, p);
+
+	read_unlock_irq(&tasklist_lock);
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+struct task_struct *curr_task(int cpu)
+{
+	return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack.  It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner.  This function
+ * must be called with all CPU's synchronised, and interrupts disabled, the
+ * and caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, struct task_struct *p)
+{
+	cpu_curr(cpu) = p;
+}
+
+#endif
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	*ut = p->utime;
+	*st = p->stime;
+}
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+
+	*ut = cputime.utime;
+	*st = cputime.stime;
+}
+#else
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	cputime_t rtime, utime = p->utime, total = utime + p->stime;
+
+	rtime = nsecs_to_cputime(p->sched_time);
+
+	if (total) {
+		u64 temp;
+
+		temp = (u64)(rtime * utime);
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	/*
+	 * Compare with previous values, to keep monotonicity:
+	 */
+	p->prev_utime = max(p->prev_utime, utime);
+	p->prev_stime = max(p->prev_stime, (rtime - p->prev_utime));
+
+	*ut = p->prev_utime;
+	*st = p->prev_stime;
+}
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct signal_struct *sig = p->signal;
+	struct task_cputime cputime;
+	cputime_t rtime, utime, total;
+
+	thread_group_cputime(p, &cputime);
+
+	total = cputime.utime + cputime.stime;
+	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+	if (total) {
+		u64 temp;
+
+		temp = (u64)(rtime * cputime.utime);
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	sig->prev_utime = max(sig->prev_utime, utime);
+	sig->prev_stime = max(sig->prev_stime, (rtime - sig->prev_utime));
+
+	*ut = sig->prev_utime;
+	*st = sig->prev_stime;
+}
+#endif
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+	return p->gtime;
+}
+
+void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+{}
+
+#ifdef CONFIG_SCHED_DEBUG
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{}
+
+void proc_sched_set_task(struct task_struct *p)
+{}
+#endif
+
+#ifdef CONFIG_SMP
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return SCHED_LOAD_SCALE;
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long smt_gain = sd->smt_gain;
+
+	smt_gain /= weight;
+
+	return smt_gain;
+}
+#endif
diff -ruN linux-3.4.1/kernel/sched/stats.c linux-3.4.1-RIFS/kernel/sched/stats.c
--- linux-3.4.1/kernel/sched/stats.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sched/stats.c	1970-01-01 08:00:00.000000000 +0800
@@ -1,111 +0,0 @@
-
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
-#include "sched.h"
-
-/*
- * bump this up when changing the output format or the meaning of an existing
- * format, so that tools can adapt (or abort)
- */
-#define SCHEDSTAT_VERSION 15
-
-static int show_schedstat(struct seq_file *seq, void *v)
-{
-	int cpu;
-	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
-	char *mask_str = kmalloc(mask_len, GFP_KERNEL);
-
-	if (mask_str == NULL)
-		return -ENOMEM;
-
-	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
-	seq_printf(seq, "timestamp %lu\n", jiffies);
-	for_each_online_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
-#ifdef CONFIG_SMP
-		struct sched_domain *sd;
-		int dcount = 0;
-#endif
-
-		/* runqueue-specific stats */
-		seq_printf(seq,
-		    "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
-		    cpu, rq->yld_count,
-		    rq->sched_count, rq->sched_goidle,
-		    rq->ttwu_count, rq->ttwu_local,
-		    rq->rq_cpu_time,
-		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
-
-		seq_printf(seq, "\n");
-
-#ifdef CONFIG_SMP
-		/* domain-specific stats */
-		rcu_read_lock();
-		for_each_domain(cpu, sd) {
-			enum cpu_idle_type itype;
-
-			cpumask_scnprintf(mask_str, mask_len,
-					  sched_domain_span(sd));
-			seq_printf(seq, "domain%d %s", dcount++, mask_str);
-			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
-					itype++) {
-				seq_printf(seq, " %u %u %u %u %u %u %u %u",
-				    sd->lb_count[itype],
-				    sd->lb_balanced[itype],
-				    sd->lb_failed[itype],
-				    sd->lb_imbalance[itype],
-				    sd->lb_gained[itype],
-				    sd->lb_hot_gained[itype],
-				    sd->lb_nobusyq[itype],
-				    sd->lb_nobusyg[itype]);
-			}
-			seq_printf(seq,
-				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
-			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
-			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
-			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
-			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
-			    sd->ttwu_move_balance);
-		}
-		rcu_read_unlock();
-#endif
-	}
-	kfree(mask_str);
-	return 0;
-}
-
-static int schedstat_open(struct inode *inode, struct file *file)
-{
-	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
-	char *buf = kmalloc(size, GFP_KERNEL);
-	struct seq_file *m;
-	int res;
-
-	if (!buf)
-		return -ENOMEM;
-	res = single_open(file, show_schedstat, NULL);
-	if (!res) {
-		m = file->private_data;
-		m->buf = buf;
-		m->size = size;
-	} else
-		kfree(buf);
-	return res;
-}
-
-static const struct file_operations proc_schedstat_operations = {
-	.open    = schedstat_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = single_release,
-};
-
-static int __init proc_schedstat_init(void)
-{
-	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
-	return 0;
-}
-module_init(proc_schedstat_init);
diff -ruN linux-3.4.1/kernel/sched/stats.h linux-3.4.1-RIFS/kernel/sched/stats.h
--- linux-3.4.1/kernel/sched/stats.h	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sched/stats.h	2012-06-06 14:43:02.000000000 +0800
@@ -1,231 +0,0 @@
-
-#ifdef CONFIG_SCHEDSTATS
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{
-	if (rq) {
-		rq->rq_sched_info.run_delay += delta;
-		rq->rq_sched_info.pcount++;
-	}
-}
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{
-	if (rq)
-		rq->rq_cpu_time += delta;
-}
-
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{
-	if (rq)
-		rq->rq_sched_info.run_delay += delta;
-}
-# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val)	do { var = (val); } while (0)
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{}
-# define schedstat_inc(rq, field)	do { } while (0)
-# define schedstat_add(rq, field, amt)	do { } while (0)
-# define schedstat_set(var, val)	do { } while (0)
-#endif
-
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
-static inline void sched_info_reset_dequeued(struct task_struct *t)
-{
-	t->sched_info.last_queued = 0;
-}
-
-/*
- * We are interested in knowing how long it was from the *first* time a
- * task was queued to the time that it finally hit a cpu, we call this routine
- * from dequeue_task() to account for possible rq->clock skew across cpus. The
- * delta taken on each cpu would annul the skew.
- */
-static inline void sched_info_dequeued(struct task_struct *t)
-{
-	unsigned long long now = task_rq(t)->clock, delta = 0;
-
-	if (unlikely(sched_info_on()))
-		if (t->sched_info.last_queued)
-			delta = now - t->sched_info.last_queued;
-	sched_info_reset_dequeued(t);
-	t->sched_info.run_delay += delta;
-
-	rq_sched_info_dequeued(task_rq(t), delta);
-}
-
-/*
- * Called when a task finally hits the cpu.  We can now calculate how
- * long it was waiting to run.  We also note when it began so that we
- * can keep stats on how long its timeslice is.
- */
-static void sched_info_arrive(struct task_struct *t)
-{
-	unsigned long long now = task_rq(t)->clock, delta = 0;
-
-	if (t->sched_info.last_queued)
-		delta = now - t->sched_info.last_queued;
-	sched_info_reset_dequeued(t);
-	t->sched_info.run_delay += delta;
-	t->sched_info.last_arrival = now;
-	t->sched_info.pcount++;
-
-	rq_sched_info_arrive(task_rq(t), delta);
-}
-
-/*
- * This function is only called from enqueue_task(), but also only updates
- * the timestamp if it is already not set.  It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
- */
-static inline void sched_info_queued(struct task_struct *t)
-{
-	if (unlikely(sched_info_on()))
-		if (!t->sched_info.last_queued)
-			t->sched_info.last_queued = task_rq(t)->clock;
-}
-
-/*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily.  Now we can calculate how long we ran.
- * Also, if the process is still in the TASK_RUNNING state, call
- * sched_info_queued() to mark that it has now again started waiting on
- * the runqueue.
- */
-static inline void sched_info_depart(struct task_struct *t)
-{
-	unsigned long long delta = task_rq(t)->clock -
-					t->sched_info.last_arrival;
-
-	rq_sched_info_depart(task_rq(t), delta);
-
-	if (t->state == TASK_RUNNING)
-		sched_info_queued(t);
-}
-
-/*
- * Called when tasks are switched involuntarily due, typically, to expiring
- * their time slice.  (This may also be called when switching to or from
- * the idle task.)  We are only called when prev != next.
- */
-static inline void
-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
-{
-	struct rq *rq = task_rq(prev);
-
-	/*
-	 * prev now departs the cpu.  It's not interesting to record
-	 * stats about how efficient we were at scheduling the idle
-	 * process, however.
-	 */
-	if (prev != rq->idle)
-		sched_info_depart(prev);
-
-	if (next != rq->idle)
-		sched_info_arrive(next);
-}
-static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
-{
-	if (unlikely(sched_info_on()))
-		__sched_info_switch(prev, next);
-}
-#else
-#define sched_info_queued(t)			do { } while (0)
-#define sched_info_reset_dequeued(t)	do { } while (0)
-#define sched_info_dequeued(t)			do { } while (0)
-#define sched_info_switch(t, next)		do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
-
-/*
- * The following are functions that support scheduler-internal time accounting.
- * These functions are generally called at the timer tick.  None of this depends
- * on CONFIG_SCHEDSTATS.
- */
-
-/**
- * account_group_user_time - Maintain utime for a thread group.
- *
- * @tsk:	Pointer to task structure.
- * @cputime:	Time value by which to increment the utime field of the
- *		thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
-					   cputime_t cputime)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-	if (!cputimer->running)
-		return;
-
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.utime += cputime;
-	raw_spin_unlock(&cputimer->lock);
-}
-
-/**
- * account_group_system_time - Maintain stime for a thread group.
- *
- * @tsk:	Pointer to task structure.
- * @cputime:	Time value by which to increment the stime field of the
- *		thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void account_group_system_time(struct task_struct *tsk,
-					     cputime_t cputime)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-	if (!cputimer->running)
-		return;
-
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.stime += cputime;
-	raw_spin_unlock(&cputimer->lock);
-}
-
-/**
- * account_group_exec_runtime - Maintain exec runtime for a thread group.
- *
- * @tsk:	Pointer to task structure.
- * @ns:		Time value by which to increment the sum_exec_runtime field
- *		of the thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void account_group_exec_runtime(struct task_struct *tsk,
-					      unsigned long long ns)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-	if (!cputimer->running)
-		return;
-
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	raw_spin_unlock(&cputimer->lock);
-}
diff -ruN linux-3.4.1/kernel/sysctl.c linux-3.4.1-RIFS/kernel/sysctl.c
--- linux-3.4.1/kernel/sysctl.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sysctl.c	2012-06-15 23:46:44.000000000 +0800
@@ -242,6 +242,7 @@
 };
 
 #ifdef CONFIG_SCHED_DEBUG
+#ifndef CONFIG_SCHED_RIFS
 static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
@@ -249,6 +250,7 @@
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
 #endif
+#endif
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -256,14 +258,8 @@
 #endif
 
 static struct ctl_table kern_table[] = {
-	{
-		.procname	= "sched_child_runs_first",
-		.data		= &sysctl_sched_child_runs_first,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #ifdef CONFIG_SCHED_DEBUG
+#ifndef CONFIG_SCHED_RIFS
 	{
 		.procname	= "sched_min_granularity_ns",
 		.data		= &sysctl_sched_min_granularity,
@@ -338,20 +334,7 @@
 		.extra2		= &one,
 	},
 #endif
-	{
-		.procname	= "sched_rt_period_us",
-		.data		= &sysctl_sched_rt_period,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_rt_handler,
-	},
-	{
-		.procname	= "sched_rt_runtime_us",
-		.data		= &sysctl_sched_rt_runtime,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= sched_rt_handler,
-	},
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
  2012-06-16 17:18 [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release Chen
@ 2012-06-17  6:24 ` Mike Galbraith
       [not found]   ` <CANQmPXiU_Pz2qZkisF34TBepbRW4iJ4ytLLfiEmuJpCHxUwWvA@mail.gmail.com>
  2012-06-17 22:24   ` Chen
  0 siblings, 2 replies; 10+ messages in thread
From: Mike Galbraith @ 2012-06-17  6:24 UTC (permalink / raw)
  To: Chen; +Cc: linux-kernel

On Sun, 2012-06-17 at 01:18 +0800, Chen wrote:

> CFS is good with its beautiful design. But in my opinion i think the
> sleeper feature will destroy the beautiful design.

Destroys may be a bit strong, but it does have it's problems, one being:
as resource contention increases, so does total sleep.

It would be great to come up with a model that works as well at letting
light tasks compete without the downsides, but until that happens, this
model is better than none at all.  With no wakeup preemption model, the
scheduler is not fair to light tasks, a big hairy wart IMHO, which mars
the beauty more than sleep model "beauty marks" (ok, so they're warts).

"Beauty is in the eye of the beholder" and "Perfect is the enemy of
good" are both applicable.. as is "Math is hard, let's go shopping" :)

-Mike


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
       [not found]   ` <CANQmPXiU_Pz2qZkisF34TBepbRW4iJ4ytLLfiEmuJpCHxUwWvA@mail.gmail.com>
@ 2012-06-17  8:20     ` Chen
       [not found]       ` <4d5440bc-73bb-4c30-8571-bb7004fc5c4a@skytron.local>
  0 siblings, 1 reply; 10+ messages in thread
From: Chen @ 2012-06-17  8:20 UTC (permalink / raw)
  To: linux-kernel

On Sun, Jun 17, 2012 at 2:24 PM, Mike Galbraith <efault@gmx.de> wrote:
> Destroys may be a bit strong, but it does have it's problems, one being:
> as resource contention increases, so does total sleep.

Sleeper feature(I mean the re-designed sleeper fairness) give too much
resources to the sleep task. They need to wake up with smaller latency
but if they were given too much priority it will be harmful to the
other interactive tasks eg. X. Under high IO load the sleeper fairness
feature can causes music lag or X lag. We can't remove it since there
is no any tuning to turn this off from 2.6.31.

CFS can still be simplified and optimised.

RIFS-ES uses different way to do this.

				if((p->state != TASK_RUNNING) || (p->tick_used >= MAX_TICK_USED)) {
					p->tick_used /= 2;
				}

Here, an cpu intensive task won't be punished too heavy. also this can
improve the wake up latency of 50+% CPU_BOUND tasks.

				if(p->sleep_cnt < MAX_SLEEP_COUNT)
					p->sleep_cnt++;

If a task sleep the sleep_cnt increase.

				#define prio_raisable(p) (	\
					p->prio > ((p->static_prio - reverse(MAX_TICK_USED, p->tick_used)) + \
							(p->sleep_cnt / 3)))
	
Priority of a task that is going to sleep is decided by its time spent
on running. sleep_cnt is the sleep rate of a task.

				if(prio_raisable(p))
					p->prio--;
				if(sleep_jiffy != p->sleep_jiffy)
					p->sleep_cnt /= 2;
				p->sleep_jiffy = sleep_jiffy;

With this, if the sleep rate of a task is low, the sleep rate will be
divide by 2 and it can wake up with a lower latency.
That is the reason why I call this scheduler RIFS-ES.(RIFS and
enhancement of Unix System V scheduling).

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
  2012-06-17  6:24 ` Mike Galbraith
       [not found]   ` <CANQmPXiU_Pz2qZkisF34TBepbRW4iJ4ytLLfiEmuJpCHxUwWvA@mail.gmail.com>
@ 2012-06-17 22:24   ` Chen
  2012-06-18  3:01     ` Vikram Dhillon
  1 sibling, 1 reply; 10+ messages in thread
From: Chen @ 2012-06-17 22:24 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 70 bytes --]

Sorry for my fault.
The patch has build issue and I post the new one.

[-- Attachment #2: RIFS.ES-v1-kernel3.4.x --]
[-- Type: application/octet-stream, Size: 195697 bytes --]

diff -ruN linux-3.4.1/fs/proc/base.c linux-3.4.1-RIFS/fs/proc/base.c
--- linux-3.4.1/fs/proc/base.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/fs/proc/base.c	2012-06-06 15:02:18.000000000 +0800
@@ -342,7 +342,7 @@
 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 {
 	return sprintf(buffer, "%llu %llu %lu\n",
-			(unsigned long long)task->se.sum_exec_runtime,
+			(unsigned long long)tsk_seruntime(task),
 			(unsigned long long)task->sched_info.run_delay,
 			task->sched_info.pcount);
 }
diff -ruN linux-3.4.1/include/linux/init_task.h linux-3.4.1-RIFS/include/linux/init_task.h
--- linux-3.4.1/include/linux/init_task.h	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/include/linux/init_task.h	2012-06-06 14:50:29.000000000 +0800
@@ -132,12 +132,68 @@
 # define INIT_PERF_EVENTS(tsk)
 #endif
 
-#define INIT_TASK_COMM "swapper"
-
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
  */
+#ifdef CONFIG_SCHED_RIFS
+#define INIT_TASK_COMM "RIFS"
+#define INIT_TASK(tsk)	\
+{									\
+	.state		= 0,						\
+	.stack		= &init_thread_info,				\
+	.usage		= ATOMIC_INIT(2),				\
+	.flags		= PF_KTHREAD,					\
+	.prio		= NORMAL_PRIO,					\
+	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= NORMAL_PRIO,					\
+	.policy		= SCHED_NORMAL,					\
+	.cpus_allowed	= CPU_MASK_ALL,					\
+	.mm		= NULL,						\
+	.active_mm	= &init_mm,					\
+	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.time_slice	= HZ,					\
+	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
+	INIT_PUSHABLE_TASKS(tsk)					\
+	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
+	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
+	.real_parent	= &tsk,						\
+	.parent		= &tsk,						\
+	.children	= LIST_HEAD_INIT(tsk.children),			\
+	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.group_leader	= &tsk,						\
+	RCU_INIT_POINTER(.real_cred, &init_cred),			\
+	RCU_INIT_POINTER(.cred, &init_cred),				\
+	.comm		= INIT_TASK_COMM,				\
+	.thread		= INIT_THREAD,					\
+	.fs		= &init_fs,					\
+	.files		= &init_files,					\
+	.signal		= &init_signals,				\
+	.sighand	= &init_sighand,				\
+	.nsproxy	= &init_nsproxy,				\
+	.pending	= {						\
+		.list = LIST_HEAD_INIT(tsk.pending.list),		\
+		.signal = {{0}}},					\
+	.blocked	= {{0}},					\
+	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
+	.journal_info	= NULL,						\
+	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
+	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	.pids = {							\
+		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
+		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
+		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
+	},								\
+	INIT_IDS							\
+	INIT_PERF_EVENTS(tsk)						\
+	INIT_TRACE_IRQFLAGS						\
+	INIT_LOCKDEP							\
+	INIT_FTRACE_GRAPH						\
+	INIT_TRACE_RECURSION						\
+	INIT_TASK_RCU_PREEMPT(tsk)					\
+}
+#else /* CONFIG_SCHED_RIFS */
 #define INIT_TASK(tsk)	\
 {									\
 	.state		= 0,						\
@@ -201,7 +257,7 @@
 	INIT_TASK_RCU_PREEMPT(tsk)					\
 	INIT_CPUSET_SEQ							\
 }
-
+#endif
 
 #define INIT_CPU_TIMERS(cpu_timers)					\
 {									\
diff -ruN linux-3.4.1/include/linux/sched.h linux-3.4.1-RIFS/include/linux/sched.h
--- linux-3.4.1/include/linux/sched.h	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/include/linux/sched.h	2012-06-16 20:28:34.000000000 +0800
@@ -37,9 +37,13 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
-/* SCHED_ISO: reserved but not implemented yet */
 #define SCHED_IDLE		5
-/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
+#define SCHED_IDLEPRIO		SCHED_IDLE
+#ifdef CONFIG_SCHED_RIFS
+#define SCHED_MAX		(SCHED_IDLEPRIO)
+#define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
+#endif
+
 #define SCHED_RESET_ON_FORK     0x40000000
 
 #ifdef __KERNEL__
@@ -268,8 +272,6 @@
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
-extern int runqueue_is_locked(int cpu);
-
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern void select_nohz_load_balancer(int stop_tick);
 extern void set_cpu_sd_state_idle(void);
@@ -1270,15 +1272,33 @@
 
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
-	int on_cpu;
 #endif
+#ifndef CONFIG_SCHED_RIFS
+	int on_cpu;
 	int on_rq;
-
+#else
+	bool on_cpu;
+	bool on_rq;
+#endif
 	int prio, static_prio, normal_prio;
 	unsigned int rt_priority;
+#ifdef CONFIG_SCHED_RIFS
+	int time_slice;
+	int tick_used;
+	int sleep_cnt;
+	struct list_head run_list;
+	u64 last_ran;
+	u64 sleep_jiffy;
+	u64 sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_SMP
+	bool sticky; /* Soft affined flag */
+#endif
+	unsigned long rt_timeout;
+#else /* CONFIG_SCHED_RIFS */
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* list of struct preempt_notifier: */
@@ -1391,6 +1411,9 @@
 
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
+#ifdef CONFIG_SCHED_RIFS
+	unsigned long utime_pc, stime_pc;
+#endif
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	cputime_t prev_utime, prev_stime;
 #endif
@@ -1619,6 +1642,55 @@
 #endif
 };
 
+#ifdef CONFIG_SCHED_RIFS
+bool grunqueue_is_locked(void);
+void grq_unlock_wait(void);
+void cpu_scaling(int cpu);
+void cpu_nonscaling(int cpu);
+bool above_background_load(void);
+#define tsk_seruntime(t)		((t)->sched_time)
+#define tsk_rttimeout(t)		((t)->rt_timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+}
+
+static inline int runqueue_is_locked(int cpu)
+{
+	return grunqueue_is_locked();
+}
+
+void print_scheduler_version(void);
+
+#else /* CFS */
+extern int runqueue_is_locked(int cpu);
+static inline void cpu_scaling(int cpu)
+{
+}
+
+static inline void cpu_nonscaling(int cpu)
+{
+}
+#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)
+#define tsk_rttimeout(t)	((t)->rt.timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
+}
+
+static inline void print_scheduler_version(void)
+{
+	printk(KERN_INFO"CFS CPU scheduler.\n");
+}
+
+/* Anyone feel like implementing this? */
+static inline bool above_background_load(void)
+{
+	return false;
+}
+#endif /* CONFIG_SCHED_RIFS */
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
@@ -1636,10 +1708,20 @@
  */
 
 #define MAX_USER_RT_PRIO	100
-#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#define MAX_RT_PRIO		(MAX_USER_RT_PRIO)
+#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
 
+#ifdef CONFIG_SCHED_RIFS
+#define PRIO_RANGE		(40)
+#define MAX_PRIO		(MAX_RT_PRIO + PRIO_RANGE)
+//#define ISO_PRIO		(MAX_RT_PRIO) 已经被我干掉,哈哈
+#define NORMAL_PRIO		(MAX_RT_PRIO + 1)
+#define IDLE_PRIO		(MAX_PRIO + 1)
+#define PRIO_LIMIT		((IDLE_PRIO) + 1)
+#else /* CONFIG_SCHED_RIFS */
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
-#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+#define NORMAL_PRIO		(DEFAULT_PRIO)
+#endif /* CONFIG_SCHED_RIFS */
 
 static inline int rt_prio(int prio)
 {
@@ -2002,7 +2084,7 @@
 task_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_RIFS)
 extern void sched_exec(void);
 #else
 #define sched_exec()   {}
diff -ruN linux-3.4.1/init/Kconfig linux-3.4.1-RIFS/init/Kconfig
--- linux-3.4.1/init/Kconfig	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/init/Kconfig	2012-06-08 22:15:30.000000000 +0800
@@ -29,6 +29,18 @@
 
 menu "General setup"
 
+config SCHED_RIFS
+	bool "RIFS cpu scheduler"
+	---help---
+	  The RIFS cpu scheduler is designed for excellent interactivity and
+	  responsiveness.
+
+	  Currently incompatible with the Group CPU scheduler, and RCU TORTURE
+          TEST so these options are disabled.
+
+          Say Y here.
+	default y
+
 config EXPERIMENTAL
 	bool "Prompt for development and/or incomplete code/drivers"
 	---help---
@@ -631,6 +643,7 @@
 
 config CGROUP_CPUACCT
 	bool "Simple CPU accounting cgroup subsystem"
+	depends on !SCHED_RIFS
 	help
 	  Provides a simple Resource Controller for monitoring the
 	  total CPU consumed by the tasks in a cgroup.
@@ -718,6 +731,7 @@
 
 menuconfig CGROUP_SCHED
 	bool "Group CPU scheduler"
+	depends on !SCHED_RIFS
 	default n
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
@@ -854,6 +868,7 @@
 
 config SCHED_AUTOGROUP
 	bool "Automatic process group scheduling"
+	depends on !SCHED_RIFS
 	select EVENTFD
 	select CGROUPS
 	select CGROUP_SCHED
diff -ruN linux-3.4.1/kernel/delayacct.c linux-3.4.1-RIFS/kernel/delayacct.c
--- linux-3.4.1/kernel/delayacct.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/delayacct.c	2012-06-06 15:03:30.000000000 +0800
@@ -130,7 +130,7 @@
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->se.sum_exec_runtime;
+	t3 = tsk_seruntime(tsk);
 
 	d->cpu_count += t1;
 
diff -ruN linux-3.4.1/kernel/exit.c linux-3.4.1-RIFS/kernel/exit.c
--- linux-3.4.1/kernel/exit.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/exit.c	2012-06-05 20:46:45.000000000 +0800
@@ -133,7 +133,7 @@
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
-		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+		sig->sum_sched_runtime += tsk_seruntime(tsk);
 	}
 
 	sig->nr_threads--;
diff -ruN linux-3.4.1/kernel/posix-cpu-timers.c linux-3.4.1-RIFS/kernel/posix-cpu-timers.c
--- linux-3.4.1/kernel/posix-cpu-timers.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/posix-cpu-timers.c	2012-06-06 14:37:35.000000000 +0800
@@ -495,7 +495,7 @@
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
 	cleanup_timers(tsk->cpu_timers,
-		       tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+		       tsk->utime, tsk->stime, tsk_seruntime(tsk));
 
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -504,7 +504,7 @@
 
 	cleanup_timers(tsk->signal->cpu_timers,
 		       tsk->utime + sig->utime, tsk->stime + sig->stime,
-		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
+		       tsk_seruntime(tsk) + sig->sum_sched_runtime);
 }
 
 static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -934,7 +934,7 @@
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
+		if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) {
 			tsk->cputime_expires.sched_exp = t->expires.sched;
 			break;
 		}
@@ -951,7 +951,7 @@
 			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
-		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+		    tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the hard limit, we just die.
 			 * No need to calculate anything else now.
@@ -959,7 +959,7 @@
 			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
 			return;
 		}
-		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+		if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the soft limit, send a SIGXCPU every second.
 			 */
@@ -1252,7 +1252,7 @@
 		struct task_cputime task_sample = {
 			.utime = tsk->utime,
 			.stime = tsk->stime,
-			.sum_exec_runtime = tsk->se.sum_exec_runtime
+			.sum_exec_runtime = tsk_seruntime(tsk)
 		};
 
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
diff -ruN linux-3.4.1/kernel/sched/Makefile linux-3.4.1-RIFS/kernel/sched/Makefile
--- linux-3.4.1/kernel/sched/Makefile	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sched/Makefile	2012-06-05 20:42:22.000000000 +0800
@@ -11,10 +11,13 @@
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+ifdef CONFIG_SCHED_RIFS
+obj-y += rifs.o clock.o
+else
 obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
-obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+endif
+obj-$(CONFIG_SMP) += cpupri.o
 
 
diff -ruN linux-3.4.1/kernel/sched/rifs.c linux-3.4.1-RIFS/kernel/sched/rifs.c
--- linux-3.4.1/kernel/sched/rifs.c	1970-01-01 08:00:00.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sched/rifs.c	2012-06-16 22:57:50.000000000 +0800
@@ -0,0 +1,6692 @@
+/*
+ *  kernel/sched/rifs.c
+ *
+ *  Kernel scheduler and related syscalls
+ *
+ *  Copyright (C) 1991-2002  Linus Torvalds
+ *
+ *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
+ *		make semaphores SMP safe
+ *  1998-11-19	Implemented schedule_timeout() and related stuff
+ *		by Andrea Arcangeli
+ *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Cleanups and useful suggestions
+ *		by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03	Interactivity tuning by Con Kolivas.
+ *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2007-04-15  Work begun on replacing all interactivity tuning with a
+ *			  fair scheduling design by Con Kolivas.
+ *  2007-05-05  Load balancing (smp-nice) and other improvements
+ *			  by Peter Williams
+ *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
+ *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ *			  Thomas Gleixner, Mike Kravetz
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+#include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
+#include <linux/perf_event.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/freezer.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kprobes.h>
+#include <linux/delayacct.h>
+#include <linux/log2.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/init_task.h>
+#include <linux/math64.h>
+
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+#include <asm/unistd.h>
+#include <asm/mutex.h>
+
+#include "cpupri.h"
+#include "../workqueue_sched.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
+#define MAX_TICK_USED		(rr_interval * 3)
+#define MAX_SLEEP_COUNT		(rr_interval * 3)
+#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)		rt_prio((p)->prio)
+#define rt_queue(rq)		rt_prio((rq)->rq_prio)
+#define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
+#define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
+					(policy) == SCHED_RR)
+#define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
+#define idleprio_task(p)	unlikely((p)->policy == SCHED_IDLEPRIO)
+#define reverse(max, val)	(max - val)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+#define SCHED_PRIO(p)		((p) + MAX_RT_PRIO)
+#define STOP_PRIO		(MAX_RT_PRIO - 1)
+
+/*
+ * Some helpers for converting to/from various scales. Use shifts to get
+ * approximate multiples of ten for less overhead.
+ */
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define JIFFY_NS		(1000000000 / HZ)
+#define HALF_JIFFY_NS		(1000000000 / HZ / 2)
+#define HALF_JIFFY_US		(1000000 / HZ / 2)
+#define MS_TO_NS(TIME)		((TIME) << 20)
+#define MS_TO_US(TIME)		((TIME) << 10)
+#define NS_TO_MS(TIME)		((TIME) >> 20)
+#define NS_TO_US(TIME)		((TIME) >> 10)
+
+#define RESCHED_US	(100) /* Reschedule if less than this many μs left */
+
+void print_scheduler_version(void)
+{
+	printk(KERN_INFO "Rotary Interactivity Favor Scheduler - RIFS By QQ:3766691.\n");
+}
+
+/*
+ * This is the time all tasks within the same priority round robin.
+ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
+ * Tunable via /proc interface.
+ */
+int rr_interval __read_mostly = 6;
+
+/*
+ * 兼容。
+ */
+int sched_iso_cpu __read_mostly = 0;
+
+/*
+ * time_slice for each process
+ */
+#define timeslice() MS_TO_US(rr_interval)
+
+#define get_time_slice(p) { \
+	p->time_slice = timeslice();	\
+}
+
+/*
+ * The global runqueue data that all CPUs work off. Data is protected either
+ * by the global grq lock, or the discrete lock that precedes the data in this
+ * struct.
+ */
+struct global_rq {
+	raw_spinlock_t lock;
+	unsigned long nr_running;
+	unsigned long nr_uninterruptible;
+	unsigned long long nr_switches;
+	struct list_head queue[PRIO_LIMIT];
+	DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
+#ifdef CONFIG_SMP
+	unsigned long qnr; /* queued not running */
+	cpumask_t cpu_idle_map;
+	bool idle_cpus;
+#endif
+	int noc; /* num_online_cpus stored and updated when it changes */
+};
+
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+	atomic_t refcount;
+	atomic_t rto_count;
+	struct rcu_head rcu;
+	cpumask_var_t span;
+	cpumask_var_t online;
+
+	/*
+	 * The "RT overload" flag: it gets set if a CPU has more than
+	 * one runnable RT task.
+	 */
+	cpumask_var_t rto_mask;
+	struct cpupri cpupri;
+};
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/* There can be only one */
+static struct global_rq grq;
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ * This data should only be modified by the local cpu.
+ */
+struct rq {
+	struct task_struct *curr, *idle, *stop;
+	struct mm_struct *prev_mm;
+	u64 last_run_scale;
+
+	unsigned int rq_policy;
+	u64 rq_last_ran;
+	int rq_prio;
+	bool rq_running; /* There is a task running */
+
+	/* Accurate timekeeping data */
+	u64 timekeep_clock;
+	unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
+		iowait_pc, idle_pc;
+	long account_pc;
+	atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+	int cpu;		/* cpu of this runqueue */
+	bool online;
+	bool scaling; /* This CPU is managed by a scaling CPU freq governor */
+	struct task_struct *sticky_task;
+
+	struct root_domain *rd;
+	struct sched_domain *sd;
+	int *cpu_locality; /* CPU relative cache distance */
+#ifdef CONFIG_SCHED_SMT
+	bool (*siblings_idle)(int cpu);
+	/* See if all smt siblings are idle */
+	cpumask_t smt_siblings;
+#endif
+#ifdef CONFIG_SCHED_MC
+	bool (*cache_idle)(int cpu);
+	/* See if all cache siblings are idle */
+	cpumask_t cache_siblings;
+#endif
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	u64 prev_irq_time;
+#endif
+
+	u64 clock;
+	u64 clock_task;
+};
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+static DEFINE_MUTEX(sched_hotcpu_mutex);
+
+#ifdef CONFIG_SMP
+/*
+ * sched_domains_mutex serialises calls to init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+static DEFINE_MUTEX(sched_domains_mutex);
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+	   return 0*SD_ASYM_PACKING;
+}
+#endif
+
+#define rcu_dereference_check_sched_domain(p) \
+	rcu_dereference_check((p), \
+				  lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+
+static inline void update_rq_clock(struct rq *rq);
+
+/*
+ * Sanity check should sched_clock return bogus values. We make sure it does
+ * not appear to go backwards, and use jiffies to determine the maximum and
+ * minimum it could possibly have increased, and round down to the nearest
+ * jiffy when it falls outside this.
+ */
+static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
+{
+	unsigned long min_diff, max_diff;
+
+	if (jiff_diff > 1)
+		min_diff = JIFFIES_TO_NS(jiff_diff - 1);
+	else
+		min_diff = 1;
+	/*  Round up to the nearest tick for maximum */
+	max_diff = JIFFIES_TO_NS(jiff_diff + 1);
+
+	if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
+		*niff_diff = min_diff;
+}
+
+#ifdef CONFIG_SMP
+#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
+#define this_rq()		(&__get_cpu_var(runqueues))
+#define task_rq(p)		cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+static inline int cpu_of(struct rq *rq)
+{
+	return rq->cpu;
+}
+
+#else /* CONFIG_SMP */
+static struct rq *uprq;
+#define cpu_rq(cpu)	(uprq)
+#define this_rq()	(uprq)
+#define task_rq(p)	(uprq)
+#define cpu_curr(cpu)	((uprq)->curr)
+static inline int cpu_of(struct rq *rq)
+{
+	return 0;
+}
+
+#endif
+#define raw_rq()	(&__raw_get_cpu_var(runqueues))
+
+#include "stats.h"
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)	do { } while (0)
+#endif
+
+/*
+ * All common locking functions performed on grq.lock. rq->clock is local to
+ * the CPU accessing it so it can be modified just with interrupts disabled
+ * when we're not updating the time.
+ * Looking up task_rq must be done under grq.lock to be safe.
+ */
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
+static inline void update_rq_clock(struct rq *rq)
+{
+	s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+
+	rq->clock += delta;
+	update_rq_clock_task(rq, delta);
+}
+
+static inline bool task_running(struct task_struct *p)
+{
+	return p->on_cpu;
+}
+
+static inline void grq_lock(void)
+	__acquires(grq.lock)
+{
+	raw_spin_lock(&grq.lock);
+}
+
+static inline void grq_unlock(void)
+	__releases(grq.lock)
+{
+	raw_spin_unlock(&grq.lock);
+}
+
+static inline void grq_lock_irq(void)
+	__acquires(grq.lock)
+{
+	raw_spin_lock_irq(&grq.lock);
+}
+
+static inline void time_lock_grq(struct rq *rq)
+	__acquires(grq.lock)
+{
+	grq_lock();
+}
+
+static inline void grq_unlock_irq(void)
+	__releases(grq.lock)
+{
+	raw_spin_unlock_irq(&grq.lock);
+}
+
+static inline void grq_lock_irqsave(unsigned long *flags)
+	__acquires(grq.lock)
+{
+	raw_spin_lock_irqsave(&grq.lock, *flags);
+}
+
+static inline void grq_unlock_irqrestore(unsigned long *flags)
+	__releases(grq.lock)
+{
+	raw_spin_unlock_irqrestore(&grq.lock, *flags);
+}
+
+static inline struct rq
+*task_grq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(grq.lock)
+{
+	grq_lock_irqsave(flags);
+	return task_rq(p);
+}
+
+static inline struct rq
+*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(grq.lock)
+{
+	struct rq *rq = task_grq_lock(p, flags);
+	return rq;
+}
+
+static inline struct rq *task_grq_lock_irq(struct task_struct *p)
+	__acquires(grq.lock)
+{
+	grq_lock_irq();
+	return task_rq(p);
+}
+
+static inline void task_grq_unlock_irq(void)
+	__releases(grq.lock)
+{
+	grq_unlock_irq();
+}
+
+static inline void task_grq_unlock(unsigned long *flags)
+	__releases(grq.lock)
+{
+	grq_unlock_irqrestore(flags);
+}
+
+/**
+ * grunqueue_is_locked
+ *
+ * Returns true if the global runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+bool grunqueue_is_locked(void)
+{
+	return raw_spin_is_locked(&grq.lock);
+}
+
+void grq_unlock_wait(void)
+	__releases(grq.lock)
+{
+	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+	raw_spin_unlock_wait(&grq.lock);
+}
+
+static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
+	__acquires(grq.lock)
+{
+	local_irq_save(*flags);
+	time_lock_grq(rq);
+}
+
+static inline struct rq *__task_grq_lock(struct task_struct *p)
+	__acquires(grq.lock)
+{
+	grq_lock();
+	return task_rq(p);
+}
+
+static inline void __task_grq_unlock(void)
+	__releases(grq.lock)
+{
+	grq_unlock();
+}
+
+/*
+ * Look for any tasks *anywhere* that are running nice 0 or better. We do
+ * this lockless for overhead reasons since the occasional wrong result
+ * is harmless.
+ */
+bool above_background_load(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct task_struct *cpu_curr = cpu_rq(cpu)->curr;
+
+		if (unlikely(!cpu_curr))
+			continue;
+		if (PRIO_TO_NICE(cpu_curr->static_prio) < 1) {
+			return true;
+		}
+	}
+	return false;
+}
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+	/* this is a valid case when another task releases the spinlock */
+	grq.lock.owner = current;
+#endif
+	/*
+	 * If we are tracking spinlock dependencies then we have to
+	 * fix up the runqueue lock - which gets 'carried over' from
+	 * prev into current:
+	 */
+	spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_);
+
+	grq_unlock_irq();
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	grq_unlock_irq();
+#else
+	grq_unlock();
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+	smp_wmb();
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+/*
+ * A task that is queued but not running will be on the grq run list.
+ * A task that is not running or queued will not be on the grq run list.
+ * A task that is currently running will have ->on_cpu set but not on the
+ * grq run list.
+ */
+static inline bool task_queued(struct task_struct *p)
+{
+	return (!list_empty(&p->run_list));
+}
+
+/*
+ * Removing from the global runqueue. Enter with grq locked.
+ */
+static void dequeue_task(struct task_struct *p)
+{
+	int prio = p->prio;
+	list_del_init(&p->run_list);
+	if (list_empty(grq.queue + prio))
+		__clear_bit(prio, grq.prio_bitmap);
+}
+
+/*
+ * Adding to the global runqueue. Enter with grq locked.
+ */
+static void enqueue_task(struct task_struct *p)
+{
+	int prio = p->prio;
+	__set_bit(prio, grq.prio_bitmap);
+	list_add_tail(&p->run_list, grq.queue + prio);
+}
+
+/* Only idle task does this as a real time task*/
+static inline void enqueue_task_head(struct task_struct *p)
+{
+	int prio = p->prio;
+	__set_bit(prio, grq.prio_bitmap);
+	list_add(&p->run_list, grq.queue + prio);
+}
+
+static inline void requeue_task(struct task_struct *p)
+{
+}
+
+#ifdef CONFIG_SMP
+/*
+ * qnr is the "queued but not running" count which is the total number of
+ * tasks on the global runqueue list waiting for cpu time but not actually
+ * currently running on a cpu.
+ */
+static inline void inc_qnr(void)
+{
+	grq.qnr++;
+}
+
+static inline void dec_qnr(void)
+{
+	grq.qnr--;
+}
+
+static inline int queued_notrunning(void)
+{
+	return grq.qnr;
+}
+
+/*
+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
+ * allow easy lookup of whether any suitable idle CPUs are available.
+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
+ * idle_cpus variable than to do a full bitmask check when we are busy.
+ */
+static inline void set_cpuidle_map(int cpu)
+{
+	if (likely(cpu_online(cpu))) {
+		cpu_set(cpu, grq.cpu_idle_map);
+		grq.idle_cpus = true;
+	}
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+	cpu_clear(cpu, grq.cpu_idle_map);
+	if (cpus_empty(grq.cpu_idle_map))
+		grq.idle_cpus = false;
+}
+
+static bool suitable_idle_cpus(struct task_struct *p)
+{
+	if (!grq.idle_cpus)
+		return false;
+	return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map));
+}
+
+#define CPUIDLE_DIFF_THREAD	(1)
+#define CPUIDLE_DIFF_CORE	(2)
+#define CPUIDLE_CACHE_BUSY	(4)
+#define CPUIDLE_DIFF_CPU	(8)
+#define CPUIDLE_THREAD_BUSY	(16)
+#define CPUIDLE_DIFF_NODE	(32)
+
+static void resched_task(struct task_struct *p);
+
+/*
+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the
+ * lowest value would give the most suitable CPU to schedule p onto next. The
+ * order works out to be the following:
+ *
+ * Same core, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+ */
+static void
+resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+	unsigned int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
+		CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
+		CPUIDLE_DIFF_THREAD;
+	int cpu_tmp;
+
+	if (cpu_isset(best_cpu, *tmpmask))
+		goto out;
+
+	for_each_cpu_mask(cpu_tmp, *tmpmask) {
+		unsigned int ranking;
+		struct rq *tmp_rq;
+
+		ranking = 0;
+		tmp_rq = cpu_rq(cpu_tmp);
+
+#ifdef CONFIG_NUMA
+		if (rq->cpu_locality[cpu_tmp] > 3)
+			ranking |= CPUIDLE_DIFF_NODE;
+		else
+#endif
+		if (rq->cpu_locality[cpu_tmp] > 2)
+			ranking |= CPUIDLE_DIFF_CPU;
+#ifdef CONFIG_SCHED_MC
+		if (rq->cpu_locality[cpu_tmp] == 2)
+			ranking |= CPUIDLE_DIFF_CORE;
+		if (!(tmp_rq->cache_idle(cpu_tmp)))
+			ranking |= CPUIDLE_CACHE_BUSY;
+#endif
+#ifdef CONFIG_SCHED_SMT
+		if (rq->cpu_locality[cpu_tmp] == 1)
+			ranking |= CPUIDLE_DIFF_THREAD;
+		if (!(tmp_rq->siblings_idle(cpu_tmp)))
+			ranking |= CPUIDLE_THREAD_BUSY;
+#endif
+		if (ranking < best_ranking) {
+			best_cpu = cpu_tmp;
+			best_ranking = ranking;
+		}
+	}
+out:
+	resched_task(cpu_rq(best_cpu)->curr);
+}
+
+static void resched_best_idle(struct task_struct *p)
+{
+	cpumask_t tmpmask;
+
+	cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
+	resched_best_mask(task_cpu(p), task_rq(p), &tmpmask);
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+	if (suitable_idle_cpus(p))
+		resched_best_idle(p);
+}
+/*
+ * Flags to tell us whether this CPU is running a CPU frequency governor that
+ * has slowed its speed or not. No locking required as the very rare wrongly
+ * read value would be harmless.
+ */
+void cpu_scaling(int cpu)
+{
+	cpu_rq(cpu)->scaling = true;
+}
+
+void cpu_nonscaling(int cpu)
+{
+	cpu_rq(cpu)->scaling = false;
+}
+
+static inline bool scaling_rq(struct rq *rq)
+{
+	return rq->scaling;
+}
+
+static inline int locality_diff(struct task_struct *p, struct rq *rq)
+{
+	return rq->cpu_locality[task_cpu(p)];
+}
+#else /* CONFIG_SMP */
+static inline void inc_qnr(void)
+{
+}
+
+static inline void dec_qnr(void)
+{
+}
+
+static inline int queued_notrunning(void)
+{
+	return grq.nr_running;
+}
+
+static inline void set_cpuidle_map(int cpu)
+{
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+}
+
+static inline bool suitable_idle_cpus(struct task_struct *p)
+{
+	return current == uprq->idle;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+}
+
+void cpu_scaling(int __unused)
+{
+}
+
+void cpu_nonscaling(int __unused)
+{
+}
+
+/*
+ * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
+ * always returns 0.
+ */
+static inline bool scaling_rq(struct rq *rq)
+{
+	return false;
+}
+
+static inline int locality_diff(struct task_struct *p, struct rq *rq)
+{
+	return 0;
+}
+#endif /* CONFIG_SMP */
+EXPORT_SYMBOL_GPL(cpu_scaling);
+EXPORT_SYMBOL_GPL(cpu_nonscaling);
+
+/*
+ * activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void activate_idle_task(struct task_struct *p)
+{
+	enqueue_task_head(p);
+	grq.nr_running++;
+	inc_qnr();
+}
+
+/*
+ * activate_task - move a task to the runqueue. Enter with grq locked.
+ */
+static void activate_task(struct task_struct *p, struct rq *rq)
+{
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+					 (rq->clock - p->last_ran) >> 20);
+		
+	}
+
+	if (task_contributes_to_load(p))
+		grq.nr_uninterruptible--;
+	enqueue_task_head(p);
+	grq.nr_running++;
+	inc_qnr();
+}
+
+static inline void clear_sticky(struct task_struct *p);
+
+/*
+ * deactivate_task - If it's running, it's not on the grq and we can just
+ * decrement the nr_running. Enter with grq locked.
+ */
+static inline void deactivate_task(struct task_struct *p)
+{
+	if (task_contributes_to_load(p))
+		grq.nr_uninterruptible++;
+	grq.nr_running--;
+	clear_sticky(p);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * The caller should hold grq lock.
+	 */
+	WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
+#endif
+	trace_sched_migrate_task(p, cpu);
+	if (task_cpu(p) != cpu)
+		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+
+	/*
+	 * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
+	 * successfully executed on another CPU. We must ensure that updates of
+	 * per-task data have been completed by this moment.
+	 */
+	smp_wmb();
+	task_thread_info(p)->cpu = cpu;
+}
+
+static inline void clear_sticky(struct task_struct *p)
+{
+	p->sticky = false;
+}
+
+static inline bool task_sticky(struct task_struct *p)
+{
+	return p->sticky;
+}
+
+/* Reschedule the best idle CPU that is not this one. */
+static void
+resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p)
+{
+	cpumask_t tmpmask;
+
+	cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map);
+	cpu_clear(cpu, tmpmask);
+	if (cpus_empty(tmpmask))
+		return;
+	resched_best_mask(cpu, rq, &tmpmask);
+}
+
+/*
+ * We set the sticky flag on a task that is descheduled involuntarily meaning
+ * it is awaiting further CPU time. If the last sticky task is still sticky
+ * but unlucky enough to not be the next task scheduled, we unstick it and try
+ * to find it an idle CPU. Realtime tasks do not stick to minimise their
+ * latency at all times.
+ */
+static inline void
+swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
+{
+	if (rq->sticky_task) {
+		if (rq->sticky_task == p) {
+			p->sticky = true;
+			return;
+		}
+		if (task_sticky(rq->sticky_task)) {
+			clear_sticky(rq->sticky_task);
+			resched_closest_idle(rq, cpu, rq->sticky_task);
+		}
+	}
+	if (!rt_task(p)) {
+		p->sticky = true;
+		rq->sticky_task = p;
+	} else {
+		resched_closest_idle(rq, cpu, p);
+		rq->sticky_task = NULL;
+	}
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+	rq->sticky_task = NULL;
+	clear_sticky(p);
+}
+#else
+static inline void clear_sticky(struct task_struct *p)
+{
+}
+
+static inline bool task_sticky(struct task_struct *p)
+{
+	return false;
+}
+
+static inline void
+swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
+{
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+#ifdef CONFIG_SMP
+
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
+
+static void resched_task(struct task_struct *p)
+{
+	int cpu;
+
+	assert_raw_spin_locked(&grq.lock);
+
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+		return;
+
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+	cpu = task_cpu(p);
+	if (cpu == smp_processor_id())
+		return;
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(p))
+		smp_send_reschedule(cpu);
+}
+
+#else
+static inline void resched_task(struct task_struct *p)
+{
+	assert_raw_spin_locked(&grq.lock);
+	set_tsk_need_resched(p);
+}
+#endif
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+	return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+struct migration_req {
+	struct task_struct *task;
+	int dest_cpu;
+};
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+	unsigned long flags;
+	bool running, on_rq;
+	unsigned long ncsw;
+	struct rq *rq;
+
+	for (;;) {
+		/*
+		 * We do the initial early heuristics without holding
+		 * any task-queue locks at all. We'll only try to get
+		 * the runqueue lock when things look like they will
+		 * work out! In the unlikely event rq is dereferenced
+		 * since we're lockless, grab it again.
+		 */
+#ifdef CONFIG_SMP
+retry_rq:
+		rq = task_rq(p);
+		if (unlikely(!rq))
+			goto retry_rq;
+#else /* CONFIG_SMP */
+		rq = task_rq(p);
+#endif
+		/*
+		 * If the task is actively running on another CPU
+		 * still, just relax and busy-wait without holding
+		 * any locks.
+		 *
+		 * NOTE! Since we don't hold any locks, it's not
+		 * even sure that "rq" stays as the right runqueue!
+		 * But we don't care, since this will return false
+		 * if the runqueue has changed and p is actually now
+		 * running somewhere else!
+		 */
+		while (task_running(p) && p == rq->curr) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
+			cpu_relax();
+		}
+
+		/*
+		 * Ok, time to look more closely! We need the grq
+		 * lock now, to be *sure*. If we're wrong, we'll
+		 * just go back and repeat.
+		 */
+		rq = task_grq_lock(p, &flags);
+		trace_sched_wait_task(p);
+		running = task_running(p);
+		on_rq = task_queued(p);
+		ncsw = 0;
+		if (!match_state || p->state == match_state)
+			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+		task_grq_unlock(&flags);
+
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
+		/*
+		 * Was it really running after all now that we
+		 * checked with the proper locks actually held?
+		 *
+		 * Oops. Go back and try again..
+		 */
+		if (unlikely(running)) {
+			cpu_relax();
+			continue;
+		}
+
+		/*
+		 * It's not enough that it's not actively running,
+		 * it must be off the runqueue _entirely_, and not
+		 * preempted!
+		 *
+		 * So if it was still runnable (but just not actively
+		 * running right now), it's preempted, and we should
+		 * yield - it could be a while.
+		 */
+		if (unlikely(on_rq)) {
+			ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+			continue;
+		}
+
+		/*
+		 * Ahh, all good. It wasn't running, and it wasn't
+		 * runnable, which means that it will never become
+		 * running in the future either. We're all done!
+		 */
+		break;
+	}
+
+	return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if ((cpu != smp_processor_id()) && task_curr(p))
+		smp_send_reschedule(cpu);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif
+
+#define rq_idle(rq)	((rq)->rq_prio == PRIO_LIMIT)
+
+/*
+ * RT tasks preempt on priority.
+ * NORMAL tasks preempt on sleep time length.
+ * SCHED_IDLEPRIO don't preempt anything else or
+ * between themselves, they cooperatively multitask. An idle rq scores as
+ * prio PRIO_LIMIT so it is always preempted.
+ */
+static inline bool
+can_preempt(struct task_struct *p, int prio)
+{
+	if (p->prio <= prio)
+		return true;
+	return false;
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Check to see if there is a task that is affined only to offline CPUs but
+ * still wants runtime. This happens to kernel threads during suspend/halt and
+ * disabling of CPUs.
+ */
+#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
+static inline bool online_cpus(struct task_struct *p)
+{
+	return (likely(cpus_intersects(cpu_online_map, p->cpus_allowed)));
+}
+#else /* CONFIG_HOTPLUG_CPU */
+/* All available CPUs are always online without hotplug. */
+static inline bool online_cpus(struct task_struct *p)
+{
+	return true;
+}
+#endif
+
+/*
+ * Check to see if p can run on cpu, and if not, whether there are any online
+ * CPUs it can run on instead.
+ */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+	if (unlikely(!cpu_isset(cpu, p->cpus_allowed)))
+		return true;
+	return false;
+}
+
+/*
+ * When all else is equal, still prefer this_rq.
+ */
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+	struct rq *highest_prio_rq = NULL;
+	int cpu, highest_prio = 0;
+	cpumask_t tmp;
+
+	/*
+	 * We clear the sticky flag here because for a task to have called
+	 * try_preempt with the sticky flag enabled means some complicated
+	 * re-scheduling has occurred and we should ignore the sticky flag.
+	 */
+	clear_sticky(p);
+
+	if (suitable_idle_cpus(p)) {
+		resched_best_idle(p);
+		return;
+	}
+
+	/* IDLEPRIO tasks never preempt anything but idle */
+	if (p->policy == SCHED_IDLEPRIO)
+		return;
+
+	if (likely(online_cpus(p)))
+		cpus_and(tmp, cpu_online_map, p->cpus_allowed);
+	else
+		return;
+
+	for_each_cpu_mask(cpu, tmp) {
+		struct rq *rq;
+		int rq_prio;
+
+		rq = cpu_rq(cpu);
+		rq_prio = rq->rq_prio;
+		if (rq_prio < highest_prio)
+			continue;
+		
+		if (rq_prio > highest_prio) {
+			highest_prio = rq_prio;
+			highest_prio_rq = rq;
+		}
+	}
+
+	if (likely(highest_prio_rq)) {
+		if (can_preempt(p, highest_prio))
+			resched_task(highest_prio_rq->curr);
+	}
+}
+#else /* CONFIG_SMP */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+	return false;
+}
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+	if (p->policy == SCHED_IDLEPRIO)
+		return;
+	if (can_preempt(p, uprq->rq_prio)) 
+		resched_task(current);
+}
+#endif /* CONFIG_SMP */
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+}
+
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+				 bool is_sync)
+{
+	activate_task(p, rq);
+
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption if there are no idle cpus,
+	 * instead waiting for current to deschedule.
+	 */
+	if (!is_sync || suitable_idle_cpus(p)) {
+		try_preempt(p, rq);
+	}
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+					bool success)
+{
+	trace_sched_wakeup(p, success);
+	p->state = TASK_RUNNING;
+
+	/*
+	 * if a worker is waking up, notify workqueue. Note that on BFS, we
+	 * don't really know what cpu it will be, so we fake it for
+	 * wq_worker_waking_up :/
+	 */
+	if ((p->flags & PF_WQ_WORKER) && success)
+		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+#ifdef CONFIG_SMP
+void scheduler_ipi(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
+ */
+static bool try_to_wake_up(struct task_struct *p, unsigned int state,
+			  int wake_flags)
+{
+	bool success = false;
+	unsigned long flags;
+	struct rq *rq;
+	int cpu;
+
+	get_cpu();
+
+	/* This barrier is undocumented, probably for p->state? くそ */
+	smp_wmb();
+
+	/*
+	 * No need to do time_lock_grq as we only need to update the rq clock
+	 * if we activate the task
+	 */
+	rq = task_grq_lock(p, &flags);
+	cpu = task_cpu(p);
+
+	/* state is a volatile long, どうして、分からない */
+	if (!((unsigned int)p->state & state))
+		goto out_unlock;
+
+	if (task_queued(p) || task_running(p))
+		goto out_running;
+
+	ttwu_activate(p, rq, wake_flags & WF_SYNC);
+	success = true;
+
+out_running:
+	ttwu_post_activation(p, rq, success);
+out_unlock:
+	task_grq_unlock(&flags);
+
+	ttwu_stat(p, cpu, wake_flags);
+
+	put_cpu();
+
+	return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with grq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that grq is locked and, @p is not the current task.
+ * grq stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	bool success = false;
+
+	lockdep_assert_held(&grq.lock);
+
+	if (!(p->state & TASK_NORMAL))
+		return;
+
+	if (!task_queued(p)) {
+		ttwu_activate(p, rq, false);
+		ttwu_stat(p, smp_processor_id(), 0);
+		success = true;
+	}
+	ttwu_post_activation(p, rq, success);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process(struct task_struct *p)
+{
+	return try_to_wake_up(p, TASK_ALL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+	return try_to_wake_up(p, state, 0);
+}
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+void sched_fork(struct task_struct *p)
+{
+	struct task_struct *curr;
+	int cpu = get_cpu();
+	struct rq *rq;
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+	INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+	/*
+	 * We mark the process as running here. This guarantees that
+	 * nobody will actually run it, and a signal or other external
+	 * event cannot wake it up and insert it on the runqueue either.
+	 */
+	p->state = TASK_RUNNING;
+	set_task_cpu(p, cpu);
+
+	p->sched_time = p->stime_pc = p->utime_pc = 0;
+
+	/*
+	 * Revert to default priority/policy on fork if requested.
+	 */
+	if (unlikely(p->sched_reset_on_fork)) {
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+			p->policy = SCHED_NORMAL;
+		}
+
+		if (PRIO_TO_NICE(p->static_prio) < 0) {
+			p->static_prio = NICE_TO_PRIO(0);
+		}
+
+		/*
+		 * We don't need the reset flag anymore after the fork. It has
+		 * fulfilled its duty:
+		 */
+		p->sched_reset_on_fork = 0;
+	}
+
+	curr = current;
+	/*
+	 * Make sure we do not leak PI boosting priority to the child.
+	 */
+	p->prio = curr->static_prio;
+	p->tick_used = curr->tick_used;
+
+	INIT_LIST_HEAD(&p->run_list);
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+	if (unlikely(sched_info_on()))
+		memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+
+	p->on_cpu = false;
+	clear_sticky(p);
+
+#ifdef CONFIG_PREEMPT_COUNT
+	/* Want to start with kernel preemption disabled. */
+	task_thread_info(p)->preempt_count = 1;
+#endif
+	if (unlikely(p->policy == SCHED_FIFO))
+		goto out;
+	/*
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness. If it's negative, it won't
+	 * matter since that's the same as being 0. current's time_slice is
+	 * actually in timeslice when it's running, as is its last_ran
+	 * value. 
+	 */
+	rq = task_grq_lock_irq(curr);
+	if (likely(curr->time_slice >= RESCHED_US * 2)) {
+		curr->time_slice /= 2;
+		p->time_slice = curr->time_slice;
+	} else {
+		/*
+		 * Forking task has run out of timeslice. Reschedule it.
+		 */
+		curr->time_slice = 0;
+		set_tsk_need_resched(curr);
+		get_time_slice(p);
+	}
+	p->last_ran = rq->rq_last_ran;
+	task_grq_unlock_irq();
+out:
+	put_cpu();
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+	struct task_struct *parent;
+	unsigned long flags;
+	struct rq *rq;
+
+	rq = task_grq_lock(p, &flags);
+	p->state = TASK_RUNNING;
+	p->tick_used = 0;
+	parent = p->parent;
+	/* Unnecessary but small chance that the parent changed CPU */
+	set_task_cpu(p, task_cpu(parent));
+	activate_task(p, rq);
+	trace_sched_wakeup_new(p, 1);
+	if (rq->curr == parent && !suitable_idle_cpus(p)) {
+		/*
+		 * The VM isn't cloned, so we're in a good position to
+		 * do child-runs-first in anticipation of an exec. This
+		 * usually avoids a lot of COW overhead.
+		 */
+		resched_task(parent);
+	} else
+		try_preempt(p, rq);
+	task_grq_unlock(&flags);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+	hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+	hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+	struct preempt_notifier *notifier;
+	struct hlist_node *node;
+
+	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+		notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+	struct preempt_notifier *notifier;
+	struct hlist_node *node;
+
+	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+		notifier->ops->sched_out(notifier, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+			struct task_struct *next)
+{
+	perf_event_task_sched_out(prev, next);
+	fire_sched_out_preempt_notifiers(prev, next);
+	prepare_lock_switch(rq, next);
+	prepare_arch_switch(next);
+	trace_sched_switch(prev, next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock.  (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
+	__releases(grq.lock)
+{
+	struct mm_struct *mm = rq->prev_mm;
+	long prev_state;
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+	 * schedule one last time. The schedule call will never return, and
+	 * the scheduled task must drop that reference.
+	 * The test for TASK_DEAD must occur while the runqueue locks are
+	 * still held, otherwise prev could be scheduled on another cpu, die
+	 * there before we look at prev->state, and then the reference would
+	 * be dropped twice.
+	 *		Manfred Spraul <manfred@colorfullife.com>
+	 */
+	prev_state = prev->state;
+	finish_arch_switch(prev);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+	perf_event_task_sched_in(prev, current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+	finish_lock_switch(rq, prev);
+
+	fire_sched_in_preempt_notifiers(current);
+	if (mm)
+		mmdrop(mm);
+	if (unlikely(prev_state == TASK_DEAD)) {
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+		 */
+		kprobe_flush_task(prev);
+		put_task_struct(prev);
+	}
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage void schedule_tail(struct task_struct *prev)
+	__releases(grq.lock)
+{
+	struct rq *rq = this_rq();
+
+	finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+	/* In this case, finish_task_switch does not reenable preemption */
+	preempt_enable();
+#endif
+	if (current->set_child_tid)
+		put_user(current->pid, current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new
+ * thread's register state.
+ */
+static inline void
+context_switch(struct rq *rq, struct task_struct *prev,
+		   struct task_struct *next)
+{
+	struct mm_struct *mm, *oldmm;
+	
+	prepare_task_switch(rq, prev, next);
+
+	mm = next->mm;
+	oldmm = prev->active_mm;
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (!mm) {
+		next->active_mm = oldmm;
+		atomic_inc(&oldmm->mm_count);
+		enter_lazy_tlb(oldmm, next);
+	} else
+		switch_mm(oldmm, mm, next);
+
+	if (!prev->mm) {
+		prev->active_mm = NULL;
+		rq->prev_mm = oldmm;
+	}
+	/*
+	 * Since the runqueue lock will be released by the next
+	 * task (which is an invalid locking op but in the case
+	 * of the scheduler it's an obvious special-case), so we
+	 * do an early lockdep release here:
+	 */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
+#endif
+
+	/* Here we just switch the register state and the stack. */
+	switch_to(prev, next, prev);
+
+	barrier();
+	/*
+	 * this_rq must be evaluated again because prev may have moved
+	 * CPUs since it called schedule(), thus the 'rq' on its stack
+	 * frame will be invalid.
+	 */
+	finish_task_switch(this_rq(), prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, current number of uninterruptible-sleeping threads, total
+ * number of context switches performed since bootup. All are measured
+ * without grabbing the grq lock but the occasional inaccurate result
+ * doesn't matter so long as it's positive.
+ */
+unsigned long nr_running(void)
+{
+	long nr = grq.nr_running;
+
+	if (unlikely(nr < 0))
+		nr = 0;
+	return (unsigned long)nr;
+}
+
+unsigned long nr_uninterruptible(void)
+{
+	long nu = grq.nr_uninterruptible;
+
+	if (unlikely(nu < 0))
+		nu = 0;
+	return nu;
+}
+
+unsigned long long nr_context_switches(void)
+{
+	long long ns = grq.nr_switches;
+
+	/* This is of course impossible */
+	if (unlikely(ns < 0))
+		ns = 1;
+	return (unsigned long long)ns;
+}
+
+unsigned long nr_iowait(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+	return sum;
+}
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+	struct rq *this = cpu_rq(cpu);
+	return atomic_read(&this->nr_iowait);
+}
+
+unsigned long nr_active(void)
+{
+	return nr_running() + nr_uninterruptible();
+}
+
+/* Beyond a task running on this CPU, load is equal everywhere on BFS */
+unsigned long this_cpu_load(void)
+{
+	return this_rq()->rq_running +
+		((queued_notrunning() + nr_uninterruptible()) / grq.noc);
+}
+
+/* Variables and functions for calc_load */
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds.
+ */
+void calc_global_load(unsigned long ticks)
+{
+	long active;
+
+	if (time_before(jiffies, calc_load_update))
+		return;
+	active = nr_active() * FIXED_1;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update = jiffies + LOAD_FREQ;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+	__this_cpu_inc(irq_time_seq.sequence);
+	smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+	smp_wmb();
+	__this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	u64 irq_time;
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+		irq_time = per_cpu(cpu_softirq_time, cpu) +
+			   per_cpu(cpu_hardirq_time, cpu);
+	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+	return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+	struct rq *this_rq = cpu_rq(this_cpu);
+
+	return (this_rq->cpu_locality[that_cpu] < 2);
+}
+EXPORT_SYMBOL(cpus_share_cache);
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void account_system_vtime(struct task_struct *curr)
+{
+	unsigned long flags;
+	s64 delta;
+	int cpu;
+
+	if (!sched_clock_irqtime)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+	__this_cpu_add(irq_start_time, delta);
+
+	irq_time_write_begin();
+	/*
+	 * We do not account for softirq time from ksoftirqd here.
+	 * We want to continue accounting softirq time to ksoftirqd thread
+	 * in that case, so as not to confuse scheduler with a special task
+	 * that do not consume any time, but still wants to run.
+	 */
+	if (hardirq_count())
+		__this_cpu_add(cpu_hardirq_time, delta);
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+		__this_cpu_add(cpu_softirq_time, delta);
+
+	irq_time_write_end();
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+	/*
+	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
+	 * this case when a previous update_rq_clock() happened inside a
+	 * {soft,}irq region.
+	 *
+	 * When this happens, we stop ->clock_task and only update the
+	 * prev_irq_time stamp to account for the part that fit, so that a next
+	 * update will consume the rest. This ensures ->clock_task is
+	 * monotonic.
+	 *
+	 * It does however cause some slight miss-attribution of {soft,}irq
+	 * time, a more accurate solution would be to update the irq_time using
+	 * the current rq->clock timestamp, except that would require using
+	 * atomic ops.
+	 */
+	if (irq_delta > delta)
+		irq_delta = delta;
+
+	rq->prev_irq_time += irq_delta;
+	delta -= irq_delta;
+#endif
+	rq->clock_task += delta;
+}
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static void irqtime_account_hi_si(void)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	u64 latest_ns;
+
+	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
+	if (latest_ns > cpustat[CPUTIME_IRQ])
+		cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
+
+	latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
+	if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
+		cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime	(0)
+
+static inline void irqtime_account_hi_si(void)
+{
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/*
+ * On each tick, see what percentage of that tick was attributed to each
+ * component and add the percentage to the _pc values. Once a _pc value has
+ * accumulated one tick's worth, account for that. This means the total
+ * percentage of load components will always be 128 (pseudo 100) per tick.
+ */
+static void pc_idle_time(struct rq *rq, unsigned long pc)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	if (atomic_read(&rq->nr_iowait) > 0) {
+		rq->iowait_pc += pc;
+		if (rq->iowait_pc >= 128) {
+			rq->iowait_pc %= 128;
+			cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy;
+		}
+	} else {
+		rq->idle_pc += pc;
+		if (rq->idle_pc >= 128) {
+			rq->idle_pc %= 128;
+			cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy;
+		}
+	}
+}
+
+static void
+pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
+		   unsigned long pc, unsigned long ns)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+
+	p->stime_pc += pc;
+	if (p->stime_pc >= 128) {
+		p->stime_pc %= 128;
+		p->stime += (__force u64)cputime_one_jiffy;
+		p->stimescaled += one_jiffy_scaled;
+		acct_update_integrals(p);
+	}
+	p->sched_time += ns;
+
+	if (hardirq_count() - hardirq_offset) {
+		rq->irq_pc += pc;
+		if (rq->irq_pc >= 128) {
+			rq->irq_pc %= 128;
+			cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
+		}
+	} else if (in_serving_softirq()) {
+		rq->softirq_pc += pc;
+		if (rq->softirq_pc >= 128) {
+			rq->softirq_pc %= 128;
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
+		}
+	} else {
+		rq->system_pc += pc;
+		if (rq->system_pc >= 128) {
+			rq->system_pc %= 128;
+			cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy;
+		}
+	}
+}
+
+static void pc_user_time(struct rq *rq, struct task_struct *p,
+			 unsigned long pc, unsigned long ns)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+
+	p->utime_pc += pc;
+	if (p->utime_pc >= 128) {
+		p->utime_pc %= 128;
+		p->utime += (__force u64)cputime_one_jiffy;
+		p->utimescaled += one_jiffy_scaled;
+		acct_update_integrals(p);
+	}
+	p->sched_time += ns;
+
+	if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 */
+		rq->softirq_pc += pc;
+		if (rq->softirq_pc >= 128) {
+			rq->softirq_pc %= 128;
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
+		}
+	}
+
+	if (TASK_NICE(p) > 0 || idleprio_task(p)) {
+		rq->nice_pc += pc;
+		if (rq->nice_pc >= 128) {
+			rq->nice_pc %= 128;
+			cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy;
+		}
+	} else {
+		rq->user_pc += pc;
+		if (rq->user_pc >= 128) {
+			rq->user_pc %= 128;
+			cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy;
+		}
+	}
+}
+
+/*
+ * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
+ * shifts instead of 100
+ */
+#define NS_TO_PC(NS)	(NS * 128 / JIFFY_NS)
+
+/*
+ * This is called on clock ticks and on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void
+update_cpu_clock(struct rq *rq, struct task_struct *p)
+{
+	long account_ns = rq->clock - rq->timekeep_clock;
+	struct task_struct *idle = rq->idle;
+	unsigned long account_pc;
+	int user_tick;
+
+	p->last_ran = rq->clock;
+
+	if (unlikely(account_ns < 0))
+		account_ns = 0;
+
+	account_pc = NS_TO_PC(account_ns);
+
+	/* Accurate tick timekeeping */
+	rq->account_pc += account_pc - 128;
+	if (rq->account_pc < 0) {
+		/*
+		 * Small errors in micro accounting may not make the
+		 * accounting add up to 128 each tick so we keep track
+		 * of the percentage and round it up when less than 128
+		 */
+		account_pc += -rq->account_pc;
+		rq->account_pc = 0;
+	}
+
+	user_tick = user_mode(get_irq_regs());
+
+	if (user_tick)
+		pc_user_time(rq, p, account_pc, account_ns);
+	else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
+		pc_system_time(rq, p, HARDIRQ_OFFSET,
+				   account_pc, account_ns);
+	else
+		pc_idle_time(rq, account_pc);
+
+	if (sched_clock_irqtime)
+		irqtime_account_hi_si();
+
+	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
+	if (rq->rq_policy != SCHED_FIFO && p != idle) {
+		s64 time_diff = rq->clock - rq->rq_last_ran;
+
+		niffy_diff(&time_diff, 1);
+		p->time_slice -= NS_TO_US(time_diff);
+	}
+	rq->rq_last_ran = rq->timekeep_clock = rq->clock;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been accounted in
+ * @p in case that task is currently running.
+ *
+ * Called with task_grq_lock() held.
+ */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+	u64 ns = 0;
+
+	if (p == rq->curr) {
+		ns = rq->clock_task - rq->rq_last_ran;
+		if (unlikely((s64)ns < 0))
+			ns = 0;
+	}
+
+	return ns;
+}
+
+unsigned long long task_delta_exec(struct task_struct *p)
+{
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns;
+
+	rq = task_grq_lock(p, &flags);
+	ns = do_task_delta_exec(p, rq);
+	task_grq_unlock(&flags);
+
+	return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns;
+
+	rq = task_grq_lock(p, &flags);
+	ns = p->sched_time + do_task_delta_exec(p, rq);
+	task_grq_unlock(&flags);
+
+	return ns;
+}
+
+/* Compatibility crap for removal */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+			   cputime_t cputime_scaled)
+{
+}
+
+void account_idle_time(cputime_t cputime)
+{
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+				   cputime_t cputime_scaled)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	/* Add guest time to process. */
+	p->utime += (__force u64)cputime;
+	p->utimescaled += (__force u64)cputime_scaled;
+	p->gtime += (__force u64)cputime;
+
+	/* Add guest time to cpustat. */
+	if (TASK_NICE(p) > 0) {
+		cpustat[CPUTIME_NICE] += (__force u64)cputime;
+		cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
+	} else {
+		cpustat[CPUTIME_USER] += (__force u64)cputime;
+		cpustat[CPUTIME_GUEST] += (__force u64)cputime;
+	}
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	/* Add system time to process. */
+	p->stime += (__force u64)cputime;
+	p->stimescaled += (__force u64)cputime_scaled;
+
+	/* Add system time to cpustat. */
+	*target_cputime64 += (__force u64)cputime;
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * This is for guest only now.
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+			 cputime_t cputime, cputime_t cputime_scaled)
+{
+
+	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
+		account_guest_time(p, cputime, cputime_scaled);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @steal: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	cpustat[CPUTIME_STEAL] += (__force u64)cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+static void account_idle_times(cputime_t cputime)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	struct rq *rq = this_rq();
+
+	if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
+	else
+		cpustat[CPUTIME_IDLE] += (__force u64)cputime;
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+	account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+	account_idle_times(jiffies_to_cputime(ticks));
+}
+#endif
+
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
+/* 当前队列时钟的控制 */
+static void task_running_tick(struct rq *rq)
+{
+	struct task_struct *p;
+	p = rq->curr;
+
+	/* SCHED_FIFO tasks never run out of timeslice. */
+	if (rq->rq_policy == SCHED_FIFO)
+		return;
+	if(p->tick_used <= MAX_TICK_USED)
+		p->tick_used++;
+	if (p->time_slice > RESCHED_US)
+		return;
+
+	/* time_slice expired. Grq locked */
+	grq_lock();
+	requeue_task(p);
+	set_tsk_need_resched(p);
+	grq_unlock();
+}
+
+void wake_up_idle_cpu(int cpu);
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled. The data modified is all
+ * local to struct rq so we don't need to grab grq lock.
+ */
+void scheduler_tick(void)
+{
+	int cpu __maybe_unused = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+
+	sched_clock_tick();
+	/* grq lock not grabbed, so only update rq clock */
+	update_rq_clock(rq);
+	update_cpu_clock(rq, rq->curr);
+	if (!rq_idle(rq))
+		task_running_tick(rq);
+	perf_event_task_tick();
+}
+
+notrace unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+void __kprobes add_preempt_count(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+		return;
+#endif
+	preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void __kprobes sub_preempt_count(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+		return;
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+			!(preempt_count() & PREEMPT_MASK)))
+		return;
+#endif
+
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+	preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+#endif
+
+#define prio_raisable(p) (	\
+	p->prio > ((p->static_prio - reverse(MAX_TICK_USED, p->tick_used)) + \
+			(p->sleep_cnt / 3)))
+
+/*
+ * Timeslices below RESCHED_US are considered as good as expired as there's no
+ * point rescheduling when there's so little time left. SCHED_BATCH tasks
+ * have been flagged be not latency sensitive and likely to be fully CPU
+ * bound so every time they're rescheduled they have their time_slice
+ * refilled.
+ */
+static inline bool check_timeslice_end(struct rq *rq, struct task_struct *p)
+{
+	u64 sleep_jiffy = jiffies_64;
+	bool preempt = false;
+	if (p->policy == SCHED_FIFO)
+		goto out;
+	if (p->time_slice < RESCHED_US || batch_task(p)) {
+		if(p->prio > MAX_RT_PRIO) {
+			p->prio++;
+			if(p->prio >= IDLE_PRIO) {
+				p->prio = p->static_prio + 5;
+				if(p->prio >= IDLE_PRIO)
+					p->prio = p->static_prio;
+			}
+		}
+		get_time_slice(p);
+	} else if(p->policy != SCHED_RR) {
+		if(p->state != TASK_RUNNING) {
+			if(p->prio > NORMAL_PRIO) {
+				if(p->sleep_cnt < MAX_SLEEP_COUNT) 
+					p->sleep_cnt++;
+				if(prio_raisable(p)) 
+					p->prio--;
+				if(sleep_jiffy != p->sleep_jiffy) 
+					p->sleep_cnt /= 2;
+				p->sleep_jiffy = sleep_jiffy;
+			}
+		} else {
+			preempt = true;
+		}
+	}
+	if((p->state != TASK_RUNNING) || (p->tick_used >= MAX_TICK_USED)) {
+		p->tick_used /= 2;
+	}
+out:
+	return preempt;
+}
+
+#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
+
+
+/*
+ * 最低位查找,查找最高优先级开始。
+ * Find the lowest bit set in the bitmap.We would find the highest priority first/
+ */
+static inline unsigned long
+get_prio_bit(unsigned long *addr, unsigned long offset)
+{
+	unsigned long *from = addr + (offset / BITS_PER_LONG);
+	unsigned long *limit = addr + PRIO_LIMIT / BITS_PER_LONG;
+	int i = offset % BITS_PER_LONG;
+	
+	if (offset >=  PRIO_LIMIT)
+		return PRIO_LIMIT;
+
+	for(;from != (limit);from++) {
+		for(;i < BITS_PER_LONG;i++, offset++) {
+			if(((*from >> i) & 0x1)) {
+				goto out;
+			}
+		}
+		
+		/*
+		 *  This can make sure to generate the best machine code.
+		 */
+		i = 0;
+	}
+out:
+	return offset;
+}
+
+/*
+ * The currently running task's information is all stored in rq local data
+ * which is only modified by the local CPU, thereby allowing the data to be
+ * changed without grabbing the grq lock.
+ */
+static inline void set_rq_task(struct rq *rq, struct task_struct *p)
+{
+	rq->rq_last_ran = p->last_ran = rq->clock;
+	rq->rq_policy = p->policy;
+	rq->rq_prio = p->prio;
+	if (p != rq->idle)
+		rq->rq_running = true;
+	else
+		rq->rq_running = false;
+}
+
+static void reset_rq_task(struct rq *rq, struct task_struct *p)
+{
+	rq->rq_policy = p->policy;
+	rq->rq_prio = p->prio;
+}
+
+static inline void operate_blk_needs_flush_plug(struct task_struct *p)
+{
+	grq_unlock_irq();
+	sched_preempt_enable_no_resched();
+	blk_schedule_flush_plug(p);
+}
+
+static inline void task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+{
+	/*
+	 * Don't stick tasks when a real time task is going to run as
+	 * they may literally get stuck.
+	 */
+	if (rt_task(next))
+		unstick_task(rq, prev);
+	set_rq_task(rq, next);
+	grq.nr_switches++;
+	prev->on_cpu = false;
+	next->on_cpu = true;
+	rq->curr = next;
+
+	/*
+	 * The context switch have flipped the stack from under us
+	 * and restored the local variables which were saved when
+	 * this task called schedule() in the past. prev == current
+	 * is still correct, but it can be moved to another cpu/rq.
+	 */
+	context_switch(rq, prev, next); /* unlocks the grq */
+}
+
+
+/*
+ * Move a task off the global queue and take it to a cpu for it will
+ * become the running task.
+ */
+static inline void take_task(int cpu, struct task_struct *p)
+{
+	set_task_cpu(p, cpu);
+	dequeue_task(p);
+	clear_sticky(p);
+	dec_qnr();
+}
+
+/*
+ * Put the descheduling task back to grq.
+ */
+static inline void put_prev_task(struct rq *rq, struct task_struct *p, bool deactivate, bool preempt)
+{
+	if(deactivate)
+		deactivate_task(p);
+	else {
+		inc_qnr();
+		if(preempt)
+			enqueue_task_head(p);
+		else
+			enqueue_task(p);
+	}
+}
+
+/*
+ * Task picking for next time to run.
+ */
+static inline struct
+task_struct *get_runnable_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+	struct task_struct *edt = NULL;
+	unsigned long idx = -1;
+
+	do {
+		struct list_head *queue;
+		struct task_struct *p;
+
+		idx = get_prio_bit(grq.prio_bitmap, ++idx);
+		if (idx >= PRIO_LIMIT)
+			return idle;
+		queue = grq.queue + idx;
+
+		list_for_each_entry(p, queue, run_list) {
+			/* Make sure cpu affinity is ok */
+			if (needs_other_cpu(p, cpu))
+				continue;
+			edt = p;
+			goto out_take;
+		}
+	} while (!edt);
+
+out_take:
+	take_task(cpu, edt);
+	return edt;
+}
+
+#define SCHED_RESCHED -1
+
+/*
+ * schedule() is the main scheduler function.
+ */
+static inline int check_sleep_on_wq(int cpu, struct task_struct *p)
+{
+	int deactivate;
+	deactivate = 0;
+	if (p->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+		if (unlikely(signal_pending_state(p->state, p))) {
+			p->state = TASK_RUNNING;
+		} else {
+			deactivate = 1;
+			/*
+			 * If a worker is going to sleep, notify and
+			 * ask workqueue whether it wants to wake up a
+			 * task to maintain concurrency.  If so, wake
+			 * up the task.
+			 */
+			if (p->flags & PF_WQ_WORKER) {
+				struct task_struct *to_wakeup;
+
+				to_wakeup = wq_worker_sleeping(p, cpu);
+				if (to_wakeup) {
+					/* This shouldn't happen, but does */
+					if (unlikely(to_wakeup == p))
+						deactivate = 0;
+					else
+						try_to_wake_up_local(to_wakeup);
+				}
+			}
+
+			/*
+			 * If we are going to sleep and we have plugged IO queued, make
+			 * sure to submit it to avoid deadlocks.
+			 */
+			if (unlikely(deactivate && blk_needs_flush_plug(p))) {
+				operate_blk_needs_flush_plug(p);
+				deactivate = SCHED_RESCHED;
+				goto out;
+			}
+		}
+	}
+out:
+	return deactivate;
+}
+
+static inline int do_schedule(void)
+{
+	struct task_struct *prev, *next, *idle;
+	struct rq *rq;
+	int cpu;
+	int deactivate;
+
+	cpu = smp_processor_id();
+	rq = cpu_rq(cpu);
+	rcu_note_context_switch(cpu);
+	prev = rq->curr;
+
+	grq_lock_irq();
+
+	if((deactivate = check_sleep_on_wq(cpu, prev)) == SCHED_RESCHED) {
+		goto out;
+	}
+
+	clear_tsk_need_resched(prev);
+
+	idle = rq->idle;
+	if (idle != prev) {
+		bool preempt = check_timeslice_end(rq, prev);
+
+		/* Task changed affinity off this CPU */
+		if (needs_other_cpu(prev, cpu))
+			resched_suitable_idle(prev);
+		else if (!deactivate) {
+			if (!queued_notrunning()) {
+				/*
+				* Rerun the prev task again.
+				*/
+				set_rq_task(rq, prev);
+				grq_unlock_irq();
+				goto out;
+			} else
+				swap_sticky(rq, cpu, prev);
+		}
+
+		put_prev_task(rq, prev, deactivate, preempt);
+	}
+
+	if (unlikely(!queued_notrunning())) {
+		/*
+		 * This CPU is now truly idle as opposed to when idle is
+		 * scheduled as a high priority task in its own right.
+		 */
+		next = idle;
+		set_cpuidle_map(cpu);
+	} else {
+		next = get_runnable_task(rq, cpu, idle);
+	}
+
+	if (likely(prev != next)) {
+		prev->nvcsw++;
+		grq.nr_switches++;
+
+		task_switch(rq, prev, next);
+		idle = rq->idle;
+	} else
+		grq_unlock_irq();
+
+out:
+	return deactivate;
+}
+
+asmlinkage void __sched schedule(void)
+{
+reschedule:
+		preempt_disable();
+		
+		if(do_schedule() == SCHED_RESCHED)
+			goto reschedule;
+		
+		sched_preempt_enable_no_resched();
+		if(unlikely(need_resched()))
+			goto reschedule;
+}
+EXPORT_SYMBOL(schedule);
+
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+	sched_preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+}
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+	if (lock->owner != owner)
+		return false;
+
+	/*
+	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
+	 * lock->owner still matches owner, if that fails, owner might
+	 * point to free()d memory, if it still matches, the rcu_read_lock()
+	 * ensures the memory stays valid.
+	 */
+	barrier();
+
+	return owner->on_cpu;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+	rcu_read_lock();
+	while (owner_running(lock, owner)) {
+		if (need_resched())
+			break;
+
+		arch_mutex_cpu_relax();
+	}
+	rcu_read_unlock();
+
+	/*
+	 * We break out the loop above on need_resched() and when the
+	 * owner changed, which is a sign for heavy contention. Return
+	 * success only when lock->owner is NULL.
+	 */
+	return lock->owner == NULL;
+}
+#endif
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage void __sched notrace preempt_schedule(void)
+{
+	struct thread_info *ti = current_thread_info();
+
+	/*
+	 * If there is a non-zero preempt_count or interrupts are disabled,
+	 * we do not want to preempt the current task. Just return..
+	 */
+	if (likely(ti->preempt_count || irqs_disabled()))
+		return;
+
+	do {
+		add_preempt_count_notrace(PREEMPT_ACTIVE);
+		schedule();
+		sub_preempt_count_notrace(PREEMPT_ACTIVE);
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+		barrier();
+	} while (need_resched());
+}
+EXPORT_SYMBOL(preempt_schedule);
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and return with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage void __sched preempt_schedule_irq(void)
+{
+	struct thread_info *ti = current_thread_info();
+
+	/* Catch callers which need to be fixed */
+	BUG_ON(ti->preempt_count || !irqs_disabled());
+
+	do {
+		add_preempt_count(PREEMPT_ACTIVE);
+		local_irq_enable();
+		schedule();
+		local_irq_disable();
+		sub_preempt_count(PREEMPT_ACTIVE);
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+		barrier();
+	} while (need_resched());
+}
+
+#endif /* CONFIG_PREEMPT */
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+			  void *key)
+{
+	return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+/*
+ * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key)
+{
+	struct list_head *tmp, *next;
+
+	list_for_each_safe(tmp, next, &q->task_list) {
+		wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+		unsigned int flags = curr->flags;
+
+		if (curr->func(curr, mode, wake_flags, key) &&
+				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+			break;
+	}
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, void *key)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, 0, key);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+{
+	__wake_up_common(q, mode, nr, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+	__wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronised'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, void *key)
+{
+	unsigned long flags;
+	int wake_flags = WF_SYNC;
+
+	if (unlikely(!q))
+		return;
+
+	if (unlikely(!nr_exclusive))
+		wake_flags = 0;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/**
+ * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronised'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	unsigned long flags;
+	int sync = 1;
+
+	if (unlikely(!q))
+		return;
+
+	if (unlikely(!nr_exclusive))
+		sync = 0;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, sync, NULL);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done++;
+	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done += UINT_MAX/2;
+	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x, long timeout, int state)
+{
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		__add_wait_queue_tail_exclusive(&x->wait, &wait);
+		do {
+			if (signal_pending_state(state, current)) {
+				timeout = -ERESTARTSYS;
+				break;
+			}
+			__set_current_state(state);
+			spin_unlock_irq(&x->wait.lock);
+			timeout = schedule_timeout(timeout);
+			spin_lock_irq(&x->wait.lock);
+		} while (!x->done && timeout);
+		__remove_wait_queue(&x->wait, &wait);
+		if (!x->done)
+			return timeout;
+	}
+	x->done--;
+	return timeout ?: 1;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	timeout = do_wait_for_common(x, timeout, state);
+	spin_unlock_irq(&x->wait.lock);
+	return timeout;
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+	if (t == -ERESTARTSYS)
+		return t;
+	return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+					  unsigned long timeout)
+{
+	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+	if (t == -ERESTARTSYS)
+		return t;
+	return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+					 unsigned long timeout)
+{
+	return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
+ *	try_wait_for_completion - try to decrement a completion without blocking
+ *	@x:	completion structure
+ *
+ *	Returns: 0 if a decrement cannot be done without blocking
+ *		 1 if a decrement succeeded.
+ *
+ *	If a completion is being used as a counting completion,
+ *	attempt to decrement the counter without blocking. This
+ *	enables us to avoid waiting if the resource the completion
+ *	is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+	unsigned long flags;
+	int ret = 1;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	if (!x->done)
+		ret = 0;
+	else
+		x->done--;
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ *	completion_done - Test to see if a completion has any waiters
+ *	@x:	completion structure
+ *
+ *	Returns: 0 if there are waiters (wait_for_completion() in progress)
+ *		 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+	unsigned long flags;
+	int ret = 1;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	if (!x->done)
+		ret = 0;
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(completion_done);
+
+static long __sched
+sleep_on_common(wait_queue_head_t *q, int state, long timeout)
+{
+	unsigned long flags;
+	wait_queue_t wait;
+
+	init_waitqueue_entry(&wait, current);
+
+	__set_current_state(state);
+
+	spin_lock_irqsave(&q->lock, flags);
+	__add_wait_queue(q, &wait);
+	spin_unlock(&q->lock);
+	timeout = schedule_timeout(timeout);
+	spin_lock_irq(&q->lock);
+	__remove_wait_queue(q, &wait);
+	spin_unlock_irqrestore(&q->lock, flags);
+
+	return timeout;
+}
+
+void __sched interruptible_sleep_on(wait_queue_head_t *q)
+{
+	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(interruptible_sleep_on);
+
+long __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+
+void __sched sleep_on(wait_queue_head_t *q)
+{
+	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(sleep_on);
+
+long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(sleep_on_timeout);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+	unsigned long flags;
+	int queued, oldprio;
+	struct rq *rq;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_grq_lock(p, &flags);
+
+	trace_sched_pi_setprio(p, prio);
+	oldprio = p->prio;
+	queued = task_queued(p);
+	p->prio = prio;
+	if (task_running(p) && prio > oldprio)
+		resched_task(p);
+	if (queued) {
+		try_preempt(p, rq);
+	}
+
+	task_grq_unlock(&flags);
+}
+
+#endif
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+	int queued, new_static, old_static;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+		return;
+	new_static = NICE_TO_PRIO(nice);
+	/*
+	 * We have to be careful, if called from sys_setpriority(),
+	 * the task might be in the middle of scheduling on another CPU.
+	 */
+	rq = time_task_grq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via sched_setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it wont have any effect on scheduling until the task is
+	 * not SCHED_NORMAL/SCHED_BATCH:
+	 */
+	if (has_rt_policy(p)) {
+		p->static_prio = new_static;
+		goto out_unlock;
+	}
+	queued = task_queued(p);
+
+	old_static = p->static_prio;
+	p->static_prio = new_static;
+	p->prio = p->static_prio;
+
+	if (queued) {
+		if (new_static < old_static)
+			try_preempt(p, rq);
+	} else if (task_running(p)) {
+		reset_rq_task(rq, p);
+		if (old_static < new_static)
+			resched_task(p);
+	}
+out_unlock:
+	task_grq_unlock(&flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+	/* convert nice value [19,-20] to rlimit style value [1,40] */
+	int nice_rlim = 20 - nice;
+
+	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+		capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+	long nice, retval;
+
+	/*
+	 * Setpriority might change our priority at the same moment.
+	 * We don't have to worry. Conceptually one call occurs first
+	 * and we have a single winner.
+	 */
+	if (increment < -40)
+		increment = -40;
+	if (increment > 40)
+		increment = 40;
+
+	nice = TASK_NICE(current) + increment;
+	if (nice < -20)
+		nice = -20;
+	if (nice > 19)
+		nice = 19;
+
+	if (increment < 0 && !can_nice(current, nice))
+		return -EPERM;
+
+	retval = security_task_setnice(current, nice);
+	if (retval)
+		return retval;
+
+	set_user_nice(current, nice);
+	return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks are offset by -100. Normal tasks are centered around 1.
+ */
+int task_prio(const struct task_struct *p)
+{
+	return p->prio - NORMAL_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(const struct task_struct *p)
+{
+	return TASK_NICE(p);
+}
+EXPORT_SYMBOL_GPL(task_nice);
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ */
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ */
+struct task_struct *idle_task(int cpu)
+{
+	return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ */
+static inline struct task_struct *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold grq lock. */
+static void
+__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio)
+{
+	int oldrtprio, oldprio;
+
+	p->policy = policy;
+	oldrtprio = p->rt_priority;
+	p->rt_priority = prio;
+	oldprio = p->prio;
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	if (task_running(p)) {
+		reset_rq_task(rq, p);
+		/* Resched only if we might now be preempted */
+		if (p->prio > oldprio || p->rt_priority > oldrtprio)
+			resched_task(p);
+	}
+}
+
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred;
+	bool match;
+
+	rcu_read_lock();
+	pcred = __task_cred(p);
+	if (cred->user->user_ns == pcred->user->user_ns)
+		match = (cred->euid == pcred->euid ||
+			 cred->euid == pcred->uid);
+	else
+		match = false;
+	rcu_read_unlock();
+	return match;
+}
+
+static int __sched_setscheduler(struct task_struct *p, int policy,
+				const struct sched_param *param, bool user)
+{
+	struct sched_param zero_param = { .sched_priority = 0 };
+	int queued, retval, oldpolicy = -1;
+	unsigned long flags, rlim_rtprio = 0;
+	int reset_on_fork;
+	struct rq *rq;
+
+	/* may grab non-irq protected spin_locks */
+	BUG_ON(in_interrupt());
+
+	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+		unsigned long lflags;
+
+		if (!lock_task_sighand(p, &lflags))
+			return -ESRCH;
+		rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+		unlock_task_sighand(p, &lflags);
+		if (rlim_rtprio)
+			goto recheck;
+		param = &zero_param;
+	}
+recheck:
+	/* double check policy once rq lock held */
+	if (policy < 0) {
+		reset_on_fork = p->sched_reset_on_fork;
+		policy = oldpolicy = p->policy;
+	} else {
+		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+		policy &= ~SCHED_RESET_ON_FORK;
+
+		if (!SCHED_RANGE(policy))
+			return -EINVAL;
+	}
+
+	/*
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+	 * SCHED_BATCH is 0.
+	 */
+	if (param->sched_priority < 0 ||
+		(p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
+		(!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
+		return -EINVAL;
+	if (is_rt_policy(policy) != (param->sched_priority != 0))
+		return -EINVAL;
+
+	/*
+	 * Allow unprivileged RT tasks to decrease priority:
+	 */
+	if (user && !capable(CAP_SYS_NICE)) {
+		if (is_rt_policy(policy)) {
+			unsigned long rlim_rtprio =
+					task_rlimit(p, RLIMIT_RTPRIO);
+
+			/* can't set/change the rt policy */
+			if (policy != p->policy && !rlim_rtprio)
+				return -EPERM;
+
+			/* can't increase priority */
+			if (param->sched_priority > p->rt_priority &&
+				param->sched_priority > rlim_rtprio)
+				return -EPERM;
+		} else {
+			switch (p->policy) {
+				case SCHED_BATCH:
+					if (policy == SCHED_BATCH)
+						goto out;
+					if (policy != SCHED_IDLEPRIO)
+						return -EPERM;
+					break;
+				case SCHED_IDLEPRIO:
+					if (policy == SCHED_IDLEPRIO)
+						goto out;
+					return -EPERM;
+				default:
+					break;
+			}
+		}
+
+		/* can't change other user's priorities */
+		if (!check_same_owner(p))
+			return -EPERM;
+
+		/* Normal users shall not reset the sched_reset_on_fork flag */
+		if (p->sched_reset_on_fork && !reset_on_fork)
+			return -EPERM;
+	}
+
+	if (user) {
+		retval = security_task_setscheduler(p);
+		if (retval)
+			return retval;
+	}
+
+	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	/*
+	 * To be able to change p->policy safely, the grunqueue lock must be
+	 * held.
+	 */
+	rq = __task_grq_lock(p);
+
+	/*
+	 * Changing the policy of the stop threads its a very bad idea
+	 */
+	if (p == rq->stop) {
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return -EINVAL;
+	}
+
+	/*
+	 * If not changing anything there's no need to proceed further:
+	 */
+	if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
+			param->sched_priority == p->rt_priority))) {
+
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return 0;
+	}
+
+	/* recheck policy now with rq lock held */
+	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+		policy = oldpolicy = -1;
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		goto recheck;
+	}
+	p->sched_reset_on_fork = reset_on_fork;
+
+	queued = task_queued(p);
+	__setscheduler(p, rq, policy, param->sched_priority);
+	if (queued) {
+		try_preempt(p, rq);
+	}
+	__task_grq_unlock();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	rt_mutex_adjust_pi(p);
+out:
+	return 0;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+			   const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true);
+}
+
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+				   const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false);
+}
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	struct sched_param lparam;
+	struct task_struct *p;
+	int retval;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setscheduler(p, policy, &lparam);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+					   struct sched_param __user *param)
+{
+	/* negative values for policy are not valid */
+	if (policy < 0)
+		return -EINVAL;
+
+	return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+	return do_sched_setscheduler(pid, -1, param);
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy;
+	}
+	rcu_read_unlock();
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+	struct sched_param lp;
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (!param || pid < 0)
+		goto out_nounlock;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	lp.sched_priority = p->rt_priority;
+	rcu_read_unlock();
+
+	/*
+	 * This one might sleep, we cannot do it with a spinlock held ...
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+	cpumask_var_t cpus_allowed, new_mask;
+	struct task_struct *p;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		rcu_read_unlock();
+		put_online_cpus();
+		return -ESRCH;
+	}
+
+	/* Prevent p going away */
+	get_task_struct(p);
+	rcu_read_unlock();
+
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_put_task;
+	}
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_free_cpus_allowed;
+	}
+	retval = -EPERM;
+	if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
+		goto out_unlock;
+
+	retval = security_task_setscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	cpumask_and(new_mask, in_mask, cpus_allowed);
+again:
+	retval = set_cpus_allowed_ptr(p, new_mask);
+
+	if (!retval) {
+		cpuset_cpus_allowed(p, cpus_allowed);
+		if (!cpumask_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			cpumask_copy(new_mask, cpus_allowed);
+			goto again;
+		}
+	}
+out_unlock:
+	free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+	free_cpumask_var(cpus_allowed);
+out_put_task:
+	put_task_struct(p);
+	put_online_cpus();
+	return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+				 cpumask_t *new_mask)
+{
+	if (len < sizeof(cpumask_t)) {
+		memset(new_mask, 0, sizeof(cpumask_t));
+	} else if (len > sizeof(cpumask_t)) {
+		len = sizeof(cpumask_t);
+	}
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	cpumask_var_t new_mask;
+	int retval;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+	if (retval == 0)
+		retval = sched_setaffinity(pid, new_mask);
+	free_cpumask_var(new_mask);
+	return retval;
+}
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+	struct task_struct *p;
+	unsigned long flags;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	grq_lock_irqsave(&flags);
+	cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask);
+	grq_unlock_irqrestore(&flags);
+
+out_unlock:
+	rcu_read_unlock();
+	put_online_cpus();
+
+	return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	int ret;
+	cpumask_var_t mask;
+
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(unsigned long)-1))
+		return -EINVAL;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = sched_getaffinity(pid, mask);
+	if (ret == 0) {
+		size_t retlen = min_t(size_t, len, cpumask_size());
+
+		if (copy_to_user(user_mask_ptr, mask, retlen))
+			ret = -EFAULT;
+		else
+			ret = retlen;
+	}
+	free_cpumask_var(mask);
+
+	return ret;
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. It does this by
+ * scheduling away the current task. 
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+	struct task_struct *p;
+
+	p = current;
+	grq_lock_irq();
+	requeue_task(p);
+
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt or enable interrupts:
+	 */
+	__release(grq.lock);
+	spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
+	do_raw_spin_unlock(&grq.lock);
+	sched_preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+
+static inline bool should_resched(void)
+{
+	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
+static void __cond_resched(void)
+{
+	/* NOT a real fix but will make voluntary preempt work. 馬鹿な事 */
+	if (unlikely(system_state != SYSTEM_RUNNING))
+		return;
+
+	add_preempt_count(PREEMPT_ACTIVE);
+	schedule();
+	sub_preempt_count(PREEMPT_ACTIVE);
+}
+
+int __sched _cond_resched(void)
+{
+	if (should_resched()) {
+		__cond_resched();
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+	int resched = should_resched();
+	int ret = 0;
+
+	lockdep_assert_held(lock);
+
+	if (spin_needbreak(lock) || resched) {
+		spin_unlock(lock);
+		if (resched)
+			__cond_resched();
+		else
+			cpu_relax();
+		ret = 1;
+		spin_lock(lock);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
+
+int __sched __cond_resched_softirq(void)
+{
+	BUG_ON(!in_softirq());
+
+	if (should_resched()) {
+		local_bh_enable();
+		__cond_resched();
+		local_bh_disable();
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * This is a shortcut for kernel-space yielding - it marks the
+ * thread runnable and calls sys_sched_yield().
+ */
+void __sched yield(void)
+{
+	set_current_state(TASK_RUNNING);
+	sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+	unsigned long flags;
+	bool yielded = 0;
+	struct rq *rq;
+	struct task_struct *curr;
+
+	rq = this_rq();
+	grq_lock_irqsave(&flags);
+	if (task_running(p) || p->state)
+		goto out_unlock;
+	yielded = 1;
+	curr = rq->curr;
+	p->time_slice += curr->time_slice;
+	curr->time_slice = 0;
+	if (p->time_slice > timeslice())
+		p->time_slice = timeslice();
+	set_tsk_need_resched(curr);
+out_unlock:
+	grq_unlock_irqrestore(&flags);
+
+	if (yielded)
+		schedule();
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/*
+ * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+void __sched io_schedule(void)
+{
+	struct rq *rq = raw_rq();
+
+	delayacct_blkio_start();
+	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
+	current->in_iowait = 1;
+	schedule();
+	current->in_iowait = 0;
+	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
+}
+EXPORT_SYMBOL(io_schedule);
+
+long __sched io_schedule_timeout(long timeout)
+{
+	struct rq *rq = raw_rq();
+	long ret;
+
+	delayacct_blkio_start();
+	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
+	current->in_iowait = 1;
+	ret = schedule_timeout(timeout);
+	current->in_iowait = 0;
+	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the maximum rt_priority that can be used
+ * by a given scheduling class.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = MAX_USER_RT_PRIO-1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_IDLEPRIO:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the minimum rt_priority that can be used
+ * by a given scheduling class.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = 1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_IDLEPRIO:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
+{
+	struct task_struct *p;
+	unsigned int time_slice;
+	unsigned long flags;
+	int retval;
+	struct timespec t;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	grq_lock_irqsave(&flags);
+	time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(rr_interval);
+	grq_unlock_irqrestore(&flags);
+
+	rcu_read_unlock();
+	t = ns_to_timespec(time_slice);
+	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
+void sched_show_task(struct task_struct *p)
+{
+	unsigned long free = 0;
+	unsigned state;
+
+	state = p->state ? __ffs(p->state) + 1 : 0;
+	printk(KERN_INFO "%-15.15s %c", p->comm,
+		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if BITS_PER_LONG == 32
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT " running  ");
+	else
+		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+#else
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT "  running task	");
+	else
+		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	free = stack_not_used(p);
+#endif
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
+
+	show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+	struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+	printk(KERN_INFO
+		"  task				PC stack   pid father\n");
+#else
+	printk(KERN_INFO
+		"  task						PC stack   pid father\n");
+#endif
+	rcu_read_lock();
+	do_each_thread(g, p) {
+		/*
+		 * reset the NMI-timeout, listing all files on a slow
+		 * console might take a lot of time:
+		 */
+		touch_nmi_watchdog();
+		if (!state_filter || (p->state & state_filter))
+			sched_show_task(p);
+	} while_each_thread(g, p);
+
+	touch_all_softlockup_watchdogs();
+
+	rcu_read_unlock();
+	/*
+	 * Only show locks if all tasks are dumped:
+	 */
+	if (!state_filter)
+		debug_show_all_locks();
+}
+
+#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	cpumask_copy(tsk_cpus_allowed(p), new_mask);
+}
+#endif
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void init_idle(struct task_struct *idle, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	time_grq_lock(rq, &flags);
+	idle->last_ran = rq->clock;
+	idle->state = TASK_RUNNING;
+	/* Setting prio to illegal value shouldn't matter when never queued */
+	idle->prio = PRIO_LIMIT;
+	idle->policy = SCHED_IDLE;
+	set_rq_task(rq, idle);
+	do_set_cpus_allowed(idle, &cpumask_of_cpu(cpu));
+	/* Silence PROVE_RCU */
+	rcu_read_lock();
+	set_task_cpu(idle, cpu);
+	rcu_read_unlock();
+	rq->curr = rq->idle = idle;
+	idle->on_cpu = 1;
+	grq_unlock_irqrestore(&flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+	task_thread_info(idle)->preempt_count = 0;
+
+	ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ
+void select_nohz_load_balancer(int stop_tick)
+{
+}
+
+void set_cpu_sd_state_idle(void) {}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+#endif /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+
+static inline void resched_cpu(int cpu)
+{
+	unsigned long flags;
+
+	grq_lock_irqsave(&flags);
+	resched_task(cpu_curr(cpu));
+	grq_unlock_irqrestore(&flags);
+}
+
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu.  This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+	struct sched_domain *sd;
+
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (!idle_cpu(i))
+				cpu = i;
+			goto unlock;
+		}
+	}
+unlock:
+	rcu_read_unlock();
+	return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+	struct task_struct *idle;
+	struct rq *rq;
+
+	if (cpu == smp_processor_id())
+		return;
+
+	rq = cpu_rq(cpu);
+	idle = rq->idle;
+
+	/*
+	 * This is safe, as this function is called with the timer
+	 * wheel base lock of (cpu) held. When the CPU is on the way
+	 * to idle and has not yet set rq->curr to idle then it will
+	 * be serialised on the timer wheel base lock and take the new
+	 * timer into account automatically.
+	 */
+	if (unlikely(rq->curr != idle))
+		return;
+
+	/*
+	 * We can set TIF_RESCHED on the idle task of the other CPU
+	 * lockless. The worst case is that the other CPU runs the
+	 * idle task through an additional NOOP schedule()
+	 */
+	set_tsk_need_resched(idle);
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(idle))
+		smp_send_reschedule(cpu);
+}
+
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	bool running_wrong = false;
+	bool queued = false;
+	unsigned long flags;
+	struct rq *rq;
+	int ret = 0;
+
+	rq = task_grq_lock(p, &flags);
+
+	if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	queued = task_queued(p);
+
+	do_set_cpus_allowed(p, new_mask);
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	if (task_running(p)) {
+		/* Task is running on the wrong cpu now, reschedule it. */
+		if (rq == this_rq()) {
+			set_tsk_need_resched(p);
+			running_wrong = true;
+		} else
+			resched_task(p);
+	} else
+		set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask));
+
+out:
+	if (queued)
+		try_preempt(p, rq);
+	task_grq_unlock(&flags);
+
+	if (running_wrong)
+		_cond_resched();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* Run through task list and find tasks affined to just the dead cpu, then
+ * allocate a new affinity */
+static void break_sole_affinity(int src_cpu, struct task_struct *idle)
+{
+	struct task_struct *p, *t;
+
+	do_each_thread(t, p) {
+		if (p != idle && !online_cpus(p)) {
+			cpumask_copy(tsk_cpus_allowed(p), cpu_possible_mask);
+			/*
+			 * Don't tell them about moving exiting tasks or
+			 * kernel threads (both mm NULL), since they never
+			 * leave kernel.
+			 */
+			if (p->mm && printk_ratelimit()) {
+				printk(KERN_INFO "process %d (%s) no "
+					   "longer affine to cpu %d\n",
+					   task_pid_nr(p), p->comm, src_cpu);
+			}
+		}
+		clear_sticky(p);
+	} while_each_thread(t, p);
+}
+
+/*
+ * Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible.
+ * Used by CPU offline code.
+ */
+void sched_idle_next(struct rq *rq, int this_cpu, struct task_struct *idle)
+{
+	/* cpu has to be offline */
+	BUG_ON(cpu_online(this_cpu));
+
+	__setscheduler(idle, rq, SCHED_FIFO, STOP_PRIO);
+
+	activate_idle_task(idle);
+	set_tsk_need_resched(rq->curr);
+}
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+	struct mm_struct *mm = current->active_mm;
+
+	BUG_ON(cpu_online(smp_processor_id()));
+
+	if (mm != &init_mm)
+		switch_mm(mm, &init_mm, current);
+	mmdrop(mm);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+	struct sched_param stop_param = { .sched_priority = STOP_PRIO };
+	struct sched_param start_param = { .sched_priority = MAX_USER_RT_PRIO - 1 };
+	struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+	if (stop) {
+		/*
+		 * Make it appear like a SCHED_FIFO task, its something
+		 * userspace knows about and won't get confused about.
+		 *
+		 * Also, it will make PI more or less work without too
+		 * much confusion -- but then, stop work should not
+		 * rely on PI working anyway.
+		 */
+		sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
+	}
+
+	cpu_rq(cpu)->stop = stop;
+
+	if (old_stop) {
+		/*
+		 * Reset it back to a normal rt scheduling prio so that
+		 * it can die in pieces.
+		 */
+		sched_setscheduler_nocheck(old_stop, SCHED_FIFO, &start_param);
+	}
+}
+
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+	{
+		.procname	= "sched_domain",
+		.mode		= 0555,
+	},
+	{}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= sd_ctl_dir,
+	},
+	{}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+	struct ctl_table *entry =
+		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+	return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+	struct ctl_table *entry;
+
+	/*
+	 * In the intermediate directories, both the child directory and
+	 * procname are dynamically allocated and could fail but the mode
+	 * will always be set. In the lowest directory the names are
+	 * static strings and all have proc handlers.
+	 */
+	for (entry = *tablep; entry->mode; entry++) {
+		if (entry->child)
+			sd_free_ctl_entry(&entry->child);
+		if (entry->proc_handler == NULL)
+			kfree(entry->procname);
+	}
+
+	kfree(*tablep);
+	*tablep = NULL;
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+		const char *procname, void *data, int maxlen,
+		mode_t mode, proc_handler *proc_handler)
+{
+	entry->procname = procname;
+	entry->data = data;
+	entry->maxlen = maxlen;
+	entry->mode = mode;
+	entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+	struct ctl_table *table = sd_alloc_ctl_entry(13);
+
+	if (table == NULL)
+		return NULL;
+
+	set_table_entry(&table[0], "min_interval", &sd->min_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[1], "max_interval", &sd->max_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[9], "cache_nice_tries",
+		&sd->cache_nice_tries,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[10], "flags", &sd->flags,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[11], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[12] is terminator */
+
+	return table;
+}
+
+static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+	struct ctl_table *entry, *table;
+	struct sched_domain *sd;
+	int domain_num = 0, i;
+	char buf[32];
+
+	for_each_domain(cpu, sd)
+		domain_num++;
+	entry = table = sd_alloc_ctl_entry(domain_num + 1);
+	if (table == NULL)
+		return NULL;
+
+	i = 0;
+	for_each_domain(cpu, sd) {
+		snprintf(buf, 32, "domain%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_domain_table(sd);
+		entry++;
+		i++;
+	}
+	return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void register_sched_domain_sysctl(void)
+{
+	int i, cpu_num = num_possible_cpus();
+	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+	char buf[32];
+
+	WARN_ON(sd_ctl_dir[0].child);
+	sd_ctl_dir[0].child = entry;
+
+	if (entry == NULL)
+		return;
+
+	for_each_possible_cpu(i) {
+		snprintf(buf, 32, "cpu%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_cpu_table(i);
+		entry++;
+	}
+
+	WARN_ON(sd_sysctl_header);
+	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+static void unregister_sched_domain_sysctl(void)
+{
+	if (sd_sysctl_header)
+		unregister_sysctl_table(sd_sysctl_header);
+	sd_sysctl_header = NULL;
+	if (sd_ctl_dir[0].child)
+		sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#else
+static void register_sched_domain_sysctl(void)
+{
+}
+static void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
+static void set_rq_online(struct rq *rq)
+{
+	if (!rq->online) {
+		cpumask_set_cpu(cpu_of(rq), rq->rd->online);
+		rq->online = true;
+	}
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+	if (rq->online) {
+		cpumask_clear_cpu(cpu_of(rq), rq->rd->online);
+		rq->online = false;
+	}
+}
+
+/*
+ * migration_call - callback that gets triggered when a CPU is added.
+ */
+static int __cpuinit
+migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int cpu = (long)hcpu;
+	unsigned long flags;
+	struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	struct task_struct *idle = rq->idle;
+#endif
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+
+	case CPU_UP_PREPARE:
+		break;
+
+	case CPU_ONLINE:
+		/* Update our root-domain */
+		grq_lock_irqsave(&flags);
+		if (rq->rd) {
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+
+			set_rq_online(rq);
+		}
+		grq.noc = num_online_cpus();
+		grq_unlock_irqrestore(&flags);
+		break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		/* Idle task back to normal (off runqueue, low prio) */
+		grq_lock_irq();
+		put_prev_task(rq, idle, true, false);
+		idle->static_prio = MAX_PRIO;
+		__setscheduler(idle, rq, SCHED_NORMAL, 0);
+		idle->prio = PRIO_LIMIT;
+		set_rq_task(rq, idle);
+		grq_unlock_irq();
+		break;
+
+	case CPU_DYING:
+		/* Update our root-domain */
+		grq_lock_irqsave(&flags);
+		sched_idle_next(rq, cpu, idle);
+		if (rq->rd) {
+			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+			set_rq_offline(rq);
+		}
+		break_sole_affinity(cpu, idle);
+		grq.noc = num_online_cpus();
+		grq_unlock_irqrestore(&flags);
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
+ */
+static struct notifier_block __cpuinitdata migration_notifier = {
+	.notifier_call = migration_call,
+	.priority = CPU_PRI_MIGRATION,
+};
+
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+					  unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		set_cpu_active((long)hcpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+int __init migration_init(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	int err;
+
+	/* Initialise migration for the boot CPU */
+	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	BUG_ON(err == NOTIFY_BAD);
+	migration_call(&migration_notifier, CPU_ONLINE, cpu);
+	register_cpu_notifier(&migration_notifier);
+
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
+	return 0;
+}
+early_initcall(migration_init);
+#endif
+
+#ifdef CONFIG_SMP
+
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_domain_debug_enabled;
+
+static int __init sched_domain_debug_setup(char *str)
+{
+	sched_domain_debug_enabled = 1;
+
+	return 0;
+}
+early_param("sched_debug", sched_domain_debug_setup);
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+				  struct cpumask *groupmask)
+{
+	struct sched_group *group = sd->groups;
+	char str[256];
+
+	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+	cpumask_clear(groupmask);
+
+	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+	if (!(sd->flags & SD_LOAD_BALANCE)) {
+		printk("does not load-balance\n");
+		if (sd->parent)
+			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+					" has parent");
+		return -1;
+	}
+
+	printk(KERN_CONT "span %s level %s\n", str, sd->name);
+
+	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+		printk(KERN_ERR "ERROR: domain->span does not contain "
+				"CPU%d\n", cpu);
+	}
+	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+		printk(KERN_ERR "ERROR: domain->groups does not contain"
+				" CPU%d\n", cpu);
+	}
+
+	printk(KERN_DEBUG "%*s groups:", level + 1, "");
+	do {
+		if (!group) {
+			printk("\n");
+			printk(KERN_ERR "ERROR: group is NULL\n");
+			break;
+		}
+
+		if (!group->sgp->power) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: domain->cpu_power not "
+					"set\n");
+			break;
+		}
+
+		if (!cpumask_weight(sched_group_cpus(group))) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: empty group\n");
+			break;
+		}
+
+		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: repeated CPUs\n");
+			break;
+		}
+
+		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+
+		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
+
+		printk(KERN_CONT " %s", str);
+		if (group->sgp->power != SCHED_POWER_SCALE) {
+			printk(KERN_CONT " (cpu_power = %d)",
+				group->sgp->power);
+		}
+
+		group = group->next;
+	} while (group != sd->groups);
+	printk(KERN_CONT "\n");
+
+	if (!cpumask_equal(sched_domain_span(sd), groupmask))
+		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+	if (sd->parent &&
+		!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+		printk(KERN_ERR "ERROR: parent span is not a superset "
+			"of domain->span\n");
+	return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+	int level = 0;
+
+	if (!sched_domain_debug_enabled)
+		return;
+
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+
+	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+	for (;;) {
+		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+			break;
+		level++;
+		sd = sd->parent;
+		if (!sd)
+			break;
+	}
+}
+#else /* !CONFIG_SCHED_DEBUG */
+# define sched_domain_debug(sd, cpu) do { } while (0)
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+	if (cpumask_weight(sched_domain_span(sd)) == 1)
+		return 1;
+
+	/* Following flags need at least 2 groups */
+	if (sd->flags & (SD_LOAD_BALANCE |
+			 SD_BALANCE_NEWIDLE |
+			 SD_BALANCE_FORK |
+			 SD_BALANCE_EXEC |
+			 SD_SHARE_CPUPOWER |
+			 SD_SHARE_PKG_RESOURCES)) {
+		if (sd->groups != sd->groups->next)
+			return 0;
+	}
+
+	/* Following flags don't use groups */
+	if (sd->flags & (SD_WAKE_AFFINE))
+		return 0;
+
+	return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+	unsigned long cflags = sd->flags, pflags = parent->flags;
+
+	if (sd_degenerate(parent))
+		return 1;
+
+	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+		return 0;
+
+	/* Flags needing groups don't count if only 1 group in parent */
+	if (parent->groups == parent->groups->next) {
+		pflags &= ~(SD_LOAD_BALANCE |
+				SD_BALANCE_NEWIDLE |
+				SD_BALANCE_FORK |
+				SD_BALANCE_EXEC |
+				SD_SHARE_CPUPOWER |
+				SD_SHARE_PKG_RESOURCES);
+		if (nr_node_ids == 1)
+			pflags &= ~SD_SERIALIZE;
+	}
+	if (~cflags & pflags)
+		return 0;
+
+	return 1;
+}
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+	cpupri_cleanup(&rd->cpupri);
+	free_cpumask_var(rd->rto_mask);
+	free_cpumask_var(rd->online);
+	free_cpumask_var(rd->span);
+	kfree(rd);
+}
+
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+	struct root_domain *old_rd = NULL;
+	unsigned long flags;
+
+	grq_lock_irqsave(&flags);
+
+	if (rq->rd) {
+		old_rd = rq->rd;
+
+		if (cpumask_test_cpu(rq->cpu, old_rd->online))
+			set_rq_offline(rq);
+
+		cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+		/*
+		 * If we dont want to free the old_rt yet then
+		 * set old_rd to NULL to skip the freeing later
+		 * in this function:
+		 */
+		if (!atomic_dec_and_test(&old_rd->refcount))
+			old_rd = NULL;
+	}
+
+	atomic_inc(&rd->refcount);
+	rq->rd = rd;
+
+	cpumask_set_cpu(rq->cpu, rd->span);
+	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+		set_rq_online(rq);
+
+	grq_unlock_irqrestore(&flags);
+
+	if (old_rd)
+		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+	memset(rd, 0, sizeof(*rd));
+
+	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+		goto out;
+	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+		goto free_span;
+	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+		goto free_online;
+
+	if (cpupri_init(&rd->cpupri) != 0)
+		goto free_rto_mask;
+	return 0;
+
+free_rto_mask:
+	free_cpumask_var(rd->rto_mask);
+free_online:
+	free_cpumask_var(rd->online);
+free_span:
+	free_cpumask_var(rd->span);
+out:
+	return -ENOMEM;
+}
+
+static void init_defrootdomain(void)
+{
+	init_rootdomain(&def_root_domain);
+
+	atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+	struct root_domain *rd;
+
+	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return NULL;
+
+	if (init_rootdomain(rd) != 0) {
+		kfree(rd);
+		return NULL;
+	}
+
+	return rd;
+}
+
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+	struct sched_group *tmp, *first;
+
+	if (!sg)
+		return;
+
+	first = sg;
+	do {
+		tmp = sg->next;
+
+		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+			kfree(sg->sgp);
+
+		kfree(sg);
+		sg = tmp;
+	} while (sg != first);
+}
+
+static void free_sched_domain(struct rcu_head *rcu)
+{
+	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+	/*
+	 * If its an overlapping domain it has private groups, iterate and
+	 * nuke them all.
+	 */
+	if (sd->flags & SD_OVERLAP) {
+		free_sched_groups(sd->groups, 1);
+	} else if (atomic_dec_and_test(&sd->groups->ref)) {
+		kfree(sd->groups->sgp);
+		kfree(sd->groups);
+	}
+	kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+	call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+	for (; sd; sd = sd->parent)
+		destroy_sched_domain(sd, cpu);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct sched_domain *tmp;
+
+	/* Remove the sched domains which do not contribute to scheduling. */
+	for (tmp = sd; tmp; ) {
+		struct sched_domain *parent = tmp->parent;
+		if (!parent)
+			break;
+
+		if (sd_parent_degenerate(tmp, parent)) {
+			tmp->parent = parent->parent;
+			if (parent->parent)
+				parent->parent->child = tmp;
+			destroy_sched_domain(parent, cpu);
+		} else
+			tmp = tmp->parent;
+	}
+
+	if (sd && sd_degenerate(sd)) {
+		tmp = sd;
+		sd = sd->parent;
+		destroy_sched_domain(tmp, cpu);
+		if (sd)
+			sd->child = NULL;
+	}
+
+	sched_domain_debug(sd, cpu);
+
+	rq_attach_root(rq, rd);
+	tmp = rq->sd;
+	rcu_assign_pointer(rq->sd, sd);
+	destroy_sched_domains(tmp, cpu);
+}
+
+/* cpus with isolated domains */
+static cpumask_var_t cpu_isolated_map;
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	cpulist_parse(str, cpu_isolated_map);
+	return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
+
+#define SD_NODES_PER_DOMAIN 16
+
+#ifdef CONFIG_NUMA
+
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, nodemask_t *used_nodes)
+{
+	int i, n, val, min_val, best_node = -1;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		/* Start at @node */
+		n = (node + i) % nr_node_ids;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (node_isset(n, *used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	if (best_node != -1)
+		node_set(best_node, *used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @span: resulting cpumask
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static void sched_domain_node_span(int node, struct cpumask *span)
+{
+	nodemask_t used_nodes;
+	int i;
+
+	cpumask_clear(span);
+	nodes_clear(used_nodes);
+
+	cpumask_or(span, span, cpumask_of_node(node));
+	node_set(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, &used_nodes);
+		if (next_node < 0)
+			break;
+		cpumask_or(span, span, cpumask_of_node(next_node));
+	}
+}
+
+static const struct cpumask *cpu_node_mask(int cpu)
+{
+	lockdep_assert_held(&sched_domains_mutex);
+
+	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+
+	return sched_domains_tmpmask;
+}
+
+static const struct cpumask *cpu_allnodes_mask(int cpu)
+{
+	return cpu_possible_mask;
+}
+#endif /* CONFIG_NUMA */
+
+static const struct cpumask *cpu_cpu_mask(int cpu)
+{
+	return cpumask_of_node(cpu_to_node(cpu));
+}
+
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+
+struct sd_data {
+	struct sched_domain **__percpu sd;
+	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
+};
+
+struct s_data {
+	struct sched_domain ** __percpu sd;
+	struct root_domain	*rd;
+};
+
+enum s_alloc {
+	sa_rootdomain,
+	sa_sd,
+	sa_sd_storage,
+	sa_none,
+};
+
+struct sched_domain_topology_level;
+
+typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+
+#define SDTL_OVERLAP	0x01
+
+struct sched_domain_topology_level {
+	sched_domain_init_f init;
+	sched_domain_mask_f mask;
+	int			flags;
+	struct sd_data	  data;
+};
+
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered = sched_domains_tmpmask;
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *child;
+	int i;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct cpumask *sg_span;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				GFP_KERNEL, cpu_to_node(i));
+
+		if (!sg)
+			goto fail;
+
+		sg_span = sched_group_cpus(sg);
+
+		child = *per_cpu_ptr(sdd->sd, i);
+		if (child->child) {
+			child = child->child;
+			cpumask_copy(sg_span, sched_domain_span(child));
+		} else
+			cpumask_set_cpu(i, sg_span);
+
+		cpumask_or(covered, covered, sg_span);
+
+		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		atomic_inc(&sg->sgp->ref);
+
+		if (cpumask_test_cpu(cpu, sg_span))
+			groups = sg;
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+		last->next = first;
+	}
+	sd->groups = groups;
+
+	return 0;
+
+fail:
+	free_sched_groups(first, 0);
+
+	return -ENOMEM;
+}
+
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+{
+	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+	struct sched_domain *child = sd->child;
+
+	if (child)
+		cpu = cpumask_first(sched_domain_span(child));
+
+	if (sg) {
+		*sg = *per_cpu_ptr(sdd->sg, cpu);
+		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+	}
+
+	return cpu;
+}
+
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL;
+	struct sd_data *sdd = sd->private;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered;
+	int i;
+
+	get_group(cpu, sdd, &sd->groups);
+	atomic_inc(&sd->groups->ref);
+
+	if (cpu != cpumask_first(sched_domain_span(sd)))
+		return 0;
+
+	lockdep_assert_held(&sched_domains_mutex);
+	covered = sched_domains_tmpmask;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct sched_group *sg;
+		int group = get_group(i, sdd, &sg);
+		int j;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		cpumask_clear(sched_group_cpus(sg));
+		sg->sgp->power = 0;
+
+		for_each_cpu(j, span) {
+			if (get_group(j, sdd, NULL) != group)
+				continue;
+
+			cpumask_set_cpu(j, covered);
+			cpumask_set_cpu(j, sched_group_cpus(sg));
+		}
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+	}
+	last->next = first;
+
+	return 0;
+}
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)		sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)		do { } while (0)
+#endif
+
+#define SD_INIT_FUNC(type)						\
+static noinline struct sched_domain *					\
+sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
+{									\
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
+	*sd = SD_##type##_INIT;						\
+	SD_INIT_NAME(sd, type);						\
+	sd->private = &tl->data;					\
+	return sd;							\
+}
+
+SD_INIT_FUNC(CPU)
+#ifdef CONFIG_NUMA
+ SD_INIT_FUNC(ALLNODES)
+ SD_INIT_FUNC(NODE)
+#endif
+#ifdef CONFIG_SCHED_SMT
+ SD_INIT_FUNC(SIBLING)
+#endif
+#ifdef CONFIG_SCHED_MC
+ SD_INIT_FUNC(MC)
+#endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+	unsigned long val;
+
+	val = simple_strtoul(str, NULL, 0);
+	if (val < sched_domain_level_max)
+		default_relax_domain_level = val;
+
+	return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+				 struct sched_domain_attr *attr)
+{
+	int request;
+
+	if (!attr || attr->relax_domain_level < 0) {
+		if (default_relax_domain_level < 0)
+			return;
+		else
+			request = default_relax_domain_level;
+	} else
+		request = attr->relax_domain_level;
+	if (request < sd->level) {
+		/* turn off idle balance on this domain */
+		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+	} else {
+		/* turn on idle balance on this domain */
+		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+	}
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+				 const struct cpumask *cpu_map)
+{
+	switch (what) {
+	case sa_rootdomain:
+		if (!atomic_read(&d->rd->refcount))
+			free_rootdomain(&d->rd->rcu); /* fall through */
+	case sa_sd:
+		free_percpu(d->sd); /* fall through */
+	case sa_sd_storage:
+		__sdt_free(cpu_map); /* fall through */
+	case sa_none:
+		break;
+	}
+}
+
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+						   const struct cpumask *cpu_map)
+{
+	memset(d, 0, sizeof(*d));
+
+	if (__sdt_alloc(cpu_map))
+		return sa_sd_storage;
+	d->sd = alloc_percpu(struct sched_domain *);
+	if (!d->sd)
+		return sa_sd_storage;
+	d->rd = alloc_rootdomain();
+	if (!d->rd)
+		return sa_sd;
+	return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+	struct sd_data *sdd = sd->private;
+
+	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+	*per_cpu_ptr(sdd->sd, cpu) = NULL;
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *cpu_smt_mask(int cpu)
+{
+	return topology_thread_cpumask(cpu);
+}
+#endif
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ sd_init_SIBLING, cpu_smt_mask, },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ sd_init_MC, cpu_coregroup_mask, },
+#endif
+#ifdef CONFIG_SCHED_BOOK
+	{ sd_init_BOOK, cpu_book_mask, },
+#endif
+	{ sd_init_CPU, cpu_cpu_mask, },
+#ifdef CONFIG_NUMA
+	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
+	{ sd_init_ALLNODES, cpu_allnodes_mask, },
+#endif
+	{ NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	int j;
+
+	for (tl = sched_domain_topology; tl->init; tl++) {
+		struct sd_data *sdd = &tl->data;
+
+		sdd->sd = alloc_percpu(struct sched_domain *);
+		if (!sdd->sd)
+			return -ENOMEM;
+
+		sdd->sg = alloc_percpu(struct sched_group *);
+		if (!sdd->sg)
+			return -ENOMEM;
+
+		sdd->sgp = alloc_percpu(struct sched_group_power *);
+		if (!sdd->sgp)
+			return -ENOMEM;
+
+		for_each_cpu(j, cpu_map) {
+			struct sched_domain *sd;
+			struct sched_group *sg;
+			struct sched_group_power *sgp;
+
+			   	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sd)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sd, j) = sd;
+
+			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sg)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sg, j) = sg;
+
+			sgp = kzalloc_node(sizeof(struct sched_group_power),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sgp)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sgp, j) = sgp;
+		}
+	}
+
+	return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	int j;
+
+	for (tl = sched_domain_topology; tl->init; tl++) {
+		struct sd_data *sdd = &tl->data;
+
+		for_each_cpu(j, cpu_map) {
+			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd && (sd->flags & SD_OVERLAP))
+				free_sched_groups(sd->groups, 0);
+			kfree(*per_cpu_ptr(sdd->sd, j));
+			kfree(*per_cpu_ptr(sdd->sg, j));
+			kfree(*per_cpu_ptr(sdd->sgp, j));
+		}
+		free_percpu(sdd->sd);
+		free_percpu(sdd->sg);
+		free_percpu(sdd->sgp);
+	}
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+		struct s_data *d, const struct cpumask *cpu_map,
+		struct sched_domain_attr *attr, struct sched_domain *child,
+		int cpu)
+{
+	struct sched_domain *sd = tl->init(tl, cpu);
+	if (!sd)
+		return child;
+
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+	if (child) {
+		sd->level = child->level + 1;
+		sched_domain_level_max = max(sched_domain_level_max, sd->level);
+		child->parent = sd;
+	}
+	sd->child = child;
+
+	return sd;
+}
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int build_sched_domains(const struct cpumask *cpu_map,
+				   struct sched_domain_attr *attr)
+{
+	enum s_alloc alloc_state = sa_none;
+	struct sched_domain *sd;
+	struct s_data d;
+	int i, ret = -ENOMEM;
+
+	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+	if (alloc_state != sa_rootdomain)
+		goto error;
+
+	/* Set up domains for cpus specified by the cpu_map. */
+	for_each_cpu(i, cpu_map) {
+		struct sched_domain_topology_level *tl;
+
+		sd = NULL;
+		for (tl = sched_domain_topology; tl->init; tl++) {
+			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+			if (tl->flags & SDTL_OVERLAP)
+				sd->flags |= SD_OVERLAP;
+			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+				break;
+		}
+
+		while (sd->child)
+			sd = sd->child;
+
+		*per_cpu_ptr(d.sd, i) = sd;
+	}
+
+	/* Build the groups for the domains */
+	for_each_cpu(i, cpu_map) {
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			sd->span_weight = cpumask_weight(sched_domain_span(sd));
+			if (sd->flags & SD_OVERLAP) {
+				if (build_overlap_sched_groups(sd, i))
+					goto error;
+			} else {
+				if (build_sched_groups(sd, i))
+					goto error;
+			}
+		}
+	}
+
+	/* Calculate CPU power for physical packages and nodes */
+	for (i = nr_cpumask_bits-1; i >= 0; i--) {
+		if (!cpumask_test_cpu(i, cpu_map))
+			continue;
+
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			claim_allocations(i, sd);
+		}
+	}
+
+	/* Attach the domains */
+	rcu_read_lock();
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d.sd, i);
+		cpu_attach_domain(sd, d.rd, i);
+	}
+	rcu_read_unlock();
+
+	ret = 0;
+error:
+	__free_domain_allocs(&d, alloc_state, cpu_map);
+	return ret;
+}
+
+static cpumask_var_t *doms_cur;	/* current sched domains */
+static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+				/* attribues of custom domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
+{
+	return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int init_sched_domains(const struct cpumask *cpu_map)
+{
+	int err;
+
+	arch_update_cpu_topology();
+	ndoms_cur = 1;
+	doms_cur = alloc_sched_domains(ndoms_cur);
+	if (!doms_cur)
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+	dattr_cur = NULL;
+	err = build_sched_domains(doms_cur[0], NULL);
+	register_sched_domain_sysctl();
+
+	return err;
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+	int i;
+
+	rcu_read_lock();
+	for_each_cpu(i, cpu_map)
+		cpu_attach_domain(NULL, &def_root_domain, i);
+	rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+			struct sched_domain_attr *new, int idx_new)
+{
+	struct sched_domain_attr tmp;
+
+	/* fast path */
+	if (!new && !cur)
+		return 1;
+
+	tmp = SD_ATTR_INIT;
+	return !memcmp(cur ? (cur + idx_cur) : &tmp,
+			new ? (new + idx_new) : &tmp,
+			sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+				 struct sched_domain_attr *dattr_new)
+{
+	int i, j, n;
+	int new_topology;
+
+	mutex_lock(&sched_domains_mutex);
+
+	/* always unregister in case we don't destroy any domains */
+	unregister_sched_domain_sysctl();
+
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
+	n = doms_new ? ndoms_new : 0;
+
+	/* Destroy deleted domains */
+	for (i = 0; i < ndoms_cur; i++) {
+		for (j = 0; j < n && !new_topology; j++) {
+			if (cpumask_equal(doms_cur[i], doms_new[j])
+				&& dattrs_equal(dattr_cur, i, dattr_new, j))
+				goto match1;
+		}
+		/* no match - a current sched domain not in new doms_new[] */
+		detach_destroy_domains(doms_cur[i]);
+match1:
+		;
+	}
+
+	if (doms_new == NULL) {
+		ndoms_cur = 0;
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+		WARN_ON_ONCE(dattr_new);
+	}
+
+	/* Build new domains */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < ndoms_cur && !new_topology; j++) {
+			if (cpumask_equal(doms_new[i], doms_cur[j])
+				&& dattrs_equal(dattr_new, i, dattr_cur, j))
+				goto match2;
+		}
+		/* no match - add a new doms_new */
+		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+		;
+	}
+
+	/* Remember the new sched domains */
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
+	kfree(dattr_cur);	/* kfree(NULL) is safe */
+	doms_cur = doms_new;
+	dattr_cur = dattr_new;
+	ndoms_cur = ndoms_new;
+
+	register_sched_domain_sysctl();
+
+	mutex_unlock(&sched_domains_mutex);
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+static void reinit_sched_domains(void)
+{
+	get_online_cpus();
+
+	/* Destroy domains first to force the rebuild */
+	partition_sched_domains(0, NULL, NULL);
+
+	rebuild_sched_domains();
+	put_online_cpus();
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+	unsigned int level = 0;
+
+	if (sscanf(buf, "%u", &level) != 1)
+		return -EINVAL;
+
+	/*
+	 * level is always be positive so don't check for
+	 * level < POWERSAVINGS_BALANCE_NONE which is 0
+	 * What happens on 0 or 1 byte write,
+	 * need to check for count as well?
+	 */
+
+	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
+		return -EINVAL;
+
+	if (smt)
+		sched_smt_power_savings = level;
+	else
+		sched_mc_power_savings = level;
+
+	reinit_sched_domains();
+
+	return count;
+}
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	return sprintf(buf, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct device *dev,
+						struct device_attribute *attr,
+						const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 0);
+}
+static DEVICE_ATTR(sched_mc_power_savings, 0644,
+		   sched_mc_power_savings_show,
+		   sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	return sprintf(buf, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct device *dev,
+						struct device_attribute *attr,
+						 const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 1);
+}
+static DEVICE_ATTR(sched_smt_power_savings, 0644,
+		   sched_smt_power_savings_show,
+		   sched_smt_power_savings_store);
+#endif
+
+int __init sched_create_sysfs_power_savings_entries(struct device *dev)
+{
+	int err = 0;
+
+#ifdef CONFIG_SCHED_SMT
+	if (smt_capable())
+		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
+#endif
+#ifdef CONFIG_SCHED_MC
+	if (!err && mc_capable())
+		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
+#endif
+	return err;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+/*
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ */
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+				 void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_STARTING:
+	case CPU_DOWN_FAILED:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+				   void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+/*
+ * Cheaper version of the below functions in case support for SMT and MC is
+ * compiled in but CPUs have no siblings.
+ */
+static bool sole_cpu_idle(int cpu)
+{
+	return rq_idle(cpu_rq(cpu));
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+/* All this CPU's SMT siblings are idle */
+static bool siblings_cpu_idle(int cpu)
+{
+	return cpumask_subset(&(cpu_rq(cpu)->smt_siblings),
+				  &grq.cpu_idle_map);
+}
+#endif
+#ifdef CONFIG_SCHED_MC
+/* All this CPU's shared cache siblings are idle */
+static bool cache_cpu_idle(int cpu)
+{
+	return cpumask_subset(&(cpu_rq(cpu)->cache_siblings),
+				  &grq.cpu_idle_map);
+}
+#endif
+
+enum sched_domain_level {
+	SD_LV_NONE = 0,
+	SD_LV_SIBLING,
+	SD_LV_MC,
+	SD_LV_BOOK,
+	SD_LV_CPU,
+	SD_LV_NODE,
+	SD_LV_ALLNODES,
+	SD_LV_MAX
+};
+
+void __init sched_init_smp(void)
+{
+	struct sched_domain *sd;
+	int cpu;
+
+	cpumask_var_t non_isolated_cpus;
+
+	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+	get_online_cpus();
+	mutex_lock(&sched_domains_mutex);
+	init_sched_domains(cpu_active_mask);
+	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+	if (cpumask_empty(non_isolated_cpus))
+		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
+	mutex_unlock(&sched_domains_mutex);
+	put_online_cpus();
+
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+
+	/* Move init over to a non-isolated CPU */
+	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+		BUG();
+	free_cpumask_var(non_isolated_cpus);
+
+	grq_lock_irq();
+	/*
+	 * Set up the relative cache distance of each online cpu from each
+	 * other in a simple array for quick lookup. Locality is determined
+	 * by the closest sched_domain that CPUs are separated by. CPUs with
+	 * shared cache in SMT and MC are treated as local. Separate CPUs
+	 * (within the same package or physically) within the same node are
+	 * treated as not local. CPUs not even in the same domain (different
+	 * nodes) are treated as very distant.
+	 */
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		for_each_domain(cpu, sd) {
+			int locality, other_cpu;
+
+#ifdef CONFIG_SCHED_SMT
+			if (sd->level == SD_LV_SIBLING) {
+				for_each_cpu_mask(other_cpu, *sched_domain_span(sd))
+					cpumask_set_cpu(other_cpu, &rq->smt_siblings);
+			}
+#endif
+#ifdef CONFIG_SCHED_MC
+			if (sd->level == SD_LV_MC) {
+				for_each_cpu_mask(other_cpu, *sched_domain_span(sd))
+					cpumask_set_cpu(other_cpu, &rq->cache_siblings);
+			}
+#endif
+			if (sd->level <= SD_LV_SIBLING)
+				locality = 1;
+			else if (sd->level <= SD_LV_MC)
+				locality = 2;
+			else if (sd->level <= SD_LV_NODE)
+				locality = 3;
+			else
+				continue;
+
+			for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) {
+				if (locality < rq->cpu_locality[other_cpu])
+					rq->cpu_locality[other_cpu] = locality;
+			}
+		}
+
+/*
+		 * Each runqueue has its own function in case it doesn't have
+		 * siblings of its own allowing mixed topologies.
+		 */
+#ifdef CONFIG_SCHED_SMT
+		if (cpus_weight(rq->smt_siblings) > 1)
+			rq->siblings_idle = siblings_cpu_idle;
+#endif
+#ifdef CONFIG_SCHED_MC
+		if (cpus_weight(rq->cache_siblings) > 1)
+			rq->cache_idle = cache_cpu_idle;
+#endif
+	}
+	grq_unlock_irq();
+}
+#else
+void __init sched_init_smp(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+unsigned int sysctl_timer_migration = 1;
+
+int in_sched_functions(unsigned long addr)
+{
+	return in_lock_functions(addr) ||
+		(addr >= (unsigned long)__sched_text_start
+		&& addr < (unsigned long)__sched_text_end);
+}
+
+void __init sched_init(void)
+{
+	int i;
+	struct rq *rq;
+
+	print_scheduler_version();
+
+	raw_spin_lock_init(&grq.lock);
+	grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
+	grq.noc = 1;
+#ifdef CONFIG_SMP
+	init_defrootdomain();
+	grq.qnr = grq.idle_cpus = 0;
+	cpumask_clear(&grq.cpu_idle_map);
+
+#else
+	uprq = &per_cpu(runqueues, 0);
+#endif
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+		rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
+				  rq->iowait_pc = rq->idle_pc = 0;
+		rq->last_run_scale = 0;
+#ifdef CONFIG_SMP
+		rq->sticky_task = NULL;
+		rq->sd = NULL;
+		rq->rd = NULL;
+		rq->online = false;
+		rq->cpu = i;
+		rq_attach_root(rq, &def_root_domain);
+#endif
+		atomic_set(&rq->nr_iowait, 0);
+	}
+
+#ifdef CONFIG_SMP
+	nr_cpu_ids = i;
+	/*
+	 * Set the base locality for cpu cache distance calculation to
+	 * "distant" (3). Make sure the distance from a CPU to itself is 0.
+	 */
+	for_each_possible_cpu(i) {
+		int j;
+
+		rq = cpu_rq(i);
+#ifdef CONFIG_SCHED_SMT
+		cpumask_clear(&rq->smt_siblings);
+		cpumask_set_cpu(i, &rq->smt_siblings);
+		rq->siblings_idle = sole_cpu_idle;
+		cpumask_set_cpu(i, &rq->smt_siblings);
+#endif
+#ifdef CONFIG_SCHED_MC
+		cpumask_clear(&rq->cache_siblings);
+		cpumask_set_cpu(i, &rq->cache_siblings);
+		rq->cache_idle = sole_cpu_idle;
+		cpumask_set_cpu(i, &rq->cache_siblings);
+#endif
+		rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(int *), GFP_ATOMIC);
+		for_each_possible_cpu(j) {
+			if (i == j)
+				rq->cpu_locality[j] = 0;
+			else
+				rq->cpu_locality[j] = 4;
+		}
+	}
+#endif
+
+	for (i = 0; i < PRIO_LIMIT; i++)
+		INIT_LIST_HEAD(grq.queue + i);
+	/* delimiter for bitsearch */
+	__set_bit(PRIO_LIMIT, grq.prio_bitmap);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+	plist_head_init(&init_task.pi_waiters);
+#endif
+
+	/*
+	 * The boot idle thread does lazy MMU switching as well:
+	 */
+	atomic_inc(&init_mm.mm_count);
+	enter_lazy_tlb(&init_mm, current);
+
+	/*
+	 * Make us the idle thread. Technically, schedule() should not be
+	 * called from this thread, however somewhere below it might be,
+	 * but because we are the idle thread, we just pick up running again
+	 * when this runqueue becomes "idle".
+	 */
+	init_idle(current, smp_processor_id());
+
+#ifdef CONFIG_SMP
+	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
+	/* May be allocated at isolcpus cmdline parse time */
+	if (cpu_isolated_map == NULL)
+		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+#endif /* SMP */
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+
+	return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+	static unsigned long prev_jiffy;	/* ratelimiting */
+
+	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+		system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+	dump_stack();
+}
+EXPORT_SYMBOL(__might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+void normalize_rt_tasks(void)
+{
+	struct task_struct *g, *p;
+	unsigned long flags;
+	struct rq *rq;
+	int queued;
+
+	read_lock_irq(&tasklist_lock);
+
+	do_each_thread(g, p) {
+		if (!rt_task(p))
+			continue;
+
+		raw_spin_lock_irqsave(&p->pi_lock, flags);
+		rq = __task_grq_lock(p);
+
+		queued = task_queued(p);
+		__setscheduler(p, rq, SCHED_NORMAL, 0);
+		if (queued) {
+			try_preempt(p, rq);
+		}
+
+		__task_grq_unlock();
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	} while_each_thread(g, p);
+
+	read_unlock_irq(&tasklist_lock);
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+struct task_struct *curr_task(int cpu)
+{
+	return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack.  It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner.  This function
+ * must be called with all CPU's synchronised, and interrupts disabled, the
+ * and caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, struct task_struct *p)
+{
+	cpu_curr(cpu) = p;
+}
+
+#endif
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	*ut = p->utime;
+	*st = p->stime;
+}
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+
+	*ut = cputime.utime;
+	*st = cputime.stime;
+}
+#else
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	cputime_t rtime, utime = p->utime, total = utime + p->stime;
+
+	rtime = nsecs_to_cputime(p->sched_time);
+
+	if (total) {
+		u64 temp;
+
+		temp = (u64)(rtime * utime);
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	/*
+	 * Compare with previous values, to keep monotonicity:
+	 */
+	p->prev_utime = max(p->prev_utime, utime);
+	p->prev_stime = max(p->prev_stime, (rtime - p->prev_utime));
+
+	*ut = p->prev_utime;
+	*st = p->prev_stime;
+}
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct signal_struct *sig = p->signal;
+	struct task_cputime cputime;
+	cputime_t rtime, utime, total;
+
+	thread_group_cputime(p, &cputime);
+
+	total = cputime.utime + cputime.stime;
+	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+	if (total) {
+		u64 temp;
+
+		temp = (u64)(rtime * cputime.utime);
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	sig->prev_utime = max(sig->prev_utime, utime);
+	sig->prev_stime = max(sig->prev_stime, (rtime - sig->prev_utime));
+
+	*ut = sig->prev_utime;
+	*st = sig->prev_stime;
+}
+#endif
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+	return p->gtime;
+}
+
+void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+{}
+
+#ifdef CONFIG_SCHED_DEBUG
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{}
+
+void proc_sched_set_task(struct task_struct *p)
+{}
+#endif
+
+#ifdef CONFIG_SMP
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return SCHED_LOAD_SCALE;
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long smt_gain = sd->smt_gain;
+
+	smt_gain /= weight;
+
+	return smt_gain;
+}
+#endif
diff -ruN linux-3.4.1/kernel/sched/stats.c linux-3.4.1-RIFS/kernel/sched/stats.c
--- linux-3.4.1/kernel/sched/stats.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sched/stats.c	1970-01-01 08:00:00.000000000 +0800
@@ -1,111 +0,0 @@
-
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
-#include "sched.h"
-
-/*
- * bump this up when changing the output format or the meaning of an existing
- * format, so that tools can adapt (or abort)
- */
-#define SCHEDSTAT_VERSION 15
-
-static int show_schedstat(struct seq_file *seq, void *v)
-{
-	int cpu;
-	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
-	char *mask_str = kmalloc(mask_len, GFP_KERNEL);
-
-	if (mask_str == NULL)
-		return -ENOMEM;
-
-	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
-	seq_printf(seq, "timestamp %lu\n", jiffies);
-	for_each_online_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
-#ifdef CONFIG_SMP
-		struct sched_domain *sd;
-		int dcount = 0;
-#endif
-
-		/* runqueue-specific stats */
-		seq_printf(seq,
-		    "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
-		    cpu, rq->yld_count,
-		    rq->sched_count, rq->sched_goidle,
-		    rq->ttwu_count, rq->ttwu_local,
-		    rq->rq_cpu_time,
-		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
-
-		seq_printf(seq, "\n");
-
-#ifdef CONFIG_SMP
-		/* domain-specific stats */
-		rcu_read_lock();
-		for_each_domain(cpu, sd) {
-			enum cpu_idle_type itype;
-
-			cpumask_scnprintf(mask_str, mask_len,
-					  sched_domain_span(sd));
-			seq_printf(seq, "domain%d %s", dcount++, mask_str);
-			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
-					itype++) {
-				seq_printf(seq, " %u %u %u %u %u %u %u %u",
-				    sd->lb_count[itype],
-				    sd->lb_balanced[itype],
-				    sd->lb_failed[itype],
-				    sd->lb_imbalance[itype],
-				    sd->lb_gained[itype],
-				    sd->lb_hot_gained[itype],
-				    sd->lb_nobusyq[itype],
-				    sd->lb_nobusyg[itype]);
-			}
-			seq_printf(seq,
-				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
-			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
-			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
-			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
-			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
-			    sd->ttwu_move_balance);
-		}
-		rcu_read_unlock();
-#endif
-	}
-	kfree(mask_str);
-	return 0;
-}
-
-static int schedstat_open(struct inode *inode, struct file *file)
-{
-	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
-	char *buf = kmalloc(size, GFP_KERNEL);
-	struct seq_file *m;
-	int res;
-
-	if (!buf)
-		return -ENOMEM;
-	res = single_open(file, show_schedstat, NULL);
-	if (!res) {
-		m = file->private_data;
-		m->buf = buf;
-		m->size = size;
-	} else
-		kfree(buf);
-	return res;
-}
-
-static const struct file_operations proc_schedstat_operations = {
-	.open    = schedstat_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = single_release,
-};
-
-static int __init proc_schedstat_init(void)
-{
-	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
-	return 0;
-}
-module_init(proc_schedstat_init);
diff -ruN linux-3.4.1/kernel/sched/stats.h linux-3.4.1-RIFS/kernel/sched/stats.h
--- linux-3.4.1/kernel/sched/stats.h	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sched/stats.h	2012-06-06 14:43:02.000000000 +0800
@@ -1,231 +0,0 @@
-
-#ifdef CONFIG_SCHEDSTATS
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{
-	if (rq) {
-		rq->rq_sched_info.run_delay += delta;
-		rq->rq_sched_info.pcount++;
-	}
-}
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{
-	if (rq)
-		rq->rq_cpu_time += delta;
-}
-
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{
-	if (rq)
-		rq->rq_sched_info.run_delay += delta;
-}
-# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val)	do { var = (val); } while (0)
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{}
-# define schedstat_inc(rq, field)	do { } while (0)
-# define schedstat_add(rq, field, amt)	do { } while (0)
-# define schedstat_set(var, val)	do { } while (0)
-#endif
-
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
-static inline void sched_info_reset_dequeued(struct task_struct *t)
-{
-	t->sched_info.last_queued = 0;
-}
-
-/*
- * We are interested in knowing how long it was from the *first* time a
- * task was queued to the time that it finally hit a cpu, we call this routine
- * from dequeue_task() to account for possible rq->clock skew across cpus. The
- * delta taken on each cpu would annul the skew.
- */
-static inline void sched_info_dequeued(struct task_struct *t)
-{
-	unsigned long long now = task_rq(t)->clock, delta = 0;
-
-	if (unlikely(sched_info_on()))
-		if (t->sched_info.last_queued)
-			delta = now - t->sched_info.last_queued;
-	sched_info_reset_dequeued(t);
-	t->sched_info.run_delay += delta;
-
-	rq_sched_info_dequeued(task_rq(t), delta);
-}
-
-/*
- * Called when a task finally hits the cpu.  We can now calculate how
- * long it was waiting to run.  We also note when it began so that we
- * can keep stats on how long its timeslice is.
- */
-static void sched_info_arrive(struct task_struct *t)
-{
-	unsigned long long now = task_rq(t)->clock, delta = 0;
-
-	if (t->sched_info.last_queued)
-		delta = now - t->sched_info.last_queued;
-	sched_info_reset_dequeued(t);
-	t->sched_info.run_delay += delta;
-	t->sched_info.last_arrival = now;
-	t->sched_info.pcount++;
-
-	rq_sched_info_arrive(task_rq(t), delta);
-}
-
-/*
- * This function is only called from enqueue_task(), but also only updates
- * the timestamp if it is already not set.  It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
- */
-static inline void sched_info_queued(struct task_struct *t)
-{
-	if (unlikely(sched_info_on()))
-		if (!t->sched_info.last_queued)
-			t->sched_info.last_queued = task_rq(t)->clock;
-}
-
-/*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily.  Now we can calculate how long we ran.
- * Also, if the process is still in the TASK_RUNNING state, call
- * sched_info_queued() to mark that it has now again started waiting on
- * the runqueue.
- */
-static inline void sched_info_depart(struct task_struct *t)
-{
-	unsigned long long delta = task_rq(t)->clock -
-					t->sched_info.last_arrival;
-
-	rq_sched_info_depart(task_rq(t), delta);
-
-	if (t->state == TASK_RUNNING)
-		sched_info_queued(t);
-}
-
-/*
- * Called when tasks are switched involuntarily due, typically, to expiring
- * their time slice.  (This may also be called when switching to or from
- * the idle task.)  We are only called when prev != next.
- */
-static inline void
-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
-{
-	struct rq *rq = task_rq(prev);
-
-	/*
-	 * prev now departs the cpu.  It's not interesting to record
-	 * stats about how efficient we were at scheduling the idle
-	 * process, however.
-	 */
-	if (prev != rq->idle)
-		sched_info_depart(prev);
-
-	if (next != rq->idle)
-		sched_info_arrive(next);
-}
-static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
-{
-	if (unlikely(sched_info_on()))
-		__sched_info_switch(prev, next);
-}
-#else
-#define sched_info_queued(t)			do { } while (0)
-#define sched_info_reset_dequeued(t)	do { } while (0)
-#define sched_info_dequeued(t)			do { } while (0)
-#define sched_info_switch(t, next)		do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
-
-/*
- * The following are functions that support scheduler-internal time accounting.
- * These functions are generally called at the timer tick.  None of this depends
- * on CONFIG_SCHEDSTATS.
- */
-
-/**
- * account_group_user_time - Maintain utime for a thread group.
- *
- * @tsk:	Pointer to task structure.
- * @cputime:	Time value by which to increment the utime field of the
- *		thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
-					   cputime_t cputime)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-	if (!cputimer->running)
-		return;
-
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.utime += cputime;
-	raw_spin_unlock(&cputimer->lock);
-}
-
-/**
- * account_group_system_time - Maintain stime for a thread group.
- *
- * @tsk:	Pointer to task structure.
- * @cputime:	Time value by which to increment the stime field of the
- *		thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void account_group_system_time(struct task_struct *tsk,
-					     cputime_t cputime)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-	if (!cputimer->running)
-		return;
-
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.stime += cputime;
-	raw_spin_unlock(&cputimer->lock);
-}
-
-/**
- * account_group_exec_runtime - Maintain exec runtime for a thread group.
- *
- * @tsk:	Pointer to task structure.
- * @ns:		Time value by which to increment the sum_exec_runtime field
- *		of the thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void account_group_exec_runtime(struct task_struct *tsk,
-					      unsigned long long ns)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-	if (!cputimer->running)
-		return;
-
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	raw_spin_unlock(&cputimer->lock);
-}
diff -ruN linux-3.4.1/kernel/sysctl.c linux-3.4.1-RIFS/kernel/sysctl.c
--- linux-3.4.1/kernel/sysctl.c	2012-06-01 15:18:44.000000000 +0800
+++ linux-3.4.1-RIFS/kernel/sysctl.c	2012-06-15 23:46:44.000000000 +0800
@@ -242,6 +242,7 @@
 };
 
 #ifdef CONFIG_SCHED_DEBUG
+#ifndef CONFIG_SCHED_RIFS
 static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
@@ -249,6 +250,7 @@
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
 #endif
+#endif
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -256,14 +258,8 @@
 #endif
 
 static struct ctl_table kern_table[] = {
-	{
-		.procname	= "sched_child_runs_first",
-		.data		= &sysctl_sched_child_runs_first,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #ifdef CONFIG_SCHED_DEBUG
+#ifndef CONFIG_SCHED_RIFS
 	{
 		.procname	= "sched_min_granularity_ns",
 		.data		= &sysctl_sched_min_granularity,
@@ -338,20 +334,7 @@
 		.extra2		= &one,
 	},
 #endif
-	{
-		.procname	= "sched_rt_period_us",
-		.data		= &sysctl_sched_rt_period,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_rt_handler,
-	},
-	{
-		.procname	= "sched_rt_runtime_us",
-		.data		= &sysctl_sched_rt_runtime,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= sched_rt_handler,
-	},
+#endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
  2012-06-17 22:24   ` Chen
@ 2012-06-18  3:01     ` Vikram Dhillon
  2012-06-18  6:28       ` Chen
  0 siblings, 1 reply; 10+ messages in thread
From: Vikram Dhillon @ 2012-06-18  3:01 UTC (permalink / raw)
  To: Chen; +Cc: Mike Galbraith, linux-kernel

On Sun, Jun 17, 2012 at 6:24 PM, Chen <hi3766691@gmail.com> wrote:
> Sorry for my fault.
> The patch has build issue and I post the new one.

Hi Chen,

It is always very interesting to see new scheduler designs however can
you shows us some benchmarks? How does this compare to CFS currently?
Also how well does it scale beyond desktops?

- Vikram

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
  2012-06-18  3:01     ` Vikram Dhillon
@ 2012-06-18  6:28       ` Chen
  2012-06-18 10:56         ` Chen
  0 siblings, 1 reply; 10+ messages in thread
From: Chen @ 2012-06-18  6:28 UTC (permalink / raw)
  To: Vikram Dhillon; +Cc: Mike Galbraith, linux-kernel

On Mon, Jun 18, 2012 at 11:01 AM, Vikram Dhillon
<opensolarisdev@gmail.com> wrote:
> On Sun, Jun 17, 2012 at 6:24 PM, Chen <hi3766691@gmail.com> wrote:
>> Sorry for my fault.
>> The patch has build issue and I post the new one.
>
> Hi Chen,
>
> It is always very interesting to see new scheduler designs however can
> you shows us some benchmarks? How does this compare to CFS currently?
> Also how well does it scale beyond desktops?
>
> - Vikram


Here is the benchmark result(Latency) for RIFS-ES and CFS(I have done
them with computer and I am posting them with my Android phone now)

RIFS-ES
========================================
[root@localhost admin]# latt -c1 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=1
Entries logged: 27

Wakeup averages
-------------------------------------
	Max		      25 usec
	Avg		      10 usec
	Stdev		       3 usec
	Stdev mean	       1 usec

Work averages
-------------------------------------
	Max		   20265 usec
	Avg		   20100 usec
	Stdev		      82 usec
	Stdev mean	      16 usec
[root@localhost admin]# latt -c2 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=2
Entries logged: 54

Wakeup averages
-------------------------------------
	Max		     824 usec
	Avg		      30 usec
	Stdev		     117 usec
	Stdev mean	      16 usec

Work averages
-------------------------------------
	Max		   21472 usec
	Avg		   20486 usec
	Stdev		     190 usec
	Stdev mean	      26 usec
[root@localhost admin]# latt -c4 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=4
Entries logged: 104

Wakeup averages
-------------------------------------
	Max		   20423 usec
	Avg		    2343 usec
	Stdev		    3810 usec
	Stdev mean	     374 usec

Work averages
-------------------------------------
	Max		   41037 usec
	Avg		   36096 usec
	Stdev		    4247 usec
	Stdev mean	     416 usec
[root@localhost admin]# latt -c8 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=8
Entries logged: 184

Wakeup averages
-------------------------------------
	Max		   71284 usec
	Avg		    6112 usec
	Stdev		    9640 usec
	Stdev mean	     711 usec

Work averages
-------------------------------------
	Max		   83043 usec
	Avg		   68615 usec
	Stdev		   10615 usec
	Stdev mean	     783 usec
[root@localhost admin]# latt -c16 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=16
Entries logged: 320

Wakeup averages
-------------------------------------
	Max		   35669 usec
	Avg		    9690 usec
	Stdev		    8961 usec
	Stdev mean	     501 usec

Work averages
-------------------------------------
	Max		  166790 usec
	Avg		  136161 usec
	Stdev		   30660 usec
	Stdev mean	    1714 usec


RIFS-ES: Total result for wake up average latency:
-c1:
	Avg		      10 usec
-c2:
	Avg		      30 usec
-c4:
	Avg		    2343 usec
-c8:
	Avg		    6112 usec
-c16:
	Avg		    9690 usec

CFS
========================================
[root@localhost ~]$ latt -c1 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=1
Entries logged: 27

Wakeup averages
-------------------------------------
	Max		      40 usec
	Avg		      21 usec
	Stdev		       8 usec
	Stdev mean	       2 usec

Work averages
-------------------------------------
	Max		   20648 usec
	Avg		   20262 usec
	Stdev		      97 usec
	Stdev mean	      19 usec
[root@localhost ~]$ latt -c2 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=2
Entries logged: 54

Wakeup averages
-------------------------------------
	Max		      45 usec
	Avg		      19 usec
	Stdev		      10 usec
	Stdev mean	       1 usec

Work averages
-------------------------------------
	Max		   21549 usec
	Avg		   20495 usec
	Stdev		     320 usec
	Stdev mean	      43 usec
[root@localhost ~]$ latt -c4 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=4
Entries logged: 104

Wakeup averages
-------------------------------------
	Max		   20430 usec
	Avg		    4900 usec
	Stdev		    6274 usec
	Stdev mean	     615 usec

Work averages
-------------------------------------
	Max		   50538 usec
	Avg		   30564 usec
	Stdev		    6492 usec
	Stdev mean	     637 usec
[root@localhost ~]$ latt -c8 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=8
Entries logged: 184

Wakeup averages
-------------------------------------
	Max		   38195 usec
	Avg		    8068 usec
	Stdev		    8132 usec
	Stdev mean	     599 usec

Work averages
-------------------------------------
	Max		   83525 usec
	Avg		   67597 usec
	Stdev		   10037 usec
	Stdev mean	     740 usec
[root@localhost ~]$ latt -c16 sleep 10

Parameters: min_wait=100ms, max_wait=500ms, clients=16
Entries logged: 320

Wakeup averages
-------------------------------------
	Max		   83711 usec
	Avg		   17497 usec
	Stdev		   21219 usec
	Stdev mean	    1186 usec

Work averages
-------------------------------------
	Max		  166547 usec
	Avg		  119376 usec
	Stdev		   31758 usec
	Stdev mean	    1775 usec


CFS: Total result for wake up average latency:
-c1:
	Avg		      21 usec
-c2:
	Avg		      19 usec
-c4:
	Avg		    4900 usec
-c8:
	Avg		    8068 usec
-c16:
	Avg		   17497 usec

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
  2012-06-18  6:28       ` Chen
@ 2012-06-18 10:56         ` Chen
       [not found]           ` <CANQmPXiyrd9EU=K+6VmMf-fyt5sb7v4bUhyRameHt5OKWc8CKQ@mail.gmail.com>
  0 siblings, 1 reply; 10+ messages in thread
From: Chen @ 2012-06-18 10:56 UTC (permalink / raw)
  To: Vikram Dhillon; +Cc: Mike Galbraith, linux-kernel, mou Chen

[-- Attachment #1: Type: text/plain, Size: 153 bytes --]

It is still a benchmark about latency, but BFS is included.
http://hiphotos.baidu.com/%DF%91%DF%91/pic/item/9cd31d88b9014a904acc4316a9773912b11beefe.jpg

[-- Attachment #2: scheduler-test.tar.xz --]
[-- Type: application/octet-stream, Size: 28052 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
       [not found]           ` <CANQmPXiyrd9EU=K+6VmMf-fyt5sb7v4bUhyRameHt5OKWc8CKQ@mail.gmail.com>
@ 2012-06-19 14:22             ` Chen
  0 siblings, 0 replies; 10+ messages in thread
From: Chen @ 2012-06-19 14:22 UTC (permalink / raw)
  To: linux-kernel; +Cc: mou Chen

http://rifs-scheduler.googlecode.com/files/Result-001.PNG

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: Unzustellbar: Re: [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
       [not found]       ` <4d5440bc-73bb-4c30-8571-bb7004fc5c4a@skytron.local>
@ 2012-06-20 15:17         ` Chen
  2012-06-21 10:40           ` Chen
  0 siblings, 1 reply; 10+ messages in thread
From: Chen @ 2012-06-20 15:17 UTC (permalink / raw)
  To: linux-kernel

#MONTH	CFS	RIFS-ES	BFS
1	21	8		7
2	19	8		7
4	4678	1402		2625
8	9111	4361		5271
16	17944	7589		12611
32	35628	21864		25381
64	69205	43145		51653
128	93522	35553		97354
256	134560	6288		200071

http://rifs-scheduler.googlecode.com/files/Result-002.PNG

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: Unzustellbar: Re: [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release.
  2012-06-20 15:17         ` Unzustellbar: " Chen
@ 2012-06-21 10:40           ` Chen
  0 siblings, 0 replies; 10+ messages in thread
From: Chen @ 2012-06-21 10:40 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Peter Zijlstra, Juri Lelli, Ingo Molnar, mou Chen

Here is a newer benchmark around latt -c255 sleep 10 between these 3
cpu scheduler.
http://rifs-scheduler.googlecode.com/files/Result-003.PNG

RIFS-ES-Low-Spec has been posted, if you want a link please click the
follow site.(The testing data in test form and PNG statistic form)
http://code.google.com/p/rifs-scheduler/downloads/detail?name=Result-003.PNG

During the test I will move the mouse(I just move the mouse and I
don't click anything).
With BFS the mouse stalls.
With CFS or RIFS-ES-Low-Spec the mouse is very smooth.

With average latency RIFS wins, with maximum latency BFS wins.

After using it if you think it is useful please invite the others to use it.
Thanks

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2012-06-21 10:40 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-16 17:18 [PATCH][ANNOUNCE]RIFS-ES Scheduling V1 release Chen
2012-06-17  6:24 ` Mike Galbraith
     [not found]   ` <CANQmPXiU_Pz2qZkisF34TBepbRW4iJ4ytLLfiEmuJpCHxUwWvA@mail.gmail.com>
2012-06-17  8:20     ` Chen
     [not found]       ` <4d5440bc-73bb-4c30-8571-bb7004fc5c4a@skytron.local>
2012-06-20 15:17         ` Unzustellbar: " Chen
2012-06-21 10:40           ` Chen
2012-06-17 22:24   ` Chen
2012-06-18  3:01     ` Vikram Dhillon
2012-06-18  6:28       ` Chen
2012-06-18 10:56         ` Chen
     [not found]           ` <CANQmPXiyrd9EU=K+6VmMf-fyt5sb7v4bUhyRameHt5OKWc8CKQ@mail.gmail.com>
2012-06-19 14:22             ` Chen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.