linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* fairsched + O(1) process scheduler
@ 2003-04-01 12:51 Antonio Vargas
  2003-04-01 16:41 ` William Lee Irwin III
  2003-04-01 18:33 ` Robert Love
  0 siblings, 2 replies; 21+ messages in thread
From: Antonio Vargas @ 2003-04-01 12:51 UTC (permalink / raw)
  To: linux-kernel


This patch is a forward port of the fairsched-2.4.19 patch by Rik van Riel, which
ensures all competing users get an equal share of cpu time.

Since the earlier patch applied to the classic O(n) process scheduler,
and this one applies to the standard 2.5 O(1) one, the mapping isn't
one-to-one but rather more complex.

Original 2.4.19 version:       Rik van Riel, riel@surriel.com
Forward ported 2.5.66 version: Antonio Vargas, wind@cocodriloo.com


 include/linux/sched.h  |    4 ++++
 include/linux/sysctl.h |    1 +
 kernel/sched.c         |   11 +++++++++++
 kernel/sysctl.c        |    3 +++
 kernel/user.c          |   10 +++++++++-
 5 files changed, 28 insertions(+), 1 deletion(-)

diff -puN include/linux/sched.h~fairsched-A0 include/linux/sched.h
--- 25/include/linux/sched.h~fairsched-A0	Tue Apr  1 14:46:21 2003
+++ 25-wind/include/linux/sched.h	Tue Apr  1 14:46:21 2003
@@ -280,6 +280,10 @@ struct user_struct {
 	/* Hash table maintenance information */
 	struct list_head uidhash_list;
 	uid_t uid;
+
+	/* List maintenance information */
+	struct list_head uid_list;
+	unsigned long ticks;	/* How many ticks has received the user */
 };
 
 #define get_current_user() ({ 				\
diff -puN include/linux/sysctl.h~fairsched-A0 include/linux/sysctl.h
--- 25/include/linux/sysctl.h~fairsched-A0	Tue Apr  1 14:46:21 2003
+++ 25-wind/include/linux/sysctl.h	Tue Apr  1 14:46:21 2003
@@ -129,6 +129,7 @@ enum
 	KERN_CADPID=54,		/* int: PID of the process to notify on CAD */
 	KERN_PIDMAX=55,		/* int: PID # limit */
   	KERN_CORE_PATTERN=56,	/* string: pattern for core-file names */
+  	KERN_FAIRSCHED=57,	/* int: turn per-user fair cpu scheduler on/off */
 };
 
 
diff -puN kernel/sched.c~fairsched-A0 kernel/sched.c
--- 25/kernel/sched.c~fairsched-A0	Tue Apr  1 14:46:21 2003
+++ 25-wind/kernel/sched.c	Tue Apr  1 14:46:21 2003
@@ -33,6 +33,13 @@
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
 
+/*
+ * turn per-user fair scheduler on/off
+ */
+
+int fairsched = 1;
+
+
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
 #else
@@ -316,6 +323,10 @@ static int effective_prio(task_t *p)
 
 	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
 			MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+
+	if(fairsched){
+		/* special processing for per-user fair scheduler */
+	}
 
 	prio = p->static_prio - bonus;
 	if (prio < MAX_RT_PRIO)
diff -puN kernel/sysctl.c~fairsched-A0 kernel/sysctl.c
--- 25/kernel/sysctl.c~fairsched-A0	Tue Apr  1 14:46:21 2003
+++ 25-wind/kernel/sysctl.c	Tue Apr  1 14:46:21 2003
@@ -57,6 +57,7 @@ extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
+extern int fairsched;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -263,6 +264,8 @@ static ctl_table kern_table[] = {
 #endif
 	{KERN_PIDMAX, "pid_max", &pid_max, sizeof (int),
 	 0600, NULL, &proc_dointvec},
+	{KERN_FAIRSCHED,"fairsched",
+	 &fairsched,sizeof(int),0644,NULL,&proc_dointvec},
 	{0}
 };
 
diff -puN kernel/user.c~fairsched-A0 kernel/user.c
--- 25/kernel/user.c~fairsched-A0	Tue Apr  1 14:46:21 2003
+++ 25-wind/kernel/user.c	Tue Apr  1 14:47:09 2003
@@ -27,10 +27,13 @@ static kmem_cache_t *uid_cachep;
 static struct list_head uidhash_table[UIDHASH_SZ];
 static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
 
+static struct list_head user_list;
+
 struct user_struct root_user = {
 	.__count	= ATOMIC_INIT(1),
 	.processes	= ATOMIC_INIT(1),
-	.files		= ATOMIC_INIT(0)
+	.files		= ATOMIC_INIT(0),
+	.ticks		= NICE_TO_PRIO(0)
 };
 
 /*
@@ -39,10 +42,12 @@ struct user_struct root_user = {
 static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
 {
 	list_add(&up->uidhash_list, hashent);
+	list_add(&up->uid_list, &user_list);
 }
 
 static inline void uid_hash_remove(struct user_struct *up)
 {
+	list_del(&up->uid_list);
 	list_del(&up->uidhash_list);
 }
 
@@ -97,6 +102,7 @@ struct user_struct * alloc_uid(uid_t uid
 		atomic_set(&new->__count, 1);
 		atomic_set(&new->processes, 0);
 		atomic_set(&new->files, 0);
+		atomic_set(&new->ticks, NICE_TO_PRIO(0));
 
 		/*
 		 * Before adding this, check whether we raced
@@ -146,6 +152,8 @@ static int __init uid_cache_init(void)
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
 		INIT_LIST_HEAD(uidhash_table + n);
+
+	INIT_LIST_HEAD(&user_list);
 
 	/* Insert the root user immediately - init already runs with this */
 	uid_hash_insert(&root_user, uidhashentry(0));

_

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-01 12:51 fairsched + O(1) process scheduler Antonio Vargas
@ 2003-04-01 16:41 ` William Lee Irwin III
  2003-04-01 22:19   ` Antonio Vargas
  2003-04-01 18:33 ` Robert Love
  1 sibling, 1 reply; 21+ messages in thread
From: William Lee Irwin III @ 2003-04-01 16:41 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: linux-kernel

On Tue, Apr 01, 2003 at 02:51:59PM +0200, Antonio Vargas wrote:
+
+	if(fairsched){
+		/* special processing for per-user fair scheduler */
+	}

I suspect something more needs to happen there. =)

I'd recommend a different approach, i.e. stratifying the queue into a
user top level and a task bottom level. The state is relatively well
encapsulated so it shouldn't be that far out.


-- wli

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-01 12:51 fairsched + O(1) process scheduler Antonio Vargas
  2003-04-01 16:41 ` William Lee Irwin III
@ 2003-04-01 18:33 ` Robert Love
  1 sibling, 0 replies; 21+ messages in thread
From: Robert Love @ 2003-04-01 18:33 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: linux-kernel

On Tue, 2003-04-01 at 07:51, Antonio Vargas wrote:

> +	if(fairsched){
> +		/* special processing for per-user fair scheduler */
> +	}

Cute.

	Robert Love


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-01 16:41 ` William Lee Irwin III
@ 2003-04-01 22:19   ` Antonio Vargas
  2003-04-02 12:46     ` Antonio Vargas
       [not found]     ` <200304021144.21924.frankeh@watson.ibm.com>
  0 siblings, 2 replies; 21+ messages in thread
From: Antonio Vargas @ 2003-04-01 22:19 UTC (permalink / raw)
  To: William Lee Irwin III, Antonio Vargas, linux-kernel, Robert Love

On Tue, Apr 01, 2003 at 08:41:26AM -0800, William Lee Irwin III wrote:
> On Tue, Apr 01, 2003 at 02:51:59PM +0200, Antonio Vargas wrote:
> +
> +	if(fairsched){
> +		/* special processing for per-user fair scheduler */
> +	}
> 
> I suspect something more needs to happen there. =)

 :) I haven't even compiled with this patch, I'm just trying
to get around my ideas and thus I posted so that:

a. We had an off-site backup

b. People with experience could shout out loud if they saw
   some big-time silliness.

> I'd recommend a different approach, i.e. stratifying the queue into a
> user top level and a task bottom level. The state is relatively well
> encapsulated so it shouldn't be that far out.
> 
> 
> -- wli

I suspect you mean the scheduler runqueues?

My initial idea runs as follows:

a. On each array switch, we add some fixed value
   to user->ticks. (HZ perhaps?)

b. When adding, we cap at some maximum value.
   (2*HZ perhaps?)

c. On each timer tick, we decrement current->user->ticks.

d. When decrementing, if it's below zero we just
   end this thread. (user has expired)

e. When switching threads, we take the first one that belongs
   to a non-expired user. I think this can be done by sending
   the user-expired threads to the expired array, just like
   when they expire themselves.

I'll try to code this tonight, so I'll post later on
if I'm lucky.

I think this needs much less complexity for a first version,
but I would try what you propose if I get intimate enough
with the scheduler.

Greets, Antonio.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-01 22:19   ` Antonio Vargas
@ 2003-04-02 12:46     ` Antonio Vargas
  2003-04-02 16:35       ` William Lee Irwin III
       [not found]     ` <200304021144.21924.frankeh@watson.ibm.com>
  1 sibling, 1 reply; 21+ messages in thread
From: Antonio Vargas @ 2003-04-02 12:46 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: William Lee Irwin III, linux-kernel, Robert Love

[-- Attachment #1: Type: text/plain, Size: 1864 bytes --]

On Wed, Apr 02, 2003 at 12:19:27AM +0200, Antonio Vargas wrote:
> On Tue, Apr 01, 2003 at 08:41:26AM -0800, William Lee Irwin III wrote:
> > On Tue, Apr 01, 2003 at 02:51:59PM +0200, Antonio Vargas wrote:
> > +
> > +	if(fairsched){
> > +		/* special processing for per-user fair scheduler */
> > +	}
> > 
> > I suspect something more needs to happen there. =)
> 
>  :) I haven't even compiled with this patch, I'm just trying
> to get around my ideas and thus I posted so that:
> 
> a. We had an off-site backup
> 
> b. People with experience could shout out loud if they saw
>    some big-time silliness.
> 
> > I'd recommend a different approach, i.e. stratifying the queue into a
> > user top level and a task bottom level. The state is relatively well
> > encapsulated so it shouldn't be that far out.
> > 
> > 
> > -- wli
> 
> I suspect you mean the scheduler runqueues?
> 
> My initial idea runs as follows:
> 
> a. On each array switch, we add some fixed value
>    to user->ticks. (HZ perhaps?)
> 
> b. When adding, we cap at some maximum value.
>    (2*HZ perhaps?)
> 
> c. On each timer tick, we decrement current->user->ticks.
> 
> d. When decrementing, if it's below zero we just
>    end this thread. (user has expired)
> 
> e. When switching threads, we take the first one that belongs
>    to a non-expired user. I think this can be done by sending
>    the user-expired threads to the expired array, just like
>    when they expire themselves.
> 
> I'll try to code this tonight, so I'll post later on
> if I'm lucky.
> 
> I think this needs much less complexity for a first version,
> but I would try what you propose if I get intimate enough
> with the scheduler.
> 
> Greets, Antonio.

Ok, second try...

I have a doubt, do we need any locking for struct user when reading/writing
his priority or when surfing the user_list?

Greets, Antnonio.


[-- Attachment #2: fairsched-A1.patch --]
[-- Type: text/plain, Size: 7656 bytes --]



This patch is a forward port of the fairsched-2.4.19 patch by Rik van Riel, which
ensures all competing users get an equal share of cpu time.

Since the earlier patch applied to the classic O(n) process scheduler,
and this one applies to the standard 2.5 O(1) one, the mapping isn't
one-to-one but rather more complex.

Original 2.4.19 version:       Rik van Riel, riel@surriel.com
Forward ported 2.5.66 version: Antonio Vargas, wind@cocodriloo.com

A0: initial layout for code and data
A1: introduce runqueue handling related to per-user cpu share


 include/linux/sched.h  |    6 +++
 include/linux/sysctl.h |    2 +
 kernel/sched.c         |   78 +++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sysctl.c        |    6 +++
 kernel/user.c          |   10 +++++-
 5 files changed, 99 insertions(+), 3 deletions(-)

diff -puN include/linux/sched.h~fairsched-A1 include/linux/sched.h
--- 25/include/linux/sched.h~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/include/linux/sched.h	Wed Apr  2 13:17:44 2003
@@ -280,6 +280,12 @@ struct user_struct {
 	/* Hash table maintenance information */
 	struct list_head uidhash_list;
 	uid_t uid;
+
+	/* List maintenance information */
+	struct list_head uid_list;
+
+	/* Per-user timeslice management */
+	unsigned int time_slice;
 };
 
 #define get_current_user() ({ 				\
diff -puN include/linux/sysctl.h~fairsched-A1 include/linux/sysctl.h
--- 25/include/linux/sysctl.h~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/include/linux/sysctl.h	Wed Apr  2 13:17:44 2003
@@ -129,6 +129,8 @@ enum
 	KERN_CADPID=54,		/* int: PID of the process to notify on CAD */
 	KERN_PIDMAX=55,		/* int: PID # limit */
   	KERN_CORE_PATTERN=56,	/* string: pattern for core-file names */
+	KERN_FAIRSCHED=57,		/* turn per-user fair cpu scheduler on/off */
+	KERN_MAX_TIMESLICE_USER=58,	/* maximum user timeslice */
 };
 
 
diff -puN kernel/sched.c~fairsched-A1 kernel/sched.c
--- 25/kernel/sched.c~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/kernel/sched.c	Wed Apr  2 14:02:29 2003
@@ -33,6 +33,13 @@
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
 
+/*
+ * turn per-user fair scheduler on/off
+ */
+
+int fairsched = 0;
+
+
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
 #else
@@ -75,6 +82,9 @@
 #define STARVATION_LIMIT	(10*HZ)
 #define NODE_THRESHOLD		125
 
+int max_timeslice_user = (200 * HZ) / 100;
+#define MAX_TIMESLICE_USER	(max_timeslice_user)
+
 /*
  * If a task is 'interactive' then we reinsert it in the active
  * array after it has expired its current timeslice. (it will not
@@ -317,6 +327,10 @@ static int effective_prio(task_t *p)
 	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
 			MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
 
+	if(fairsched){
+		/* special processing for per-user fair scheduler */
+	}
+
 	prio = p->static_prio - bonus;
 	if (prio < MAX_RT_PRIO)
 		prio = MAX_RT_PRIO;
@@ -1194,6 +1208,8 @@ void scheduler_tick(int user_ticks, int 
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
 
+	unsigned int user_time_slice;
+
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_ticks);
 
@@ -1246,14 +1262,24 @@ void scheduler_tick(int user_ticks, int 
 		}
 		goto out;
 	}
-	if (!--p->time_slice) {
+
+	/* Update per-user timeslice */
+	if(fairsched) {
+		if(p->user->time_slice > 0)
+			--p->user->time_slice;
+		user_time_slice = p->user->time_slice;
+	}else
+		user_time_slice = 1;
+
+	/* Update per-process timeslice and queue the task when used up */
+	if (!--p->time_slice || !user_time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
 		p->prio = effective_prio(p);
 		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
 
-		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq) || !user_time_slice) {
 			if (!rq->expired_timestamp)
 				rq->expired_timestamp = jiffies;
 			enqueue_task(p, rq->expired);
@@ -1268,6 +1294,38 @@ out:
 void scheduling_functions_start_here(void) { }
 
 /*
+ * Gime more timeslices to users on each array switch
+ */
+static inline void update_user_timeslices(void)
+{
+	extern struct list_head user_list;
+	struct list_head *entry;
+	struct user_struct *user;
+	unsigned int user_time_slice;
+
+	if(!fairsched) return;
+
+	list_for_each(entry, &user_list) {
+		user = list_entry(entry, struct user_struct, uid_list);
+
+		if(!user) continue;
+
+		if(0){
+			user_time_slice = user->time_slice;
+
+			user_time_slice += HZ;
+			if(user_time_slice > MAX_TIMESLICE_USER)
+				user_time_slice = MAX_TIMESLICE_USER;
+
+			if(user->time_slice != user_time_slice)
+				user->time_slice = user_time_slice;
+		}else{
+			user->time_slice += HZ;
+		}
+	}
+}
+
+/*
  * schedule() is the main scheduler function.
  */
 asmlinkage void schedule(void)
@@ -1339,11 +1397,27 @@ pick_next_task:
 		rq->expired = array;
 		array = rq->active;
 		rq->expired_timestamp = 0;
+
+		/*
+		 * Give new timeslices to users
+		 */
+		update_user_timeslices();
 	}
 
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
+
+	/*
+	 * If next task' user has used all his timeslice,
+	 * send his task to the expired array.
+	 */
+
+	if(fairsched && !next->user->time_slice){
+		dequeue_task(next, array);
+		enqueue_task(next, rq->expired);
+		goto pick_next_task;
+	}
 
 switch_tasks:
 	prefetch(next);
diff -puN kernel/sysctl.c~fairsched-A1 kernel/sysctl.c
--- 25/kernel/sysctl.c~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/kernel/sysctl.c	Wed Apr  2 13:17:44 2003
@@ -57,6 +57,8 @@ extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
+extern int fairsched;
+extern int max_timeslice_user;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -263,6 +265,10 @@ static ctl_table kern_table[] = {
 #endif
 	{KERN_PIDMAX, "pid_max", &pid_max, sizeof (int),
 	 0600, NULL, &proc_dointvec},
+	{KERN_FAIRSCHED, "fairsched", &pid_max, sizeof (int),
+	 0644, NULL, &proc_dointvec},
+	{KERN_MAX_TIMESLICE_USER, "max_timeslice_user", &pid_max, sizeof (int),
+	 0644, NULL, &proc_dointvec},
 	{0}
 };
 
diff -puN kernel/user.c~fairsched-A1 kernel/user.c
--- 25/kernel/user.c~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/kernel/user.c	Wed Apr  2 13:17:44 2003
@@ -27,10 +27,13 @@ static kmem_cache_t *uid_cachep;
 static struct list_head uidhash_table[UIDHASH_SZ];
 static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
 
+struct list_head user_list;
+
 struct user_struct root_user = {
 	.__count	= ATOMIC_INIT(1),
 	.processes	= ATOMIC_INIT(1),
-	.files		= ATOMIC_INIT(0)
+	.files		= ATOMIC_INIT(0),
+	.time_slice	= 1
 };
 
 /*
@@ -39,10 +42,12 @@ struct user_struct root_user = {
 static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
 {
 	list_add(&up->uidhash_list, hashent);
+	list_add(&up->uid_list, &user_list);
 }
 
 static inline void uid_hash_remove(struct user_struct *up)
 {
+	list_del(&up->uid_list);
 	list_del(&up->uidhash_list);
 }
 
@@ -97,6 +102,7 @@ struct user_struct * alloc_uid(uid_t uid
 		atomic_set(&new->__count, 1);
 		atomic_set(&new->processes, 0);
 		atomic_set(&new->files, 0);
+		new->time_slice = 1;
 
 		/*
 		 * Before adding this, check whether we raced
@@ -146,6 +152,8 @@ static int __init uid_cache_init(void)
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
 		INIT_LIST_HEAD(uidhash_table + n);
+
+	INIT_LIST_HEAD(&user_list);
 
 	/* Insert the root user immediately - init already runs with this */
 	uid_hash_insert(&root_user, uidhashentry(0));

_

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-02 12:46     ` Antonio Vargas
@ 2003-04-02 16:35       ` William Lee Irwin III
  2003-04-02 21:36         ` Antonio Vargas
  0 siblings, 1 reply; 21+ messages in thread
From: William Lee Irwin III @ 2003-04-02 16:35 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: linux-kernel, Robert Love

On Wed, Apr 02, 2003 at 02:46:43PM +0200, Antonio Vargas wrote:
+static inline void update_user_timeslices(void)
...
+	list_for_each(entry, &user_list) {
+		user = list_entry(entry, struct user_struct, uid_list);
+
+		if(!user) continue;
+
+		if(0){
+			user_time_slice = user->time_slice;

Hmm, this looks very O(n)... BTW, doesn't uidhash_lock lock user_list?


On Wed, Apr 02, 2003 at 02:46:43PM +0200, Antonio Vargas wrote:
> @@ -39,10 +42,12 @@ struct user_struct root_user = {
>  static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
>  {
>  	list_add(&up->uidhash_list, hashent);
> +	list_add(&up->uid_list, &user_list);
>  }

Okay, there are three or four problems:

(1) uidhash_lock can't be taken in interrupt context
(2) you aren't taking uidhash_lock at all in update_user_timeslices()
(3) you're not actually handing out user timeslices due to an if (0)
(4) walking user_list is O(n)


-- wli

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-02 21:36         ` Antonio Vargas
@ 2003-04-02 21:35           ` Robert Love
  2003-04-02 22:07             ` Antonio Vargas
  0 siblings, 1 reply; 21+ messages in thread
From: Robert Love @ 2003-04-02 21:35 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: William Lee Irwin III, linux-kernel

On Wed, 2003-04-02 at 16:36, Antonio Vargas wrote:

> I've been thinking about this thing a while ago, and I think I could do this:
> 
> a. Have a kernel thread which wakes up on each tick.

Why not use the timer tick itself?  It already calls scheduler_tick()...

Oh, because you need to grab uidhash_lock?  Ew.  Needing a kernel thread
for this is not pretty.

> Also, this locking rule means I can't even read current->user->time_slice?
> What if I changed the type to an atomic_int?

You can always read a single word-sized type atomically.  No need for
atomic_t's.

	Robert Love


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-02 16:35       ` William Lee Irwin III
@ 2003-04-02 21:36         ` Antonio Vargas
  2003-04-02 21:35           ` Robert Love
  0 siblings, 1 reply; 21+ messages in thread
From: Antonio Vargas @ 2003-04-02 21:36 UTC (permalink / raw)
  To: William Lee Irwin III, Antonio Vargas, linux-kernel, Robert Love

On Wed, Apr 02, 2003 at 08:35:12AM -0800, William Lee Irwin III wrote:
> On Wed, Apr 02, 2003 at 02:46:43PM +0200, Antonio Vargas wrote:
> +static inline void update_user_timeslices(void)
> ...
> +	list_for_each(entry, &user_list) {
> +		user = list_entry(entry, struct user_struct, uid_list);
> +
> +		if(!user) continue;
> +
> +		if(0){
> +			user_time_slice = user->time_slice;
> 
> Hmm, this looks very O(n)... BTW, doesn't uidhash_lock lock user_list?
> 
> 
> On Wed, Apr 02, 2003 at 02:46:43PM +0200, Antonio Vargas wrote:
> > @@ -39,10 +42,12 @@ struct user_struct root_user = {
> >  static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
> >  {
> >  	list_add(&up->uidhash_list, hashent);
> > +	list_add(&up->uid_list, &user_list);
> >  }
> 
> Okay, there are three or four problems:
> 
> (1) uidhash_lock can't be taken in interrupt context
> (2) you aren't taking uidhash_lock at all in update_user_timeslices()
> (3) you're not actually handing out user timeslices due to an if (0)
> (4) walking user_list is O(n)

Yes I know there are problems, I'm just trying to make it run compile and run :)

I've been thinking about this thing a while ago, and I think I could do this:

a. Have a kernel thread which wakes up on each tick.

b. The thread just takes the first user_truct from the list, adds one tick to his
   timeslice and sends it to the back. This makes the thread giveone
   tick to each user in turn and slowly, instead of trying to give all ticks
   at the same time.

Now, I don't know too much about the locking rules, but I think I could
send a signal, semaphore o spinlock which wakes the thread from the
scheduler_tick(), and the thread could take the uidhash_lock for the update.

Also, this locking rule means I can't even read current->user->time_slice?
What if I changed the type to an atomic_int?

Greets, Antonio.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-02 21:35           ` Robert Love
@ 2003-04-02 22:07             ` Antonio Vargas
  2003-04-02 22:10               ` Robert Love
  0 siblings, 1 reply; 21+ messages in thread
From: Antonio Vargas @ 2003-04-02 22:07 UTC (permalink / raw)
  To: Robert Love; +Cc: Antonio Vargas, William Lee Irwin III, linux-kernel

On Wed, Apr 02, 2003 at 04:35:00PM -0500, Robert Love wrote:
> On Wed, 2003-04-02 at 16:36, Antonio Vargas wrote:
> 
> > I've been thinking about this thing a while ago, and I think I could do this:
> > 
> > a. Have a kernel thread which wakes up on each tick.
> 
> Why not use the timer tick itself?  It already calls scheduler_tick()...
> 
> Oh, because you need to grab uidhash_lock?  Ew.  Needing a kernel thread
> for this is not pretty.

Hmmm, we had some way for executing code just after an interrupt,
but outside interrupt scope... was it a bottom half? Can you
point me to some place where it's done?
 
> > Also, this locking rule means I can't even read current->user->time_slice?
> > What if I changed the type to an atomic_int?
> 
> You can always read a single word-sized type atomically.  No need for
> atomic_t's.

Ok, I did know m68k can do it, but wasn't sure about all other arches :)

Btw, I'm testing the patch using UML and besides I don't have any SMP
machine, hope any of you can test it when it looks good :)

Greets, Antonio

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-02 22:07             ` Antonio Vargas
@ 2003-04-02 22:10               ` Robert Love
  2003-04-02 22:35                 ` Antonio Vargas
  2003-04-03 17:15                 ` Corey Minyard
  0 siblings, 2 replies; 21+ messages in thread
From: Robert Love @ 2003-04-02 22:10 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: William Lee Irwin III, linux-kernel

On Wed, 2003-04-02 at 17:07, Antonio Vargas wrote:

> Hmmm, we had some way for executing code just after an interrupt,
> but outside interrupt scope... was it a bottom half? Can you
> point me to some place where it's done?

Unfortunately uidhash_lock cannot be used from a bottom half either.

You can push it into a work queue.  See schedule_work() and the default
events queue.

> Ok, I did know m68k can do it, but wasn't sure about all other arches :)

Yep.  Everyone architecture I know of - and certainly all that Linux
support - can do atomic read/writes to a word.  Thinking about it, it
would be odd if not (two writes to a single word interleaving?).  There
are places this assumption is used.

Anything more complicated, of course, needs atomic operations or locks.

	Robert Love


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-02 22:10               ` Robert Love
@ 2003-04-02 22:35                 ` Antonio Vargas
  2003-04-03 17:15                 ` Corey Minyard
  1 sibling, 0 replies; 21+ messages in thread
From: Antonio Vargas @ 2003-04-02 22:35 UTC (permalink / raw)
  To: Robert Love; +Cc: Antonio Vargas, William Lee Irwin III, linux-kernel

On Wed, Apr 02, 2003 at 05:10:27PM -0500, Robert Love wrote:
> On Wed, 2003-04-02 at 17:07, Antonio Vargas wrote:
> 
> > Hmmm, we had some way for executing code just after an interrupt,
> > but outside interrupt scope... was it a bottom half? Can you
> > point me to some place where it's done?
> 
> Unfortunately uidhash_lock cannot be used from a bottom half either.
> 
> You can push it into a work queue.  See schedule_work() and the default
> events queue.

I'll take a look.

> > Ok, I did know m68k can do it, but wasn't sure about all other arches :)
> 
> Yep.  Everyone architecture I know of - and certainly all that Linux
> support - can do atomic read/writes to a word.  Thinking about it, it
> would be odd if not (two writes to a single word interleaving?).  There
> are places this assumption is used.
> 
> Anything more complicated, of course, needs atomic operations or locks.

"word == basic unit of transfer from/to memory" on your reasoning, I suppose.

I'm coding this now, hope I can post a basic implementation.

Greets, Antonio.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
       [not found]     ` <200304021144.21924.frankeh@watson.ibm.com>
@ 2003-04-03 12:53       ` Antonio Vargas
  2003-04-03 19:22         ` William Lee Irwin III
  0 siblings, 1 reply; 21+ messages in thread
From: Antonio Vargas @ 2003-04-03 12:53 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Antonio Vargas, William Lee Irwin III, linux-kernel, Robert Love

[-- Attachment #1: Type: text/plain, Size: 2680 bytes --]

On Wed, Apr 02, 2003 at 11:44:21AM -0500, Hubertus Franke wrote:
> On Tuesday 01 April 2003 17:19, Antonio Vargas wrote:
> > On Tue, Apr 01, 2003 at 08:41:26AM -0800, William Lee Irwin III wrote:
> > > On Tue, Apr 01, 2003 at 02:51:59PM +0200, Antonio Vargas wrote:
> 
> To add to my previous reply...
> In your approach you might have a problem with starvation.
> High priority processes simply can eat all the ticks allocated to 
> a user and when lower priority tasks pop up at the top of
> the active list they are move to the inactive list. This could happen
> over and over again, thus starving those tasks.

Sorry for not answering sooner, but I had my mail pointing at
the lkml instead of my personal email.

Last night I continued coding up but I think I've it a deadlock:
it seems the system stops calling schedule_tick() when there are
no more tasks to run, and it's there where I use a workqueue so
that I can update the user timeslices out of interrupt context.

I think that because if I put printk's on my code, it continues to
run OK, but if I remove them it freezes hard.

About giving ticks to users, I've got an idea: assuming there are N
total processes in the system and an user has got M processes, we should
give N * max_timeslice ticks to each user every M * max_timeslice *
num_users ticks. I've been thinking about it and it seems it would
balance the ticks correctly.

About the starvation for low-priority processes, I can get and
example.. assume user A has process X Y and user B has process Z,
and max_timeslice = 2:

max_timeslice = 2 and 4 total processes ==>
give 2 * 3 = 6 ticks to user A every 2 * 2 * 2 =  8 ticks
give 2 * 3 = 6 ticks to user B every 1 * 2 * 2 =  4 ticks

running		revised		nr. of ticks for each
task		user		for each entity after executing

			A	B	X	Y	Z
-		-	6	6	2	2	2
X		A1	5	6	1	2	2
X		B1	4	6	0	2	2
Y		A2	3	6	0	1	2
Y		B2	2	6	0	0	2
Z		A3	2	5	0	0	1
Z		B3	2	4	0	0	0

reset_tasks	-	2	4	2	2	2

X		A4	1	4	1	2	2
X		B4	0	4	0	2	2

reset_user	B4	0	10	0	2	2

Z		A5	0	9	0	2	1
Z		B1	0	8	0	2	0

reset_tasks	-	0	8	2	2	2

Z		A6	0	7	2	2	1
Z		B2	0	6	2	2	0

reset_tasks	-	0	6	2	2	2

Z		A7	0	5	2	2	1
Z		B3	0	4	2	2	0

reset_tasks	-	0	4	2	2	2

Z		A8	0	3	2	2	1

reset_user	A8	6	3	2	2	1

Z		B4	6	2	2	2	0

reset_user	B4	6	8	2	2	0

A		A1	5	8	1	2	0
A		B1	4	8	0	2	0
B		A2	3	8	0	1	0
B		B2	2	8	0	0	0

reset_tasks	-	2	8	2	2	2

Z		A3	2	7	2	2	1
Z		B3	2	6	2	2	0
A		A4	1	6	1	2	0
A		B4	0	6	0	2	0

... ad infinitum ...

probably getting a graph out of this would be good :)

I will attach my current not-so-working patch and will try
this scheme later at night.

Note there are some spourius calls to run my workqueue due to debugging
tests.

Greets, Antonio.


[-- Attachment #2: fairsched-A2.patch~uml-2.5.66-1 --]
[-- Type: text/plain, Size: 8032 bytes --]



This patch is a forward port of the fairsched-2.4.19 patch by Rik van Riel, which
ensures all competing users get an equal share of cpu time.

Since the earlier patch applied to the classic O(n) process scheduler,
and this one applies to the standard 2.5 O(1) one, the mapping isn't
one-to-one but rather more complex.

Original 2.4.19 version:       Rik van Riel, riel@surriel.com
Forward ported 2.5.66 version: Antonio Vargas, wind@cocodriloo.com

A0: initial layout for code and data
A1: introduce runqueue handling related to per-user cpu share


 include/linux/sched.h  |    6 +++
 include/linux/sysctl.h |    2 +
 kernel/sched.c         |   97 ++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sysctl.c        |    6 +++
 kernel/user.c          |   10 ++++-
 5 files changed, 119 insertions(+), 2 deletions(-)

diff -puN include/linux/sched.h~fairsched-A1 include/linux/sched.h
--- 25/include/linux/sched.h~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/include/linux/sched.h	Wed Apr  2 13:17:44 2003
@@ -280,6 +280,12 @@ struct user_struct {
 	/* Hash table maintenance information */
 	struct list_head uidhash_list;
 	uid_t uid;
+
+	/* List maintenance information */
+	struct list_head uid_list;
+
+	/* Per-user timeslice management */
+	unsigned int time_slice;
 };
 
 #define get_current_user() ({ 				\
diff -puN include/linux/sysctl.h~fairsched-A1 include/linux/sysctl.h
--- 25/include/linux/sysctl.h~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/include/linux/sysctl.h	Wed Apr  2 13:17:44 2003
@@ -129,6 +129,8 @@ enum
 	KERN_CADPID=54,		/* int: PID of the process to notify on CAD */
 	KERN_PIDMAX=55,		/* int: PID # limit */
   	KERN_CORE_PATTERN=56,	/* string: pattern for core-file names */
+	KERN_FAIRSCHED=57,		/* turn per-user fair cpu scheduler on/off */
+	KERN_MAX_TIMESLICE_USER=58,	/* maximum user timeslice */
 };
 
 
diff -puN kernel/sched.c~fairsched-A1 kernel/sched.c
--- 25/kernel/sched.c~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/kernel/sched.c	Thu Apr  3 00:41:26 2003
@@ -33,6 +33,13 @@
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
 
+/*
+ * turn per-user fair scheduler on/off
+ */
+
+int fairsched = 0;
+
+
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
 #else
@@ -75,6 +82,9 @@
 #define STARVATION_LIMIT	(10*HZ)
 #define NODE_THRESHOLD		125
 
+int max_timeslice_user = (200 * HZ) / 100;
+#define MAX_TIMESLICE_USER	(max_timeslice_user)
+
 /*
  * If a task is 'interactive' then we reinsert it in the active
  * array after it has expired its current timeslice. (it will not
@@ -317,6 +327,10 @@ static int effective_prio(task_t *p)
 	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
 			MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
 
+	if(fairsched){
+		/* special processing for per-user fair scheduler */
+	}
+
 	prio = p->static_prio - bonus;
 	if (prio < MAX_RT_PRIO)
 		prio = MAX_RT_PRIO;
@@ -1188,12 +1202,17 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
  * It also gets called by the fork code, when changing the parent's
  * timeslices.
  */
+
+static struct user_struct fairsched_last_user = 0;
+
 void scheduler_tick(int user_ticks, int sys_ticks)
 {
 	int cpu = smp_processor_id();
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
 
+	unsigned int user_time_slice;
+
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_ticks);
 
@@ -1246,7 +1265,16 @@ void scheduler_tick(int user_ticks, int 
 		}
 		goto out;
 	}
-	if (!--p->time_slice) {
+
+	/* Update per-user timeslice */
+	if(fairsched) {
+		user_time_slice = p->user->time_slice;
+		fairsched_last_user = p->user;
+	}else
+		user_time_slice = 1;
+
+	/* Update per-process timeslice and queue the task when used up */
+	if (!--p->time_slice || !user_time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
 		p->prio = effective_prio(p);
@@ -1268,6 +1296,57 @@ out:
 void scheduling_functions_start_here(void) { }
 
 /*
+ * Gime more timeslices to users on each array switch
+ */
+static inline void update_user_timeslices(void)
+{
+	extern struct list_head user_list;
+	struct list_head *entry;
+	struct user_struct *user;
+	unsigned int old_user_time_slice;
+	unsigned int new_user_time_slice;
+
+	if(!fairsched) return;
+
+	/* Take the lock */
+
+	spin_lock(&uidhash_lock);
+
+	/* Decrement user timeslice for last user who got a tick */
+
+	user = fairsched_last_user;
+	if(user)
+		if(user->time_slice > 0)
+			user->time_slice--;
+
+	/* Get head of user list */
+
+	entry = user_list.next;
+	user = list_entry(entry, struct user_struct, uid_list);
+	if(!user)
+		goto out:
+
+	/* Calculate new timeslice for user */
+
+	old_user_time_slice = user->time_slice;
+	new_user_time_slice = old_user_time_slice + 1;
+
+	if(new_user_time_slice > MAX_TIMESLICE_USER)
+		new_user_time_slice = MAX_TIMESLICE_USER;
+
+	if(old_user_time_slice != new_user_time_slice)
+			user->time_slice = new_user_time_slice;
+
+	/* Reinsert at the end of the user list */
+
+	list_del(&user->uid_list);
+	list_add_tail(&user->uid_list,&uid_list);
+
+out:
+	spin_unlock(&uidhash_lock);
+}
+
+/*
  * schedule() is the main scheduler function.
  */
 asmlinkage void schedule(void)
@@ -1339,11 +1418,27 @@ pick_next_task:
 		rq->expired = array;
 		array = rq->active;
 		rq->expired_timestamp = 0;
+
+		/*
+		 * Give new timeslices to users
+		 */
+		update_user_timeslices();
 	}
 
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
+
+	/*
+	 * If next task' user has used all his timeslice,
+	 * send his task to the expired array.
+	 */
+
+	if(fairsched && !next->user->time_slice){
+		dequeue_task(next, array);
+		enqueue_task(next, rq->expired);
+		goto pick_next_task;
+	}
 
 switch_tasks:
 	prefetch(next);
diff -puN kernel/sysctl.c~fairsched-A1 kernel/sysctl.c
--- 25/kernel/sysctl.c~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/kernel/sysctl.c	Wed Apr  2 23:15:00 2003
@@ -57,6 +57,8 @@ extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
+extern int fairsched;
+extern int max_timeslice_user;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -263,6 +265,10 @@ static ctl_table kern_table[] = {
 #endif
 	{KERN_PIDMAX, "pid_max", &pid_max, sizeof (int),
 	 0600, NULL, &proc_dointvec},
+	{KERN_FAIRSCHED, "fairsched", &fairsched, sizeof (int),
+	 0644, NULL, &proc_dointvec},
+	{KERN_MAX_TIMESLICE_USER, "max_timeslice_user", &max_timeslice_user, sizeof (int),
+	 0644, NULL, &proc_dointvec},
 	{0}
 };
 
diff -puN kernel/user.c~fairsched-A1 kernel/user.c
--- 25/kernel/user.c~fairsched-A1	Wed Apr  2 13:17:44 2003
+++ 25-wind/kernel/user.c	Wed Apr  2 13:17:44 2003
@@ -27,10 +27,13 @@ static kmem_cache_t *uid_cachep;
 static struct list_head uidhash_table[UIDHASH_SZ];
 static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
 
+struct list_head user_list;
+
 struct user_struct root_user = {
 	.__count	= ATOMIC_INIT(1),
 	.processes	= ATOMIC_INIT(1),
-	.files		= ATOMIC_INIT(0)
+	.files		= ATOMIC_INIT(0),
+	.time_slice	= 1
 };
 
 /*
@@ -39,10 +42,12 @@ struct user_struct root_user = {
 static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
 {
 	list_add(&up->uidhash_list, hashent);
+	list_add(&up->uid_list, &user_list);
 }
 
 static inline void uid_hash_remove(struct user_struct *up)
 {
+	list_del(&up->uid_list);
 	list_del(&up->uidhash_list);
 }
 
@@ -97,6 +102,7 @@ struct user_struct * alloc_uid(uid_t uid
 		atomic_set(&new->__count, 1);
 		atomic_set(&new->processes, 0);
 		atomic_set(&new->files, 0);
+		new->time_slice = 1;
 
 		/*
 		 * Before adding this, check whether we raced
@@ -146,6 +152,8 @@ static int __init uid_cache_init(void)
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
 		INIT_LIST_HEAD(uidhash_table + n);
+
+	INIT_LIST_HEAD(&user_list);
 
 	/* Insert the root user immediately - init already runs with this */
 	uid_hash_insert(&root_user, uidhashentry(0));

_

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-02 22:10               ` Robert Love
  2003-04-02 22:35                 ` Antonio Vargas
@ 2003-04-03 17:15                 ` Corey Minyard
  2003-04-03 18:17                   ` Robert Love
  1 sibling, 1 reply; 21+ messages in thread
From: Corey Minyard @ 2003-04-03 17:15 UTC (permalink / raw)
  To: Robert Love; +Cc: Antonio Vargas, William Lee Irwin III, linux-kernel

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Robert Love wrote:

>Yep.  Everyone architecture I know of - and certainly all that Linux
>support - can do atomic read/writes to a word.  Thinking about it, it
>would be odd if not (two writes to a single word interleaving?).  There
>are places this assumption is used.
>
I believe this is true if the data is aligned.  I don't think it's true
for unaligned access on an x86.

- -Corey
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.0.6 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iD8DBQE+jGwdIXnXXONXERcRAg7NAJ9p3/dhYIvyQrUSR9RQhzOHebJxcQCgnnIt
TX41ng3GQo8SlGXLKmJvSKA=
=wjHE
-----END PGP SIGNATURE-----



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-03 17:15                 ` Corey Minyard
@ 2003-04-03 18:17                   ` Robert Love
  0 siblings, 0 replies; 21+ messages in thread
From: Robert Love @ 2003-04-03 18:17 UTC (permalink / raw)
  To: Corey Minyard; +Cc: Antonio Vargas, William Lee Irwin III, linux-kernel

On Thu, 2003-04-03 at 12:15, Corey Minyard wrote:

> I believe this is true if the data is aligned.  I don't think it's
> true for unaligned access on an x86.

Yes, this is true.  Sorry for not mentioning.

Unless Antonio plays games with pointers and types, though, all his
accesses should be aligned.

	Robert Love


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-03 12:53       ` Antonio Vargas
@ 2003-04-03 19:22         ` William Lee Irwin III
  2003-04-04 11:27           ` Antonio Vargas
  0 siblings, 1 reply; 21+ messages in thread
From: William Lee Irwin III @ 2003-04-03 19:22 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: Hubertus Franke, linux-kernel, Robert Love

On Thu, Apr 03, 2003 at 02:53:55PM +0200, Antonio Vargas wrote:
> Sorry for not answering sooner, but I had my mail pointing at
> the lkml instead of my personal email.
> Last night I continued coding up but I think I've it a deadlock:
> it seems the system stops calling schedule_tick() when there are
> no more tasks to run, and it's there where I use a workqueue so
> that I can update the user timeslices out of interrupt context.
> I think that because if I put printk's on my code, it continues to
> run OK, but if I remove them it freezes hard.

Use spin_lock_irq(&uidhash_lock) or you will deadlock if you hold it
while you take a timer tick, but it's wrong anyway. it's O(N) with
respect to number of users present. An O(1) algorithm could easily
make use of reference counts held by tasks.


On Thu, Apr 03, 2003 at 02:53:55PM +0200, Antonio Vargas wrote:
> About giving ticks to users, I've got an idea: assuming there are N
> total processes in the system and an user has got M processes, we should
> give N * max_timeslice ticks to each user every M * max_timeslice *
> num_users ticks. I've been thinking about it and it seems it would
> balance the ticks correctly.
> About the starvation for low-priority processes, I can get and
> example.. assume user A has process X Y and user B has process Z,
> and max_timeslice = 2:
> max_timeslice = 2 and 4 total processes ==>
> give 2 * 3 = 6 ticks to user A every 2 * 2 * 2 =  8 ticks
> give 2 * 3 = 6 ticks to user B every 1 * 2 * 2 =  4 ticks

This isn't right, when expiration happens needs to be tracked by both
user and task. Basically which tasks are penalized when the user
expiration happens? The prediction is the same set of tasks will always
be the target of the penalty.


-- wli

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-03 19:22         ` William Lee Irwin III
@ 2003-04-04 11:27           ` Antonio Vargas
  2003-04-04 14:04             ` William Lee Irwin III
       [not found]             ` <200304041453.16630.frankeh@watson.ibm.com>
  0 siblings, 2 replies; 21+ messages in thread
From: Antonio Vargas @ 2003-04-04 11:27 UTC (permalink / raw)
  To: William Lee Irwin III, Antonio Vargas, Hubertus Franke,
	linux-kernel, Robert Love

[-- Attachment #1: Type: text/plain, Size: 2884 bytes --]

On Thu, Apr 03, 2003 at 11:22:41AM -0800, William Lee Irwin III wrote:
> On Thu, Apr 03, 2003 at 02:53:55PM +0200, Antonio Vargas wrote:
> > Sorry for not answering sooner, but I had my mail pointing at
> > the lkml instead of my personal email.
> > Last night I continued coding up but I think I've it a deadlock:
> > it seems the system stops calling schedule_tick() when there are
> > no more tasks to run, and it's there where I use a workqueue so
> > that I can update the user timeslices out of interrupt context.
> > I think that because if I put printk's on my code, it continues to
> > run OK, but if I remove them it freezes hard.
> 
> Use spin_lock_irq(&uidhash_lock) or you will deadlock if you hold it
> while you take a timer tick, but it's wrong anyway. it's O(N) with
> respect to number of users present. An O(1) algorithm could easily
> make use of reference counts held by tasks.
> 
> 
> On Thu, Apr 03, 2003 at 02:53:55PM +0200, Antonio Vargas wrote:
> > About giving ticks to users, I've got an idea: assuming there are N
> > total processes in the system and an user has got M processes, we should
> > give N * max_timeslice ticks to each user every M * max_timeslice *
> > num_users ticks. I've been thinking about it and it seems it would
> > balance the ticks correctly.
> > About the starvation for low-priority processes, I can get and
> > example.. assume user A has process X Y and user B has process Z,
> > and max_timeslice = 2:
> > max_timeslice = 2 and 4 total processes ==>
> > give 2 * 3 = 6 ticks to user A every 2 * 2 * 2 =  8 ticks
> > give 2 * 3 = 6 ticks to user B every 1 * 2 * 2 =  4 ticks
> 
> This isn't right, when expiration happens needs to be tracked by both
> user and task. Basically which tasks are penalized when the user
> expiration happens? The prediction is the same set of tasks will always
> be the target of the penalty.

Just out of experimenting, I've coded something that looks reasonable
and would not experience starvation.

In the normal scheduler, a non-interactive task will, when used all his
timeslice, reset p->time_slice and queue onto the expired array. I'm
now reseting p->reserved_time_slice instead and queuing the task
onto a per-user pending task queue.

A separate kernel thread walks the user list, calculates the user timeslice
and distributes it to it's pending tasks. When a task receives timeslices,
it's moved from the per-user pending queue to the expired array of the runqueue,
thus preparing it to run on the next array-switch.

If the user has many timeslices, he can give timeslices to many tasks, thus
getting more work done.

While the implementation may not be good enough, due to locking problems and
the use of a kernel-thread, I think the fundamental algorithm feels right.

William, should I take the lock on the uidhash_list when adding a task
to a per-user task list?

Greets, Antonio.

[-- Attachment #2: fairsched-A3.patch --]
[-- Type: text/plain, Size: 10808 bytes --]



This patch is a forward port of the fairsched-2.4.19 patch by Rik van Riel, which
ensures all competing users get an equal share of cpu time.

Since the earlier patch applied to the classic O(n) process scheduler,
and this one applies to the standard 2.5 O(1) one, the mapping isn't
one-to-one but rather more complex.

Original 2.4.19 version:       Rik van Riel, riel@surriel.com
Forward ported 2.5.66 version: Antonio Vargas, wind@cocodriloo.com

A0: initial layout for code and data
A1: introduce runqueue handling related to per-user cpu share
A2: add proper locking and O(1) behaviour
A3: perform all work with a kernel thread


 include/linux/init_task.h |    2 
 include/linux/sched.h     |   13 ++++
 include/linux/sysctl.h    |    1 
 kernel/fork.c             |    1 
 kernel/sched.c            |   53 +++++++++++++++++++-
 kernel/sysctl.c           |    3 +
 kernel/user.c             |  120 +++++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 191 insertions(+), 2 deletions(-)

diff -puN include/linux/init_task.h~fairsched-A3 include/linux/init_task.h
--- 25/include/linux/init_task.h~fairsched-A3	Fri Apr  4 12:02:44 2003
+++ 25-wind/include/linux/init_task.h	Fri Apr  4 12:02:45 2003
@@ -73,6 +73,8 @@
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
 	.time_slice	= HZ,						\
+	.reserved_time_slice	= HZ,					\
+	.user_tasks	= LIST_HEAD_INIT(tsk.user_tasks),		\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
diff -puN include/linux/sched.h~fairsched-A3 include/linux/sched.h
--- 25/include/linux/sched.h~fairsched-A3	Fri Apr  4 12:02:45 2003
+++ 25-wind/include/linux/sched.h	Fri Apr  4 12:02:45 2003
@@ -280,6 +280,15 @@ struct user_struct {
 	/* Hash table maintenance information */
 	struct list_head uidhash_list;
 	uid_t uid;
+
+	/* User list */
+	struct list_head user_list;
+
+	/* Task list for this user */
+	struct list_head task_list;
+
+	/* Per-user timeslice management */
+	unsigned int time_slice;
 };
 
 #define get_current_user() ({ 				\
@@ -333,7 +342,11 @@ struct task_struct {
 	unsigned long policy;
 	unsigned long cpus_allowed;
 	unsigned int time_slice, first_time_slice;
+	unsigned int reserved_time_slice;
+
+	struct work_struct work_to_user;
 
+	struct list_head user_tasks;
 	struct list_head tasks;
 	struct list_head ptrace_children;
 	struct list_head ptrace_list;
diff -puN include/linux/sysctl.h~fairsched-A3 include/linux/sysctl.h
--- 25/include/linux/sysctl.h~fairsched-A3	Fri Apr  4 12:02:45 2003
+++ 25-wind/include/linux/sysctl.h	Fri Apr  4 12:02:45 2003
@@ -129,6 +129,7 @@ enum
 	KERN_CADPID=54,		/* int: PID of the process to notify on CAD */
 	KERN_PIDMAX=55,		/* int: PID # limit */
   	KERN_CORE_PATTERN=56,	/* string: pattern for core-file names */
+	KERN_FAIRSCHED=57,	/* switch per-user fair cpu scheduler*/
 };
 
 
diff -puN kernel/fork.c~fairsched-A3 kernel/fork.c
--- 25/kernel/fork.c~fairsched-A3	Fri Apr  4 12:02:45 2003
+++ 25-wind/kernel/fork.c	Fri Apr  4 12:02:45 2003
@@ -917,6 +917,7 @@ static struct task_struct *copy_process(
 	 * the parent if the child exits early enough.
 	 */
 	p->first_time_slice = 1;
+	p->reserved_time_slice = 0;
 	current->time_slice >>= 1;
 	p->last_run = jiffies;
 	if (!current->time_slice) {
diff -puN kernel/sched.c~fairsched-A3 kernel/sched.c
--- 25/kernel/sched.c~fairsched-A3	Fri Apr  4 12:02:45 2003
+++ 25-wind/kernel/sched.c	Fri Apr  4 13:23:07 2003
@@ -33,6 +33,11 @@
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
 
+/*
+ * Switch per-user fair scheduler on/off
+ */
+int fairsched = 0;
+
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
 #else
@@ -1181,6 +1186,18 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
 		(jiffies - (rq)->expired_timestamp >= \
 			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
 
+extern void add_task_to_user(void *);
+
+void user_task_to_expired(task_t *p)
+{
+	runqueue_t *rq = this_rq();
+
+	spin_lock_irq(&rq->lock);
+	list_del(&p->user_tasks);
+	enqueue_task(p, rq->expired);
+	spin_unlock_irq(&rq->lock);
+}
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -1250,15 +1267,43 @@ void scheduler_tick(int user_ticks, int 
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
 		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
 
+		/*
+		 * Never apply fairsched to root user
+		 */
+		if(p->user->uid && fairsched) {
+			/*
+			 * If not interactive, reserve the timeslice
+			 * and queue the task to the user
+			 */
+			if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+				p->reserved_time_slice += task_timeslice(p);
+				if (!rq->expired_timestamp)
+					rq->expired_timestamp = jiffies;
+				INIT_WORK(&p->work_to_user, add_task_to_user, p);
+				schedule_work(&p->work_to_user);
+			} else {
+				/*
+				 * Interactive work isn't altered by fairsched
+				 */
+				p->time_slice += task_timeslice(p);
+				enqueue_task(p, rq->active);
+			}
+			goto did_fairsched;
+		}
+
+		/*
+		 * Do usual work when root user or fairsched is disabled
+		 */
+		p->time_slice += task_timeslice(p);
 		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
 			if (!rq->expired_timestamp)
 				rq->expired_timestamp = jiffies;
 			enqueue_task(p, rq->expired);
 		} else
 			enqueue_task(p, rq->active);
+did_fairsched:
 	}
 out:
 	spin_unlock(&rq->lock);
@@ -1344,6 +1389,12 @@ pick_next_task:
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
+
+	if(!next->time_slice){
+		dequeue_task(next, array);
+		enqueue_task(next, rq->expired);
+		goto pick_next_task;
+	}
 
 switch_tasks:
 	prefetch(next);
diff -puN kernel/sysctl.c~fairsched-A3 kernel/sysctl.c
--- 25/kernel/sysctl.c~fairsched-A3	Fri Apr  4 12:02:45 2003
+++ 25-wind/kernel/sysctl.c	Fri Apr  4 12:02:45 2003
@@ -57,6 +57,7 @@ extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
+extern int fairsched;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -263,6 +264,8 @@ static ctl_table kern_table[] = {
 #endif
 	{KERN_PIDMAX, "pid_max", &pid_max, sizeof (int),
 	 0600, NULL, &proc_dointvec},
+	{KERN_FAIRSCHED, "fairsched", &fairsched, sizeof (int),
+	 0644, NULL, &proc_dointvec},
 	{0}
 };
 
diff -puN kernel/user.c~fairsched-A3 kernel/user.c
--- 25/kernel/user.c~fairsched-A3	Fri Apr  4 12:02:45 2003
+++ 25-wind/kernel/user.c	Fri Apr  4 13:04:30 2003
@@ -13,6 +13,8 @@
 #include <linux/slab.h>
 #include <linux/bitops.h>
 
+#include <asm/uaccess.h>
+
 /*
  * UID task count cache, to get fast user lookup in "alloc_uid"
  * when changing user ID's (ie setuid() and friends).
@@ -26,11 +28,13 @@
 static kmem_cache_t *uid_cachep;
 static struct list_head uidhash_table[UIDHASH_SZ];
 static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
+struct list_head user_list;
 
 struct user_struct root_user = {
 	.__count	= ATOMIC_INIT(1),
 	.processes	= ATOMIC_INIT(1),
-	.files		= ATOMIC_INIT(0)
+	.files		= ATOMIC_INIT(0),
+	.time_slice	= 1
 };
 
 /*
@@ -39,10 +43,12 @@ struct user_struct root_user = {
 static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
 {
 	list_add(&up->uidhash_list, hashent);
+	list_add(&up->user_list, &user_list);
 }
 
 static inline void uid_hash_remove(struct user_struct *up)
 {
+	list_del(&up->user_list);
 	list_del(&up->uidhash_list);
 }
 
@@ -97,6 +103,10 @@ struct user_struct * alloc_uid(uid_t uid
 		atomic_set(&new->__count, 1);
 		atomic_set(&new->processes, 0);
 		atomic_set(&new->files, 0);
+		new->time_slice = 1;
+
+		INIT_LIST_HEAD(&new->user_list);
+		INIT_LIST_HEAD(&new->task_list);
 
 		/*
 		 * Before adding this, check whether we raced
@@ -134,6 +144,105 @@ void switch_uid(struct user_struct *new_
 }
 
 
+extern void user_task_to_expired(task_t *);
+
+/*
+ * fairsched_thread - this is a highprio system thread that gives
+ * out timeslices to users and passes user' timeslices to tasks
+ */
+static int fairsched_thread(void *data)
+{
+	int cpu = (long) data;
+	extern int fairsched;
+
+	daemonize("fairsched/%d", cpu);
+	set_fs(KERNEL_DS);
+
+	for(;;) {
+		struct user_struct *u;
+		task_t *p;
+
+		int num_tasks;
+
+		/* Wake up every 4 ticks */
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(4);
+
+		while(!fairsched){
+			/* Sleep until we are switched on */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(HZ);
+		}
+
+		/* Get user */
+		spin_lock(&uidhash_lock);
+		u = list_entry(&user_list, struct user_struct, user_list);
+		if(!u) goto no_user;
+		atomic_inc(&u->__count);
+		spin_unlock(&uidhash_lock);
+
+		if(u == &root_user)
+			goto root_user;
+
+		/*
+		 * Update user time slice count
+		 */
+		u->time_slice += 4 * nr_running();
+		num_tasks = 1 + atomic_read(&u->processes);
+
+		read_lock(&tasklist_lock);
+
+		while(u->time_slice && --num_tasks) {
+			int giveout;
+
+			p = list_entry(&u->task_list, task_t, user_tasks);
+			if(!p) goto end_user;
+
+			/* Get timeslices from user and task reserve */
+			giveout = min(u->time_slice, p->reserved_time_slice);
+			if(giveout) {
+				/*
+				 * Give the timeslice to the task,
+				 * and queue the task into the expired array.
+				 */
+				u->time_slice -= giveout;
+				p->reserved_time_slice -= giveout;
+				p->time_slice += giveout;
+
+				user_task_to_expired(p);
+			} else {
+				/* Round-robin per-user task list */
+				list_del(&p->user_tasks);
+				list_add_tail(&p->user_tasks, &u->task_list);
+			}
+		}
+
+end_user:
+		read_unlock(&tasklist_lock);
+root_user:
+		spin_lock(&uidhash_lock);
+
+		/* Round-robin user list */
+		list_del(&u->uidhash_list);
+		list_add_tail(&u->user_list, &user_list);
+		atomic_dec(&u->__count);
+no_user:
+		spin_unlock(&uidhash_lock);
+	}
+}
+
+void add_task_to_user(void *data)
+{
+	task_t *p = data;
+
+	/* add task to user's expired task list */
+
+	spin_lock(&uidhash_lock);
+	INIT_LIST_HEAD(&p->user_tasks);
+	list_add_tail(&p->user_tasks,&p->user->task_list);
+	spin_unlock(&uidhash_lock);
+}
+
 static int __init uid_cache_init(void)
 {
 	int n;
@@ -147,8 +256,17 @@ static int __init uid_cache_init(void)
 	for(n = 0; n < UIDHASH_SZ; ++n)
 		INIT_LIST_HEAD(uidhash_table + n);
 
+	INIT_LIST_HEAD(&user_list);
+	INIT_LIST_HEAD(&root_user.user_list);
+	INIT_LIST_HEAD(&root_user.task_list);
+
+	list_add(&root_user.user_list, &user_list);
+
 	/* Insert the root user immediately - init already runs with this */
 	uid_hash_insert(&root_user, uidhashentry(0));
+
+	kernel_thread(fairsched_thread, 0, CLONE_KERNEL);
+
 	return 0;
 }
 

_

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-04 11:27           ` Antonio Vargas
@ 2003-04-04 14:04             ` William Lee Irwin III
  2003-04-04 20:12               ` Antonio Vargas
       [not found]             ` <200304041453.16630.frankeh@watson.ibm.com>
  1 sibling, 1 reply; 21+ messages in thread
From: William Lee Irwin III @ 2003-04-04 14:04 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: Hubertus Franke, linux-kernel, Robert Love

On Thu, Apr 03, 2003 at 11:22:41AM -0800, William Lee Irwin III wrote:
>> Use spin_lock_irq(&uidhash_lock) or you will deadlock if you hold it
>> while you take a timer tick, but it's wrong anyway. it's O(N) with
>> respect to number of users present. An O(1) algorithm could easily
>> make use of reference counts held by tasks.
[...]
>> This isn't right, when expiration happens needs to be tracked by both
>> user and task. Basically which tasks are penalized when the user
>> expiration happens? The prediction is the same set of tasks will always
>> be the target of the penalty.

On Fri, Apr 04, 2003 at 01:27:04PM +0200, Antonio Vargas wrote:
> Just out of experimenting, I've coded something that looks reasonable
> and would not experience starvation.
> In the normal scheduler, a non-interactive task will, when used all
> his timeslice, reset p->time_slice and queue onto the expired array.
> I'm now reseting p->reserved_time_slice instead and queuing the task
> onto a per-user pending task queue.
> A separate kernel thread walks the user list, calculates the user
> timeslice and distributes it to it's pending tasks. When a task
> receives timeslices, it's moved from the per-user pending queue to
> the expired array of the runqueue, thus preparing it to run on the
> next array-switch.

Hmm, priorities getting recalculated by a kernel thread sounds kind of
scary, but who am I to judge?


On Fri, Apr 04, 2003 at 01:27:04PM +0200, Antonio Vargas wrote:
> If the user has many timeslices, he can give timeslices to many tasks, thus
> getting more work done.
> While the implementation may not be good enough, due to locking problems and
> the use of a kernel-thread, I think the fundamental algorithm feels right.
> William, should I take the lock on the uidhash_list when adding a task
> to a per-user task list?

Possible, though I'd favor a per-user spinlock.

The code looks reasonable now, modulo that race you asked about.


-- wli

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-04 14:04             ` William Lee Irwin III
@ 2003-04-04 20:12               ` Antonio Vargas
  2003-04-04 20:40                 ` William Lee Irwin III
  0 siblings, 1 reply; 21+ messages in thread
From: Antonio Vargas @ 2003-04-04 20:12 UTC (permalink / raw)
  To: William Lee Irwin III, Antonio Vargas, Hubertus Franke,
	linux-kernel, Robert Love

On Fri, Apr 04, 2003 at 06:04:47AM -0800, William Lee Irwin III wrote:
> On Thu, Apr 03, 2003 at 11:22:41AM -0800, William Lee Irwin III wrote:
> >> Use spin_lock_irq(&uidhash_lock) or you will deadlock if you hold it
> >> while you take a timer tick, but it's wrong anyway. it's O(N) with
> >> respect to number of users present. An O(1) algorithm could easily
> >> make use of reference counts held by tasks.
> [...]
> >> This isn't right, when expiration happens needs to be tracked by both
> >> user and task. Basically which tasks are penalized when the user
> >> expiration happens? The prediction is the same set of tasks will always
> >> be the target of the penalty.
> 
> On Fri, Apr 04, 2003 at 01:27:04PM +0200, Antonio Vargas wrote:
> > Just out of experimenting, I've coded something that looks reasonable
> > and would not experience starvation.
> > In the normal scheduler, a non-interactive task will, when used all
> > his timeslice, reset p->time_slice and queue onto the expired array.
> > I'm now reseting p->reserved_time_slice instead and queuing the task
> > onto a per-user pending task queue.
> > A separate kernel thread walks the user list, calculates the user
> > timeslice and distributes it to it's pending tasks. When a task
> > receives timeslices, it's moved from the per-user pending queue to
> > the expired array of the runqueue, thus preparing it to run on the
> > next array-switch.
> 
> Hmm, priorities getting recalculated by a kernel thread sounds kind of
> scary, but who am I to judge?

First and foremost, root user is handled as usual, so kernel tasks should
also be handled as usual if they always run as root...
but I've yet to find out how to check if a task is a kernel thread or not,
and skip these explicitely just to be safe.

Also, while I've coded it on a kernel thread, I'd settle perfectly to a
periodic timer, but I don't know yet ask for one... perhaps a
self-propagating workqueue entry with a delay?

> On Fri, Apr 04, 2003 at 01:27:04PM +0200, Antonio Vargas wrote:
> > If the user has many timeslices, he can give timeslices to many tasks, thus
> > getting more work done.
> > While the implementation may not be good enough, due to locking problems and
> > the use of a kernel-thread, I think the fundamental algorithm feels right.
> > William, should I take the lock on the uidhash_list when adding a task
> > to a per-user task list?
> 
> Possible, though I'd favor a per-user spinlock.

So, when trying to add a task to the per-user pending tasks, I'd have
to do this:

1. spin_lock_irqsave(uidhash_lock, flags)
2. spin_lock(my_user->user_lock)
3. spin_unlock_restore(uidhash_lock, flags);

Is this any good?

Could I simply do "spin_unlock(my_user->user_lock)" at end without
taking the uidhash_lock again?

> The code looks reasonable now, modulo that race you asked about.

What do I need to lock when I want to add a task to a runqueue?
I'm doing a "spin_lock_irq(this_rq()->lock);"

As you can see, I'm not yet at speed with the locking rules... any
online references to the latest locking rules? The BKL was really
easy to understand in comparison! *grin*

Greets, Antonio.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
  2003-04-04 20:12               ` Antonio Vargas
@ 2003-04-04 20:40                 ` William Lee Irwin III
  0 siblings, 0 replies; 21+ messages in thread
From: William Lee Irwin III @ 2003-04-04 20:40 UTC (permalink / raw)
  To: Antonio Vargas; +Cc: Hubertus Franke, linux-kernel, Robert Love

On Fri, Apr 04, 2003 at 06:04:47AM -0800, William Lee Irwin III wrote:
>> Possible, though I'd favor a per-user spinlock.

On Fri, Apr 04, 2003 at 10:12:41PM +0200, Antonio Vargas wrote:
> So, when trying to add a task to the per-user pending tasks, I'd have
> to do this:
> 1. spin_lock_irqsave(uidhash_lock, flags)
> 2. spin_lock(my_user->user_lock)
> 3. spin_unlock_restore(uidhash_lock, flags);
> Is this any good?
> Could I simply do "spin_unlock(my_user->user_lock)" at end without
> taking the uidhash_lock again?

Well, you should unlock in the opposite order you acquired, and also
unlock all the locks you acquired in general.

But you shouldn't need to do this. There's a trick with reference
counting that saves you from the ostensible hierarchy:

You know the user won't go away b/c you hold a reference. You've now
forked a child, and are looking to set it up. You get a reference for
the child so it won't go away in the child's lifetime, and then take
the per-user lock _only_. The per-user lock keeps two things looking at
a user's task list from interfering with each other, but isn't needed
for looking at the rest of the user. Now you can safely park a new
task on the list or remove an old one, and when you're done, you drop
the per-user lock only.

The per-user and uidhash locks shouldn't be taken simultaneously; the
reference count on the user should be moved to or from zero,and the
thing allocated or freed, and the uidhash_lock taken and dropped to
hash or unhash without ever taking the per-user lock. Then we can just
take the per-user lock when adding to it (we hold a reference) or not
take any lock at all when freeing it (we were the last reference; no
one will ever use this again without re-initializing it). Fresh
allocations can also allow the task to add itself to the per-user
tasklist without taking the per-user lock.

The actual tricky parts are setuid and so on, where you have to move
a task between users. For this just compare the addresses of the two
per-user locks and acquire them in address order, much like something
in the scheduler, double_rq_lock(). Then you can modify both lists
while holding both the locks to make the move happen, and when done,
just unlock in the same order you acquired.


On Fri, Apr 04, 2003 at 06:04:47AM -0800, William Lee Irwin III wrote:
>> The code looks reasonable now, modulo that race you asked about.

On Fri, Apr 04, 2003 at 10:12:41PM +0200, Antonio Vargas wrote:
> What do I need to lock when I want to add a task to a runqueue?
> I'm doing a "spin_lock_irq(this_rq()->lock);"
> As you can see, I'm not yet at speed with the locking rules... any
> online references to the latest locking rules? The BKL was really
> easy to understand in comparison! *grin*

That's fine for the runqueue. Seems you've gotten it right. There are
hard problems with using the BKL; be glad the lock you're taking has a
specific relationship to what you're doing and what you're doing it to.

BTW, this probably won't actually be very efficient on SMP or with large
numbers of users, but you're probably having fun writing it anyway. =)
Best to get something running and tweak it later.

Hubertus & rml & others know more about scheduling policy in general,
so you should probably ask them about the details of that; I just
occasionally debug this stuff when it explodes.


-- wli

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
       [not found]             ` <200304041453.16630.frankeh@watson.ibm.com>
@ 2003-04-04 21:36               ` Antonio Vargas
       [not found]                 ` <200304081456.34367.frankeh@watson.ibm.com>
  0 siblings, 1 reply; 21+ messages in thread
From: Antonio Vargas @ 2003-04-04 21:36 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Antonio Vargas, William Lee Irwin III, linux-kernel, Robert Love,
	frankeh, nagar.us.ibm.com

On Fri, Apr 04, 2003 at 02:53:16PM -0500, Hubertus Franke wrote:
> On Friday 04 April 2003 06:27, Antonio Vargas wrote:
> > On Thu, Apr 03, 2003 at 11:22:41AM -0800, William Lee Irwin III wrote:
> > > On Thu, Apr 03, 2003 at 02:53:55PM +0200, Antonio Vargas wrote:
> > > > Sorry for not answering sooner, but I had my mail pointing at
> > > > the lkml instead of my personal email.
> > > > Last night I continued coding up but I think I've it a deadlock:
> > > > it seems the system stops calling schedule_tick() when there are
> > > > no more tasks to run, and it's there where I use a workqueue so
> > > > that I can update the user timeslices out of interrupt context.
> > > > I think that because if I put printk's on my code, it continues to
> > > > run OK, but if I remove them it freezes hard.
> > >
> > > Use spin_lock_irq(&uidhash_lock) or you will deadlock if you hold it
> > > while you take a timer tick, but it's wrong anyway. it's O(N) with
> > > respect to number of users present. An O(1) algorithm could easily
> > > make use of reference counts held by tasks.
> > >
> > > On Thu, Apr 03, 2003 at 02:53:55PM +0200, Antonio Vargas wrote:
> > > > About giving ticks to users, I've got an idea: assuming there are N
> > > > total processes in the system and an user has got M processes, we
> > > > should give N * max_timeslice ticks to each user every M *
> > > > max_timeslice * num_users ticks. I've been thinking about it and it
> > > > seems it would balance the ticks correctly.
> > > > About the starvation for low-priority processes, I can get and
> > > > example.. assume user A has process X Y and user B has process Z,
> > > > and max_timeslice = 2:
> > > > max_timeslice = 2 and 4 total processes ==>
> > > > give 2 * 3 = 6 ticks to user A every 2 * 2 * 2 =  8 ticks
> > > > give 2 * 3 = 6 ticks to user B every 1 * 2 * 2 =  4 ticks
> > >
> > > This isn't right, when expiration happens needs to be tracked by both
> > > user and task. Basically which tasks are penalized when the user
> > > expiration happens? The prediction is the same set of tasks will always
> > > be the target of the penalty.
> >
> > Just out of experimenting, I've coded something that looks reasonable
> > and would not experience starvation.
> >
> > In the normal scheduler, a non-interactive task will, when used all his
> > timeslice, reset p->time_slice and queue onto the expired array. I'm
> > now reseting p->reserved_time_slice instead and queuing the task
> > onto a per-user pending task queue.
> >
> > A separate kernel thread walks the user list, calculates the user timeslice
> > and distributes it to it's pending tasks. When a task receives timeslices,
> > it's moved from the per-user pending queue to the expired array of the
> > runqueue, thus preparing it to run on the next array-switch.
> >
> > If the user has many timeslices, he can give timeslices to many tasks, thus
> > getting more work done.
> >
> > While the implementation may not be good enough, due to locking problems
> > and the use of a kernel-thread, I think the fundamental algorithm feels
> > right.
> >
> > William, should I take the lock on the uidhash_list when adding a task
> > to a per-user task list?
> >
> > Greets, Antonio.
> 
> Antonio, I am going right now through some stuff.
> Again I am coming from a class based approach, but fundamentally now 
> difference.
> 
> First there are several issues that need to be addressed and they might have 
> to be handled differently. 
> Shares can be observed under hard limits or soft limits (work preserving).
> The first proposal you put forward was clearly work preserving. The latter one
> (if I understood correctly reading it in 2 seconds) would facilitate hard 
> working limits. The problem to simply do it at array switch time is that
> you could spent increased time in schedule() simply moving tasks between
> active and expired until ticks are regranted to the user.

The testing on pick_next_task is just a failsafe, it should not be done
on the real working patch.

> I have two concerns (and pardon me again if you already addressed them)
> 
> (A) starvation problem. 
> ---------------------------------
> One process could eat all the ticks
> assigned to a user, leading that subsequent scheduled task will always move
> to the expired list.

Yes.

> Effectively, the sum(timeslices) per user is the number of ticks I need to 
> give (given all cpu bound) for a user to get all tasks execute. 

Yes.

> The ratio between the sums of the different users is effectively what is 
> dictated by the shares. 
> One has effectively two means to manipulate that 
> (a) modifying the static_priority of tasks belonging to a user
> (b) modifying the effective_priority of tasks belonging to a user
> (c) modifying the time_slice of tasks
> 
> all modifications are based on whether a user is overspending or underspending 
> at some given rate.
> To me (a) is not an option.
> (b) tells me to modify the urgency of a task to be scheduled
> (c) tells me how long task should run, its also depends on (a).
> 
> What needs to be achieved is that starving tasks move up in the 
> effective_priority above the ones that keep eating allotted ticks.

I've it done correctly in the fairsched-A3 patch I sent. When a task
uses up the timeslice, it goes to the end of a pending list. When the user
receives slices, he gives it to the head of the pending list,
quits it from the pending list and attaches to the expired array just like
if it has gone on the non-fairsched path. So, the tasks just take a sleep
while they wait their relaive per-user turn.

> (B)    O(1) issues
> -------------------------
> 
> By setting tasks aside to wait for replenishment of ticks for the users you 
> might introduce an non O(1) effect, that Ingo might object against.
> One could argue, that you simply put the tasks asleep..

Yes, it's exactly that. Instead of passing a task
from active to expired directly, they
take a detour until the user can "afford" to do it.

When handling more users, there is a natural inflation because
each user can receives less slices per time unit.

----------------------------------  
> 
> I am currently doing the math and coming up with a scheme. I should be 
> hopefully close to something by Monday, which I can then share and if 
> you want you can experiment around with. Need to do this anyway for 
> the OLS paper we are presenting.
> Also, I need to go through your latest patch in more detail, before I give you 
> injustice, I just want to have the framework in my head first.
> 
> Ping me on Monday...
> 
> -- Hubertus

Getting your linux-kernel subscription running would be great,
I almost skipped your mail again because of it ;)

I'll try to code on this during the weekend, so see you on Monday.

Greets, Antonio.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: fairsched + O(1) process scheduler
       [not found]                 ` <200304081456.34367.frankeh@watson.ibm.com>
@ 2003-04-08 20:19                   ` Antonio Vargas
  0 siblings, 0 replies; 21+ messages in thread
From: Antonio Vargas @ 2003-04-08 20:19 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Antonio Vargas, William Lee Irwin III, linux-kernel, Robert Love,
	frankeh, nagar.us.ibm.com

On Tue, Apr 08, 2003 at 02:56:34PM -0400, Hubertus Franke wrote:
> 
> Antonio, while you are coding here is some additional input.

Great!
 
> By intoducing the user based pending queue and leaking
> the tasks back into the runqueue based on the per user ticks, you
> are changing semantics slightly.
> 
> If the task is reinserted back into the runqueue oit should be 
> reinserted into the expired queue iff and only no
> expired/active queue switch has happened. 
> Otherwise it should be reinserted into the active list.

This sounds right :)
 
> This will becoe important when we later distinguish between
> users that have limited share and those that have unlimited.

Yes, we should push the "p->user->uid == 0" test into the
send_task_to_user() function then... and later on implement
a "p->user->unlimited_cpu_share == 1" test.
 
> Can be implemented by keeping a per runqueue array switch count
> and store it with the pending task. On reinsertion, if 
> the task has the same as the current it goes to the expired.
> If its older than the current, then the task missed the array switch
> and it should go into the active queue.
> 
> Also, I don't follow necessarily your reason to put an INTERACTIVE
> task back on the active queue immediately, rather than going first
> through the pending queue again.
> 
> This way, a high priority job with even a small sleep_avg already being 
> declared INTERACTIVE, will continue to suck up cycles beyond its
> user's limits. This can be as long as 8 secs currently.

Perhaps I'm not declaring this in any explicit way, but I feel
that this type of control should only be applied to cpu hogs.

> Instead, things to consider is to feed these as well through the 
> pending queue and distinguish in the pending queue ?
> More thoughts required here... I let you know when any of them is
> successful at my end.

I'm somewhat deadlocked right now, so I'll have to wait until I have
some think up some way to make it work as intended.
 
Greets, Antonio

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2003-04-08 19:58 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-04-01 12:51 fairsched + O(1) process scheduler Antonio Vargas
2003-04-01 16:41 ` William Lee Irwin III
2003-04-01 22:19   ` Antonio Vargas
2003-04-02 12:46     ` Antonio Vargas
2003-04-02 16:35       ` William Lee Irwin III
2003-04-02 21:36         ` Antonio Vargas
2003-04-02 21:35           ` Robert Love
2003-04-02 22:07             ` Antonio Vargas
2003-04-02 22:10               ` Robert Love
2003-04-02 22:35                 ` Antonio Vargas
2003-04-03 17:15                 ` Corey Minyard
2003-04-03 18:17                   ` Robert Love
     [not found]     ` <200304021144.21924.frankeh@watson.ibm.com>
2003-04-03 12:53       ` Antonio Vargas
2003-04-03 19:22         ` William Lee Irwin III
2003-04-04 11:27           ` Antonio Vargas
2003-04-04 14:04             ` William Lee Irwin III
2003-04-04 20:12               ` Antonio Vargas
2003-04-04 20:40                 ` William Lee Irwin III
     [not found]             ` <200304041453.16630.frankeh@watson.ibm.com>
2003-04-04 21:36               ` Antonio Vargas
     [not found]                 ` <200304081456.34367.frankeh@watson.ibm.com>
2003-04-08 20:19                   ` Antonio Vargas
2003-04-01 18:33 ` Robert Love

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).