* [PATCH] [RFC] sched: accurate user accounting
@ 2007-03-25  1:59 Con Kolivas
  2007-03-25  2:14 ` Con Kolivas
  2007-03-25  7:51 ` [patch] " Ingo Molnar
  0 siblings, 2 replies; 34+ messages in thread
From: Con Kolivas @ 2007-03-25  1:59 UTC (permalink / raw)
  To: linux list; +Cc: malc, zwane, Ingo Molnar, ck list, ijuz

For an rsdl 0.33 patched kernel. Comments? Overhead worth it?

---
Currently we only do cpu accounting to userspace based on what is actually
happening precisely on each tick. The accuracy of that accounting gets
progressively worse the lower HZ is. As we already keep accounting of
nanosecond resolution we can accurately track user cpu, nice cpu and idle cpu
if we move the accounting to update_cpu_clock with a nanosecond cpu_usage_stat
entry. This increases overhead slightly but avoids the problem of tick
aliasing errors making accounting unreliable.
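
To make the aliasing concrete (numbers purely illustrative): at HZ=100 a
tick is 10ms, so a task that runs for 9ms but always sleeps across the tick
boundary is credited with 0% user time by tick sampling, while nanosecond
accounting correctly attributes roughly 90% of the cpu to it.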

Signed-off-by: Con Kolivas <kernel@kolivas.org>

---
 include/linux/kernel_stat.h |    3 ++
 include/linux/sched.h       |    2 -
 kernel/sched.c              |   51 +++++++++++++++++++++++++++++++++++++++++---
 kernel/timer.c              |    5 +---
 4 files changed, 54 insertions(+), 7 deletions(-)

Index: linux-2.6.20.4-ck1/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.20.4-ck1.orig/include/linux/kernel_stat.h	2007-03-25 09:47:52.000000000 +1000
+++ linux-2.6.20.4-ck1/include/linux/kernel_stat.h	2007-03-25 11:31:29.000000000 +1000
@@ -16,11 +16,14 @@
 
 struct cpu_usage_stat {
 	cputime64_t user;
+	cputime64_t user_ns;
 	cputime64_t nice;
+	cputime64_t nice_ns;
 	cputime64_t system;
 	cputime64_t softirq;
 	cputime64_t irq;
 	cputime64_t idle;
+	cputime64_t idle_ns;
 	cputime64_t iowait;
 	cputime64_t steal;
 };
Index: linux-2.6.20.4-ck1/kernel/sched.c
===================================================================
--- linux-2.6.20.4-ck1.orig/kernel/sched.c	2007-03-25 09:47:56.000000000 +1000
+++ linux-2.6.20.4-ck1/kernel/sched.c	2007-03-25 11:42:28.000000000 +1000
@@ -77,6 +77,11 @@
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 #define SCHED_PRIO(p)		((p)+MAX_RT_PRIO)
 
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
 #define TASK_PREEMPTS_CURR(p, curr)	((p)->prio < (curr)->prio)
 
 /*
@@ -2993,8 +2998,50 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-	p->sched_time += now - p->last_ran;
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t time_diff = now - p->last_ran;
+
+	p->sched_time += time_diff;
 	p->last_ran = rq->most_recent_timestamp = now;
+	if (p != rq->idle) {
+		cputime_t utime_diff = time_diff;
+
+		if (TASK_NICE(p) > 0) {
+			cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
+							 time_diff);
+			if (NS_TO_JIFFIES(cpustat->nice_ns) > 1) {
+				cpustat->nice_ns =
+					cputime64_sub(cpustat->nice_ns,
+					JIFFIES_TO_NS(1));
+				cpustat->nice =
+					cputime64_add(cpustat->nice, 1);
+			}
+		} else {
+			cpustat->user_ns = cputime64_add(cpustat->user_ns,
+						time_diff);
+			if (NS_TO_JIFFIES(cpustat->user_ns) > 1) {
+				cpustat->user_ns =
+					cputime64_sub(cpustat->user_ns,
+					JIFFIES_TO_NS(1));
+				cpustat->user =
+					cputime64_add(cpustat->user, 1);
+			}
+		}
+		p->utime_ns = cputime_add(p->utime_ns, utime_diff);
+		if (NS_TO_JIFFIES(p->utime_ns) > 1) {
+			p->utime_ns = cputime_sub(p->utime_ns,
+						  JIFFIES_TO_NS(1));
+			p->utime = cputime_add(p->utime,
+					       jiffies_to_cputime(1));
+		}
+	} else {
+		cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff);
+		if (NS_TO_JIFFIES(cpustat->idle_ns) > 1) {
+			cpustat->idle_ns = cputime64_sub(cpustat->idle_ns,
+							 JIFFIES_TO_NS(1));
+			cpustat->idle = cputime64_add(cpustat->idle, 1);
+		}
+	}
 }
 
 /*
@@ -3059,8 +3106,6 @@ void account_system_time(struct task_str
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 	else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-	else
-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
Index: linux-2.6.20.4-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.20.4-ck1.orig/include/linux/sched.h	2007-03-25 11:08:21.000000000 +1000
+++ linux-2.6.20.4-ck1/include/linux/sched.h	2007-03-25 11:08:54.000000000 +1000
@@ -903,7 +903,7 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	unsigned long rt_priority;
-	cputime_t utime, stime;
+	cputime_t utime, utime_ns, stime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
Index: linux-2.6.20.4-ck1/kernel/timer.c
===================================================================
--- linux-2.6.20.4-ck1.orig/kernel/timer.c	2007-03-25 11:28:56.000000000 +1000
+++ linux-2.6.20.4-ck1/kernel/timer.c	2007-03-25 11:30:44.000000000 +1000
@@ -1106,10 +1106,9 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick)
-		account_user_time(p, jiffies_to_cputime(1));
-	else
+	if (!user_tick)
 		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+	/* User time is accounted for in update_cpu_clock in sched.c */
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);

-- 
-ck

* Re: [PATCH] [RFC] sched: accurate user accounting
  2007-03-25  1:59 [PATCH] [RFC] sched: accurate user accounting Con Kolivas
@ 2007-03-25  2:14 ` Con Kolivas
  2007-03-25  7:51 ` [patch] " Ingo Molnar
  1 sibling, 0 replies; 34+ messages in thread
From: Con Kolivas @ 2007-03-25  2:14 UTC (permalink / raw)
  To: linux list; +Cc: malc, zwane, Ingo Molnar, ck list

On Sunday 25 March 2007 11:59, Con Kolivas wrote:
> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>
> ---
> Currently we only do cpu accounting to userspace based on what is actually
> happening precisely on each tick. The accuracy of that accounting gets
> progressively worse the lower HZ is. As we already keep accounting of
> nanosecond resolution we can accurately track user cpu, nice cpu and idle
> cpu if we move the accounting to update_cpu_clock with a nanosecond
> cpu_usage_stat entry. This increases overhead slightly but avoids the
> problem of tick aliasing errors making accounting unreliable.

Vale, this fixes the testcase you sent. Attached below for reference.

P.S. Sorry about one of the cc email addresses in the first email; I
unwittingly succumbed to a silly practical joke, so you'll have to remove it
when replying to all.

/* gcc -o hog smallhog.c */
#include <time.h>
#include <limits.h>
#include <signal.h>
#include <sys/time.h>

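/* number of calibration runs used to estimate iterations per timer period */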
#define HIST 10

static sig_atomic_t stop;

static void sighandler (int signr)
{
     (void) signr;
     stop = 1;
}

static unsigned long hog (unsigned long niters)
{
     stop = 0;
     while (!stop && --niters);
     return niters;
}

int main (void)
{
     int i;
     struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
                             .it_value = { .tv_sec = 0, .tv_usec = 1 } };
     sigset_t set;
     unsigned long v[HIST];
     double tmp = 0.0;
     unsigned long n;

     signal (SIGALRM, &sighandler);
     setitimer (ITIMER_REAL, &it, NULL);

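     /* calibrate: v[i] counts how many hog() iterations complete before a SIGALRM */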
     for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
     for (i = 0; i < HIST; ++i) tmp += v[i];
     tmp /= HIST;
     n = tmp - (tmp / 3.0);

     sigemptyset (&set);
     sigaddset (&set, SIGALRM);

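     /*
      * burn roughly two thirds of each timer period (n is ~2/3 of the
      * calibrated per-period iteration count), then block in sigwait()
      * until the next SIGALRM, so the busy phase keeps dodging the tick
      */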
     for (;;) {
         hog (n);
         sigwait (&set, &i);
     }
     return 0;
}
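
(Build with "gcc -o hog smallhog.c" per the comment at the top; while it
runs, tick-based accounting reports the process as nearly idle even though
it burns roughly two thirds of the cpu.)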

-- 
-ck

* [patch] sched: accurate user accounting
  2007-03-25  1:59 [PATCH] [RFC] sched: accurate user accounting Con Kolivas
  2007-03-25  2:14 ` Con Kolivas
@ 2007-03-25  7:51 ` Ingo Molnar
  2007-03-25  8:39   ` Con Kolivas
  2007-03-25 11:34   ` malc
  1 sibling, 2 replies; 34+ messages in thread
From: Ingo Molnar @ 2007-03-25  7:51 UTC (permalink / raw)
  To: Con Kolivas
  Cc: linux list, malc, zwane, ck list, ijuz, Andrew Morton, Thomas Gleixner


* Con Kolivas <kernel@kolivas.org> wrote:

> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?

we want to do this - and we should do this to the vanilla scheduler 
first and check the results. I've back-merged the patch to before RSDL 
and have tested it - find the patch below. Vale, could you try this 
patch against a 2.6.21-rc4-ish kernel and re-test your testcase?

	Ingo

--------------------->
Subject: [patch] sched: accurate user accounting
From: Con Kolivas <kernel@kolivas.org>

Currently we only do cpu accounting to userspace based on what is 
actually happening precisely on each tick. The accuracy of that 
accounting gets progressively worse the lower HZ is. As we already keep 
accounting of nanosecond resolution we can accurately track user cpu, 
nice cpu and idle cpu if we move the accounting to update_cpu_clock with 
a nanosecond cpu_usage_stat entry. This increases overhead slightly but 
avoids the problem of tick aliasing errors making accounting unreliable.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h |    3 ++
 include/linux/sched.h       |    2 -
 kernel/sched.c              |   52 +++++++++++++++++++++++++++++++++++++++++---
 kernel/timer.c              |    5 +---
 4 files changed, 55 insertions(+), 7 deletions(-)

Index: linux/include/linux/kernel_stat.h
===================================================================
--- linux.orig/include/linux/kernel_stat.h
+++ linux/include/linux/kernel_stat.h
@@ -16,11 +16,14 @@
 
 struct cpu_usage_stat {
 	cputime64_t user;
+	cputime64_t user_ns;
 	cputime64_t nice;
+	cputime64_t nice_ns;
 	cputime64_t system;
 	cputime64_t softirq;
 	cputime64_t irq;
 	cputime64_t idle;
+	cputime64_t idle_ns;
 	cputime64_t iowait;
 	cputime64_t steal;
 };
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -882,7 +882,7 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	unsigned long rt_priority;
-	cputime_t utime, stime;
+	cputime_t utime, utime_ns, stime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -167,6 +167,12 @@ unsigned long long __attribute__((weak))
 	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
 		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)   ((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)   ((TIME) * (1000000000 / HZ))
+
 #define TASK_PREEMPTS_CURR(p, rq) \
 	((p)->prio < (rq)->curr->prio)
 
@@ -3017,8 +3023,50 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-	p->sched_time += now - p->last_ran;
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t time_diff = now - p->last_ran;
+
+	p->sched_time += time_diff;
 	p->last_ran = rq->most_recent_timestamp = now;
+	if (p != rq->idle) {
+		cputime_t utime_diff = time_diff;
+
+		if (TASK_NICE(p) > 0) {
+			cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
+							 time_diff);
+			if (NS_TO_JIFFIES(cpustat->nice_ns) > 1) {
+				cpustat->nice_ns =
+					cputime64_sub(cpustat->nice_ns,
+					JIFFIES_TO_NS(1));
+				cpustat->nice =
+					cputime64_add(cpustat->nice, 1);
+			}
+		} else {
+			cpustat->user_ns = cputime64_add(cpustat->user_ns,
+						time_diff);
+			if (NS_TO_JIFFIES(cpustat->user_ns) > 1) {
+				cpustat->user_ns =
+					cputime64_sub(cpustat->user_ns,
+					JIFFIES_TO_NS(1));
+				cpustat->user =
+					cputime64_add(cpustat->user, 1);
+			}
+		}
+		p->utime_ns = cputime_add(p->utime_ns, utime_diff);
+		if (NS_TO_JIFFIES(p->utime_ns) > 1) {
+			p->utime_ns = cputime_sub(p->utime_ns,
+						  JIFFIES_TO_NS(1));
+			p->utime = cputime_add(p->utime,
+					       jiffies_to_cputime(1));
+		}
+	} else {
+		cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff);
+		if (NS_TO_JIFFIES(cpustat->idle_ns) > 1) {
+			cpustat->idle_ns = cputime64_sub(cpustat->idle_ns,
+							 JIFFIES_TO_NS(1));
+			cpustat->idle = cputime64_add(cpustat->idle, 1);
+		}
+	}
 }
 
 /*
@@ -3104,8 +3152,6 @@ void account_system_time(struct task_str
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 	else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-	else
-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
Index: linux/kernel/timer.c
===================================================================
--- linux.orig/kernel/timer.c
+++ linux/kernel/timer.c
@@ -1196,10 +1196,9 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick)
-		account_user_time(p, jiffies_to_cputime(1));
-	else
+	if (!user_tick)
 		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+	/* User time is accounted for in update_cpu_clock in sched.c */
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);

* Re: [patch] sched: accurate user accounting
  2007-03-25  7:51 ` [patch] " Ingo Molnar
@ 2007-03-25  8:39   ` Con Kolivas
  2007-03-25  9:04     ` Ingo Molnar
  2007-03-25 11:34   ` malc
  1 sibling, 1 reply; 34+ messages in thread
From: Con Kolivas @ 2007-03-25  8:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux list, malc, zwane, ck list, Andrew Morton, Thomas Gleixner

On Sunday 25 March 2007 17:51, Ingo Molnar wrote:
> * Con Kolivas <kernel@kolivas.org> wrote:
> > For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>
> we want to do this - and we should do this to the vanilla scheduler
> first and check the results. I've back-merged the patch to before RSDL
> and have tested it - find the patch below. Vale, could you try this
> patch against a 2.6.21-rc4-ish kernel and re-test your testcase?

Great. That should fix a lot of misconceptions about cpu usage and HZ.

However-

> +/*
> + * Some helpers for converting nanosecond timing to jiffy resolution
> + */
> +#define NS_TO_JIFFIES(TIME)   ((TIME) / (1000000000 / HZ))
> +#define JIFFIES_TO_NS(TIME)   ((TIME) * (1000000000 / HZ))
> +

This hunk is already in mainline so it will be double defined now.

Thanks.

-- 
-ck

* Re: [patch] sched: accurate user accounting
  2007-03-25  8:39   ` Con Kolivas
@ 2007-03-25  9:04     ` Ingo Molnar
  0 siblings, 0 replies; 34+ messages in thread
From: Ingo Molnar @ 2007-03-25  9:04 UTC (permalink / raw)
  To: Con Kolivas
  Cc: linux list, malc, zwane, ck list, Andrew Morton, Thomas Gleixner


* Con Kolivas <kernel@kolivas.org> wrote:

> > +/*
> > + * Some helpers for converting nanosecond timing to jiffy resolution
> > + */
> > +#define NS_TO_JIFFIES(TIME)   ((TIME) / (1000000000 / HZ))
> > +#define JIFFIES_TO_NS(TIME)   ((TIME) * (1000000000 / HZ))
> > +
> 
> This hunk is already in mainline so it will be double defined now.

yeah. Updated patch below.

	Ingo

---------------------->
Subject: [patch] sched: accurate user accounting
From: Con Kolivas <kernel@kolivas.org>

Currently we only do cpu accounting to userspace based on what is 
actually happening precisely on each tick. The accuracy of that 
accounting gets progressively worse the lower HZ is. As we already keep 
accounting of nanosecond resolution we can accurately track user cpu, 
nice cpu and idle cpu if we move the accounting to update_cpu_clock with 
a nanosecond cpu_usage_stat entry. This increases overhead slightly but 
avoids the problem of tick aliasing errors making accounting unreliable.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h |    3 ++
 include/linux/sched.h       |    2 -
 kernel/sched.c              |   46 +++++++++++++++++++++++++++++++++++++++++---
 kernel/timer.c              |    5 +---
 4 files changed, 49 insertions(+), 7 deletions(-)

Index: linux/include/linux/kernel_stat.h
===================================================================
--- linux.orig/include/linux/kernel_stat.h
+++ linux/include/linux/kernel_stat.h
@@ -16,11 +16,14 @@
 
 struct cpu_usage_stat {
 	cputime64_t user;
+	cputime64_t user_ns;
 	cputime64_t nice;
+	cputime64_t nice_ns;
 	cputime64_t system;
 	cputime64_t softirq;
 	cputime64_t irq;
 	cputime64_t idle;
+	cputime64_t idle_ns;
 	cputime64_t iowait;
 	cputime64_t steal;
 };
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -882,7 +882,7 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	unsigned long rt_priority;
-	cputime_t utime, stime;
+	cputime_t utime, utime_ns, stime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -3017,8 +3017,50 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-	p->sched_time += now - p->last_ran;
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t time_diff = now - p->last_ran;
+
+	p->sched_time += time_diff;
 	p->last_ran = rq->most_recent_timestamp = now;
+	if (p != rq->idle) {
+		cputime_t utime_diff = time_diff;
+
+		if (TASK_NICE(p) > 0) {
+			cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
+							 time_diff);
+			if (NS_TO_JIFFIES(cpustat->nice_ns) > 1) {
+				cpustat->nice_ns =
+					cputime64_sub(cpustat->nice_ns,
+					JIFFIES_TO_NS(1));
+				cpustat->nice =
+					cputime64_add(cpustat->nice, 1);
+			}
+		} else {
+			cpustat->user_ns = cputime64_add(cpustat->user_ns,
+						time_diff);
+			if (NS_TO_JIFFIES(cpustat->user_ns) > 1) {
+				cpustat->user_ns =
+					cputime64_sub(cpustat->user_ns,
+					JIFFIES_TO_NS(1));
+				cpustat->user =
+					cputime64_add(cpustat->user, 1);
+			}
+		}
+		p->utime_ns = cputime_add(p->utime_ns, utime_diff);
+		if (NS_TO_JIFFIES(p->utime_ns) > 1) {
+			p->utime_ns = cputime_sub(p->utime_ns,
+						  JIFFIES_TO_NS(1));
+			p->utime = cputime_add(p->utime,
+					       jiffies_to_cputime(1));
+		}
+	} else {
+		cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff);
+		if (NS_TO_JIFFIES(cpustat->idle_ns) > 1) {
+			cpustat->idle_ns = cputime64_sub(cpustat->idle_ns,
+							 JIFFIES_TO_NS(1));
+			cpustat->idle = cputime64_add(cpustat->idle, 1);
+		}
+	}
 }
 
 /*
@@ -3104,8 +3146,6 @@ void account_system_time(struct task_str
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 	else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-	else
-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
Index: linux/kernel/timer.c
===================================================================
--- linux.orig/kernel/timer.c
+++ linux/kernel/timer.c
@@ -1196,10 +1196,9 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick)
-		account_user_time(p, jiffies_to_cputime(1));
-	else
+	if (!user_tick)
 		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+	/* User time is accounted for in update_cpu_clock in sched.c */
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);

* Re: [patch] sched: accurate user accounting
  2007-03-25  7:51 ` [patch] " Ingo Molnar
  2007-03-25  8:39   ` Con Kolivas
@ 2007-03-25 11:34   ` malc
  2007-03-25 11:46     ` Con Kolivas
  1 sibling, 1 reply; 34+ messages in thread
From: malc @ 2007-03-25 11:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Con Kolivas, linux list, zwane, ck list, ijuz, Andrew Morton,
	Thomas Gleixner

On Sun, 25 Mar 2007, Ingo Molnar wrote:

>
> * Con Kolivas <kernel@kolivas.org> wrote:
>
>> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>
> we want to do this - and we should do this to the vanilla scheduler
> first and check the results. I've back-merged the patch to before RSDL
> and have tested it - find the patch below. Vale, could you try this
> patch against a 2.6.21-rc4-ish kernel and re-test your testcase?

[..snip..]

Compilation failed with:
kernel/built-in.o(.sched.text+0x564): more undefined references to `__udivdi3' follow

$ gcc --version | head -1
gcc (GCC) 3.4.6

$ cat /proc/cpuinfo | grep cpu
cpu             : 7447A, altivec supported

Can't say i really understand why 64-bit arithmetic suddenly became an
issue here.

Am i supposed to run the testcase and see if the numbers in `/proc/stat'
now match reality more closely? To be really accurate `/proc/stat'
should be left alone methinks, because no matter how hard you try, the
fundamental fact that time (and consequently load percentage) is not
really divided into USER_HZ intervals will interfere with one's quest for
accurate statistics. (Wonder what this patch will do to the slightly modified
hog that produced this: http://www.boblycat.org/~malc/apc/load-c2d-hog.png
but that will have to wait till i get to the PC at work)

-- 
vale (this is not name/nick/alias, this is latin for farewell)

* Re: [patch] sched: accurate user accounting
  2007-03-25 11:34   ` malc
@ 2007-03-25 11:46     ` Con Kolivas
  2007-03-25 12:02       ` Con Kolivas
  0 siblings, 1 reply; 34+ messages in thread
From: Con Kolivas @ 2007-03-25 11:46 UTC (permalink / raw)
  To: malc
  Cc: Ingo Molnar, linux list, zwane, ck list, ijuz, Andrew Morton,
	Thomas Gleixner

On Sunday 25 March 2007 21:34, malc wrote:
> On Sun, 25 Mar 2007, Ingo Molnar wrote:
> > * Con Kolivas <kernel@kolivas.org> wrote:
> >> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
> >
> > we want to do this - and we should do this to the vanilla scheduler
> > first and check the results. I've back-merged the patch to before RSDL
> > and have tested it - find the patch below. Vale, could you try this
> > patch against a 2.6.21-rc4-ish kernel and re-test your testcase?
>
> [..snip..]
>
> Compilation failed with:
> kernel/built-in.o(.sched.text+0x564): more undefined references to
> `__udivdi3' follow
>
> $ gcc --version | head -1
> gcc (GCC) 3.4.6
>
> $ cat /proc/cpuinfo | grep cpu
> cpu             : 7447A, altivec supported
>
> Can't say i really understand why 64-bit arithmetic suddenly became an
> issue here.

Probably due to use of:

#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))

Excuse our 64bit world while we strive to correct our 32bit blindness and fix 
this bug.
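
For the record, the usual kernel idiom for 64-bit division on 32-bit targets
is do_div() from asm/div64.h, which divides a u64 in place and returns the
remainder, so no libgcc helper gets emitted. A minimal sketch (the helper
name is made up; the follow-up patch instead drops the division entirely):

#include <linux/types.h>
#include <asm/div64.h>

/* sketch only: NS_TO_JIFFIES without pulling in __udivdi3 on 32-bit */
static inline unsigned long ns_to_jiffies_sketch(u64 ns)
{
	do_div(ns, 1000000000 / HZ);	/* quotient is left in ns */
	return (unsigned long)ns;
}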

>
> Am i supposed to run the testcase and see if the numbers in `/proc/stat'
> now match reality more closely? To be really accurate `/proc/stat'
> should be left alone methinks, because no matter how hard you try, the
> fundamental fact that time (and consequently load percentage) is not
> really divided into USER_HZ intervals will interfere with one's quest for
> accurate statistics. (Wonder what this patch will do to the slightly modified
> hog that produced this: http://www.boblycat.org/~malc/apc/load-c2d-hog.png
> but that will have to wait till i get to the PC at work)

It should far more accurately represent the cpu usage without any userspace 
changes.
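
(Concretely, "without any userspace changes" because the exported counters
stay in USER_HZ ticks; only their accumulation becomes nanosecond-accurate.
A minimal, purely illustrative sketch of what a tool like top reads:)

#include <stdio.h>

int main(void)
{
	unsigned long long user, nice, sys, idle;
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;
	/* first line: aggregate "cpu" counters, in USER_HZ ticks */
	if (fscanf(f, "cpu %llu %llu %llu %llu", &user, &nice, &sys, &idle) != 4) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("user=%llu nice=%llu system=%llu idle=%llu ticks\n",
	       user, nice, sys, idle);
	return 0;
}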

-- 
-ck

* Re: [patch] sched: accurate user accounting
  2007-03-25 11:46     ` Con Kolivas
@ 2007-03-25 12:02       ` Con Kolivas
  2007-03-25 12:32         ` Gene Heskett
                           ` (2 more replies)
  0 siblings, 3 replies; 34+ messages in thread
From: Con Kolivas @ 2007-03-25 12:02 UTC (permalink / raw)
  To: malc
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Sunday 25 March 2007 21:46, Con Kolivas wrote:
> On Sunday 25 March 2007 21:34, malc wrote:
> > On Sun, 25 Mar 2007, Ingo Molnar wrote:
> > > * Con Kolivas <kernel@kolivas.org> wrote:
> > >> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
> > >
> > > we want to do this - and we should do this to the vanilla scheduler
> > > first and check the results. I've back-merged the patch to before RSDL
> > > and have tested it - find the patch below. Vale, could you try this
> > > patch against a 2.6.21-rc4-ish kernel and re-test your testcase?
> >
> > [..snip..]
> >
> > Compilation failed with:
> > kernel/built-in.o(.sched.text+0x564): more undefined references to
> > `__udivdi3' follow
> >
> > $ gcc --version | head -1
> > gcc (GCC) 3.4.6
> >
> > $ cat /proc/cpuinfo | grep cpu
> > cpu             : 7447A, altivec supported
> >
> > Can't say i really understand why 64-bit arithmetic suddenly became an
> > issue here.
>
> Probably due to use of:
>
> #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
> #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
>
> Excuse our 64bit world while we strive to correct our 32bit blindness and
> fix this bug.

Please try this (akpm please don't include till we confirm it builds on ppc,
sorry). For 2.6.21-rc4
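
The difference from the previous version is that the nanosecond counters are
now compared directly against a JIFFY_NS constant instead of going through
the NS_TO_JIFFIES() division, so no __udivdi3 call should be emitted on
32-bit. Roughly:

	/* before: 64-bit division, needs libgcc's __udivdi3 on 32-bit */
	if (NS_TO_JIFFIES(cpustat->user_ns) > 1)
	/* after: plain 64-bit comparison, no division */
	if (cpustat->user_ns > JIFFY_NS)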

---
Currently we only do cpu accounting to userspace based on what is 
actually happening precisely on each tick. The accuracy of that 
accounting gets progressively worse the lower HZ is. As we already keep 
accounting of nanosecond resolution we can accurately track user cpu, 
nice cpu and idle cpu if we move the accounting to update_cpu_clock with 
a nanosecond cpu_usage_stat entry. This increases overhead slightly but 
avoids the problem of tick aliasing errors making accounting unreliable.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h |    3 ++
 include/linux/sched.h       |    2 -
 kernel/sched.c              |   46 +++++++++++++++++++++++++++++++++++++++++---
 kernel/timer.c              |    5 +---
 4 files changed, 49 insertions(+), 7 deletions(-)

Index: linux-2.6.21-rc4-acct/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.21-rc4-acct.orig/include/linux/kernel_stat.h	2006-09-21 19:54:58.000000000 +1000
+++ linux-2.6.21-rc4-acct/include/linux/kernel_stat.h	2007-03-25 21:51:49.000000000 +1000
@@ -16,11 +16,14 @@
 
 struct cpu_usage_stat {
 	cputime64_t user;
+	cputime64_t user_ns;
 	cputime64_t nice;
+	cputime64_t nice_ns;
 	cputime64_t system;
 	cputime64_t softirq;
 	cputime64_t irq;
 	cputime64_t idle;
+	cputime64_t idle_ns;
 	cputime64_t iowait;
 	cputime64_t steal;
 };
Index: linux-2.6.21-rc4-acct/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc4-acct.orig/include/linux/sched.h	2007-03-21 12:53:00.000000000 +1100
+++ linux-2.6.21-rc4-acct/include/linux/sched.h	2007-03-25 21:51:49.000000000 +1000
@@ -882,7 +882,7 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	unsigned long rt_priority;
-	cputime_t utime, stime;
+	cputime_t utime, utime_ns, stime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
Index: linux-2.6.21-rc4-acct/kernel/sched.c
===================================================================
--- linux-2.6.21-rc4-acct.orig/kernel/sched.c	2007-03-21 12:53:00.000000000 +1100
+++ linux-2.6.21-rc4-acct/kernel/sched.c	2007-03-25 21:58:27.000000000 +1000
@@ -89,6 +89,7 @@ unsigned long long __attribute__((weak))
  */
 #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define JIFFY_NS		JIFFIES_TO_NS(1)
 
 /*
  * These are the 'tuning knobs' of the scheduler:
@@ -3017,8 +3018,49 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-	p->sched_time += now - p->last_ran;
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t time_diff = now - p->last_ran;
+
+	p->sched_time += time_diff;
 	p->last_ran = rq->most_recent_timestamp = now;
+	if (p != rq->idle) {
+		cputime_t utime_diff = time_diff;
+
+		if (TASK_NICE(p) > 0) {
+			cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
+							 time_diff);
+			if (cpustat->nice_ns > JIFFY_NS) {
+				cpustat->nice_ns =
+					cputime64_sub(cpustat->nice_ns,
+					JIFFY_NS);
+				cpustat->nice =
+					cputime64_add(cpustat->nice, 1);
+			}
+		} else {
+			cpustat->user_ns = cputime64_add(cpustat->user_ns,
+							 time_diff);
+			if (cpustat->user_ns > JIFFY_NS) {
+				cpustat->user_ns =
+					cputime64_sub(cpustat->user_ns,
+					JIFFY_NS);
+				cpustat->user =
+					cputime64_add(cpustat->user, 1);
+			}
+		}
+		p->utime_ns = cputime_add(p->utime_ns, utime_diff);
+		if (p->utime_ns > JIFFY_NS) {
+			p->utime_ns = cputime_sub(p->utime_ns, JIFFY_NS);
+			p->utime = cputime_add(p->utime,
+					       jiffies_to_cputime(1));
+		}
+	} else {
+		cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff);
+		if (cpustat->idle_ns > JIFFY_NS) {
+			cpustat->idle_ns = cputime64_sub(cpustat->idle_ns,
+							 JIFFY_NS);
+			cpustat->idle = cputime64_add(cpustat->idle, 1);
+		}
+	}
 }
 
 /*
@@ -3104,8 +3146,6 @@ void account_system_time(struct task_str
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 	else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-	else
-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
Index: linux-2.6.21-rc4-acct/kernel/timer.c
===================================================================
--- linux-2.6.21-rc4-acct.orig/kernel/timer.c	2007-03-21 12:53:00.000000000 +1100
+++ linux-2.6.21-rc4-acct/kernel/timer.c	2007-03-25 21:51:49.000000000 +1000
@@ -1196,10 +1196,9 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick)
-		account_user_time(p, jiffies_to_cputime(1));
-	else
+	if (!user_tick)
 		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+	/* User time is accounted for in update_cpu_clock in sched.c */
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);

-- 
-ck

* Re: [patch] sched: accurate user accounting
  2007-03-25 12:02       ` Con Kolivas
@ 2007-03-25 12:32         ` Gene Heskett
  2007-03-25 12:41           ` Con Kolivas
  2007-03-25 13:05         ` malc
  2007-03-25 13:06         ` malc
  2 siblings, 1 reply; 34+ messages in thread
From: Gene Heskett @ 2007-03-25 12:32 UTC (permalink / raw)
  To: linux-kernel
  Cc: Con Kolivas, malc, Ingo Molnar, zwane, ck list, Andrew Morton,
	Thomas Gleixner

On Sunday 25 March 2007, Con Kolivas wrote:
>On Sunday 25 March 2007 21:46, Con Kolivas wrote:
>> On Sunday 25 March 2007 21:34, malc wrote:
>> > On Sun, 25 Mar 2007, Ingo Molnar wrote:
>> > > * Con Kolivas <kernel@kolivas.org> wrote:
>> > >> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>> > >
>> > > we want to do this - and we should do this to the vanilla
>> > > scheduler first and check the results. I've back-merged the patch
>> > > to before RSDL and have tested it - find the patch below. Vale,
>> > > could you try this patch against a 2.6.21-rc4-ish kernel and
>> > > re-test your testcase?
>> >
>> > [..snip..]
>> >
>> > Compilation failed with:
>> > kernel/built-in.o(.sched.text+0x564): more undefined references to
>> > `__udivdi3' follow
>> >
>> > $ gcc --version | head -1
>> > gcc (GCC) 3.4.6
>> >
>> > $ cat /proc/cpuinfo | grep cpu
>> > cpu             : 7447A, altivec supported
>> >
>> > Can't say i really understand why 64-bit arithmetic suddenly became
>> > an issue here.
>>
>> Probably due to use of:
>>
>> #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
>> #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
>>
>> Excuse our 64bit world while we strive to correct our 32bit blindness
>> and fix this bug.
>
>Please try this (akpm please don't include till we confirm it builds on
> ppc, sorry). For 2.6.21-rc4
>
>---
>Currently we only do cpu accounting to userspace based on what is
>actually happening precisely on each tick. The accuracy of that
>accounting gets progressively worse the lower HZ is. As we already keep
>accounting of nanosecond resolution we can accurately track user cpu,
>nice cpu and idle cpu if we move the accounting to update_cpu_clock with
>a nanosecond cpu_usage_stat entry. This increases overhead slightly but
>avoids the problem of tick aliasing errors making accounting unreliable.
>
>Signed-off-by: Con Kolivas <kernel@kolivas.org>
>Signed-off-by: Ingo Molnar <mingo@elte.hu>
>---
> include/linux/kernel_stat.h |    3 ++
> include/linux/sched.h       |    2 -
> kernel/sched.c              |   46 +++++++++++++++++++++++++++++++++++++++++---
> kernel/timer.c              |    5 +---
> 4 files changed, 49 insertions(+), 7 deletions(-)
>
>Index: linux-2.6.21-rc4-acct/include/linux/kernel_stat.h
>===================================================================
>--- linux-2.6.21-rc4-acct.orig/include/linux/kernel_stat.h	2006-09-21 19:54:58.000000000 +1000
>+++ linux-2.6.21-rc4-acct/include/linux/kernel_stat.h	2007-03-25 21:51:49.000000000 +1000
>@@ -16,11 +16,14 @@
>
> struct cpu_usage_stat {
> 	cputime64_t user;
>+	cputime64_t user_ns;
> 	cputime64_t nice;
>+	cputime64_t nice_ns;
> 	cputime64_t system;
> 	cputime64_t softirq;
> 	cputime64_t irq;
> 	cputime64_t idle;
>+	cputime64_t idle_ns;
> 	cputime64_t iowait;
> 	cputime64_t steal;
> };
>Index: linux-2.6.21-rc4-acct/include/linux/sched.h
>===================================================================
>--- linux-2.6.21-rc4-acct.orig/include/linux/sched.h	2007-03-21 12:53:00.000000000 +1100
>+++ linux-2.6.21-rc4-acct/include/linux/sched.h	2007-03-25 21:51:49.000000000 +1000
>@@ -882,7 +882,7 @@ struct task_struct {
> 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
>
> 	unsigned long rt_priority;
>-	cputime_t utime, stime;
>+	cputime_t utime, utime_ns, stime;
> 	unsigned long nvcsw, nivcsw; /* context switch counts */
> 	struct timespec start_time;
> /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
>Index: linux-2.6.21-rc4-acct/kernel/sched.c
>===================================================================
This (hunk 1) did not apply, as the #defines aren't there after applying
0.33 to a 2.6.20.4 kernel tree.

>--- linux-2.6.21-rc4-acct.orig/kernel/sched.c	2007-03-21 12:53:00.000000000 +1100
>+++ linux-2.6.21-rc4-acct/kernel/sched.c	2007-03-25 21:58:27.000000000 +1000
>@@ -89,6 +89,7 @@ unsigned long long __attribute__((weak))
>  */
> #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
> #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
>+#define JIFFY_NS		JIFFIES_TO_NS(1)
>
> /*
>  * These are the 'tuning knobs' of the scheduler:
>@@ -3017,8 +3018,49 @@ EXPORT_PER_CPU_SYMBOL(kstat);
> static inline void
> update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
> {
>-	p->sched_time += now - p->last_ran;
>+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
>+	cputime64_t time_diff = now - p->last_ran;
>+
>+	p->sched_time += time_diff;
> 	p->last_ran = rq->most_recent_timestamp = now;
>+	if (p != rq->idle) {
>+		cputime_t utime_diff = time_diff;
>+
>+		if (TASK_NICE(p) > 0) {
>+			cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
>+							 time_diff);
>+			if (cpustat->nice_ns > JIFFY_NS) {
>+				cpustat->nice_ns =
>+					cputime64_sub(cpustat->nice_ns,
>+					JIFFY_NS);
>+				cpustat->nice =
>+					cputime64_add(cpustat->nice, 1);
>+			}
>+		} else {
>+			cpustat->user_ns = cputime64_add(cpustat->user_ns,
>+							 time_diff);
>+			if (cpustat->user_ns > JIFFY_NS) {
>+				cpustat->user_ns =
>+					cputime64_sub(cpustat->user_ns,
>+					JIFFY_NS);
>+				cpustat->user =
>+					cputime64_add(cpustat->user, 1);
>+			}
>+		}
>+		p->utime_ns = cputime_add(p->utime_ns, utime_diff);
>+		if (p->utime_ns > JIFFY_NS) {
>+			p->utime_ns = cputime_sub(p->utime_ns, JIFFY_NS);
>+			p->utime = cputime_add(p->utime,
>+					       jiffies_to_cputime(1));
>+		}
>+	} else {
>+		cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff);
>+		if (cpustat->idle_ns > JIFFY_NS) {
>+			cpustat->idle_ns = cputime64_sub(cpustat->idle_ns,
>+							 JIFFY_NS);
>+			cpustat->idle = cputime64_add(cpustat->idle, 1);
>+		}
>+	}
> }
>
> /*
>@@ -3104,8 +3146,6 @@ void account_system_time(struct task_str
> 		cpustat->system = cputime64_add(cpustat->system, tmp);
> 	else if (atomic_read(&rq->nr_iowait) > 0)
> 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
>-	else
>-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
> 	/* Account for system time used */
> 	acct_update_integrals(p);
> }
>Index: linux-2.6.21-rc4-acct/kernel/timer.c
>===================================================================
>--- linux-2.6.21-rc4-acct.orig/kernel/timer.c	2007-03-21 12:53:00.000000000 +1100
>+++ linux-2.6.21-rc4-acct/kernel/timer.c	2007-03-25 21:51:49.000000000 +1000
>@@ -1196,10 +1196,9 @@ void update_process_times(int user_tick)
> 	int cpu = smp_processor_id();
>
> 	/* Note: this timer irq context must be accounted for as well. */
>-	if (user_tick)
>-		account_user_time(p, jiffies_to_cputime(1));
>-	else
>+	if (!user_tick)
> 		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
>+	/* User time is accounted for in update_cpu_clock in sched.c */
> 	run_local_timers();
> 	if (rcu_pending(cpu))
> 		rcu_check_callbacks(cpu, user_tick);

I'm playing again because the final 2.6.20.4 does NOT break amanda, where 
2.6.20.4-rc1 did.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
A classic is something that everyone wants to have read
and nobody wants to read.
		-- Mark Twain, "The Disappearance of Literature"

* Re: [patch] sched: accurate user accounting
  2007-03-25 12:32         ` Gene Heskett
@ 2007-03-25 12:41           ` Con Kolivas
  2007-03-25 13:33             ` Gene Heskett
  0 siblings, 1 reply; 34+ messages in thread
From: Con Kolivas @ 2007-03-25 12:41 UTC (permalink / raw)
  To: Gene Heskett
  Cc: linux-kernel, malc, Ingo Molnar, zwane, ck list, Andrew Morton,
	Thomas Gleixner

On Sunday 25 March 2007 22:32, Gene Heskett wrote:
> On Sunday 25 March 2007, Con Kolivas wrote:
> >On Sunday 25 March 2007 21:46, Con Kolivas wrote:
> >> On Sunday 25 March 2007 21:34, malc wrote:
> >> > On Sun, 25 Mar 2007, Ingo Molnar wrote:
> >> > > * Con Kolivas <kernel@kolivas.org> wrote:
> >> > >> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
> >> > >
> >> > > we want to do this - and we should do this to the vanilla
> >> > > scheduler first and check the results. I've back-merged the patch
> >> > > to before RSDL and have tested it - find the patch below. Vale,
> >> > > could you try this patch against a 2.6.21-rc4-ish kernel and
> >> > > re-test your testcase?
> >> >
> >> > [..snip..]
> >> >
> >> > Compilation failed with:
> >> > kernel/built-in.o(.sched.text+0x564): more undefined references to
> >> > `__udivdi3' follow
> >> >
> >> > $ gcc --version | head -1
> >> > gcc (GCC) 3.4.6
> >> >
> >> > $ cat /proc/cpuinfo | grep cpu
> >> > cpu             : 7447A, altivec supported
> >> >
> >> > Can't say i really understand why 64-bit arithmetic suddenly became
> >> > an issue here.
> >>
> >> Probably due to use of:
> >>
> >> #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
> >> #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
> >>
> >> Excuse our 64bit world while we strive to correct our 32bit blindness
> >> and fix this bug.
> >
> >Please try this (akpm please don't include till we confirm it builds on
> > ppc, sorry). For 2.6.21-rc4
> >
> >---
> >Currently we only do cpu accounting to userspace based on what is
> >actually happening precisely on each tick. The accuracy of that
> >accounting gets progressively worse the lower HZ is. As we already keep
> >accounting of nanosecond resolution we can accurately track user cpu,
> >nice cpu and idle cpu if we move the accounting to update_cpu_clock with
> >a nanosecond cpu_usage_stat entry. This increases overhead slightly but
> >avoids the problem of tick aliasing errors making accounting unreliable.

>
> I'm playing again because the final 2.6.20.4 does NOT break amanda, where
> 2.6.20.4-rc1 did.

Yes, only the original version I posted on this email thread was for an RSDL
0.33 patched kernel. That original patch should build fine on i386 and x86_64 
(where I tried it). This version I sent out following Ingo's lead has 
2.6.21-rc4 in mind (without rsdl).

-- 
-ck

* Re: [patch] sched: accurate user accounting
  2007-03-25 12:02       ` Con Kolivas
  2007-03-25 12:32         ` Gene Heskett
@ 2007-03-25 13:05         ` malc
  2007-03-25 13:06         ` malc
  2 siblings, 0 replies; 34+ messages in thread
From: malc @ 2007-03-25 13:05 UTC (permalink / raw)
  To: Con Kolivas
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Sun, 25 Mar 2007, Con Kolivas wrote:

> On Sunday 25 March 2007 21:46, Con Kolivas wrote:
>> On Sunday 25 March 2007 21:34, malc wrote:
>>> On Sun, 25 Mar 2007, Ingo Molnar wrote:
>>>> * Con Kolivas <kernel@kolivas.org> wrote:
>>>>> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>>>>
>>>> we want to do this - and we should do this to the vanilla scheduler
>>>> first and check the results. I've back-merged the patch to before RSDL
>>>> and have tested it - find the patch below. Vale, could you try this
>>>> patch against a 2.6.21-rc4-ish kernel and re-test your testcase?
>>>
>>> [..snip..]
>>>
>>> Compilation failed with:
>>> kernel/built-in.o(.sched.text+0x564): more undefined references to
>>> `__udivdi3' follow
>>>
>>> $ gcc --version | head -1
>>> gcc (GCC) 3.4.6
>>>
>>> $ cat /proc/cpuinfo | grep cpu
>>> cpu             : 7447A, altivec supported
>>>
>>> Can't say i really understand why 64-bit arithmetic suddenly became an
>>> issue here.
>>
>> Probably due to use of:
>>
>> #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
>> #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
>>
>> Excuse our 64bit world while we strive to correct our 32bit blindness and
>> fix this bug.
>
> Please try this (akpm please don't include till we confirm it builds on ppc,
> sorry). For 2.6.21-rc4
>

Works. Accuracy is on par (most of the time within a fraction of a
percent) with the ad hoc method my kernel module employs. However, there's
something strange happening w.r.t. iowait: it looks as if it doesn't
affect the idle field any more; nothing big, but a departure from the
previous behavior. Thank you very much for this.
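
(Plausibly because idle is now accumulated from idle_ns in update_cpu_clock()
whenever the idle task runs, including while nr_iowait is elevated, whereas
the removed tick path in account_system_time() credited idle only when there
was no outstanding iowait.)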

-- 
vale

* Re: [patch] sched: accurate user accounting
  2007-03-25 12:02       ` Con Kolivas
  2007-03-25 12:32         ` Gene Heskett
  2007-03-25 13:05         ` malc
@ 2007-03-25 13:06         ` malc
  2007-03-25 14:15           ` Con Kolivas
  2 siblings, 1 reply; 34+ messages in thread
From: malc @ 2007-03-25 13:06 UTC (permalink / raw)
  To: Con Kolivas
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Sun, 25 Mar 2007, Con Kolivas wrote:

> On Sunday 25 March 2007 21:46, Con Kolivas wrote:
>> On Sunday 25 March 2007 21:34, malc wrote:
>>> On Sun, 25 Mar 2007, Ingo Molnar wrote:
>>>> * Con Kolivas <kernel@kolivas.org> wrote:
>>>>> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?

[..snip..]

>
> ---
> Currently we only do cpu accounting to userspace based on what is
> actually happening precisely on each tick. The accuracy of that
> accounting gets progressively worse the lower HZ is. As we already keep
> accounting of nanosecond resolution we can accurately track user cpu,
> nice cpu and idle cpu if we move the accounting to update_cpu_clock with
> a nanosecond cpu_usage_stat entry. This increases overhead slightly but
> avoids the problem of tick aliasing errors making accounting unreliable.
>
> Signed-off-by: Con Kolivas <kernel@kolivas.org>
> Signed-off-by: Ingo Molnar <mingo@elte.hu>

[..snip..]

Forgot to mention. Given that this goes into the kernel, shouldn't
Documentation/cpu-load.txt be amended/removed?

-- 
vale

* Re: [patch] sched: accurate user accounting
  2007-03-25 12:41           ` Con Kolivas
@ 2007-03-25 13:33             ` Gene Heskett
  0 siblings, 0 replies; 34+ messages in thread
From: Gene Heskett @ 2007-03-25 13:33 UTC (permalink / raw)
  To: linux-kernel
  Cc: Con Kolivas, malc, Ingo Molnar, zwane, ck list, Andrew Morton,
	Thomas Gleixner

On Sunday 25 March 2007, Con Kolivas wrote:
>On Sunday 25 March 2007 22:32, Gene Heskett wrote:
>> On Sunday 25 March 2007, Con Kolivas wrote:
>> >On Sunday 25 March 2007 21:46, Con Kolivas wrote:
>> >> On Sunday 25 March 2007 21:34, malc wrote:
>> >> > On Sun, 25 Mar 2007, Ingo Molnar wrote:
>> >> > > * Con Kolivas <kernel@kolivas.org> wrote:
>> >> > >> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>> >> > >
>> >> > > we want to do this - and we should do this to the vanilla
>> >> > > scheduler first and check the results. I've back-merged the
>> >> > > patch to before RSDL and have tested it - find the patch below.
>> >> > > Vale, could you try this patch against a 2.6.21-rc4-ish kernel
>> >> > > and re-test your testcase?
>> >> >
>> >> > [..snip..]
>> >> >
>> >> > Compilation failed with:
>> >> > kernel/built-in.o(.sched.text+0x564): more undefined references
>> >> > to `__udivdi3' follow
>> >> >
>> >> > $ gcc --version | head -1
>> >> > gcc (GCC) 3.4.6
>> >> >
>> >> > $ cat /proc/cpuinfo | grep cpu
>> >> > cpu             : 7447A, altivec supported
>> >> >
>> >> > Can't say i really understand why 64-bit arithmetic suddenly
>> >> > became an issue here.
>> >>
>> >> Probably due to use of:
>> >>
>> >> #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
>> >> #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
>> >>
>> >> Excuse our 64bit world while we strive to correct our 32bit
>> >> blindness and fix this bug.
>> >
>> >Please try this (akpm please don't include till we confirm it builds
>> > on ppc, sorry). For 2.6.21-rc4
>> >
>> >---
>> >Currently we only do cpu accounting to userspace based on what is
>> >actually happening precisely on each tick. The accuracy of that
>> >accounting gets progressively worse the lower HZ is. As we already
>> > keep accounting of nanosecond resolution we can accurately track
>> > user cpu, nice cpu and idle cpu if we move the accounting to
>> > update_cpu_clock with a nanosecond cpu_usage_stat entry. This
>> > increases overhead slightly but avoids the problem of tick aliasing
>> > errors making accounting unreliable.
>>
>> I'm playing again because the final 2.6.20.4 does NOT break amanda,
>> where 2.6.20.4-rc1 did.
>
>Yes only the original version I posted on this email thread was for an
> RSDL 0.33 patched kernel. That original patch should build fine on i386
> and x86_64 (where I tried it). This version I sent out following Ingo's
> lead has 2.6.21-rc4 in mind (without rsdl).

And since I have an amdump session running in the background using the
first 2.6.20.4-rsdl-0.33 patch I got from your web page about an hour
ago, I am very pleased with this: I still have a usable machine, as gzip is
being restrained to a max of about 83% of the cpu.  And according to my
grepping of the /tmp/amanda-dbg/client/Daily/sendsize* files, this patch
does not break amanda.

Also, to Ingo Molnar, I've removed that sched-batch thing from my scripts 
as this feels considerably smoother in comparison.

Thank you very much, Con, and I hope you are improving.

-- 
Cheers, Gene
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
Aberdeen was so small that when the family with the car went
on vacation, the gas station and drive-in theatre had to close.

* Re: [patch] sched: accurate user accounting
  2007-03-25 13:06         ` malc
@ 2007-03-25 14:15           ` Con Kolivas
  2007-03-25 14:57             ` malc
  0 siblings, 1 reply; 34+ messages in thread
From: Con Kolivas @ 2007-03-25 14:15 UTC (permalink / raw)
  To: malc
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Sunday 25 March 2007 23:06, malc wrote:
> On Sun, 25 Mar 2007, Con Kolivas wrote:
> > On Sunday 25 March 2007 21:46, Con Kolivas wrote:
> >> On Sunday 25 March 2007 21:34, malc wrote:
> >>> On Sun, 25 Mar 2007, Ingo Molnar wrote:
> >>>> * Con Kolivas <kernel@kolivas.org> wrote:
> >>>>> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>
> [..snip..]
>
> > ---
> > Currently we only do cpu accounting to userspace based on what is
> > actually happening precisely on each tick. The accuracy of that
> > accounting gets progressively worse the lower HZ is. As we already keep
> > accounting of nanosecond resolution we can accurately track user cpu,
> > nice cpu and idle cpu if we move the accounting to update_cpu_clock with
> > a nanosecond cpu_usage_stat entry. This increases overhead slightly but
> > avoids the problem of tick aliasing errors making accounting unreliable.
> >
> > Signed-off-by: Con Kolivas <kernel@kolivas.org>
> > Signed-off-by: Ingo Molnar <mingo@elte.hu>
>
> [..snip..]
>
> Forgot to mention. Given that this goes into the kernel, shouldn't
> Documentation/cpu-load.txt be amended/removed?

Yes, that's a good idea. Also, there should be a sanity check because
sometimes, for some reason no one's been able to explain to me, sched_clock
gives a value which doesn't make sense (time appears to have gone backwards),
and that will completely ruin the accounting from then on.
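
Something along these lines, perhaps (an untested sketch, using the field
names from the patch):

	unsigned long long now = sched_clock();

	/* if the clock appears to have gone backwards, resync and skip
	 * this sample rather than accumulate a bogus time_diff */
	if (unlikely(now < p->last_ran)) {
		p->last_ran = now;
		return;
	}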

-- 
-ck

* Re: [patch] sched: accurate user accounting
  2007-03-25 14:15           ` Con Kolivas
@ 2007-03-25 14:57             ` malc
  2007-03-25 15:08               ` Con Kolivas
  0 siblings, 1 reply; 34+ messages in thread
From: malc @ 2007-03-25 14:57 UTC (permalink / raw)
  To: Con Kolivas
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Mon, 26 Mar 2007, Con Kolivas wrote:

> On Sunday 25 March 2007 23:06, malc wrote:
>> On Sun, 25 Mar 2007, Con Kolivas wrote:
>>> On Sunday 25 March 2007 21:46, Con Kolivas wrote:
>>>> On Sunday 25 March 2007 21:34, malc wrote:
>>>>> On Sun, 25 Mar 2007, Ingo Molnar wrote:
>>>>>> * Con Kolivas <kernel@kolivas.org> wrote:
>>>>>>> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>>
>> [..snip..]
>>
>>> ---
>>> Currently we only do cpu accounting to userspace based on what is
>>> actually happening precisely on each tick. The accuracy of that
>>> accounting gets progressively worse the lower HZ is. As we already keep
>>> accounting of nanosecond resolution we can accurately track user cpu,
>>> nice cpu and idle cpu if we move the accounting to update_cpu_clock with
>>> a nanosecond cpu_usage_stat entry. This increases overhead slightly but
>>> avoids the problem of tick aliasing errors making accounting unreliable.
>>>
>>> Signed-off-by: Con Kolivas <kernel@kolivas.org>
>>> Signed-off-by: Ingo Molnar <mingo@elte.hu>
>>
>> [..snip..]
>>
>> Forgot to mention. Given that this goes into the kernel, shouldn't
>> Documentation/cpu-load.txt be amended/removed?
>
> Yes, that's a good idea. Also, there should be a sanity check because
> sometimes, for some reason no one's been able to explain to me, sched_clock
> gives a value which doesn't make sense (time appears to have gone backwards),
> and that will completely ruin the accounting from then on.

After running this new kernel for a while i guess i have hit this issue:
http://www.boblycat.org/~malc/apc/bad-load.png

Top and icewm's monitor show an incredibly huge load while in reality
nothing like that is happening. Both the ad-hoc method and `/proc/stat' (idle)
show normal CPU utilization (7%, since i'm doing some A/V stuff in the
background)

-- 
vale

* Re: [patch] sched: accurate user accounting
  2007-03-25 14:57             ` malc
@ 2007-03-25 15:08               ` Con Kolivas
  2007-03-25 15:19                 ` malc
  0 siblings, 1 reply; 34+ messages in thread
From: Con Kolivas @ 2007-03-25 15:08 UTC (permalink / raw)
  To: malc
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Monday 26 March 2007 00:57, malc wrote:
> On Mon, 26 Mar 2007, Con Kolivas wrote:
> > On Sunday 25 March 2007 23:06, malc wrote:
> >> On Sun, 25 Mar 2007, Con Kolivas wrote:
> >>> On Sunday 25 March 2007 21:46, Con Kolivas wrote:
> >>>> On Sunday 25 March 2007 21:34, malc wrote:
> >>>>> On Sun, 25 Mar 2007, Ingo Molnar wrote:
> >>>>>> * Con Kolivas <kernel@kolivas.org> wrote:
> >>>>>>> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
> >>
> >> [..snip..]
> >>
> >>> ---
> >>> Currently we only do cpu accounting to userspace based on what is
> >>> actually happening precisely on each tick. The accuracy of that
> >>> accounting gets progressively worse the lower HZ is. As we already keep
> >>> accounting of nanosecond resolution we can accurately track user cpu,
> >>> nice cpu and idle cpu if we move the accounting to update_cpu_clock
> >>> with a nanosecond cpu_usage_stat entry. This increases overhead
> >>> slightly but avoids the problem of tick aliasing errors making
> >>> accounting unreliable.
> >>>
> >>> Signed-off-by: Con Kolivas <kernel@kolivas.org>
> >>> Signed-off-by: Ingo Molnar <mingo@elte.hu>
> >>
> >> [..snip..]
> >>
> >> Forgot to mention. Given that this goes into the kernel, shouldn't
> >> Documentation/cpu-load.txt be amended/removed?
> >
> > Yes, that's a good idea. Also, there should be a sanity check because
> > sometimes, for some reason no one's been able to explain to me, sched_clock
> > gives a value which doesn't make sense (time appears to have gone
> > backwards), and that will completely ruin the accounting from then on.
>
> After running this new kernel for a while i guess i have hit this issue:
> http://www.boblycat.org/~malc/apc/bad-load.png
>
> Top and icewm's monitor do show incredibly huge load while in reality
> nothing like that is really happening. Both ad-hoc and `/proc/stat' (idle)
> show normal CPU utilization (7% since i'm doing some A/V stuff in the
> background)

Yes I'd say you hit the problem I described earlier. When playing with
sched_clock() I found it gave some "interesting" results fairly infrequently.
They could lead to ridiculous accounting mistakes.

So before we go any further with this patch, can you try the following one and 
see if this simple sanity check is enough?

Thanks!

---
Currently we only do cpu accounting to userspace based on what is 
actually happening precisely on each tick. The accuracy of that 
accounting gets progressively worse the lower HZ is. As we already keep 
accounting of nanosecond resolution we can accurately track user cpu, 
nice cpu and idle cpu if we move the accounting to update_cpu_clock with 
a nanosecond cpu_usage_stat entry. This increases overhead slightly but 
avoids the problem of tick aliasing errors making accounting unreliable.

Remove the now defunct Documentation/cpu-load.txt file.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
---
 Documentation/cpu-load.txt  |  113 --------------------------------------------
 include/linux/kernel_stat.h |    3 +
 include/linux/sched.h       |    2 
 kernel/sched.c              |   50 ++++++++++++++++++-
 kernel/timer.c              |    5 -
 5 files changed, 53 insertions(+), 120 deletions(-)

Index: linux-2.6.21-rc4-acct/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.21-rc4-acct.orig/include/linux/kernel_stat.h	2007-03-26 00:56:05.000000000 +1000
+++ linux-2.6.21-rc4-acct/include/linux/kernel_stat.h	2007-03-26 00:56:25.000000000 +1000
@@ -16,11 +16,14 @@
 
 struct cpu_usage_stat {
 	cputime64_t user;
+	cputime64_t user_ns;
 	cputime64_t nice;
+	cputime64_t nice_ns;
 	cputime64_t system;
 	cputime64_t softirq;
 	cputime64_t irq;
 	cputime64_t idle;
+	cputime64_t idle_ns;
 	cputime64_t iowait;
 	cputime64_t steal;
 };
Index: linux-2.6.21-rc4-acct/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc4-acct.orig/include/linux/sched.h	2007-03-26 00:56:05.000000000 +1000
+++ linux-2.6.21-rc4-acct/include/linux/sched.h	2007-03-26 00:57:01.000000000 +1000
@@ -882,7 +882,7 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	unsigned long rt_priority;
-	cputime_t utime, stime;
+	cputime_t utime, utime_ns, stime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
Index: linux-2.6.21-rc4-acct/kernel/sched.c
===================================================================
--- linux-2.6.21-rc4-acct.orig/kernel/sched.c	2007-03-26 00:56:05.000000000 +1000
+++ linux-2.6.21-rc4-acct/kernel/sched.c	2007-03-26 01:01:22.000000000 +1000
@@ -89,6 +89,7 @@ unsigned long long __attribute__((weak))
  */
 #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define JIFFY_NS		JIFFIES_TO_NS(1)
 
 /*
  * These are the 'tuning knobs' of the scheduler:
@@ -3017,8 +3018,53 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-	p->sched_time += now - p->last_ran;
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t time_diff;
+
 	p->last_ran = rq->most_recent_timestamp = now;
+	/* Sanity check. It should never go backwards or ruin accounting */
+	if (unlikely(now < p->last_ran))
+		return;
+	time_diff = now - p->last_ran;
+	p->sched_time += time_diff;
+	if (p != rq->idle) {
+		cputime_t utime_diff = time_diff;
+
+		if (TASK_NICE(p) > 0) {
+			cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
+							 time_diff);
+			if (cpustat->nice_ns > JIFFY_NS) {
+				cpustat->nice_ns =
+					cputime64_sub(cpustat->nice_ns,
+					JIFFY_NS);
+				cpustat->nice =
+					cputime64_add(cpustat->nice, 1);
+			}
+		} else {
+			cpustat->user_ns = cputime64_add(cpustat->user_ns,
+							 time_diff);
+			if (cpustat->user_ns > JIFFY_NS) {
+				cpustat->user_ns =
+					cputime64_sub(cpustat->user_ns,
+					JIFFY_NS);
+				cpustat->user =
+					cputime64_add(cpustat->user, 1);
+			}
+		}
+		p->utime_ns = cputime_add(p->utime_ns, utime_diff);
+		if (p->utime_ns > JIFFY_NS) {
+			p->utime_ns = cputime_sub(p->utime_ns, JIFFY_NS);
+			p->utime = cputime_add(p->utime,
+					       jiffies_to_cputime(1));
+		}
+	} else {
+		cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff);
+		if (cpustat->idle_ns > JIFFY_NS) {
+			cpustat->idle_ns = cputime64_sub(cpustat->idle_ns,
+							 JIFFY_NS);
+			cpustat->idle = cputime64_add(cpustat->idle, 1);
+		}
+	}
 }
 
 /*
@@ -3104,8 +3150,6 @@ void account_system_time(struct task_str
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 	else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-	else
-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
Index: linux-2.6.21-rc4-acct/kernel/timer.c
===================================================================
--- linux-2.6.21-rc4-acct.orig/kernel/timer.c	2007-03-26 00:56:05.000000000 +1000
+++ linux-2.6.21-rc4-acct/kernel/timer.c	2007-03-26 00:56:25.000000000 +1000
@@ -1196,10 +1196,9 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick)
-		account_user_time(p, jiffies_to_cputime(1));
-	else
+	if (!user_tick)
 		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+	/* User time is accounted for in update_cpu_clock in sched.c */
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);
Index: linux-2.6.21-rc4-acct/Documentation/cpu-load.txt
===================================================================
--- linux-2.6.21-rc4-acct.orig/Documentation/cpu-load.txt	2007-03-21 12:52:51.000000000 +1100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,113 +0,0 @@
-CPU load
---------
-
-Linux exports various bits of information via `/proc/stat' and
-`/proc/uptime' that userland tools, such as top(1), use to calculate
-the average time system spent in a particular state, for example:
-
-    $ iostat
-    Linux 2.6.18.3-exp (linmac)     02/20/2007
-
-    avg-cpu:  %user   %nice %system %iowait  %steal   %idle
-              10.01    0.00    2.92    5.44    0.00   81.63
-
-    ...
-
-Here the system thinks that over the default sampling period the
-system spent 10.01% of the time doing work in user space, 2.92% in the
-kernel, and was overall 81.63% of the time idle.
-
-In most cases the `/proc/stat' information reflects the reality quite
-closely, however due to the nature of how/when the kernel collects
-this data sometimes it can not be trusted at all.
-
-So how is this information collected?  Whenever timer interrupt is
-signalled the kernel looks what kind of task was running at this
-moment and increments the counter that corresponds to this tasks
-kind/state.  The problem with this is that the system could have
-switched between various states multiple times between two timer
-interrupts yet the counter is incremented only for the last state.
-
-
-Example
--------
-
-If we imagine the system with one task that periodically burns cycles
-in the following manner:
-
- time line between two timer interrupts
-|--------------------------------------|
- ^                                    ^
- |_ something begins working          |
-                                      |_ something goes to sleep
-                                     (only to be awaken quite soon)
-
-In the above situation the system will be 0% loaded according to the
-`/proc/stat' (since the timer interrupt will always happen when the
-system is executing the idle handler), but in reality the load is
-closer to 99%.
-
-One can imagine many more situations where this behavior of the kernel
-will lead to quite erratic information inside `/proc/stat'.
-
-
-/* gcc -o hog smallhog.c */
-#include <time.h>
-#include <limits.h>
-#include <signal.h>
-#include <sys/time.h>
-#define HIST 10
-
-static volatile sig_atomic_t stop;
-
-static void sighandler (int signr)
-{
-     (void) signr;
-     stop = 1;
-}
-static unsigned long hog (unsigned long niters)
-{
-     stop = 0;
-     while (!stop && --niters);
-     return niters;
-}
-int main (void)
-{
-     int i;
-     struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
-                             .it_value = { .tv_sec = 0, .tv_usec = 1 } };
-     sigset_t set;
-     unsigned long v[HIST];
-     double tmp = 0.0;
-     unsigned long n;
-     signal (SIGALRM, &sighandler);
-     setitimer (ITIMER_REAL, &it, NULL);
-
-     hog (ULONG_MAX);
-     for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
-     for (i = 0; i < HIST; ++i) tmp += v[i];
-     tmp /= HIST;
-     n = tmp - (tmp / 3.0);
-
-     sigemptyset (&set);
-     sigaddset (&set, SIGALRM);
-
-     for (;;) {
-         hog (n);
-         sigwait (&set, &i);
-     }
-     return 0;
-}
-
-
-References
-----------
-
-http://lkml.org/lkml/2007/2/12/6
-Documentation/filesystems/proc.txt (1.8)
-
-
-Thanks
-------
-
-Con Kolivas, Pavel Machek



-- 
-ck

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-03-25 15:08               ` Con Kolivas
@ 2007-03-25 15:19                 ` malc
  2007-03-25 15:28                   ` Con Kolivas
  0 siblings, 1 reply; 34+ messages in thread
From: malc @ 2007-03-25 15:19 UTC (permalink / raw)
  To: Con Kolivas
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Mon, 26 Mar 2007, Con Kolivas wrote:

> On Monday 26 March 2007 00:57, malc wrote:
>> On Mon, 26 Mar 2007, Con Kolivas wrote:
>>> On Sunday 25 March 2007 23:06, malc wrote:
>>>> On Sun, 25 Mar 2007, Con Kolivas wrote:
>>>>> On Sunday 25 March 2007 21:46, Con Kolivas wrote:
>>>>>> On Sunday 25 March 2007 21:34, malc wrote:
>>>>>>> On Sun, 25 Mar 2007, Ingo Molnar wrote:
>>>>>>>> * Con Kolivas <kernel@kolivas.org> wrote:
>>>>>>>>> For an rsdl 0.33 patched kernel. Comments? Overhead worth it?
>>>>
>>>> [..snip..]
>>>>
>>>>> ---
>>>>> Currently we only do cpu accounting to userspace based on what is
>>>>> actually happening precisely on each tick. The accuracy of that
>>>>> accounting gets progressively worse the lower HZ is. As we already keep
>>>>> accounting of nanosecond resolution we can accurately track user cpu,
>>>>> nice cpu and idle cpu if we move the accounting to update_cpu_clock
>>>>> with a nanosecond cpu_usage_stat entry. This increases overhead
>>>>> slightly but avoids the problem of tick aliasing errors making
>>>>> accounting unreliable.
>>>>>
>>>>> Signed-off-by: Con Kolivas <kernel@kolivas.org>
>>>>> Signed-off-by: Ingo Molnar <mingo@elte.hu>
>>>>
>>>> [..snip..]
>>>>
>>>> Forgot to mention. Given that this goes into the kernel, shouldn't
>>>> Documentation/cpu-load.txt be amended/removed?
>>>
>>> Yes that's a good idea. Also there should be a sanity check because
>>> sometimes, for some reason no one's been able to explain to me, sched_clock
>>> gives a value which doesn't make sense (time appears to have gone
>>> backwards) and that will completely ruin the accounting from then on.
>>
>> After running this new kernel for a while I guess I have hit this issue:
>> http://www.boblycat.org/~malc/apc/bad-load.png
>>
>> Top and icewm's monitor show an incredibly huge load while in reality
>> nothing like that is really happening. Both the ad-hoc method and
>> `/proc/stat' (idle) show normal CPU utilization (7%, since I'm doing some
>> A/V stuff in the background).
>
> Yes I'd say you hit the problem I described earlier. When playing with
> sched_clock() I found it gave some "interesting" results fairly infrequently.
> They could lead to ridiculous accounting mistakes.
>
> So before we go any further with this patch, can you try the following one and
> see if this simple sanity check is enough?

Sure (compiling the kernel now); too bad the old axiom that testing cannot
confirm the absence of bugs holds.

I have one nit and one request for clarification. Question first (I
admit I haven't looked at the surroundings of the patch; maybe things
would have been self-evident if I had):

What this patch amounts to is that the accounting logic is moved from
the timer interrupt to the place where the scheduler switches tasks (or
something to that effect)?

[..snip..]

>  * These are the 'tuning knobs' of the scheduler:
> @@ -3017,8 +3018,53 @@ EXPORT_PER_CPU_SYMBOL(kstat);
> static inline void
> update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
> {
> -	p->sched_time += now - p->last_ran;
> +	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
> +	cputime64_t time_diff;
> +
> 	p->last_ran = rq->most_recent_timestamp = now;
> +	/* Sanity check. It should never go backwards or ruin accounting */
> +	if (unlikely(now < p->last_ran))
> +		return;
> +	time_diff = now - p->last_ran;

A nit. Anything wrong with:

time_diff = now - p->last_ran;
if (unlikely(LESS_THAN_ZERO(time_diff)))
         return;

[..snip..]

-- 
vale

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-03-25 15:19                 ` malc
@ 2007-03-25 15:28                   ` Con Kolivas
  2007-03-25 17:14                     ` malc
  0 siblings, 1 reply; 34+ messages in thread
From: Con Kolivas @ 2007-03-25 15:28 UTC (permalink / raw)
  To: malc
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Monday 26 March 2007 01:19, malc wrote:
> On Mon, 26 Mar 2007, Con Kolivas wrote:
> > So before we go any further with this patch, can you try the following
> > one and see if this simple sanity check is enough?
>
> Sure (compiling the kernel now); too bad the old axiom that testing cannot
> confirm the absence of bugs holds.
>
> I have one nit and one request for clarification. Question first (I
> admit I haven't looked at the surroundings of the patch; maybe things
> would have been self-evident if I had):
>
> What this patch amounts to is that the accounting logic is moved from
> the timer interrupt to the place where the scheduler switches tasks (or
> something to that effect)?

Both the scheduler tick and the context switch now. So yes, it adds overhead
as I said; we already do update_cpu_clock on context switch, but it's not
this complex.
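
To illustrate why tick sampling alone goes wrong, here's a standalone
userspace simulation (not kernel code; the numbers are made up): a task
that runs in bursts which always sleep across the tick is invisible to
per-tick sampling, while summing the nanosecond deltas at each switch
accounts for it correctly.

/* gcc -o alias alias.c */
#include <stdio.h>

int main (void)
{
     const long long tick_ns = 10000000;   /* HZ=100 -> 10 ms tick */
     long long now = 0, user_ticks = 0, user_ns = 0;
     int i;

     for (i = 0; i < 1000; ++i) {
         long long start = now;
         now += 9000000;                   /* task runs for 9 ms ... */
         user_ns += now - start;           /* delta-based accounting */
         if (start / tick_ns != now / tick_ns)
             user_ticks++;                 /* tick-based sampling */
         now += 1000000;                   /* ... then sleeps 1 ms */
     }
     /* prints "tick-based: 0%, delta-based: 90%" */
     printf ("tick-based: %lld%%, delta-based: %lld%%\n",
             user_ticks * tick_ns * 100 / now, user_ns * 100 / now);
     return 0;
}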

> [..snip..]
>
> >  * These are the 'tuning knobs' of the scheduler:
> > @@ -3017,8 +3018,53 @@ EXPORT_PER_CPU_SYMBOL(kstat);
> > static inline void
> > update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
> > {
> > -	p->sched_time += now - p->last_ran;
> > +	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
> > +	cputime64_t time_diff;
> > +
> > 	p->last_ran = rq->most_recent_timestamp = now;
> > +	/* Sanity check. It should never go backwards or ruin accounting */
> > +	if (unlikely(now < p->last_ran))
> > +		return;
> > +	time_diff = now - p->last_ran;
>
> A nit. Anything wrong with:
>
> time_diff = now - p->last_ran;
> if (unlikely(LESS_THAN_ZERO(time_diff)))
>          return;

Does LESS_THAN_ZERO work on a cputime64_t on all arches? I can't figure that
out just by looking myself, which is why I did it the other way.
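
To illustrate the concern with a standalone sketch (assuming cputime64_t
is an unsigned 64-bit quantity, as it is on most arches): an unsigned
difference can never be negative, it just wraps to a huge value, which is
exactly what would ruin the accounting:

/* gcc -o wrap wrap.c */
#include <stdio.h>
#include <stdint.h>

int main (void)
{
     uint64_t last_ran = 1000, now = 900;  /* clock went backwards */
     uint64_t diff = now - last_ran;       /* wraps: never negative */

     printf ("diff = %llu\n", (unsigned long long) diff);
     if (now < last_ran)                   /* compare the operands instead */
         printf ("time went backwards, skip accounting\n");
     return 0;
}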

-- 
-ck

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-03-25 15:28                   ` Con Kolivas
@ 2007-03-25 17:14                     ` malc
  2007-03-25 23:01                       ` Con Kolivas
  0 siblings, 1 reply; 34+ messages in thread
From: malc @ 2007-03-25 17:14 UTC (permalink / raw)
  To: Con Kolivas
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Mon, 26 Mar 2007, Con Kolivas wrote:

> On Monday 26 March 2007 01:19, malc wrote:
>> On Mon, 26 Mar 2007, Con Kolivas wrote:
>>> So before we go any further with this patch, can you try the following
>>> one and see if this simple sanity check is enough?
>>
>> Sure (compiling the kernel now); too bad the old axiom that testing cannot
>> confirm the absence of bugs holds.
>>
>> I have one nit and one request for clarification. Question first (I
>> admit I haven't looked at the surroundings of the patch; maybe things
>> would have been self-evident if I had):
>>
>> What this patch amounts to is that the accounting logic is moved from
>> the timer interrupt to the place where the scheduler switches tasks (or
>> something to that effect)?
>
> Both the scheduler tick and the context switch now. So yes, it adds overhead
> as I said; we already do update_cpu_clock on context switch, but it's not
> this complex.
>
>> [..snip..]
>>
>>>  * These are the 'tuning knobs' of the scheduler:
>>> @@ -3017,8 +3018,53 @@ EXPORT_PER_CPU_SYMBOL(kstat);
>>> static inline void
>>> update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
>>> {
>>> -	p->sched_time += now - p->last_ran;
>>> +	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
>>> +	cputime64_t time_diff;
>>> +
>>> 	p->last_ran = rq->most_recent_timestamp = now;
>>> +	/* Sanity check. It should never go backwards or ruin accounting */
>>> +	if (unlikely(now < p->last_ran))
>>> +		return;
>>> +	time_diff = now - p->last_ran;
>>
>> A nit. Anything wrong with:
>>
>> time_diff = now - p->last_ran;
>> if (unlikely(LESS_THAN_ZERO(time_diff)))
>>          return;
>
> Does LESS_THAN_ZERO work on a cputime64_t on all arches? I can't figure that
> out just by looking myself, which is why I did it the other way.

I have no idea what type cputime64_t really is, so I used this imaginary
LESS_THAN_ZERO thing.

Erm... I just looked at the code and suddenly it stopped making any sense
at all:

         p->last_ran = rq->most_recent_timestamp = now;
         /* Sanity check. It should never go backwards or ruin accounting */
         if (unlikely(now < p->last_ran))
                 return;
         time_diff = now - p->last_ran;

First `now' is assigned to `p->last_ran', the very next line
compares those two values, and then the difference is taken... I quite
frankly am either very tired or fail to see the point... time_diff is
either always zero or there's always a race here.
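
To put it another way, here is a trivial standalone sketch (names are
hypothetical, this is not the kernel code) of why the delta always comes
out as zero when the stamp is overwritten first:

/* gcc -o order order.c */
#include <stdio.h>

static unsigned long long last_ran;

/* broken: the stamp is advanced before the old value is used */
static unsigned long long delta_broken (unsigned long long now)
{
     last_ran = now;
     return now - last_ran;                /* always 0 */
}

/* intended: compute the delta first, then advance the stamp */
static unsigned long long delta_fixed (unsigned long long now)
{
     unsigned long long diff = now - last_ran;
     last_ran = now;
     return diff;
}

int main (void)
{
     last_ran = 100;
     printf ("broken: %llu\n", delta_broken (150));  /* prints 0 */
     last_ran = 100;
     printf ("fixed:  %llu\n", delta_fixed (150));   /* prints 50 */
     return 0;
}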

--
vale

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-03-25 17:14                     ` malc
@ 2007-03-25 23:01                       ` Con Kolivas
  2007-03-25 23:57                         ` Con Kolivas
  0 siblings, 1 reply; 34+ messages in thread
From: Con Kolivas @ 2007-03-25 23:01 UTC (permalink / raw)
  To: malc
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Monday 26 March 2007 03:14, malc wrote:
> On Mon, 26 Mar 2007, Con Kolivas wrote:
> > On Monday 26 March 2007 01:19, malc wrote:
> >> On Mon, 26 Mar 2007, Con Kolivas wrote:
> >>> So before we go any further with this patch, can you try the following
> >>> one and see if this simple sanity check is enough?
> >>
> >> Sure (compiling the kernel now); too bad the old axiom that testing cannot
> >> confirm the absence of bugs holds.
> >>
> >> I have one nit and one request for clarification. Question first (I
> >> admit I haven't looked at the surroundings of the patch; maybe things
> >> would have been self-evident if I had):
> >>
> >> What this patch amounts to is that the accounting logic is moved from
> >> the timer interrupt to the place where the scheduler switches tasks (or
> >> something to that effect)?
> >
> > Both the scheduler tick and the context switch now. So yes, it adds
> > overhead as I said; we already do update_cpu_clock on context switch, but
> > it's not this complex.
> >
> >> [..snip..]
> >>
> >>>  * These are the 'tuning knobs' of the scheduler:
> >>> @@ -3017,8 +3018,53 @@ EXPORT_PER_CPU_SYMBOL(kstat);
> >>> static inline void
> >>> update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
> >>> {
> >>> -	p->sched_time += now - p->last_ran;
> >>> +	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
> >>> +	cputime64_t time_diff;
> >>> +
> >>> 	p->last_ran = rq->most_recent_timestamp = now;
> >>> +	/* Sanity check. It should never go backwards or ruin accounting */
> >>> +	if (unlikely(now < p->last_ran))
> >>> +		return;
> >>> +	time_diff = now - p->last_ran;
> >>
> >> A nit. Anything wrong with:
> >>
> >> time_diff = now - p->last_ran;
> >> if (unlikely(LESS_THAN_ZERO(time_diff)))
> >>          return;
> >
> > Does LESS_THAN_ZERO work on a cputime64_t on all arches? I can't figure
> > that out just by looking myself, which is why I did it the other way.
>
> I have no idea what type cputime64_t really is, so I used this imaginary
> LESS_THAN_ZERO thing.
>
> Erm... I just looked at the code and suddenly it stopped making any sense
> at all:
>
>          p->last_ran = rq->most_recent_timestamp = now;
>          /* Sanity check. It should never go backwards or ruin accounting */
>          if (unlikely(now < p->last_ran))
>                  return;
>          time_diff = now - p->last_ran;
>
> First `now' is assigned to `p->last_ran', the very next line
> compares those two values, and then the difference is taken... I quite
> frankly am either very tired or fail to see the point... time_diff is
> either always zero or there's always a race here.

Bah, major thinko on my part! That will teach me to post patches untested
at 1:30 am. I'll try again shortly, sorry.

-- 
-ck

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-03-25 23:01                       ` Con Kolivas
@ 2007-03-25 23:57                         ` Con Kolivas
  2007-03-26 10:49                           ` malc
  0 siblings, 1 reply; 34+ messages in thread
From: Con Kolivas @ 2007-03-25 23:57 UTC (permalink / raw)
  To: malc
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Monday 26 March 2007 09:01, Con Kolivas wrote:
> On Monday 26 March 2007 03:14, malc wrote:
> > On Mon, 26 Mar 2007, Con Kolivas wrote:
> > > On Monday 26 March 2007 01:19, malc wrote:
> > Erm... I just looked at the code and suddenly it stopped making any sense
> > at all:
> >
> >          p->last_ran = rq->most_recent_timestamp = now;
> >          /* Sanity check. It should never go backwards or ruin accounting */
> >          if (unlikely(now < p->last_ran))
> >                  return;
> >          time_diff = now - p->last_ran;
> >
> > First `now' is assigned to `p->last_ran', the very next line
> > compares those two values, and then the difference is taken... I quite
> > frankly am either very tired or fail to see the point... time_diff is
> > either always zero or there's always a race here.
>
> Bah, major thinko on my part! That will teach me to post patches
> untested at 1:30 am. I'll try again shortly, sorry.

Ok this one is heavily tested. Please try it when you find the time.

---
Currently we only do cpu accounting to userspace based on what is 
actually happening precisely on each tick. The accuracy of that 
accounting gets progressively worse the lower HZ is. As we already keep 
accounting of nanosecond resolution we can accurately track user cpu, 
nice cpu and idle cpu if we move the accounting to update_cpu_clock with 
a nanosecond cpu_usage_stat entry. This increases overhead slightly but 
avoids the problem of tick aliasing errors making accounting unreliable.

Remove the now defunct Documentation/cpu-load.txt file.

Signed-off-by: Con Kolivas <kernel@kolivas.org>

---
 Documentation/cpu-load.txt  |  113 --------------------------------------------
 include/linux/kernel_stat.h |    3 +
 include/linux/sched.h       |    2 
 kernel/sched.c              |   58 +++++++++++++++++++++-
 kernel/timer.c              |    5 -
 5 files changed, 60 insertions(+), 121 deletions(-)

Index: linux-2.6.21-rc4-acct/include/linux/kernel_stat.h
===================================================================
--- linux-2.6.21-rc4-acct.orig/include/linux/kernel_stat.h	2007-03-26 00:56:05.000000000 +1000
+++ linux-2.6.21-rc4-acct/include/linux/kernel_stat.h	2007-03-26 00:56:25.000000000 +1000
@@ -16,11 +16,14 @@
 
 struct cpu_usage_stat {
 	cputime64_t user;
+	cputime64_t user_ns;
 	cputime64_t nice;
+	cputime64_t nice_ns;
 	cputime64_t system;
 	cputime64_t softirq;
 	cputime64_t irq;
 	cputime64_t idle;
+	cputime64_t idle_ns;
 	cputime64_t iowait;
 	cputime64_t steal;
 };
Index: linux-2.6.21-rc4-acct/include/linux/sched.h
===================================================================
--- linux-2.6.21-rc4-acct.orig/include/linux/sched.h	2007-03-26 00:56:05.000000000 +1000
+++ linux-2.6.21-rc4-acct/include/linux/sched.h	2007-03-26 00:57:01.000000000 +1000
@@ -882,7 +882,7 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	unsigned long rt_priority;
-	cputime_t utime, stime;
+	cputime_t utime, utime_ns, stime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
Index: linux-2.6.21-rc4-acct/kernel/sched.c
===================================================================
--- linux-2.6.21-rc4-acct.orig/kernel/sched.c	2007-03-26 00:56:05.000000000 +1000
+++ linux-2.6.21-rc4-acct/kernel/sched.c	2007-03-26 09:38:50.000000000 +1000
@@ -89,6 +89,7 @@ unsigned long long __attribute__((weak))
  */
 #define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define JIFFY_NS		JIFFIES_TO_NS(1)
 
 /*
  * These are the 'tuning knobs' of the scheduler:
@@ -3017,8 +3018,59 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
-	p->sched_time += now - p->last_ran;
-	p->last_ran = rq->most_recent_timestamp = now;
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t time_diff;
+
+	/* Sanity check. It should never go backwards or ruin accounting */
+	if (unlikely(now < p->last_ran))
+		goto out_set;
+	/* All the userspace visible cpu accounting is done here */
+	time_diff = now - p->last_ran;
+	p->sched_time += time_diff;
+	if (p != rq->idle) {
+		cputime_t utime_diff = time_diff;
+
+		if (TASK_NICE(p) > 0) {
+			cpustat->nice_ns = cputime64_add(cpustat->nice_ns,
+							 time_diff);
+			if (cpustat->nice_ns > JIFFY_NS) {
+				cpustat->nice_ns =
+					cputime64_sub(cpustat->nice_ns,
+					JIFFY_NS);
+				cpustat->nice =
+					cputime64_add(cpustat->nice, 1);
+			}
+		} else {
+			cpustat->user_ns = cputime64_add(cpustat->user_ns,
+							 time_diff);
+			if (cpustat->user_ns > JIFFY_NS) {
+				cpustat->user_ns =
+					cputime64_sub(cpustat->user_ns,
+					JIFFY_NS);
+				cpustat->user =
+					cputime64_add(cpustat->user, 1);
+			}
+		}
+		p->utime_ns = cputime_add(p->utime_ns, utime_diff);
+		if (p->utime_ns > JIFFY_NS) {
+			p->utime_ns = cputime_sub(p->utime_ns, JIFFY_NS);
+			p->utime = cputime_add(p->utime,
+					       jiffies_to_cputime(1));
+		}
+	} else {
+		cpustat->idle_ns = cputime64_add(cpustat->idle_ns, time_diff);
+		if (cpustat->idle_ns > JIFFY_NS) {
+			cpustat->idle_ns = cputime64_sub(cpustat->idle_ns,
+							 JIFFY_NS);
+			cpustat->idle = cputime64_add(cpustat->idle, 1);
+		}
+	}
+out_set:
+	/*
+	 * We still need to set these values even if the clock appeared to
+	 * go backwards in case _this_ is the correct timestamp.
+	 */
+	rq->most_recent_timestamp = p->last_ran = now;
 }
 
 /*
@@ -3104,8 +3156,6 @@ void account_system_time(struct task_str
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 	else if (atomic_read(&rq->nr_iowait) > 0)
 		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-	else
-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
Index: linux-2.6.21-rc4-acct/kernel/timer.c
===================================================================
--- linux-2.6.21-rc4-acct.orig/kernel/timer.c	2007-03-26 00:56:05.000000000 +1000
+++ linux-2.6.21-rc4-acct/kernel/timer.c	2007-03-26 00:56:25.000000000 +1000
@@ -1196,10 +1196,9 @@ void update_process_times(int user_tick)
 	int cpu = smp_processor_id();
 
 	/* Note: this timer irq context must be accounted for as well. */
-	if (user_tick)
-		account_user_time(p, jiffies_to_cputime(1));
-	else
+	if (!user_tick)
 		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+	/* User time is accounted for in update_cpu_clock in sched.c */
 	run_local_timers();
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);
Index: linux-2.6.21-rc4-acct/Documentation/cpu-load.txt
===================================================================
--- linux-2.6.21-rc4-acct.orig/Documentation/cpu-load.txt	2007-03-21 12:52:51.000000000 +1100
+++ /dev/null	1970-01-01 00:00:00.000000000 +0000
@@ -1,113 +0,0 @@
-CPU load
---------
-
-Linux exports various bits of information via `/proc/stat' and
-`/proc/uptime' that userland tools, such as top(1), use to calculate
-the average time system spent in a particular state, for example:
-
-    $ iostat
-    Linux 2.6.18.3-exp (linmac)     02/20/2007
-
-    avg-cpu:  %user   %nice %system %iowait  %steal   %idle
-              10.01    0.00    2.92    5.44    0.00   81.63
-
-    ...
-
-Here the system thinks that over the default sampling period the
-system spent 10.01% of the time doing work in user space, 2.92% in the
-kernel, and was overall 81.63% of the time idle.
-
-In most cases the `/proc/stat' information reflects the reality quite
-closely, however due to the nature of how/when the kernel collects
-this data sometimes it can not be trusted at all.
-
-So how is this information collected?  Whenever timer interrupt is
-signalled the kernel looks what kind of task was running at this
-moment and increments the counter that corresponds to this tasks
-kind/state.  The problem with this is that the system could have
-switched between various states multiple times between two timer
-interrupts yet the counter is incremented only for the last state.
-
-
-Example
--------
-
-If we imagine the system with one task that periodically burns cycles
-in the following manner:
-
- time line between two timer interrupts
-|--------------------------------------|
- ^                                    ^
- |_ something begins working          |
-                                      |_ something goes to sleep
-                                     (only to be awaken quite soon)
-
-In the above situation the system will be 0% loaded according to the
-`/proc/stat' (since the timer interrupt will always happen when the
-system is executing the idle handler), but in reality the load is
-closer to 99%.
-
-One can imagine many more situations where this behavior of the kernel
-will lead to quite erratic information inside `/proc/stat'.
-
-
-/* gcc -o hog smallhog.c */
-#include <time.h>
-#include <limits.h>
-#include <signal.h>
-#include <sys/time.h>
-#define HIST 10
-
-static volatile sig_atomic_t stop;
-
-static void sighandler (int signr)
-{
-     (void) signr;
-     stop = 1;
-}
-static unsigned long hog (unsigned long niters)
-{
-     stop = 0;
-     while (!stop && --niters);
-     return niters;
-}
-int main (void)
-{
-     int i;
-     struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
-                             .it_value = { .tv_sec = 0, .tv_usec = 1 } };
-     sigset_t set;
-     unsigned long v[HIST];
-     double tmp = 0.0;
-     unsigned long n;
-     signal (SIGALRM, &sighandler);
-     setitimer (ITIMER_REAL, &it, NULL);
-
-     hog (ULONG_MAX);
-     for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
-     for (i = 0; i < HIST; ++i) tmp += v[i];
-     tmp /= HIST;
-     n = tmp - (tmp / 3.0);
-
-     sigemptyset (&set);
-     sigaddset (&set, SIGALRM);
-
-     for (;;) {
-         hog (n);
-         sigwait (&set, &i);
-     }
-     return 0;
-}
-
-
-References
-----------
-
-http://lkml.org/lkml/2007/2/12/6
-Documentation/filesystems/proc.txt (1.8)
-
-
-Thanks
-------
-
-Con Kolivas, Pavel Machek

-- 
-ck

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-03-25 23:57                         ` Con Kolivas
@ 2007-03-26 10:49                           ` malc
  2007-03-28 11:37                             ` Ingo Molnar
  0 siblings, 1 reply; 34+ messages in thread
From: malc @ 2007-03-26 10:49 UTC (permalink / raw)
  To: Con Kolivas
  Cc: Ingo Molnar, linux list, zwane, ck list, Andrew Morton, Thomas Gleixner

On Mon, 26 Mar 2007, Con Kolivas wrote:

> On Monday 26 March 2007 09:01, Con Kolivas wrote:
>> On Monday 26 March 2007 03:14, malc wrote:
>>> On Mon, 26 Mar 2007, Con Kolivas wrote:
>>>> On Monday 26 March 2007 01:19, malc wrote:
>>> Erm... I just looked at the code and suddenly it stopped making any sense
>>> at all:
>>>
>>>          p->last_ran = rq->most_recent_timestamp = now;
>>>          /* Sanity check. It should never go backwards or ruin accounting */
>>>          if (unlikely(now < p->last_ran))
>>>                  return;
>>>          time_diff = now - p->last_ran;
>>>
>>> First `now' is assigned to `p->last_ran', the very next line
>>> compares those two values, and then the difference is taken... I quite
>>> frankly am either very tired or fail to see the point... time_diff is
>>> either always zero or there's always a race here.
>>
>> Bah, major thinko on my part! That will teach me to post patches
>> untested at 1:30 am. I'll try again shortly, sorry.
>
> Ok this one is heavily tested. Please try it when you find the time.

[..snip..]

Done, works. However there's a problem with accuracy coming from a
different angle.

I have this USB video grabber and also a quite efficient way of putting
the pixels on the screen. Video is grabbed using isochronous
transfers, i.e. lots of small (on the order of 1K) chunks of data are
being transferred continuously instead of one big burst, unlike in, for
instance, a PCI setup. With your accounting change idle from
`/proc/stat' is accurate, but unfortunately top(1)/icewm's monitor/etc.
apparently use user+sys+nice+intr+softirq+iowait to show the system
load, so system tools claim that the load is 10-12% while in reality
it is ~3%.
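
For reference, a rough standalone sketch of how such a tool derives the
load from a single `/proc/stat' sample (real tools diff two samples over
the sampling period, but the principle is the same: everything that is
not idle counts as busy, so an inflated intr count inflates the load):

/* gcc -o busy busy.c */
#include <stdio.h>

int main (void)
{
     unsigned long long user, nice, sys, idle, iowait, irq, softirq;
     FILE *f = fopen ("/proc/stat", "r");

     if (!f)
         return 1;
     if (fscanf (f, "cpu %llu %llu %llu %llu %llu %llu %llu", &user, &nice,
                 &sys, &idle, &iowait, &irq, &softirq) != 7)
         return 1;
     fclose (f);
     printf ("busy %.1f%%\n", 100.0 * (user + nice + sys + irq + softirq
             + iowait) / (user + nice + sys + irq + softirq + iowait + idle));
     return 0;
}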

This situation is harder to write a hog-like testcase for. Anyhow it
seems the difference in percentage stems from the `intr' field of
`/proc/stat', which fits. And the following patch (which should be applied
on top of yours) seems to help. I wouldn't really know what to do with
softirq and the rest of the counts touched by this function, so I left
them alone.

Comments?

diff -ru linux-2.6.21-rc4/include/linux/kernel_stat.h linux-2.6.21-rc4-load/include/linux/kernel_stat.h
--- linux-2.6.21-rc4/include/linux/kernel_stat.h	2007-03-26 14:33:19.000000000 +0400
+++ linux-2.6.21-rc4-load/include/linux/kernel_stat.h	2007-03-26 14:06:21.000000000 +0400
@@ -22,6 +22,7 @@
  	cputime64_t system;
  	cputime64_t softirq;
  	cputime64_t irq;
+	cputime64_t irq_ns;
  	cputime64_t idle;
  	cputime64_t idle_ns;
  	cputime64_t iowait;
diff -ru linux-2.6.21-rc4/kernel/sched.c linux-2.6.21-rc4-load/kernel/sched.c
--- linux-2.6.21-rc4/kernel/sched.c	2007-03-26 14:33:19.000000000 +0400
+++ linux-2.6.21-rc4-load/kernel/sched.c	2007-03-26 14:11:36.000000000 +0400
@@ -3148,9 +3148,14 @@

  	/* Add system time to cpustat. */
  	tmp = cputime_to_cputime64(cputime);
-	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
-	else if (softirq_count())
+	if (hardirq_count() - hardirq_offset) {
+		cpustat->irq_ns = cputime64_add(cpustat->irq_ns, tmp);
+		if (cpustat->irq_ns > JIFFY_NS) {
+			cpustat->irq_ns = cputime64_sub(cpustat->irq_ns,
+							JIFFY_NS);
+			cpustat->irq = cputime64_add(cpustat->irq, 1);
+		}
+	} else if (softirq_count())
  		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
  	else if (p != rq->idle)
  		cpustat->system = cputime64_add(cpustat->system, tmp);

-- 
vale

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-03-26 10:49                           ` malc
@ 2007-03-28 11:37                             ` Ingo Molnar
  2007-06-14 17:56                               ` Vassili Karpov
  0 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2007-03-28 11:37 UTC (permalink / raw)
  To: malc; +Cc: linux-kernel


* malc <av1474@comtv.ru> wrote:

> This situation is harder to write a hog-like testcase for. Anyhow it
> seems the difference in percentage stems from the `intr' field of
> `/proc/stat', which fits. And the following patch (which should be applied
> on top of yours) seems to help. I wouldn't really know what to do with
> softirq and the rest of the counts touched by this function, so I left
> them alone.
> 
> Comments?

I like it. FYI, I've applied your patch to -rt.

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-03-28 11:37                             ` Ingo Molnar
@ 2007-06-14 17:56                               ` Vassili Karpov
  2007-06-14 20:42                                 ` Ingo Molnar
  0 siblings, 1 reply; 34+ messages in thread
From: Vassili Karpov @ 2007-06-14 17:56 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel

Hello Ingo and others,

After reading http://lwn.net/Articles/236485/ and noticing a few references
to accounting I decided to give CFS a try. With sched-cfs-v2.6.21.4-16
I get pretty weird results; it seems like the scheduler is dead set on trying
to move the processes to different CPUs/cores all the time. And with hog
(manually tweaking the number of iterations) I get fairly strange results:
first of all the process is split between two cores; secondly, while the
integral load provided by the kernel looks correct, it's off by a good
20 percent on each individual core.

(http://www.boblycat.org/~malc/apc/hog-cfs-v16.png)

Thought this information might be of some interest.

P.S. How come the /proc/stat information is much closer to reality now?
      Was something like what Con Kolivas suggested added to sched.c?

-- 
vale

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-14 17:56                               ` Vassili Karpov
@ 2007-06-14 20:42                                 ` Ingo Molnar
  2007-06-14 20:56                                   ` malc
  0 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2007-06-14 20:42 UTC (permalink / raw)
  To: Vassili Karpov; +Cc: linux-kernel


* Vassili Karpov <av1474@comtv.ru> wrote:

> Hello Ingo and others,
> 
> After reading http://lwn.net/Articles/236485/ and noticing a few
> references to accounting I decided to give CFS a try. With
> sched-cfs-v2.6.21.4-16 I get pretty weird results; it seems like the
> scheduler is dead set on trying to move the processes to different
> CPUs/cores all the time. And with hog (manually tweaking the number of
> iterations) I get fairly strange results: first of all the process is
> split between two cores; secondly, while the integral load provided by
> the kernel looks correct, it's off by a good 20 percent on each
> individual core.
> 
> (http://www.boblycat.org/~malc/apc/hog-cfs-v16.png)
> 
> Thought this information might be of some interest.

hm - what does 'hog' do, can I download hog.c from somewhere?

the alternating balancing might be due to an uneven number of tasks 
perhaps? If you have 3 tasks on 2 cores then there's no other solution 
to achieve even performance of each task but to rotate them amongst the 
cores.

> P.S. How come the /proc/stat information is much closer to reality
>      now? Was something like what Con Kolivas suggested added to
>      sched.c?

well, precise/fine-grained accounting patches have been available for
years; the thing with CFS is that there we get them 'for free', because
CFS needs those metrics for its own logic. That's why this information 
is much closer to reality now. But note: right now what is affected by 
the changes in the CFS patches is /proc/PID/stat (i.e. the per-task 
information that 'top' and 'ps' displays, _not_ /proc/stat) - but more 
accurate /proc/stat could certainly come later on too.

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-14 20:42                                 ` Ingo Molnar
@ 2007-06-14 20:56                                   ` malc
  2007-06-14 21:18                                     ` Ingo Molnar
  0 siblings, 1 reply; 34+ messages in thread
From: malc @ 2007-06-14 20:56 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel

On Thu, 14 Jun 2007, Ingo Molnar wrote:

>
> * Vassili Karpov <av1474@comtv.ru> wrote:
>
>> Hello Ingo and others,
>>
>> After reading http://lwn.net/Articles/236485/ and noticing a few
>> references to accounting I decided to give CFS a try. With
>> sched-cfs-v2.6.21.4-16 I get pretty weird results; it seems like the
>> scheduler is dead set on trying to move the processes to different
>> CPUs/cores all the time. And with hog (manually tweaking the number of
>> iterations) I get fairly strange results: first of all the process is
>> split between two cores; secondly, while the integral load provided by
>> the kernel looks correct, it's off by a good 20 percent on each
>> individual core.
>>
>> (http://www.boblycat.org/~malc/apc/hog-cfs-v16.png)
>>
>> Thought this information might be of some interest.
>
> hm - what does 'hog' do, can I download hog.c from somewhere?

http://www.boblycat.org/~malc/apc/hog.c, and also in
Documentation/cpu-load.txt.

>
> the alternating balancing might be due to an uneven number of tasks
> perhaps? If you have 3 tasks on 2 cores then there's no other solution
> to achieve even performance of each task but to rotate them amongst the
> cores.

One task, one thread. I have also tried to watch fairly demanding video
(Elephants Dream in 1920x1080/MPEG4) with mplayer, and CFS moves the
only task between cores almost every second.

>> P.S. How come the /proc/stat information is much closer to reality
>>      now? Was something like what Con Kolivas suggested added to
>>      sched.c?
>
> well, precise/fine-grained accounting patches have been available for
> years; the thing with CFS is that there we get them 'for free', because
> CFS needs those metrics for its own logic. That's why this information
> is much closer to reality now. But note: right now what is affected by
> the changes in the CFS patches is /proc/PID/stat (i.e. the per-task
> information that 'top' and 'ps' displays, _not_ /proc/stat) - but more
> accurate /proc/stat could certainly come later on too.

Aha. I see, it's just that integral load for hog is vastly improved
compared to vanilla 2.6.21 (then again some other tests are off by a few
percent (at least), though they were fine with Con's patch (which was
announced at the beginning of this thread)).

-- 
vale

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-14 20:56                                   ` malc
@ 2007-06-14 21:18                                     ` Ingo Molnar
  2007-06-14 21:37                                       ` malc
  0 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2007-06-14 21:18 UTC (permalink / raw)
  To: malc; +Cc: linux-kernel, Balbir Singh, Dmitry Adamushko


* malc <av1474@comtv.ru> wrote:

> > the alternating balancing might be due to an uneven number of tasks 
> > perhaps? If you have 3 tasks on 2 cores then there's no other 
> > solution to achieve even performance of each task but to rotate them 
> > amongst the cores.
> 
> One task, one thread. I have also tried to watch fairly demanding 
> video (Elephants Dream in 1920x1080/MPEG4) with mplayer, and CFS moves 
> the only task between cores almost every second.

hm, mplayer is not running alone when it does video playback: Xorg is 
also pretty active. Furthermore, the task you are using to monitor 
mplayer counts too. The Core2Duo has a shared L2 cache between cores, so 
it is pretty cheap to move tasks between the cores.

> > well, precise/fine-grained accounting patches have been available for
> > years; the thing with CFS is that there we get them 'for free',
> > because CFS needs those metrics for its own logic. That's why this 
> > information is much closer to reality now. But note: right now what 
> > is affected by the changes in the CFS patches is /proc/PID/stat 
> > (i.e. the per-task information that 'top' and 'ps' displays, _not_ 
> > /proc/stat) - but more accurate /proc/stat could certainly come 
> > later on too.
> 
> Aha. I see, it's just that integral load for hog is vastly improved 
> compared to vanilla 2.6.21 [...]

hm, which ones are improved? Could this be due to some other property of 
CFS? If your app relies on /proc/stat then there's no extra precision in 
those cpustat values yet.

I've Cc:-ed Balbir Singh and Dmitry Adamushko, who are the main authors
of the current precise accounting code in CFS. Maybe I missed some
detail :-)

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-14 21:18                                     ` Ingo Molnar
@ 2007-06-14 21:37                                       ` malc
  2007-06-15  3:44                                         ` Balbir Singh
  0 siblings, 1 reply; 34+ messages in thread
From: malc @ 2007-06-14 21:37 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Balbir Singh, Dmitry Adamushko

On Thu, 14 Jun 2007, Ingo Molnar wrote:

>
> * malc <av1474@comtv.ru> wrote:
>
>>> the alternating balancing might be due to an uneven number of tasks
>>> perhaps? If you have 3 tasks on 2 cores then there's no other
>>> solution to achieve even performance of each task but to rotate them
>>> amongst the cores.
>>
>> One task, one thread. I have also tried to watch fairly demanding
>> video (Elephants Dream in 1920x1080/MPEG4) with mplayer, and CFS moves
>> the only task between cores almost every second.
>
> hm, mplayer is not running alone when it does video playback: Xorg is
> also pretty active. Furthermore, the task you are using to monitor
> mplayer counts too. The Core2Duo has a shared L2 cache between cores, so
> it is pretty cheap to move tasks between the cores.
>

Well, just to be sure I reran the test with `-vo null' (and FWIW I tried a
few completely different output drivers); the behavior is the same. I'm
not running a Core2Duo but an X2, but I guess that does not really matter here.

As for the task that monitors, I've written it myself (there are two
monitoring methods; one (the accurate one) does not depend on the contents
of `/proc/stat' at all), so it can be cheaply (for me) changed in any
way one wants. Sources are available at the same place where the
screenshot was found.

>>> well, precise/fine-grained accounting patches have been available for
>>> years; the thing with CFS is that there we get them 'for free',
>>> because CFS needs those metrics for its own logic. That's why this
>>> information is much closer to reality now. But note: right now what
>>> is affected by the changes in the CFS patches is /proc/PID/stat
>>> (i.e. the per-task information that 'top' and 'ps' displays, _not_
>>> /proc/stat) - but more accurate /proc/stat could certainly come
>>> later on too.
>>
>> Aha. I see, it's just that integral load for hog is vastly improved
>> compared to vanilla 2.6.21 [...]
>
> hm, which ones are improved? Could this be due to some other property of
> CFS? If your app relies on /proc/stat then there's no extra precision in
> those cpustat values yet.

This is what it looked like before: 
http://www.boblycat.org/~malc/apc/load-x2-hog.png

Now the integral load matches the one obtained via the "accurate" method.
However the reports for individual cores are off by around 20 percent.

Though I'm not quite sure what you mean by "which ones are improved".

> I've Cc:-ed Balbir Singh and Dmitry Adamushko, who are the main authors
> of the current precise accounting code in CFS. Maybe I missed some
> detail :-)

Oh, the famous "With enough eyeballs, all bugs are shallow." in action.

-- 
vale

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-14 21:37                                       ` malc
@ 2007-06-15  3:44                                         ` Balbir Singh
  2007-06-15  6:07                                           ` malc
  0 siblings, 1 reply; 34+ messages in thread
From: Balbir Singh @ 2007-06-15  3:44 UTC (permalink / raw)
  To: malc; +Cc: Ingo Molnar, linux-kernel, Dmitry Adamushko

malc wrote:
> On Thu, 14 Jun 2007, Ingo Molnar wrote:
> 
>>
>> * malc <av1474@comtv.ru> wrote:
>>
>>>> the alternating balancing might be due to an uneven number of tasks
>>>> perhaps? If you have 3 tasks on 2 cores then there's no other
>>>> solution to achieve even performance of each task but to rotate them
>>>> amongst the cores.
>>>
>>> One task, one thread. I have also tried to watch fairly demanding
>>> video (Elephants Dream in 1920x1080/MPEG4) with mplayer, and CFS moves
>>> the only task between cores almost every second.
>>
>> hm, mplayer is not running alone when it does video playback: Xorg is
>> also pretty active. Furthermore, the task you are using to monitor
>> mplayer counts too. The Core2Duo has a shared L2 cache between cores, so
>> it is pretty cheap to move tasks between the cores.
>>
> 
> Well, just to be sure I reran the test with `-vo null' (and FWIW I tried a
> few completely different output drivers); the behavior is the same. I'm
> not running a Core2Duo but an X2, but I guess that does not really matter here.
> 
> As for the task that monitors, I've written it myself (there are two
> monitoring methods; one (the accurate one) does not depend on the contents
> of `/proc/stat' at all), so it can be cheaply (for me) changed in any
> way one wants. Sources are available at the same place where the
> screenshot was found.
> 
>>>> well, precise/fine-grained accounting patches have been available for
>>>> years; the thing with CFS is that there we get them 'for free',
>>>> because CFS needs those metrics for its own logic. That's why this
>>>> information is much closer to reality now. But note: right now what
>>>> is affected by the changes in the CFS patches is /proc/PID/stat
>>>> (i.e. the per-task information that 'top' and 'ps' displays, _not_
>>>> /proc/stat) - but more accurate /proc/stat could certainly come
>>>> later on too.
>>>
>>> Aha. I see, it's just that integral load for hog is vastly improved
>>> compared to vanilla 2.6.21 [...]
>>
>> hm, which ones are improved? Could this be due to some other property of
>> CFS? If your app relies on /proc/stat then there's no extra precision in
>> those cpustat values yet.
> 
> This is what it looked like before:
> http://www.boblycat.org/~malc/apc/load-x2-hog.png
> 
> Now the integral load matches the one obtained via the "accurate" method.
> However the reports for individual cores are off by around 20 percent.
> 

I think I missed some of the context: is the accounting of individual tasks
or cpustat values off by 20%? I'll try and reproduce this problem.

Could you provide more details on the APC tool that you are using -- I
do not understand the orange and yellow lines, do they represent system
and user time?

NOTE: There is some inconsistency in the values reported by /usr/bin/time
(getrusage) and values reported in /proc or through delay accounting.
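
For comparison, here is a minimal sketch of reading the per-process times
the way /usr/bin/time does, via getrusage(2) (the busy loop is only there
so the numbers come out non-zero):

/* gcc -o ru ru.c */
#include <stdio.h>
#include <sys/time.h>
#include <sys/resource.h>

int main (void)
{
     struct rusage ru;
     volatile unsigned long i;

     for (i = 0; i < 100000000UL; ++i)     /* burn some user time */
         ;
     if (getrusage (RUSAGE_SELF, &ru))
         return 1;
     printf ("user %ld.%06lds sys %ld.%06lds\n",
             (long) ru.ru_utime.tv_sec, (long) ru.ru_utime.tv_usec,
             (long) ru.ru_stime.tv_sec, (long) ru.ru_stime.tv_usec);
     return 0;
}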


> Though I'm not quite sure what you mean by "which ones are improved".
> 
>> I've Cc:-ed Balbir Singh and Dmitry Adamushko, who are the main authors
>> of the current precise accounting code in CFS. Maybe I missed some
>> detail :-)
> 
> Oh, the famous "With enough eyeballs, all bugs are shallow." in action.
> 


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-15  3:44                                         ` Balbir Singh
@ 2007-06-15  6:07                                           ` malc
  2007-06-16 13:21                                             ` Balbir Singh
  0 siblings, 1 reply; 34+ messages in thread
From: malc @ 2007-06-15  6:07 UTC (permalink / raw)
  To: Balbir Singh; +Cc: Ingo Molnar, linux-kernel, Dmitry Adamushko

On Fri, 15 Jun 2007, Balbir Singh wrote:

> malc wrote:
>> On Thu, 14 Jun 2007, Ingo Molnar wrote:
>>

[..snip..]

>>
>> Now the integral load matches the one obtained via the "accurate" method.
>> However the reports for individual cores are off by around 20 percent.
>>
>
> I think I missed some of the context: is the accounting of individual tasks
> or cpustat values off by 20%? I'll try and reproduce this problem.

Neither, actually: the individual core idle times reported via `/proc/stat'
are off by 20 percent; one overestimates and the other underestimates,
and the sum is right on the mark.

>
> Could you provide more details on the APC tool that you are using -- I
> do not understand the orange and yellow lines, do they represent system
> and user time?

It's somewhat documented on the page: http://www.boblycat.org/~malc/apc
Anyway, the left bar is based on information from `/proc/stat'; the right
one is derived from a kernel module that just times how much time was
spent in the idle handler. The graphs: red - `/proc/stat', yellow - module.

> NOTE: There is some inconsistency in the values reported by /usr/bin/time
> (getrusage) and values reported in /proc or through delay accounting.

I don't really use `getrusage'.

-- 
vale

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-15  6:07                                           ` malc
@ 2007-06-16 13:21                                             ` Balbir Singh
  2007-06-16 14:07                                               ` malc
  0 siblings, 1 reply; 34+ messages in thread
From: Balbir Singh @ 2007-06-16 13:21 UTC (permalink / raw)
  To: malc; +Cc: Ingo Molnar, linux-kernel, Dmitry Adamushko

malc wrote:
> On Fri, 15 Jun 2007, Balbir Singh wrote:
> 
>> malc wrote:
>>> On Thu, 14 Jun 2007, Ingo Molnar wrote:
>>>
> 
> [..snip..]
> 
>>>
>>> Now the integral load matches the one obtained via the "accurate" method.
>>> However the reports for individual cores are off by around 20 percent.
>>>
>>
>> I think I missed some of the context: is the accounting of individual
>> tasks or cpustat values off by 20%? I'll try and reproduce this problem.
> 
> Neither, actually: the individual core idle times reported via `/proc/stat'
> are off by 20 percent; one overestimates and the other underestimates,
> and the sum is right on the mark.
> 

Interesting, the idle time accounting (done from account_system_time())
has not changed. Has your .config changed? Could you please send it
across? I've downloaded apc and I am trying to reproduce your problem.

>>
>> Could you provide more details on the APC tool that you are using -- I
>> do not understand the orange and yellow lines, do they represent system
>> and user time?
> 
> It's somewhat documented on the page: http://www.boblycat.org/~malc/apc
> Anyway, the left bar is based on information from `/proc/stat'; the right
> one is derived from a kernel module that just times how much time was
> spent in the idle handler. The graphs: red - `/proc/stat', yellow - module.
> 
>> NOTE: There is some inconsistency in the values reported by /usr/bin/time
>> (getrusage) and values reported in /proc or through delay accounting.
> 
> I don't really use `getrusage'.
> 

Tools like /usr/bin/time do.

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-16 13:21                                             ` Balbir Singh
@ 2007-06-16 14:07                                               ` malc
  2007-06-16 18:40                                                 ` Ingo Molnar
  0 siblings, 1 reply; 34+ messages in thread
From: malc @ 2007-06-16 14:07 UTC (permalink / raw)
  To: Balbir Singh; +Cc: Ingo Molnar, linux-kernel, Dmitry Adamushko

On Sat, 16 Jun 2007, Balbir Singh wrote:

> malc wrote:
>> On Fri, 15 Jun 2007, Balbir Singh wrote:
>>
>>> malc wrote:
>>>> On Thu, 14 Jun 2007, Ingo Molnar wrote:
>>>>
>>
>> [..snip..]
>>
>>>>
>>>> Now the integral load matches the one obtained via the "accurate" method.
>>>> However the reports for individual cores are off by around 20 percent.
>>>>
>>>
>>> I think I missed some of the context: is the accounting of individual
>>> tasks or cpustat values off by 20%? I'll try and reproduce this problem.
>>
>> Neither, actually: the individual core idle times reported via `/proc/stat'
>> are off by 20 percent; one overestimates and the other underestimates,
>> and the sum is right on the mark.
>>
>
> Interesting, the idle time accounting (done from account_system_time())
> has not changed. Has your .config changed? Could you please send it
> across? I've downloaded apc and I am trying to reproduce your problem.

http://www.boblycat.org/~malc/apc/cfs/ has the config for 2.6.21 and the
diff against 2.6.21.4-cfs-v16.

I updated hog (it can be found in the above directory) to call setitimer
with somewhat saner values (apparently tickless has a profound effect on
the itimer interface). While fooling around with this version of hog
on 2.6.21.4-cfs-v16 I stumbled upon rather strange behavior (the
screenshot is also at the address above; note that the kernel was booted
with maxcpus=1).
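
The core of a hog-style load generator is essentially a busy loop gated
by an interval timer; here is a minimal sketch of that idea (assumed
shape and illustrative constants, not malc's actual hog):

#include <signal.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

static volatile sig_atomic_t fired;

static void alarm_handler(int sig)
{
	(void)sig;
	fired = 1;
}

int main(void)
{
	/* 10ms period, ~50% duty cycle; both values are illustrative. */
	struct itimerval it = {
		.it_interval = { .tv_sec = 0, .tv_usec = 10000 },
		.it_value    = { .tv_sec = 0, .tv_usec = 10000 },
	};

	signal(SIGALRM, alarm_handler);
	if (setitimer(ITIMER_REAL, &it, NULL) < 0) {
		perror("setitimer");
		return 1;
	}
	for (;;) {
		while (!fired)
			;		/* burn CPU: should appear as user time */
		fired = 0;
		usleep(5000);		/* idle for the rest of the period */
	}
}

The interesting question is whether the user/idle split reported in
/proc/stat actually matches the duty cycle such a program runs at.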

[..snip..]

-- 
vale

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-16 14:07                                               ` malc
@ 2007-06-16 18:40                                                 ` Ingo Molnar
  2007-06-16 20:31                                                   ` malc
  0 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2007-06-16 18:40 UTC (permalink / raw)
  To: malc; +Cc: Balbir Singh, linux-kernel, Dmitry Adamushko, Thomas Gleixner


* malc <av1474@comtv.ru> wrote:

> > Interesting; the idle time accounting (done from 
> > account_system_time()) has not changed. Has your .config changed? 
> > Could you please send it across? I've downloaded apc and am trying 
> > to reproduce your problem.
> 
> http://www.boblycat.org/~malc/apc/cfs/ has the config for 2.6.21 and the
> diff against 2.6.21.4-cfs-v16.

hm. Could you add this to your kernel boot command line:

     highres=off nohz=off

and retest, to inquire whether this problem is independent of any 
timer-event effects?

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [patch] sched: accurate user accounting
  2007-06-16 18:40                                                 ` Ingo Molnar
@ 2007-06-16 20:31                                                   ` malc
  0 siblings, 0 replies; 34+ messages in thread
From: malc @ 2007-06-16 20:31 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Balbir Singh, linux-kernel, Dmitry Adamushko, Thomas Gleixner

On Sat, 16 Jun 2007, Ingo Molnar wrote:

>
> * malc <av1474@comtv.ru> wrote:
>
>>> Interesting; the idle time accounting (done from
>>> account_system_time()) has not changed. Has your .config changed?
>>> Could you please send it across? I've downloaded apc and am trying
>>> to reproduce your problem.
>>
>> http://www.boblycat.org/~malc/apc/cfs/ has the config for 2.6.21 and the
>> diff against 2.6.21.4-cfs-v16.
>
> hm. Could you add this to your kernel boot command line:
>
>     highres=off nohz=off
>
> and retest, to inquire whether this problem is independent of any
> timer-event effects?

It certainly makes a difference. Without dynticks, however, the scheduler
still moves the task (be it hog or mplayer) between CPUs for no good
reason; for hog the switching happens every few seconds (instead of more
or less all the time in the dynticks case). What's rather strange is that
while it hogs 7x% of CPU on core#1 it only hogs 3x% on core#2
(the percentage is obtained by timing the idle handler, not from
/proc/stat; according to /proc/stat either core is 0% loaded).

Live report... After a while it stabilized and was running on core#2
all the time; when the process was stopped and restarted, it started to
run constantly on core#1 (with ~70% load).

Btw, I don't want to waste anyone's time here. All I originally wanted
was to know whether something had been done (as per the LWN article)
about increasing the accuracy of the system-wide statistics (in
/proc/stat); it turns out that nothing really happened in this area, but
the latest developments (CFS/dynticks) have some peculiar effect on hog.
Plus this constant migration of hog/mplayer is somewhat strange.

Live report... again. Okay, now that hog had stabilized, running
exclusively on core#1 at 70% load, I switched to the machine where it
runs, and merely switching windows in IceWM made the system load drop
to 30%. Quite reproducible, too.

-- 
vale

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2007-06-16 20:33 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-25  1:59 [PATCH] [RFC] sched: accurate user accounting Con Kolivas
2007-03-25  2:14 ` Con Kolivas
2007-03-25  7:51 ` [patch] " Ingo Molnar
2007-03-25  8:39   ` Con Kolivas
2007-03-25  9:04     ` Ingo Molnar
2007-03-25 11:34   ` malc
2007-03-25 11:46     ` Con Kolivas
2007-03-25 12:02       ` Con Kolivas
2007-03-25 12:32         ` Gene Heskett
2007-03-25 12:41           ` Con Kolivas
2007-03-25 13:33             ` Gene Heskett
2007-03-25 13:05         ` malc
2007-03-25 13:06         ` malc
2007-03-25 14:15           ` Con Kolivas
2007-03-25 14:57             ` malc
2007-03-25 15:08               ` Con Kolivas
2007-03-25 15:19                 ` malc
2007-03-25 15:28                   ` Con Kolivas
2007-03-25 17:14                     ` malc
2007-03-25 23:01                       ` Con Kolivas
2007-03-25 23:57                         ` Con Kolivas
2007-03-26 10:49                           ` malc
2007-03-28 11:37                             ` Ingo Molnar
2007-06-14 17:56                               ` Vassili Karpov
2007-06-14 20:42                                 ` Ingo Molnar
2007-06-14 20:56                                   ` malc
2007-06-14 21:18                                     ` Ingo Molnar
2007-06-14 21:37                                       ` malc
2007-06-15  3:44                                         ` Balbir Singh
2007-06-15  6:07                                           ` malc
2007-06-16 13:21                                             ` Balbir Singh
2007-06-16 14:07                                               ` malc
2007-06-16 18:40                                                 ` Ingo Molnar
2007-06-16 20:31                                                   ` malc

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).