* [PATCH] sched/cputime: make scale_stime() more precise
@ 2019-07-18 13:18 Oleg Nesterov
  2019-07-18 13:21 ` Oleg Nesterov
                   ` (4 more replies)
  0 siblings, 5 replies; 29+ messages in thread
From: Oleg Nesterov @ 2019-07-18 13:18 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Thomas Gleixner
  Cc: Andrew Fox, Stephen Johnston, linux-kernel

People report that utime and stime from /proc/<pid>/stat become very wrong
when the numbers are big enough. In particular, the monitored application
can run all the time in user-space but only stime grows.

This is because scale_stime() is very inaccurate. It tries to minimize the
relative error, but the absolute error can be huge.

Andrew wrote the test-case:

	int main(int argc, char **argv)
	{
	    struct task_cputime c;
	    struct prev_cputime p;
	    u64 st, pst, cst;
	    u64 ut, put, cut;
	    u64 x;
	    int i = -1; // one step not printed

	    if (argc != 2)
	    {
		printf("usage: %s <start_in_seconds>\n", argv[0]);
		return 1;
	    }
	    x = strtoull(argv[1], NULL, 0) * SEC;
	    printf("start=%lld\n", x);

	    p.stime = 0;
	    p.utime = 0;

	    while (i++ < NSTEPS)
	    {
		x += STEP;
		c.stime = x;
		c.utime = x;
		c.sum_exec_runtime = x + x;
		pst = cputime_to_clock_t(p.stime);
		put = cputime_to_clock_t(p.utime);
		cputime_adjust(&c, &p, &ut, &st);
		cst = cputime_to_clock_t(st);
		cut = cputime_to_clock_t(ut);
		if (i)
		    printf("ut(diff)/st(diff): %20lld (%4lld)  %20lld (%4lld)\n",
			cut, cut - put, cst, cst - pst);
	    }
	}

For example,

	$ ./stime 300000
	start=300000000000000
	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)

shows the problem even when sum_exec_runtime is not that big: 300000 secs.

The new implementation of scale_stime() does the additional div64_u64_rem()
in a loop, but (see the comment) as long as it is used by cputime_adjust()
this can happen at most once.

Reported-by: Andrew Fox <afox@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/sched/cputime.c | 66 ++++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2305ce8..ad055a3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -525,47 +525,55 @@ void account_idle_ticks(unsigned long ticks)
 }
 
 /*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
+ * Perform (stime * rtime) / total, but avoid multiplication overflow
+ * by losing precision when the numbers are big.
+ *
+ * NOTE! currently the only user is cputime_adjust() and thus
+ *
+ *	stime < total && rtime > total
+ *
+ * this means that the end result is always precise and the additional
+ * div64_u64_rem() inside the main loop is called at most once.
  */
 static u64 scale_stime(u64 stime, u64 rtime, u64 total)
 {
-	u64 scaled;
+	u64 res = 0, div, rem;
 
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
+	/* can stime * rtime overflow ? */
+	while (ilog2(stime) + ilog2(rtime) > 62) {
 		if (stime > rtime)
 			swap(rtime, stime);
 
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
+		if (rtime >= total) {
+			/*
+			 * (rtime * stime) / total is equal to
+			 *
+			 *	(rtime / total) * stime +
+			 *	(rtime % total) * stime / total
+			 *
+			 * if nothing overflows. Can the 1st multiplication
+			 * overflow? Yes, but we do not care: this can only
+			 * happen if the end result can't fit in u64 anyway.
+			 *
+			 * So the code below does
+			 *
+			 *	res += (rtime / total) * stime;
+			 *	rtime = rtime % total;
+			 */
+			div = div64_u64_rem(rtime, total, &rem);
+			res += div * stime;
+			rtime = rem;
+			continue;
+		}
 
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
+		/* drop precision */
 		rtime >>= 1;
 		total >>= 1;
+		if (!total)
+			return res;
 	}
 
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
+	return res + div64_u64(stime * rtime, total);
 }
 
 /*
-- 
2.5.0




* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-18 13:18 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
@ 2019-07-18 13:21 ` Oleg Nesterov
  2019-07-18 14:55 ` Oleg Nesterov
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 29+ messages in thread
From: Oleg Nesterov @ 2019-07-18 13:21 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Thomas Gleixner
  Cc: Andrew Fox, Stephen Johnston, linux-kernel

To simplify the review, see the code with this patch applied:

/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow
 * by losing precision when the numbers are big.
 *
 * NOTE! currently the only user is cputime_adjust() and thus
 *
 *	stime < total && rtime > total
 *
 * this means that the end result is always precise and the additional
 * div64_u64_rem() inside the main loop is called at most once.
 */
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	/* can stime * rtime overflow ? */
	while (ilog2(stime) + ilog2(rtime) > 62) {
		if (stime > rtime)
			swap(rtime, stime);

		if (rtime >= total) {
			/*
			 * (rtime * stime) / total is equal to
			 *
			 *	(rtime / total) * stime +
			 *	(rtime % total) * stime / total
			 *
			 * if nothing overflows. Can the 1st multiplication
			 * overflow? Yes, but we do not care: this can only
			 * happen if the end result can't fit in u64 anyway.
			 *
			 * So the code below does
			 *
			 *	res += (rtime / total) * stime;
			 *	rtime = rtime % total;
			 */
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;
			continue;
		}

		/* drop precision */
		rtime >>= 1;
		total >>= 1;
		if (!total)
			return res;
	}

	return res + div64_u64(stime * rtime, total);
}
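
As a side note on the block comment inside the loop above: the decomposition
it relies on,

	(rtime * stime) / total == (rtime / total) * stime + (rtime % total) * stime / total,

is exact (not an approximation) whenever the individual terms do not overflow.
Below is a small stand-alone sketch that checks it against a 128-bit reference;
the program and its sample values are illustrative only and assume a 64-bit
gcc/clang with __int128:

#include <stdio.h>

typedef unsigned long long u64;
typedef unsigned __int128 u128;

/* (rtime / total) * stime + (rtime % total) * stime / total */
static u64 decomposed(u64 stime, u64 rtime, u64 total)
{
	u64 res = (rtime / total) * stime;	/* whole multiples of total */

	rtime %= total;				/* remainder, now < total */
	return res + stime * rtime / total;
}

static void check(u64 stime, u64 rtime, u64 total)
{
	u64 exact = (u128)stime * rtime / total;	/* 128-bit reference */
	u64 dec = decomposed(stime, rtime, total);

	printf("%llu * %llu / %llu: exact=%llu decomposed=%llu %s\n",
	       stime, rtime, total, exact, dec,
	       exact == dec ? "OK" : "MISMATCH");
}

int main(void)
{
	/*
	 * Arbitrary values, chosen so that neither (rtime / total) * stime
	 * nor stime * (rtime % total) overflows u64; guaranteeing that in
	 * general is exactly what the loop in scale_stime() is for.
	 */
	check(3, 3, 2);
	check(7, 18446744073709551615ULL, 48);
	check(1000, 123456789123456789ULL, 11000);
	return 0;
}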



* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-18 13:18 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
  2019-07-18 13:21 ` Oleg Nesterov
@ 2019-07-18 14:55 ` Oleg Nesterov
  2019-07-19 11:03 ` Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 29+ messages in thread
From: Oleg Nesterov @ 2019-07-18 14:55 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Thomas Gleixner
  Cc: Andrew Fox, Stephen Johnston, linux-kernel

On 07/18, Oleg Nesterov wrote:
>
> + * NOTE! currently the only user is cputime_adjust() and thus
> + *
> + *	stime < total && rtime > total
> + *
> + * this means that the end result is always precise and the additional
> + * div64_u64_rem() inside the main loop is called at most once.

Ah, I just noticed that the comment is not 100% correct... in theory we
can drop the precision and even do div64_u64_rem() more than once, but this
can only happen if stime or total (= stime + utime) is "really" huge; I don't
think this can happen in practice...

We can probably just do

	static u64 scale_stime(u64 stime, u64 rtime, u64 total)
	{
		u64 res = 0, div, rem;

		if (ilog2(stime) + ilog2(rtime) > 62) {
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;

			int shift = ilog2(stime) + ilog2(rtime) - 62;
			if (shift > 0) {
				rtime >>= shift;
				total >>= shift;
				if (!total)
					return res;
			}
		}

		return res + div64_u64(stime * rtime, total);
	}

but this way the code looks less generic.

Oleg.



* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-18 13:18 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
  2019-07-18 13:21 ` Oleg Nesterov
  2019-07-18 14:55 ` Oleg Nesterov
@ 2019-07-19 11:03 ` Peter Zijlstra
  2019-07-19 13:47   ` Peter Zijlstra
                     ` (2 more replies)
  2020-01-22 16:46 ` Oleg Nesterov
  2020-01-27 12:28 ` [PATCH v2] " Oleg Nesterov
  4 siblings, 3 replies; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-19 11:03 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Linus Torvalds, Stanislaw Gruszka

On Thu, Jul 18, 2019 at 03:18:34PM +0200, Oleg Nesterov wrote:
> People report that utime and stime from /proc/<pid>/stat become very wrong
> when the numbers are big enough. In particular, the monitored application
> can run all the time in user-space but only stime grows.
> 
> This is because scale_stime() is very inaccurate. It tries to minimize the
> relative error, but the absolute error can be huge.
> 
> Andrew wrote the test-case:
> 
> 	int main(int argc, char **argv)
> 	{
> 	    struct task_cputime c;
> 	    struct prev_cputime p;
> 	    u64 st, pst, cst;
> 	    u64 ut, put, cut;
> 	    u64 x;
> 	    int i = -1; // one step not printed
> 
> 	    if (argc != 2)
> 	    {
> 		printf("usage: %s <start_in_seconds>\n", argv[0]);
> 		return 1;
> 	    }
> 	    x = strtoull(argv[1], NULL, 0) * SEC;
> 	    printf("start=%lld\n", x);
> 
> 	    p.stime = 0;
> 	    p.utime = 0;
> 
> 	    while (i++ < NSTEPS)
> 	    {
> 		x += STEP;
> 		c.stime = x;
> 		c.utime = x;
> 		c.sum_exec_runtime = x + x;
> 		pst = cputime_to_clock_t(p.stime);
> 		put = cputime_to_clock_t(p.utime);
> 		cputime_adjust(&c, &p, &ut, &st);
> 		cst = cputime_to_clock_t(st);
> 		cut = cputime_to_clock_t(ut);
> 		if (i)
> 		    printf("ut(diff)/st(diff): %20lld (%4lld)  %20lld (%4lld)\n",
> 			cut, cut - put, cst, cst - pst);
> 	    }
> 	}
> 
> For example,
> 
> 	$ ./stime 300000
> 	start=300000000000000
> 	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
> 	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
> 	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
> 	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
> 	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
> 	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)
> 
> shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> 
> The new implementation of scale_stime() does the additional div64_u64_rem()
> in a loop but see the comment, as long it is used by cputime_adjust() this
> can happen only once.

That only shows something after long long staring :/ There's no words on
what the output actually means or what would've been expected.

Also, your example is incomplete; the below is a test for scale_stime();
from this we can see that the division results in too large a number,
but, important for our use-case in cputime_adjust(), it is a step
function (due to loss in precision) and for every plateau we shift
runtime into the wrong bucket.

Your proposed function works; but is atrocious, esp. on 32bit. That
said, before we 'fixed' it, it had similar horrible divisions in, see
commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").

Included below is also an x86_64 implementation in 2 instructions.

I'm still trying to see if there's anything saner we can do...

---
#include <stdio.h>
#include <stdlib.h>

#define   noinline                      __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;

static noinline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
{
	u64 q;
	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
	return q;
}

static u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder);

static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static __always_inline int fls(unsigned int x)
{
	return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
}

#if 0
static u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	union {
		u64 v64;
		u32 v32[2];
	} d = { dividend };
	u32 upper;

	upper = d.v32[1];
	d.v32[1] = 0;
	if (upper >= divisor) {
		d.v32[1] = upper / divisor;
		upper %= divisor;
	}
	asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
		"rm" (divisor), "0" (d.v32[0]), "1" (upper));
	return d.v64;
}
static u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		u32 rem32;
		quot = div_u64_rem(dividend, divisor, &rem32);
		*remainder = rem32;
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;

		*remainder = dividend - quot * divisor;
		if (*remainder >= divisor) {
			quot++;
			*remainder -= divisor;
		}
	}

	return quot;
}
static u64 div64_u64(u64 dividend, u64 divisor)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		quot = div_u64(dividend, divisor);
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;
		if ((dividend - quot * divisor) >= divisor)
			quot++;
	}

	return quot;
}
#else
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}
#endif

static __always_inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

static noinline u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((stime * rtime), total);
	return scaled;
}

static noinline u64 oleg(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;
	/* can stime * rtime overflow ? */
	while (ilog2(stime) + ilog2(rtime) > 62) {
		if (stime > rtime)
			swap(rtime, stime);
		if (rtime >= total) {
			/*
			 * (rtime * stime) / total is equal to
			 *
			 *      (rtime / total) * stime +
			 *      (rtime % total) * stime / total
			 *
			 * if nothing overflows. Can the 1st multiplication
			 * overflow? Yes, but we do not care: this can only
			 * happen if the end result can't fit in u64 anyway.
			 *
			 * So the code below does
			 *
			 *      res += (rtime / total) * stime;
			 *      rtime = rtime % total;
			 */
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;
			continue;
		}
		/* drop precision */
		rtime >>= 1;
		total >>= 1;
		if (!total)
			return res;
	}
	return res + div64_u64(stime * rtime, total);
}

#define SEC	1000000000ULL

int main(int argc, char **argv)
{
	u64 u, s;
	u64 x;
	int i = -1; // one step not printed
	if (argc != 2)
	{
		printf("usage: %s <start_in_seconds>\n", argv[0]);
		return 1;
	}
	x = strtoull(argv[1], NULL, 0) * SEC;
	printf("start=%lld\n", x);

	for (i=0; i<50; i++, x += 2000) {
		printf("%lld = %lld * %lld / %lld\n", mul_u64_u64_div_u64(x, x+x, x+x), x, x+x, x+x);
		printf("%lld = %lld * %lld / %lld\n", scale_stime(x, x+x, x+x), x, x+x, x+x);
		printf("%lld = %lld * %lld / %lld\n", oleg(x, x+x, x+x), x, x+x, x+x);
		printf("---\n");
	}
}


* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-19 11:03 ` Peter Zijlstra
@ 2019-07-19 13:47   ` Peter Zijlstra
  2019-07-19 14:37     ` Oleg Nesterov
  2019-07-19 14:03   ` Oleg Nesterov
  2019-07-22 10:52   ` Stanislaw Gruszka
  2 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-19 13:47 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Linus Torvalds, Stanislaw Gruszka

On Fri, Jul 19, 2019 at 01:03:49PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 18, 2019 at 03:18:34PM +0200, Oleg Nesterov wrote:
> > People report that utime and stime from /proc/<pid>/stat become very wrong
> > when the numbers are big enough. In particular, the monitored application
> > can run all the time in user-space but only stime grows.
> > 
> > This is because scale_stime() is very inaccurate. It tries to minimize the
> > relative error, but the absolute error can be huge.
> > 
> > Andrew wrote the test-case:
> > 
> > 	int main(int argc, char **argv)
> > 	{
> > 	    struct task_cputime c;
> > 	    struct prev_cputime p;
> > 	    u64 st, pst, cst;
> > 	    u64 ut, put, cut;
> > 	    u64 x;
> > 	    int i = -1; // one step not printed
> > 
> > 	    if (argc != 2)
> > 	    {
> > 		printf("usage: %s <start_in_seconds>\n", argv[0]);
> > 		return 1;
> > 	    }
> > 	    x = strtoull(argv[1], NULL, 0) * SEC;
> > 	    printf("start=%lld\n", x);
> > 
> > 	    p.stime = 0;
> > 	    p.utime = 0;
> > 
> > 	    while (i++ < NSTEPS)
> > 	    {
> > 		x += STEP;
> > 		c.stime = x;
> > 		c.utime = x;
> > 		c.sum_exec_runtime = x + x;
> > 		pst = cputime_to_clock_t(p.stime);
> > 		put = cputime_to_clock_t(p.utime);
> > 		cputime_adjust(&c, &p, &ut, &st);
> > 		cst = cputime_to_clock_t(st);
> > 		cut = cputime_to_clock_t(ut);
> > 		if (i)
> > 		    printf("ut(diff)/st(diff): %20lld (%4lld)  %20lld (%4lld)\n",
> > 			cut, cut - put, cst, cst - pst);
> > 	    }
> > 	}
> > 
> > For example,
> > 
> > 	$ ./stime 300000
> > 	start=300000000000000
> > 	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
> > 	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
> > 	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)
> > 
> > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > 
> > The new implementation of scale_stime() does the additional div64_u64_rem()
> > in a loop but see the comment, as long it is used by cputime_adjust() this
> > can happen only once.
> 
> That only shows something after long long staring :/ There's no words on
> what the output actually means or what would've been expected.
> 
> Also, your example is incomplete; the below is a test for scale_stime();
> from this we can see that the division results in too large a number,
> but, important for our use-case in cputime_adjust(), it is a step
> function (due to loss in precision) and for every plateau we shift
> runtime into the wrong bucket.

But I'm still confused, since in the long run, it should still end up
with a proportionally divided user/system, irrespective of some short
term wobblies.

So please, better articulate the problem.


* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-19 11:03 ` Peter Zijlstra
  2019-07-19 13:47   ` Peter Zijlstra
@ 2019-07-19 14:03   ` Oleg Nesterov
  2019-07-22 19:45     ` Peter Zijlstra
  2019-07-22 10:52   ` Stanislaw Gruszka
  2 siblings, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2019-07-19 14:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Linus Torvalds, Stanislaw Gruszka

On 07/19, Peter Zijlstra wrote:
>
> On Thu, Jul 18, 2019 at 03:18:34PM +0200, Oleg Nesterov wrote:
> >
> > 	$ ./stime 300000
> > 	start=300000000000000
> > 	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
> > 	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
> > 	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)
> >
> > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> >
> > The new implementation of scale_stime() does the additional div64_u64_rem()
> > in a loop but see the comment, as long it is used by cputime_adjust() this
> > can happen only once.
>
> That only shows something after long long staring :/ There's no words on
> what the output actually means or what would've been expected.

Sorry, I should have explained it in more detail,

> Also, your example is incomplete; the below is a test for scale_stime();
> from this we can see that the division results in too large a number,
> but, important for our use-case in cputime_adjust(), it is a step
> function (due to loss in precision) and for every plateau we shift
> runtime into the wrong bucket.

Yes.

> Your proposed function works; but is atrocious,

Agreed,

> esp. on 32bit.

Yes... but let's compare it with the current implementation. To simplify,
let's look at the "less generic" version I showed in reply to this patch:

	static u64 scale_stime(u64 stime, u64 rtime, u64 total)
	{
		u64 res = 0, div, rem;

		if (ilog2(stime) + ilog2(rtime) > 62) {
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;

			int shift = ilog2(stime) + ilog2(rtime) - 62;
			if (shift > 0) {
				rtime >>= shift;
				total >>= shift;
				if (!total)
					return res;
			}
		}

		return res + div64_u64(stime * rtime, total);
	}

So, if stime * rtime overflows it does div64_u64() twice while the
current version does a single div_u64() == do_div() (on 32bit).

Even a single div64_u64() is more expensive than do_div(), but afaics
a) not by too much and b) only if the divisor (== total) doesn't fit in
32bit, and I think this is unlikely.

So I'd say it makes scale_stime() approximately twice as expensive on
32bit. But hopefully it fixes the problem.

> Included below is also an x86_64 implementation in 2 instructions.

But we need the arch-neutral implementation anyway, the code above
is the best I could invent.

But see below!

> I'm still trying see if there's anything saner we can do...

Oh, please, it is not that I like my solution very much, I would like
to see something more clever.

> static noinline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
> {
> 	u64 q;
> 	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
> 	return q;
> }

Heh. I have to admit that I didn't know that divq divides 128bit by
64bit. gcc calls the __udivti3 intrinsic in this case, so I wrongly
came to the conclusion that this is not simple even on x86_64. Plus the
fact that linux/math64.h only has mul_u64_u64_shr()...

IIUC, the new helper above is not "safe", it generates an exception
if the result doesn't fit in 64bit. But scale_stime() can safely use
it because stime < total.
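
To spell out that bound: stime < total implies stime * rtime / total < rtime,
and rtime itself fits in 64 bits, so the quotient of the 128-by-64 divide can
never overflow. A tiny user-space sketch of that argument (illustrative only;
it assumes gcc/clang with __int128, and the values are arbitrary):

#include <stdio.h>

typedef unsigned long long u64;
typedef unsigned __int128 u128;

int main(void)
{
	/* arbitrary values with stime < total <= rtime, as in cputime_adjust() */
	u64 stime = 553407856289849ULL;
	u64 total = 1660223568869547ULL;
	u64 rtime = 18446744066259977121ULL;

	/*
	 * stime < total  =>  stime * rtime / total < rtime <= U64_MAX,
	 * so the quotient of the 128-by-64 divide always fits in 64 bits
	 * and divq cannot raise #DE for overflow.
	 */
	u128 q = (u128)stime * rtime / total;

	printf("quotient = %llu, <= rtime: %d\n", (u64)q, q <= (u128)rtime);
	return 0;
}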

So maybe we can do

	static u64 scale_stime(u64 stime, u64 rtime, u64 total)
	{
		u64 res = 0, div, rem;

		#ifdef mul_u64_u64_div_u64
		return mul_u64_u64_div_u64(stime, rtime, total);
		#endif

		if (ilog2(stime) + ilog2(rtime) > 62) {
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;

			int shift = ilog2(stime) + ilog2(rtime) - 62;
			if (shift > 0) {
				rtime >>= shift;
				total >>= shift;
				if (!total)
					return res;
			}
		}

		return res + div64_u64(stime * rtime, total);
	}

?

Oleg.



* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-19 13:47   ` Peter Zijlstra
@ 2019-07-19 14:37     ` Oleg Nesterov
  2019-07-22 19:56       ` Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2019-07-19 14:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Linus Torvalds, Stanislaw Gruszka

On 07/19, Peter Zijlstra wrote:
>
> > > 	$ ./stime 300000
> > > 	start=300000000000000
> > > 	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
> > > 	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
> > > 	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
> > > 	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
> > > 	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
> > > 	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)
> > >
> > > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > >
> > > The new implementation of scale_stime() does the additional div64_u64_rem()
> > > in a loop but see the comment, as long it is used by cputime_adjust() this
> > > can happen only once.
> >
> > That only shows something after long long staring :/ There's no words on
> > what the output actually means or what would've been expected.
> >
> > Also, your example is incomplete; the below is a test for scale_stime();
> > from this we can see that the division results in too large a number,
> > but, important for our use-case in cputime_adjust(), it is a step
> > function (due to loss in precision) and for every plateau we shift
> > runtime into the wrong bucket.
>
> But I'm still confused, since in the long run, it should still end up
> with a proportionally divided user/system, irrespective of some short
> term wobblies.

Why?

Yes, statistically the numbers are proportionally divided.

but you will (probably) never see the real stime == 1000 && utime == 10000
numbers if you watch incrementally.

Just in case... yes I know that these numbers can only "converge" to the
reality, only their sum is correct. But people complain.

Oleg.



* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-19 11:03 ` Peter Zijlstra
  2019-07-19 13:47   ` Peter Zijlstra
  2019-07-19 14:03   ` Oleg Nesterov
@ 2019-07-22 10:52   ` Stanislaw Gruszka
  2019-07-22 20:00     ` Peter Zijlstra
  2 siblings, 1 reply; 29+ messages in thread
From: Stanislaw Gruszka @ 2019-07-22 10:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Ingo Molnar, Thomas Gleixner, Andrew Fox,
	Stephen Johnston, linux-kernel, Linus Torvalds

On Fri, Jul 19, 2019 at 01:03:49PM +0200, Peter Zijlstra wrote:
> > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > 
> > The new implementation of scale_stime() does the additional div64_u64_rem()
> > in a loop but see the comment, as long it is used by cputime_adjust() this
> > can happen only once.
> 
> That only shows something after long long staring :/ There's no words on
> what the output actually means or what would've been expected.
> 
> Also, your example is incomplete; the below is a test for scale_stime();
> from this we can see that the division results in too large a number,
> but, important for our use-case in cputime_adjust(), it is a step
> function (due to loss in precision) and for every plateau we shift
> runtime into the wrong bucket.
> 
> Your proposed function works; but is atrocious, esp. on 32bit. That
> said, before we 'fixed' it, it had similar horrible divisions in, see
> commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").
> 
> Included below is also an x86_64 implementation in 2 instructions.
> 
> I'm still trying see if there's anything saner we can do...

I was always a proponent of removing the scaling and exporting the raw
values and sum_exec_runtime. But that has an obvious drawback: it
reintroduces the 'top hiding' issue.

But maybe we can export the raw values in a separate file, i.e.
/proc/[pid]/raw_cpu_times ? Then applications that require more precise
cputime values for very long-living processes could use this file.

Stanislaw


* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-19 14:03   ` Oleg Nesterov
@ 2019-07-22 19:45     ` Peter Zijlstra
  0 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-22 19:45 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Linus Torvalds, Stanislaw Gruszka

On Fri, Jul 19, 2019 at 04:03:25PM +0200, Oleg Nesterov wrote:
> On 07/19, Peter Zijlstra wrote:
> > Included below is also an x86_64 implementation in 2 instructions.
> 
> But we need the arch-neutral implementation anyway, the code above
> is the best I could invent.

Agreed; we do. Depending on the cost of division and if the arch has a
64x64->128 mult, it might be better to compute a reciprocal and multiply
that, but yes, long staring didn't get me much better ideas either.

> But see below!
> 
> > I'm still trying see if there's anything saner we can do...
> 
> Oh, please, it is not that I like my solution very much, I would like
> to see something more clever.
> 
> > static noinline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
> > {
> > 	u64 q;
> > 	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
> > 	return q;
> > }
> 
> Heh. I have to admit that I didn't know that divq divides 128bit by
> 64bit. gcc calls the __udivti3 intrinsic in this case so I wrongly
> came to conclusion this is not simple even on x86_64. Plus the fact
> that linux/math64.h only has mul_u64_u64_shr()...

C wants to promote the dividend and divisor to the same type (int128)
and then it runs into trouble.
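
Presumably the plain-C version tried was something like the sketch below (an
assumption, not the actual code); because gcc cannot prove that the quotient
fits in 64 bits, the 128-by-64 division is lowered to a __udivti3 library call
on x86-64 instead of a single divq:

typedef unsigned long long u64;

/* plain-C variant: multiply in 128 bits, then a generic 128-bit divide */
static u64 mul_u64_u64_div_u64_c(u64 a, u64 b, u64 c)
{
	return (unsigned __int128)a * b / c;	/* gcc emits a call to __udivti3 */
}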

But yeah, I don't know how many other 64bit archs can pull off that
trick. I asked, and ARGH64 cannot do that 128/64 (although it can do a
64x64->128 in two instructions).



* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-19 14:37     ` Oleg Nesterov
@ 2019-07-22 19:56       ` Peter Zijlstra
  2019-07-23 14:00         ` Oleg Nesterov
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-22 19:56 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Linus Torvalds, Stanislaw Gruszka

On Fri, Jul 19, 2019 at 04:37:42PM +0200, Oleg Nesterov wrote:
> On 07/19, Peter Zijlstra wrote:

> > But I'm still confused, since in the long run, it should still end up
> > with a proportionally divided user/system, irrespective of some short
> > term wobblies.
> 
> Why?
> 
> Yes, statistically the numbers are proportionally divided.

This; due to the loss in precision the distribution is like a step
function around the actual s:u ratio line, but on average it still is
s:u.

Even if it were a perfect function, we'd still see increments in stime even
if the current program state never does syscalls, simply because it
needs to stay on that s:u line.

> but you will (probably) never see the real stime == 1000 && utime == 10000
> numbers if you watch incrementally.

See, there are no 'real' stime and utime numbers. What we have are user
and system samples -- tick based.

If the tick lands in the kernel, we get a system sample, if the tick
lands in userspace we get a user sample.

What we do have is an accurate (ns) based runtime accounting, and we
(re)construct stime and utime from this; we divide the total known
runtime in stime and utime pro-rata.

Sure, we take a shortcut, it wobbles a bit, but seriously, the samples
are inaccurate anyway, so who bloody cares :-)

You can construct a program that runs 99% in userspace but has all
system samples. All you need to do is make sure you're in a system call
when the tick lands.

> Just in case... yes I know that these numbers can only "converge" to the
> reality, only their sum is correct. But people complain.

People always complain, just tell em to go pound sand :-)



* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-22 10:52   ` Stanislaw Gruszka
@ 2019-07-22 20:00     ` Peter Zijlstra
  2019-07-23  9:37       ` Stanislaw Gruszka
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-22 20:00 UTC (permalink / raw)
  To: Stanislaw Gruszka
  Cc: Oleg Nesterov, Ingo Molnar, Thomas Gleixner, Andrew Fox,
	Stephen Johnston, linux-kernel, Linus Torvalds

On Mon, Jul 22, 2019 at 12:52:41PM +0200, Stanislaw Gruszka wrote:
> On Fri, Jul 19, 2019 at 01:03:49PM +0200, Peter Zijlstra wrote:
> > > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > > 
> > > The new implementation of scale_stime() does the additional div64_u64_rem()
> > > in a loop but see the comment, as long it is used by cputime_adjust() this
> > > can happen only once.
> > 
> > That only shows something after long long staring :/ There's no words on
> > what the output actually means or what would've been expected.
> > 
> > Also, your example is incomplete; the below is a test for scale_stime();
> > from this we can see that the division results in too large a number,
> > but, important for our use-case in cputime_adjust(), it is a step
> > function (due to loss in precision) and for every plateau we shift
> > runtime into the wrong bucket.
> > 
> > Your proposed function works; but is atrocious, esp. on 32bit. That
> > said, before we 'fixed' it, it had similar horrible divisions in, see
> > commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").
> > 
> > Included below is also an x86_64 implementation in 2 instructions.
> > 
> > I'm still trying see if there's anything saner we can do...
> 
> I was always proponent of removing scaling and export raw values
> and sum_exec_runtime. But that has obvious drawback, reintroduce
> 'top hiding' issue.

I think (but didn't grep) that we actually export sum_exec_runtime in
/proc/ *somewhere*.

> But maybe we can export raw values in separate file i.e.
> /proc/[pid]/raw_cpu_times ? So applications that require more precise
> cputime values for very long-living processes can use this file.

There are no raw cpu_times, there are system and user samples, and
samples are, by their very nature, an approximation. We just happen to
track the samples in TICK_NSEC resolution these days, but they're still
ticks (except on s390 and maybe other archs, which do time accounting in
the syscall path).

But I think you'll find x86 people are quite opposed to doing TSC reads
in syscall entry and exit :-)



* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-22 20:00     ` Peter Zijlstra
@ 2019-07-23  9:37       ` Stanislaw Gruszka
  0 siblings, 0 replies; 29+ messages in thread
From: Stanislaw Gruszka @ 2019-07-23  9:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Ingo Molnar, Thomas Gleixner, Andrew Fox,
	Stephen Johnston, linux-kernel, Linus Torvalds

On Mon, Jul 22, 2019 at 10:00:34PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 22, 2019 at 12:52:41PM +0200, Stanislaw Gruszka wrote:
> > On Fri, Jul 19, 2019 at 01:03:49PM +0200, Peter Zijlstra wrote:
> > > > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > > > 
> > > > The new implementation of scale_stime() does the additional div64_u64_rem()
> > > > in a loop but see the comment, as long it is used by cputime_adjust() this
> > > > can happen only once.
> > > 
> > > That only shows something after long long staring :/ There's no words on
> > > what the output actually means or what would've been expected.
> > > 
> > > Also, your example is incomplete; the below is a test for scale_stime();
> > > from this we can see that the division results in too large a number,
> > > but, important for our use-case in cputime_adjust(), it is a step
> > > function (due to loss in precision) and for every plateau we shift
> > > runtime into the wrong bucket.
> > > 
> > > Your proposed function works; but is atrocious, esp. on 32bit. That
> > > said, before we 'fixed' it, it had similar horrible divisions in, see
> > > commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").
> > > 
> > > Included below is also an x86_64 implementation in 2 instructions.
> > > 
> > > I'm still trying see if there's anything saner we can do...
> > 
> > I was always proponent of removing scaling and export raw values
> > and sum_exec_runtime. But that has obvious drawback, reintroduce
> > 'top hiding' issue.
> 
> I think (but didn't grep) that we actually export sum_exec_runtime in
> /proc/ *somewhere*.

Yes, it is, in /proc/[PID]/schedstat for CONFIG_SCHEDSTAT=y and in
/proc/[PID]/sched for CONFIG_SCHED_DEBUG=y

> > But maybe we can export raw values in separate file i.e.
> > /proc/[pid]/raw_cpu_times ? So applications that require more precise
> > cputime values for very long-living processes can use this file.
> 
> There are no raw cpu_times, there are system and user samples, and
> samples are, by their very nature, an approximation. We just happen to
> track the samples in TICK_NSEC resolution these days, but they're still
> ticks (except on s390 and maybe other archs, which do time accounting in
> the syscall path).
> 
> But I think you'll find x86 people are quite opposed to doing TSC reads
> in syscall entry and exit :-)

By 'raw' I meant values that are stored in task_struct without
processing by cputime_adjust() -> scale_stime(). The idea is that
user space can do the scaling using floating point, so it does not have
to drop precision if the numbers are big.
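
For illustration, a rough user-space sketch of that idea (everything here is
hypothetical: no such raw interface exists, and the helper below is just one
way to do the pro-rata split in floating point):

#include <stdio.h>
#include <stdint.h>

/*
 * Hypothetical consumer of raw per-task times: split the accurate
 * sum_exec_runtime (ns) between stime and utime pro-rata, in floating
 * point, so huge values only suffer double rounding (about 2^-52
 * relative) instead of the integer truncation discussed in this thread.
 */
static void scale_raw(uint64_t stime, uint64_t utime, uint64_t rtime,
		      uint64_t *out_st, uint64_t *out_ut)
{
	double total = (double)stime + (double)utime;

	if (total == 0.0) {
		*out_st = 0;
		*out_ut = rtime;
		return;
	}
	*out_st = (uint64_t)((double)rtime * ((double)stime / total));
	*out_ut = rtime - *out_st;
}

int main(void)
{
	uint64_t st, ut;

	/* arbitrary sample values, in nanoseconds */
	scale_raw(300000000000000ULL, 300000000000000ULL,
		  600000001234567ULL, &st, &ut);
	printf("stime=%llu utime=%llu\n",
	       (unsigned long long)st, (unsigned long long)ut);
	return 0;
}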

That was discussed on my RFC and PATCH series posts:
https://lore.kernel.org/lkml/1364489605-5443-1-git-send-email-sgruszka@redhat.com/
https://lore.kernel.org/lkml/1365066635-2959-1-git-send-email-sgruszka@redhat.com/

There was an objection that even if the kernel does not tell what utime/stime
values mean exactly, users already got used to the scaled behaviour and
changing this is a 'semantic' ABI breakage. And obviously this would make
the 'top hiding' worm work again for an unpatched top.

So perhaps we can add new exports of the non-scaled utime/stime and achieve
the same goal without changing the meaning of the existing fields. On the
other hand, maybe nowadays better tools exist to provide exact cputimes
to user space, i.e. 'perf stat' or bpf, so the proposed addition is unneeded.

Stanislaw


* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-22 19:56       ` Peter Zijlstra
@ 2019-07-23 14:00         ` Oleg Nesterov
  2019-07-23 14:29           ` Oleg Nesterov
  0 siblings, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2019-07-23 14:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Linus Torvalds, Stanislaw Gruszka

On 07/22, Peter Zijlstra wrote:
>
> On Fri, Jul 19, 2019 at 04:37:42PM +0200, Oleg Nesterov wrote:
> > On 07/19, Peter Zijlstra wrote:
>
> > > But I'm still confused, since in the long run, it should still end up
> > > with a proportionally divided user/system, irrespective of some short
> > > term wobblies.
> >
> > Why?
> >
> > Yes, statistically the numbers are proportionally divided.
>
> This; due to the loss in precision the distribution is like a step
> function around the actual s:u ratio line, but on average it still is
> s:u.

You know, I am no longer sure... perhaps it is even worse, I need to recheck.

> Even if it were a perfect function, we'd still see increments in stime even
> if the current program state never does syscalls, simply because it
> needs to stay on that s:u line.
>
> > but you will (probably) never see the real stime == 1000 && utime == 10000
> > numbers if you watch incrementally.
>
> See, there are no 'real' stime and utime numbers. What we have are user
> and system samples -- tick based.

Yes, yes, I know.

> Sure, we take a shortcut, it wobbles a bit, but seriously, the samples
> are inaccurate anyway, so who bloody cares :-)
...
> People always complain, just tell em to go pound sand :-)

I tried ;) this was my initial reaction to this bug report.

However,

> You can construct a program that runs 99% in userspace but has all
> system samples.

Yes, but with the current implementation you do not need to construct
such a program, this is what you can easily get "in practice". And this
confuses people.

They can watch /proc/pid/stat incrementally and (when the numbers are big)
find that a program that runs 100% in userspace somehow spends 10 minutes
almost entirely in kernel. Or at least more in kernel than in userspace.
Even if task->stime doesn't grow at all.

Oleg.



* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-23 14:00         ` Oleg Nesterov
@ 2019-07-23 14:29           ` Oleg Nesterov
  0 siblings, 0 replies; 29+ messages in thread
From: Oleg Nesterov @ 2019-07-23 14:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Linus Torvalds, Stanislaw Gruszka

On 07/23, Oleg Nesterov wrote:
>
> > > Yes, statistically the numbers are proportionally divided.
> >
> > This; due to the loss in precision the distribution is like a step
> > function around the actual s:u ratio line, but on average it still is
> > s:u.
>
> You know, I am no longer sure... perhaps it is even worse, I need to recheck.

To clarify, this is probably true even if prev_cputime adds more confusion.
But how many minutes (hours? days?) do you need to look at /proc/pid/stat
to get an average which more or less matches the reality?

Oleg.



* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2019-07-18 13:18 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
                   ` (2 preceding siblings ...)
  2019-07-19 11:03 ` Peter Zijlstra
@ 2020-01-22 16:46 ` Oleg Nesterov
  2020-01-23 13:05   ` Oleg Nesterov
  2020-01-24 15:42   ` Oleg Nesterov
  2020-01-27 12:28 ` [PATCH v2] " Oleg Nesterov
  4 siblings, 2 replies; 29+ messages in thread
From: Oleg Nesterov @ 2020-01-22 16:46 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Thomas Gleixner
  Cc: Andrew Fox, Stephen Johnston, linux-kernel, Stanislaw Gruszka

Hi Peter, et al.

On 07/18, Oleg Nesterov wrote:
>
> People report that utime and stime from /proc/<pid>/stat become very wrong
> when the numbers are big enough. In particular, the monitored application
> can run all the time in user-space but only stime grows.

Let me reanimate this discussion and try again to convince you that
scale_stime() needs changes and this patch makes sense.

As a reminder, scale_stime(stime, rtime, total) is not precise, to say the
least. For example:

	stime = -1ul/33333; total = stime*3; rtime = total*5555555555;

scale_stime() returns 9067034312525142184 while the correct result is
6148914688753325707.

OK, these random numbers are not realistic, usually the relative error
is small enough.

However, even if the relative error is small, the absolute error can be
huge. And this means that if you watch /proc/$pid/stat incrementally
to see how stime/utime grow, you can get completely wrong numbers.

Say, utime (or stime) can be frozen for an unpredictably long time, as if
the monitored application "hangs" in kernel mode, while the real split
is 50/50. The test-case from the changelog tries to demonstrate this,
see https://lore.kernel.org/lkml/20190718131834.GA22211@redhat.com/

Yes, yes, there are no 'real' stime and utime numbers. But still, why can't
we improve scale_stime()?

-------------------------------------------------------------------------------
Let's look at the simplified (but less accurate) version I mentioned in
https://lore.kernel.org/lkml/20190718145549.GA22902@redhat.com

	u64 scale_stime(u64 stime, u64 rtime, u64 total)
	{
		u64 res = 0, div, rem;

		if (ilog2(stime) + ilog2(rtime) > 62) {
			div = div64_u64_rem(rtime, total, &rem);
			res = div * stime;
			rtime = rem;

			int shift = ilog2(stime) + ilog2(rtime) - 62;
			if (shift > 0) {
				rtime >>= shift;
				total >>= shift;
				if (!total)
					return res;
			}
		}

		return res + div64_u64(stime * rtime, total);
	}

It is much more accurate than the current version, in fact I think it
should be 100% accurate "in practice".

But there is another reason why I think it makes more sense. It is also
faster on x86-64, much faster when the numbers are big. See the naive
test code below. For example,

	$ ./test 553407856289849 18446744066259977121 1660223568869547
	553407856289849 * 18446744066259977121 / 1660223568869547 =
		(128) 6148914688753325707
		(asm) 6148914688753325707
		(new) 6148914691236512239
		(old) 9067034312525142184

	ticks:
		asm: 7183908591
		new: 4891383871
		old: 23585547775

(I am surprised it works faster than asm mul_u64_u64_div_u64() version
 on my machine, I don't understand why divq is so slow when rdx != 0).

I am going to send V2; I need to rewrite the changelog and comments.
But if you still think this isn't worth fixing, please tell me right
now.

What do you think?

Oleg.

-------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define   noinline                      __attribute__((__noinline__))

typedef unsigned __int128 u128;
typedef unsigned long long u64;
typedef unsigned int u32;

u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
{
	u64 q;
	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
	return q;
}

static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}
static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}

u64 new_scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	if (ilog2(stime) + ilog2(rtime) > 62) {
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		int shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			rtime >>= shift;
			total >>= shift;
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}

static inline u64 rdtsc(void)
{
	unsigned low, high;
	asm volatile("rdtsc" : "=a" (low), "=d" (high));
	return ((low) | ((u64)(high) << 32));
}

u64 S, R, T;

u64 noinline profile(u64 (*f)(u64,u64,u64))
{
//	u64 s = S, r = R, t = T;
	u64 tsc1, tsc2;
	int i;

	tsc1 = rdtsc();

	for (i = 0; i < 100*1000*1000; ++i)
//		f(s++, r++, t++);
		f(S,R,T);

	tsc2 = rdtsc();

	return tsc2 - tsc1;
}


int main(int argc, char **argv)
{
	if (argc != 4) {
		printf("usage: %s stime rtime total\n", argv[0]);
		return 1;
	}

	S = strtoull(argv[1], NULL, 0);
	R = strtoull(argv[2], NULL, 0);
	T = strtoull(argv[3], NULL, 0);
	assert(S < T);
	assert(T < R);

	if (1) {
		printf("%llu * %llu / %llu =\n", S,R,T);
		printf("\t(128) %lld\n", (u64)( ((u128)S)*((u128)R)/((u128)T) ));
		printf("\t(asm) %lld\n", mul_u64_u64_div_u64(S,R,T));
		printf("\t(new) %lld\n", new_scale_stime(S,R,T));
		printf("\t(old) %lld\n", scale_stime(S,R,T));
		printf("\n");
	}

	printf("ticks:\n");
	printf("\tasm: %lld\n", profile(mul_u64_u64_div_u64));
	printf("\tnew: %lld\n", profile(new_scale_stime));
	printf("\told: %lld\n", profile(scale_stime));

	return 0;
}


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2020-01-22 16:46 ` Oleg Nesterov
@ 2020-01-23 13:05   ` Oleg Nesterov
  2020-01-24 15:42   ` Oleg Nesterov
  1 sibling, 0 replies; 29+ messages in thread
From: Oleg Nesterov @ 2020-01-23 13:05 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Thomas Gleixner
  Cc: Andrew Fox, Stephen Johnston, linux-kernel, Stanislaw Gruszka

On 01/22, Oleg Nesterov wrote:
>
> But there is another reason why I think it makes more sense. It is also
> faster on x86-64, much faster when the numbers are big. See the naive
> test code below. For example,
>
> 	$ ./test 553407856289849 18446744066259977121 1660223568869547
> 	553407856289849 * 18446744066259977121 / 1660223568869547 =
> 		(128) 6148914688753325707
> 		(asm) 6148914688753325707
> 		(new) 6148914691236512239
> 		(old) 9067034312525142184
>
> 	ticks:
> 		asm: 7183908591
> 		new: 4891383871
> 		old: 23585547775

Just for completeness, see the updated code which can be compiled with -m32.
As expected, my version is slower on 32-bit when the numbers are small:

	$ ./test 1 3 2
	1 * 3 / 2 =
		(new) 1
		(old) 1

	ticks:
		new: 3624344961
		old: 2514403456

But still faster when rtime is big enough:

	$ ./test 1 68719476736 2
	1 * 68719476736 / 2 =
		(new) 34359738368
		(old) 34359738368

	ticks:
		new: 5044284834
		old: 5347969883

	$ ./test 553407856289849 18446744066259977121 1660223568869547
	553407856289849 * 18446744066259977121 / 1660223568869547 =
		(new) 6148914691236512239
		(old) 9067034312525142184

	ticks:
		new: 11496181242
		old: 33622910386

Oleg.

------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define   noinline                      __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;

#ifdef __x86_64__
typedef unsigned __int128 u128;

u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
{
	u64 q;
	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
	return q;
}

static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}
static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}
#else // 32-bit
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	union {
		u64 v64;
		u32 v32[2];
	} d = { dividend };
	u32 upper;

	upper = d.v32[1];
	d.v32[1] = 0;
	if (upper >= divisor) {
		d.v32[1] = upper / divisor;
		upper %= divisor;
	}
	asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
		"rm" (divisor), "0" (d.v32[0]), "1" (upper));
	return d.v64;
}

static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static inline int fls(unsigned int x)
{
	int r;

	asm("bsrl %1,%0\n\t"
	    "cmovzl %2,%0"
	    : "=&r" (r) : "rm" (x), "rm" (-1));

	return r + 1;
}
static inline int fls64(u64 x)
{
	u32 h = x >> 32;
	if (h)
		return fls(h) + 32;
	return fls(x);
}

u64 noinline div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		u32 rem32;
		quot = div_u64_rem(dividend, divisor, &rem32);
		*remainder = rem32;
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;

		*remainder = dividend - quot * divisor;
		if (*remainder >= divisor) {
			quot++;
			*remainder -= divisor;
		}
	}

	return quot;
}
u64 noinline div64_u64(u64 dividend, u64 divisor)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		quot = div_u64(dividend, divisor);
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;
		if ((dividend - quot * divisor) >= divisor)
			quot++;
	}

	return quot;
}
#endif

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}

u64 new_scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	if (ilog2(stime) + ilog2(rtime) > 62) {
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		int shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			rtime >>= shift;
			total >>= shift;
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}

static inline u64 rdtsc(void)
{
	unsigned low, high;
	asm volatile("rdtsc" : "=a" (low), "=d" (high));
	return ((low) | ((u64)(high) << 32));
}

u64 S, R, T;

u64 noinline profile(u64 (*f)(u64,u64,u64))
{
//	u64 s = S, r = R, t = T;
	u64 tsc1, tsc2;
	int i;

	tsc1 = rdtsc();

	for (i = 0; i < 100*1000*1000; ++i)
//		f(s++, r++, t++);
		f(S,R,T);

	tsc2 = rdtsc();

	return tsc2 - tsc1;
}


int main(int argc, char **argv)
{
	if (argc != 4) {
		printf("usage: %s stime rtime total\n", argv[0]);
		return 1;
	}

	S = strtoull(argv[1], NULL, 0);
	R = strtoull(argv[2], NULL, 0);
	T = strtoull(argv[3], NULL, 0);
	assert(S < T);
	assert(T < R);

	if (1) {
		printf("%llu * %llu / %llu =\n", S,R,T);
#ifdef __x86_64__
		printf("\t(128) %lld\n", (u64)( ((u128)S)*((u128)R)/((u128)T) ));
		printf("\t(asm) %lld\n", mul_u64_u64_div_u64(S,R,T));
#endif
		printf("\t(new) %lld\n", new_scale_stime(S,R,T));
		printf("\t(old) %lld\n", scale_stime(S,R,T));
		printf("\n");
	}

	printf("ticks:\n");
#ifdef __x86_64__
	printf("\tasm: %lld\n", profile(mul_u64_u64_div_u64));
#endif
	printf("\tnew: %lld\n", profile(new_scale_stime));
	printf("\told: %lld\n", profile(scale_stime));

	return 0;
}


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] sched/cputime: make scale_stime() more precise
  2020-01-22 16:46 ` Oleg Nesterov
  2020-01-23 13:05   ` Oleg Nesterov
@ 2020-01-24 15:42   ` Oleg Nesterov
  1 sibling, 0 replies; 29+ messages in thread
From: Oleg Nesterov @ 2020-01-24 15:42 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Thomas Gleixner
  Cc: Andrew Fox, Stephen Johnston, linux-kernel, Stanislaw Gruszka

On 01/22, Oleg Nesterov wrote:
>
> To remind, scale_stime(stime, rtime, total) is not precise, to say at
> least. For example:
>
> 	stime = -1ul/33333; total = stime*3; rtime = total*5555555555;
>
> scale_stime() returns 9067034312525142184 while the correct result is
> 6148914688753325707.
>
> OK, these random numbers are not realistic, usually the relative error
> is small enough.
>
> However, even if the relative error is small, the absolute error can be
> huge. And this means that if you watch /proc/$pid/status incrementally
> to see how stime/utime grow, you can get the completely wrong numbers.
>
> Say, utime (or stime) can be frozen for unpredictably long time, as if
> the monitored application "hangs" in kernel mode, while the real split
> is 50/50.

See another test-case below. Arguments:

	start_time start_utime_percent inc_time inc_utime_percent

For example,

	$ ./test 8640000 50 600 50 | head

simulates a process which runs for 100 days 50/50 in user/kernel mode, then
starts to check utime/stime every 600 seconds and prints the difference.

The output:

               old               	               new
               0:600000000000    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
    600000000000:0               	    300000000000:300000000000
    499469920248:100530079752    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
               0:600000000000    	    300000000000:300000000000
    600000000000:0               	    300000000000:300000000000
    499490181588:100509818412    	    300000000000:300000000000


It looks as if this process can spend 20 minutes entirely in kernel mode.

Oleg.

-------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define   noinline                      __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;
typedef unsigned __int128 u128;

static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}
static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}

u64 new_scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	if (ilog2(stime) + ilog2(rtime) > 62) {
		div = div64_u64_rem(rtime, total, &rem);
		res = div * stime;
		rtime = rem;

		int shift = ilog2(stime) + ilog2(rtime) - 62;
		if (shift > 0) {
			rtime >>= shift;
			total >>= shift;
			if (!total)
				return res;
		}
	}

	return res + div64_u64(stime * rtime, total);
}

struct task_cputime {
	u64				stime;
	u64				utime;
	unsigned long long		sum_exec_runtime;
};
struct prev_cputime {
	u64				utime;
	u64				stime;
};

void cputime_adjust(int new, struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	u64 rtime, stime, utime;

	rtime = curr->sum_exec_runtime;

	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	stime = (new ? new_scale_stime : scale_stime)(stime, rtime, stime + utime);

update:
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
}

void prdiff(int new, struct task_cputime *curr, struct prev_cputime *prev)
{
	struct prev_cputime __prev = *prev;
	u64 ut, st, ud, sd;

	cputime_adjust(new, curr, prev, &ut, &st);
	ud = ut - __prev.utime;
	sd = st - __prev.stime;

	printf("%16llu:%-16llu", ud, sd);
}

#define SEC	1000000000ULL

void parse_cputime(struct task_cputime *t, char **argv)
{
	double total = strtod(argv[0], NULL) * SEC;
	double utime = strtod(argv[1], NULL) / 100;

	utime *= total;
	t->utime = utime;
	t->stime = total - utime;
}

int main(int argc, char **argv)
{
	struct prev_cputime old_prev = {};
	struct prev_cputime new_prev = {};
	struct task_cputime curr, diff;
	u64 tmp;

	if (argc != 5) {
		printf("usage: %s start_time utime_percent inc_time utime_percent\n", argv[0]);
		return 0;
	}

	parse_cputime(&curr, argv+1);
	parse_cputime(&diff, argv+3);

	curr.sum_exec_runtime = curr.utime + curr.stime;
	cputime_adjust(0, &curr, &old_prev, &tmp, &tmp);
	cputime_adjust(1, &curr, &new_prev, &tmp, &tmp);

	printf("%18s%15s\t%18s\n", "old", "", "new");
	for (;;) {
		curr.utime += diff.utime;
		curr.stime += diff.stime;
		curr.sum_exec_runtime = curr.utime + curr.stime;

		prdiff(0, &curr, &old_prev);
		printf("\t");
		prdiff(1, &curr, &new_prev);
		printf("\n");
	}

	return 0;
}


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v2] sched/cputime: make scale_stime() more precise
  2019-07-18 13:18 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
                   ` (3 preceding siblings ...)
  2020-01-22 16:46 ` Oleg Nesterov
@ 2020-01-27 12:28 ` Oleg Nesterov
  2020-05-15 17:24   ` Oleg Nesterov
  2020-05-19 17:25   ` Peter Zijlstra
  4 siblings, 2 replies; 29+ messages in thread
From: Oleg Nesterov @ 2020-01-27 12:28 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Thomas Gleixner
  Cc: Andrew Fox, Stephen Johnston, linux-kernel, Stanislaw Gruszka,
	Linus Torvalds

People report that utime and stime from /proc/<pid>/stat become very
wrong when the numbers are big enough, especially if you watch these
counters incrementally.

Say, if the monitored process runs 100 days 50/50 in user/kernel mode
it looks as if it runs 20 minutes entirely in kernel mode, then 20
minutes in user mode. See the test-case which tries to demonstrate this
behaviour:

	https://lore.kernel.org/lkml/20200124154215.GA14714@redhat.com/

The new implementation does the additional div64_u64_rem() but according
to my naive measurements it is faster on x86_64, much faster if rtime/etc
are big enough. See

	https://lore.kernel.org/lkml/20200123130541.GA30620@redhat.com/

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/sched/cputime.c | 65 +++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d43318a..ae1ea09 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -528,42 +528,41 @@ void account_idle_ticks(unsigned long ticks)
  */
 static u64 scale_stime(u64 stime, u64 rtime, u64 total)
 {
-	u64 scaled;
+	u64 res = 0, div, rem;
+	int shift;
 
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime)
-			swap(rtime, stime);
-
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
-
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
-		rtime >>= 1;
-		total >>= 1;
+	/* can stime * rtime overflow ? */
+	if (ilog2(stime) + ilog2(rtime) > 62) {
+		/*
+		 * (rtime * stime) / total is equal to
+		 *
+		 *	(rtime / total) * stime +
+		 *	(rtime % total) * stime / total
+		 *
+		 * if nothing overflows. Can the 1st multiplication
+		 * overflow? Yes, but we do not care: this can only
+		 * happen if the end result can't fit in u64 anyway.
+		 *
+		 * So the code below does
+		 *
+		 *	res = (rtime / total) * stime;
+		 *	rtime = rtime % total;
+		 */
+		div = div64_u64_rem(rtime, total, &rem);
+		res = div * stime;
+		rtime = rem;
+
+		shift = ilog2(stime) + ilog2(rtime) - 62;
+		if (shift > 0) {
+			/* drop precision */
+			rtime >>= shift;
+			total >>= shift;
+			if (!total)
+				return res;
+		}
 	}
 
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
+	return res + div64_u64(stime * rtime, total);
 }
 
 /*
-- 
2.5.0
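
A quick user-space check of the identity the comment above relies on,
using the numbers from the earlier ./test run (this is an illustrative
aside, not kernel code; the __int128 remainder term here is exact,
whereas the kernel version may still shift precision away):

	#include <stdio.h>

	typedef unsigned long long u64;
	typedef unsigned __int128 u128;

	int main(void)
	{
		u64 stime = 553407856289849ULL;
		u64 rtime = 18446744066259977121ULL;
		u64 total = 1660223568869547ULL;

		/* reference: full 128-bit multiply, then divide */
		u64 exact = (u64)((u128)stime * rtime / total);

		/* (rtime / total) * stime + (rtime % total) * stime / total */
		u64 split = (rtime / total) * stime +
			    (u64)((u128)(rtime % total) * stime / total);

		/* both lines print 6148914688753325707 */
		printf("exact: %llu\nsplit: %llu\n", exact, split);
		return 0;
	}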



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-01-27 12:28 ` [PATCH v2] " Oleg Nesterov
@ 2020-05-15 17:24   ` Oleg Nesterov
  2020-05-19 17:25   ` Peter Zijlstra
  1 sibling, 0 replies; 29+ messages in thread
From: Oleg Nesterov @ 2020-05-15 17:24 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Thomas Gleixner
  Cc: Andrew Fox, Stephen Johnston, linux-kernel, Stanislaw Gruszka,
	Linus Torvalds

ping...

Peter, could you comment?

On 01/27, Oleg Nesterov wrote:
>
> People report that utime and stime from /proc/<pid>/stat become very
> wrong when the numbers are big enough, especially if you watch these
> counters incrementally.
> 
> Say, if the monitored process runs 100 days 50/50 in user/kernel mode
> it looks as if it runs 20 minutes entirely in kernel mode, then 20
> minutes in user mode. See the test-case which tries to demonstrate this
> behaviour:
> 
> 	https://lore.kernel.org/lkml/20200124154215.GA14714@redhat.com/
> 
> The new implementation does the additional div64_u64_rem() but according
> to my naive measurements it is faster on x86_64, much faster if rtime/etc
> are big enough. See
> 
> 	https://lore.kernel.org/lkml/20200123130541.GA30620@redhat.com/
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
>  kernel/sched/cputime.c | 65 +++++++++++++++++++++++++-------------------------
>  1 file changed, 32 insertions(+), 33 deletions(-)
> 
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index d43318a..ae1ea09 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -528,42 +528,41 @@ void account_idle_ticks(unsigned long ticks)
>   */
>  static u64 scale_stime(u64 stime, u64 rtime, u64 total)
>  {
> -	u64 scaled;
> +	u64 res = 0, div, rem;
> +	int shift;
>  
> -	for (;;) {
> -		/* Make sure "rtime" is the bigger of stime/rtime */
> -		if (stime > rtime)
> -			swap(rtime, stime);
> -
> -		/* Make sure 'total' fits in 32 bits */
> -		if (total >> 32)
> -			goto drop_precision;
> -
> -		/* Does rtime (and thus stime) fit in 32 bits? */
> -		if (!(rtime >> 32))
> -			break;
> -
> -		/* Can we just balance rtime/stime rather than dropping bits? */
> -		if (stime >> 31)
> -			goto drop_precision;
> -
> -		/* We can grow stime and shrink rtime and try to make them both fit */
> -		stime <<= 1;
> -		rtime >>= 1;
> -		continue;
> -
> -drop_precision:
> -		/* We drop from rtime, it has more bits than stime */
> -		rtime >>= 1;
> -		total >>= 1;
> +	/* can stime * rtime overflow ? */
> +	if (ilog2(stime) + ilog2(rtime) > 62) {
> +		/*
> +		 * (rtime * stime) / total is equal to
> +		 *
> +		 *	(rtime / total) * stime +
> +		 *	(rtime % total) * stime / total
> +		 *
> +		 * if nothing overflows. Can the 1st multiplication
> +		 * overflow? Yes, but we do not care: this can only
> +		 * happen if the end result can't fit in u64 anyway.
> +		 *
> +		 * So the code below does
> +		 *
> +		 *	res = (rtime / total) * stime;
> +		 *	rtime = rtime % total;
> +		 */
> +		div = div64_u64_rem(rtime, total, &rem);
> +		res = div * stime;
> +		rtime = rem;
> +
> +		shift = ilog2(stime) + ilog2(rtime) - 62;
> +		if (shift > 0) {
> +			/* drop precision */
> +			rtime >>= shift;
> +			total >>= shift;
> +			if (!total)
> +				return res;
> +		}
>  	}
>  
> -	/*
> -	 * Make sure gcc understands that this is a 32x32->64 multiply,
> -	 * followed by a 64/32->64 divide.
> -	 */
> -	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
> -	return scaled;
> +	return res + div64_u64(stime * rtime, total);
>  }
>  
>  /*
> -- 
> 2.5.0
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-01-27 12:28 ` [PATCH v2] " Oleg Nesterov
  2020-05-15 17:24   ` Oleg Nesterov
@ 2020-05-19 17:25   ` Peter Zijlstra
  2020-05-19 18:33     ` Linus Torvalds
                       ` (2 more replies)
  1 sibling, 3 replies; 29+ messages in thread
From: Peter Zijlstra @ 2020-05-19 17:25 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Stanislaw Gruszka, Linus Torvalds

On Mon, Jan 27, 2020 at 01:28:17PM +0100, Oleg Nesterov wrote:
> People report that utime and stime from /proc/<pid>/stat become very
> wrong when the numbers are big enough, especially if you watch these
> counters incrementally.
> 
> Say, if the monitored process runs 100 days 50/50 in user/kernel mode
> it looks as if it runs 20 minutes entirely in kernel mode, then 20
> minutes in user mode. See the test-case which tries to demonstrate this
> behaviour:
> 
> 	https://lore.kernel.org/lkml/20200124154215.GA14714@redhat.com/
> 
> The new implementation does the additional div64_u64_rem() but according
> to my naive measurements it is faster on x86_64, much faster if rtime/etc
> are big enough. See
> 
> 	https://lore.kernel.org/lkml/20200123130541.GA30620@redhat.com/

Right, so -m32 when run on x86_64 CPUs isn't really fair, because then
it still has hardware fls() for ilog2() and massively fast multiply and
divide instructions. Try and run this on a puny 32-bit ARM that maybe
has a hardware multiplier.

Anyway, how about we write it like the below and then when some puny
architecture comes complaining we can use Linus' original algorithm for
their arch implementation.

Hmm?

---
 arch/x86/include/asm/div64.h | 14 ++++++++++++--
 include/linux/math64.h       | 41 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/cputime.c       | 46 +-------------------------------------------
 3 files changed, 54 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9b8cb50768c2..b8f1dc0761e4 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -74,16 +74,26 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
 #else
 # include <asm-generic/div64.h>
 
-static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+/*
+ * Will generate an #DE when the result doesn't fit u64, could fix with an
+ * __ex_table[] entry when it becomes an issue.
+ */
+static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
 {
 	u64 q;
 
 	asm ("mulq %2; divq %3" : "=a" (q)
-				: "a" (a), "rm" ((u64)mul), "rm" ((u64)div)
+				: "a" (a), "rm" (mul), "rm" (div)
 				: "rdx");
 
 	return q;
 }
+#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+
+static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+{
+	return mul_u64_u64_div_u64(a, mul, div);
+}
 #define mul_u64_u32_div	mul_u64_u32_div
 
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 11a267413e8e..22b6f173dccb 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -263,6 +263,47 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 }
 #endif /* mul_u64_u32_div */
 
+#ifndef mul_u64_u64_div_u64
+static inline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+{
+	u64 res = 0, div, rem;
+	int shift;
+
+	/* can a * b overflow ? */
+	if (ilog2(a) + ilog2(b) > 62) {
+		/*
+		 * (b * a) / c is equal to
+		 *
+		 *	(b / c) * a +
+		 *	(b % c) * a / c
+		 *
+		 * if nothing overflows. Can the 1st multiplication
+		 * overflow? Yes, but we do not care: this can only
+		 * happen if the end result can't fit in u64 anyway.
+		 *
+		 * So the code below does
+		 *
+		 *	res = (b / c) * a;
+		 *	b = b % c;
+		 */
+		div = div64_u64_rem(b, c, &rem);
+		res = div * a;
+		b = rem;
+
+		shift = ilog2(a) + ilog2(b) - 62;
+		if (shift > 0) {
+			/* drop precision */
+			b >>= shift;
+			c >>= shift;
+			if (!c)
+				return res;
+		}
+	}
+
+	return res + div64_u64(a * b, c);
+}
+#endif /* mul_u64_u64_div_u64 */
+
 #define DIV64_U64_ROUND_UP(ll, d)	\
 	({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff9435dee1df..5a55d2300452 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -519,50 +519,6 @@ void account_idle_ticks(unsigned long ticks)
 	account_idle_time(cputime);
 }
 
-/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
-	u64 scaled;
-
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime)
-			swap(rtime, stime);
-
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
-
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
-		rtime >>= 1;
-		total >>= 1;
-	}
-
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
-}
-
 /*
  * Adjust tick based cputime random precision against scheduler runtime
  * accounting.
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
 		goto update;
 	}
 
-	stime = scale_stime(stime, rtime, stime + utime);
+	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
 
 update:
 	/*

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-05-19 17:25   ` Peter Zijlstra
@ 2020-05-19 18:33     ` Linus Torvalds
  2020-05-19 18:42       ` Peter Zijlstra
  2020-05-19 19:11       ` Peter Zijlstra
  2020-05-20 15:24     ` Oleg Nesterov
  2020-06-16 12:21     ` [tip: sched/core] sched/cputime: Improve cputime_adjust() tip-bot2 for Oleg Nesterov
  2 siblings, 2 replies; 29+ messages in thread
From: Linus Torvalds @ 2020-05-19 18:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Ingo Molnar, Thomas Gleixner, Andrew Fox,
	Stephen Johnston, Linux Kernel Mailing List, Stanislaw Gruszka

On Tue, May 19, 2020 at 10:25 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> --- a/include/linux/math64.h
> +++ b/include/linux/math64.h
> @@ -263,6 +263,47 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
>  }
>  #endif /* mul_u64_u32_div */
>
> +#ifndef mul_u64_u64_div_u64
> +static inline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)

Do we really want to inline this? Particularly if we expect this to be
the "architecture doesn't have a better version" case?

It's not like we'd expect people to call it with constant values that
could be optimized by inlining, is it? If any of the values are
actually constants and it's performance-critical, the code is likely
better off using some special helper rather than this anyway.

So I'd much rather see it as a weak function defined in
lib/math/div64.c, and then architectures don't even need to override
it at all. Just let them define their own (inline or not) function,
and we have this as a weak fallback.

No?

               Linus
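
A minimal user-space sketch of the __weak fallback pattern Linus
describes (the function name and values are illustrative, not from the
kernel):

	#include <stdio.h>

	typedef unsigned long long u64;

	/* generic fallback, emitted as a weak symbol: any ordinary
	 * (strong) definition of the same name in another object file
	 * replaces it at link time */
	__attribute__((weak)) u64 scale(u64 a, u64 b, u64 c)
	{
		return (u64)((unsigned __int128)a * b / c);
	}

	int main(void)
	{
		printf("%llu\n", scale(6, 10, 4));	/* prints 15 */
		return 0;
	}

In the kernel, __weak expands to __attribute__((weak)); with that
approach an architecture could override the generic lib/ fallback
simply by linking in its own, non-weak definition.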

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-05-19 18:33     ` Linus Torvalds
@ 2020-05-19 18:42       ` Peter Zijlstra
  2020-05-19 19:11       ` Peter Zijlstra
  1 sibling, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2020-05-19 18:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Oleg Nesterov, Ingo Molnar, Thomas Gleixner, Andrew Fox,
	Stephen Johnston, Linux Kernel Mailing List, Stanislaw Gruszka

On Tue, May 19, 2020 at 11:33:34AM -0700, Linus Torvalds wrote:
> On Tue, May 19, 2020 at 10:25 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > --- a/include/linux/math64.h
> > +++ b/include/linux/math64.h
> > @@ -263,6 +263,47 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
> >  }
> >  #endif /* mul_u64_u32_div */
> >
> > +#ifndef mul_u64_u64_div_u64
> > +static inline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
> 
> Do we really want to inline this? Particularly if we expect this to be
> the "architecture doesn't have a better version" case?
> 
> It's not like we'd expect people to call it with constant values that
> could be optimized by inlining, is it? If any of the values are
> actually constants and it's performance-critical, the code is likely
> better off using some special helper rather than this anyway.
> 
> So I'd much rather see it as a weak function defined in
> lib/math/div64.c, and then architectures don't even need to override
> it at all. Just let them define their own (inline or not) function,
> and we have this as a weak fallback.

I completely forgot we had a .c file to go with all this. Yes, I'll put
it in there.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-05-19 18:33     ` Linus Torvalds
  2020-05-19 18:42       ` Peter Zijlstra
@ 2020-05-19 19:11       ` Peter Zijlstra
  2020-05-19 19:51         ` Linus Torvalds
  1 sibling, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-05-19 19:11 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Oleg Nesterov, Ingo Molnar, Thomas Gleixner, Andrew Fox,
	Stephen Johnston, Linux Kernel Mailing List, Stanislaw Gruszka

On Tue, May 19, 2020 at 11:33:34AM -0700, Linus Torvalds wrote:
> So I'd much rather see it as a weak function defined in
> lib/math/div64.c, and then architectures don't even need to override
> it at all. Just let them define their own (inline or not) function,
> and we have this as a weak fallback.

My compiler doesn't like overriding a __weak with a static inline. It
complains about redefinitions.

It works with extern inline, though that is fairly rare in the
kernel. Still, it compiles and generates the right code.

---
 arch/x86/include/asm/div64.h | 13 +++++++++++--
 include/linux/math64.h       |  2 ++
 kernel/sched/cputime.c       | 46 +-------------------------------------------
 lib/math/div64.c             | 39 +++++++++++++++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9b8cb50768c2..320508d797de 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -74,16 +74,25 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
 #else
 # include <asm-generic/div64.h>
 
-static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+/*
+ * Will generate an #DE when the result doesn't fit u64, could fix with an
+ * __ex_table[] entry when it becomes an issue.
+ */
+extern __always_inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
 {
 	u64 q;
 
 	asm ("mulq %2; divq %3" : "=a" (q)
-				: "a" (a), "rm" ((u64)mul), "rm" ((u64)div)
+				: "a" (a), "rm" (mul), "rm" (div)
 				: "rdx");
 
 	return q;
 }
+
+static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+{
+	return mul_u64_u64_div_u64(a, mul, div);
+}
 #define mul_u64_u32_div	mul_u64_u32_div
 
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 11a267413e8e..d097119419e6 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -263,6 +263,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 }
 #endif /* mul_u64_u32_div */
 
+u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+
 #define DIV64_U64_ROUND_UP(ll, d)	\
 	({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff9435dee1df..5a55d2300452 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -519,50 +519,6 @@ void account_idle_ticks(unsigned long ticks)
 	account_idle_time(cputime);
 }
 
-/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
-	u64 scaled;
-
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime)
-			swap(rtime, stime);
-
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
-
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
-		rtime >>= 1;
-		total >>= 1;
-	}
-
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
-}
-
 /*
  * Adjust tick based cputime random precision against scheduler runtime
  * accounting.
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
 		goto update;
 	}
 
-	stime = scale_stime(stime, rtime, stime + utime);
+	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
 
 update:
 	/*
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 368ca7fd0d82..200a151e1be9 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -190,3 +190,42 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 	return __iter_div_u64_rem(dividend, divisor, remainder);
 }
 EXPORT_SYMBOL(iter_div_u64_rem);
+
+__weak u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+{
+	u64 res = 0, div, rem;
+	int shift;
+
+	/* can a * b overflow ? */
+	if (ilog2(a) + ilog2(b) > 62) {
+		/*
+		 * (b * a) / c is equal to
+		 *
+		 *	(b / c) * a +
+		 *	(b % c) * a / c
+		 *
+		 * if nothing overflows. Can the 1st multiplication
+		 * overflow? Yes, but we do not care: this can only
+		 * happen if the end result can't fit in u64 anyway.
+		 *
+		 * So the code below does
+		 *
+		 *	res = (b / c) * a;
+		 *	b = b % c;
+		 */
+		div = div64_u64_rem(b, c, &rem);
+		res = div * a;
+		b = rem;
+
+		shift = ilog2(a) + ilog2(b) - 62;
+		if (shift > 0) {
+			/* drop precision */
+			b >>= shift;
+			c >>= shift;
+			if (!c)
+				return res;
+		}
+	}
+
+	return res + div64_u64(a * b, c);
+}
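
As an aside, the redefinition complaint mentioned above can be
reproduced in isolation with something like the following (my
reduction, not the actual kernel headers; the snippet intentionally
does not compile):

	/* the helper made visible to the .c file as a static inline ... */
	static inline unsigned long long helper(unsigned long long a)
	{
		return a + 1;
	}

	/* ... plus a second, out-of-line definition of the same name in
	 * the same translation unit: gcc rejects the pair */
	__attribute__((weak)) unsigned long long helper(unsigned long long a)
	{
		return a * 2;
	}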

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-05-19 19:11       ` Peter Zijlstra
@ 2020-05-19 19:51         ` Linus Torvalds
  0 siblings, 0 replies; 29+ messages in thread
From: Linus Torvalds @ 2020-05-19 19:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Ingo Molnar, Thomas Gleixner, Andrew Fox,
	Stephen Johnston, Linux Kernel Mailing List, Stanislaw Gruszka

On Tue, May 19, 2020 at 12:12 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> My compiler doesn't like overriding a __weak with a static inline. It
> complains about redefinitions.
>
> It works with extern inline, though that is fairly rare in the
> kernel. Still, it compiles and generates the right code.

That's a tiny bit worrisome, because the compiler might just decide
not to inline at all and then you end up with the (relatively bad)
version that doesn't take advantage of the hardware capabilities. But
things will work, and not be absolutely horrid, I guess.

And you do use "always_inline" so I guess it's all fine.

For added protection, you might also mark the asm itself as
"asm_inline", which makes sure gcc thinks it's a small asm too.

Regardless, ack from me, with just a note on that small worry.

                 Linus
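
For reference, marking the asm as suggested would look something like
the sketch below (an illustration based on the helper from the patch,
not something posted in the thread; asm_inline is the kernel macro that
tells newer compilers to treat the asm as small for inlining purposes):

	extern __always_inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
	{
		u64 q;

		asm_inline ("mulq %2; divq %3" : "=a" (q)
					: "a" (a), "rm" (mul), "rm" (div)
					: "rdx");

		return q;
	}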

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-05-19 17:25   ` Peter Zijlstra
  2020-05-19 18:33     ` Linus Torvalds
@ 2020-05-20 15:24     ` Oleg Nesterov
  2020-05-20 15:36       ` Peter Zijlstra
  2020-06-16 12:21     ` [tip: sched/core] sched/cputime: Improve cputime_adjust() tip-bot2 for Oleg Nesterov
  2 siblings, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2020-05-20 15:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Stanislaw Gruszka, Linus Torvalds

On 05/19, Peter Zijlstra wrote:
>
> > The new implementation does the additional div64_u64_rem() but according
> > to my naive measurements it is faster on x86_64, much faster if rtime/etc
> > are big enough. See
> >
> > 	https://lore.kernel.org/lkml/20200123130541.GA30620@redhat.com/
>
> Right, so -m32 when run on x86_64 CPUs isn't really fair, because then
> it still has hardware fls() for ilog2() and massively fast multiply and
> divide instructions. Try and run this on a puny 32-bit ARM that maybe
> has a hardware multiplier.

OK,

> Anyway, how about we write it like the below and then when some puny
> architecture comes complaining we can use Linus' original algorithm for
> their arch implementation.

Sure, I am fine either way, but...

> +static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
>  {
>  	u64 q;
>  
>  	asm ("mulq %2; divq %3" : "=a" (q)
> -				: "a" (a), "rm" ((u64)mul), "rm" ((u64)div)
> +				: "a" (a), "rm" (mul), "rm" (div)
>  				: "rdx");

...

> +#ifndef mul_u64_u64_div_u64
> +static inline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
> +{
> +	u64 res = 0, div, rem;
> +	int shift;
> +
> +	/* can a * b overflow ? */
> +	if (ilog2(a) + ilog2(b) > 62) {
> +		/*
> +		 * (b * a) / c is equal to
> +		 *
> +		 *	(b / c) * a +
> +		 *	(b % c) * a / c
> +		 *
> +		 * if nothing overflows. Can the 1st multiplication
> +		 * overflow? Yes, but we do not care: this can only
> +		 * happen if the end result can't fit in u64 anyway.
> +		 *
> +		 * So the code below does
> +		 *
> +		 *	res = (b / c) * a;
> +		 *	b = b % c;
> +		 */
> +		div = div64_u64_rem(b, c, &rem);
> +		res = div * a;
> +		b = rem;
> +
> +		shift = ilog2(a) + ilog2(b) - 62;
> +		if (shift > 0) {
> +			/* drop precision */
> +			b >>= shift;
> +			c >>= shift;
> +			if (!c)
> +				return res;
> +		}
> +	}
> +
> +	return res + div64_u64(a * b, c);
> +}

Note that according to my measurements the "asm" version is slower than
the generic code above when "a * b" doesn't fit u64.

Nevermind, I agree with your version. Will you send this patch or do you
want me to make V3 ?

Oleg.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-05-20 15:24     ` Oleg Nesterov
@ 2020-05-20 15:36       ` Peter Zijlstra
  2020-05-20 20:10         ` Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-05-20 15:36 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Stanislaw Gruszka, Linus Torvalds

On Wed, May 20, 2020 at 05:24:40PM +0200, Oleg Nesterov wrote:
> Nevermind, I agree with your version. Will you send this patch or do you
> want me to make V3 ?

I got stuck writing a changelog; I am more scatter-brained than usual
due to lack of sleep. If you have a changelog that includes the
information rather than links to it, feel free to send it along and
I'll frob the thing together, or just send a v3 with everything in it.

Otherwise, I'll try again tomorrow :-)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-05-20 15:36       ` Peter Zijlstra
@ 2020-05-20 20:10         ` Peter Zijlstra
  2020-05-21 13:26           ` Oleg Nesterov
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2020-05-20 20:10 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Stanislaw Gruszka, Linus Torvalds

On Wed, May 20, 2020 at 05:36:03PM +0200, Peter Zijlstra wrote:
> On Wed, May 20, 2020 at 05:24:40PM +0200, Oleg Nesterov wrote:
> > Nevermind, I agree with your version. Will you send this patch or do you
> > want me to make V3 ?
> 
> I got stuck writing a changelog; I am more scatter-brained than usual
> due to lack of sleep. If you have a changelog that includes the
> information rather than links to it, feel free to send it along and
> I'll frob the thing together, or just send a v3 with everything in it.
> 
> Otherwise, I'll try again tomorrow :-)

I had another go; I ended up with this...

---
Subject: sched/cputime: Improve cputime_adjust()
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 19 May 2020 19:25:06 +0200

From: Oleg Nesterov <oleg@redhat.com>

People report that utime and stime from /proc/<pid>/stat become very
wrong when the numbers are big enough, especially if you watch these
counters incrementally.

Specifically, the current implementation of: stime*rtime/total,
results in a saw-tooth function on top of the desired line, where the
teeth grow in size the larger the values become. IOW, it has a
relative error.

The result is that, when watching incrementally as time progresses
(for large values), we'll see periods of pure stime or utime increase,
irrespective of the actual ratio we're striving for.

Replace scale_stime() with a math64.h helper: mul_u64_u64_div_u64()
that is far more accurate. This also allows architectures to override
the implementation -- for instance they can opt for the old algorithm
if this new one turns out to be too expensive for them.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/div64.h |   13 ++++++++++--
 include/linux/math64.h       |    2 +
 kernel/sched/cputime.c       |   46 -------------------------------------------
 lib/math/div64.c             |   39 ++++++++++++++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 47 deletions(-)

--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -74,16 +74,25 @@ static inline u64 mul_u32_u32(u32 a, u32
 #else
 # include <asm-generic/div64.h>
 
-static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+/*
+ * Will generate an #DE when the result doesn't fit u64, could fix with an
+ * __ex_table[] entry when it becomes an issue.
+ */
+extern __always_inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
 {
 	u64 q;
 
 	asm ("mulq %2; divq %3" : "=a" (q)
-				: "a" (a), "rm" ((u64)mul), "rm" ((u64)div)
+				: "a" (a), "rm" (mul), "rm" (div)
 				: "rdx");
 
 	return q;
 }
+
+static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+{
+	return mul_u64_u64_div_u64(a, mul, div);
+}
 #define mul_u64_u32_div	mul_u64_u32_div
 
 #endif /* CONFIG_X86_32 */
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -263,6 +263,8 @@ static inline u64 mul_u64_u32_div(u64 a,
 }
 #endif /* mul_u64_u32_div */
 
+u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+
 #define DIV64_U64_ROUND_UP(ll, d)	\
 	({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
 
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -520,50 +520,6 @@ void account_idle_ticks(unsigned long ti
 }
 
 /*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
-	u64 scaled;
-
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime)
-			swap(rtime, stime);
-
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
-
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
-		rtime >>= 1;
-		total >>= 1;
-	}
-
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
-}
-
-/*
  * Adjust tick based cputime random precision against scheduler runtime
  * accounting.
  *
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime
 		goto update;
 	}
 
-	stime = scale_stime(stime, rtime, stime + utime);
+	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
 
 update:
 	/*
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -190,3 +190,42 @@ u32 iter_div_u64_rem(u64 dividend, u32 d
 	return __iter_div_u64_rem(dividend, divisor, remainder);
 }
 EXPORT_SYMBOL(iter_div_u64_rem);
+
+__weak u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+{
+	u64 res = 0, div, rem;
+	int shift;
+
+	/* can a * b overflow ? */
+	if (ilog2(a) + ilog2(b) > 62) {
+		/*
+		 * (b * a) / c is equal to
+		 *
+		 *      (b / c) * a +
+		 *      (b % c) * a / c
+		 *
+		 * if nothing overflows. Can the 1st multiplication
+		 * overflow? Yes, but we do not care: this can only
+		 * happen if the end result can't fit in u64 anyway.
+		 *
+		 * So the code below does
+		 *
+		 *      res = (b / c) * a;
+		 *      b = b % c;
+		 */
+		div = div64_u64_rem(b, c, &rem);
+		res = div * a;
+		b = rem;
+
+		shift = ilog2(a) + ilog2(b) - 62;
+		if (shift > 0) {
+			/* drop precision */
+			b >>= shift;
+			c >>= shift;
+			if (!c)
+				return res;
+		}
+	}
+
+	return res + div64_u64(a * b, c);
+}

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] sched/cputime: make scale_stime() more precise
  2020-05-20 20:10         ` Peter Zijlstra
@ 2020-05-21 13:26           ` Oleg Nesterov
  0 siblings, 0 replies; 29+ messages in thread
From: Oleg Nesterov @ 2020-05-21 13:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Andrew Fox, Stephen Johnston,
	linux-kernel, Stanislaw Gruszka, Linus Torvalds

On 05/20, Peter Zijlstra wrote:
>
> I had a another go; I ended up with this...

Oh, thanks Peter!

Oleg.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [tip: sched/core] sched/cputime: Improve cputime_adjust()
  2020-05-19 17:25   ` Peter Zijlstra
  2020-05-19 18:33     ` Linus Torvalds
  2020-05-20 15:24     ` Oleg Nesterov
@ 2020-06-16 12:21     ` tip-bot2 for Oleg Nesterov
  2 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Oleg Nesterov @ 2020-06-16 12:21 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Oleg Nesterov, Peter Zijlstra (Intel), x86, LKML

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     3dc167ba5729ddd2d8e3fa1841653792c295d3f1
Gitweb:        https://git.kernel.org/tip/3dc167ba5729ddd2d8e3fa1841653792c295d3f1
Author:        Oleg Nesterov <oleg@redhat.com>
AuthorDate:    Tue, 19 May 2020 19:25:06 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 15 Jun 2020 14:10:00 +02:00

sched/cputime: Improve cputime_adjust()

People report that utime and stime from /proc/<pid>/stat become very
wrong when the numbers are big enough, especially if you watch these
counters incrementally.

Specifically, the current implementation of: stime*rtime/total,
results in a saw-tooth function on top of the desired line, where the
teeth grow in size the larger the values become. IOW, it has a
relative error.

The result is that, when watching incrementally as time progresses
(for large values), we'll see periods of pure stime or utime increase,
irrespective of the actual ratio we're striving for.

Replace scale_stime() with a math64.h helper: mul_u64_u64_div_u64()
that is far more accurate. This also allows architectures to override
the implementation -- for instance they can opt for the old algorithm
if this new one turns out to be too expensive for them.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200519172506.GA317395@hirez.programming.kicks-ass.net
---
 arch/x86/include/asm/div64.h | 14 +++++++++--
 include/linux/math64.h       |  2 ++-
 kernel/sched/cputime.c       | 46 +-----------------------------------
 lib/math/div64.c             | 41 +++++++++++++++++++++++++++++++-
 4 files changed, 56 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9b8cb50..b8f1dc0 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -74,16 +74,26 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
 #else
 # include <asm-generic/div64.h>
 
-static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+/*
+ * Will generate an #DE when the result doesn't fit u64, could fix with an
+ * __ex_table[] entry when it becomes an issue.
+ */
+static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
 {
 	u64 q;
 
 	asm ("mulq %2; divq %3" : "=a" (q)
-				: "a" (a), "rm" ((u64)mul), "rm" ((u64)div)
+				: "a" (a), "rm" (mul), "rm" (div)
 				: "rdx");
 
 	return q;
 }
+#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+
+static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+{
+	return mul_u64_u64_div_u64(a, mul, div);
+}
 #define mul_u64_u32_div	mul_u64_u32_div
 
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 11a2674..d097119 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -263,6 +263,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 }
 #endif /* mul_u64_u32_div */
 
+u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+
 #define DIV64_U64_ROUND_UP(ll, d)	\
 	({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff9435d..5a55d23 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -520,50 +520,6 @@ void account_idle_ticks(unsigned long ticks)
 }
 
 /*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
-	u64 scaled;
-
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime)
-			swap(rtime, stime);
-
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
-
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
-		rtime >>= 1;
-		total >>= 1;
-	}
-
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
-}
-
-/*
  * Adjust tick based cputime random precision against scheduler runtime
  * accounting.
  *
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
 		goto update;
 	}
 
-	stime = scale_stime(stime, rtime, stime + utime);
+	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
 
 update:
 	/*
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 368ca7f..3952a07 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -190,3 +190,44 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 	return __iter_div_u64_rem(dividend, divisor, remainder);
 }
 EXPORT_SYMBOL(iter_div_u64_rem);
+
+#ifndef mul_u64_u64_div_u64
+u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+{
+	u64 res = 0, div, rem;
+	int shift;
+
+	/* can a * b overflow ? */
+	if (ilog2(a) + ilog2(b) > 62) {
+		/*
+		 * (b * a) / c is equal to
+		 *
+		 *      (b / c) * a +
+		 *      (b % c) * a / c
+		 *
+		 * if nothing overflows. Can the 1st multiplication
+		 * overflow? Yes, but we do not care: this can only
+		 * happen if the end result can't fit in u64 anyway.
+		 *
+		 * So the code below does
+		 *
+		 *      res = (b / c) * a;
+		 *      b = b % c;
+		 */
+		div = div64_u64_rem(b, c, &rem);
+		res = div * a;
+		b = rem;
+
+		shift = ilog2(a) + ilog2(b) - 62;
+		if (shift > 0) {
+			/* drop precision */
+			b >>= shift;
+			c >>= shift;
+			if (!c)
+				return res;
+		}
+	}
+
+	return res + div64_u64(a * b, c);
+}
+#endif

^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2020-06-16 12:22 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-18 13:18 [PATCH] sched/cputime: make scale_stime() more precise Oleg Nesterov
2019-07-18 13:21 ` Oleg Nesterov
2019-07-18 14:55 ` Oleg Nesterov
2019-07-19 11:03 ` Peter Zijlstra
2019-07-19 13:47   ` Peter Zijlstra
2019-07-19 14:37     ` Oleg Nesterov
2019-07-22 19:56       ` Peter Zijlstra
2019-07-23 14:00         ` Oleg Nesterov
2019-07-23 14:29           ` Oleg Nesterov
2019-07-19 14:03   ` Oleg Nesterov
2019-07-22 19:45     ` Peter Zijlstra
2019-07-22 10:52   ` Stanislaw Gruszka
2019-07-22 20:00     ` Peter Zijlstra
2019-07-23  9:37       ` Stanislaw Gruszka
2020-01-22 16:46 ` Oleg Nesterov
2020-01-23 13:05   ` Oleg Nesterov
2020-01-24 15:42   ` Oleg Nesterov
2020-01-27 12:28 ` [PATCH v2] " Oleg Nesterov
2020-05-15 17:24   ` Oleg Nesterov
2020-05-19 17:25   ` Peter Zijlstra
2020-05-19 18:33     ` Linus Torvalds
2020-05-19 18:42       ` Peter Zijlstra
2020-05-19 19:11       ` Peter Zijlstra
2020-05-19 19:51         ` Linus Torvalds
2020-05-20 15:24     ` Oleg Nesterov
2020-05-20 15:36       ` Peter Zijlstra
2020-05-20 20:10         ` Peter Zijlstra
2020-05-21 13:26           ` Oleg Nesterov
2020-06-16 12:21     ` [tip: sched/core] sched/cputime: Improve cputime_adjust() tip-bot2 for Oleg Nesterov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).