sched/cputime: make scale_stime() more precise

Message ID 20190718131834.GA22211@redhat.com
State New
Series
  • sched/cputime: make scale_stime() more precise

Commit Message

Oleg Nesterov July 18, 2019, 1:18 p.m. UTC
People report that utime and stime from /proc/<pid>/stat become very wrong
when the numbers are big enough. In particular, the monitored application
can run all the time in user-space but only stime grows.

This is because scale_stime() is very inaccurate. It tries to minimize the
relative error, but the absolute error can be huge.

Andrew wrote the test-case:

	int main(int argc, char **argv)
	{
	    struct task_cputime c;
	    struct prev_cputime p;
	    u64 st, pst, cst;
	    u64 ut, put, cut;
	    u64 x;
	    int i = -1; // one step not printed

	    if (argc != 2)
	    {
		printf("usage: %s <start_in_seconds>\n", argv[0]);
		return 1;
	    }
	    x = strtoull(argv[1], NULL, 0) * SEC;
	    printf("start=%lld\n", x);

	    p.stime = 0;
	    p.utime = 0;

	    while (i++ < NSTEPS)
	    {
		x += STEP;
		c.stime = x;
		c.utime = x;
		c.sum_exec_runtime = x + x;
		pst = cputime_to_clock_t(p.stime);
		put = cputime_to_clock_t(p.utime);
		cputime_adjust(&c, &p, &ut, &st);
		cst = cputime_to_clock_t(st);
		cut = cputime_to_clock_t(ut);
		if (i)
		    printf("ut(diff)/st(diff): %20lld (%4lld)  %20lld (%4lld)\n",
			cut, cut - put, cst, cst - pst);
	    }
	}
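
The test-case is not self-contained; it is presumably built against userspace
copies of cputime_adjust(), scale_stime() and the related structs from
kernel/sched/cputime.c, plus defines along these lines (values inferred from
the output below, so treat them as an assumption rather than what Andrew
actually used):

	typedef unsigned long long u64;
	#define SEC	1000000000ULL		/* ns per second */
	#define STEP	SEC			/* add 1s to stime and utime per step */
	#define NSTEPS	24			/* number of printed lines below */
	/* assume 1ms "clock ticks": start=300000s -> ~300000000 below */
	#define cputime_to_clock_t(ns)	((ns) / 1000000)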

For example,

	$ ./stime 300000
	start=300000000000000
	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)

shows the problem even when sum_exec_runtime is not that big: 300000 secs.
Since stime == utime in this test-case, ut and st should each grow by the
same amount on every step; instead, for long stretches all of the growth
goes into just one of them.

The new implementation of scale_stime() does an additional div64_u64_rem() in
a loop, but see the comment: as long as it is used by cputime_adjust() this
can happen only once.
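
For example, with this patch applied, the test-case inputs stime = utime = x,
rtime = 2*x (x big enough that stime * rtime overflows u64) are handled
exactly; a quick trace of the new code:

	scale_stime(x, 2*x, 2*x):
		ilog2(x) + ilog2(2*x) > 62	-> enter the loop
		rtime >= total			-> div = 1, rem = 0
		res = x, rtime = 0		-> loop condition is now false
		return res + (x * 0) / (2*x)	-> exactly x

so ut and st each grow by the same amount on every step, as expected.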

Reported-by: Andrew Fox <afox@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/sched/cputime.c | 66 ++++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 29 deletions(-)

Comments

Oleg Nesterov July 18, 2019, 1:21 p.m. UTC | #1
To simplify the review, see the code with this patch applied:

/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow
 * by losing precision when the numbers are big.
 *
 * NOTE! currently the only user is cputime_adjust() and thus
 *
 *	stime < total && rtime > total
 *
 * this means that the end result is always precise and the additional
 * div64_u64_rem() inside the main loop is called at most once.
 */
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;

	/* can stime * rtime overflow ? */
	while (ilog2(stime) + ilog2(rtime) > 62) {
		if (stime > rtime)
			swap(rtime, stime);

		if (rtime >= total) {
			/*
			 * (rtime * stime) / total is equal to
			 *
			 *	(rtime / total) * stime +
			 *	(rtime % total) * stime / total
			 *
			 * if nothing overflows. Can the 1st multiplication
			 * overflow? Yes, but we do not care: this can only
			 * happen if the end result can't fit in u64 anyway.
			 *
			 * So the code below does
			 *
			 *	res += (rtime / total) * stime;
			 *	rtime = rtime % total;
			 */
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;
			continue;
		}

		/* drop precision */
		rtime >>= 1;
		total >>= 1;
		if (!total)
			return res;
	}

	return res + div64_u64(stime * rtime, total);
}
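
The decomposition used in the main loop is exact in integer arithmetic:
with rtime = q * total + r (0 <= r < total),

	(rtime * stime) / total == q * stime + (r * stime) / total

A minimal userspace sketch to check it, with small values so that the exact
product still fits in u64 (not part of the patch):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t rtime = 5000017, stime = 999983, total = 1234567;
		uint64_t exact = rtime * stime / total;
		uint64_t split = (rtime / total) * stime +
				 (rtime % total) * stime / total;

		assert(exact == split);
		return 0;
	}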
Oleg Nesterov July 18, 2019, 2:55 p.m. UTC | #2
On 07/18, Oleg Nesterov wrote:
>
> + * NOTE! currently the only user is cputime_adjust() and thus
> + *
> + *	stime < total && rtime > total
> + *
> + * this means that the end result is always precise and the additional
> + * div64_u64_rem() inside the main loop is called at most once.

Ah, I just noticed that the comment is not 100% correct... in theory we
can drop the precision and even do div64_u64_rem() more than once, but this
can only happen if stime or total (= stime + utime) is "really" huge; I don't
think this can happen in practice...

We can probably just do

	static u64 scale_stime(u64 stime, u64 rtime, u64 total)
	{
		u64 res = 0, div, rem;

		if (ilog2(stime) + ilog2(rtime) > 62) {
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;

			int shift = ilog2(stime) + ilog2(rtime) - 62;
			if (shift > 0) {
				rtime >>= shift;
				total >>= shift;
				if (!total)
					return res;
			}
		}

		return res + div64_u64(stime * rtime, total);
	}

but this way the code looks less generic.

Oleg.
Peter Zijlstra July 19, 2019, 11:03 a.m. UTC | #3
On Thu, Jul 18, 2019 at 03:18:34PM +0200, Oleg Nesterov wrote:
> People report that utime and stime from /proc/<pid>/stat become very wrong
> when the numbers are big enough. In particular, the monitored application
> can run all the time in user-space but only stime grows.
> 
> This is because scale_stime() is very inaccurate. It tries to minimize the
> relative error, but the absolute error can be huge.
> 
> Andrew wrote the test-case:
> 
> 	int main(int argc, char **argv)
> 	{
> 	    struct task_cputime c;
> 	    struct prev_cputime p;
> 	    u64 st, pst, cst;
> 	    u64 ut, put, cut;
> 	    u64 x;
> 	    int i = -1; // one step not printed
> 
> 	    if (argc != 2)
> 	    {
> 		printf("usage: %s <start_in_seconds>\n", argv[0]);
> 		return 1;
> 	    }
> 	    x = strtoull(argv[1], NULL, 0) * SEC;
> 	    printf("start=%lld\n", x);
> 
> 	    p.stime = 0;
> 	    p.utime = 0;
> 
> 	    while (i++ < NSTEPS)
> 	    {
> 		x += STEP;
> 		c.stime = x;
> 		c.utime = x;
> 		c.sum_exec_runtime = x + x;
> 		pst = cputime_to_clock_t(p.stime);
> 		put = cputime_to_clock_t(p.utime);
> 		cputime_adjust(&c, &p, &ut, &st);
> 		cst = cputime_to_clock_t(st);
> 		cut = cputime_to_clock_t(ut);
> 		if (i)
> 		    printf("ut(diff)/st(diff): %20lld (%4lld)  %20lld (%4lld)\n",
> 			cut, cut - put, cst, cst - pst);
> 	    }
> 	}
> 
> For example,
> 
> 	$ ./stime 300000
> 	start=300000000000000
> 	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
> 	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
> 	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
> 	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
> 	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
> 	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
> 	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
> 	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)
> 
> shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> 
> The new implementation of scale_stime() does the additional div64_u64_rem()
> in a loop but see the comment, as long it is used by cputime_adjust() this
> can happen only once.

That only shows something after long long staring :/ There's no words on
what the output actually means or what would've been expected.

Also, your example is incomplete; the below is a test for scale_stime();
from this we can see that the division results in too large a number,
but, important for our use-case in cputime_adjust(), it is a step
function (due to loss in precision) and for every plateau we shift
runtime into the wrong bucket.

Your proposed function works; but is atrocious, esp. on 32bit. That
said, before we 'fixed' it, it had similar horrible divisions in, see
commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").

Included below is also an x86_64 implementation in 2 instructions.

I'm still trying to see if there's anything saner we can do...

---
#include <stdio.h>
#include <stdlib.h>

#define   noinline                      __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;

static noinline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
{
	u64 q;
	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
	return q;
}

static u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder);

static inline u64 div_u64(u64 dividend, u32 divisor)
{
	u32 remainder;
	return div_u64_rem(dividend, divisor, &remainder);
}

static __always_inline int fls(unsigned int x)
{
	return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
}

#if 0
static u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	union {
		u64 v64;
		u32 v32[2];
	} d = { dividend };
	u32 upper;

	upper = d.v32[1];
	d.v32[1] = 0;
	if (upper >= divisor) {
		d.v32[1] = upper / divisor;
		upper %= divisor;
	}
	asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
		"rm" (divisor), "0" (d.v32[0]), "1" (upper));
	return d.v64;
}
static u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		u32 rem32;
		quot = div_u64_rem(dividend, divisor, &rem32);
		*remainder = rem32;
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;

		*remainder = dividend - quot * divisor;
		if (*remainder >= divisor) {
			quot++;
			*remainder -= divisor;
		}
	}

	return quot;
}
static u64 div64_u64(u64 dividend, u64 divisor)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		quot = div_u64(dividend, divisor);
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;
		if ((dividend - quot * divisor) >= divisor)
			quot++;
	}

	return quot;
}
#else
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	*remainder = dividend % divisor;
	return dividend / divisor;
}
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
	return dividend / divisor;
}
#endif

static __always_inline int fls64(u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}

static inline int ilog2(u64 n)
{
	return fls64(n) - 1;
}

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

static noinline u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((stime * rtime), total);
	return scaled;
}

static noinline u64 oleg(u64 stime, u64 rtime, u64 total)
{
	u64 res = 0, div, rem;
	/* can stime * rtime overflow ? */
	while (ilog2(stime) + ilog2(rtime) > 62) {
		if (stime > rtime)
			swap(rtime, stime);
		if (rtime >= total) {
			/*
			 * (rtime * stime) / total is equal to
			 *
			 *      (rtime / total) * stime +
			 *      (rtime % total) * stime / total
			 *
			 * if nothing overflows. Can the 1st multiplication
			 * overflow? Yes, but we do not care: this can only
			 * happen if the end result can't fit in u64 anyway.
			 *
			 * So the code below does
			 *
			 *      res += (rtime / total) * stime;
			 *      rtime = rtime % total;
			 */
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;
			continue;
		}
		/* drop precision */
		rtime >>= 1;
		total >>= 1;
		if (!total)
			return res;
	}
	return res + div64_u64(stime * rtime, total);
}

#define SEC	1000000000ULL

int main(int argc, char **argv)
{
	u64 u, s;
	u64 x;
	int i = -1; // one step not printed
	if (argc != 2)
	{
		printf("usage: %s <start_in_seconds>\n", argv[0]);
		return 1;
	}
	x = strtoull(argv[1], NULL, 0) * SEC;
	printf("start=%lld\n", x);

	for (i=0; i<50; i++, x += 2000) {
		printf("%lld = %lld * %lld / %lld\n", mul_u64_u64_div_u64(x, x+x, x+x), x, x+x, x+x);
		printf("%lld = %lld * %lld / %lld\n", scale_stime(x, x+x, x+x), x, x+x, x+x);
		printf("%lld = %lld * %lld / %lld\n", oleg(x, x+x, x+x), x, x+x, x+x);
		printf("---\n");
	}
}
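
(The program above uses x86_64 inline asm for mul_u64_u64_div_u64() and
fls64(), so it is assumed to be built natively on x86_64, e.g. with
something like "gcc -O2 -o scale-test test.c", and run with the same
<start_in_seconds> argument as the stime test above.)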
Peter Zijlstra July 19, 2019, 1:47 p.m. UTC | #4
On Fri, Jul 19, 2019 at 01:03:49PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 18, 2019 at 03:18:34PM +0200, Oleg Nesterov wrote:
> > People report that utime and stime from /proc/<pid>/stat become very wrong
> > when the numbers are big enough. In particular, the monitored application
> > can run all the time in user-space but only stime grows.
> > 
> > This is because scale_stime() is very inaccurate. It tries to minimize the
> > relative error, but the absolute error can be huge.
> > 
> > Andrew wrote the test-case:
> > 
> > 	int main(int argc, char **argv)
> > 	{
> > 	    struct task_cputime c;
> > 	    struct prev_cputime p;
> > 	    u64 st, pst, cst;
> > 	    u64 ut, put, cut;
> > 	    u64 x;
> > 	    int i = -1; // one step not printed
> > 
> > 	    if (argc != 2)
> > 	    {
> > 		printf("usage: %s <start_in_seconds>\n", argv[0]);
> > 		return 1;
> > 	    }
> > 	    x = strtoull(argv[1], NULL, 0) * SEC;
> > 	    printf("start=%lld\n", x);
> > 
> > 	    p.stime = 0;
> > 	    p.utime = 0;
> > 
> > 	    while (i++ < NSTEPS)
> > 	    {
> > 		x += STEP;
> > 		c.stime = x;
> > 		c.utime = x;
> > 		c.sum_exec_runtime = x + x;
> > 		pst = cputime_to_clock_t(p.stime);
> > 		put = cputime_to_clock_t(p.utime);
> > 		cputime_adjust(&c, &p, &ut, &st);
> > 		cst = cputime_to_clock_t(st);
> > 		cut = cputime_to_clock_t(ut);
> > 		if (i)
> > 		    printf("ut(diff)/st(diff): %20lld (%4lld)  %20lld (%4lld)\n",
> > 			cut, cut - put, cst, cst - pst);
> > 	    }
> > 	}
> > 
> > For example,
> > 
> > 	$ ./stime 300000
> > 	start=300000000000000
> > 	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
> > 	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
> > 	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)
> > 
> > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > 
> > The new implementation of scale_stime() does the additional div64_u64_rem()
> > in a loop but see the comment, as long it is used by cputime_adjust() this
> > can happen only once.
> 
> That only shows something after long long staring :/ There's no words on
> what the output actually means or what would've been expected.
> 
> Also, your example is incomplete; the below is a test for scale_stime();
> from this we can see that the division results in too large a number,
> but, important for our use-case in cputime_adjust(), it is a step
> function (due to loss in precision) and for every plateau we shift
> runtime into the wrong bucket.

But I'm still confused, since in the long run, it should still end up
with a proportionally divided user/system, irrespective of some short
term wobblies.

So please, better articulate the problem.
Oleg Nesterov July 19, 2019, 2:03 p.m. UTC | #5
On 07/19, Peter Zijlstra wrote:
>
> On Thu, Jul 18, 2019 at 03:18:34PM +0200, Oleg Nesterov wrote:
> >
> > 	$ ./stime 300000
> > 	start=300000000000000
> > 	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
> > 	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
> > 	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
> > 	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
> > 	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
> > 	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)
> >
> > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> >
> > The new implementation of scale_stime() does the additional div64_u64_rem()
> > in a loop but see the comment, as long it is used by cputime_adjust() this
> > can happen only once.
>
> That only shows something after long long staring :/ There's no words on
> what the output actually means or what would've been expected.

Sorry, I should have explained it in more details,

> Also, your example is incomplete; the below is a test for scale_stime();
> from this we can see that the division results in too large a number,
> but, important for our use-case in cputime_adjust(), it is a step
> function (due to loss in precision) and for every plateau we shift
> runtime into the wrong bucket.

Yes.

> Your proposed function works; but is atrocious,

Agreed,

> esp. on 32bit.

Yes... but let's compare it with the current implementation. To simplify,
let's look at the "less generic" version I showed in reply to this patch:

	static u64 scale_stime(u64 stime, u64 rtime, u64 total)
	{
		u64 res = 0, div, rem;

		if (ilog2(stime) + ilog2(rtime) > 62) {
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;

			int shift = ilog2(stime) + ilog2(rtime) - 62;
			if (shift > 0) {
				rtime >>= shift;
				total >>= shift;
				if (!total)
					return res;
			}
		}

		return res + div64_u64(stime * rtime, total);
	}

So, if stime * rtime overflows it does div64_u64() twice while the
current version does a single div_u64() == do_div() (on 32bit).

Even a single div64_u64() is more expensive than do_div() but afaics
a) not too much and b) only if divisor == total doesn't fit in 32bit
and I think this is unlikely.

So I'd say it makes scale_stime() approximately twice as expensive
on 32bit. But hopefully it fixes the problem.

> Included below is also an x86_64 implementation in 2 instructions.

But we need the arch-neutral implementation anyway, the code above
is the best I could invent.

But see below!

> I'm still trying see if there's anything saner we can do...

Oh, please, it is not that I like my solution very much, I would like
to see something more clever.

> static noinline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
> {
> 	u64 q;
> 	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
> 	return q;
> }

Heh. I have to admit that I didn't know that divq divides 128bit by
64bit. gcc calls the __udivti3 intrinsic in this case, so I wrongly
came to the conclusion that this is not simple even on x86_64. Plus the
fact that linux/math64.h only has mul_u64_u64_shr()...

IIUC, the new helper above is not "safe": it generates an exception
if the result doesn't fit in 64bit. But scale_stime() can safely use
it because stime < total, so the result is less than rtime and always
fits in 64bit.

So maybe we can do

	static u64 scale_stime(u64 stime, u64 rtime, u64 total)
	{
		u64 res = 0, div, rem;

		#ifdef mul_u64_u64_div_u64
		return mul_u64_u64_div_u64(stime, rtime, total);
		#endif

		if (ilog2(stime) + ilog2(rtime) > 62) {
			div = div64_u64_rem(rtime, total, &rem);
			res += div * stime;
			rtime = rem;

			int shift = ilog2(stime) + ilog2(rtime) - 62;
			if (shift > 0) {
				rtime >>= shift;
				total >>= shift;
				if (!total)
					return res;
			}
		}

		return res + div64_u64(stime * rtime, total);
	}

?

Oleg.
Oleg Nesterov July 19, 2019, 2:37 p.m. UTC | #6
On 07/19, Peter Zijlstra wrote:
>
> > > 	$ ./stime 300000
> > > 	start=300000000000000
> > > 	ut(diff)/st(diff):            299994875 (   0)             300009124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300011124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300013124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300015124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300017124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300019124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300021124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300023124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300025124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300027124 (2000)
> > > 	ut(diff)/st(diff):            299994875 (   0)             300029124 (2000)
> > > 	ut(diff)/st(diff):            299996875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            299998875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300000875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300002875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300004875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300006875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300008875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300010875 (2000)             300029124 (   0)
> > > 	ut(diff)/st(diff):            300012055 (1180)             300029944 ( 820)
> > > 	ut(diff)/st(diff):            300012055 (   0)             300031944 (2000)
> > > 	ut(diff)/st(diff):            300012055 (   0)             300033944 (2000)
> > > 	ut(diff)/st(diff):            300012055 (   0)             300035944 (2000)
> > > 	ut(diff)/st(diff):            300012055 (   0)             300037944 (2000)
> > >
> > > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > >
> > > The new implementation of scale_stime() does the additional div64_u64_rem()
> > > in a loop but see the comment, as long it is used by cputime_adjust() this
> > > can happen only once.
> >
> > That only shows something after long long staring :/ There's no words on
> > what the output actually means or what would've been expected.
> >
> > Also, your example is incomplete; the below is a test for scale_stime();
> > from this we can see that the division results in too large a number,
> > but, important for our use-case in cputime_adjust(), it is a step
> > function (due to loss in precision) and for every plateau we shift
> > runtime into the wrong bucket.
>
> But I'm still confused, since in the long run, it should still end up
> with a proportionally divided user/system, irrespective of some short
> term wobblies.

Why?

Yes, statistically the numbers are proportionally divided.

but you will (probably) never see the real stime == 1000 && utime == 10000
numbers if you watch incrementally.

Just in case... yes, I know that these numbers can only "converge" to
reality; only their sum is correct. But people complain.

Oleg.
Stanislaw Gruszka July 22, 2019, 10:52 a.m. UTC | #7
On Fri, Jul 19, 2019 at 01:03:49PM +0200, Peter Zijlstra wrote:
> > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > 
> > The new implementation of scale_stime() does the additional div64_u64_rem()
> > in a loop but see the comment, as long it is used by cputime_adjust() this
> > can happen only once.
> 
> That only shows something after long long staring :/ There's no words on
> what the output actually means or what would've been expected.
> 
> Also, your example is incomplete; the below is a test for scale_stime();
> from this we can see that the division results in too large a number,
> but, important for our use-case in cputime_adjust(), it is a step
> function (due to loss in precision) and for every plateau we shift
> runtime into the wrong bucket.
> 
> Your proposed function works; but is atrocious, esp. on 32bit. That
> said, before we 'fixed' it, it had similar horrible divisions in, see
> commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").
> 
> Included below is also an x86_64 implementation in 2 instructions.
> 
> I'm still trying see if there's anything saner we can do...

I was always a proponent of removing scaling and exporting the raw values
and sum_exec_runtime. But that has an obvious drawback: it reintroduces
the 'top hiding' issue.

But maybe we can export the raw values in a separate file, e.g.
/proc/[pid]/raw_cpu_times? So applications that require more precise
cputime values for very long-living processes can use this file.

Stanislaw
Peter Zijlstra July 22, 2019, 7:45 p.m. UTC | #8
On Fri, Jul 19, 2019 at 04:03:25PM +0200, Oleg Nesterov wrote:
> On 07/19, Peter Zijlstra wrote:
> > Included below is also an x86_64 implementation in 2 instructions.
> 
> But we need the arch-neutral implementation anyway, the code above
> is the best I could invent.

Agreed; we do. Depending on the cost of division and if the arch has a
64x64->128 mult, it might be better to compute a reciprocal and multiply
that, but yes, long staring didn't get me much better ideas either.

> But see below!
> 
> > I'm still trying see if there's anything saner we can do...
> 
> Oh, please, it is not that I like my solution very much, I would like
> to see something more clever.
> 
> > static noinline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
> > {
> > 	u64 q;
> > 	asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : "rdx");
> > 	return q;
> > }
> 
> Heh. I have to admit that I didn't know that divq divides 128bit by
> 64bit. gcc calls the __udivti3 intrinsic in this case so I wrongly
> came to conclusion this is not simple even on x86_64. Plus the fact
> that linux/math64.h only has mul_u64_u64_shr()...

C wants to promote the dividend and divisor to the same type (int128)
and then it runs into trouble.

But yeah, I don't know how many other 64bit archs can pull off that
trick. I asked, and ARGH64 cannot do that 128/64 (although it can do a
64x64->128 in two instructions).
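
For such archs a portable fallback is possible wherever the compiler
provides __int128 -- gcc then emits the __udivti3 call mentioned above.
A minimal sketch, assuming __int128 support, not the kernel's actual
helper:

	static inline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
	{
		/* 64x64->128 multiply, then a 128/64 divide (__udivti3);
		 * unlike divq this silently truncates if the quotient does
		 * not fit in 64 bits, which is fine for stime < total */
		return (u64)(((unsigned __int128)a * b) / c);
	}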
Peter Zijlstra July 22, 2019, 7:56 p.m. UTC | #9
On Fri, Jul 19, 2019 at 04:37:42PM +0200, Oleg Nesterov wrote:
> On 07/19, Peter Zijlstra wrote:

> > But I'm still confused, since in the long run, it should still end up
> > with a proportionally divided user/system, irrespective of some short
> > term wobblies.
> 
> Why?
> 
> Yes, statistically the numbers are proportionally divided.

This; due to the loss in precision the distribution is like a step
function around the actual s:u ratio line, but on average it still is
s:u.

Even if it were a perfect function, we'd still see increments in stime even
if the current program state never does syscalls, simply because it
needs to stay on that s:u line.

> but you will (probably) never see the real stime == 1000 && utime == 10000
> numbers if you watch incrementally.

See, there are no 'real' stime and utime numbers. What we have are user
and system samples -- tick based.

If the tick lands in the kernel, we get a system sample, if the tick
lands in userspace we get a user sample.

What we do have is an accurate (ns) based runtime accounting, and we
(re)construct stime and utime from this; we divide the total known
runtime into stime and utime pro-rata.

Sure, we take a shortcut, it wobbles a bit, but seriously, the samples
are inaccurate anyway, so who bloody cares :-)

You can construct a program that runs 99% in userspace but has all
system samples. All you need to do is make sure you're in a system call
when the tick lands.

> Just in case... yes I know that these numbers can only "converge" to the
> reality, only their sum is correct. But people complain.

People always complain, just tell em to go pound sand :-)
Peter Zijlstra July 22, 2019, 8 p.m. UTC | #10
On Mon, Jul 22, 2019 at 12:52:41PM +0200, Stanislaw Gruszka wrote:
> On Fri, Jul 19, 2019 at 01:03:49PM +0200, Peter Zijlstra wrote:
> > > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > > 
> > > The new implementation of scale_stime() does the additional div64_u64_rem()
> > > in a loop but see the comment, as long it is used by cputime_adjust() this
> > > can happen only once.
> > 
> > That only shows something after long long staring :/ There's no words on
> > what the output actually means or what would've been expected.
> > 
> > Also, your example is incomplete; the below is a test for scale_stime();
> > from this we can see that the division results in too large a number,
> > but, important for our use-case in cputime_adjust(), it is a step
> > function (due to loss in precision) and for every plateau we shift
> > runtime into the wrong bucket.
> > 
> > Your proposed function works; but is atrocious, esp. on 32bit. That
> > said, before we 'fixed' it, it had similar horrible divisions in, see
> > commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").
> > 
> > Included below is also an x86_64 implementation in 2 instructions.
> > 
> > I'm still trying see if there's anything saner we can do...
> 
> I was always proponent of removing scaling and export raw values
> and sum_exec_runtime. But that has obvious drawback, reintroduce
> 'top hiding' issue.

I think (but didn't grep) that we actually export sum_exec_runtime in
/proc/ *somewhere*.

> But maybe we can export raw values in separate file i.e.
> /proc/[pid]/raw_cpu_times ? So applications that require more precise
> cputime values for very long-living processes can use this file.

There are no raw cpu_times, there are system and user samples, and
samples are, by their very nature, an approximation. We just happen to
track the samples in TICK_NSEC resolution these days, but they're still
ticks (except on s390 and maybe other archs, which do time accounting in
the syscall path).

But I think you'll find x86 people are quite opposed to doing TSC reads
in syscall entry and exit :-)
Stanislaw Gruszka July 23, 2019, 9:37 a.m. UTC | #11
On Mon, Jul 22, 2019 at 10:00:34PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 22, 2019 at 12:52:41PM +0200, Stanislaw Gruszka wrote:
> > On Fri, Jul 19, 2019 at 01:03:49PM +0200, Peter Zijlstra wrote:
> > > > shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> > > > 
> > > > The new implementation of scale_stime() does the additional div64_u64_rem()
> > > > in a loop but see the comment, as long it is used by cputime_adjust() this
> > > > can happen only once.
> > > 
> > > That only shows something after long long staring :/ There's no words on
> > > what the output actually means or what would've been expected.
> > > 
> > > Also, your example is incomplete; the below is a test for scale_stime();
> > > from this we can see that the division results in too large a number,
> > > but, important for our use-case in cputime_adjust(), it is a step
> > > function (due to loss in precision) and for every plateau we shift
> > > runtime into the wrong bucket.
> > > 
> > > Your proposed function works; but is atrocious, esp. on 32bit. That
> > > said, before we 'fixed' it, it had similar horrible divisions in, see
> > > commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").
> > > 
> > > Included below is also an x86_64 implementation in 2 instructions.
> > > 
> > > I'm still trying see if there's anything saner we can do...
> > 
> > I was always proponent of removing scaling and export raw values
> > and sum_exec_runtime. But that has obvious drawback, reintroduce
> > 'top hiding' issue.
> 
> I think (but didn't grep) that we actually export sum_exec_runtime in
> /proc/ *somewhere*.

Yes, it is, in /proc/[PID]/schedstat for CONFIG_SCHEDSTAT=y and in
/proc/[PID]/sched for CONFIG_SCHED_DEBUG=y

> > But maybe we can export raw values in separate file i.e.
> > /proc/[pid]/raw_cpu_times ? So applications that require more precise
> > cputime values for very long-living processes can use this file.
> 
> There are no raw cpu_times, there are system and user samples, and
> samples are, by their very nature, an approximation. We just happen to
> track the samples in TICK_NSEC resolution these days, but they're still
> ticks (except on s390 and maybe other archs, which do time accounting in
> the syscall path).
> 
> But I think you'll find x86 people are quite opposed to doing TSC reads
> in syscall entry and exit :-)

By 'raw' I meant the values that are stored in task_struct without
processing by cputime_adjust() -> scale_stime(). The idea is that
user space can do the scaling using floating point, so it does not
have to drop precision when the numbers are big.
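
A minimal sketch of what such userspace scaling could look like (my
assumption of the interface: raw stime/utime samples plus sum_exec_runtime,
all read from the new file):

	#include <stdint.h>

	static void scale_raw(uint64_t stime, uint64_t utime, uint64_t rtime,
			      uint64_t *out_st, uint64_t *out_ut)
	{
		double total = (double)stime + (double)utime;

		if (total == 0.0) {
			*out_st = 0;
			*out_ut = rtime;
			return;
		}
		/* split rtime pro-rata; double has ~53 bits of precision,
		 * so even huge values do not degenerate into plateaus */
		*out_st = (uint64_t)((double)rtime * ((double)stime / total));
		*out_ut = rtime - *out_st;
	}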

That was discussed on my RFC and PATCH series posts:
https://lore.kernel.org/lkml/1364489605-5443-1-git-send-email-sgruszka@redhat.com/
https://lore.kernel.org/lkml/1365066635-2959-1-git-send-email-sgruszka@redhat.com/

There was an objection that even if the kernel does not tell what the
utime/stime values mean exactly, users have already got used to the scaled
behaviour and changing this is a 'semantic' ABI breakage. And obviously
this would make the 'top hiding' worm work again for an unpatched top.

So perhaps we can add new exports of the not-scaled utime/stime and achieve
the same goal without changing the meaning of the existing fields. On the
other hand, maybe nowadays better tools exist to provide exact cputimes
to user space, e.g. 'perf stat' or bpf, so the proposed addition is unneeded.

Stanislaw
Oleg Nesterov July 23, 2019, 2 p.m. UTC | #12
On 07/22, Peter Zijlstra wrote:
>
> On Fri, Jul 19, 2019 at 04:37:42PM +0200, Oleg Nesterov wrote:
> > On 07/19, Peter Zijlstra wrote:
>
> > > But I'm still confused, since in the long run, it should still end up
> > > with a proportionally divided user/system, irrespective of some short
> > > term wobblies.
> >
> > Why?
> >
> > Yes, statistically the numbers are proportionally divided.
>
> This; due to the loss in precision the distribution is like a step
> function around the actual s:u ratio line, but on average it still is
> s:u.

You know, I am no longer sure... perhaps it is even worse, I need to recheck.

> Even if it were a perfect function, we'd still see increments in stime even
> if the current program state never does syscalls, simply because it
> needs to stay on that s:u line.
>
> > but you will (probably) never see the real stime == 1000 && utime == 10000
> > numbers if you watch incrementally.
>
> See, there are no 'real' stime and utime numbers. What we have are user
> and system samples -- tick based.

Yes, yes, I know.

> Sure, we take a shortcut, it wobbles a bit, but seriously, the samples
> are inaccurate anyway, so who bloody cares :-)
...
> People always complain, just tell em to go pound sand :-)

I tried ;) this was my initial reaction to this bug report.

However,

> You can construct a program that runs 99% in userspace but has all
> system samples.

Yes, but with the current implementation you do not need to construct
such a program; this is what you can easily get "in practice". And this
confuses people.

They can watch /proc/pid/stat incrementally and (when the numbers are big)
find that a program that runs 100% in userspace somehow spends 10 minutes
almost entirely in kernel. Or at least more in kernel than in userspace.
Even if task->stime doesn't grow at all.

Oleg.
Oleg Nesterov July 23, 2019, 2:29 p.m. UTC | #13
On 07/23, Oleg Nesterov wrote:
>
> > > Yes, statistically the numbers are proportionally divided.
> >
> > This; due to the loss in precision the distribution is like a step
> > function around the actual s:u ratio line, but on average it still is
> > s:u.
>
> You know, I am no longer sure... perhaps it is even worse, I need to recheck.

To clarify, this is probably true even if prev_cputime adds more confusion.
But how many minutes (hours? days?) do you need to look at /proc/pid/stat
to get an average which more or less matches reality?

Oleg.

Patch

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2305ce8..ad055a3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -525,47 +525,55 @@  void account_idle_ticks(unsigned long ticks)
 }
 
 /*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
+ * Perform (stime * rtime) / total, but avoid multiplication overflow
+ * by losing precision when the numbers are big.
+ *
+ * NOTE! currently the only user is cputime_adjust() and thus
+ *
+ *	stime < total && rtime > total
+ *
+ * this means that the end result is always precise and the additional
+ * div64_u64_rem() inside the main loop is called at most once.
  */
 static u64 scale_stime(u64 stime, u64 rtime, u64 total)
 {
-	u64 scaled;
+	u64 res = 0, div, rem;
 
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
+	/* can stime * rtime overflow ? */
+	while (ilog2(stime) + ilog2(rtime) > 62) {
 		if (stime > rtime)
 			swap(rtime, stime);
 
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
+		if (rtime >= total) {
+			/*
+			 * (rtime * stime) / total is equal to
+			 *
+			 *	(rtime / total) * stime +
+			 *	(rtime % total) * stime / total
+			 *
+			 * if nothing overflows. Can the 1st multiplication
+			 * overflow? Yes, but we do not care: this can only
+			 * happen if the end result can't fit in u64 anyway.
+			 *
+			 * So the code below does
+			 *
+			 *	res += (rtime / total) * stime;
+			 *	rtime = rtime % total;
+			 */
+			div = div64_u64_rem(rtime, total, &rem);
+			res += div * stime;
+			rtime = rem;
+			continue;
+		}
 
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
+		/* drop precision */
 		rtime >>= 1;
 		total >>= 1;
+		if (!total)
+			return res;
 	}
 
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
+	return res + div64_u64(stime * rtime, total);
 }
 
 /*