From: Christophe Leroy <christophe.leroy@c-s.fr>
To: Benjamin Herrenschmidt <benh@kernel.crashing.org>,
	Paul Mackerras <paulus@samba.org>,
	Michael Ellerman <mpe@ellerman.id.au>,
	arnd@arndb.de, tglx@linutronix.de, vincenzo.frascino@arm.com,
	luto@kernel.org
Cc: linux-kernel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org, linux-mips@vger.kernel.org,
	x86@kernel.org
Subject: [RFC PATCH v3 06/12] lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time
Date: Mon, 13 Jan 2020 17:08:44 +0000 (UTC)
Message-ID: <5b38617a2ca4f719760aafbdb6115eaad28c0640.1578934751.git.christophe.leroy@c-s.fr>
In-Reply-To: <cover.1578934751.git.christophe.leroy@c-s.fr>

Using __iter_div_u64_rem() is suboptimal on 32-bit architectures: its
dividend and remainder are u64, so each iteration of its loop has to
compare and subtract 64-bit quantities. The nanosecond value handled
here fits in 32 bits: the VDSO data is updated every 10ms, so nsec
stays around one second's worth of nanoseconds and can never overflow
32 bits (2^32 ns is roughly 4.29 s).

Add an equivalent of __iter_div_u64_rem() based on unsigned long,
which is a better fit for 32-bit architectures.
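
For comparison, the u64-based helper that do_hres() stops using looks
essentially like this (as found in include/linux/math64.h; reproduced
here only as a reference, it is not part of the diff below):

static __always_inline u32
__iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
	u32 ret = 0;

	while (dividend >= divisor) {
		/* The following asm() prevents the compiler from
		   optimising this loop into a modulo operation.  */
		asm("" : "+rm"(dividend));

		dividend -= divisor;
		ret++;
	}

	*remainder = dividend;

	return ret;
}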

Before:
gettimeofday:    vdso: 1078 nsec/call
clock-gettime-monotonic-raw:    vdso: 1317 nsec/call
clock-gettime-monotonic:    vdso: 1255 nsec/call

After:
gettimeofday:    vdso: 1032 nsec/call
clock-gettime-monotonic-raw:    vdso: 1312 nsec/call
clock-gettime-monotonic:    vdso: 1243 nsec/call

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 lib/vdso/gettimeofday.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index decd3f2b37af..da15a8842825 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,12 +38,32 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
 }
 #endif
 
+static __always_inline u32
+__iter_div_ulong_rem(unsigned long dividend, u32 divisor, unsigned long *remainder)
+{
+	u32 ret = 0;
+
+	while (dividend >= divisor) {
+		/* The following asm() prevents the compiler from
+		   optimising this loop into a modulo operation.  */
+		asm("" : "+rm"(dividend));
+
+		dividend -= divisor;
+		ret++;
+	}
+
+	*remainder = dividend;
+
+	return ret;
+}
+
 static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
 				   struct __kernel_timespec *ts)
 {
 	const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
 	u64 cycles, last, sec, ns;
 	u32 seq;
+	unsigned long nsec;
 
 	do {
 		seq = vdso_read_begin(vd);
@@ -54,7 +74,7 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
 			return -1;
 
 		ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
-		ns >>= vd->shift;
+		nsec = ns >> vd->shift;
 		sec = vdso_ts->sec;
 	} while (unlikely(vdso_read_retry(vd, seq)));
 
@@ -62,8 +82,8 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
 	 * Do this outside the loop: a race inside the loop could result
 	 * in __iter_div_u64_rem() being extremely slow.
 	 */
-	ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-	ts->tv_nsec = ns;
+	ts->tv_sec = sec + __iter_div_ulong_rem(nsec, NSEC_PER_SEC, &nsec);
+	ts->tv_nsec = nsec;
 
 	return 0;
 }
-- 
2.13.3
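
For those who want to see the helper's behaviour in isolation, below is
a minimal stand-alone user-space sketch (a hypothetical test harness,
not part of the patch or the kernel tree; iter_div_ulong_rem() and
main() are only illustrative). It splits a nanosecond count into whole
seconds and a sub-second remainder, which is what do_hres() does with
NSEC_PER_SEC:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000UL

/* Same loop as the patch, without the kernel-only __always_inline. */
static unsigned int
iter_div_ulong_rem(unsigned long dividend, unsigned int divisor,
		   unsigned long *remainder)
{
	unsigned int ret = 0;

	while (dividend >= divisor) {
		/* Empty asm() keeps the compiler from turning the loop
		   into a real division/modulo. */
		asm("" : "+rm"(dividend));

		dividend -= divisor;
		ret++;
	}

	*remainder = dividend;

	return ret;
}

int main(void)
{
	unsigned long nsec = 1234567890UL;	/* ~1.23 s worth of ns */
	unsigned long rem;
	unsigned int sec = iter_div_ulong_rem(nsec, NSEC_PER_SEC, &rem);

	/* Prints "1 sec + 234567890 nsec"; the loop body runs only once
	   because nsec is barely above NSEC_PER_SEC. */
	printf("%u sec + %lu nsec\n", sec, rem);

	return 0;
}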


Thread overview:
2020-01-13 17:08 [RFC PATCH v3 00/12] powerpc: switch VDSO to C implementation Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 01/12] powerpc/64: Don't provide time functions in compat VDSO32 Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 02/12] powerpc/vdso: Switch VDSO to generic C implementation Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 03/12] lib: vdso: mark __cvdso_clock_getres() as static Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 04/12] lib: vdso: inline do_hres() and do_coarse() Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 05/12] lib: vdso: Avoid duplication in __cvdso_clock_getres() Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 06/12] lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time Christophe Leroy [this message]
2020-01-14 11:31   ` Thomas Gleixner
2020-01-13 17:08 ` [RFC PATCH v3 07/12] powerpc/vdso: simplify __get_datapage() Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 08/12] lib: vdso: allow arches to provide vdso data pointer Christophe Leroy
2020-01-14 23:06   ` Thomas Gleixner
2020-01-15  6:15     ` Christophe Leroy
2020-01-16  9:16       ` Christophe Leroy
2020-01-16 10:35         ` Thomas Gleixner
2020-01-16 20:22           ` Andy Lutomirski
2020-01-13 17:08 ` [RFC PATCH v3 09/12] powerpc/vdso: provide inline alternative to __get_datapage() Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 10/12] powerpc/vdso: provide vdso data pointer from the ASM caller Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 11/12] lib: vdso: split clock verification out of __arch_get_hw_counter() Christophe Leroy
2020-01-13 17:08 ` [RFC PATCH v3 12/12] powerpc/vdso: provide __arch_is_hw_counter_valid() Christophe Leroy
