linux-toolchains.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] LKMM: Add volatile_if()
@ 2021-06-04 10:12 Peter Zijlstra
  2021-06-04 10:44 ` Will Deacon
                   ` (6 more replies)
  0 siblings, 7 replies; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 10:12 UTC (permalink / raw)
  To: Linus Torvalds, will, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks
  Cc: linux-kernel, linux-toolchains, linux-arch

Hi!

With optimizing compilers becoming more and more aggressive and C so far
refusing to acknowledge the concept of control-dependencies even while
we keep growing the amount of reliance on them, things will eventually
come apart.

There have been talks with toolchain people on how to resolve this; one
suggestion was allowing the volatile qualifier on branch statements like
'if', but so far no actual compiler has made any progress on this.

Rather than waiting any longer, provide our own construct based on that
suggestion. The idea is by Alan Stern and refined by Paul and myself.

Code generation is sub-optimal (for the weak architectures) since we're
forced to convert the condition into another and use a fixed conditional
branch instruction, but shouldn't be too bad.

Usage of volatile_if requires the @cond to be headed by a volatile load
(READ_ONCE() / atomic_read() etc..) such that the compiler is forced to
emit the load and the branch emitted will have the required
data-dependency. Furthermore, volatile_if() is a compiler barrier, which
should prohibit the compiler from lifting anything out of the selection
statement.

This construct should place control dependencies on a stronger footing
until such time that the compiler folks get around to accepting them :-)

I've converted most architectures we care about, and the rest will get
an extra smp_mb() by means of the 'generic' fallback implementation (for
now).

I've converted the control dependencies I remembered and those found
with a search for smp_acquire__after_ctrl_dep(), there might be more.

Compile tested only (alpha, arm, arm64, x86_64, powerpc, powerpc64, s390
and sparc64).

Suggested-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/arm/include/asm/barrier.h      | 11 +++++++++++
 arch/arm64/include/asm/barrier.h    | 11 +++++++++++
 arch/powerpc/include/asm/barrier.h  | 13 +++++++++++++
 arch/s390/include/asm/barrier.h     |  3 +++
 arch/sparc/include/asm/barrier_64.h |  3 +++
 arch/x86/include/asm/barrier.h      | 16 ++++++++++++++++
 include/asm-generic/barrier.h       | 38 ++++++++++++++++++++++++++++++++++++-
 include/linux/refcount.h            |  2 +-
 ipc/mqueue.c                        |  2 +-
 ipc/msg.c                           |  2 +-
 kernel/events/ring_buffer.c         |  8 ++++----
 kernel/locking/rwsem.c              |  4 ++--
 kernel/sched/core.c                 |  2 +-
 kernel/smp.c                        |  2 +-
 14 files changed, 105 insertions(+), 12 deletions(-)

diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
index 83ae97c049d9..de8a61479268 100644
--- a/arch/arm/include/asm/barrier.h
+++ b/arch/arm/include/asm/barrier.h
@@ -97,6 +97,17 @@ static inline unsigned long array_index_mask_nospec(unsigned long idx,
 #define array_index_mask_nospec array_index_mask_nospec
 #endif
 
+/* Guarantee a conditional branch that depends on @cond. */
+static __always_inline _Bool volatile_cond(_Bool cond)
+{
+	asm_volatile_goto("teq %0, #0; bne %l[l_yes]"
+			  : : "r" (cond) : "cc", "memory" : l_yes);
+	return 0;
+l_yes:
+	return 1;
+}
+#define volatile_cond volatile_cond
+
 #include <asm-generic/barrier.h>
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 451e11e5fd23..2782a7013615 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -156,6 +156,17 @@ do {									\
 	(typeof(*p))__u.__val;						\
 })
 
+/* Guarantee a conditional branch that depends on @cond. */
+static __always_inline _Bool volatile_cond(_Bool cond)
+{
+	asm_volatile_goto("cbnz %0, %l[l_yes]"
+			  : : "r" (cond) : "cc", "memory" : l_yes);
+	return 0;
+l_yes:
+	return 1;
+}
+#define volatile_cond volatile_cond
+
 #define smp_cond_load_relaxed(ptr, cond_expr)				\
 ({									\
 	typeof(ptr) __PTR = (ptr);					\
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 7ae29cfb06c0..9fdf587f059e 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -80,6 +80,19 @@ do {									\
 	___p1;								\
 })
 
+#ifndef __ASSEMBLY__
+/* Guarantee a conditional branch that depends on @cond. */
+static __always_inline bool volatile_cond(bool cond)
+{
+	asm_volatile_goto("and. %0,%0,%0; bne %l[l_yes]"
+			  : : "r" (cond) : "cc", "memory" : l_yes);
+	return false;
+l_yes:
+	return true;
+}
+#define volatile_cond volatile_cond
+#endif
+
 #ifdef CONFIG_PPC_BOOK3S_64
 #define NOSPEC_BARRIER_SLOT   nop
 #elif defined(CONFIG_PPC_FSL_BOOK3E)
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index f9eddbca79d2..fa78fc1f141b 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -49,6 +49,9 @@ do {									\
 #define __smp_mb__before_atomic()	barrier()
 #define __smp_mb__after_atomic()	barrier()
 
+/* TSO prohibits the LOAD->STORE reorder. */
+#define volatile_cond(cond)	({ bool __t = (cond); barrier(); __t; })
+
 /**
  * array_index_mask_nospec - generate a mask for array_idx() that is
  * ~0UL when the bounds check succeeds and 0 otherwise
diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h
index 9fb148bd3c97..dd8e40ad0787 100644
--- a/arch/sparc/include/asm/barrier_64.h
+++ b/arch/sparc/include/asm/barrier_64.h
@@ -56,6 +56,9 @@ do {									\
 #define __smp_mb__before_atomic()	barrier()
 #define __smp_mb__after_atomic()	barrier()
 
+/* TSO prohibits the LOAD->STORE reorder. */
+#define volatile_cond(cond)	({ bool __t = (cond); barrier(); __t; })
+
 #include <asm-generic/barrier.h>
 
 #endif /* !(__SPARC64_BARRIER_H) */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 3ba772a69cc8..2ebdf4e349ff 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -79,6 +79,22 @@ do {									\
 #define __smp_mb__before_atomic()	do { } while (0)
 #define __smp_mb__after_atomic()	do { } while (0)
 
+/* TSO prohibits the LOAD->STORE reorder. */
+#define volatile_cond(cond)	({ bool __t = (cond); barrier(); __t; })
+
+#if 0
+/* For testing the more complicated construct...  */
+static __always_inline bool volatile_cond(bool cond)
+{
+	asm_volatile_goto("test %0,%0; jnz %l[l_yes]"
+			  : : "r" (cond) : "cc", "memory" : l_yes);
+	return false;
+l_yes:
+	return true;
+}
+#define volatile_cond volatile_cond
+#endif
+
 #include <asm-generic/barrier.h>
 
 /*
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 640f09479bdf..a84833f1397b 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -187,6 +187,42 @@ do {									\
 #define virt_store_release(p, v) __smp_store_release(p, v)
 #define virt_load_acquire(p) __smp_load_acquire(p)
 
+/*
+ * 'Generic' wrapper to make volatile_if() below 'work'. Architectures are
+ * encouraged to provide their own implementation. See x86 for TSO and arm64
+ * for a weak example.
+ */
+#ifndef volatile_cond
+#define volatile_cond(cond)	({ bool __t = (cond); smp_mb(); __t; })
+#endif
+
+/**
+ * volatile_if() - Provide a control-dependency
+ *
+ * volatile_if(READ_ONCE(A))
+ *	WRITE_ONCE(B, 1);
+ *
+ * will ensure that the STORE to B happens after the LOAD of A. Normally a
+ * control dependency relies on a conditional branch having a data dependency
+ * on the LOAD and an architecture's inability to speculate STOREs. IOW, this
+ * provides a LOAD->STORE order.
+ *
+ * Due to optimizing compilers extra care is needed; as per the example above
+ * the LOAD must be 'volatile' qualified in order to ensure the compiler
+ * actually emits the load, such that the data-dependency to the conditional
+ * branch can be formed.
+ *
+ * Secondly, the compiler must be prohibited from lifting anything out of the
+ * selection statement, as this would obviously also break the ordering.
+ *
+ * Thirdly, and this is the tricky bit, architectures that allow the
+ * LOAD->STORE reorder must ensure the compiler actually emits the conditional
+ * branch instruction, this isn't possible in generic.
+ *
+ * See the volatile_cond() wrapper.
+ */
+#define volatile_if(cond) if (volatile_cond(cond))
+
 /**
  * smp_acquire__after_ctrl_dep() - Provide ACQUIRE ordering after a control dependency
  *
@@ -216,7 +252,7 @@ do {									\
 	__unqual_scalar_typeof(*ptr) VAL;			\
 	for (;;) {						\
 		VAL = READ_ONCE(*__PTR);			\
-		if (cond_expr)					\
+		volatile_if (cond_expr)				\
 			break;					\
 		cpu_relax();					\
 	}							\
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index b8a6e387f8f9..c0165b4b9f1d 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -274,7 +274,7 @@ static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, in
 	if (oldp)
 		*oldp = old;
 
-	if (old == i) {
+	volatile_if (old == i) {
 		smp_acquire__after_ctrl_dep();
 		return true;
 	}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 4e4e61111500..1c023829697c 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -713,7 +713,7 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
 		time = schedule_hrtimeout_range_clock(timeout, 0,
 			HRTIMER_MODE_ABS, CLOCK_REALTIME);
 
-		if (READ_ONCE(ewp->state) == STATE_READY) {
+		volatile_if (READ_ONCE(ewp->state) == STATE_READY) {
 			/* see MQ_BARRIER for purpose/pairing */
 			smp_acquire__after_ctrl_dep();
 			retval = 0;
diff --git a/ipc/msg.c b/ipc/msg.c
index 6e6c8e0c9380..0b0e71fa3fbc 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1213,7 +1213,7 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
 		 * signal) it will either see the message and continue ...
 		 */
 		msg = READ_ONCE(msr_d.r_msg);
-		if (msg != ERR_PTR(-EAGAIN)) {
+		volatile_if (msg != ERR_PTR(-EAGAIN)) {
 			/* see MSG_BARRIER for purpose/pairing */
 			smp_acquire__after_ctrl_dep();
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 52868716ec35..7767aabfde9f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -192,10 +192,10 @@ __perf_output_begin(struct perf_output_handle *handle,
 	do {
 		tail = READ_ONCE(rb->user_page->data_tail);
 		offset = head = local_read(&rb->head);
-		if (!rb->overwrite) {
-			if (unlikely(!ring_buffer_has_space(head, tail,
-							    perf_data_size(rb),
-							    size, backward)))
+		if (likely(!rb->overwrite)) {
+			volatile_if (unlikely(!ring_buffer_has_space(head, tail,
+								     perf_data_size(rb),
+								     size, backward)))
 				goto fail;
 		}
 
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 809b0016d344..c76ba4e034ae 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -941,8 +941,8 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state)
 		 * exit the slowpath and return immediately as its
 		 * RWSEM_READER_BIAS has already been set in the count.
 		 */
-		if (!(atomic_long_read(&sem->count) &
-		     (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+		volatile_if (!(atomic_long_read(&sem->count) &
+			       (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
 			/* Provide lock ACQUIRE */
 			smp_acquire__after_ctrl_dep();
 			raw_spin_unlock_irq(&sem->wait_lock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d559db02e3cb..8038d76cfd56 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3760,7 +3760,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
 	 */
 	smp_rmb();
-	if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
+	volatile_if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
 		goto unlock;
 
 #ifdef CONFIG_SMP
diff --git a/kernel/smp.c b/kernel/smp.c
index 52bf159ec400..3d87af0519c5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -394,7 +394,7 @@ static void __csd_lock_wait(struct __call_single_data *csd)
 
 	ts1 = ts0 = sched_clock();
 	for (;;) {
-		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
+		volatile_if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
 			break;
 		cpu_relax();
 	}

^ permalink raw reply related	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 10:12 [RFC] LKMM: Add volatile_if() Peter Zijlstra
@ 2021-06-04 10:44 ` Will Deacon
  2021-06-04 11:13   ` Will Deacon
  2021-06-04 11:31   ` Peter Zijlstra
  2021-06-04 11:44 ` Peter Zijlstra
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 127+ messages in thread
From: Will Deacon @ 2021-06-04 10:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:
> With optimizing compilers becoming more and more aggressive and C so far
> refusing to acknowledge the concept of control-dependencies even while
> we keep growing the amount of reliance on them, things will eventually
> come apart.
> 
> There have been talks with toolchain people on how to resolve this; one
> suggestion was allowing the volatile qualifier on branch statements like
> 'if', but so far no actual compiler has made any progress on this.
> 
> Rather than waiting any longer, provide our own construct based on that
> suggestion. The idea is by Alan Stern and refined by Paul and myself.
> 
> Code generation is sub-optimal (for the weak architectures) since we're
> forced to convert the condition into another and use a fixed conditional
> branch instruction, but shouldn't be too bad.
> 
> Usage of volatile_if requires the @cond to be headed by a volatile load
> (READ_ONCE() / atomic_read() etc..) such that the compiler is forced to
> emit the load and the branch emitted will have the required
> data-dependency. Furthermore, volatile_if() is a compiler barrier, which
> should prohibit the compiler from lifting anything out of the selection
> statement.

When building with LTO on arm64, we already upgrade READ_ONCE() to an RCpc
acquire. In this case, it would be really good to avoid having the dummy
conditional branch somehow, but I can't see a good way to achieve that.

> This construct should place control dependencies on a stronger footing
> until such time that the compiler folks get around to accepting them :-)
> 
> I've converted most architectures we care about, and the rest will get
> an extra smp_mb() by means of the 'generic' fallback implementation (for
> now).
> 
> I've converted the control dependencies I remembered and those found
> with a search for smp_acquire__after_ctrl_dep(), there might be more.
> 
> Compile tested only (alpha, arm, arm64, x86_64, powerpc, powerpc64, s390
> and sparc64).
> 
> Suggested-by: Alan Stern <stern@rowland.harvard.edu>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/arm/include/asm/barrier.h      | 11 +++++++++++
>  arch/arm64/include/asm/barrier.h    | 11 +++++++++++
>  arch/powerpc/include/asm/barrier.h  | 13 +++++++++++++
>  arch/s390/include/asm/barrier.h     |  3 +++
>  arch/sparc/include/asm/barrier_64.h |  3 +++
>  arch/x86/include/asm/barrier.h      | 16 ++++++++++++++++
>  include/asm-generic/barrier.h       | 38 ++++++++++++++++++++++++++++++++++++-
>  include/linux/refcount.h            |  2 +-
>  ipc/mqueue.c                        |  2 +-
>  ipc/msg.c                           |  2 +-
>  kernel/events/ring_buffer.c         |  8 ++++----
>  kernel/locking/rwsem.c              |  4 ++--
>  kernel/sched/core.c                 |  2 +-
>  kernel/smp.c                        |  2 +-
>  14 files changed, 105 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
> index 83ae97c049d9..de8a61479268 100644
> --- a/arch/arm/include/asm/barrier.h
> +++ b/arch/arm/include/asm/barrier.h
> @@ -97,6 +97,17 @@ static inline unsigned long array_index_mask_nospec(unsigned long idx,
>  #define array_index_mask_nospec array_index_mask_nospec
>  #endif
>  
> +/* Guarantee a conditional branch that depends on @cond. */
> +static __always_inline _Bool volatile_cond(_Bool cond)
> +{
> +	asm_volatile_goto("teq %0, #0; bne %l[l_yes]"
> +			  : : "r" (cond) : "cc", "memory" : l_yes);
> +	return 0;
> +l_yes:
> +	return 1;
> +}
> +#define volatile_cond volatile_cond
> +
>  #include <asm-generic/barrier.h>
>  
>  #endif /* !__ASSEMBLY__ */
> diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
> index 451e11e5fd23..2782a7013615 100644
> --- a/arch/arm64/include/asm/barrier.h
> +++ b/arch/arm64/include/asm/barrier.h
> @@ -156,6 +156,17 @@ do {									\
>  	(typeof(*p))__u.__val;						\
>  })
>  
> +/* Guarantee a conditional branch that depends on @cond. */
> +static __always_inline _Bool volatile_cond(_Bool cond)

Is _Bool to fix some awful header mess?

> +{
> +	asm_volatile_goto("cbnz %0, %l[l_yes]"
> +			  : : "r" (cond) : "cc", "memory" : l_yes);
> +	return 0;
> +l_yes:
> +	return 1;
> +}

nit: you don't need the "cc" clobber here.

> diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
> index 640f09479bdf..a84833f1397b 100644
> --- a/include/asm-generic/barrier.h
> +++ b/include/asm-generic/barrier.h
> @@ -187,6 +187,42 @@ do {									\
>  #define virt_store_release(p, v) __smp_store_release(p, v)
>  #define virt_load_acquire(p) __smp_load_acquire(p)
>  
> +/*
> + * 'Generic' wrapper to make volatile_if() below 'work'. Architectures are
> + * encouraged to provide their own implementation. See x86 for TSO and arm64
> + * for a weak example.
> + */
> +#ifndef volatile_cond
> +#define volatile_cond(cond)	({ bool __t = (cond); smp_mb(); __t; })
> +#endif
> +
> +/**
> + * volatile_if() - Provide a control-dependency
> + *
> + * volatile_if(READ_ONCE(A))
> + *	WRITE_ONCE(B, 1);
> + *
> + * will ensure that the STORE to B happens after the LOAD of A. Normally a
> + * control dependency relies on a conditional branch having a data dependency
> + * on the LOAD and an architecture's inability to speculate STOREs. IOW, this
> + * provides a LOAD->STORE order.
> + *
> + * Due to optimizing compilers extra care is needed; as per the example above
> + * the LOAD must be 'volatile' qualified in order to ensure the compiler
> + * actually emits the load, such that the data-dependency to the conditional
> + * branch can be formed.
> + *
> + * Secondly, the compiler must be prohibited from lifting anything out of the
> + * selection statement, as this would obviously also break the ordering.
> + *
> + * Thirdly, and this is the tricky bit, architectures that allow the
> + * LOAD->STORE reorder must ensure the compiler actually emits the conditional
> + * branch instruction, this isn't possible in generic.
> + *
> + * See the volatile_cond() wrapper.
> + */
> +#define volatile_if(cond) if (volatile_cond(cond))

The thing I really dislike about this is that, if the compiler _does_
emit a conditional branch for the C 'if', then we get a pair of branch
instructions in close proximity to each other which the predictor is likely
to hate. I wouldn't be surprised if an RCpc acquire heading the dependency
actually performs better on modern arm64 cores in the general case.

So I think that's an argument for doing this in the compiler...

Will

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 10:44 ` Will Deacon
@ 2021-06-04 11:13   ` Will Deacon
  2021-06-04 11:31   ` Peter Zijlstra
  1 sibling, 0 replies; 127+ messages in thread
From: Will Deacon @ 2021-06-04 11:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 11:43:59AM +0100, Will Deacon wrote:
> On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:
> > With optimizing compilers becoming more and more aggressive and C so far
> > refusing to acknowledge the concept of control-dependencies even while
> > we keep growing the amount of reliance on them, things will eventually
> > come apart.
> > 
> > There have been talks with toolchain people on how to resolve this; one
> > suggestion was allowing the volatile qualifier on branch statements like
> > 'if', but so far no actual compiler has made any progress on this.
> > 
> > Rather than waiting any longer, provide our own construct based on that
> > suggestion. The idea is by Alan Stern and refined by Paul and myself.
> > 
> > Code generation is sub-optimal (for the weak architectures) since we're
> > forced to convert the condition into another and use a fixed conditional
> > branch instruction, but shouldn't be too bad.
> > 
> > Usage of volatile_if requires the @cond to be headed by a volatile load
> > (READ_ONCE() / atomic_read() etc..) such that the compiler is forced to
> > emit the load and the branch emitted will have the required
> > data-dependency. Furthermore, volatile_if() is a compiler barrier, which
> > should prohibit the compiler from lifting anything out of the selection
> > statement.
> 
> When building with LTO on arm64, we already upgrade READ_ONCE() to an RCpc
> acquire. In this case, it would be really good to avoid having the dummy
> conditional branch somehow, but I can't see a good way to achieve that.

Thinking more on this, an alternative angle would be having READ_ONCE_CTRL()
instead of volatile_if. That would then expand (on arm64) to either
something like:

	LDR	X0, [X1]
	CBNZ	X0, 1f		// Dummy ctrl
1:

or, with LTO:

	LDAPR	X0, [X1]	// RCpc

and we'd avoid the redundancy.

Will

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 10:44 ` Will Deacon
  2021-06-04 11:13   ` Will Deacon
@ 2021-06-04 11:31   ` Peter Zijlstra
  2021-06-04 13:44     ` Will Deacon
  2021-06-04 15:47     ` Segher Boessenkool
  1 sibling, 2 replies; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 11:31 UTC (permalink / raw)
  To: Will Deacon
  Cc: Linus Torvalds, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 11:44:00AM +0100, Will Deacon wrote:
> On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:

> > Usage of volatile_if requires the @cond to be headed by a volatile load
> > (READ_ONCE() / atomic_read() etc..) such that the compiler is forced to
> > emit the load and the branch emitted will have the required
> > data-dependency. Furthermore, volatile_if() is a compiler barrier, which
> > should prohibit the compiler from lifting anything out of the selection
> > statement.
> 
> When building with LTO on arm64, we already upgrade READ_ONCE() to an RCpc
> acquire. In this case, it would be really good to avoid having the dummy
> conditional branch somehow, but I can't see a good way to achieve that.

#ifdef CONFIG_LTO
/* Because __READ_ONCE() is load-acquire */
#define volatile_cond(cond)	(cond)
#else
....
#endif

Doesn't work? Bit naf, but I'm thinking it ought to do.

> > This construct should place control dependencies on a stronger footing
> > until such time that the compiler folks get around to accepting them :-)
> > 
> > I've converted most architectures we care about, and the rest will get
> > an extra smp_mb() by means of the 'generic' fallback implementation (for
> > now).
> > 
> > I've converted the control dependencies I remembered and those found
> > with a search for smp_acquire__after_ctrl_dep(), there might be more.
> > 
> > Compile tested only (alpha, arm, arm64, x86_64, powerpc, powerpc64, s390
> > and sparc64).
> > 
> > Suggested-by: Alan Stern <stern@rowland.harvard.edu>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

> > diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
> > index 451e11e5fd23..2782a7013615 100644
> > --- a/arch/arm64/include/asm/barrier.h
> > +++ b/arch/arm64/include/asm/barrier.h
> > @@ -156,6 +156,17 @@ do {									\
> >  	(typeof(*p))__u.__val;						\
> >  })
> >  
> > +/* Guarantee a conditional branch that depends on @cond. */
> > +static __always_inline _Bool volatile_cond(_Bool cond)
> 
> Is _Bool to fix some awful header mess?

Yes, header soup :/ Idem for the lack of true and false.

> > +{
> > +	asm_volatile_goto("cbnz %0, %l[l_yes]"
> > +			  : : "r" (cond) : "cc", "memory" : l_yes);
> > +	return 0;
> > +l_yes:
> > +	return 1;
> > +}
> 
> nit: you don't need the "cc" clobber here.

Yeah I know, "cc" is implied.

> > diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
> > index 640f09479bdf..a84833f1397b 100644
> > --- a/include/asm-generic/barrier.h
> > +++ b/include/asm-generic/barrier.h
> > @@ -187,6 +187,42 @@ do {									\
> >  #define virt_store_release(p, v) __smp_store_release(p, v)
> >  #define virt_load_acquire(p) __smp_load_acquire(p)
> >  
> > +/*
> > + * 'Generic' wrapper to make volatile_if() below 'work'. Architectures are
> > + * encouraged to provide their own implementation. See x86 for TSO and arm64
> > + * for a weak example.
> > + */
> > +#ifndef volatile_cond
> > +#define volatile_cond(cond)	({ bool __t = (cond); smp_mb(); __t; })
> > +#endif
> > +
> > +/**
> > + * volatile_if() - Provide a control-dependency
> > + *
> > + * volatile_if(READ_ONCE(A))
> > + *	WRITE_ONCE(B, 1);
> > + *
> > + * will ensure that the STORE to B happens after the LOAD of A. Normally a
> > + * control dependency relies on a conditional branch having a data dependency
> > + * on the LOAD and an architecture's inability to speculate STOREs. IOW, this
> > + * provides a LOAD->STORE order.
> > + *
> > + * Due to optimizing compilers extra care is needed; as per the example above
> > + * the LOAD must be 'volatile' qualified in order to ensure the compiler
> > + * actually emits the load, such that the data-dependency to the conditional
> > + * branch can be formed.
> > + *
> > + * Secondly, the compiler must be prohibited from lifting anything out of the
> > + * selection statement, as this would obviously also break the ordering.
> > + *
> > + * Thirdly, and this is the tricky bit, architectures that allow the
> > + * LOAD->STORE reorder must ensure the compiler actually emits the conditional
> > + * branch instruction, this isn't possible in generic.
> > + *
> > + * See the volatile_cond() wrapper.
> > + */
> > +#define volatile_if(cond) if (volatile_cond(cond))
> 
> The thing I really dislike about this is that, if the compiler _does_
> emit a conditional branch for the C 'if', then we get a pair of branch
> instructions in close proximity to each other which the predictor is likely
> to hate. I wouldn't be surprised if an RCpc acquire heading the dependency
> actually performs better on modern arm64 cores in the general case.

jump_label / static_branch relies on asm goto inside if to get optimized
away, so I'm fairly confident this will not result in a double branch,
because yes, that would blow.

> So I think that's an argument for doing this in the compiler...

Don't get me wrong, I would _LOVE_ for the compilers to do this. This
really is just a stop-gap solution to ensure we don't get to debug 'FUN'
stuff.


^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 10:12 [RFC] LKMM: Add volatile_if() Peter Zijlstra
  2021-06-04 10:44 ` Will Deacon
@ 2021-06-04 11:44 ` Peter Zijlstra
  2021-06-04 14:13   ` Paul E. McKenney
  2021-06-04 15:35   ` Segher Boessenkool
  2021-06-04 14:25 ` Alan Stern
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 11:44 UTC (permalink / raw)
  To: Linus Torvalds, will, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks
  Cc: linux-kernel, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:
> +/**
> + * volatile_if() - Provide a control-dependency
> + *
> + * volatile_if(READ_ONCE(A))
> + *	WRITE_ONCE(B, 1);
> + *
> + * will ensure that the STORE to B happens after the LOAD of A. Normally a
> + * control dependency relies on a conditional branch having a data dependency
> + * on the LOAD and an architecture's inability to speculate STOREs. IOW, this
> + * provides a LOAD->STORE order.
> + *
> + * Due to optimizing compilers extra care is needed; as per the example above
> + * the LOAD must be 'volatile' qualified in order to ensure the compiler
> + * actually emits the load, such that the data-dependency to the conditional
> + * branch can be formed.
> + *
> + * Secondly, the compiler must be prohibited from lifting anything out of the
> + * selection statement, as this would obviously also break the ordering.
> + *
> + * Thirdly, and this is the tricky bit, architectures that allow the
> + * LOAD->STORE reorder must ensure the compiler actually emits the conditional
> + * branch instruction, this isn't possible in generic.
> + *
> + * See the volatile_cond() wrapper.
> + */
> +#define volatile_if(cond) if (volatile_cond(cond))

On naming (sorry Paul for forgetting that in the initial mail); while I
think using the volatile qualifier for the language feature (can we haz
plz, kthxbai) makes perfect sense, Paul felt that we might use a
'better' name for the kernel use, ctrl_dep_if() was proposed.

Let us paint bike sheds :-)

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 11:31   ` Peter Zijlstra
@ 2021-06-04 13:44     ` Will Deacon
  2021-06-04 13:56       ` Peter Zijlstra
  2021-06-04 15:47     ` Segher Boessenkool
  1 sibling, 1 reply; 127+ messages in thread
From: Will Deacon @ 2021-06-04 13:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 01:31:48PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 11:44:00AM +0100, Will Deacon wrote:
> > On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:
> 
> > > Usage of volatile_if requires the @cond to be headed by a volatile load
> > > (READ_ONCE() / atomic_read() etc..) such that the compiler is forced to
> > > emit the load and the branch emitted will have the required
> > > data-dependency. Furthermore, volatile_if() is a compiler barrier, which
> > > should prohibit the compiler from lifting anything out of the selection
> > > statement.
> > 
> > When building with LTO on arm64, we already upgrade READ_ONCE() to an RCpc
> > acquire. In this case, it would be really good to avoid having the dummy
> > conditional branch somehow, but I can't see a good way to achieve that.
> 
> #ifdef CONFIG_LTO
> /* Because __READ_ONCE() is load-acquire */
> #define volatile_cond(cond)	(cond)
> #else
> ....
> #endif
> 
> Doesn't work? Bit naf, but I'm thinking it ought to do.

The problem is with relaxed atomic RMWs; we don't upgrade those to acquire
atm as they're written in asm, but we'd need volatile_cond() to work with
them. It's a shame, because we only have RCsc RMWs on arm64, so it would
be a bit more expensive.

> > > +/**
> > > + * volatile_if() - Provide a control-dependency
> > > + *
> > > + * volatile_if(READ_ONCE(A))
> > > + *	WRITE_ONCE(B, 1);
> > > + *
> > > + * will ensure that the STORE to B happens after the LOAD of A. Normally a
> > > + * control dependency relies on a conditional branch having a data dependency
> > > + * on the LOAD and an architecture's inability to speculate STOREs. IOW, this
> > > + * provides a LOAD->STORE order.
> > > + *
> > > + * Due to optimizing compilers extra care is needed; as per the example above
> > > + * the LOAD must be 'volatile' qualified in order to ensure the compiler
> > > + * actually emits the load, such that the data-dependency to the conditional
> > > + * branch can be formed.
> > > + *
> > > + * Secondly, the compiler must be prohibited from lifting anything out of the
> > > + * selection statement, as this would obviously also break the ordering.
> > > + *
> > > + * Thirdly, and this is the tricky bit, architectures that allow the
> > > + * LOAD->STORE reorder must ensure the compiler actually emits the conditional
> > > + * branch instruction, this isn't possible in generic.
> > > + *
> > > + * See the volatile_cond() wrapper.
> > > + */
> > > +#define volatile_if(cond) if (volatile_cond(cond))
> > 
> > The thing I really dislike about this is that, if the compiler _does_
> > emit a conditional branch for the C 'if', then we get a pair of branch
> > instructions in close proximity to each other which the predictor is likely
> > to hate. I wouldn't be surprised if an RCpc acquire heading the dependency
> > actually performs better on modern arm64 cores in the general case.
> 
> jump_label / static_branch relies on asm goto inside if to get optimized
> away, so I'm fairly confident this will not result in a double branch,
> because yes, that would blow.

I gave it a spin and you're right. Neat!

Will

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 13:44     ` Will Deacon
@ 2021-06-04 13:56       ` Peter Zijlstra
  2021-06-04 15:13         ` Will Deacon
  2021-06-04 15:50         ` Segher Boessenkool
  0 siblings, 2 replies; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 13:56 UTC (permalink / raw)
  To: Will Deacon
  Cc: Linus Torvalds, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 02:44:22PM +0100, Will Deacon wrote:
> On Fri, Jun 04, 2021 at 01:31:48PM +0200, Peter Zijlstra wrote:
> > On Fri, Jun 04, 2021 at 11:44:00AM +0100, Will Deacon wrote:
> > > On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:
> > 
> > > > Usage of volatile_if requires the @cond to be headed by a volatile load
> > > > (READ_ONCE() / atomic_read() etc..) such that the compiler is forced to
> > > > emit the load and the branch emitted will have the required
> > > > data-dependency. Furthermore, volatile_if() is a compiler barrier, which
> > > > should prohibit the compiler from lifting anything out of the selection
> > > > statement.
> > > 
> > > When building with LTO on arm64, we already upgrade READ_ONCE() to an RCpc
> > > acquire. In this case, it would be really good to avoid having the dummy
> > > conditional branch somehow, but I can't see a good way to achieve that.
> > 
> > #ifdef CONFIG_LTO
> > /* Because __READ_ONCE() is load-acquire */
> > #define volatile_cond(cond)	(cond)
> > #else
> > ....
> > #endif
> > 
> > Doesn't work? Bit naf, but I'm thinking it ought to do.
> 
> The problem is with relaxed atomic RMWs; we don't upgrade those to acquire
> atm as they're written in asm, but we'd need volatile_cond() to work with
> them. It's a shame, because we only have RCsc RMWs on arm64, so it would
> be a bit more expensive.

Urgh, I see. Compiler can't really help in that case either I'm afraid.
They'll never want to modify loads that originate in an asm(). They'll
say to use the C11 _Atomic crud.

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 11:44 ` Peter Zijlstra
@ 2021-06-04 14:13   ` Paul E. McKenney
  2021-06-04 15:35   ` Segher Boessenkool
  1 sibling, 0 replies; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-04 14:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, will, stern, parri.andrea, boqun.feng, npiggin,
	dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 01:44:37PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:
> > +/**
> > + * volatile_if() - Provide a control-dependency
> > + *
> > + * volatile_if(READ_ONCE(A))
> > + *	WRITE_ONCE(B, 1);
> > + *
> > + * will ensure that the STORE to B happens after the LOAD of A. Normally a
> > + * control dependency relies on a conditional branch having a data dependency
> > + * on the LOAD and an architecture's inability to speculate STOREs. IOW, this
> > + * provides a LOAD->STORE order.
> > + *
> > + * Due to optimizing compilers extra care is needed; as per the example above
> > + * the LOAD must be 'volatile' qualified in order to ensure the compiler
> > + * actually emits the load, such that the data-dependency to the conditional
> > + * branch can be formed.
> > + *
> > + * Secondly, the compiler must be prohibited from lifting anything out of the
> > + * selection statement, as this would obviously also break the ordering.
> > + *
> > + * Thirdly, and this is the tricky bit, architectures that allow the
> > + * LOAD->STORE reorder must ensure the compiler actually emits the conditional
> > + * branch instruction, this isn't possible in generic.
> > + *
> > + * See the volatile_cond() wrapper.
> > + */
> > +#define volatile_if(cond) if (volatile_cond(cond))
> 
> On naming (sorry Paul for forgetting that in the initial mail); while I
> think using the volatile qualifier for the language feature (can we haz
> plz, kthxbai) makes perfect sense, Paul felt that we might use a
> 'better' name for the kernel use, ctrl_dep_if() was proposed.
> 
> Let us pain bike sheds :-)

I have felt that pain many times...  ;-)

Here is what I see thus far from these two threads:

1.	volatile_if() as above.  Nice ease of use, but might be suboptimal
	on architectures where a branch is slower than an acquire load.

2.	#1, but with my preferred name of ctrl_dep_if() instead of
	volatile_if().

3.	READ_ONCE_CTRL() like back in the old days.  This has the
	advantage of giving the compiler more information, but has
	problems with relaxed atomic RMW operations.

4.	A full (fool?) solution based on #3 would also include _ctrl
	suffixed atomic RMW operations.

5.	Your bikeshed color here!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 10:12 [RFC] LKMM: Add volatile_if() Peter Zijlstra
  2021-06-04 10:44 ` Will Deacon
  2021-06-04 11:44 ` Peter Zijlstra
@ 2021-06-04 14:25 ` Alan Stern
  2021-06-04 16:09 ` Segher Boessenkool
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-06-04 14:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, will, paulmck, parri.andrea, boqun.feng, npiggin,
	dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:
> Hi!
> 
> With optimizing compilers becoming more and more aggressive and C so far
> refusing to acknowledge the concept of control-dependencies even while
> we keep growing the amount of reliance on them, things will eventually
> come apart.
> 
> There have been talks with toolchain people on how to resolve this; one
> suggestion was allowing the volatile qualifier on branch statements like
> 'if', but so far no actual compiler has made any progress on this.
> 
> Rather than waiting any longer, provide our own construct based on that
> suggestion. The idea is by Alan Stern and refined by Paul and myself.
> 
> Code generation is sub-optimal (for the weak architectures) since we're
> forced to convert the condition into another and use a fixed conditional
> branch instruction, but shouldn't be too bad.
> 
> Usage of volatile_if requires the @cond to be headed by a volatile load
> (READ_ONCE() / atomic_read() etc..) such that the compiler is forced to
> emit the load and the branch emitted will have the required
> data-dependency. Furthermore, volatile_if() is a compiler barrier, which
> should prohibit the compiler from lifting anything out of the selection
> statement.
> 
> This construct should place control dependencies on a stronger footing
> until such time that the compiler folks get around to accepting them :-)
> 
> I've converted most architectures we care about, and the rest will get
> an extra smp_mb() by means of the 'generic' fallback implementation (for
> now).
> 
> I've converted the control dependencies I remembered and those found
> with a search for smp_acquire__after_ctrl_dep(), there might be more.
> 
> Compile tested only (alpha, arm, arm64, x86_64, powerpc, powerpc64, s390
> and sparc64).
> 
> Suggested-by: Alan Stern <stern@rowland.harvard.edu>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Is there any interest in doing the same sort of thing for switch
statements?  A similar approach would probably work, but maybe people
don't care about it.

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 13:56       ` Peter Zijlstra
@ 2021-06-04 15:13         ` Will Deacon
  2021-06-04 15:22           ` Peter Zijlstra
  2021-06-04 15:50         ` Segher Boessenkool
  1 sibling, 1 reply; 127+ messages in thread
From: Will Deacon @ 2021-06-04 15:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 03:56:16PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 02:44:22PM +0100, Will Deacon wrote:
> > On Fri, Jun 04, 2021 at 01:31:48PM +0200, Peter Zijlstra wrote:
> > > On Fri, Jun 04, 2021 at 11:44:00AM +0100, Will Deacon wrote:
> > > > On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:
> > > 
> > > > > Usage of volatile_if requires the @cond to be headed by a volatile load
> > > > > (READ_ONCE() / atomic_read() etc..) such that the compiler is forced to
> > > > > emit the load and the branch emitted will have the required
> > > > > data-dependency. Furthermore, volatile_if() is a compiler barrier, which
> > > > > should prohibit the compiler from lifting anything out of the selection
> > > > > statement.
> > > > 
> > > > When building with LTO on arm64, we already upgrade READ_ONCE() to an RCpc
> > > > acquire. In this case, it would be really good to avoid having the dummy
> > > > conditional branch somehow, but I can't see a good way to achieve that.
> > > 
> > > #ifdef CONFIG_LTO
> > > /* Because __READ_ONCE() is load-acquire */
> > > #define volatile_cond(cond)	(cond)
> > > #else
> > > ....
> > > #endif
> > > 
> > > Doesn't work? Bit naf, but I'm thinking it ought to do.
> > 
> > The problem is with relaxed atomic RMWs; we don't upgrade those to acquire
> > atm as they're written in asm, but we'd need volatile_cond() to work with
> > them. It's a shame, because we only have RCsc RMWs on arm64, so it would
> > be a bit more expensive.
> 
> Urgh, I see. Compiler can't really help in that case either I'm afraid.
> They'll never want to modify loads that originate in an asm(). They'll
> say to use the C11 _Atomic crud.

Indeed. That's partly what led me down the route of thinking about "control
ordering" to sit between relaxed and acquire. So you have READ_ONCE_CTRL()
instead of this, but then we can't play your asm goto trick.

If we could push the memory access _and_ the branch down into the new
volatile_if helper, a bit like we do for smp_cond_load_*(), that would
help but it makes the thing a lot harder to use.

In fact, maybe it's actually necessary to bundle the load and branch
together. I looked at some of the examples of compilers breaking control
dependencies from memory-barriers.txt and the "boolean short-circuit"
example seems to defeat volatile_if:

void foo(int *x, int *y)
{
        volatile_if (READ_ONCE(*x) || 1 > 0)
                WRITE_ONCE(*y, 42);
}  

Although we get a conditional branch emitted, it's headed by an immediate
move instruction and the result of the load is discarded:

  38:   d503233f        paciasp
  3c:   b940001f        ldr     wzr, [x0]
  40:   52800028        mov     w8, #0x1                        // #1
  44:   b5000068        cbnz    x8, 50 <foo+0x18>
  48:   d50323bf        autiasp
  4c:   d65f03c0        ret
  50:   d503249f        bti     j
  54:   52800548        mov     w8, #0x2a                       // #42
  58:   b9000028        str     w8, [x1]
  5c:   d50323bf        autiasp
  60:   d65f03c0        ret

Will

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 15:13         ` Will Deacon
@ 2021-06-04 15:22           ` Peter Zijlstra
  2021-06-04 15:36             ` Alan Stern
  2021-06-04 15:42             ` Peter Zijlstra
  0 siblings, 2 replies; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 15:22 UTC (permalink / raw)
  To: Will Deacon
  Cc: Linus Torvalds, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 04:13:57PM +0100, Will Deacon wrote:

> In fact, maybe it's actually necessary to bundle the load and branch
> together. I looked at some of the examples of compilers breaking control
> dependencies from memory-barriers.txt and the "boolean short-circuit"
> example seems to defeat volatile_if:
> 
> void foo(int *x, int *y)
> {
>         volatile_if (READ_ONCE(*x) || 1 > 0)
>                 WRITE_ONCE(*y, 42);
> }  

Yeah, I'm not too bothered about this. Broken is broken.

If this were a compiler feature, the above would be a compile error. But
alas, we're not there yet :/ and the best we get to say at this point
is: don't do that then.

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 11:44 ` Peter Zijlstra
  2021-06-04 14:13   ` Paul E. McKenney
@ 2021-06-04 15:35   ` Segher Boessenkool
  2021-06-04 16:10     ` Peter Zijlstra
  1 sibling, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-04 15:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, will, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 01:44:37PM +0200, Peter Zijlstra wrote:
> On naming (sorry Paul for forgetting that in the initial mail); while I
> think using the volatile qualifier for the language feature (can we haz
> plz, kthxbai) makes perfect sense, Paul felt that we might use a
> 'better' name for the kernel use, ctrl_dep_if() was proposed.

In standard C statements do not have qualifiers.  Unless you can
convince the ISO C committee to have them on "if", you will have a very
hard time convincing any serious compiler to do this.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 15:22           ` Peter Zijlstra
@ 2021-06-04 15:36             ` Alan Stern
  2021-06-04 15:42             ` Peter Zijlstra
  1 sibling, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-06-04 15:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, paulmck, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 05:22:04PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 04:13:57PM +0100, Will Deacon wrote:
> 
> > In fact, maybe it's actually necessary to bundle the load and branch
> > together. I looked at some of the examples of compilers breaking control
> > dependencies from memory-barriers.txt and the "boolean short-circuit"
> > example seems to defeat volatile_if:
> > 
> > void foo(int *x, int *y)
> > {
> >         volatile_if (READ_ONCE(*x) || 1 > 0)
> >                 WRITE_ONCE(*y, 42);
> > }  
> 
> Yeah, I'm not too bothered about this. Broken is broken.
> 
> If this were a compiler feature, the above would be a compile error. But
> alas, we're not there yet :/ and the best we get to say at this point
> is: don't do that then.

This is an example of a "syntactic" dependency versus a "semantic" 
dependency.  We shouldn't expect syntactic control dependencies to be 
preserved.

As a rule, people don't write non-semantic dependencies on purpose.  But 
they can occur in some situations, thanks to definitions the programmer 
isn't aware of.  One example would be:

(In some obscure header file): #define NUM_FOO 1

(Then in real code): if (READ_ONCE(*x) % NUM_FOO) ...

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 15:22           ` Peter Zijlstra
  2021-06-04 15:36             ` Alan Stern
@ 2021-06-04 15:42             ` Peter Zijlstra
  2021-06-04 15:51               ` Alan Stern
  1 sibling, 1 reply; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 15:42 UTC (permalink / raw)
  To: Will Deacon
  Cc: Linus Torvalds, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 05:22:04PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 04:13:57PM +0100, Will Deacon wrote:
> 
> > In fact, maybe it's actually necessary to bundle the load and branch
> > together. I looked at some of the examples of compilers breaking control
> > dependencies from memory-barriers.txt and the "boolean short-circuit"
> > example seems to defeat volatile_if:
> > 
> > void foo(int *x, int *y)
> > {
> >         volatile_if (READ_ONCE(*x) || 1 > 0)
> >                 WRITE_ONCE(*y, 42);
> > }  
> 
> Yeah, I'm not too bothered about this. Broken is broken.
> 
> If this were a compiler feature, the above would be a compile error. But
> alas, we're not there yet :/ and the best we get to say at this point
> is: don't do that then.

Ha! Fixed it for you:

#define volatile_if(cond) if (({ bool __t = (cond); BUILD_BUG_ON(__builtin_constant_p(__t)); volatile_cond(__t); }))


^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 11:31   ` Peter Zijlstra
  2021-06-04 13:44     ` Will Deacon
@ 2021-06-04 15:47     ` Segher Boessenkool
  1 sibling, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-04 15:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, paulmck, stern, parri.andrea,
	boqun.feng, npiggin, dhowells, j.alglave, luc.maranget, akiyks,
	linux-kernel, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 01:31:48PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 11:44:00AM +0100, Will Deacon wrote:
> > > +{
> > > +	asm_volatile_goto("cbnz %0, %l[l_yes]"
> > > +			  : : "r" (cond) : "cc", "memory" : l_yes);
> > > +	return 0;
> > > +l_yes:
> > > +	return 1;
> > > +}
> > 
> > nit: you don't need the "cc" clobber here.
> 
> Yeah I know, "cc" is implied.

It isn't needed at all here.  cbnz does not write to the condition
register.  Neither does it change or access memory, but the "memory"
clobber is to force a false dependency.  Writing "cc" as well looks a
bit confusing, given that.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 13:56       ` Peter Zijlstra
  2021-06-04 15:13         ` Will Deacon
@ 2021-06-04 15:50         ` Segher Boessenkool
  1 sibling, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-04 15:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, paulmck, stern, parri.andrea,
	boqun.feng, npiggin, dhowells, j.alglave, luc.maranget, akiyks,
	linux-kernel, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 03:56:16PM +0200, Peter Zijlstra wrote:
> Urgh, I see. Compiler can't really help in that case either I'm afraid.
> They'll never want to modify loads that originate in an asm().

We never *can* change an asm template.  That is part of the fundamental
properties of inline asm.  We cannot even parse it!


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 15:42             ` Peter Zijlstra
@ 2021-06-04 15:51               ` Alan Stern
  2021-06-04 16:17                 ` Peter Zijlstra
  0 siblings, 1 reply; 127+ messages in thread
From: Alan Stern @ 2021-06-04 15:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, paulmck, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 05:42:28PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 05:22:04PM +0200, Peter Zijlstra wrote:
> > On Fri, Jun 04, 2021 at 04:13:57PM +0100, Will Deacon wrote:
> > 
> > > In fact, maybe it's actually necessary to bundle the load and branch
> > > together. I looked at some of the examples of compilers breaking control
> > > dependencies from memory-barriers.txt and the "boolean short-circuit"
> > > example seems to defeat volatile_if:
> > > 
> > > void foo(int *x, int *y)
> > > {
> > >         volatile_if (READ_ONCE(*x) || 1 > 0)
> > >                 WRITE_ONCE(*y, 42);
> > > }  
> > 
> > Yeah, I'm not too bothered about this. Broken is broken.
> > 
> > If this were a compiler feature, the above would be a compile error. But
> > alas, we're not there yet :/ and the best we get to say at this point
> > is: don't do that then.
> 
> Ha! Fixed it for you:
> 
> #define volatile_if(cond) if (({ bool __t = (cond); BUILD_BUG_ON(__builtin_constant_p(__t)); volatile_cond(__t); }))

That won't help with more complicated examples, such as:

	volatile_if (READ_ONCE(*x) * 0 + READ_ONCE(*y))

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 10:12 [RFC] LKMM: Add volatile_if() Peter Zijlstra
                   ` (2 preceding siblings ...)
  2021-06-04 14:25 ` Alan Stern
@ 2021-06-04 16:09 ` Segher Boessenkool
  2021-06-04 16:33   ` Peter Zijlstra
  2021-06-04 16:30 ` Linus Torvalds
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-04 16:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, will, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

Hi!

On Fri, Jun 04, 2021 at 12:12:07PM +0200, Peter Zijlstra wrote:
> With optimizing compilers becoming more and more aggressive and C so far
> refusing to acknowledge the concept of control-dependencies even while
> we keep growing the amount of reliance on them, things will eventually
> come apart.

Yes, C is still not a portable assembler.

> There have been talks with toolchain people on how to resolve this; one
> suggestion was allowing the volatile qualifier on branch statements like
> 'if', but so far no actual compiler has made any progress on this.

"if" is not a "branch statement".

> --- a/arch/powerpc/include/asm/barrier.h
> +++ b/arch/powerpc/include/asm/barrier.h
> @@ -80,6 +80,19 @@ do {									\
>  	___p1;								\
>  })
>  
> +#ifndef __ASSEMBLY__
> +/* Guarantee a conditional branch that depends on @cond. */
> +static __always_inline bool volatile_cond(bool cond)
> +{
> +	asm_volatile_goto("and. %0,%0,%0; bne %l[l_yes]"
> +			  : : "r" (cond) : "cc", "memory" : l_yes);
> +	return false;
> +l_yes:
> +	return true;
> +}
> +#define volatile_cond volatile_cond
> +#endif

"cmpwi" is ever so slightly better than "and.".  And you can write "cr0"
instead of "cc" more explicitly (it means the same thing though).


I didn't find a description of the expected precise semantics anywhere
in this patch.  This however is the most important thing required here!


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 15:35   ` Segher Boessenkool
@ 2021-06-04 16:10     ` Peter Zijlstra
  2021-06-04 16:40       ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 16:10 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Linus Torvalds, will, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 10:35:18AM -0500, Segher Boessenkool wrote:
> On Fri, Jun 04, 2021 at 01:44:37PM +0200, Peter Zijlstra wrote:
> > On naming (sorry Paul for forgetting that in the initial mail); while I
> > think using the volatile qualifier for the language feature (can we haz
> > plz, kthxbai) makes perfect sense, Paul felt that we might use a
> > 'better' name for the kernel use, ctrl_dep_if() was proposed.
> 
> In standard C statements do not have qualifiers.  Unless you can
> convince the ISO C committee to have them on "if", you will have a very
> hard time convincing any serious compiler to do this.

While some people like talking to the Committee, I would much rather
explore language extensions with the compiler communities. Such
extensions can then make their way into the Committee once they show
their usefulness.

The whole statement qualifier thing was something that was proposed by a
tools person, although I can't really remember who. I'm not much married
to it, but since it's been the only actual suggestion from a tools
person, it's stuck.

If you have another proposal on how to express this; one you'd rather
see implemented, I'm all ears.

But the fact is that we really do depend on this. And we seem to be
growing more of them, not less.

Data dependencies, control dependencies and address dependencies, C
doesn't really like them, we rely on them. It would be awesome if we can
fix this.

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 15:51               ` Alan Stern
@ 2021-06-04 16:17                 ` Peter Zijlstra
  2021-06-04 18:27                   ` Alan Stern
  0 siblings, 1 reply; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 16:17 UTC (permalink / raw)
  To: Alan Stern
  Cc: Will Deacon, Linus Torvalds, paulmck, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 11:51:54AM -0400, Alan Stern wrote:
> On Fri, Jun 04, 2021 at 05:42:28PM +0200, Peter Zijlstra wrote:

> > #define volatile_if(cond) if (({ bool __t = (cond); BUILD_BUG_ON(__builtin_constant_p(__t)); volatile_cond(__t); }))
> 
> That won't help with more complicated examples, such as:
> 
> 	volatile_if (READ_ONCE(*x) * 0 + READ_ONCE(*y))

That's effectively:

	volatile_if (READ_ONCE(*y))
		WRITE_ONCE(*y, 42);

which is a valid, but daft, LOAD->STORE order, no? A compiler might
maybe be able to WARN on that, but that's definitely beyond what we can
do with macros.

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 10:12 [RFC] LKMM: Add volatile_if() Peter Zijlstra
                   ` (3 preceding siblings ...)
  2021-06-04 16:09 ` Segher Boessenkool
@ 2021-06-04 16:30 ` Linus Torvalds
  2021-06-04 16:37   ` Peter Zijlstra
  2021-06-08 12:48 ` David Laight
  2021-09-24 18:38 ` Mathieu Desnoyers
  6 siblings, 1 reply; 127+ messages in thread
From: Linus Torvalds @ 2021-06-04 16:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Paul E. McKenney, Alan Stern, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 4, 2021 at 3:12 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> I've converted most architectures we care about, and the rest will get
> an extra smp_mb() by means of the 'generic' fallback implementation (for
> now).

Why is "volatile_if()" not just

       #define barrier_true() ({ barrier(); 1; })

       #define volatile_if(x) if ((x) && barrier_true())

because that should essentially cause the same thing - the compiler
should be *forced* to create one conditional branch (because "barrier"
is an asm that can't be done on the false side, so it can't do it with
arithmetic or other games), and after that we're done.

No need for per-architecture "asm goto" games. No new memory barriers.
No actual new code generation (except for the empty asm volatile that
is a barrier).

              Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 16:09 ` Segher Boessenkool
@ 2021-06-04 16:33   ` Peter Zijlstra
  0 siblings, 0 replies; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 16:33 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Linus Torvalds, will, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 11:09:55AM -0500, Segher Boessenkool wrote:
> I didn't find a description of the expected precise semantics anywhere
> in this patch.  This however is the most important thing required here!

Fair enough; so a control-dependency is a LOAD->STORE memory ordering
provided by conditional branches.

The conditional branch instruction itself must have a data dependency on
a previous LOAD, while the branch itself guards a STORE. Then because
speculative STOREs result in out-of-thin-air values, the STORE must not
become visible until the branch is decided, which can only be done if
the LOAD is complete.

We make use of this, and would like the compiler to not ruin this code
pattern for us.

So we need the STORE to stay inside the selection statement, we need the
LOAD not to be optimized away, and we need the conditional branch to be
emitted.

Alternatively, we need the LOAD to be upgraded to a LOAD-ACQUIRE (an
option on platforms where this is sufficiently cheap). Which will also
ensure the STORE happens after.

So we can force the LOAD using READ_ONCE() (a volatile cast).

We can prohibit hoisting by adding a compiler barrier to the expression.
And then we use asm goto() to force emit a conditional branch. Combined
this leaves the compiler very little room to mess things up, but it does
produce sub-optimal code, and doesn't allow the LOAD-ACQUIRE upgrade
Will would like (but he can't always have that anyway due to our other
use of asm()).


We also have a 'CONTROL DEPENDENCIES' section in
Documentation/memory-barriers.txt for further reading.

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 16:30 ` Linus Torvalds
@ 2021-06-04 16:37   ` Peter Zijlstra
  2021-06-04 16:52     ` Segher Boessenkool
  2021-06-04 17:10     ` Linus Torvalds
  0 siblings, 2 replies; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 16:37 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Will Deacon, Paul E. McKenney, Alan Stern, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 09:30:01AM -0700, Linus Torvalds wrote:
> On Fri, Jun 4, 2021 at 3:12 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > I've converted most architectures we care about, and the rest will get
> > an extra smp_mb() by means of the 'generic' fallback implementation (for
> > now).
> 
> Why is "volatile_if()" not just
> 
> >        #define barrier_true() ({ barrier(); 1; })
> 
>        #define volatile_if(x) if ((x) && barrier_true())
> 
> because that should essentially cause the same thing - the compiler
> should be *forced* to create one conditional branch (because "barrier"
> is an asm that can't be done on the false side, so it can't do it with
> arithmetic or other games), and after that we're done.
> 
> No need for per-architecture "asm goto" games. No new memory barriers.
> No actual new code generation (except for the empty asm volatile that
> is a barrier).

Because we weren't sure compilers weren't still allowed to optimize the
branch away. If compiler folks can guarantee us your thing (along with
maybe the BUILD_BUG_ON(__builtin_constant_p(cond)) thing) always shall
generate a conditional branch instruction, then Yay!

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 16:10     ` Peter Zijlstra
@ 2021-06-04 16:40       ` Segher Boessenkool
  2021-06-04 18:55         ` Paul E. McKenney
  0 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-04 16:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, will, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 06:10:55PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 10:35:18AM -0500, Segher Boessenkool wrote:
> > On Fri, Jun 04, 2021 at 01:44:37PM +0200, Peter Zijlstra wrote:
> > > On naming (sorry Paul for forgetting that in the initial mail); while I
> > > think using the volatile qualifier for the language feature (can we haz
> > > plz, kthxbai) makes perfect sense, Paul felt that we might use a
> > > 'better' name for the kernel use, ctrl_dep_if() was proposed.
> > 
> > In standard C statements do not have qualifiers.  Unless you can
> > convince the ISO C committee to have them on "if", you will have a very
> > hard time convincing any serious compiler to do this.
> 
> While some people like talking to the Committee, I would much rather
> explore language extensions with the compiler communities. Such
> extensions can then make their way into the Committee once they show
> their usefulness.

My point is that you ask compiler developers to paint themselves into a
corner if you ask them to change such fundamental C syntax.

> If you have another proposal on how to express this; one you'd rather
> see implemented, I'm all ears.

I would love to see something that meshes well with the rest of C.  But
there is no 1-1 translation from C code to machine code (not in either
direction), so anything that more or less depends on that will always
be awkward.  If you can actually express the dependency in your source
code that will get us 95% to where we want to be.

> Data dependencies, control dependencies and address dependencies, C
> doesn't really like them, we rely on them. It would be awesome if we can
> fix this.

Yes.  The problem is that C is a high-level language.  All C semantics
are expressed on an "as-if" level, never as "do this, then that" --
well, of course that *is* what it says, it's an imperative language just
like most, but that is just how you *think* about things on a conceptual
level, there is nothing that says the machine code has to do the same
thing in the same order as you wrote!


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 16:37   ` Peter Zijlstra
@ 2021-06-04 16:52     ` Segher Boessenkool
  2021-06-04 17:10     ` Linus Torvalds
  1 sibling, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-04 16:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Will Deacon, Paul E. McKenney, Alan Stern,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 06:37:22PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 09:30:01AM -0700, Linus Torvalds wrote:
> > Why is "volatile_if()" not just
> > 
> >        #define barrier_true() ({ barrier(); 1; })
> > 
> >        #define volatile_if(x) if ((x) && barrier_true())
> > 
> > because that should essentially cause the same thing - the compiler
> > should be *forced* to create one conditional branch (because "barrier"
> > is an asm that can't be done on the false side, so it can't do it with
> > arithmetic or other games), and after that we're done.
> > 
> > No need for per-architecture "asm goto" games. No new memory barriers.
> > No actual new code generation (except for the empty asm volatile that
> > is a barrier).
> 
> Because we weren't sure compilers weren't still allowed to optimize the
> branch away.

barrier_true is a volatile asm, so it should be executed on the real
machine exactly as often as on the abstract machine (and in order with
other side effects).  And the && short-circuits, so you will always have
the same effect as a branch.  But there of course is nothing that forces
there to be a branch (as a silly example, the compiler could convert
some control flow to go via computed return addresses).


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 16:37   ` Peter Zijlstra
  2021-06-04 16:52     ` Segher Boessenkool
@ 2021-06-04 17:10     ` Linus Torvalds
  2021-06-04 17:24       ` Segher Boessenkool
  2021-06-04 18:23       ` Alan Stern
  1 sibling, 2 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-04 17:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Paul E. McKenney, Alan Stern, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 4, 2021 at 9:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> >
> > Why is "volatile_if()" not just
> >
> >        #define barrier_true() ({ barrier(); 1; })
> >
> >        #define volatile_if(x) if ((x) && barrier_true())
>
> Because we weren't sure compilers weren't still allowed to optimize the
> branch away.

This isn't about some "compiler folks think".

The above CANNOT be compiled any other way than with a branch.

A compiler that optimizes a branch away is simply broken.

Of course, the actual condition (ie "x" above) has to be something
that the compiler cannot statically determine is a constant, but since
the whole - and only - point is that there will be a READ_ONCE() or
similar there, that's not an issue.

The compiler *cannot* just say "oh, I'll do that 'volatile asm
barrier' whether the condition is true or not". That would be a
fundamental compiler bug.

It's as if we wrote

    if (x) y++;

and the compiler went "Oh, I'll just increment 'y' unconditionally by
one, I'm sure the programmer doesn't mind, the conditional on 'x' is
immaterial".

No. That's not a C compiler. That's a stinking piece of buggy shit.
The compiler has to honor the conditional.

In that "y++" case, a compiler can decide to do it without a branch,
and basically rewrite the above as

   y += !!x;

but with a "volatile asm", that would be a bug.

Of course, we might want to make sure that the compiler doesn't go
"oh, empty asm, I can ignore it", but if that's the case then it's not
about "volatile_if()" any more, at that point it's "oh, the compiler
broke our 'barrier()' implementation", and we have bigger issues.

              Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 17:10     ` Linus Torvalds
@ 2021-06-04 17:24       ` Segher Boessenkool
  2021-06-04 17:38         ` Linus Torvalds
  2021-06-04 19:17         ` Peter Zijlstra
  2021-06-04 18:23       ` Alan Stern
  1 sibling, 2 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-04 17:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Will Deacon, Paul E. McKenney, Alan Stern,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 10:10:29AM -0700, Linus Torvalds wrote:
> The compiler *cannot* just say "oh, I'll do that 'volatile asm
> barrier' whether the condition is true or not". That would be a
> fundamental compiler bug.

Yes.

> Of course, we might want to make sure that the compiler doesn't go
> "oh, empty asm, I can ignore it",

It isn't allowed to do that.  GCC has this arguable misfeature where it
doesn't show empty asm in the assembler output, but that has no bearing
on anything but how human-readable the output is.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 17:24       ` Segher Boessenkool
@ 2021-06-04 17:38         ` Linus Torvalds
  2021-06-04 18:25           ` Segher Boessenkool
  2021-06-04 19:17         ` Peter Zijlstra
  1 sibling, 1 reply; 127+ messages in thread
From: Linus Torvalds @ 2021-06-04 17:38 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Peter Zijlstra, Will Deacon, Paul E. McKenney, Alan Stern,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jun 4, 2021 at 10:27 AM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> > Of course, we might want to make sure that the compiler doesn't go
> > "oh, empty asm, I can ignore it",
>
> It isn't allowed to do that.  GCC has this arguable misfeature where it
> doesn't show empty asm in the assembler output, but that has no bearing
> on anything but how human-readable the output is.

That sounds about right, but we have had people talking about the
compiler looking inside the asm string before.

So it worries me that some compiler person might at some point go all
breathy-voice on us and say "I am altering the deal. Pray I don't
alter it any further".

Side note: when grepping for what "barrier()" does on different
architectures and different compilers, I note that yes, it really is
just an empty asm volatile with a "memory" barrier. That should in all
ways be sufficient.

BUT.

There's this really odd comment in <linux/compiler-intel.h> that talks
about some "ECC" compiler:

  /* Intel ECC compiler doesn't support gcc specific asm stmts.
   * It uses intrinsics to do the equivalent things.
   */

and it defines it as "__memory_barrier()". This seems to be an ia64 thing, but:

 - I cannot get google to find me any documentation on such an intrinsic

 - it seems to be bogus anyway, since we have "asm volatile" usage in
at least arch/ia64/mm/tlb.c

So I do note that "barrier()" has an odd definition in one odd ia64
case, and I can't find the semantics for it.

Admittedly I also cannot find it in myself to care. I don't think that
"Intel ECC" compiler case actually exists, and even if it does I don't
think itanium is relevant any more. But it was an odd detail on what
"barrier()" actually might mean to the compiler.

              Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 17:10     ` Linus Torvalds
  2021-06-04 17:24       ` Segher Boessenkool
@ 2021-06-04 18:23       ` Alan Stern
  1 sibling, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-06-04 18:23 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Will Deacon, Paul E. McKenney, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 10:10:29AM -0700, Linus Torvalds wrote:
> On Fri, Jun 4, 2021 at 9:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > >
> > > Why is "volatile_if()" not just
> > >
> > >        #define barrier_true() ({ barrier(); 1; })
> > >
> > >        #define volatile_if(x) if ((x) && barrier_true())
> >
> > Because we weren't sure compilers weren't still allowed to optimize the
> > branch away.
> 
> This isn't about some "compiler folks think".
> 
> The above CANNOT be compiled any other way than with a branch.
> 
> A compiler that optimizes a branch away is simply broken.
> 
> Of course, the actual condition (ie "x" above) has to be something
> that the compiler cannot statically determine is a constant, but since
> the whole - and only - point is that there will be a READ_ONCE() or
> similar there, that's not an issue.

In fact there is one weird case where it is an issue (mentioned in 
memory-barriers.txt):

If some obscure arch-specific header file does:

	#define FOO	1

and an unwitting programmer writes:

	volatile_if (READ_ONCE(*y) % FOO == 0)
		WRITE_ONCE(*z, 5);

then the compiler _can_ statically determine that the condition is a 
constant, in spite of the READ_ONCE, but this fact isn't apparent to the 
programmer.  The generated object code will include both the read and 
the write, but there won't necessarily be any ordering between them.

I don't know if cases like this exist in the kernel.  It wouldn't be 
surprising if they did though, particularly in situations where a 
feature (like multi-level page tables) may be compiled away.

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 17:38         ` Linus Torvalds
@ 2021-06-04 18:25           ` Segher Boessenkool
  0 siblings, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-04 18:25 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Will Deacon, Paul E. McKenney, Alan Stern,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

Hi!

On Fri, Jun 04, 2021 at 10:38:43AM -0700, Linus Torvalds wrote:
> On Fri, Jun 4, 2021 at 10:27 AM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> > > Of course, we might want to make sure that the compiler doesn't go
> > > "oh, empty asm, I can ignore it",
> >
> > It isn't allowed to do that.  GCC has this arguable misfeature where it
> > doesn't show empty asm in the assembler output, but that has no bearing
> > on anything but how human-readable the output is.
> 
> That sounds about right, but we have had people talking about the
> compiler looking inside the asm string before.
> 
> So it worries me that some compiler person might at some point go all
> breathy-voice on us and say "I am altering the deal. Pray I don't
> alter it any further".

GCC will never do that.  And neither will any other compiler that claims
to implement the GCC asm extensions, if they are true to their word.

GCC *does* look inside the assembler template to estimate what code size
this asm will generate, and it tries to be pessimistic about its
estimate so that this will always work, but it always is possible to
mislead the compiler here, precisely because it does not actually
pretend it understands assembler code (think .irp or anything with
assembler macros for example).  In very rare cases this leads to
(assembler) errors ("jump target out of range", that kind of thing).
The most effective workaround is to write less silly code ;-)  And of
course this is documented, see
<https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html>

> Side note: when grepping for what "barrier()" does on different
> architectures and different compilers, I note that yes, it really is
> just an empty asm volatile with a "memory" barrier. That should in all
> ways be sufficient.
> 
> BUT.
> 
> There's this really odd comment in <linux/compiler-intel.h> that talks
> about some "ECC" compiler:
> 
>   /* Intel ECC compiler doesn't support gcc specific asm stmts.
>    * It uses intrinsics to do the equivalent things.
>    */
> 
> and it defines it as "__memory_barrier()". This seems to be an ia64 thing, but:

"ecc" apparently was "icc" but for Itanium.  It ceased to exist some
time in the 2.4 era apparently.  It was still used in 2003.  Searching
for "ecpc" (the C++ compiler driver) will find a bit more.

>  - I cannot get google to find me any documentation on such an intrinsic
> 
>  - it seems to be bogus anyway, since we have "asm volatile" usage in
> at least arch/ia64/mm/tlb.c
> 
> So I do note that "barrier()" has an odd definition in one odd ia64
> case, and I can't find the semantics for it.
> 
> Admittedly I also cannot find it in myself to care.

Yeah, I love code archaeology, but I have work to do as well :-)

> I don't think that
> "Intel ECC" compiler case actually exists, and even if it does I don't
> think itanium is relevant any more. But it was an odd detail on what
> "barrier()" actually might mean to the compiler.

:-)


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 16:17                 ` Peter Zijlstra
@ 2021-06-04 18:27                   ` Alan Stern
  2021-06-04 19:09                     ` Linus Torvalds
  0 siblings, 1 reply; 127+ messages in thread
From: Alan Stern @ 2021-06-04 18:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, paulmck, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 06:17:20PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 11:51:54AM -0400, Alan Stern wrote:
> > On Fri, Jun 04, 2021 at 05:42:28PM +0200, Peter Zijlstra wrote:
> 
> > > #define volatile_if(cond) if (({ bool __t = (cond); BUILD_BUG_ON(__builtin_constant_p(__t)); volatile_cond(__t); }))
> > 
> > That won't help with more complicated examples, such as:
> > 
> > 	volatile_if (READ_ONCE(*x) * 0 + READ_ONCE(*y))
> 
> That's effectively:
> 
> 	volatile_if (READ_ONCE(*y))
> 		WRITE_ONCE(*y, 42);

Sorry, what I meant to write was:

	volatile_if (READ_ONCE(*x) * 0 + READ_ONCE(*y))
		WRITE_ONCE(*z, 42);

where there is no ordering between *x and *z.  It's not daft, and yes, a 
macro won't be able to warn about it.

Alan

> which is a valid, but daft, LOAD->STORE order, no? A compiler might
> maybe be able to WARN on that, but that's definitely beyond what we can
> do with macros.

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 16:40       ` Segher Boessenkool
@ 2021-06-04 18:55         ` Paul E. McKenney
  2021-06-04 19:53           ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-04 18:55 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Peter Zijlstra, Linus Torvalds, will, stern, parri.andrea,
	boqun.feng, npiggin, dhowells, j.alglave, luc.maranget, akiyks,
	linux-kernel, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 11:40:47AM -0500, Segher Boessenkool wrote:
> On Fri, Jun 04, 2021 at 06:10:55PM +0200, Peter Zijlstra wrote:
> > On Fri, Jun 04, 2021 at 10:35:18AM -0500, Segher Boessenkool wrote:
> > > On Fri, Jun 04, 2021 at 01:44:37PM +0200, Peter Zijlstra wrote:
> > > > On naming (sorry Paul for forgetting that in the initial mail); while I
> > > > think using the volatile qualifier for the language feature (can we haz
> > > > plz, kthxbai) makes perfect sense, Paul felt that we might use a
> > > > 'better' name for the kernel use, ctrl_dep_if() was proposed.
> > > 
> > > In standard C statements do not have qualifiers.  Unless you can
> > > convince the ISO C committee to have them on "if", you will have a very
> > > hard time convincing any serious compiler to do this.
> > 
> > While some people like talking to the Committee, I would much rather
> > explore language extensions with the compiler communities. Such
> > extensions can then make their way into the Committee once they show
> > their usefulness.
> 
> My point is that you ask compiler developers to paint themselves into a
> corner if you ask them to change such fundamental C syntax.

Once we have some experience with a language extension, the official
syntax for a standardized version of that extension can be bikeshedded.
Committees being what they are, what we use in the meantime will
definitely not be what is chosen, so there is not a whole lot of point
in worrying about the exact syntax in the meantime.  ;-)

> > If you have another proposal on how to express this; one you'd rather
> > see implemented, I'm all ears.
> 
> I would love to see something that meshes well with the rest of C.  But
> there is no 1-1 translation from C code to machine code (not in either
> direction), so anything that more or less depends on that will always
> be awkward.  If you can actually express the dependency in your source
> code that will get us 95% to where we want to be.
> 
> > Data dependencies, control dependencies and address dependencies, C
> > doesn't really like them, we rely on them. It would be awesome if we can
> > fix this.
> 
> Yes.  The problem is that C is a high-level language.  All C semantics
> are expressed on an "as-if" level, never as "do this, then that" --
> well, of course that *is* what it says, it's an imperative language just
> like most, but that is just how you *think* about things on a conceptual
> level, there is nothing that says the machine code has to do the same
> thing in the same order as you wrote!

Which is exactly why these conversations are often difficult.  There is
a tension between pushing the as-if rule as far as possible within the
compiler on the one hand and allowing developers to write code that does
what is needed on the other.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 18:27                   ` Alan Stern
@ 2021-06-04 19:09                     ` Linus Torvalds
  2021-06-04 19:18                       ` Linus Torvalds
  2021-06-05  3:14                       ` Alan Stern
  0 siblings, 2 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-04 19:09 UTC (permalink / raw)
  To: Alan Stern
  Cc: Peter Zijlstra, Will Deacon, Paul E. McKenney, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 4, 2021 at 11:27 AM Alan Stern <stern@rowland.harvard.edu> wrote:
>
>         volatile_if (READ_ONCE(*x) * 0 + READ_ONCE(*y))
>                 WRITE_ONCE(*z, 42);
>
> where there is no ordering between *x and *z.

I wouldn't worry about it.

I think a compiler is allowed to optimize away stupid code.

I get upset when a compiler says "oh, that's undefined, so I will
ignore the obvious meaning of it", but that's a different thing
entirely.

I really wish that the C standards group showed some spine, and said
"there is no undefined, there is only implementation-defined". That
would solve a *lot* of problems.

But I also realize that will never happen. Because "spine" and "good
taste" is not something that I've ever heard of happening in an
industry standards committee.

Side note: it is worth noting that my version of "volatile_if()" has
an added little quirk: it _ONLY_ orders the stuff inside the
if-statement.

I do think it's worth not adding new special cases (especially that
"asm goto" hack that will generate worse code than the compiler could
do), but it means that

    x = READ_ONCE(ptr);
    volatile_if (x > 0)
        WRITE_ONCE(*z, 42);

has an ordering, but if you write it as

    x = READ_ONCE(ptr);
    volatile_if (x <= 0)
        return;
    WRITE_ONCE(*z, 42);

then I could in theory see the compiler doing that WRITE_ONCE() as
some kind of non-control dependency.

That said, I don't actually see how the compiler could do anything
that actually broke the _semantics_ of the code. Yes, it could do the
write using a magical data dependency on the conditional and turning
it into a store on a conditional address instead (before doing the
branch), but honestly, I don't see how that would actually break
anything.

So this is more of a "in theory, the two sides are not symmetric". The
"asm volatile" in a barrier() will force the compiler to generate the
branch, and the memory clobber in barrier() will most certainly force
any stores inside the "volatile_if()" to be after the branch.

But because the memory clobber is only inside the if-statement true
case, the false case could have the compiler migrate any code in that
false thing to before the if.

Again, semantics do matter, and I don't see how the compiler could
actually break the fundamental issue of "load->conditional->store is a
fundamental ordering even without memory barriers because of basic
causality", because you can't just arbitrarily generate speculative
stores that would be visible to others.

But at the same time, that's *such* a fundamental rule that I really
am intrigued why people think "volatile_if()" is needed in reality (as
opposed to some "in theory, the compiler can know things that are
unknowable thanks to a magical oracle" BS argument)

             Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 17:24       ` Segher Boessenkool
  2021-06-04 17:38         ` Linus Torvalds
@ 2021-06-04 19:17         ` Peter Zijlstra
  2021-06-04 20:43           ` Paul E. McKenney
  1 sibling, 1 reply; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 19:17 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Linus Torvalds, Will Deacon, Paul E. McKenney, Alan Stern,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 12:24:07PM -0500, Segher Boessenkool wrote:
> On Fri, Jun 04, 2021 at 10:10:29AM -0700, Linus Torvalds wrote:
> > The compiler *cannot* just say "oh, I'll do that 'volatile asm
> > barrier' whether the condition is true or not". That would be a
> > fundamental compiler bug.
> 
> Yes.

So we can all agree on something like this?

#define volatile_if(x) \
	if (({ _Bool __x = (x); BUILD_BUG_ON(__builtin_constant_p(__x)); __x; }) && \
	    ({ barrier(); 1; }))

Do we keep volatile_if() or do we like ctrl_dep_if() better?

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 19:09                     ` Linus Torvalds
@ 2021-06-04 19:18                       ` Linus Torvalds
  2021-06-04 20:56                         ` Paul E. McKenney
  2021-06-05  3:14                       ` Alan Stern
  1 sibling, 1 reply; 127+ messages in thread
From: Linus Torvalds @ 2021-06-04 19:18 UTC (permalink / raw)
  To: Alan Stern
  Cc: Peter Zijlstra, Will Deacon, Paul E. McKenney, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 4, 2021 at 12:09 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Again, semantics do matter, and I don't see how the compiler could
> actually break the fundamental issue of "load->conditional->store is a
> fundamental ordering even without memory barriers because of basic
> causality", because you can't just arbitrarily generate speculative
> stores that would be visible to others.

This, after all, is why we trust that the *hardware* can't do it.

Even if the hardware mis-speculates and goes down the wrong branch,
and speculatively does the store when it shouldn't have, we don't
care: we know that such a speculative store can not possibly become
semantically visible (*) to other threads.

For all the same reasons, I don't see how a compiler can violate
causal ordering of the code (assuming, again, that the test is
_meaningful_ - if we write nonsensical code, that's a different
issue).

If we have compilers that create speculative stores that are visible
to other threads, we need to fix them.

               Linus

(*) By "semantically visible" I intend to avoid the whole timing/cache
pattern kind of non-semantic visibility that is all about the spectre
leakage kind of things.

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 18:55         ` Paul E. McKenney
@ 2021-06-04 19:53           ` Segher Boessenkool
  2021-06-04 20:40             ` Paul E. McKenney
  0 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-04 19:53 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Linus Torvalds, will, stern, parri.andrea,
	boqun.feng, npiggin, dhowells, j.alglave, luc.maranget, akiyks,
	linux-kernel, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 11:55:26AM -0700, Paul E. McKenney wrote:
> On Fri, Jun 04, 2021 at 11:40:47AM -0500, Segher Boessenkool wrote:
> > My point is that you ask compiler developers to paint themselves into a
> > corner if you ask them to change such fundamental C syntax.
> 
> Once we have some experience with a language extension, the official
> syntax for a standardized version of that extension can be bikeshedded.
> Committees being what they are, what we use in the meantime will
> definitely not be what is chosen, so there is not a whole lot of point
> in worrying about the exact syntax in the meantime.  ;-)

I am only saying that it is unlikely any compiler that is used in
production will want to experiment with "volatile if".

> > I would love to see something that meshes well with the rest of C.  But
> > there is no 1-1 translation from C code to machine code (not in either
> > direction), so anything that more or less depends on that will always
> > be awkward.  If you can actually express the dependency in your source
> > code that will get us 95% to where we want to be.

^^^

> > > Data dependencies, control dependencies and address dependencies, C
> > > doesn't really like them, we rely on them. It would be awesome if we can
> > > fix this.
> > 
> > Yes.  The problem is that C is a high-level language.  All C semantics
> > are expressed on an "as-if" level, never as "do this, then that" --
> > well, of course that *is* what it says, it's an imperative language just
> > like most, but that is just how you *think* about things on a conceptual
> > level, there is nothing that says the machine code has to do the same
> > thing in the same order as you wrote!
> 
> Which is exactly why these conversations are often difficult.  There is
> a tension between pushing the as-if rule as far as possible within the
> compiler on the one hand and allowing developers to write code that does
> what is needed on the other.  ;-)

There is a tension between what users expect from the compiler and what
actually is promised.  The compiler is not pushing the as-if rule any
further than it always has: it just becomes better at optimising over
time.  The as-if rule is and always has been absolute.

What is needed to get any progress is for user expectations to be
feasible and not contradict existing requirements.  See "^^^" above.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 19:53           ` Segher Boessenkool
@ 2021-06-04 20:40             ` Paul E. McKenney
  2021-06-06 11:36               ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-04 20:40 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Peter Zijlstra, Linus Torvalds, will, stern, parri.andrea,
	boqun.feng, npiggin, dhowells, j.alglave, luc.maranget, akiyks,
	linux-kernel, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 02:53:01PM -0500, Segher Boessenkool wrote:
> On Fri, Jun 04, 2021 at 11:55:26AM -0700, Paul E. McKenney wrote:
> > On Fri, Jun 04, 2021 at 11:40:47AM -0500, Segher Boessenkool wrote:
> > > My point is that you ask compiler developers to paint themselves into a
> > > corner if you ask them to change such fundamental C syntax.
> > 
> > Once we have some experience with a language extension, the official
> > syntax for a standardized version of that extension can be bikeshedded.
> > Committees being what they are, what we use in the meantime will
> > definitely not be what is chosen, so there is not a whole lot of point
> > in worrying about the exact syntax in the meantime.  ;-)
> 
> I am only saying that it is unlikely any compiler that is used in
> production will want to experiment with "volatile if".

That unfortunately matches my experience over quite a few years.  But if
something can be implemented using existing extensions, the conversations
often get easier.  Especially given many more people are now familiar
with concurrency.

> > > I would love to see something that meshes well with the rest of C.  But
> > > there is no 1-1 translation from C code to machine code (not in either
> > > direction), so anything that more or less depends on that will always
> > > be awkward.  If you can actually express the dependency in your source
> > > code that will get us 95% to where we want to be.
> 
> ^^^
> 
> > > > Data dependencies, control dependencies and address dependencies, C
> > > > doesn't really like them, we rely on them. It would be awesome if we can
> > > > fix this.
> > > 
> > > Yes.  The problem is that C is a high-level language.  All C semantics
> > > are expressed on an "as-if" level, never as "do this, then that" --
> > > well, of course that *is* what it says, it's an imperative language just
> > > like most, but that is just how you *think* about things on a conceptual
> > > level, there is nothing that says the machine code has to do the same
> > > thing in the same order as you wrote!
> > 
> > Which is exactly why these conversations are often difficult.  There is
> > a tension between pushing the as-if rule as far as possible within the
> > compiler on the one hand and allowing developers to write code that does
> > what is needed on the other.  ;-)
> 
> There is a tension between what users expect from the compiler and what
> actually is promised.  The compiler is not pushing the as-if rule any
> further than it always has: it just becomes better at optimising over
> time.  The as-if rule is and always has been absolute.

Heh!  The fact that the compiler has become better at optimizing
over time is exactly what has been pushing the as-if rule further.

The underlying problem is that it is often impossible to write large
applications (such as the Linux kernel) completely within the confines of
the standard.  Thus, most large applications, and especially concurrent
applications, are vulnerable to either the compiler becoming better
at optimizing or compilers pushing the as-if rule, however you want to
say it.

> What is needed to get any progress is for user expectations to be
> feasible and not contradict existing requirements.  See "^^^" above.

Or additional requirements need to be accepted by the various compilation
powers that be.  Failing to acknowledge valid new user expectations is
after all an excellent path to obsolescence.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 19:17         ` Peter Zijlstra
@ 2021-06-04 20:43           ` Paul E. McKenney
  0 siblings, 0 replies; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-04 20:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Segher Boessenkool, Linus Torvalds, Will Deacon, Alan Stern,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 09:17:56PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 04, 2021 at 12:24:07PM -0500, Segher Boessenkool wrote:
> > On Fri, Jun 04, 2021 at 10:10:29AM -0700, Linus Torvalds wrote:
> > > The compiler *cannot* just say "oh, I'll do that 'volatile asm
> > > barrier' whether the condition is true or not". That would be a
> > > fundamental compiler bug.
> > 
> > Yes.
> 
> So we can all agree on something like this?
> 
> #define volatile_if(x) \
> 	if (({ _Bool __x = (x); BUILD_BUG_ON(__builtin_constant_p(__x)); __x; }) && \
> 	    ({ barrier(); 1; }))

As long as this prevents compilers from causing trouble with things like
conditional-move instructions, I am good.  I don't know that this trouble
actually exists, but I never have been able to get official confirmation
one way or the other.  :-/

> Do we keep volatile_if() or do we like ctrl_dep_if() better?

I like ctrl_dep_if() because that is what it does, but I don't feel all
that strongly about it.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 19:18                       ` Linus Torvalds
@ 2021-06-04 20:56                         ` Paul E. McKenney
  2021-06-04 21:27                           ` Linus Torvalds
  0 siblings, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-04 20:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 12:18:43PM -0700, Linus Torvalds wrote:
> On Fri, Jun 4, 2021 at 12:09 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > Again, semantics do matter, and I don't see how the compiler could
> > actually break the fundamental issue of "load->conditional->store is a
> > fundamental ordering even without memory barriers because of basic
> > causality", because you can't just arbitrarily generate speculative
> > stores that would be visible to others.
> 
> This, after all, is why we trust that the *hardware* can't do it.
> 
> Even if the hardware mis-speculates and goes down the wrong branch,
> and speculatively does the store when it shouldn't have, we don't
> care: we know that such a speculative store can not possibly become
> semantically visible (*) to other threads.
> 
> For all the same reasons, I don't see how a compiler can violate
> causal ordering of the code (assuming, again, that the test is
> _meaningful_ - if we write nonsensical code, that's a different
> issue).

I am probably missing your point, but something like this:

	if (READ_ONCE(x))
		y = 42;
	else
		y = 1729;

Can in theory be transformed into something like this:

	y = 1729;
	if (READ_ONCE(x))
		y = 42;

The usual way to prevent it is to use WRITE_ONCE().

Fortunately, register sets are large, and gcc manages to do a single
store and use only %eax.

							Thanx, Paul

> If we have compilers that create speculative stores that are visible
> to other threads, we need to fix them.
> 
>                Linus
> 
> (*) By "semantically visible" I intend to avoid the whole timing/cache
> pattern kind of non-semantic visibility that is all about the spectre
> leakage kind of things.

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 20:56                         ` Paul E. McKenney
@ 2021-06-04 21:27                           ` Linus Torvalds
  2021-06-04 21:40                             ` Paul E. McKenney
  2021-06-04 22:05                             ` Peter Zijlstra
  0 siblings, 2 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-04 21:27 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Alan Stern, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 4, 2021 at 1:56 PM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> The usual way to prevent it is to use WRITE_ONCE().

The very *documentation example* for "volatile_if()" uses that WRITE_ONCE().

IOW, the patch that started this discussion has this comment in it:

+/**
+ * volatile_if() - Provide a control-dependency
+ *
+ * volatile_if(READ_ONCE(A))
+ *     WRITE_ONCE(B, 1);
+ *
+ * will ensure that the STORE to B happens after the LOAD of A.

and my point is that I don't see *ANY THEORETICALLY POSSIBLE* way that
that "volatile_if()" could not be just a perfectly regular "if ()".

Can you?

Because we *literally* depend on the fundamental concept of causality
to make the hardware not re-order those operations.

That is the WHOLE AND ONLY point of this whole construct: we're
avoiding a possibly expensive hardware barrier operation, because we
know we have a more fundamental barrier that is INHERENT TO THE
OPERATION.

And I cannot for the life of me see how a compiler can break that
fundamental concept of causality either.

Seriously. Tell me how a compiler could _possibly_ turn that into
something that breaks the fundamental causal relationship. The same
fundamental causal relationship that is the whole and only reason we
don't need a memory barrier for the hardware.

And no, there is not a way in hell that the above can be written with
some kind of semantically visible speculative store without the
compiler being a total pile of garbage that wouldn't be usable for
compiling a kernel with.

If your argument is that the compiler can magically insert speculative
stores that can then be overwritten later, then MY argument is that
such a compiler could do that for *ANYTHING*. "volatile_if()" wouldn't
save us.

If that's valid compiler behavior in your opinion, then we have
exactly two options:

 (a) give up

 (b) not use that broken garbage of a compiler.

So I can certainly accept the patch with the simpler implementation of
"volatile_if()", but dammit, I want to see an actual real example
arguing for why it would be relevant and why the compiler would need
our help.

Because the EXACT VERY EXAMPLE that was in the patch as-is sure as
hell is no such thing.

If the intent is to *document* that "this conditional is part of a
load-conditional-store memory ordering pattern", then that is one
thing. But if that's the intent, then we might as well just write it
as

    #define volatile_if(x) if (x)

and add a *comment* about why this kind of sequence doesn't need a
memory barrier.

I'd much rather have that kind of documentation, than have barriers
that are magical for theoretical compiler issues that aren't real, and
don't have any grounding in reality.

Without a real and valid example of how this could matter, this is
just voodoo programming.

We don't actually need to walk three times widdershins around the
computer before compiling the kernel. That's not how kernel development
works.

And we don't need to add a "volatile_if()" with magical barriers that
have no possibility of having real semantic meaning.

So I want to know what the semantic meaning of volatile_if() would be,
and why it fixes anything that a plain "if()" wouldn't. I want to see
the sequence where that "volatile_if()" actually *fixes* something.

              Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 21:27                           ` Linus Torvalds
@ 2021-06-04 21:40                             ` Paul E. McKenney
  2021-06-04 22:19                               ` Linus Torvalds
  2021-06-04 22:05                             ` Peter Zijlstra
  1 sibling, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-04 21:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 02:27:49PM -0700, Linus Torvalds wrote:
> On Fri, Jun 4, 2021 at 1:56 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > The usual way to prevent it is to use WRITE_ONCE().
> 
> The very *documentation example* for "volatile_if()" uses that WRITE_ONCE().

Whew!  ;-)

> IOW, the patch that started this discussion has this comment in it:
> 
> +/**
> + * volatile_if() - Provide a control-dependency
> + *
> + * volatile_if(READ_ONCE(A))
> + *     WRITE_ONCE(B, 1);
> + *
> + * will ensure that the STORE to B happens after the LOAD of A.
> 
> and my point is that I don't see *ANY THEORETICALLY POSSIBLE* way that
> that "volatile_if()" could not be just a perfectly regular "if ()".
> 
> Can you?

I cannot, maybe due to failure of imagination.  But please see below.

> Because we *literally* depend on the fundamental concept of causality
> to make the hardware not re-order those operations.
> 
> That is the WHOLE AND ONLY point of this whole construct: we're
> avoiding a possibly expensive hardware barrier operation, because we
> know we have a more fundamental barrier that is INHERENT TO THE
> OPERATION.
> 
> And I cannot for the life of me see how a compiler can break that
> fundamental concept of causality either.
> 
> Seriously. Tell me how a compiler could _possibly_ turn that into
> something that breaks the fundamental causal relationship. The same
> fundamental causal relationship that is the whole and only reason we
> don't need a memory barrier for the hardware.
> 
> And no, there is not a way in hell that the above can be written with
> some kind of semantically visible speculative store without the
> compiler being a total pile of garbage that wouldn't be usable for
> compiling a kernel with.
> 
> If your argument is that the compiler can magically insert speculative
> stores that can then be overwritten later, then MY argument is that
> such a compiler could do that for *ANYTHING*. "volatile_if()" wouldn't
> save us.
> 
> If that's valid compiler behavior in your opinion, then we have
> exactly two options:
> 
>  (a) give up
> 
>  (b) not use that broken garbage of a compiler.
> 
> So I can certainly accept the patch with the simpler implementation of
> "volatile_if()", but dammit, I want to see an actual real example
> arguing for why it would be relevant and why the compiler would need
> our help.
> 
> Because the EXACT VERY EXAMPLE that was in the patch as-is sure as
> hell is no such thing.
> 
> If the intent is to *document* that "this conditional is part of a
> load-conditional-store memory ordering pattern", then that is one
> thing. But if that's the intent, then we might as well just write it
> as
> 
>     #define volatile_if(x) if (x)
> 
> and add a *comment* about why this kind of sequence doesn't need a
> memory barrier.
> 
> I'd much rather have that kind of documentation, than have barriers
> that are magical for theoretical compiler issues that aren't real, and
> don't have any grounding in reality.
> 
> Without a real and valid example of how this could matter, this is
> just voodoo programming.
> 
> We don't actually need to walk three times widdershins around the
> computer before compiling the kernel. That's not how kernel development
> works.
> 
> And we don't need to add a "volatile_if()" with magical barriers that
> have no possibility of having real semantic meaning.
> 
> So I want to know what the semantic meaning of volatile_if() would be,
> and why it fixes anything that a plain "if()" wouldn't. I want to see
> the sequence where that "volatile_if()" actually *fixes* something.

Here is one use case:

	volatile_if(READ_ONCE(A)) {
		WRITE_ONCE(B, 1);
		do_something();
	} else {
		WRITE_ONCE(B, 1);
		do_something_else();
	}

With plain "if", the compiler is within its rights to do this:

	tmp = READ_ONCE(A);
	WRITE_ONCE(B, 1);
	if (tmp)
		do_something();
	else
		do_something_else();

On x86, still no problem.  But weaker hardware could now reorder the
store to B before the load from A.  With volatile_if(), this reordering
would be prevented.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 21:27                           ` Linus Torvalds
  2021-06-04 21:40                             ` Paul E. McKenney
@ 2021-06-04 22:05                             ` Peter Zijlstra
  1 sibling, 0 replies; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-04 22:05 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul E. McKenney, Alan Stern, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 02:27:49PM -0700, Linus Torvalds wrote:
> On Fri, Jun 4, 2021 at 1:56 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > The usual way to prevent it is to use WRITE_ONCE().
> 
> The very *documentation example* for "volatile_if()" uses that WRITE_ONCE().
> 
> IOW, the patch that started this discussion has this comment in it:
> 
> +/**
> + * volatile_if() - Provide a control-dependency
> + *
> + * volatile_if(READ_ONCE(A))
> + *     WRITE_ONCE(B, 1);
> + *
> + * will ensure that the STORE to B happens after the LOAD of A.

We do actually have uses that use a 'regular' store, and not a
WRITE_ONCE(). And I think for those the added barrier() might make a
difference.

At the very least the perf ring-buffer case uses memcpy().

On my part I'm deeply distrusting some of the C language committee
proposals I've seen regarding this stuff, and I'm maybe worrying too
much, but I'd rather not have to debug anything like this when they do
manage to make it go bad.

On top of that, I think having the construct is good for documenting
intent and possibly some of the concurrency analyzers can make use of
it.

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 21:40                             ` Paul E. McKenney
@ 2021-06-04 22:19                               ` Linus Torvalds
  2021-06-05 14:57                                 ` Alan Stern
  0 siblings, 1 reply; 127+ messages in thread
From: Linus Torvalds @ 2021-06-04 22:19 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Alan Stern, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 4, 2021 at 2:40 PM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> Here is one use case:
>
>         volatile_if(READ_ONCE(A)) {
>                 WRITE_ONCE(B, 1);
>                 do_something();
>         } else {
>                 WRITE_ONCE(B, 1);
>                 do_something_else();
>         }
>
> With plain "if", the compiler is within its rights to do this:
>
>         tmp = READ_ONCE(A);
>         WRITE_ONCE(B, 1);
>         if (tmp)
>                 do_something();
>         else
>                 do_something_else();
>
> On x86, still no problem.  But weaker hardware could now reorder the
> store to B before the load from A.  With volatile_if(), this reordering
> would be prevented.

But *should* it be prevented? For code like the above?

I'm not really seeing that the above is a valid code sequence.

Sure, that "WRITE_ONCE(B, 1)" could be seen as a lock release, and
then it would be wrong to have the read of 'A' happen after the lock
has actually been released. But if that's the case, then it should
have used a smp_store_release() in the first place, not a
WRITE_ONCE().

So I don't see the above as much of a valid example of actual
READ/WRITE_ONCE() use.

If people use READ/WRITE_ONCE() like the above, and they actually
depend on that kind of ordering, I think that code is likely wrong to
begin with. Using "volatile_if()" doesn't make it more valid.

Now, part of this is that I do think that in *general* we should never
use this very subtle load-cond-store pattern to begin with. We should
strive to use more smp_load_acquire() and smp_store_release() if we
care about ordering of accesses. They are typically cheap enough, and
if there's much of an ordering issue, they are the right things to do.

I think the whole "load-to-store ordering" subtle non-ordered case is
for very very special cases, when you literally don't have a general
memory ordering, you just have an ordering for *one* very particular
access. Like some of the very magical code in the rw-semaphore case,
or that smp_cond_load_acquire().

IOW, I would expect that we have a handful of uses of this thing. And
none of them have that "the conditional store is the same on both
sides" pattern, afaik.

And immediately when the conditional store is different, you end up
having a dependency on it that orders it.

But I guess I can accept the above made-up example as an "argument",
even though I feel it is entirely irrelevant to the actual issues and
uses we have.

               Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 19:09                     ` Linus Torvalds
  2021-06-04 19:18                       ` Linus Torvalds
@ 2021-06-05  3:14                       ` Alan Stern
  2021-06-05 16:24                         ` Linus Torvalds
  1 sibling, 1 reply; 127+ messages in thread
From: Alan Stern @ 2021-06-05  3:14 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Will Deacon, Paul E. McKenney, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 12:09:26PM -0700, Linus Torvalds wrote:
> Side note: it is worth noting that my version of "volatile_if()" has
> an added little quirk: it _ONLY_ orders the stuff inside the
> if-statement.
> 
> I do think it's worth not adding new special cases (especially that
> "asm goto" hack that will generate worse code than the compiler could
> do), but it means that
> 
>     x = READ_ONCE(ptr);
>     volatile_if (x > 0)
>         WRITE_ONCE(*z, 42);
> 
> has an ordering, but if you write it as
> 
>     x = READ_ONCE(ptr);
>     volatile_if (x <= 0)
>         return;
>     WRITE_ONCE(*z, 42);
> 
> then I could in theory see the compiler doing that WRITE_ONCE() as
> some kind of non-control dependency.

This may be a minor point, but can that loophole be closed as follows?

#define volatile_if(x) \
	if ((({ _Bool __x = (x); BUILD_BUG_ON(__builtin_constant_p(__x)); __x; }) && \
		({ barrier(); 1; })) || ({ barrier(); 0; }))

(It's now a little later at night than when I usually think about this 
sort of thing, so my brain isn't firing on all its cylinders.  Forgive 
me if this is a dumb question.)

Alan


^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 22:19                               ` Linus Torvalds
@ 2021-06-05 14:57                                 ` Alan Stern
  2021-06-06  0:14                                   ` Paul E. McKenney
  0 siblings, 1 reply; 127+ messages in thread
From: Alan Stern @ 2021-06-05 14:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul E. McKenney, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 03:19:11PM -0700, Linus Torvalds wrote:
> Now, part of this is that I do think that in *general* we should never
> use this very subtle load-cond-store pattern to begin with. We should
> strive to use more smp_load_acquire() and smp_store_release() if we
> care about ordering of accesses. They are typically cheap enough, and
> if there's much of an ordering issue, they are the right things to do.
> 
> I think the whole "load-to-store ordering" subtle non-ordered case is
> for very very special cases, when you literally don't have a general
> memory ordering, you just have an ordering for *one* very particular
> access. Like some of the very magical code in the rw-semaphore case,
> or that smp_cond_load_acquire().
> 
> IOW, I would expect that we have a handful of uses of this thing. And
> none of them have that "the conditional store is the same on both
> sides" pattern, afaik.
> 
> And immediately when the conditional store is different, you end up
> having a dependency on it that orders it.
> 
> But I guess I can accept the above made-up example as an "argument",
> even though I feel it is entirely irrelevant to the actual issues and
> uses we have.

Indeed, the expansion of the currently proposed version of

	volatile_if (A) {
		B;
	} else {
		C;
	}

is basically the same as

	if (A) {
		barrier();
		B;
	} else {
		barrier();
		C;
	}

which is just about as easy to write by hand.  (For some reason my 
fingers don't like typing "volatile_"; the letters tend to get 
scrambled.)

So given that:

	1. Reliance on control dependencies is uncommon in the kernel,
	   and

	2. The loads in A could just be replaced with load_acquires
	   at a low penalty (or store-releases could go into B and C),

it seems that we may not need volatile_if at all!  The only real reason 
for having it in the first place was to avoid the penalty of 
load-acquire on architectures where it has a significant cost, when the 
control dependency would provide the necessary ordering for free.  Such 
architectures are getting less and less common.

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-05  3:14                       ` Alan Stern
@ 2021-06-05 16:24                         ` Linus Torvalds
  0 siblings, 0 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-05 16:24 UTC (permalink / raw)
  To: Alan Stern
  Cc: Peter Zijlstra, Will Deacon, Paul E. McKenney, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Fri, Jun 4, 2021 at 8:14 PM Alan Stern <stern@rowland.harvard.edu> wrote:
>
> >
> > then I could in theory see the compiler doing that WRITE_ONCE() as
> > some kind of non-control dependency.
>
> This may be a minor point, but can that loophole be closed as follows?

Note that it's actually entirely sufficient to have the barrier just
on one side.

I brought it up mainly as an oddity, and that it can result in the
compiler generating different code for the two different directions.

The reason that it is sufficient is that with the barrier in place (on
either side), the compiler really can't do much. It can't join either
of the sides, because it has to do that barrier on one side before any
common code.

In fact, even if the compiler decides to first do a conditional call
just around the barrier, and then do any common code (and then do
_another_ conditional branch), it still did that conditional branch
first, and the problem is solved. The CPU doesn't care, it will have
to resolve the branch before any subsequent stores are finalized.

Of course, if the compiler creates a conditional call just around the
barrier, and the barrier is empty (like we do now), and the compiler
leaves no mark of it in the result (like it does seem to do for empty
asm statements), I could imagine some optimizing assembler (or linker)
screwing things up for us, and saying "a conditional branch to the
next instruction can just be removed").

At that point, we've lost again, and it's a toolchain issue. I don't
think that issue can currently happen, but it's an example of yet
another really subtle problem that *could* happen even if *we* do
everything right.

I also do not believe that any of our code that has this pattern would
have that situation where the compiler would generate a branch over
just the barrier. It's kind of similar to Paul's example in that
sense. When we use volatile_if(), the two sides are very very
different entirely regardless of the barrier, so in practice I think
this is all entirely moot.

              Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-05 14:57                                 ` Alan Stern
@ 2021-06-06  0:14                                   ` Paul E. McKenney
  2021-06-06  1:29                                     ` Alan Stern
  0 siblings, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-06  0:14 UTC (permalink / raw)
  To: Alan Stern
  Cc: Linus Torvalds, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Sat, Jun 05, 2021 at 10:57:39AM -0400, Alan Stern wrote:
> On Fri, Jun 04, 2021 at 03:19:11PM -0700, Linus Torvalds wrote:
> > Now, part of this is that I do think that in *general* we should never
> > use this very subtle load-cond-store pattern to begin with. We should
> > strive to use more smp_load_acquire() and smp_store_release() if we
> > care about ordering of accesses. They are typically cheap enough, and
> > if there's much of an ordering issue, they are the right things to do.
> > 
> > I think the whole "load-to-store ordering" subtle non-ordered case is
> > for very very special cases, when you literally don't have a general
> > memory ordering, you just have an ordering for *one* very particular
> > access. Like some of the very magical code in the rw-semaphore case,
> > or that smp_cond_load_acquire().
> > 
> > IOW, I would expect that we have a handful of uses of this thing. And
> > none of them have that "the conditional store is the same on both
> > sides" pattern, afaik.
> > 
> > And immediately when the conditional store is different, you end up
> > having a dependency on it that orders it.
> > 
> > But I guess I can accept the above made-up example as an "argument",
> > even though I feel it is entirely irrelevant to the actual issues and
> > uses we have.
> 
> Indeed, the expansion of the currently proposed version of
> 
> 	volatile_if (A) {
> 		B;
> 	} else {
> 		C;
> 	}
> 
> is basically the same as
> 
> 	if (A) {
> 		barrier();
> 		B;
> 	} else {
> 		barrier();
> 		C;
> 	}
> 
> which is just about as easy to write by hand.  (For some reason my 
> fingers don't like typing "volatile_"; the letters tend to get 
> scrambled.)
> 
> So given that:
> 
> 	1. Reliance on control dependencies is uncommon in the kernel,
> 	   and
> 
> 	2. The loads in A could just be replaced with load_acquires
> 	   at a low penalty (or store-releases could go into B and C),
> 
> it seems that we may not need volatile_if at all!  The only real reason 
> for having it in the first place was to avoid the penalty of 
> load-acquire on architectures where it has a significant cost, when the 
> control dependency would provide the necessary ordering for free.  Such 
> architectures are getting less and less common.

That does sound good, but...

Current compilers beg to differ at -O2: https://godbolt.org/z/5K55Gardn

------------------------------------------------------------------------
#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, val) (READ_ONCE(x) = (val))
#define barrier() __asm__ __volatile__("": : :"memory")

int x, y;

int main(int argc, char *argv[])
{
    if (READ_ONCE(x)) {
        barrier();
        WRITE_ONCE(y, 1);
    } else {
        barrier();
        WRITE_ONCE(y, 1);
    }
    return 0;
}
------------------------------------------------------------------------

Both gcc and clang generate a load followed by a store, with no branch.
ARM gets the same results from both compilers.

As Linus suggested, removing one (but not both!) invocations of barrier()
does cause a branch to be emitted, so maybe that is a way forward.
Assuming it is more than just dumb luck, anyway.  :-/

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06  0:14                                   ` Paul E. McKenney
@ 2021-06-06  1:29                                     ` Alan Stern
  2021-06-06  3:41                                       ` Linus Torvalds
  2021-06-06 11:53                                       ` Segher Boessenkool
  0 siblings, 2 replies; 127+ messages in thread
From: Alan Stern @ 2021-06-06  1:29 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Linus Torvalds, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Sat, Jun 05, 2021 at 05:14:18PM -0700, Paul E. McKenney wrote:
> On Sat, Jun 05, 2021 at 10:57:39AM -0400, Alan Stern wrote:
> > Indeed, the expansion of the currently proposed version of
> > 
> > 	volatile_if (A) {
> > 		B;
> > 	} else {
> > 		C;
> > 	}
> > 
> > is basically the same as
> > 
> > 	if (A) {
> > 		barrier();
> > 		B;
> > 	} else {
> > 		barrier();
> > 		C;
> > 	}

> That does sound good, but...
> 
> Current compilers beg to differ at -O2: https://godbolt.org/z/5K55Gardn
> 
> ------------------------------------------------------------------------
> #define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
> #define WRITE_ONCE(x, val) (READ_ONCE(x) = (val))
> #define barrier() __asm__ __volatile__("": : :"memory")
> 
> int x, y;
> 
> int main(int argc, char *argv[])
> {
>     if (READ_ONCE(x)) {
>         barrier();
>         WRITE_ONCE(y, 1);
>     } else {
>         barrier();
>         WRITE_ONCE(y, 1);
>     }
>     return 0;
> }
> ------------------------------------------------------------------------
> 
> Both gcc and clang generate a load followed by a store, with no branch.
> ARM gets the same results from both compilers.
> 
> As Linus suggested, removing one (but not both!) invocations of barrier()
> does cause a branch to be emitted, so maybe that is a way forward.
> Assuming it is more than just dumb luck, anyway.  :-/

Interesting.  And changing one of the branches from barrier() to __asm__ 
__volatile__("nop": : :"memory") also causes a branch to be emitted.  So 
even though the compiler doesn't "look inside" assembly code, it does 
compare two pieces at least textually and apparently assumes if they are 
identical then they do the same thing.

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06  1:29                                     ` Alan Stern
@ 2021-06-06  3:41                                       ` Linus Torvalds
  2021-06-06  4:43                                         ` Paul E. McKenney
                                                           ` (3 more replies)
  2021-06-06 11:53                                       ` Segher Boessenkool
  1 sibling, 4 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-06  3:41 UTC (permalink / raw)
  To: Alan Stern, Segher Boessenkool
  Cc: Paul E. McKenney, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Sat, Jun 5, 2021 at 6:29 PM Alan Stern <stern@rowland.harvard.edu> wrote:
>
> Interesting.  And changing one of the branches from barrier() to __asm__
> __volatile__("nop": : :"memory") also causes a branch to be emitted.  So
> even though the compiler doesn't "look inside" assembly code, it does
> compare two pieces at least textually and apparently assumes if they are
> identical then they do the same thing.

That's actually a feature in some cases, ie the ability to do CSE on
asm statements (ie the "always has the same output" optimization that
the docs talk about).

So gcc has always looked at the asm string for that reason, afaik.

I think it's something of a bug when it comes to "asm volatile", but
the documentation isn't exactly super-specific.

There is a statement of "Under certain circumstances, GCC may
duplicate (or remove duplicates of) your assembly code when
optimizing" and a suggestion of using "%=" to generate a unique
instance of an asm.

Which might actually be a good idea for "barrier()", just in case.
However, the problem with that is that I don't think we are guaranteed
to have a universal comment character for asm statements.

IOW, it might be a good idea to do something like

   #define barrier() \
        __asm__ __volatile__("# barrier %=": : :"memory")

but I'm not 100% convinced that '#' is always a comment in asm code,
so the above might not actually build everywhere.

However, *testing* the above (in my config, where '#' does work as a
comment character) shows that gcc doesn't actually consider them to be
distinct EVEN THEN, and will still merge two barrier statements.

That's distressing.

So the gcc docs are actively wrong, and %= does nothing - it will
still compare as the exact same inline asm, because the string
equality testing is apparently done before any expansion.

Something like this *does* seem to work:

   #define ____barrier(id) __asm__ __volatile__("#" #id: : :"memory")
   #define __barrier(id) ____barrier(id)
   #define barrier() __barrier(__COUNTER__)

which is "interesting" or "disgusting" depending on how you happen to feel.

And again - the above works only as long as "#" is a valid comment
character in the assembler. And I have this very dim memory of us
having comments in inline asm, and it breaking certain configurations
(for when the assembler that the compiler uses is a special
human-unfriendly one that only accepts compiler output).

You could make even more disgusting hacks, and have it generate something like

    .pushsection .discard.barrier
    .long #id
    .popsection

instead of a comment. We already expect that to work and have generic
inline asm cases that generate code like that.

              Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06  3:41                                       ` Linus Torvalds
@ 2021-06-06  4:43                                         ` Paul E. McKenney
  2021-06-06 13:17                                           ` Segher Boessenkool
  2021-06-06 12:59                                         ` Segher Boessenkool
                                                           ` (2 subsequent siblings)
  3 siblings, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-06  4:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Segher Boessenkool, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sat, Jun 05, 2021 at 08:41:00PM -0700, Linus Torvalds wrote:
> On Sat, Jun 5, 2021 at 6:29 PM Alan Stern <stern@rowland.harvard.edu> wrote:
> >
> > Interesting.  And changing one of the branches from barrier() to __asm__
> > __volatile__("nop": : :"memory") also causes a branch to be emitted.  So
> > even though the compiler doesn't "look inside" assembly code, it does
> > compare two pieces at least textually and apparently assumes if they are
> > identical then they do the same thing.
> 
> That's actually a feature in some cases, ie the ability to do CSE on
> asm statements (ie the "always has the same output" optimization that
> the docs talk about).

Agreed, albeit reluctantly.  ;-)

> So gcc has always looked at the asm string for that reason, afaik.
> 
> I think it's something of a bug when it comes to "asm volatile", but
> the documentation isn't exactly super-specific.
> 
> There is a statement of "Under certain circumstances, GCC may
> duplicate (or remove duplicates of) your assembly code when
> optimizing" and a suggestion of using "%=" to generate a unique
> instance of an asm.

So gcc might some day note a do-nothing asm and duplicate it for
the sole purpose of collapsing the "then" and "else" clauses.  I
guess I need to keep my paranoia for the time being, then.  :-/

> Which might actually be a good idea for "barrier()", just in case.
> However, the problem with that is that I don't think we are guaranteed
> to have a universal comment character for asm statements.
> 
> IOW, it might be a good idea to do something like
> 
>    #define barrier() \
>         __asm__ __volatile__("# barrier %=": : :"memory")
> 
> but I'm not 100% convinced that '#' is always a comment in asm code,
> so the above might not actually build everywhere.
> 
> However, *testing* the above (in my config, where '#' does work as a
> comment character) shows that gcc doesn't actually consider them to be
> distinct EVEN THEN, and will still merge two barrier statements.
> 
> That's distressing.

If I keep the old definition of barrier() and make a barrier1() as
you defined above:

#define barrier1() __asm__ __volatile__("# barrier %=": : :"memory")

Then putting barrier() in the "then" clause and barrier1() in the
"else" clause works, though clang 12 for whatever reason generates
an extra jump in that case.  https://godbolt.org/z/YhbcsxsxG

Increasing the optimization level gets rid of the extra jump.

Of course, there is no guarantee that gcc won't learn about
assembler constants.  :-/

> So the gcc docs are actively wrong, and %= does nothing - it will
> still compare as the exact same inline asm, because the string
> equality testing is apparently done before any expansion.
> 
> Something like this *does* seem to work:
> 
>    #define ____barrier(id) __asm__ __volatile__("#" #id: : :"memory")
>    #define __barrier(id) ____barrier(id)
>    #define barrier() __barrier(__COUNTER__)
> 
> which is "interesting" or "disgusting" depending on how you happen to feel.
> 
> And again - the above works only as long as "#" is a valid comment
> character in the assembler. And I have this very dim memory of us
> having comments in inline asm, and it breaking certain configurations
> (for when the assembler that the compiler uses is a special
> human-unfriendly one that only accepts compiler output).
> 
> You could make even more disgusting hacks, and have it generate something like
> 
>     .pushsection .discard.barrier
>     .long #id
>     .popsection
> 
> instead of a comment. We already expect that to work and have generic
> inline asm cases that generate code like that.

And that does the trick as well, at least with recent gcc and clang.
https://godbolt.org/z/P8zPv9f9o

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 20:40             ` Paul E. McKenney
@ 2021-06-06 11:36               ` Segher Boessenkool
  2021-06-06 19:01                 ` Paul E. McKenney
  0 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 11:36 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Linus Torvalds, will, stern, parri.andrea,
	boqun.feng, npiggin, dhowells, j.alglave, luc.maranget, akiyks,
	linux-kernel, linux-toolchains, linux-arch

On Fri, Jun 04, 2021 at 01:40:42PM -0700, Paul E. McKenney wrote:
> On Fri, Jun 04, 2021 at 02:53:01PM -0500, Segher Boessenkool wrote:
> > On Fri, Jun 04, 2021 at 11:55:26AM -0700, Paul E. McKenney wrote:
> > > On Fri, Jun 04, 2021 at 11:40:47AM -0500, Segher Boessenkool wrote:
> > > > My point is that you ask compiler developers to paint themselves into a
> > > > corner if you ask them to change such fundamental C syntax.
> > > 
> > > Once we have some experience with a language extension, the official
> > > syntax for a standardized version of that extension can be bikeshedded.
> > > Committees being what they are, what we use in the meantime will
> > > definitely not be what is chosen, so there is not a whole lot of point
> > > in worrying about the exact syntax in the meantime.  ;-)
> > 
> > I am only saying that it is unlikely any compiler that is used in
> > production will want to experiment with "volatile if".
> 
> That unfortunately matches my experience over quite a few years.  But if
> something can be implemented using existing extensions, the conversations
> often get easier.  Especially given many more people are now familiar
> with concurrency.

This was about the syntax "volatile if", not about the concept, let's
call that "volatile_if".  And no, it was not me who brought this up :-)

> > > Which is exactly why these conversations are often difficult.  There is
> > > a tension between pushing the as-if rule as far as possible within the
> > > compiler on the one hand and allowing developers to write code that does
> > > what is needed on the other.  ;-)
> > 
> > There is a tension between what users expect from the compiler and what
> > actually is promised.  The compiler is not pushing the as-if rule any
> > further than it always has: it just becomes better at optimising over
> > time.  The as-if rule is and always has been absolute.
> 
> Heh!  The fact that the compiler has become better at optimizing
> over time is exactly what has been pushing the as-if rule further.
> 
> The underlying problem is that it is often impossible to write large
> applications (such as the Linux kernel) completely within the confines of
> the standard.  Thus, most large applications, and especially concurrent
> applications, are vulnerable to either the compiler becoming better
> at optimizing or compilers pushing the as-if rule, however you want to
> say it.

Oh definitely.  But there is nothing the compiler can do about most
cases of undefined behaviour: it cannot detect it, and there is no way
it *can* be handled sanely.  Take for example dereferencing a pointer
that does not point to an object.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06  1:29                                     ` Alan Stern
  2021-06-06  3:41                                       ` Linus Torvalds
@ 2021-06-06 11:53                                       ` Segher Boessenkool
  2021-06-06 13:45                                         ` Alan Stern
  2021-06-06 18:04                                         ` Linus Torvalds
  1 sibling, 2 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 11:53 UTC (permalink / raw)
  To: Alan Stern
  Cc: Paul E. McKenney, Linus Torvalds, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sat, Jun 05, 2021 at 09:29:03PM -0400, Alan Stern wrote:
> Interesting.  And changing one of the branches from barrier() to __asm__ 
> __volatile__("nop": : :"memory") also causes a branch to be emitted.  So 
> even though the compiler doesn't "look inside" assembly code, it does 
> compare two pieces at least textually and apparently assumes if they are 
> identical then they do the same thing.

And that is a simple fact, since the same assembler code (at the same
spot in the program) will do the same thing no matter how that ended up
there.

And the compiler always is allowed to duplicate, join, delete, you name
it, inline assembler code.  The only thing that it cares about is
semantics of the code, just like for any other code.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06  3:41                                       ` Linus Torvalds
  2021-06-06  4:43                                         ` Paul E. McKenney
@ 2021-06-06 12:59                                         ` Segher Boessenkool
  2021-06-06 13:47                                           ` Alan Stern
  2021-06-06 18:25                                           ` Linus Torvalds
  2021-06-06 18:41                                         ` Alan Stern
  2021-06-06 18:59                                         ` Jakub Jelinek
  3 siblings, 2 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 12:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sat, Jun 05, 2021 at 08:41:00PM -0700, Linus Torvalds wrote:
> On Sat, Jun 5, 2021 at 6:29 PM Alan Stern <stern@rowland.harvard.edu> wrote:
> > Interesting.  And changing one of the branches from barrier() to __asm__
> > __volatile__("nop": : :"memory") also causes a branch to be emitted.  So
> > even though the compiler doesn't "look inside" assembly code, it does
> > compare two pieces at least textually and apparently assumes if they are
> > identical then they do the same thing.
> 
> That's actually a feature in some cases, ie the ability to do CSE on
> asm statements (ie the "always has the same output" optimization that
> the docs talk about).
> 
> So gcc has always looked at the asm string for that reason, afaik.

GCC does not pretend it can understand the asm.  But it can see when
two asm statements are identical.

> I think it's something of a bug when it comes to "asm volatile", but
> the documentation isn't exactly super-specific.

Why would that be?  "asm volatile" does not prevent optimisation.  It
says this code has some unspecified side effect, and that is all!  All
the usual C rules cover everything needed: the same side effects have to
be executed in the same order on the real machine as they would on the
abstract machine.

> There is a statement of "Under certain circumstances, GCC may
> duplicate (or remove duplicates of) your assembly code when
> optimizing" and a suggestion of using "%=" to generate a unique
> instance of an asm.

"%=" outputs a number unique for every output instruction (the whole asm
is one instruction; these are GCC internal instructions, not the same
thing as machine instructions).  This will not help here.  The actual
thing the manual says is
  Under certain circumstances, GCC may duplicate (or remove duplicates
  of) your assembly code when optimizing.  This can lead to unexpected
  duplicate symbol errors during compilation if your 'asm' code defines
  symbols or labels.  Using '%=' may help resolve this problem.
It helps prevent duplicated symbols and labels.  It does not do much
else.

> Which might actually be a good idea for "barrier()", just in case.
> However, the problem with that is that I don't think we are guaranteed
> to have a universal comment character for asm statements.

That's right.  But ";#" works on most systems, you may be able to use
that?

> IOW, it might be a good idea to do something like
> 
>    #define barrier() \
>         __asm__ __volatile__("# barrier %=": : :"memory")
> 
> but I'm not 100% convinced that '#' is always a comment in asm code,
> so the above might not actually build everywhere.

Some assemblers use ";", some use "!", and there are more variations.

But this will not do what you want.  "%=" is output as a unique number
*after* everything GCC has done with the asm.

> However, *testing* the above (in my config, where '#' does work as a
> comment character) shows that gcc doesn't actually consider them to be
> distinct EVEN THEN, and will still merge two barrier statements.

Yes, the insns have the same templates, will output the exact same to
the generated assembler code, so are CSEd.

> So the gcc docs are actively wrong, and %= does nothing - it will
> still compare as the exact same inline asm, because the string
> equality testing is apparently done before any expansion.

They are not wrong.  Maybe the doc could be clearer though?  Patches
welcome.

> Something like this *does* seem to work:
> 
>    #define ____barrier(id) __asm__ __volatile__("#" #id: : :"memory")
>    #define __barrier(id) ____barrier(id)
>    #define barrier() __barrier(__COUNTER__)
> 
> which is "interesting" or "disgusting" depending on how you happen to feel.

__COUNTER__ is a preprocessor thing, much more like what you want here:
this does its work *before* everything the compiler does, while %= does
its thing *after* :-)

(Not that I actually understand what you are trying to do with this).


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06  4:43                                         ` Paul E. McKenney
@ 2021-06-06 13:17                                           ` Segher Boessenkool
  2021-06-06 19:07                                             ` Paul E. McKenney
  0 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 13:17 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Linus Torvalds, Alan Stern, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sat, Jun 05, 2021 at 09:43:33PM -0700, Paul E. McKenney wrote:
> So gcc might some day note a do-nothing asm and duplicate it for
> the sole purpose of collapsing the "then" and "else" clauses.  I
> guess I need to keep my paranoia for the time being, then.  :-/

Or a "do-something" asm, even.  What it does is make sure it is executed
on the real machine exactly like on the abstract machine.  That is how C
is defined, what a compiler *does*.

The programmer does not have any direct control over the generated code.

> Of course, there is no guarantee that gcc won't learn about
> assembler constants.  :-/

I am not sure what you call an "assembler constant" here.  But you can
be sure that GCC will not start doing anything here.  GCC does not try
to understand what you wrote in an inline asm, it just fills in the
operands and that is all.  It can do all the same things to it that it
can do to any other code of course: duplicate it, deduplicate it,
frobnicate it, etc.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 11:53                                       ` Segher Boessenkool
@ 2021-06-06 13:45                                         ` Alan Stern
  2021-06-06 18:04                                         ` Linus Torvalds
  1 sibling, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-06-06 13:45 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Paul E. McKenney, Linus Torvalds, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 06:53:36AM -0500, Segher Boessenkool wrote:
> On Sat, Jun 05, 2021 at 09:29:03PM -0400, Alan Stern wrote:
> > Interesting.  And changing one of the branches from barrier() to __asm__ 
> > __volatile__("nop": : :"memory") also causes a branch to be emitted.  So 
> > even though the compiler doesn't "look inside" assembly code, it does 
> > compare two pieces at least textually and apparently assumes if they are 
> > identical then they do the same thing.
> 
> And that is a simple fact, since the same assembler code (at the same
> spot in the program) will do the same thing no matter how that ended up
> there.

Sure.  But the same assembler code at two different spots in the program 
might not do the same thing.  (Think of code that stores the current EIP 
register's value into a variable.)

So while de-duplicating such code may be allowed, it will give rise to 
observable results at execution time.

Alan

> And the compiler always is allowed to duplicate, join, delete, you name
> it, inline assembler code.  The only thing that it cares about is
> semantics of the code, just like for any other code.
> 
> 
> Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 12:59                                         ` Segher Boessenkool
@ 2021-06-06 13:47                                           ` Alan Stern
  2021-06-06 17:13                                             ` Segher Boessenkool
  2021-06-06 18:25                                           ` Linus Torvalds
  1 sibling, 1 reply; 127+ messages in thread
From: Alan Stern @ 2021-06-06 13:47 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Linus Torvalds, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 07:59:55AM -0500, Segher Boessenkool wrote:
> On Sat, Jun 05, 2021 at 08:41:00PM -0700, Linus Torvalds wrote:
> > On Sat, Jun 5, 2021 at 6:29 PM Alan Stern <stern@rowland.harvard.edu> wrote:
> > > Interesting.  And changing one of the branches from barrier() to __asm__
> > > __volatile__("nop": : :"memory") also causes a branch to be emitted.  So
> > > even though the compiler doesn't "look inside" assembly code, it does
> > > compare two pieces at least textually and apparently assumes if they are
> > > identical then they do the same thing.
> > 
> > That's actually a feature in some cases, ie the ability to do CSE on
> > asm statements (ie the "always has the same output" optimization that
> > the docs talk about).
> > 
> > So gcc has always looked at the asm string for that reason, afaik.
> 
> GCC does not pretend it can understand the asm.  But it can see when
> two asm statements are identical.

How similar do two asm strings have to be before they are considered 
identical?  For instance, do changes to the amount of leading or 
trailing whitespace matter?

Or what about including an empty assembly statement in one but not the 
other?

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 13:47                                           ` Alan Stern
@ 2021-06-06 17:13                                             ` Segher Boessenkool
  0 siblings, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 17:13 UTC (permalink / raw)
  To: Alan Stern
  Cc: Linus Torvalds, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 09:47:49AM -0400, Alan Stern wrote:
> > GCC does not pretend it can understand the asm.  But it can see when
> > two asm statements are identical.
> 
> How similar do two asm strings have to be before they are considered 
> identical?  For instance, do changes to the amount of leading or 
> trailing whitespace matter?

They have to be identical to be considered identical.

> Or what about including an empty assembly statement in one but not the 
> other?

GCC does not parse the assembler template.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 11:53                                       ` Segher Boessenkool
  2021-06-06 13:45                                         ` Alan Stern
@ 2021-06-06 18:04                                         ` Linus Torvalds
  2021-06-06 18:22                                           ` Alan Stern
  2021-06-06 18:40                                           ` Segher Boessenkool
  1 sibling, 2 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-06 18:04 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 6, 2021 at 4:56 AM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> And that is a simple fact, since the same assembler code (at the same
> spot in the program) will do the same thing no matter how that ended up
> there.

The thing is, that's exactly what gcc violates.

The example - you may not have been cc'd personally on that one - was
something like

    if (READ_ONCE(a)) {
        barrier();
        WRITE_ONCE(b,1);
   } else {
        barrier();
        WRITE_ONCE(b, 1);
    }

and currently because gcc thinks "same exact code", it will actually
optimize this to (pseudo-asm):

    LD A
    "empty asm"
    ST $1,B

which is very much NOT equivalent to

    LD A
    BEQ over
    "empty asm"
    ST $1,B
    JMP join

over:
    "empty asm"
    ST $1,B

join:

and that's the whole point of the barriers.

It's not equivalent exactly because of memory ordering. In the first
case, there is no ordering on weak architectures. In the second case,
there is always an ordering, because of CPU consistency guarantees.

And no, gcc doesn't understand about memory ordering. But that's
exactly why we use inline asms.

> And the compiler always is allowed to duplicate, join, delete, you name
> it, inline assembler code.  The only thing that it cares about is
> semantics of the code, just like for any other code.

See, but it VIOLATES the semantics of the code.

You can't join those two empty asm's (and then remove the branch),
because the semantics of the code really aren't the same any more if
you do. Truly.

              Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:04                                         ` Linus Torvalds
@ 2021-06-06 18:22                                           ` Alan Stern
  2021-06-06 18:43                                             ` Linus Torvalds
  2021-06-06 18:40                                           ` Segher Boessenkool
  1 sibling, 1 reply; 127+ messages in thread
From: Alan Stern @ 2021-06-06 18:22 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Segher Boessenkool, Paul E. McKenney, Peter Zijlstra,
	Will Deacon, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 11:04:49AM -0700, Linus Torvalds wrote:
> On Sun, Jun 6, 2021 at 4:56 AM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> >
> > And that is a simple fact, since the same assembler code (at the same
> > spot in the program) will do the same thing no matter how that ended up
> > there.
> 
> The thing is, that's exactly what gcc violates.
> 
> The example - you may not have been cc'd personally on that one - was
> something like
> 
>     if (READ_ONCE(a)) {
>         barrier();
>         WRITE_ONCE(b,1);
>    } else {
>         barrier();
>         WRITE_ONCE(b, 1);
>     }
> 
> and currently because gcc thinks "same exact code", it will actually
> optimize this to (pseudo-asm):
> 
>     LD A
>     "empty asm"
>     ST $1,B
> 
> which is very much NOT equivalent to
> 
>     LD A
>     BEQ over
>     "empty asm"
>     ST $1,B
>     JMP join
> 
> over:
>     "empty asm"
>     ST $1,B
> 
> join:
> 
> and that's the whole point of the barriers.
> 
> It's not equivalent exactly because of memory ordering. In the first
> case, there is no ordering on weak architectures. In the second case,
> there is always an ordering, because of CPU consistency guarantees.
> 
> And no, gcc doesn't understand about memory ordering. But that's
> exactly why we use inline asms.
> 
> > And the compiler always is allowed to duplicate, join, delete, you name
> > it, inline assembler code.  The only thing that it cares about is
> > semantics of the code, just like for any other code.
> 
> See, but it VIOLATES the semantics of the code.
> 
> You can't join those two empty asm's (and then remove the branch),
> because the semantics of the code really aren't the same any more if
> you do. Truly.

To be fair, the same argument applies even without the asm code.  The 
compiler will translate

     if (READ_ONCE(a))
         WRITE_ONCE(b, 1);
     else
         WRITE_ONCE(b, 1);

to

     LD A
     ST $1,B

instead of

     LD A
     BEQ over
     ST $1,B
     JMP join
 
 over:
     ST $1,B
 
 join:

And these two are different for the same memory ordering reasons as 
above.

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 12:59                                         ` Segher Boessenkool
  2021-06-06 13:47                                           ` Alan Stern
@ 2021-06-06 18:25                                           ` Linus Torvalds
  2021-06-06 19:19                                             ` Segher Boessenkool
  1 sibling, 1 reply; 127+ messages in thread
From: Linus Torvalds @ 2021-06-06 18:25 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 6, 2021 at 6:03 AM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> On Sat, Jun 05, 2021 at 08:41:00PM -0700, Linus Torvalds wrote:
> >
> > I think it's something of a bug when it comes to "asm volatile", but
> > the documentation isn't exactly super-specific.
>
> Why would that be?  "asm volatile" does not prevent optimisation.

Sure it does.

That's the whole and only *POINT* of the "volatile".

It's the same as a volatile memory access. That very much prevents
certain optimizations. You can't just join two volatile reads or
writes, because they have side effects.

And the exact same thing is true of inline asm. Even when they are
*identical*, inline asms have side effects that gcc simply doesn't
understand.

And yes, those side effects can - and do - include "you can't just merge these".

> It says this code has some unspecified side effect, and that is all!

And that should be sufficient. But gcc then violates it, because gcc
doesn't understand the side effects.

Now, the side effects may be *subtle*, but they are very very real.
Just placement of code wrt a branch will actually affect memory
ordering, as that one example was.

> > Something like this *does* seem to work:
> >
> >    #define ____barrier(id) __asm__ __volatile__("#" #id: : :"memory")
> >    #define __barrier(id) ____barrier(id)
> >    #define barrier() __barrier(__COUNTER__)
> >
> > which is "interesting" or "disgusting" depending on how you happen to feel.
>
> __COUNTER__ is a preprocessor thing, much more like what you want here:
> this does its work *before* everything the compiler does, while %= does
> its thing *after* :-)
>
> (Not that I actually understand what you are trying to do with this).

See my previous email for why two barriers in two different code
sequences cannot just be joined into one and moved into the common
parent. It actually is semantically meaningful *where* they are, and
they are distinct barriers.

The case we happen to care about is memory ordering issues. The
example quoted may sound pointless and insane, and I actually don't
believe we have real code that triggers the issue, because whenever we
have a conditional barrier, the two sides of the conditional are
generally so different that gcc would never merge any of it anyway.

So the issue is mostly theoretical, but we do have code that is fairly
critical, and that depends on memory ordering, and on some weakly
ordered machines (which is where all these problems would happen),
actual explicit memory barriers are also *much* too expensive.

End result: we have code that depends on the fact that a read-to-write
ordering exists if there is a data dependency or a control dependency
between the two. No actual expensive CPU instruction to specify the
ordering, because the ordering is implicit in the code flow itself.

But that's what we need a compiler barrier for in the first place -
the compiler certainly doesn't understand about this very subtle
memory ordering issue, and we want to make sure that the code sequence
*remains* that "if A then write B".

             Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:04                                         ` Linus Torvalds
  2021-06-06 18:22                                           ` Alan Stern
@ 2021-06-06 18:40                                           ` Segher Boessenkool
  2021-06-06 18:48                                             ` Linus Torvalds
  1 sibling, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 18:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 11:04:49AM -0700, Linus Torvalds wrote:
>     if (READ_ONCE(a)) {
>         barrier();
>         WRITE_ONCE(b,1);
>    } else {
>         barrier();
>         WRITE_ONCE(b, 1);
>     }
> 
> and currently because gcc thinks "same exact code", it will actually
> optimize this to (pseudo-asm):
> 
>     LD A
>     "empty asm"
>     ST $1,B
> 
> which is very much NOT equivalent to
> 
>     LD A
>     BEQ over
>     "empty asm"
>     ST $1,B
>     JMP join
> 
> over:
>     "empty asm"
>     ST $1,B
> 
> join:
> 
> and that's the whole point of the barriers.

You didn't use a barrier with these semantics though.  There is nothing
in that code that guarantees a branch.

> See, but it VIOLATES the semantics of the code.

The code violates your expectations of the code.

> You can't join those two empty asm's (and then remove the branch),
> because the semantics of the code really aren't the same any more if
> you do. Truly.

You truly should have written a branch in the asm if you truly wanted
a branch instruction.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06  3:41                                       ` Linus Torvalds
  2021-06-06  4:43                                         ` Paul E. McKenney
  2021-06-06 12:59                                         ` Segher Boessenkool
@ 2021-06-06 18:41                                         ` Alan Stern
  2021-06-06 18:59                                         ` Jakub Jelinek
  3 siblings, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-06-06 18:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Segher Boessenkool, Paul E. McKenney, Peter Zijlstra,
	Will Deacon, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sat, Jun 05, 2021 at 08:41:00PM -0700, Linus Torvalds wrote:
> On Sat, Jun 5, 2021 at 6:29 PM Alan Stern <stern@rowland.harvard.edu> wrote:
> >
> > Interesting.  And changing one of the branches from barrier() to __asm__
> > __volatile__("nop": : :"memory") also causes a branch to be emitted.  So
> > even though the compiler doesn't "look inside" assembly code, it does
> > compare two pieces at least textually and apparently assumes if they are
> > identical then they do the same thing.
> 
> That's actually a feature in some cases, ie the ability to do CSE on
> asm statements (ie the "always has the same output" optimization that
> the docs talk about).
> 
> So gcc has always looked at the asm string for that reason, afaik.
> 
> I think it's something of a bug when it comes to "asm volatile", but
> the documentation isn't exactly super-specific.
> 
> There is a statement of "Under certain circumstances, GCC may
> duplicate (or remove duplicates of) your assembly code when
> optimizing" and a suggestion of using "%=" to generate a unique
> instance of an asm.
> 
> Which might actually be a good idea for "barrier()", just in case.
> However, the problem with that is that I don't think we are guaranteed
> to have a universal comment character for asm statements.
> 
> IOW, it might be a good idea to do something like
> 
>    #define barrier() \
>         __asm__ __volatile__("# barrier %=": : :"memory")
> 
> but I'm  not 100% convinced that '#' is always a comment in asm code,
> so the above might not actually build everywhere.
> 
> However, *testing* the above (in my config, where '#' does work as a
> comment character) shows that gcc doesn't actually consider them to be
> distinct EVEN THEN, and will still merge two barrier statements.
> 
> That's distressing.
> 
> So the gcc docs are actively wrong, and %= does nothing - it will
> still compare as the exact same inline asm, because the string
> equality testing is apparently done before any expansion.
> 
> Something like this *does* seem to work:
> 
>    #define ____barrier(id) __asm__ __volatile__("#" #id: : :"memory")
>    #define __barrier(id) ____barrier(id)
>    #define barrier() __barrier(__COUNTER__)
> 
> which is "interesting" or "disgusting" depending on how you happen to feel.
> 
> And again - the above works only as long as "#" is a valid comment
> character in the assembler. And I have this very dim memory of us
> having comments in inline asm, and it breaking certain configurations
> (for when the assembler that the compiler uses is a special
> human-unfriendly one that only accepts compiler output).
> 
> You could make even more disgusting hacks, and have it generate something like
> 
>     .pushsection .discard.barrier
>     .long #id
>     .popsection
> 
> instead of a comment. We already expect that to work and have generic
> inline asm cases that generate code like that.

I tried the experiment with this code:

#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, val) (READ_ONCE(x) = (val))
#define barrier() __asm__ __volatile__("": : :"memory")

int x, y;

int main(int argc, char *argv[])
{
    if (READ_ONCE(x)) {
        barrier();
        y = 1;
    } else {
        y = 1;
    }
    return 0;
}

The output from gcc -O2 is:

main:
        mov     eax, DWORD PTR x[rip]
        test    eax, eax
        je      .L2
.L2:
        mov     DWORD PTR y[rip], 1

The output from clang is essentially the same (the mov and test are 
replaced by a cmp).

This does what we want, but I wouldn't bet against a future 
optimization pass getting rid of the "useless" test and branch.

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:22                                           ` Alan Stern
@ 2021-06-06 18:43                                             ` Linus Torvalds
  2021-06-07 10:43                                               ` Peter Zijlstra
  0 siblings, 1 reply; 127+ messages in thread
From: Linus Torvalds @ 2021-06-06 18:43 UTC (permalink / raw)
  To: Alan Stern
  Cc: Segher Boessenkool, Paul E. McKenney, Peter Zijlstra,
	Will Deacon, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 6, 2021 at 11:22 AM Alan Stern <stern@rowland.harvard.edu> wrote:
>
> To be fair, the same argument applies even without the asm code.  The
> compiler will translate

Yes, yes.

But that is literally why the asm exists in the first place.

It's supposed to be the barrier that makes sure that doesn't happen.

So your point that "but this would happen without the asm" is missing
the whole point. This is exactly the thing that the asm is supposed to
avoid.

And it actually works fine when just one side has the barrier, because
then no merging can take place, because there is nothing to merge.

That's why my suggested fix for "volatile_if()" was this #define

    #define barrier_true() ({ barrier(); 1; })
    #define volatile_if(x) if ((x) && barrier_true())

because now code like

    volatile_if (READ_ONCE(a))
        WRITE_ONCE(b, 1);
    else
        WRITE_ONCE(b, 1);

would force that branch. And it's actually fine to merge the
"WRITE(b,1)", as long as the branch exists, so the above can (and
does) compile to

    LD A
    BEQ over
    "empty asm"
over:
    ST $1,B

and the above is actually perfectly valid code and actually solves the
problem, even if it admittedly looks entirely insane.

With that crazy "conditional jump over nothing" the store to B is
ordered wrt the load from A on real machines.

And again: I do not believe we actually have this kind of code in the
kernel. I could imagine some CPU turning "conditional branch over
nothing" into a no-op internally, and losing the ordering. And that's
ok, exactly because the above kind of code that *only* does the
WRITE_ONCE() and nothing else is crazy and stupid.

So don't get hung up on the "branch over nothing", that's just for
this insane unreal example.

But I *could* see us having something where both branches do end up
writing to "B", and it might even be the first thing both branches end
up doing. Not the *only* thing they do, but "B" might be a flag for "I
am actively working on this issue", and I could see a situation where
we care that the read of "A" (which might be what specifies *what* the
issue is) would need to be ordered with regards to that "I'm working
on it" flag.

IOW, another CPU might want to know *what* somebody is working on, and do

    /* Is somebody working on this */
    if (READ_ONCE(B)) {
        smp_rmb();
        READ_ONCE(A); <- this is what they are working on

and the ordering requirement in this all is that B has to be written
after A has been read.

So while the example code is insane and pointless (and you shouldn't
read *too* much into it), conceptually the notion of that pattern of

    if (READ_ONCE(a)) {
        WRITE_ONCE(b,1);
        .. do something ..
    } else {
        WRITE_ONCE(b,1);
        .. do something else ..
    }

is not insane or entirely unrealistic - the WRITE_ONCE() might
basically be an ACK for "I have read the value of A and will act on
it".

Odd? Yes. Unusual? Yes. Do we do this now? No. But it does worry me
that we don't seem to have a good way to add that required barrier.

Adding it on one side is good, and works, but what if somebody then does

    volatile_if (READ_ONCE(a))
        WRITE_ONCE(b, 1);
    else {
        barrier();
        WRITE_ONCE(b, 1);
    }

and now we end up with it on both sides again, and then the second
barrier basically undoes the first one..

              Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:40                                           ` Segher Boessenkool
@ 2021-06-06 18:48                                             ` Linus Torvalds
  2021-06-06 18:53                                               ` Linus Torvalds
  2021-06-06 19:52                                               ` Segher Boessenkool
  0 siblings, 2 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-06 18:48 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 6, 2021 at 11:43 AM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> You truly should have written a branch in the asm if you truly wanted
> a branch instruction.

That's exactly what I don't want to do, and what the original patch by
PeterZ did.

Why?

Because then we need to write that stupid pointless branch for every
single architecture.

And to work well, it needs "asm goto", which is so recent that a lot
of compilers don't support it (thank God for clang dragging gcc
kicking and screaming to implement it at all - I'd asked for it over a
decade ago).

So you get bad code generation in a lot of cases, which entirely
obviates the _point_ of this all - which is that we can avoid an
expensive operation (a memory barrier) by just doing clever code
generation.

So if we can't get the clever code generation, it's all pretty much
moot, imnsho.

A working barrier "just fixes it".

I suspect the best we can do is to just work around the gcc badness
with that __COUNTER__ trick of mine. The lack of a reliable comment
character is the biggest issue with that trick.

                 Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:48                                             ` Linus Torvalds
@ 2021-06-06 18:53                                               ` Linus Torvalds
  2021-06-06 19:52                                               ` Segher Boessenkool
  1 sibling, 0 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-06 18:53 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 6, 2021 at 11:48 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> And to work well, it needs "asm goto", which is so recent that a lot
> of compilers don't support it (thank God for clang dragging gcc
> kicking and screaming to implement it at all - I'd asked for it over a
> decade ago).

Oh, actually, I'm wrong on this.

We don't need an output from the asm (the output ends up being in the
targets), so we can use the old-style asm goto that we've been relying
on for a long time.

So the main code generation problem is just (a) all the architectures
and (b) we'd have to use a fixed conditional against zero.

                Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06  3:41                                       ` Linus Torvalds
                                                           ` (2 preceding siblings ...)
  2021-06-06 18:41                                         ` Alan Stern
@ 2021-06-06 18:59                                         ` Jakub Jelinek
  2021-06-06 19:15                                           ` Paul E. McKenney
  2021-06-06 19:22                                           ` Linus Torvalds
  3 siblings, 2 replies; 127+ messages in thread
From: Jakub Jelinek @ 2021-06-06 18:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Segher Boessenkool, Paul E. McKenney, Peter Zijlstra,
	Will Deacon, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sat, Jun 05, 2021 at 08:41:00PM -0700, Linus Torvalds wrote:
> Something like this *does* seem to work:
> 
>    #define ____barrier(id) __asm__ __volatile__("#" #id: : :"memory")
>    #define __barrier(id) ____barrier(id)
>    #define barrier() __barrier(__COUNTER__)
> 
> which is "interesting" or "disgusting" depending on how you happen to feel.

I think just
#define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")
should be enough (or "X" instead of "i" if some arch uses -fpic and will not
accept small constants in PIC code), for CSE gcc compares that the asm template
string and all arguments are the same.

As for volatile, that is implicit on asm without any output operands and
it is about whether the inline asm can be DCEd, not whether it can be CSEd.

	Jakub


^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 11:36               ` Segher Boessenkool
@ 2021-06-06 19:01                 ` Paul E. McKenney
  0 siblings, 0 replies; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-06 19:01 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Peter Zijlstra, Linus Torvalds, will, stern, parri.andrea,
	boqun.feng, npiggin, dhowells, j.alglave, luc.maranget, akiyks,
	linux-kernel, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 06:36:51AM -0500, Segher Boessenkool wrote:
> On Fri, Jun 04, 2021 at 01:40:42PM -0700, Paul E. McKenney wrote:
> > On Fri, Jun 04, 2021 at 02:53:01PM -0500, Segher Boessenkool wrote:
> > > On Fri, Jun 04, 2021 at 11:55:26AM -0700, Paul E. McKenney wrote:
> > > > On Fri, Jun 04, 2021 at 11:40:47AM -0500, Segher Boessenkool wrote:
> > > > > My point is that you ask compiler developers to paint themselves into a
> > > > > corner if you ask them to change such fundamental C syntax.
> > > > 
> > > > Once we have some experience with a language extension, the official
> > > > syntax for a standardized version of that extension can be bikeshedded.
> > > > Committees being what they are, what we use in the meantime will
> > > > definitely not be what is chosen, so there is not a whole lot of point
> > > > in worrying about the exact syntax in the meantime.  ;-)
> > > 
> > > I am only saying that it is unlikely any compiler that is used in
> > > production will want to experiment with "volatile if".
> > 
> > That unfortunately matches my experience over quite a few years.  But if
> > something can be implemented using existing extensions, the conversations
> > often get easier.  Especially given many more people are now familiar
> > with concurrency.
> 
> This was about the syntax "volatile if", not about the concept, let's
> call that "volatile_if".  And no, it was not me who brought this up :-)

I agree that it is likely that the syntax "volatile if" would be at best
a very reluctantly acquired taste among most of the committee.  But some
might point to the evolving semantics of "auto" as a counter-example,
to say nothing of the celebrated spaceship operator.  Me, I am not
all that worried about the exact syntax.

> > > > Which is exactly why these conversations are often difficult.  There is
> > > > a tension between pushing the as-if rule as far as possible within the
> > > > compiler on the one hand and allowing developers to write code that does
> > > > what is needed on the other.  ;-)
> > > 
> > > There is a tension between what users expect from the compiler and what
> > > actually is promised.  The compiler is not pushing the as-if rule any
> > > further than it always has: it just becomes better at optimising over
> > > time.  The as-if rule is and always has been absolute.
> > 
> > Heh!  The fact that the compiler has become better at optimizing
> > over time is exactly what has been pushing the as-if rule further.
> > 
> > The underlying problem is that it is often impossible to write large
> > applications (such as the Linux kernel) completely within the confines of
> > the standard.  Thus, most large applications, and especially concurrent
> > applications, are vulnerable to either the compiler becoming better
> > at optimizing or compilers pushing the as-if rule, however you want to
> > say it.
> 
> Oh definitely.  But there is nothing the compiler can do about most
> cases of undefined behaviour: it cannot detect it, and there is no way
> it *can* be handled sanely.  Take for example dereferencing a pointer
> that does not point to an object.

Almost.

The compiler's use of provenance allows detection in some cases.
For a stupid example, please see https://godbolt.org/z/z9cWvqdhE.

Less stupidly, this sort of thing can be quite annoying to people trying
to use ABA-tolerant concurrent algorithms.  See for example P1726R4
[1] (update in progress) and for an even more controversial proposal,
P2188R1 [2].  The Lifo Singly Linked Push algorithm described beginning
on page 14 of [1] is a simple example of an ABA-tolerant algorithm that
was already in use when I first programmed a computer.  ;-)

							Thanx, Paul

[1]	http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1726r4.pdf
[2]	http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2188r1.html

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 13:17                                           ` Segher Boessenkool
@ 2021-06-06 19:07                                             ` Paul E. McKenney
  0 siblings, 0 replies; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-06 19:07 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Linus Torvalds, Alan Stern, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 08:17:40AM -0500, Segher Boessenkool wrote:
> On Sat, Jun 05, 2021 at 09:43:33PM -0700, Paul E. McKenney wrote:
> > So gcc might some day note a do-nothing asm and duplicate it for
> > the sole purpose of collapsing the "then" and "else" clauses.  I
> > guess I need to keep my paranoia for the time being, then.  :-/
> 
> Or a "do-something" asm, even.  What it does is make sure it is executed
> on the real machine exactly like on the abstract machine.  That is how C
> is defined, what a compiler *does*.
> 
> The programmer does not have any direct control over the generated code.

I am not looking for direct control, simply sufficient influence.  ;-)

> > Of course, there is no guarantee that gcc won't learn about
> > assembler constants.  :-/
> 
> I am not sure what you call an "assembler constant" here.  But you can
> be sure that GCC will not start doing anything here.  GCC does not try
> to understand what you wrote in an inline asm, it just fills in the
> operands and that is all.  It can do all the same things to it that it
> can do to any other code of course: duplicate it, deduplicate it,
> frobnicate it, etc.

Apologies, that "assembler constants" should have been "assembler
comments".

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:59                                         ` Jakub Jelinek
@ 2021-06-06 19:15                                           ` Paul E. McKenney
  2021-06-06 19:22                                           ` Linus Torvalds
  1 sibling, 0 replies; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-06 19:15 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Linus Torvalds, Alan Stern, Segher Boessenkool, Peter Zijlstra,
	Will Deacon, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 08:59:22PM +0200, Jakub Jelinek wrote:
> On Sat, Jun 05, 2021 at 08:41:00PM -0700, Linus Torvalds wrote:
> > Something like this *does* seem to work:
> > 
> >    #define ____barrier(id) __asm__ __volatile__("#" #id: : :"memory")
> >    #define __barrier(id) ____barrier(id)
> >    #define barrier() __barrier(__COUNTER__)
> > 
> > which is "interesting" or "disgusting" depending on how you happen to feel.
> 
> I think just
> #define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")
> should be enough (or "X" instead of "i" if some arch uses -fpic and will not
> accept small constants in PIC code), for CSE gcc compares that the asm template
> string and all arguments are the same.

This does seem to do the trick: https://godbolt.org/z/K5j3bYqGT

So thank you for that!

							Thanx, Paul

> As for volatile, that is implicit on asm without any output operands and
> it is about whether the inline asm can be DCEd, not whether it can be CSEd.
> 
> 	Jakub
> 

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:25                                           ` Linus Torvalds
@ 2021-06-06 19:19                                             ` Segher Boessenkool
  0 siblings, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 19:19 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 11:25:46AM -0700, Linus Torvalds wrote:
> On Sun, Jun 6, 2021 at 6:03 AM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> >
> > On Sat, Jun 05, 2021 at 08:41:00PM -0700, Linus Torvalds wrote:
> > >
> > > I think it's something of a bug when it comes to "asm volatile", but
> > > the documentation isn't exactly super-specific.
> >
> > Why would that be?  "asm volatile" does not prevent optimisation.
> 
> Sure it does.
> 
> That's the whole and only *POINT* of the "volatile".
> 
> > It's the same as a volatile memory access. That very much prevents
> certain optimizations. You can't just join two volatile reads or
> writes, because they have side effects.

You can though.  In exactly this same way:

volatile int x;
void g(int);
void f(int n) { if (n) g(x); else g(x); }

==>

f:
        movl    x(%rip), %edi
        jmp     g

You can do whatever you want with code with side effects.  The only
thing required is that the side effects are executed as often as before
and in the same order.  Merging identical sides of a diamond is just
fine.

> And the exact same thing is true of inline asm. Even when they are
> *identical*, inline asms have side effects that gcc simply doesn't
> understand.

Only volatile asm does (including all asm without outputs).  But that
still does not mean GCC cannot manipulate the asm!

> And yes, those side effects can - and do - include "you can't just merge these".

They do not.  That is not what a side effect is.

> > It says this code has some unspecified side effect, and that is all!
> 
> And that should be sufficient. But gcc then violates it, because gcc
> doesn't understand the side effects.
> 
> Now, the side effects may be *subtle*, but they are very very real.
> Just placement of code wrt a branch will actually affect memory
> ordering, as that one example was.

You have a different definition of "side effect" than C does apparently.

5.1.2.3/2:
  Accessing a volatile object, modifying an object, modifying a file, or
  calling a function that does any of those operations are all side
  effects, which are changes in the state of the execution environment.
  Evaluation of an expression in general includes both value
  computations and initiation of side effects.  Value computation for an
  lvalue expression includes determining the identity of the designated
  object.

> But that's what we need a compiler barrier for in the first place -
> the compiler certainly doesn't understand about this very subtle
> memory ordering issue, and we want to make sure that the code sequence
> *remains* that "if A then write B".

The compiler doesn't magically understand your intention, no.  Some real
work will need to be done to make this work.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:59                                         ` Jakub Jelinek
  2021-06-06 19:15                                           ` Paul E. McKenney
@ 2021-06-06 19:22                                           ` Linus Torvalds
  2021-06-06 20:11                                             ` Segher Boessenkool
  2021-06-06 21:19                                             ` Alexander Monakov
  1 sibling, 2 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-06 19:22 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Alan Stern, Segher Boessenkool, Paul E. McKenney, Peter Zijlstra,
	Will Deacon, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 6, 2021 at 11:59 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> I think just
> #define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")
> should be enough

Oh, I like that. Much better.

It avoids all the issues with comments etc, and because it's not using
__COUNTER__ as a string, it doesn't need the preprocessor games with
double expansion either.

So yeah, that seems like a nice solution to the issue, and should make
the barriers all unique to the compiler.

             Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:48                                             ` Linus Torvalds
  2021-06-06 18:53                                               ` Linus Torvalds
@ 2021-06-06 19:52                                               ` Segher Boessenkool
  2021-06-06 20:11                                                 ` Linus Torvalds
  1 sibling, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 19:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 11:48:32AM -0700, Linus Torvalds wrote:
> On Sun, Jun 6, 2021 at 11:43 AM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> >
> > You truly should have written a branch in the asm if you truly wanted
> > a branch instruction.
> 
> That's exactly what I don't want to do, and what the original patch by
> PeterZ did.

Yes, I know.  But it is literally the *only* way to *always* get a
conditional branch: by writing one.

> And to work well, it needs "asm goto", which is so recent that a lot
> of compilers don't support it (thank God for clang dragging gcc
> kicking and screaming to implement it at all - I'd asked for it over a
> decade ago).

GCC has had it since 2009.

> So you get bad code generation in a lot of cases, which entirely
> obviates the _point_ of this all - which is that we can avoid an
> expensive operation (a memory barrier) by just doing clever code
> generation.
> 
> So if we can't get the clever code generation, it's all pretty much
> moot, imnsho.

Yes.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 19:52                                               ` Segher Boessenkool
@ 2021-06-06 20:11                                                 ` Linus Torvalds
  2021-06-06 20:26                                                   ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Linus Torvalds @ 2021-06-06 20:11 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 6, 2021 at 12:56 PM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> Yes, I know.  But it is literally the *only* way to *always* get a
> conditional branch: by writing one.

The thing is, I don't actually believe you.

The barrier() thing can work - all we need to do is to simply make it
impossible for gcc to validly create anything but a conditional
branch.

If either side of the thing have an asm that cannot be combined, gcc
simply doesn't have any choice in the matter. There's no other valid
model than a conditional branch around it (of some sort - doing an
indirect branch that has a data dependency isn't wrong either, it just
wouldn't be something that a sane compiler would generate because it's
obviously much slower and more complicated).

We are very used to just making the compiler generate the code we
need. That is, fundamentally, what any use of inline asm is all about.
We want the compiler to generate all the common cases and all the
regular instructions.

The conditional branch itself - and the instructions leading up to it
- are exactly those "common regular instructions" that we'd want the
compiler to generate. That is in fact more true here than for most
inline asm, exactly because there are so many different possible
combinations of conditional branches (equal, not equal, less than,..)
and so many ways to generate the code that generates the condition.

So we are much better off letting the compiler do all that for us -
it's very much what the compiler is good at.

               Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 19:22                                           ` Linus Torvalds
@ 2021-06-06 20:11                                             ` Segher Boessenkool
  2021-06-06 21:19                                             ` Alexander Monakov
  1 sibling, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 20:11 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jakub Jelinek, Alan Stern, Paul E. McKenney, Peter Zijlstra,
	Will Deacon, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 12:22:44PM -0700, Linus Torvalds wrote:
> On Sun, Jun 6, 2021 at 11:59 AM Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > I think just
> > #define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")
> > should be enough
> 
> Oh, I like that. Much better.
> 
> It avoids all the issues with comments etc, and because it's not using
> __COUNTER__ as a string, it doesn't need the preprocessor games with
> double expansion either.
> 
> So yeah, that seems like a nice solution to the issue, and should make
> the barriers all unique to the compiler.

__COUNTER__ is a preprocessor thing as well, and it may not do all that
you expect.  Ex.:

===
#define fm() __COUNTER__
int gm(void) { return fm(); }
int hm(void) { return fm(); }

int fi(void) { return __COUNTER__; }
int gi(void) { return fi(); }
int hi(void) { return fi(); }
===

The macro version here works as you would hope, but the inlined one has
the same number everywhere.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 20:11                                                 ` Linus Torvalds
@ 2021-06-06 20:26                                                   ` Segher Boessenkool
  2021-06-06 23:37                                                     ` Paul E. McKenney
  2021-06-07 10:52                                                     ` Peter Zijlstra
  0 siblings, 2 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-06 20:26 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Paul E. McKenney, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 01:11:53PM -0700, Linus Torvalds wrote:
> On Sun, Jun 6, 2021 at 12:56 PM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> >
> > Yes, I know.  But it is literally the *only* way to *always* get a
> > conditional branch: by writing one.
> 
> The thing is, I don't actually believe you.

Fortune favours the bold!

> The barrier() thing can work - all we need to do is to simply make it
> impossible for gcc to validly create anything but a conditional
> branch.

And the only foolproof way of doing that is by writing a branch.

> If either side of the thing have an asm that cannot be combined, gcc
> simply doesn't have any choice in the matter. There's no other valid
> model than a conditional branch around it (of some sort - doing an
> indirect branch that has a data dependency isn't wrong either, it just
> wouldn't be something that a sane compiler would generate because it's
> obviously much slower and more complicated).

Or push something to the stack and return.  Or rewrite the whole thing
as an FSM.  Or or or.

(And yes, there are existing compilers that can do both of these things
on some code).

> We are very used to just making the compiler generate the code we
> need. That is, fundamentally, what any use of inline asm is all about.
> We want the compiler to generate all the common cases and all the
> regular instructions.
> 
> The conditional branch itself - and the instructions leading up to it
> - are exactly those "common regular instructions" that we'd want the
> compiler to generate. That is in fact more true here than for most
> inline asm, exactly because there are so many different possible
> combinations of conditional branches (equal, not equal, less than,..)
> and so many ways to generate the code that generates the condition.
> 
> So we are much better off letting the compiler do all that for us -
> it's very much what the compiler is good at.

Yes, exactly.

I am saying that if you depend on some C code you write resulting
in some particular machine code, without actually *forcing* the compiler
to output that exact machine code, then you will be disappointed.  Maybe
not today, and maybe it will take years, if you are lucky.

(s/forcing/instructing/ of course, compilers have feelings too!)


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 19:22                                           ` Linus Torvalds
  2021-06-06 20:11                                             ` Segher Boessenkool
@ 2021-06-06 21:19                                             ` Alexander Monakov
  2021-06-06 22:38                                               ` Linus Torvalds
  1 sibling, 1 reply; 127+ messages in thread
From: Alexander Monakov @ 2021-06-06 21:19 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jakub Jelinek, Alan Stern, Segher Boessenkool, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch



On Sun, 6 Jun 2021, Linus Torvalds wrote:

> On Sun, Jun 6, 2021 at 11:59 AM Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > I think just
> > #define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")
> > should be enough
> 
> Oh, I like that. Much better.
> 
> It avoids all the issues with comments etc, and because it's not using
> __COUNTER__ as a string, it doesn't need the preprocessor games with
> double expansion either.
> 
> So yeah, that seems like a nice solution to the issue, and should make
> the barriers all unique to the compiler.

It also plants a nice LTO time-bomb (__COUNTER__ values will be unique
only within each LTO input unit, not across all of them).

Alexander

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 21:19                                             ` Alexander Monakov
@ 2021-06-06 22:38                                               ` Linus Torvalds
  2021-06-06 23:39                                                 ` Rasmus Villemoes
                                                                   ` (2 more replies)
  0 siblings, 3 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-06 22:38 UTC (permalink / raw)
  To: Alexander Monakov
  Cc: Jakub Jelinek, Alan Stern, Segher Boessenkool, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On Sun, Jun 6, 2021 at 2:19 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
> > So yeah, that seems like a nice solution to the issue, and should make
> > the barriers all unique to the compiler.
>
> It also plants a nice LTO time-bomb (__COUNTER__ values will be unique
> only within each LTO input unit, not across all of them).

That could be an issue in other circumstances, but for at least
volatile_if() that doesn't much matter. The decision there is purely
local, and it's literally about the two sides of the conditional not
being merged.

Now, an optimizing linker or assembler can of course do anything at
all in theory: and if that ends up being an issue we'd have to have
some way to actually propagate the barrier from being just a compiler
thing. Right now gcc doesn't even output the barrier in the assembly
code, so it's invisible to any optimizing assembler/linker thing.

But I don't think that's an issue with what _currently_ goes on in an
assembler or linker - not even a smart one like LTO.

And such things really are independent of "volatile_if()". We use
barriers for other things where we need to force some kind of
operation ordering, and right now the only thing that re-orders
accesses etc is the compiler.

Btw, since we have compiler people on line, the suggested 'barrier()'
isn't actually perfect for this particular use:

   #define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")

in the general barrier case, we very much want to have that "memory"
clobber, because the whole point of the general barrier case is that
we want to make sure that the compiler doesn't cache memory state
across it (ie the traditional use was basically what we now use
"cpu_relax()" for, and you would use it for busy-looping on some
condition).

In the case of "volatile_if()", we actually would like to have not a
memory clobber, but a "memory read". IOW, it would be a barrier for
any writes taking place, but reads can move around it.

I don't know of any way to express that to the compiler. We've used
hacks for it before (in gcc, BLKmode reads turn into that kind of
barrier in practice, so you can do something like make the memory
input to the asm be a big array). But that turned out to be fairly
unreliable, so now we use memory clobbers even if we just mean "reads
random memory".

Example: variable_test_bit(), which generates a "bt" instruction, does

                     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");

and the memory clobber is obviously wrong: 'bt' only *reads* memory,
but since the whole reason we use it is that it's not just that word
at address 'addr', in order to make sure that any previous writes are
actually stable in memory, we use that "memory" clobber.

It would be much nicer to have a "memory read" marker instead, to let
the compiler know "I need to have done all pending writes to memory,
but I can still cache read values over this op because it doesn't
_change_ memory".

Anybody have ideas or suggestions for something like that?

                 Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 20:26                                                   ` Segher Boessenkool
@ 2021-06-06 23:37                                                     ` Paul E. McKenney
  2021-06-07 14:12                                                       ` Segher Boessenkool
  2021-06-07 10:52                                                     ` Peter Zijlstra
  1 sibling, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-06 23:37 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Linus Torvalds, Alan Stern, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 03:26:16PM -0500, Segher Boessenkool wrote:
> On Sun, Jun 06, 2021 at 01:11:53PM -0700, Linus Torvalds wrote:
> > On Sun, Jun 6, 2021 at 12:56 PM Segher Boessenkool
> > <segher@kernel.crashing.org> wrote:
> > >
> > > Yes, I know.  But it is literally the *only* way to *always* get a
> > > conditional branch: by writing one.
> > 
> > The thing is, I don't actually believe you.
> 
> Fortune favours the bold!
> 
> > The barrier() thing can work - all we need to do is to simply make it
> > impossible for gcc to validly create anything but a conditional
> > branch.
> 
> And the only foolproof way of doing that is by writing a branch.
> 
> > If either side of the thing have an asm that cannot be combined, gcc
> > simply doesn't have any choice in the matter. There's no other valid
> > model than a conditional branch around it (of some sort - doing an
> > indirect branch that has a data dependency isn't wrong either, it just
> > wouldn't be something that a sane compiler would generate because it's
> > obviously much slower and more complicated).
> 
> Or push something to the stack and return.  Or rewrite the whole thing
> as an FSM.  Or or or.
> 
> (And yes, there are existing compilers that can do both of these things
> on some code).
> 
> > We are very used to just making the compiler generate the code we
> > need. That is, fundamentally, what any use of inline asm is all about.
> > We want the compiler to generate all the common cases and all the
> > regular instructions.
> > 
> > The conditional branch itself - and the instructions leading up to it
> > - are exactly those "common regular instructions" that we'd want the
> > compiler to generate. That is in fact more true here than for most
> > inline asm, exactly because there are so many different possible
> > combinations of conditional branches (equal, not equal, less than,..)
> > and so many ways to generate the code that generates the condition.
> > 
> > So we are much better off letting the compiler do all that for us -
> > it's very much what the compiler is good at.
> 
> Yes, exactly.
> 
> I am saying that if you depend on that some C code you write will result
> in some particular machine code, without actually *forcing* the compiler
> to output that exact machine code, then you will be disappointed.  Maybe
> not today, and maybe it will take years, if you are lucky.
> 
> (s/forcing/instructing/ of course, compilers have feelings too!)

OK, I will bite...

What would you suggest as a way of instructing the compiler to emit the
conditional branch that we are looking for?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 22:38                                               ` Linus Torvalds
@ 2021-06-06 23:39                                                 ` Rasmus Villemoes
  2021-06-06 23:44                                                   ` Rasmus Villemoes
  2021-06-07  8:01                                                 ` Alexander Monakov
  2021-06-07 17:42                                                 ` Segher Boessenkool
  2 siblings, 1 reply; 127+ messages in thread
From: Rasmus Villemoes @ 2021-06-06 23:39 UTC (permalink / raw)
  To: Linus Torvalds, Alexander Monakov
  Cc: Jakub Jelinek, Alan Stern, Segher Boessenkool, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On 07/06/2021 00.38, Linus Torvalds wrote:

> Example: variable_test_bit(), which generates a "bt" instruction, does
> 
>                      : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> 
> and the memory clobber is obviously wrong: 'bt' only *reads* memory,
> but since the whole reason we use it is that it's not just that word
> at address 'addr', in order to make sure that any previous writes are
> actually stable in memory, we use that "memory" clobber.
> 
> It would be much nicer to have a "memory read" marker instead, to let
> the compiler know "I need to have done all pending writes to memory,
> but I can still cache read values over this op because it doesn't
> _change_ memory".
> 
> Anybody have ideas or suggestions for something like that?

The obvious thing is to try and mark the function as pure. But when
applied to a static inline, gcc seems to read the contents and say "nah,
you have something here that declares itself to possibly write to
memory". Replacing with a call to an extern function marked pure does
indeed cause gcc to cache the value of y*z, so in theory this should be
possible, if one could convince gcc to "trust me, this really is a pure
function".

https://godbolt.org/z/s4546K6Pj

Rasmus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 23:39                                                 ` Rasmus Villemoes
@ 2021-06-06 23:44                                                   ` Rasmus Villemoes
  0 siblings, 0 replies; 127+ messages in thread
From: Rasmus Villemoes @ 2021-06-06 23:44 UTC (permalink / raw)
  To: Linus Torvalds, Alexander Monakov
  Cc: Jakub Jelinek, Alan Stern, Segher Boessenkool, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On 07/06/2021 01.39, Rasmus Villemoes wrote:

> memory". Replacing with a call to an extern function marked pure does
> indeed cause gcc to cache the value of y*z, so in theory this should be
> possible, if one could convince gcc to "trust me, this really is a pure
> function".

Don't know why I didn't think to check before sending, but FWIW clang
doesn't need convincing, it already takes the __pure at face value and
caches y*z.

Rasmus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 22:38                                               ` Linus Torvalds
  2021-06-06 23:39                                                 ` Rasmus Villemoes
@ 2021-06-07  8:01                                                 ` Alexander Monakov
  2021-06-07  8:27                                                   ` Marco Elver
  2021-06-07 17:52                                                   ` Segher Boessenkool
  2021-06-07 17:42                                                 ` Segher Boessenkool
  2 siblings, 2 replies; 127+ messages in thread
From: Alexander Monakov @ 2021-06-07  8:01 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jakub Jelinek, Alan Stern, Segher Boessenkool, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On Sun, 6 Jun 2021, Linus Torvalds wrote:

> On Sun, Jun 6, 2021 at 2:19 PM Alexander Monakov <amonakov@ispras.ru> wrote:
> >
> > > So yeah, that seems like a nice solution to the issue, and should make
> > > the barriers all unique to the compiler.
> >
> > It also plants a nice LTO time-bomb (__COUNTER__ values will be unique
> > only within each LTO input unit, not across all of them).
> 
> That could be an issue in other circumstances, but for at least
> volatile_if() that doesn't much matter. The decision there is purely
> local, and it's literally about the two sides of the conditional not
> being merged.
> 
> Now, an optimizing linker or assembler can of course do anything at
> all in theory: and if that ends up being an issue we'd have to have
> some way to actually propagate the barrier from being just a compiler
> thing. Right now gcc doesn't even output the barrier in the assembly
> code, so it's invisible to any optimizing assembler/linker thing.
> 
> But I don't think that's an issue with what _currently_ goes on in an
> assembler or linker - not even a smart one like LTO.
> 
> And such things really are independent of "volatile_if()". We use
> barriers for other things where we need to force some kind of
> operation ordering, and right now the only thing that re-orders
> accesses etc is the compiler.

Uhh... I was not talking about some (non-existent) "optimizing linker".
LTO works by relaunching the compiler from the linker and letting it
consume multiple translation units (which are fully preprocessed by that
point). So the very thing you wanted to avoid -- such barriers appearing
in close proximity where they can be deduplicated -- may arise after a
little bit of cross-unit inlining.

My main point here is that using __COUNTER__ that way (making things
"unique" for the compiler) does not work in general when LTO enters the
picture. As long as that is remembered, I'm happy.

> Btw, since we have compiler people on line, the suggested 'barrier()'
> isn't actually perfect for this particular use:
> 
>    #define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")
> 
> in the general barrier case, we very much want to have that "memory"
> clobber, because the whole point of the general barrier case is that
> we want to make sure that the compiler doesn't cache memory state
> across it (ie the traditional use was basically what we now use
> "cpu_relax()" for, and you would use it for busy-looping on some
> condition).
> 
> In the case of "volatile_if()", we actually would like to have not a
> memory clobber, but a "memory read". IOW, it would be a barrier for
> any writes taking place, but reads can move around it.
> 
> I don't know of any way to express that to the compiler. We've used
> hacks for it before (in gcc, BLKmode reads turn into that kind of
> barrier in practice, so you can do something like make the memory
> input to the asm be a big array). But that turned out to be fairly
> unreliable, so now we use memory clobbers even if we just mean "reads
> random memory".

So the barrier which is a compiler barrier but not a machine barrier is
__atomic_signal_fence(model), but internally GCC will not treat it smarter
than an asm-with-memory-clobber today.

> Example: variable_test_bit(), which generates a "bt" instruction, does
> 
>                      : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> 
> and the memory clobber is obviously wrong: 'bt' only *reads* memory,
> but since the whole reason we use it is that it's not just that word
> at address 'addr', in order to make sure that any previous writes are
> actually stable in memory, we use that "memory" clobber.
> 
> It would be much nicer to have a "memory read" marker instead, to let
> the compiler know "I need to have done all pending writes to memory,
> but I can still cache read values over this op because it doesn't
> _change_ memory".
> 
> Anybody have ideas or suggestions for something like that?

In the specific case of 'bt', the offset cannot be negative, so I think you
can simply spell out the extent of the array being accessed:

    : "m" *(unsigned long (*)[-1UL / 8 / sizeof(long) + 1])addr

In the general case (possibility of negative offsets, or no obvious base to
supply), have you considered adding a "wild read" through a char pointer
that is initialized in a non-transparent way? Like this:

  char *wild_pointer;

  asm(""
      : "=X"(wild_pointer)
      : "X"(base1)
      , "X"(base2)); // unknown value related to given base pointers

  asm("pattern"
      : // normal outputs
      : // normal inputs
      , "m"(*wild_pointer));

The "X" constraint in theory should tie up neither a register nor a stack
slot.

Alexander

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07  8:01                                                 ` Alexander Monakov
@ 2021-06-07  8:27                                                   ` Marco Elver
  2021-06-07 15:28                                                     ` Paul E. McKenney
  2021-06-07 17:52                                                   ` Segher Boessenkool
  1 sibling, 1 reply; 127+ messages in thread
From: Marco Elver @ 2021-06-07  8:27 UTC (permalink / raw)
  To: Alexander Monakov
  Cc: Linus Torvalds, Jakub Jelinek, Alan Stern, Segher Boessenkool,
	Paul E. McKenney, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Mon, 7 Jun 2021 at 10:02, Alexander Monakov <amonakov@ispras.ru> wrote:
> On Sun, 6 Jun 2021, Linus Torvalds wrote:
[...]
> > On Sun, Jun 6, 2021 at 2:19 PM Alexander Monakov <amonakov@ispras.ru> wrote:
[...]
> > Btw, since we have compiler people on line, the suggested 'barrier()'
> > isn't actually perfect for this particular use:
> >
> >    #define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")
> >
> > in the general barrier case, we very much want to have that "memory"
> > clobber, because the whole point of the general barrier case is that
> > we want to make sure that the compiler doesn't cache memory state
> > across it (ie the traditional use was basically what we now use
> > "cpu_relax()" for, and you would use it for busy-looping on some
> > condition).
> >
> > In the case of "volatile_if()", we actually would like to have not a
> > memory clobber, but a "memory read". IOW, it would be a barrier for
> > any writes taking place, but reads can move around it.
> >
> > I don't know of any way to express that to the compiler. We've used
> > hacks for it before (in gcc, BLKmode reads turn into that kind of
> > barrier in practice, so you can do something like make the memory
> > input to the asm be a big array). But that turned out to be fairly
> > unreliable, so now we use memory clobbers even if we just mean "reads
> > random memory".
>
> So the barrier which is a compiler barrier but not a machine barrier is
> __atomic_signal_fence(model), but internally GCC will not treat it smarter
> than an asm-with-memory-clobber today.

FWIW, Clang seems to be cleverer about it, and seems to do the optimal
thing if I use a __atomic_signal_fence(__ATOMIC_RELEASE):
https://godbolt.org/z/4v5xojqaY

Thanks,
-- Marco

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 18:43                                             ` Linus Torvalds
@ 2021-06-07 10:43                                               ` Peter Zijlstra
  2021-06-07 11:52                                                 ` Will Deacon
  0 siblings, 1 reply; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-07 10:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Stern, Segher Boessenkool, Paul E. McKenney, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 11:43:42AM -0700, Linus Torvalds wrote:
> So while the example code is insane and pointless (and you shouldn't
> read *too* much into it), conceptually the notion of that pattern of
> 
>     if (READ_ONCE(a)) {
>         WRITE_ONCE(b,1);
>         .. do something ..
>     } else {
>         WRITE_ONCE(b,1);
>         .. do something else ..
>     }

This is actually more tricky than it would appear (isn't it always).

The thing is, that normally we must avoid speculative stores, because
they'll result in out-of-thin-air values.

*Except* in this case, where both branches emit the same store, then
it's a given that the store will happen and it will not be OOTA.
Someone's actually done the proof for that apparently (Will, you have a
reference to Jade's paper?)

There's apparently also a competition going on who can build the
weakestest ARM64 implementation ever.

Combine the two, and you'll get a CPU that *will* emit the store early
:/

So it might be prudent to make this pattern as difficult as possible (a
compiler implementation of volatile_if might be able to observe and WARN
about this).

How's something like (leaving the improved barrier() aside for now):

#define volatile_if(x) \
	if (!(({ _Bool __x = (x); BUILD_BUG_ON(__builtin_constant_p(__x)); __x; }) && \
	     ({ barrier(); 1; }))) { } else

That makes writing:

	volatile_if(READ_ONCE(a)) {
		WRITE_ONCE(b, 1);
		// something
	} else {
		WRITE_ONCE(b, 1);
		// something else
	}

A syntax error, due to volatile_if() already being an else. And yes,
there are plenty of other ways to write the same :/

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 20:26                                                   ` Segher Boessenkool
  2021-06-06 23:37                                                     ` Paul E. McKenney
@ 2021-06-07 10:52                                                     ` Peter Zijlstra
  2021-06-07 14:16                                                       ` Segher Boessenkool
  1 sibling, 1 reply; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-07 10:52 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Linus Torvalds, Alan Stern, Paul E. McKenney, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 03:26:16PM -0500, Segher Boessenkool wrote:
> On Sun, Jun 06, 2021 at 01:11:53PM -0700, Linus Torvalds wrote:

> > We are very used to just making the compiler generate the code we
> > need. That is, fundamentally, what any use of inline asm is all about.
> > We want the compiler to generate all the common cases and all the
> > regular instructions.
> > 
> > The conditional branch itself - and the instructions leading up to it
> > - are exactly those "common regular instructions" that we'd want the
> > compiler to generate. That is in fact more true here than for most
> > inline asm, exactly because there are so many different possible
> > combinations of conditional branches (equal, not equal, less than,..)
> > and so many ways to generate the code that generates the condition.
> > 
> > So we are much better off letting the compiler do all that for us -
> > it's very much what the compiler is good at.
> 
> Yes, exactly.
> 
> I am saying that if you depend on that some C code you write will result
> in some particular machine code, without actually *forcing* the compiler
> to output that exact machine code, then you will be disappointed.  Maybe
> not today, and maybe it will take years, if you are lucky.
> 
> (s/forcing/instructing/ of course, compilers have feelings too!)

And hence the request for a language extension. Both compilers have a
vast array of language extensions that are outside of the C spec (thank
you!), so can we please get one more?



^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 10:43                                               ` Peter Zijlstra
@ 2021-06-07 11:52                                                 ` Will Deacon
  2021-06-07 15:25                                                   ` Paul E. McKenney
       [not found]                                                   ` <20210730172020.GA32396@knuckles.cs.ucl.ac.uk>
  0 siblings, 2 replies; 127+ messages in thread
From: Will Deacon @ 2021-06-07 11:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Alan Stern, Segher Boessenkool, Paul E. McKenney,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 12:43:01PM +0200, Peter Zijlstra wrote:
> On Sun, Jun 06, 2021 at 11:43:42AM -0700, Linus Torvalds wrote:
> > So while the example code is insane and pointless (and you shouldn't
> > read *too* much into it), conceptually the notion of that pattern of
> > 
> >     if (READ_ONCE(a)) {
> >         WRITE_ONCE(b,1);
> >         .. do something ..
> >     } else {
> >         WRITE_ONCE(b,1);
> >         .. do something else ..
> >     }
> 
> This is actually more tricky than it would appear (isn't it always).
> 
> The thing is, that normally we must avoid speculative stores, because
> they'll result in out-of-thin-air values.
> 
> *Except* in this case, where both branches emit the same store, then
> it's a given that the store will happen and it will not be OOTA.
> Someone's actually done the proof for that apparently (Will, you have a
> reference to Jade's paper?)

I don't think there's a paper on this, but Jade and I are hoping to talk
about aspects of it at LPC (assuming the toolchain MC gets accepted).

> There's apparently also a competition going on who can build the
> weakestest ARM64 implementation ever.
> 
> Combine the two, and you'll get a CPU that *will* emit the store early
> :/

So there are a lot of important details missing here and, as above, I think
this is something worth discussing at LPC with Jade. The rough summary is
that the arm64 memory model recently (so recently that it's not yet landed
in the public docs) introduced something called "pick dependencies", which
are a bit like control dependencies only they don't create order to all
subsequent stores. These are useful for some conditional data-processing
instructions such as CSEL and CAS, but it's important to note here that
*conditional branch instructions behave exactly as you would expect*.

<disclaimer; I don't work for Arm so any mistakes here are mine>

To reiterate, in the code sequence at the top of this mail, if the compiler
emits something along the lines of:

	LDR
	<conditional branch instruction>
	STR

then the load *will* be ordered before the store, even if the same store
instruction is executed regardless of the branch direction. Yes, one can
fantasize about a CPU that executes both taken and non-taken paths and
figures out that the STR can be hoisted before the load, but that is not
allowed by the architecture today.

It's the conditional instructions that are more fun. For example, the CSEL
instruction:

	CSEL	X0, X1, X2, <cond>

basically says:

	if (cond)
		X0 = X1;
	else
		X0 = X2;

these are just register-register operations, but the idea is that the CPU
can predict that "branching event" inside the CSEL instruction and
speculatively rename X0 while waiting for the condition to resolve.

So then you can add loads and stores to the mix along the lines of:

	LDR	X0, [X1]		// X0 = *X1
	CMP	X0, X2
	CSEL	X3, X4, X5, EQ		// X3 = (X0 == X2) ? X4 : X5
	STR	X3, [X6]		// MUST BE ORDERED AFTER THE LOAD
	STR	X7, [X8]		// Can be reordered

(assuming X1, X6, X8 all point to different locations in memory)

So now we have a dependency from the load to the first store, but the
interesting part is that the last store is _not_ ordered wrt either of the
other two memory accesses, whereas it would be if we used a conditional
branch instead of the CSEL. Make sense?

Now, obviously the compiler is blissfully unaware that conditional
data processing instructions can give rise to different dependencies than
conditional branches, so the question really is how much do we need to
care in the kernel?

My preference is to use load-acquire instead of control dependencies so
that we don't have to worry about this, or any future relaxations to the
CPU architecture, at all.

Jade -- please can you correct me if I got any of this wrong?

Will

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 23:37                                                     ` Paul E. McKenney
@ 2021-06-07 14:12                                                       ` Segher Boessenkool
  2021-06-07 15:27                                                         ` Paul E. McKenney
  0 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-07 14:12 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Linus Torvalds, Alan Stern, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Sun, Jun 06, 2021 at 04:37:29PM -0700, Paul E. McKenney wrote:
> > > The barrier() thing can work - all we need to do is to simply make it
> > > impossible for gcc to validly create anything but a conditional
> > > branch.
> > 
> > And the only foolproof way of doing that is by writing a branch.

[ ... ]

> > I am saying that if you depend on that some C code you write will result
> > in some particular machine code, without actually *forcing* the compiler
> > to output that exact machine code, then you will be disappointed.  Maybe
> > not today, and maybe it will take years, if you are lucky.
> > 
> > (s/forcing/instructing/ of course, compilers have feelings too!)
> 
> OK, I will bite...
> 
> What would you suggest as a way of instructing the compiler to emit the
> conditional branch that we are looking for?

You write it in the assembler code.

Yes, it sucks.  But it is the only way to get a branch if you really
want one.  Now, you do not really need one here anyway, so there may be
some other way to satisfy the actual requirements.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 10:52                                                     ` Peter Zijlstra
@ 2021-06-07 14:16                                                       ` Segher Boessenkool
  0 siblings, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-07 14:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Alan Stern, Paul E. McKenney, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 12:52:15PM +0200, Peter Zijlstra wrote:
> On Sun, Jun 06, 2021 at 03:26:16PM -0500, Segher Boessenkool wrote:
> > I am saying that if you depend on that some C code you write will result
> > in some particular machine code, without actually *forcing* the compiler
> > to output that exact machine code, then you will be disappointed.  Maybe
> > not today, and maybe it will take years, if you are lucky.
> > 
> > (s/forcing/instructing/ of course, compilers have feelings too!)
> 
> And hence the request for a language extension. Both compilers have a
> vast array of language extensions that are outside of the C spec (thank
> you!), so can we please get one more?

I don't see why not?  It will need to be well-defined, so that it *can*
be implemented.  And ideally it will be useful for other applications as
well.  Finally, it should "play nice" with other extensions and language
features.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 11:52                                                 ` Will Deacon
@ 2021-06-07 15:25                                                   ` Paul E. McKenney
  2021-06-07 16:02                                                     ` Will Deacon
       [not found]                                                   ` <20210730172020.GA32396@knuckles.cs.ucl.ac.uk>
  1 sibling, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-07 15:25 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, Linus Torvalds, Alan Stern, Segher Boessenkool,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 12:52:35PM +0100, Will Deacon wrote:
> On Mon, Jun 07, 2021 at 12:43:01PM +0200, Peter Zijlstra wrote:
> > On Sun, Jun 06, 2021 at 11:43:42AM -0700, Linus Torvalds wrote:
> > > So while the example code is insane and pointless (and you shouldn't
> > > read *too* much into it), conceptually the notion of that pattern of
> > > 
> > >     if (READ_ONCE(a)) {
> > >         WRITE_ONCE(b,1);
> > >         .. do something ..
> > >     } else {
> > >         WRITE_ONCE(b,1);
> > >         .. do something else ..
> > >     }
> > 
> > This is actually more tricky than it would appear (isn't it always).
> > 
> > The thing is, that normally we must avoid speculative stores, because
> > they'll result in out-of-thin-air values.
> > 
> > *Except* in this case, where both branches emit the same store, then
> > it's a given that the store will happen and it will not be OOTA.
> > Someone's actually done the proof for that apparently (Will, you have a
> > reference to Jade's paper?)
> 
> I don't think there's a paper on this, but Jade and I are hoping to talk
> about aspects of it at LPC (assuming the toolchain MC gets accepted).
> 
> > There's apparently also a competition going on who can build the
> > weakestest ARM64 implementation ever.
> > 
> > Combine the two, and you'll get a CPU that *will* emit the store early
> > :/
> 
> So there are a lot of important details missing here and, as above, I think
> this is something worth discussing at LPC with Jade. The rough summary is
> that the arm64 memory model recently (so recently that it's not yet landed
> in the public docs) introduced something called "pick dependencies", which
> are a bit like control dependencies only they don't create order to all
> subsequent stores. These are useful for some conditional data-processing
> instructions such as CSEL and CAS, but it's important to note here that
> *conditional branch instructions behave exactly as you would expect*.
> 
> <disclaimer; I don't work for Arm so any mistakes here are mine>
> 
> To reiterate, in the code sequence at the top of this mail, if the compiler
> emits something along the lines of:
> 
> 	LDR
> 	<conditional branch instruction>
> 	STR
> 
> then the load *will* be ordered before the store, even if the same store
> instruction is executed regardless of the branch direction. Yes, one can
> fantasize about a CPU that executes both taken and non-taken paths and
> figures out that the STR can be hoisted before the load, but that is not
> allowed by the architecture today.
> 
> It's the conditional instructions that are more fun. For example, the CSEL
> instruction:
> 
> 	CSEL	X0, X1, X2, <cond>
> 
> basically says:
> 
> 	if (cond)
> 		X0 = X1;
> 	else
> 		X0 = X2;
> 
> these are just register-register operations, but the idea is that the CPU
> can predict that "branching event" inside the CSEL instruction and
> speculatively rename X0 while waiting for the condition to resolve.
> 
> So then you can add loads and stores to the mix along the lines of:
> 
> 	LDR	X0, [X1]		// X0 = *X1
> 	CMP	X0, X2
> 	CSEL	X3, X4, X5, EQ		// X3 = (X0 == X2) ? X4 : X5
> 	STR	X3, [X6]		// MUST BE ORDERED AFTER THE LOAD
> 	STR	X7, [X8]		// Can be reordered
> 
> (assuming X1, X6, X8 all point to different locations in memory)
> 
> So now we have a dependency from the load to the first store, but the
> interesting part is that the last store is _not_ ordered wrt either of the
> other two memory accesses, whereas it would be if we used a conditional
> branch instead of the CSEL. Make sense?

And if I remember correctly, this is why LKMM orders loads in the
"if" condition only with stores in the "then" and "else" clauses,
not with stores after the end of the "if" statement.  Or is there
some case that I am missing?

> Now, obviously the compiler is blissfully unaware that conditional
> data processing instructions can give rise to different dependencies than
> conditional branches, so the question really is how much do we need to
> care in the kernel?
> 
> My preference is to use load-acquire instead of control dependencies so
> that we don't have to worry about this, or any future relaxations to the
> CPU architecture, at all.

From what I can see, ARMv8 has DMB(LD) and DMB(ST).  Does it have
something like a DMB(LD,ST) that would act something like powerpc lwsync?

Or are you proposing rewriting the "if" conditions to upgrade
READ_ONCE() to smp_load_acquire()?  Or something else?

Just trying to find out exactly what you are proposing.  ;-)

						Thanx, Paul

> Jade -- please can you correct me if I got any of this wrong?
> 
> Will

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 14:12                                                       ` Segher Boessenkool
@ 2021-06-07 15:27                                                         ` Paul E. McKenney
  2021-06-07 18:23                                                           ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-07 15:27 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Linus Torvalds, Alan Stern, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 09:12:42AM -0500, Segher Boessenkool wrote:
> On Sun, Jun 06, 2021 at 04:37:29PM -0700, Paul E. McKenney wrote:
> > > > The barrier() thing can work - all we need to do is to simply make it
> > > > impossible for gcc to validly create anything but a conditional
> > > > branch.
> > > 
> > > And the only foolproof way of doing that is by writing a branch.
> 
> [ ... ]
> 
> > > I am saying that if you depend on that some C code you write will result
> > > in some particular machine code, without actually *forcing* the compiler
> > > to output that exact machine code, then you will be disappointed.  Maybe
> > > not today, and maybe it will take years, if you are lucky.
> > > 
> > > (s/forcing/instructing/ of course, compilers have feelings too!)
> > 
> > OK, I will bite...
> > 
> > What would you suggest as a way of instructing the compiler to emit the
> > conditional branch that we are looking for?
> 
> You write it in the assembler code.
> 
> Yes, it sucks.  But it is the only way to get a branch if you really
> want one.  Now, you do not really need one here anyway, so there may be
> some other way to satisfy the actual requirements.

Hmmm...  What do you see Peter asking for that is different than what
I am asking for?  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07  8:27                                                   ` Marco Elver
@ 2021-06-07 15:28                                                     ` Paul E. McKenney
  2021-06-07 17:04                                                       ` Marco Elver
  0 siblings, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-07 15:28 UTC (permalink / raw)
  To: Marco Elver
  Cc: Alexander Monakov, Linus Torvalds, Jakub Jelinek, Alan Stern,
	Segher Boessenkool, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 10:27:10AM +0200, Marco Elver wrote:
> On Mon, 7 Jun 2021 at 10:02, Alexander Monakov <amonakov@ispras.ru> wrote:
> > On Sun, 6 Jun 2021, Linus Torvalds wrote:
> [...]
> > > On Sun, Jun 6, 2021 at 2:19 PM Alexander Monakov <amonakov@ispras.ru> wrote:
> [...]
> > > Btw, since we have compiler people on line, the suggested 'barrier()'
> > > isn't actually perfect for this particular use:
> > >
> > >    #define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")
> > >
> > > in the general barrier case, we very much want to have that "memory"
> > > clobber, because the whole point of the general barrier case is that
> > > we want to make sure that the compiler doesn't cache memory state
> > > across it (ie the traditional use was basically what we now use
> > > "cpu_relax()" for, and you would use it for busy-looping on some
> > > condition).
> > >
> > > In the case of "volatile_if()", we actually would like to have not a
> > > memory clobber, but a "memory read". IOW, it would be a barrier for
> > > any writes taking place, but reads can move around it.
> > >
> > > I don't know of any way to express that to the compiler. We've used
> > > hacks for it before (in gcc, BLKmode reads turn into that kind of
> > > barrier in practice, so you can do something like make the memory
> > > input to the asm be a big array). But that turned out to be fairly
> > > unreliable, so now we use memory clobbers even if we just mean "reads
> > > random memory".
> >
> > So the barrier which is a compiler barrier but not a machine barrier is
> > __atomic_signal_fence(model), but internally GCC will not treat it smarter
> > than an asm-with-memory-clobber today.
> 
> FWIW, Clang seems to be cleverer about it, and seems to do the optimal
> thing if I use a __atomic_signal_fence(__ATOMIC_RELEASE):
> https://godbolt.org/z/4v5xojqaY

Indeed it does!  But I don't know of a guarantee for that helpful
behavior.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 15:25                                                   ` Paul E. McKenney
@ 2021-06-07 16:02                                                     ` Will Deacon
  2021-06-07 18:08                                                       ` Paul E. McKenney
  0 siblings, 1 reply; 127+ messages in thread
From: Will Deacon @ 2021-06-07 16:02 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Linus Torvalds, Alan Stern, Segher Boessenkool,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

Hi Paul,

On Mon, Jun 07, 2021 at 08:25:33AM -0700, Paul E. McKenney wrote:
> On Mon, Jun 07, 2021 at 12:52:35PM +0100, Will Deacon wrote:
> > It's the conditional instructions that are more fun. For example, the CSEL
> > instruction:
> > 
> > 	CSEL	X0, X1, X2, <cond>
> > 
> > basically says:
> > 
> > 	if (cond)
> > 		X0 = X1;
> > 	else
> > 		X0 = X2;
> > 
> > these are just register-register operations, but the idea is that the CPU
> > can predict that "branching event" inside the CSEL instruction and
> > speculatively rename X0 while waiting for the condition to resolve.
> > 
> > So then you can add loads and stores to the mix along the lines of:
> > 
> > 	LDR	X0, [X1]		// X0 = *X1
> > 	CMP	X0, X2
> > 	CSEL	X3, X4, X5, EQ		// X3 = (X0 == X2) ? X4 : X5
> > 	STR	X3, [X6]		// MUST BE ORDERED AFTER THE LOAD
> > 	STR	X7, [X8]		// Can be reordered
> > 
> > (assuming X1, X6, X8 all point to different locations in memory)
> > 
> > So now we have a dependency from the load to the first store, but the
> > interesting part is that the last store is _not_ ordered wrt either of the
> > other two memory accesses, whereas it would be if we used a conditional
> > branch instead of the CSEL. Make sense?
> 
> And if I remember correctly, this is why LKMM orders loads in the
> "if" condition only with stores in the "then" and "else" clauses,
> not with stores after the end of the "if" statement.  Or is there
> some case that I am missing?

It's not clear to me that such a restriction prevents the compiler from
using any of the arm64 conditional instructions in place of the conditional
branch in such a way that you end up with an "independent" store in the
assembly output constructed from two stores on the "then" and "else" paths
which the compiler determined were the same.

> > Now, obviously the compiler is blissfully unaware that conditional
> > data processing instructions can give rise to different dependencies than
> > conditional branches, so the question really is how much do we need to
> > care in the kernel?
> > 
> > My preference is to use load-acquire instead of control dependencies so
> > that we don't have to worry about this, or any future relaxations to the
> > CPU architecture, at all.
> 
> From what I can see, ARMv8 has DMB(LD) and DMB(ST).  Does it have
> something like a DMB(LD,ST) that would act something like powerpc lwsync?
> 
> Or are you proposing rewriting the "if" conditions to upgrade
> READ_ONCE() to smp_load_acquire()?  Or something else?
> 
> Just trying to find out exactly what you are proposing.  ;-)

Some options are:

 (1) Do nothing until something actually goes wrong (and hope we spot/debug it)

 (2) Have volatile_if force a conditional branch, assuming that it solves
     the problem and doesn't hurt codegen (I still haven't convinced myself
     for either case)

 (3) Upgrade READ_ONCE() to RCpc acquire, relaxed atomic RMWs to RCsc
     acquire on arm64

 (4) Introduce e.g. READ_ONCE_CTRL(), atomic_add_return_ctrl() etc
     specifically for control dependencies and upgrade only those for
     arm64

 (5) Work to get toolchain support for dependency ordering and use that

I'm suggesting (3) or (4) because, honestly, it feels like we're being
squeezed from both sides with both the compiler and the hardware prepared
to break control dependencies.

Will

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 15:28                                                     ` Paul E. McKenney
@ 2021-06-07 17:04                                                       ` Marco Elver
  2021-06-08  9:30                                                         ` Marco Elver
  0 siblings, 1 reply; 127+ messages in thread
From: Marco Elver @ 2021-06-07 17:04 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Alexander Monakov, Linus Torvalds, Jakub Jelinek, Alan Stern,
	Segher Boessenkool, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 08:28AM -0700, Paul E. McKenney wrote:
> On Mon, Jun 07, 2021 at 10:27:10AM +0200, Marco Elver wrote:
> > On Mon, 7 Jun 2021 at 10:02, Alexander Monakov <amonakov@ispras.ru> wrote:
> > > On Sun, 6 Jun 2021, Linus Torvalds wrote:
> > [...]
> > > > On Sun, Jun 6, 2021 at 2:19 PM Alexander Monakov <amonakov@ispras.ru> wrote:
> > [...]
> > > > Btw, since we have compiler people on line, the suggested 'barrier()'
> > > > isn't actually perfect for this particular use:
> > > >
> > > >    #define barrier() __asm__ __volatile__("" : : "i" (__COUNTER__) : "memory")
> > > >
> > > > in the general barrier case, we very much want to have that "memory"
> > > > clobber, because the whole point of the general barrier case is that
> > > > we want to make sure that the compiler doesn't cache memory state
> > > > across it (ie the traditional use was basically what we now use
> > > > "cpu_relax()" for, and you would use it for busy-looping on some
> > > > condition).
> > > >
> > > > In the case of "volatile_if()", we actually would like to have not a
> > > > memory clobber, but a "memory read". IOW, it would be a barrier for
> > > > any writes taking place, but reads can move around it.
> > > >
> > > > I don't know of any way to express that to the compiler. We've used
> > > > hacks for it before (in gcc, BLKmode reads turn into that kind of
> > > > barrier in practice, so you can do something like make the memory
> > > > input to the asm be a big array). But that turned out to be fairly
> > > > unreliable, so now we use memory clobbers even if we just mean "reads
> > > > random memory".
> > >
> > > So the barrier which is a compiler barrier but not a machine barrier is
> > > __atomic_signal_fence(model), but internally GCC will not treat it smarter
> > > than an asm-with-memory-clobber today.
> > 
> > FWIW, Clang seems to be cleverer about it, and seems to do the optimal
> > thing if I use a __atomic_signal_fence(__ATOMIC_RELEASE):
> > https://godbolt.org/z/4v5xojqaY
> 
> Indeed it does!  But I don't know of a guarantee for that helpful
> behavior.

Is there a way we can interpret the standard in such a way that it
should be guaranteed?

If yes, it should be easy to add tests to the compiler repos for
snippets that the Linux kernel relies on (if we decide to use
__atomic_signal_fence() for this).

If no, we can still try to add tests to the compiler repos, but may
receive some push-back at the very latest when some optimization pass
decides to break it. Because the argument then is that it's well within
the language standard.

Adding language extensions will likely be met with resistance, because
some compiler folks are afraid of creating language forks (the reason
why we have '-enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang').
That could be solved if we declare Linux-C a "standard", and finally get
-std=linux or such, at which point asking for "volatile if" directly
would probably be easier without jumping through hoops.

The jumping-through-hoops variant would probably be asking for a
__builtin primitive that allows constructing volatile_if() (if we can't
bend existing primitives to do what we want).

Thanks,
-- Marco

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-06 22:38                                               ` Linus Torvalds
  2021-06-06 23:39                                                 ` Rasmus Villemoes
  2021-06-07  8:01                                                 ` Alexander Monakov
@ 2021-06-07 17:42                                                 ` Segher Boessenkool
  2021-06-07 20:31                                                   ` Linus Torvalds
  2 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-07 17:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexander Monakov, Jakub Jelinek, Alan Stern, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On Sun, Jun 06, 2021 at 03:38:06PM -0700, Linus Torvalds wrote:
> In the case of "volatile_if()", we actually would like to have not a
> memory clobber, but a "memory read". IOW, it would be a barrier for
> any writes taking place, but reads can move around it.
> 
> I don't know of any way to express that to the compiler. We've used
> hacks for it before (in gcc, BLKmode reads turn into that kind of
> barrier in practice, so you can do something like make the memory
> input to the asm be a big array). But that turned out to be fairly
> unreliable, so now we use memory clobbers even if we just mean "reads
> random memory".
> 
> Example: variable_test_bit(), which generates a "bt" instruction, does
> 
>                      : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> 
> and the memory clobber is obviously wrong: 'bt' only *reads* memory,
> but since the whole reason we use it is that it's not just that word
> at address 'addr', in order to make sure that any previous writes are
> actually stable in memory, we use that "memory" clobber.

You can split the "I" version from the "r" version, it does not need
the memory clobber.  If you know the actual maximum bit offset used you
don't need the clobber for "r" either.  Or you could even write
  "m"(((unsigned long *)addr)[nr/32])
That should work for all cases.

> Anybody have ideas or suggestions for something like that?

Is it useful in general for the kernel to have separate "read" and
"write" clobbers in asm expressions?  And for other applications?


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07  8:01                                                 ` Alexander Monakov
  2021-06-07  8:27                                                   ` Marco Elver
@ 2021-06-07 17:52                                                   ` Segher Boessenkool
  2021-06-07 18:07                                                     ` Alexander Monakov
  1 sibling, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-07 17:52 UTC (permalink / raw)
  To: Alexander Monakov
  Cc: Linus Torvalds, Jakub Jelinek, Alan Stern, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On Mon, Jun 07, 2021 at 11:01:39AM +0300, Alexander Monakov wrote:
> Uhh... I was not talking about some (non-existent) "optimizing linker".
> LTO works by relaunching the compiler from the linker and letting it
> consume multiple translation units (which are fully preprocessed by that
> point). So the very thing you wanted to avoid -- such barriers appearing
> in close proximity where they can be deduplicated -- may arise after a
> little bit of cross-unit inlining.
> 
> My main point here is that using __COUNTER__ that way (making things
> "unique" for the compiler) does not work in general when LTO enters the
> picture. As long as that is remembered, I'm happy.

Yup.  Exactly the same issue as using this in any function that may end
up inlined.

> > In the case of "volatile_if()", we actually would like to have not a
> > memory clobber, but a "memory read". IOW, it would be a barrier for
> > any writes taking place, but reads can move around it.
> > 
> > I don't know of any way to express that to the compiler. We've used
> > hacks for it before (in gcc, BLKmode reads turn into that kind of
> > barrier in practice, so you can do something like make the memory
> > input to the asm be a big array). But that turned out to be fairly
> > unreliable, so now we use memory clobbers even if we just mean "reads
> > random memory".
> 
> So the barrier which is a compiler barrier but not a machine barrier is
> __atomic_signal_fence(model), but internally GCC will not treat it smarter
> than an asm-with-memory-clobber today.

It will do nothing for relaxed ordering, and do blockage for everything
else.  Can it do anything weaker than that?


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 17:52                                                   ` Segher Boessenkool
@ 2021-06-07 18:07                                                     ` Alexander Monakov
  2021-06-07 18:18                                                       ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Alexander Monakov @ 2021-06-07 18:07 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Linus Torvalds, Jakub Jelinek, Alan Stern, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On Mon, 7 Jun 2021, Segher Boessenkool wrote:

> > So the barrier which is a compiler barrier but not a machine barrier is
> > __atomic_signal_fence(model), but internally GCC will not treat it smarter
> > than an asm-with-memory-clobber today.
> 
> It will do nothing for relaxed ordering, and do blockage for everything
> else.  Can it do anything weaker than that?

It's a "blockage instruction" after transitioning to RTL, but before that,
on GIMPLE, the compiler sees it properly as a corresponding built-in, and
may optimize according to given memory model. And on RTL, well, if anyone
cares they'll need to invent RTL representation for it, I guess.

Alexander

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 16:02                                                     ` Will Deacon
@ 2021-06-07 18:08                                                       ` Paul E. McKenney
  0 siblings, 0 replies; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-07 18:08 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, Linus Torvalds, Alan Stern, Segher Boessenkool,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 05:02:53PM +0100, Will Deacon wrote:
> Hi Paul,
> 
> On Mon, Jun 07, 2021 at 08:25:33AM -0700, Paul E. McKenney wrote:
> > On Mon, Jun 07, 2021 at 12:52:35PM +0100, Will Deacon wrote:
> > > It's the conditional instructions that are more fun. For example, the CSEL
> > > instruction:
> > > 
> > > 	CSEL	X0, X1, X2, <cond>
> > > 
> > > basically says:
> > > 
> > > 	if (cond)
> > > 		X0 = X1;
> > > 	else
> > > 		X0 = X2;
> > > 
> > > these are just register-register operations, but the idea is that the CPU
> > > can predict that "branching event" inside the CSEL instruction and
> > > speculatively rename X0 while waiting for the condition to resolve.
> > > 
> > > So then you can add loads and stores to the mix along the lines of:
> > > 
> > > 	LDR	X0, [X1]		// X0 = *X1
> > > 	CMP	X0, X2
> > > 	CSEL	X3, X4, X5, EQ		// X3 = (X0 == X2) ? X4 : X5
> > > 	STR	X3, [X6]		// MUST BE ORDERED AFTER THE LOAD
> > > 	STR	X7, [X8]		// Can be reordered
> > > 
> > > (assuming X1, X6, X8 all point to different locations in memory)
> > > 
> > > So now we have a dependency from the load to the first store, but the
> > > interesting part is that the last store is _not_ ordered wrt either of the
> > > other two memory accesses, whereas it would be if we used a conditional
> > > branch instead of the CSEL. Make sense?
> > 
> > And if I remember correctly, this is why LKMM orders loads in the
> > "if" condition only with stores in the "then" and "else" clauses,
> > not with stores after the end of the "if" statement.  Or is there
> > some case that I am missing?
> 
> It's not clear to me that such a restriction prevents the compiler from
> using any of the arm64 conditional instructions in place of the conditional
> branch in such a way that you end up with an "independent" store in the
> assembly output constructed from two stores on the "then" and "else" paths
> which the compiler determined were the same.
> 
> > > Now, obviously the compiler is blissfully unaware that conditional
> > > data processing instructions can give rise to different dependencies than
> > > conditional branches, so the question really is how much do we need to
> > > care in the kernel?
> > > 
> > > My preference is to use load-acquire instead of control dependencies so
> > > that we don't have to worry about this, or any future relaxations to the
> > > CPU architecture, at all.
> > 
> > From what I can see, ARMv8 has DMB(LD) and DMB(ST).  Does it have
> > something like a DMB(LD,ST) that would act something like powerpc lwsync?
> > 
> > Or are you proposing rewriting the "if" conditions to upgrade
> > READ_ONCE() to smp_load_acquire()?  Or something else?
> > 
> > Just trying to find out exactly what you are proposing.  ;-)
> 
> Some options are:
> 
>  (1) Do nothing until something actually goes wrong (and hope we spot/debug it)
> 
>  (2) Have volatile_if force a conditional branch, assuming that it solves
>      the problem and doesn't hurt codegen (I still haven't convinced myself
>      for either case)
> 
>  (3) Upgrade READ_ONCE() to RCpc acquire, relaxed atomic RMWs to RCsc
>      acquire on arm64
> 
>  (4) Introduce e.g. READ_ONCE_CTRL(), atomic_add_return_ctrl() etc
>      specifically for control dependencies and upgrade only those for
>      arm64
> 
>  (5) Work to get toolchain support for dependency ordering and use that
> 
> I'm suggesting (3) or (4) because, honestly, it feels like we're being
> squeezed from both sides with both the compiler and the hardware prepared
> to break control dependencies.

I will toss out this as well:

  (6) Create a volatile_if() that does not support an "else" clause,
      thus covering all current use cases and avoiding some of the
      same-store issues.  Which in the end might or might not help,
      but perhaps worth looking into.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 18:07                                                     ` Alexander Monakov
@ 2021-06-07 18:18                                                       ` Segher Boessenkool
  0 siblings, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-07 18:18 UTC (permalink / raw)
  To: Alexander Monakov
  Cc: Linus Torvalds, Jakub Jelinek, Alan Stern, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On Mon, Jun 07, 2021 at 09:07:58PM +0300, Alexander Monakov wrote:
> On Mon, 7 Jun 2021, Segher Boessenkool wrote:
> 
> > > So the barrier which is a compiler barrier but not a machine barrier is
> > > __atomic_signal_fence(model), but internally GCC will not treat it smarter
> > > than an asm-with-memory-clobber today.
> > 
> > It will do nothing for relaxed ordering, and do blockage for everything
> > else.  Can it do anything weaker than that?
> 
> It's a "blockage instruction" after transitioning to RTL, but before that,
> on GIMPLE, the compiler sees it properly as a corresponding built-in, and
> may optimize according to given memory model. And on RTL, well, if anyone
> cares they'll need to invent RTL representation for it, I guess.

My question was if anything weaker is *valid* :-)  (And if so, why!)


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 15:27                                                         ` Paul E. McKenney
@ 2021-06-07 18:23                                                           ` Segher Boessenkool
  2021-06-07 19:51                                                             ` Alan Stern
  0 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-07 18:23 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Linus Torvalds, Alan Stern, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 08:27:12AM -0700, Paul E. McKenney wrote:
> > > > > The barrier() thing can work - all we need to do is to simply make it
> > > > > impossible for gcc to validly create anything but a conditional
> > > > > branch.

> > > What would you suggest as a way of instructing the compiler to emit the
> > > conditional branch that we are looking for?
> > 
> > You write it in the assembler code.
> > 
> > Yes, it sucks.  But it is the only way to get a branch if you really
> > want one.  Now, you do not really need one here anyway, so there may be
> > some other way to satisfy the actual requirements.
> 
> Hmmm...  What do you see Peter asking for that is different than what
> I am asking for?  ;-)

I don't know what you are referring to, sorry?

I know what you asked for: literally some way to tell the compiler to
emit a conditional branch.  If that is what you want, the only way to
make sure that is what you get is by writing exactly that in assembler.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 18:23                                                           ` Segher Boessenkool
@ 2021-06-07 19:51                                                             ` Alan Stern
  2021-06-07 20:16                                                               ` Paul E. McKenney
  0 siblings, 1 reply; 127+ messages in thread
From: Alan Stern @ 2021-06-07 19:51 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Paul E. McKenney, Linus Torvalds, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 01:23:35PM -0500, Segher Boessenkool wrote:
> On Mon, Jun 07, 2021 at 08:27:12AM -0700, Paul E. McKenney wrote:
> > > > > > The barrier() thing can work - all we need to do is to simply make it
> > > > > > impossible for gcc to validly create anything but a conditional
> > > > > > branch.
> 
> > > > What would you suggest as a way of instructing the compiler to emit the
> > > > conditional branch that we are looking for?
> > > 
> > > You write it in the assembler code.
> > > 
> > > Yes, it sucks.  But it is the only way to get a branch if you really
> > > want one.  Now, you do not really need one here anyway, so there may be
> > > some other way to satisfy the actual requirements.
> > 
> > Hmmm...  What do you see Peter asking for that is different than what
> > I am asking for?  ;-)
> 
> I don't know what you are referring to, sorry?
> 
> I know what you asked for: literally some way to tell the compiler to
> emit a conditional branch.  If that is what you want, the only way to
> make sure that is what you get is by writing exactly that in assembler.

That's not necessarily it.

People would be happy to have an easy way of telling the compiler that 
all writes in the "if" branch of an if statement must be ordered after 
any reads that the condition depends on.  Or maybe all writes in either 
the "if" branch or the "else" branch.  And maybe not all reads that the 
condition depends on, but just the reads appearing syntactically in the 
condition.  Or maybe even just the volatile reads appearing in the 
condition.  Nobody has said exactly.

The exact method used for doing this doesn't matter.  It could be 
accomplished by treating those reads as load-acquires.  Or it could be 
done by ensuring that the object code contains a dependency (control or 
data) from the reads to the writes.  Or it could be done by treating 
the writes as store-releases.  But we do want the execution-time 
penalty to be small.

In short, we want to guarantee somehow that the conditional writes are 
not re-ordered before the reads in the condition.  (But note that 
"conditional writes" includes identical writes occurring in both 
branches.)

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 19:51                                                             ` Alan Stern
@ 2021-06-07 20:16                                                               ` Paul E. McKenney
  2021-06-07 22:40                                                                 ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-07 20:16 UTC (permalink / raw)
  To: Alan Stern
  Cc: Segher Boessenkool, Linus Torvalds, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 03:51:44PM -0400, Alan Stern wrote:
> On Mon, Jun 07, 2021 at 01:23:35PM -0500, Segher Boessenkool wrote:
> > On Mon, Jun 07, 2021 at 08:27:12AM -0700, Paul E. McKenney wrote:
> > > > > > > The barrier() thing can work - all we need to do is to simply make it
> > > > > > > impossible for gcc to validly create anything but a conditional
> > > > > > > branch.
> > 
> > > > > What would you suggest as a way of instructing the compiler to emit the
> > > > > conditional branch that we are looking for?
> > > > 
> > > > You write it in the assembler code.
> > > > 
> > > > Yes, it sucks.  But it is the only way to get a branch if you really
> > > > want one.  Now, you do not really need one here anyway, so there may be
> > > > some other way to satisfy the actual requirements.
> > > 
> > > Hmmm...  What do you see Peter asking for that is different than what
> > > I am asking for?  ;-)
> > 
> > I don't know what you are referring to, sorry?
> > 
> > I know what you asked for: literally some way to tell the compiler to
> > emit a conditional branch.  If that is what you want, the only way to
> > make sure that is what you get is by writing exactly that in assembler.
> 
> That's not necessarily it.
> 
> People would be happy to have an easy way of telling the compiler that 
> all writes in the "if" branch of an if statement must be ordered after 
> any reads that the condition depends on.  Or maybe all writes in either 
> the "if" branch or the "else" branch.  And maybe not all reads that the 
> condition depends on, but just the reads appearing syntactically in the 
> condition.  Or maybe even just the volatile reads appearing in the 
> condition.  Nobody has said exactly.
> 
> The exact method used for doing this doesn't matter.  It could be 
> accomplished by treating those reads as load-acquires.  Or it could be 
> done by ensuring that the object code contains a dependency (control or 
> data) from the reads to the writes.  Or it could be done by treating 
> the writes as store-releases.  But we do want the execution-time 
> penalty to be small.
> 
> In short, we want to guarantee somehow that the conditional writes are 
> not re-ordered before the reads in the condition.  (But note that 
> "conditional writes" includes identical writes occurring in both 
> branches.)

What Alan said!  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 17:42                                                 ` Segher Boessenkool
@ 2021-06-07 20:31                                                   ` Linus Torvalds
  2021-06-07 22:54                                                     ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Linus Torvalds @ 2021-06-07 20:31 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Alexander Monakov, Jakub Jelinek, Alan Stern, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On Mon, Jun 7, 2021 at 10:45 AM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> On Sun, Jun 06, 2021 at 03:38:06PM -0700, Linus Torvalds wrote:
> >
> > Example: variable_test_bit(), which generates a "bt" instruction, does
> >
> >                      : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> >
> > and the memory clobber is obviously wrong: 'bt' only *reads* memory,
> > but since the whole reason we use it is that it's not just that word
> > at address 'addr', in order to make sure that any previous writes are
> > actually stable in memory, we use that "memory" clobber.
>
> You can split the "I" version from the "r" version, it does not need
> the memory clobber.  If you know the actual maximum bit offset used you
> don't need the clobber for "r" either.  Or you could even write
>   "m"(((unsigned long *)addr)[nr/32])
> That should work for all cases.

Note that the bit test thing really was just an example.

And some other cases don't actually have an address range at all,
because they affect arbitrary ranges, not - like that bit test - just
one particular range.

To pick a couple of examples of that, think of

 (a) write memory barrier. On some architectures it's an explicit
instruction, on x86 it's just a compiler barrier, since writes are
ordered on the CPU anyway, and we only need to make sure that the
compiler doesn't re-order writes around the barrier

Again, we currently use that same "barrier()" macro for that:

    #define __smp_wmb()     barrier()

but as mentioned, the barrier() thing has a "memory" clobber, and that
means that this write barrier - which is really really cheap on x86 -
also unnecessarily ends up causing pointless reloads from globals. It
obviously doesn't actually *change* memory, but it very much requires
that writes are not moved around it.

 (b) things like cache flush and/or invalidate instructions, eg

        asm volatile("wbinvd": : :"memory");

Again, this one doesn't actually *modify* memory, and honestly, this
one is not performance critical so the memory clobber is not actually
a problem, but I'm pointing it out as an example of the exact same
issue: the notion of an instruction that we don't want _writes_ to
move around, but reads can happily be moved and/or cached around it.

 (c) this whole "volatile_if()" situation: we want to make sure writes
can't move around it, but there's no reason to re-load memory values,
because it doesn't modify memory, and we only need to make sure that
any writes are delayed to after the conditional.

We long long ago (over 20 years by now) used to do things like this:

  struct __dummy { unsigned long a[100]; };
  #define ADDR (*(volatile struct __dummy *) addr)

      __asm__ __volatile__(
              "btl %2,%1\n\tsbbl %0,%0"
              :"=r" (oldbit)
              :"m" (ADDR),"ir" (nr));

for that test-bit thing. Note how the above doesn't need the memory
clobber, because for gcc that ADDR thing (access to a big struct) ends
up being a "BLKmode" read, and then gcc at least used to treat it as
an arbitrary read.

I forget just why we had to stop using that trick, I think it caused
some reload confusion for some gcc version at some point. Probably
exactly because inline asms had issues with some BLKmode thing. That
change happened before 2001, we didn't have nice changelogs with
detailed commit messages back then, so

> > Anybody have ideas or suggestions for something like that?
>
> Is it useful in general for the kernel to have separate "read" and
> "write" clobbers in asm expressions?  And for other applications?

See above. It's actually not all that uncommon that you have a "this
doesn't modify memory, but you can't move writes around it". It's
usually very much about cache handling or memory ordering operations,
and that bit test example was probably a bad example exactly because
it made it look like it's about some controlled range.

The "write memory barrier" is likely the best and simplest example,
but it's not the only one.

            Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 20:16                                                               ` Paul E. McKenney
@ 2021-06-07 22:40                                                                 ` Segher Boessenkool
  2021-06-07 23:26                                                                   ` Paul E. McKenney
  0 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-07 22:40 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Alan Stern, Linus Torvalds, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 01:16:33PM -0700, Paul E. McKenney wrote:
> On Mon, Jun 07, 2021 at 03:51:44PM -0400, Alan Stern wrote:
> > On Mon, Jun 07, 2021 at 01:23:35PM -0500, Segher Boessenkool wrote:
> > > On Mon, Jun 07, 2021 at 08:27:12AM -0700, Paul E. McKenney wrote:
> > > > > > > > The barrier() thing can work - all we need to do is to simply make it
> > > > > > > > impossible for gcc to validly create anything but a conditional
> > > > > > > > branch.
> > > 
> > > > > > What would you suggest as a way of instructing the compiler to emit the
> > > > > > conditional branch that we are looking for?
> > > > > 
> > > > > You write it in the assembler code.
> > > > > 
> > > > > Yes, it sucks.  But it is the only way to get a branch if you really
> > > > > want one.  Now, you do not really need one here anyway, so there may be
> > > > > some other way to satisfy the actual requirements.
> > > > 
> > > > Hmmm...  What do you see Peter asking for that is different than what
> > > > I am asking for?  ;-)
> > > 
> > > I don't know what you are referring to, sorry?
> > > 
> > > I know what you asked for: literally some way to tell the compiler to
> > > emit a conditional branch.  If that is what you want, the only way to
> > > make sure that is what you get is by writing exactly that in assembler.
> > 
> > That's not necessarily it.
> > 
> > People would be happy to have an easy way of telling the compiler that 
> > all writes in the "if" branch of an if statement must be ordered after 
> > any reads that the condition depends on.  Or maybe all writes in either 
> > the "if" branch or the "else" branch.  And maybe not all reads that the 
> > condition depends on, but just the reads appearing syntactically in the 
> > condition.  Or maybe even just the volatile reads appearing in the 
> > condition.  Nobody has said exactly.
> > 
> > The exact method used for doing this doesn't matter.  It could be 
> > accomplished by treating those reads as load-acquires.  Or it could be 
> > done by ensuring that the object code contains a dependency (control or 
> > data) from the reads to the writes.  Or it could be done by treating 
> > the writes as store-releases.  But we do want the execution-time 
> > penalty to be small.
> > 
> > In short, we want to guarantee somehow that the conditional writes are 
> > not re-ordered before the reads in the condition.  (But note that 
> > "conditional writes" includes identical writes occurring in both 
> > branches.)
> 
> What Alan said!  ;-)

Okay, I'll think about that.

But you wrote:

> > > > > > What would you suggest as a way of instructing the compiler to emit the
> > > > > > conditional branch that we are looking for?

... and that is what I answered.  I am sorry if you do not like being
taken literally, but that is how I read technical remarks: as literally
what they say.  If you say you want a branch, I take it you want a
branch!  :-)


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 20:31                                                   ` Linus Torvalds
@ 2021-06-07 22:54                                                     ` Segher Boessenkool
  0 siblings, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-07 22:54 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alexander Monakov, Jakub Jelinek, Alan Stern, Paul E. McKenney,
	Peter Zijlstra, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On Mon, Jun 07, 2021 at 01:31:24PM -0700, Linus Torvalds wrote:
> > Is it useful in general for the kernel to have separate "read" and
> > "write" clobbers in asm expressions?  And for other applications?
> 
> See above. It's actually not all that uncommon that you have a "this
> doesn't modify memory, but you can't move writes around it". It's
> usually very much about cache handling or memory ordering operations,
> and that bit test example was probably a bad example exactly because
> it made it look like it's about some controlled range.
> 
> The "write memory barrier" is likely the best and simplest example,
> but it's not the only one.

Thanks for the examples!  I opened <https://gcc.gnu.org/PR100953> so
that we can easily track it.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 22:40                                                                 ` Segher Boessenkool
@ 2021-06-07 23:26                                                                   ` Paul E. McKenney
  0 siblings, 0 replies; 127+ messages in thread
From: Paul E. McKenney @ 2021-06-07 23:26 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Alan Stern, Linus Torvalds, Peter Zijlstra, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Jun 07, 2021 at 05:40:37PM -0500, Segher Boessenkool wrote:
> On Mon, Jun 07, 2021 at 01:16:33PM -0700, Paul E. McKenney wrote:
> > On Mon, Jun 07, 2021 at 03:51:44PM -0400, Alan Stern wrote:
> > > On Mon, Jun 07, 2021 at 01:23:35PM -0500, Segher Boessenkool wrote:
> > > > On Mon, Jun 07, 2021 at 08:27:12AM -0700, Paul E. McKenney wrote:
> > > > > > > > > The barrier() thing can work - all we need to do is to simply make it
> > > > > > > > > impossible for gcc to validly create anything but a conditional
> > > > > > > > > branch.
> > > > 
> > > > > > > What would you suggest as a way of instructing the compiler to emit the
> > > > > > > conditional branch that we are looking for?
> > > > > > 
> > > > > > You write it in the assembler code.
> > > > > > 
> > > > > > Yes, it sucks.  But it is the only way to get a branch if you really
> > > > > > want one.  Now, you do not really need one here anyway, so there may be
> > > > > > some other way to satisfy the actual requirements.
> > > > > 
> > > > > Hmmm...  What do you see Peter asking for that is different than what
> > > > > I am asking for?  ;-)
> > > > 
> > > > I don't know what you are referring to, sorry?
> > > > 
> > > > I know what you asked for: literally some way to tell the compiler to
> > > > emit a conditional branch.  If that is what you want, the only way to
> > > > make sure that is what you get is by writing exactly that in assembler.
> > > 
> > > That's not necessarily it.
> > > 
> > > People would be happy to have an easy way of telling the compiler that 
> > > all writes in the "if" branch of an if statement must be ordered after 
> > > any reads that the condition depends on.  Or maybe all writes in either 
> > > the "if" branch or the "else" branch.  And maybe not all reads that the 
> > > condition depends on, but just the reads appearing syntactically in the 
> > > condition.  Or maybe even just the volatile reads appearing in the 
> > > condition.  Nobody has said exactly.
> > > 
> > > The exact method used for doing this doesn't matter.  It could be 
> > > accomplished by treating those reads as load-acquires.  Or it could be 
> > > done by ensuring that the object code contains a dependency (control or 
> > > data) from the reads to the writes.  Or it could be done by treating 
> > > the writes as store-releases.  But we do want the execution-time 
> > > penalty to be small.
> > > 
> > > In short, we want to guarantee somehow that the conditional writes are 
> > > not re-ordered before the reads in the condition.  (But note that 
> > > "conditional writes" includes identical writes occurring in both 
> > > branches.)
> > 
> > What Alan said!  ;-)
> 
> Okay, I'll think about that.
> 
> But you wrote:
> 
> > > > > > > What would you suggest as a way of instructing the compiler to emit the
> > > > > > > conditional branch that we are looking for?
> 
> ... and that is what I answered.  I am sorry if you do not like being
> taken literally, but that is how I read technical remarks: as literally
> what they say.  If you say you want a branch, I take it you want a
> branch!  :-)

When it is the cheapest means of providing the needed ordering, I really
do want a branch.  ;-)

And a branch would implement Alan's "control dependency" above.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-07 17:04                                                       ` Marco Elver
@ 2021-06-08  9:30                                                         ` Marco Elver
  2021-06-08 11:22                                                           ` Peter Zijlstra
  0 siblings, 1 reply; 127+ messages in thread
From: Marco Elver @ 2021-06-08  9:30 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Alexander Monakov, Linus Torvalds, Jakub Jelinek, Alan Stern,
	Segher Boessenkool, Peter Zijlstra, Will Deacon, Andrea Parri,
	Boqun Feng, Nick Piggin, David Howells, Jade Alglave,
	Luc Maranget, Akira Yokosawa, Linux Kernel Mailing List,
	linux-toolchains, linux-arch

On Mon, 7 Jun 2021 at 19:04, Marco Elver <elver@google.com> wrote:
[...]
> > > > So the barrier which is a compiler barrier but not a machine barrier is
> > > > __atomic_signal_fence(model), but internally GCC will not treat it smarter
> > > > than an asm-with-memory-clobber today.
> > >
> > > FWIW, Clang seems to be cleverer about it, and seems to do the optimal
> > > thing if I use a __atomic_signal_fence(__ATOMIC_RELEASE):
> > > https://godbolt.org/z/4v5xojqaY
> >
> > Indeed it does!  But I don't know of a guarantee for that helpful
> > behavior.
>
> Is there a way we can interpret the standard in such a way that it
> should be guaranteed?

I figured out why it works, and unfortunately it's suboptimal codegen.
In LLVM __atomic_signal_fence() turns into a real IR instruction,
which when lowered to asm just doesn't emit anything. But most
optimizations happen before in IR, and a "fence" cannot be removed.
Essentially imagine there's an invisible instruction, which explains
why it does what it does. Sadly we can't rely on that.

> The jumping-through-hoops variant would probably be asking for a
> __builtin primitive that allows constructing volatile_if() (if we can't
> bend existing primitives to do what we want).

I had a think about this. I think if we ask for some primitive
compiler support, "volatile if" as the target is suboptimal design,
because it somewhat limits composability (and of course makes it hard
to get as an extension). That primitive should probably also support
for/while/switch. But "volatile if" would also preclude us from
limiting the scope of the source of forced dependency, e.g. say we
have "if (A && B)", but we only care about A.

The cleaner approach would be an expression wrapper, e.g. "if
(ctrl_depends(A) && B) { ... }".

I imagine syntactically it'd be similar to __builtin_expect(..). I
think that's also easier to request an extension for, say
__builtin_ctrl_depends(expr). (If that is appealing, we can try and
propose it as std::ctrl_depends() along with std::dependent_ptr<>.)

Thoughts?

Thanks,
-- Marco

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-08  9:30                                                         ` Marco Elver
@ 2021-06-08 11:22                                                           ` Peter Zijlstra
  2021-06-08 15:28                                                             ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Peter Zijlstra @ 2021-06-08 11:22 UTC (permalink / raw)
  To: Marco Elver
  Cc: Paul E. McKenney, Alexander Monakov, Linus Torvalds,
	Jakub Jelinek, Alan Stern, Segher Boessenkool, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Tue, Jun 08, 2021 at 11:30:36AM +0200, Marco Elver wrote:

> The cleaner approach would be an expression wrapper, e.g. "if
> (ctrl_depends(A) && B) { ... }".
> 
> I imagine syntactically it'd be similar to __builtin_expect(..). I
> think that's also easier to request an extension for, say
> __builtin_ctrl_depends(expr). (If that is appealing, we can try and
> propose it as std::ctrl_depends() along with std::dependent_ptr<>.)
> 
> Thoughts?

Works for me; and note how it mirrors how we implemented volatile_if()
in the first place, by doing an expression wrapper.

__builtin_ctrl_depends(expr) would have to:

 - ensure !__builtin_const_p(expr)	(A)
 - imply an acquire compiler fence	(B)
 - ensure cond-branch is emitted	(C)

*OR*

 - ensure !__builtin_const_p(expr);		(A)
 - upgrade the load in @expr to load-acquire	(D)


A)

This all hinges on there actually being a LOAD, if expr is constant, we
have a malformed program and can emit a compiler error.

B)

We want to capture any store, not just volatile stores that come after.

The example here is a ring-buffer that loads the (head and) tail pointer
to check for space and then writes data elements. It would be
'cumbersome' to have all the data writes as volatile.

C)

We depend on the load-to-branch data dependency to guard the store to
provide the LOAD->STORE memory order.

D)

Upgrading LOAD to LOAD-ACQUIRE also provides LOAD->STORE ordering, but
it does require that the compiler has access to the LOAD in the first
place, which isn't a given seeing how much asm() we have around. Also
the architecture should have a cheap LOAD-ACQUIRE in the first place,
otherwise there's no point.

If this is done, the branch is allowed to be optimized away if the
compiler so wants.

Now, Will will also want to allow the load-acquire to be run-time
patched between the RCsc and RCpc variant depending on what ARMv8
extensions are available, which will be 'interesting' (although I can
think of ways to actually do that, one would be to keep a special
section that tracks the location of these __builtin_ctrl_depends()
generated load-acquire instructions).



^ permalink raw reply	[flat|nested] 127+ messages in thread

* RE: [RFC] LKMM: Add volatile_if()
  2021-06-04 10:12 [RFC] LKMM: Add volatile_if() Peter Zijlstra
                   ` (4 preceding siblings ...)
  2021-06-04 16:30 ` Linus Torvalds
@ 2021-06-08 12:48 ` David Laight
  2021-09-24 18:38 ` Mathieu Desnoyers
  6 siblings, 0 replies; 127+ messages in thread
From: David Laight @ 2021-06-08 12:48 UTC (permalink / raw)
  To: 'Peter Zijlstra',
	Linus Torvalds, will, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks
  Cc: linux-kernel, linux-toolchains, linux-arch

From: Peter Zijlstra
> Sent: 04 June 2021 11:12
> 
> Hi!
> 
> With optimizing compilers becoming more and more agressive and C so far
> refusing to acknowledge the concept of control-dependencies even while
> we keep growing the amount of reliance on them, things will eventually
> come apart.
> 
> There have been talks with toolchain people on how to resolve this; one
> suggestion was allowing the volatile qualifier on branch statements like
> 'if', but so far no actual compiler has made any progress on this.
> 
> Rather than waiting any longer, provide our own construct based on that
> suggestion. The idea is by Alan Stern and refined by Paul and myself.
> 
> Code generation is sub-optimal (for the weak architectures) since we're
> forced to convert the condition into another and use a fixed conditional
> branch instruction, but shouldn't be too bad.

What happens on mips-like architectures (I think includes riscv)
that have 'compare two registers and branch' instructions rather
than a more traditional 'flags register'?

The generated code is likely to be somewhat different.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-08 11:22                                                           ` Peter Zijlstra
@ 2021-06-08 15:28                                                             ` Segher Boessenkool
  2021-06-09 12:44                                                               ` Marco Elver
  0 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-08 15:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Marco Elver, Paul E. McKenney, Alexander Monakov, Linus Torvalds,
	Jakub Jelinek, Alan Stern, Will Deacon, Andrea Parri, Boqun Feng,
	Nick Piggin, David Howells, Jade Alglave, Luc Maranget,
	Akira Yokosawa, Linux Kernel Mailing List, linux-toolchains,
	linux-arch

On Tue, Jun 08, 2021 at 01:22:58PM +0200, Peter Zijlstra wrote:
> Works for me; and note how it mirrors how we implemented volatile_if()
> in the first place, by doing an expression wrapper.
> 
> __builtin_ctrl_depends(expr) would have to:
> 
>  - ensure !__builtin_const_p(expr)	(A)

Why would it be an error if __builtin_constant_p(expr)?  In many
programs the compiler can figure out some expression does never change.
Having a control dependency on something like that is not erroneous.

>  - imply an acquire compiler fence	(B)
>  - ensure cond-branch is emitted	(C)

(C) is almost impossible to do.  This should be reformulated to talk
about the effect of the generated code, instead.

> *OR*
> 
>  - ensure !__builtin_const_p(expr);		(A)
>  - upgrade the load in @expr to load-acquire	(D)

So that will only work if there is exactly one read from memory in expr?
That is problematic.

This needs some work.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-08 15:28                                                             ` Segher Boessenkool
@ 2021-06-09 12:44                                                               ` Marco Elver
  2021-06-09 15:31                                                                 ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Marco Elver @ 2021-06-09 12:44 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Peter Zijlstra, Paul E. McKenney, Alexander Monakov,
	Linus Torvalds, Jakub Jelinek, Alan Stern, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Tue, 8 Jun 2021 at 17:30, Segher Boessenkool
<segher@kernel.crashing.org> wrote:
> On Tue, Jun 08, 2021 at 01:22:58PM +0200, Peter Zijlstra wrote:
> > Works for me; and note how it mirrors how we implemented volatile_if()
> > in the first place, by doing an expression wrapper.
> >
> > __builtin_ctrl_depends(expr) would have to:
> >
> >  - ensure !__builtin_const_p(expr)    (A)
>
> Why would it be an error if __builtin_constant_p(expr)?  In many
> programs the compiler can figure out some expression does never change.
> Having a control dependency on something like that is not erroneous.
>
> >  - imply an acquire compiler fence    (B)
> >  - ensure cond-branch is emitted      (C)
>
> (C) is almost impossible to do.  This should be reformulated to talk
> about the effect of the generated code, instead.
>
> > *OR*
> >
> >  - ensure !__builtin_const_p(expr);           (A)
> >  - upgrade the load in @expr to load-acquire  (D)
>
> So that will only work if there is exactly one read from memory in expr?
> That is problematic.
>
> This needs some work.

There is a valid concern that something at the level of the memory
model requires very precise specification in terms of language
semantics and not generated code. Otherwise it seems difficult to get
compiler folks onboard. And coming up with such a specification may
take a while, especially if we have to venture in the realm of the
C11/C++11 memory model while still trying to somehow make it work for
the LKMM. That seems like a very tricky maze we may want to avoid.

An alternative design would be to use a statement attribute to only
enforce (C) ("__attribute__((mustcontrol))" ?). The rest can be
composed through existing primitives I think (the compiler barriers
need optimizing though), which should give us ctrl_depends().

At least for Clang, it should be doable: https://reviews.llvm.org/D103958

Thanks,
-- Marco

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-09 12:44                                                               ` Marco Elver
@ 2021-06-09 15:31                                                                 ` Segher Boessenkool
  2021-06-09 16:13                                                                   ` Marco Elver
  0 siblings, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-09 15:31 UTC (permalink / raw)
  To: Marco Elver
  Cc: Peter Zijlstra, Paul E. McKenney, Alexander Monakov,
	Linus Torvalds, Jakub Jelinek, Alan Stern, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Wed, Jun 09, 2021 at 02:44:08PM +0200, Marco Elver wrote:
> On Tue, 8 Jun 2021 at 17:30, Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> > This needs some work.
> 
> There is a valid concern that something at the level of the memory
> model requires very precise specification in terms of language
> semantics and not generated code.

Yup, exactly.  Especially because the meaning of generated code is hard
to describe, and even much more so if you do not limit yourself to a
single machine architecture.

> Otherwise it seems difficult to get
> compiler folks onboard.

It isn't just difficult to get us on board without it, we know it just
is impossible to do anything sensible without it.

> And coming up with such a specification may
> take a while,

Yes.

> especially if we have to venture in the realm of the
> C11/C++11 memory model while still trying to somehow make it work for
> the LKMM. That seems like a very tricky maze we may want to avoid.

Well, you only need to use the saner parts of the memory model (not the
full thing), and extensions are fine as well of course.

> An alternative design would be to use a statement attribute to only
> enforce (C) ("__attribute__((mustcontrol))" ?).

Statement attributes only exist for empty statements.  It is unclear how
(and if!) we could support it for general statements.

Some new builtin seems to fit the requirements better?  I haven't looked
too closely though.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-09 15:31                                                                 ` Segher Boessenkool
@ 2021-06-09 16:13                                                                   ` Marco Elver
  2021-06-09 17:14                                                                     ` Segher Boessenkool
  2021-06-09 18:25                                                                     ` Linus Torvalds
  0 siblings, 2 replies; 127+ messages in thread
From: Marco Elver @ 2021-06-09 16:13 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Peter Zijlstra, Paul E. McKenney, Alexander Monakov,
	Linus Torvalds, Jakub Jelinek, Alan Stern, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Wed, 9 Jun 2021 at 17:33, Segher Boessenkool
<segher@kernel.crashing.org> wrote:
[...]
> > An alternative design would be to use a statement attribute to only
> > enforce (C) ("__attribute__((mustcontrol))" ?).
>
> Statement attributes only exist for empty statements.  It is unclear how
> (and if!) we could support it for general statements.

Statement attributes can apply to anything -- Clang has had them apply
to non-empty statements for a while. I have
[[clang::mustcontrol]]/__attribute__((mustcontrol)) working, but of
course it's not final but helped me figure out how feasible it is
without running in circles here -- proof here:
https://reviews.llvm.org/D103958

If [1] is up-to-date, then yes, I can see that GCC currently only
supports empty statement attributes, but Clang isn't limited to empty
[2].
[1] https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html
[2] https://clang.llvm.org/docs/AttributeReference.html#statement-attributes

In fact, since C++20 [3], GCC will have to support statement
attributes on non-empty statements, so presumably the parsing logic
should already be there.
[3] https://en.cppreference.com/w/cpp/language/attributes/likely

> Some new builtin seems to fit the requirements better?  I haven't looked
> too closely though.

I had a longer discussion with someone offline about it, and the
problem with a builtin is similar to the "memory_order_consume
implementation problem" -- you might have an expression that uses the
builtin in some function without any control, and merely returns the
result of the expression as a result. If that function is in another
compilation unit, it then becomes difficult to propagate this
information without somehow making it part of the type system.
Therefore, by using a statement attribute on conditional control
statements, we do not even have this problem. It seems cleaner
syntactically than having a __builtin_() that is either approximate,
or gives an error if used in the wrong context.

Hence the suggestion for a very simple attribute, which also
side-steps this problem.

Thanks,
-- Marco

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-09 16:13                                                                   ` Marco Elver
@ 2021-06-09 17:14                                                                     ` Segher Boessenkool
  2021-06-09 17:31                                                                       ` Nick Desaulniers
  2021-06-09 18:25                                                                     ` Linus Torvalds
  1 sibling, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-09 17:14 UTC (permalink / raw)
  To: Marco Elver
  Cc: Peter Zijlstra, Paul E. McKenney, Alexander Monakov,
	Linus Torvalds, Jakub Jelinek, Alan Stern, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Wed, Jun 09, 2021 at 06:13:00PM +0200, Marco Elver wrote:
> On Wed, 9 Jun 2021 at 17:33, Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> [...]
> > > An alternative design would be to use a statement attribute to only
> > > enforce (C) ("__attribute__((mustcontrol))" ?).
> >
> > Statement attributes only exist for empty statements.  It is unclear how
> > (and if!) we could support it for general statements.
> 
> Statement attributes can apply to anything -- Clang has had them apply
> to non-empty statements for a while.

First off, it is not GCC's problem if LLVM decides to use a GCC
extension in some non-compatible way.

It might be possible to extend statement attributes to arbitrary
statement expressions, or some subset of statement expressions, but that
then has to be written down as well; it isn't obvious at all what this
would do.

> In fact, since C++20 [3], GCC will have to support statement
> attributes on non-empty statements, so presumably the parsing logic
> should already be there.
> [3] https://en.cppreference.com/w/cpp/language/attributes/likely

C++ attributes have different syntax *and semantics*.  With GCC
attributes it isn't clear what statement something belongs to (a
statement can contain a statement after all).

C++ requires all unknown attributes to be ignored without error, so can
this be useful at all here?

> > Some new builtin seems to fit the requirements better?  I haven't looked
> > too closely though.
> 
> I had a longer discussion with someone offline about it, and the
> problem with a builtin is similar to the "memory_order_consume
> implementation problem" -- you might have an expression that uses the
> builtin in some function without any control, and merely returns the
> result of the expression as a result. If that function is in another
> compilation unit, it then becomes difficult to propagate this
> information without somehow making it part of the type system.
> Therefore, by using a statement attribute on conditional control
> statements, we do not even have this problem. It seems cleaner
> syntactically than having a __builtin_() that is either approximate,
> or gives an error if used in the wrong context.

You would use the builtin to mark exactly where you are making the
control dependency.

(And what is a "conditional control statement"?  Yes of course I can
imagine things, but that is not good enough at all).

> Hence the suggestion for a very simple attribute, which also
> side-steps this problem.

And introduces many more problems :-(


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-09 17:14                                                                     ` Segher Boessenkool
@ 2021-06-09 17:31                                                                       ` Nick Desaulniers
  2021-06-09 20:24                                                                         ` Segher Boessenkool
  0 siblings, 1 reply; 127+ messages in thread
From: Nick Desaulniers @ 2021-06-09 17:31 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Marco Elver, Peter Zijlstra, Paul E. McKenney, Alexander Monakov,
	Linus Torvalds, Jakub Jelinek, Alan Stern, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Wed, Jun 9, 2021 at 10:20 AM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> On Wed, Jun 09, 2021 at 06:13:00PM +0200, Marco Elver wrote:
> > On Wed, 9 Jun 2021 at 17:33, Segher Boessenkool
> > <segher@kernel.crashing.org> wrote:
> > [...]
> > > > An alternative design would be to use a statement attribute to only
> > > > enforce (C) ("__attribute__((mustcontrol))" ?).
> > >
> > > Statement attributes only exist for empty statements.  It is unclear how
> > > (and if!) we could support it for general statements.
> >
> > Statement attributes can apply to anything -- Clang has had them apply
> > to non-empty statements for a while.
>
> First off, it is not GCC's problem if LLVM decides to use a GCC
> extension in some non-compatible way.

Reminds me of
https://lore.kernel.org/lkml/CAHk-=whu19Du_rZ-zBtGsXAB-Qo7NtoJjQjd-Sa9OB5u1Cq_Zw@mail.gmail.com/
-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-09 16:13                                                                   ` Marco Elver
  2021-06-09 17:14                                                                     ` Segher Boessenkool
@ 2021-06-09 18:25                                                                     ` Linus Torvalds
  1 sibling, 0 replies; 127+ messages in thread
From: Linus Torvalds @ 2021-06-09 18:25 UTC (permalink / raw)
  To: Marco Elver
  Cc: Segher Boessenkool, Peter Zijlstra, Paul E. McKenney,
	Alexander Monakov, Jakub Jelinek, Alan Stern, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Wed, Jun 9, 2021 at 9:13 AM Marco Elver <elver@google.com> wrote:
>
> I had a longer discussion with someone offline about it, and the
> problem with a builtin is similar to the "memory_order_consume
> implementation problem"

The "memory_order_consume" problem is *entirely* artificial, and due
to the C standards body incompetence.

Really. I was there. Only very peripherally, but I was involved enough
to know what the problem was.

And the problem wasn't the concept of 'consume'. The problem was
entirely and 100% the incorrect model that the C standards people used
to describe the problem.

The C standards people took a "syntax and type based" approach to the
whole thing, and it was an utter disaster. It's the wrong model
entirely, because it became very very hard to describe the issue in
terms of optimizations of expressions and ordering at a syntactic
level.

What the standard _should_ have done, is to describe it in the same
terms that "volatile" is described - make all memory accesses "visible
in the virtual machine", and then specify the memory ordering
requirements within that virtual machine.

We have successful examples of that from other languages. I'm sorry if
this hurts some C language lawyer's fragile ego, but Christ, Java did
it better. Java! A language that a lot of people love to piss on. But
it did memory ordering fundamentally better.

And it's not like it would even have been a new concept. The notion of
"volatile" has been there since the very beginning of C. Yes, yes, the
C++ people screwed it up mightily and confused themselves about what
an "access" means. But "volatile" is actually a lot better specified
than the memory ordering requirements were, and the specifications are
(a) simpler and (b) much *much* easier for a compiler person to
understand.

Plus with memory ordering described as an operation - rather than as a
type - even the C++ confusion of volatile would have gone away. So the
very thing that likely made people want to avoid the "visible access
in the virtual machine" model didn't even _exist_ in the first place.

So the language committee pointlessly said "volatile is bad, we need
to do something else", and came up with something that was an order of
magnitude worse than volatile, and that simply _couldn't_ possibly
sanely handle that "problem of consume".

But the problem was always purely about the model used to _describe_
the issue being bad, not the issue itself.

The "consume" memory ordering is actually very easy to describe in the
"as if" virtual machine memory model (well, as easy as _any_ memory
ordering is). If the C standards committee hadn't picked the wrong way
to describe things, the problem simply would not exist.

Really.

And I guarantee you that compiler writers would have had an easier time
with that "virtual memory model" approach too. No, memory ordering
sure as hell isn't simple to understand for *anybody*, but it got
about a million times worse by using the wrong abstraction layer to
try to "explain" it.

It really is fairly easy to explain what "acquire" is at a virtual
machine model level. About as easy as memory ordering gets. For a
compiler writer, it basically turns into "you have to do the actual
access using XYZ, and then you can't move later memory operations to
before it". End of story.

So you can actually describe these things in a fairly straightforward
manner if you actually do it at that virtual machine level, because
that's literally the language that the hardware itself works at.

And then you could easily have defined "consume" as being the same
thing as "acquire", except that you can drop the special XYZ access
(fence, ld.acq, whatever) and replace it with a plain load if there
are only data dependencies on the loaded value (assuming, of course,
that your target hardware then supports that ordering requirement:
alpha would _always_ need the barrier).

That could literally have been done as a peephole optimization, and a
compiler writer would never have had to even really worry about it.
Easy peasy. 99% of all compiler writers would not have to know
anything about the issue, there would be just one very special
optimization at the end that allows you to drop a barrier (or turn a
"ld.acq" into just an "ld") once you see all the uses of that loaded
value. A trivial peephole will handle 99% of all cases, and then for
the rest you just keep it as acquire.

So anybody who tells you that "consume is complicated" is wrong.
Consume is *not* complicated. They've just chosen the wrong model to
describe it.

Look, memory ordering pretty much _is_ the rocket science of CS, but
the C standards committee basically made it a ton harder by specifying
"we have to make the rocket out of duct tape and bricks, and only use
liquid hydrogen as a propellant".

               Linus

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-09 17:31                                                                       ` Nick Desaulniers
@ 2021-06-09 20:24                                                                         ` Segher Boessenkool
  0 siblings, 0 replies; 127+ messages in thread
From: Segher Boessenkool @ 2021-06-09 20:24 UTC (permalink / raw)
  To: Nick Desaulniers
  Cc: Marco Elver, Peter Zijlstra, Paul E. McKenney, Alexander Monakov,
	Linus Torvalds, Jakub Jelinek, Alan Stern, Will Deacon,
	Andrea Parri, Boqun Feng, Nick Piggin, David Howells,
	Jade Alglave, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Wed, Jun 09, 2021 at 10:31:13AM -0700, Nick Desaulniers wrote:
> On Wed, Jun 9, 2021 at 10:20 AM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> >
> > On Wed, Jun 09, 2021 at 06:13:00PM +0200, Marco Elver wrote:
> > > On Wed, 9 Jun 2021 at 17:33, Segher Boessenkool
> > > <segher@kernel.crashing.org> wrote:
> > > [...]
> > > > > An alternative design would be to use a statement attribute to only
> > > > > enforce (C) ("__attribute__((mustcontrol))" ?).
> > > >
> > > > Statement attributes only exist for empty statements.  It is unclear how
> > > > (and if!) we could support it for general statements.
> > >
> > > Statement attributes can apply to anything -- Clang has had them apply
> > > to non-empty statements for a while.
> >
> > First off, it is not GCC's problem if LLVM decides to use a GCC
> > extension in some non-compatible way.
> 
> Reminds me of
> https://lore.kernel.org/lkml/CAHk-=whu19Du_rZ-zBtGsXAB-Qo7NtoJjQjd-Sa9OB5u1Cq_Zw@mail.gmail.com/

And my reply to that
https://lore.kernel.org/lkml/20200910154423.GK28786@gate.crashing.org/


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
       [not found]                                                   ` <20210730172020.GA32396@knuckles.cs.ucl.ac.uk>
@ 2021-07-30 20:35                                                     ` Alan Stern
  2021-08-02 21:18                                                     ` Alan Stern
                                                                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-07-30 20:35 UTC (permalink / raw)
  To: Jade Alglave
  Cc: Will Deacon, Peter Zijlstra, Linus Torvalds, Segher Boessenkool,
	Paul E. McKenney, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jul 30, 2021 at 06:20:22PM +0100, Jade Alglave wrote:
> Dear all,

> Sincere apologies in taking so long to reply. I attach a technical
> report which describes the status of dependencies in the Arm memory
> model. 
> 
> I have also released the corresponding cat files and a collection of
> interesting litmus tests over here:
> https://github.com/herd/herdtools7/commit/f80bd7c2e49d7d3adad22afc62ff4768d65bf830
> 
> I hope this material can help inform this conversation and I would love
> to hear your thoughts.

Jade:

Here are a few very preliminary reactions (I haven't finished reading the 
entire paper yet).

P.2: Typo: "the register X1 contains the address x" should be "the 
register X1 contains the address of x".

P.4 and later: Several complicated instructions (including CSEL, CAS, and 
SWP) are mentioned but not explained; the text assumes that the reader 
already understands what these instructions do.  A brief description of 
their effects would help readers like me who aren't very familiar with the 
ARM instruction set.

P.4: The text describing Intrinsic dependencies in CSEL instructions says 
that if cond is true then there is an Intrinsic control dependency from 
the read of PSTATE.NZCV to the read of Xm.  Why is this so?  Can't the CPU 
read Xm unconditionally before it knows whether the value will be used?

P.17: The definition of "Dependency through registers" uses the acronym 
"PE", but the acronym isn't defined anywhere.

P.14: In the description of Figure 18, I wasn't previously aware -- 
although perhaps I should have been -- that ARM could speculatively place 
a Store in a local store buffer, allowing it to be forwarded to a po-later 
Read.  Why doesn't the same mechanism apply to Figure 20, allowing the 
Store in D to be speculatively placed in a local store buffer and 
forwarded to E?  Is this because conditional branches are predicted but 
loads aren't?  If so, that is a significant difference.

More to come...

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
       [not found]                                                   ` <20210730172020.GA32396@knuckles.cs.ucl.ac.uk>
  2021-07-30 20:35                                                     ` Alan Stern
@ 2021-08-02 21:18                                                     ` Alan Stern
  2021-08-02 23:31                                                     ` Paul E. McKenney
                                                                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-08-02 21:18 UTC (permalink / raw)
  To: Jade Alglave
  Cc: Will Deacon, Peter Zijlstra, Linus Torvalds, Segher Boessenkool,
	Paul E. McKenney, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jul 30, 2021 at 06:20:22PM +0100, Jade Alglave wrote:
> I hope this material can help inform this conversation and I would love
> to hear your thoughts.

More comments...

I find the herd-style diagrams (Figures 2, 3, 5, 7, 9, and so on) almost 
impossible to decipher.  While they might be useful to people running 
herd, they have several drawbacks for readers of this report:

	They include multiple instructions, not just the one for which
	you want to illustrate the internal dependencies.  How about
	getting rid of the extraneous instructions?

	Each box contains three lines of information, of which only the
	first is really significant, and it is hard to figure out.  How 
	about getting rid of the second and third lines, and replacing
	things like "e: R0:X1q=x" in the first line with something more
	along the lines of "RegR X0" or "tmp1 = RegR X0"?

	The "iico" in the dependency arrows doesn't add anything.

Section 1.1 mentions order, data, and control Intrinsic dependencies but 
doesn't give so much as a hint as to what they are.  Instead the reader 
is forced to invent his own generalizations by reading through several 
complex special-case examples.  There should be a short description of 
what each Intrinsic dependency represents.  For instance, the first 
sentence in 1.3 would be a great way to explain data dependencies.  (And 
is it not true that control dependencies are mainly needed for 
situations where an instruction's inputs and outputs may include the 
same register or memory address, when it is necessary to enforce that 
the input value is read before the output value is written?)

Some of the dependencies listed for CAS are surprising, but there is no 
explanation.  Why is C2 a control dependency rather than a data 
dependency?  After all, the value read from [Xn] is stored in Xs in both 
cases.  In fact, Df1 supersedes C2 in the failure case, doesn't it?  And 
why are C1 and Ds1 a control and data dependency respectively rather 
than both order dependencies?

Section 2.1: Although the Store F is independent of the conditional 
branch and so might be made visible to other observers early, isn't it 
true that neither ARMv8 nor any other type of processor will do this?

General question: How does this discussion of conditional branches 
relate overall to the way computed branches are handled?

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
       [not found]                                                   ` <20210730172020.GA32396@knuckles.cs.ucl.ac.uk>
  2021-07-30 20:35                                                     ` Alan Stern
  2021-08-02 21:18                                                     ` Alan Stern
@ 2021-08-02 23:31                                                     ` Paul E. McKenney
  2021-08-04 20:09                                                       ` Alan Stern
  2021-08-05 19:47                                                     ` Alan Stern
  2021-08-07  0:51                                                     ` Alan Stern
  4 siblings, 1 reply; 127+ messages in thread
From: Paul E. McKenney @ 2021-08-02 23:31 UTC (permalink / raw)
  To: Jade Alglave
  Cc: Will Deacon, Peter Zijlstra, Linus Torvalds, Alan Stern,
	Segher Boessenkool, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jul 30, 2021 at 06:20:22PM +0100, Jade Alglave wrote:

[ . . . ]

> Sincere apologies in taking so long to reply. I attach a technical
> report which describes the status of dependencies in the Arm memory
> model. 
> 
> I have also released the corresponding cat files and a collection of
> interesting litmus tests over here:
> https://github.com/herd/herdtools7/commit/f80bd7c2e49d7d3adad22afc62ff4768d65bf830
> 
> I hope this material can help inform this conversation and I would love
> to hear your thoughts.

It is very good to see this!  A few random questions and comments below
based on a couple of passes through this document.

							Thanx, Paul

------------------------------------------------------------------------

o	Figure 2: The iico_data arcs are essentially invisible after
	printing, as in the text on the following page is darker
	than these arcs.  I had similar difficulties with many
	of the other diagrams.

o	Figure 2: What does the "q" signify in the upper line of the
	uppermost event-c box ("R0:X1q=x")?  I get that we are reading
	register X1 and getting back the address of variable "x".  I am
	assuming that the "R0" means that process 0 is doing a read,
	but I cannot be sure.

	I am assuming that the "proc: P0 poi:0" means that this is
	the first instruction of process P0.  If this is incorrect,
	please let me know.

o	Figure 6 initializes X0, X1, X2, and X3, while Figure 4
	initializes only X0 and X3.  Is this difference meaningful?
	(My guess is that you have default-zero initialization so
	that it does not matter, but I figured that I should ask.)

o	Figure 6: The iico_ctrl arcs are easier to see on printed copy
	than the iico_data arcs, but it would be nice if they were a
	bit darker.  The rf-reg arcs are plainly visible.

o	Figure 6: Why is there no po arc to the CSEL instruction?

o	Section 1.3, "Swap instructions" paragraph.  Please supply
	a litmus-test figure to go along with Figure 12.

o	Figures 10 and 11: Having these on the same page was extremely
	helpful, thank you!

o	Figure 11: What does the "*" signify in the first line event "a:"
	("a: Rx*q=0")?	Why is there no "*" in the corresponding event
	in Figure 9?

o	Figure 11: The ca arcs are nicely visible, but I am coming up
	empty on hypotheses for their meaning.  Or is ca the new co?

o	Figure 11: Why two po arcs into the CAS instruction?  Due
	to independent register reads that might proceed concurrently?
	If so, why no po arc to event g?

o	Figure 11: The connections between events a, f, and h lead me to
	believe that the hardware is permitted to rewrite register X3
	with the value previously read from X3 as opposed to the value
	read from [X1].  Or maybe omit the write entirely.

	I don't see anything wrong with taking this approach, but I
	figured I should check.

o	Section 2 I leave in Alan's capable hands.

o	Section 3.1, "Dependency through registers": A "PE" is a
	processing element or some such?

o	Section 3.1, "Dependency through registers", first bullet:
	The exclusion of Store Exclusive is to avoid ordering via
	the 0/1 status store, correct?

o	Section 3.1, "Address Dependency", second bullet, second
	sub-bullet: OK, I will bite.  The dependency from the Branching
	Effect is due to a load from the program counter or some such?
	Or are there some special-purpose ARMv8 branch instructions that
	I should look up.

o	Figure 27, "MOV W5, W0": It took me a bit to figure out that
	this instruction exists strictly for the benefit of the
	"exists" clause.  Or am I missing something subtle?

o	Section 4.1, "Interestingly, this notion of ``pick dependency...":
	I suggest using something like "require" instead of "proscribe",
	if that is what is meant.  The hamming distance between the
	antonyms "proscribe" and "prescribe" is quite small, which can
	result in errors both when writing and when reading.  :-(

o	Figure 30: The discarding of register X3 is intentional, correct?
	If so, it is indeed hard to imagine wanting ordering from this
	code sequence.  Though I might once again be suffering from a
	failure of imagination...

o	Figure 32: The reason for this litmus test being allowed is that
	the ordering through CSEL is sort of like a control dependency,
	and control dependencies to loads do not force ordering, correct?
	Or did I miss a turn in there somewhere?

o	Section 4.2, "Pick Basic dependency": Should the second and
	third bullets be indented under the first bullet?

It will take at least another pass to get my head around pick
dependencies, so I will stop here for the moment.

Again, good stuff, and great to see the additional definition!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-08-02 23:31                                                     ` Paul E. McKenney
@ 2021-08-04 20:09                                                       ` Alan Stern
  0 siblings, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-08-04 20:09 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Jade Alglave, Will Deacon, Peter Zijlstra, Linus Torvalds,
	Segher Boessenkool, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Mon, Aug 02, 2021 at 04:31:56PM -0700, Paul E. McKenney wrote:
> o	Section 2 I leave in Alan's capable hands.

Here goes, although I'm not sure how important this is, given that 
section 2 is presented as merely a "straw man" argument for something 
that ARM decided to abandon.

While reading this section (and the paper in general), it was annoying 
that the terms "down-one-leg" and "down-two-legs" are never explained or 
motivated.  Even after reading section 2, I'm still not sure what they 
are really intended to mean.  My impression is that "down-one-leg" is an 
attempt to express the idea that control dependencies apply to accesses 
occurring along one leg of a conditional but not to accesses occurring 
after the two legs have rejoined.  Is that right?

P.17: "The drawback of this approach is that it would require order for 
the “independent” case" -- this doesn't seem like a drawback to me.  
Particularly since no existing architecture attempts to avoid ordering 
the independent case.

Def. of "points of divergence": This is not very precise.  What exactly 
is a "branching decision"?  Do the two paths of a CAS or CSEL 
instruction count?  What if the decision doesn't involve whether or not 
to take the branch but rather where to branch to (as in a computed 
branch or even just a call through a function pointer)?

Def. of "address dependency": How could there be a Dependency through 
registers from D4 to R2?  It's not at all easy to untangle the 
definitions to see what this might mean.  What would be an example?  At 
any rate, the case where RW2 is a Memory read doesn't seem right.  It 
says that:

	R0 = Load
	R1 = Load([R0])

is an address dependency but

	R0 = Load
	// Branching decision that depends on the value of R0 and
	// carries a Dependency through registers to a new value for
	// R0 (whatever that may mean) which is always equal to the
	// existing value 
	R1 = Load([R0])

isn't.  Is this really what you mean?  If so, what is the motivation for 
this definition?  How does it relate to the discussion earlier in this 
section?

Def. of antecedent: What is a Local read successor or an immediate Local 
write successor?  These terms aren't defined, and without knowing what 
they mean it is impossible to understand what an antecedent is.

Def. of pre-equivalent effects and related terms: I don't understand how 
you can have effects on different branches of a Point of divergence.  By 
definition, only one of the branches is executed -- how can there be any 
effects on the speculated branch?

With all these concepts being so unclear, I was completely unable to 
figure out what the definition of control dependency means.  The text 
doesn't help at all, because it doesn't contain any examples or 
explanations to make these things more comprehensible.

The formalization in cat may have some historical interest, but it 
conveys no information to a reader who isn't prepared to spend hours or 
days trying to decipher it.  Honestly, do you know _anybody_ who could 
tell what Figures 22 - 25 mean and what they do just from reading them?  
You pretty much have to be an expert in cat just to tell what some of 
the recursive functions in Figs. 23 and 24 do.

(As just one very minor example, the "bisimulation" function in the 
fourth-to-last line of Figure 25 isn't mentioned anywhere else.  How are 
people supposed to understand it?)

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
       [not found]                                                   ` <20210730172020.GA32396@knuckles.cs.ucl.ac.uk>
                                                                       ` (2 preceding siblings ...)
  2021-08-02 23:31                                                     ` Paul E. McKenney
@ 2021-08-05 19:47                                                     ` Alan Stern
  2021-08-07  0:51                                                     ` Alan Stern
  4 siblings, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-08-05 19:47 UTC (permalink / raw)
  To: Jade Alglave
  Cc: Will Deacon, Peter Zijlstra, Linus Torvalds, Segher Boessenkool,
	Paul E. McKenney, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jul 30, 2021 at 06:20:22PM +0100, Jade Alglave wrote:
> I hope this material can help inform this conversation and I would love
> to hear your thoughts.

Thoughts on section 3...

The paragraph on Branching Effect is pretty meager.  Exactly what 
effects does a conditional branch instruction generate?  I imagine 
there's a register read of the flags register, to see whether the branch 
should be taken.  And evidently there's a branch effect, which may take 
the register read as input but doesn't have any obvious outputs.  
Anything else? -- there don't seem to be any register writes.

Why are Store Exclusive instructions explicitly disallowed in the 
definition of dependency through registers?  Is this because ARM CPUs 
don't forward values written by such instructions to po-later reads?  If 
so, why don't they?  (Paul asked a similar question.)

Since the recursive definition of dependency through registers starts 
with either a register write or intrinsic order of events in an 
instruction, it appears that there cannot be any dependency through 
registers starting from a branching effect.  So why does the definition 
of address dependency talk about a dependency through registers from B4 
(a branching effect) to R2?  (Paul also asked about this -- does writing 
to the program counter get treated as a register write?  But almost no 
instructions explicitly read from the program counter.)

What is the whole point of the special handling of branching effects in 
the definition of address dependencies?  It isn't obvious and the text 
doesn't explain it.

Figure 26 includes a lot of terms that seem like herd primitives.  They 
must be relatively new, because they aren't mentioned in the 
documentation that I've got.  (I'm referring to such terms as iico_data, 
iico_ctrl, intrinsic, same-instance, DATA, and NDATA.)  Are they 
explained anywhere?

Way back in Section 1, various Intrinsic dependency relations were 
introduced.  The reason for treating Intrinsic control dependencies 
specially seems to be that the CPU can speculate past such dependencies 
(though it would be nice if the text made this point explicitly).  But 
why do you differentiate between data and order Intrinsic dependencies?  
Is this also related to some specific behavior of ARM CPUs?

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
       [not found]                                                   ` <20210730172020.GA32396@knuckles.cs.ucl.ac.uk>
                                                                       ` (3 preceding siblings ...)
  2021-08-05 19:47                                                     ` Alan Stern
@ 2021-08-07  0:51                                                     ` Alan Stern
  4 siblings, 0 replies; 127+ messages in thread
From: Alan Stern @ 2021-08-07  0:51 UTC (permalink / raw)
  To: Jade Alglave
  Cc: Will Deacon, Peter Zijlstra, Linus Torvalds, Segher Boessenkool,
	Paul E. McKenney, Andrea Parri, Boqun Feng, Nick Piggin,
	David Howells, Luc Maranget, Akira Yokosawa,
	Linux Kernel Mailing List, linux-toolchains, linux-arch

On Fri, Jul 30, 2021 at 06:20:22PM +0100, Jade Alglave wrote:
> I hope this material can help inform this conversation and I would love
> to hear your thoughts.

Thoughts on section 4...

The definition of Pick Basic dependency is phrased incorrectly.  The 
"all of the following apply" in the first paragraph refers only to the first 
bullet point, which in turn refers to the following two bullet points.  
The "all of the following apply" phrase should be removed and the first 
bullet point should be merged into the main text.

The definition of Pick dependency is redundant, because each Pick 
Address and Pick Data dependency is itself already a Pick Basic 
dependency.  The same is true of the cat formalization.

In the cat code, the definition of Reg looks wrong.  It is:

	let Reg=~M | ~BR

Since (I presume) no event falls into both the M and BR classes, this 
definition includes all events.  It probably should be:

	let Reg=~(M | BR)

or

	let Reg=~M & ~BR

It's now clear that my original understanding of the underlying basis of 
Intrinsic control dependencies was wrong.  They aren't separated out 
because CPUs can speculate through conditional branches; rather they are 
separated out because they are the things which give rise to Pick 
dependencies.  It would have been nice if the text had explained this at 
the start instead of leaving it up to me to figure out for myself.

Alan

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-06-04 10:12 [RFC] LKMM: Add volatile_if() Peter Zijlstra
                   ` (5 preceding siblings ...)
  2021-06-08 12:48 ` David Laight
@ 2021-09-24 18:38 ` Mathieu Desnoyers
  2021-09-24 19:52   ` Alan Stern
  2021-09-24 19:55   ` Segher Boessenkool
  6 siblings, 2 replies; 127+ messages in thread
From: Mathieu Desnoyers @ 2021-09-24 18:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, will, paulmck, stern, parri.andrea, boqun.feng,
	npiggin, dhowells, j.alglave, luc.maranget, akiyks, linux-kernel,
	linux-toolchains, linux-arch

Hi,

Following the LPC2021 BoF about control dependency, I re-read the kernel
documentation about control dependency, and ended up thinking that what
we have now is utterly fragile.

Considering that the goal here is to prevent the compiler from being able to
optimize a conditional branch into something which lacks the control
dependency, while letting the compiler choose the best conditional
branch in each case, how about the following approach ?

#define ctrl_dep_eval(x)        ({ BUILD_BUG_ON(__builtin_constant_p((_Bool) x)); x; })
#define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm volatile goto ("" : : : "cc", "memory" : l_dummy); (x); })
#define ctrl_dep_if(x)          if ((ctrl_dep_eval(x) && ctrl_dep_emit_loop(1)) || ctrl_dep_emit_loop(0))

The idea is to forbid the compiler from considering the two branches as
identical by adding a dummy loop in each branch with an empty asm goto.
Considering that the compiler should not assume anything about the
contents of the asm goto (it's been designed so the generated assembly
can be modified at runtime), then the compiler can hardly know whether
each branch will trigger an infinite loop or not, which should prevent
unwanted optimisations.

With this approach, the following code now keeps the control dependency:

	z = READ_ONCE(var1);
        ctrl_dep_if (z)
                WRITE_ONCE(var2, 5);
        else
                WRITE_ONCE(var2, 5);

And the ctrl_dep_eval() checking the constant triggers a build error
for:

        y = READ_ONCE(var1);
        ctrl_dep_if (y % 1)
                WRITE_ONCE(var2, 5);
        else
                WRITE_ONCE(var2, 6);

Which is good to have to ensure the compiler doesn't end up removing the
conditional branch because the resulting evaluation ends up evaluating to a
constant.

Thoughts ?

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-09-24 18:38 ` Mathieu Desnoyers
@ 2021-09-24 19:52   ` Alan Stern
  2021-09-24 20:22     ` Mathieu Desnoyers
  2021-09-24 19:55   ` Segher Boessenkool
  1 sibling, 1 reply; 127+ messages in thread
From: Alan Stern @ 2021-09-24 19:52 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, Linus Torvalds, will, paulmck, parri.andrea,
	boqun.feng, npiggin, dhowells, j.alglave, luc.maranget, akiyks,
	linux-kernel, linux-toolchains, linux-arch

On Fri, Sep 24, 2021 at 02:38:58PM -0400, Mathieu Desnoyers wrote:
> Hi,
> 
> Following the LPC2021 BoF about control dependency, I re-read the kernel
> documentation about control dependency, and ended up thinking that what
> we have now is utterly fragile.
> 
> Considering that the goal here is to prevent the compiler from being able to
> optimize a conditional branch into something which lacks the control
> dependency, while letting the compiler choose the best conditional
> branch in each case, how about the following approach ?
> 
> #define ctrl_dep_eval(x)        ({ BUILD_BUG_ON(__builtin_constant_p((_Bool) x)); x; })
> #define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm volatile goto ("" : : : "cc", "memory" : l_dummy); (x); })
> #define ctrl_dep_if(x)          if ((ctrl_dep_eval(x) && ctrl_dep_emit_loop(1)) || ctrl_dep_emit_loop(0))
> 
> The idea is to forbid the compiler from considering the two branches as
> identical by adding a dummy loop in each branch with an empty asm goto.
> Considering that the compiler should not assume anything about the
> contents of the asm goto (it's been designed so the generated assembly
> can be modified at runtime), then the compiler can hardly know whether
> each branch will trigger an infinite loop or not, which should prevent
> unwanted optimisations.
> 
> With this approach, the following code now keeps the control dependency:
> 
> 	z = READ_ONCE(var1);
>         ctrl_dep_if (z)
>                 WRITE_ONCE(var2, 5);
>         else
>                 WRITE_ONCE(var2, 5);
> 
> And the ctrl_dep_eval() checking the constant triggers a build error
> for:
> 
>         y = READ_ONCE(var1);
>         ctrl_dep_if (y % 1)
>                 WRITE_ONCE(var2, 5);
>         else
>                 WRITE_ONCE(var2, 6);
> 
> Which is good to have to ensure the compiler don't end up removing the
> conditional branch because the resulting evaluation ends up evaluating a
> constant.
> 
> Thoughts ?

As I remember the earlier discussion, Linus felt that the kernel doesn't 
really need any sort of explicit control dependency (although we called 
it "volatile if").  In many cases there is an actual semantic 
dependency, so it doesn't matter what the compiler does -- the hardware 
will enforce the actual dependency.  In other cases, we can work around 
the issue by using acquire loads or release stores.

In fact, Linus's biggest wish was to have a weak form of compiler 
barrier, one which would block the compiler from reordering accesses 
across the barrier but wouldn't invalidate the compiler's knowledge 
about the values of earlier reads (which barrier() would do).

Alan Stern

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-09-24 18:38 ` Mathieu Desnoyers
  2021-09-24 19:52   ` Alan Stern
@ 2021-09-24 19:55   ` Segher Boessenkool
  2021-09-24 20:39     ` Mathieu Desnoyers
  1 sibling, 1 reply; 127+ messages in thread
From: Segher Boessenkool @ 2021-09-24 19:55 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, Linus Torvalds, will, paulmck, stern,
	parri.andrea, boqun.feng, npiggin, dhowells, j.alglave,
	luc.maranget, akiyks, linux-kernel, linux-toolchains, linux-arch

Hi!

On Fri, Sep 24, 2021 at 02:38:58PM -0400, Mathieu Desnoyers wrote:
> Following the LPC2021 BoF about control dependency, I re-read the kernel
> documentation about control dependency, and ended up thinking that what
> we have now is utterly fragile.
> 
> Considering that the goal here is to prevent the compiler from being able to
> optimize a conditional branch into something which lacks the control
> dependency, while letting the compiler choose the best conditional
> branch in each case, how about the following approach ?
> 
> #define ctrl_dep_eval(x)        ({ BUILD_BUG_ON(__builtin_constant_p((_Bool) x)); x; })
> #define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm volatile goto ("" : : : "cc", "memory" : l_dummy); (x); })
> #define ctrl_dep_if(x)          if ((ctrl_dep_eval(x) && ctrl_dep_emit_loop(1)) || ctrl_dep_emit_loop(0))

[The "cc" clobber only pessimises things: the asm doesn't actually
clobber the default condition code register (which is what "cc" means),
and you can have conditional branches using other condition code
registers, or on other registers even (general purpose registers are
common).]

> The idea is to forbid the compiler from considering the two branches as
> identical by adding a dummy loop in each branch with an empty asm goto.
> Considering that the compiler should not assume anything about the
> contents of the asm goto (it's been designed so the generated assembly
> can be modified at runtime), then the compiler can hardly know whether
> each branch will trigger an infinite loop or not, which should prevent
> unwanted optimisations.

The compiler looks if the code is identical, nothing more, nothing less.
There are no extra guarantees.  In principle the compiler could see both
copies are empty asms looping to self, and so consider them equal.


Segher

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-09-24 19:52   ` Alan Stern
@ 2021-09-24 20:22     ` Mathieu Desnoyers
  0 siblings, 0 replies; 127+ messages in thread
From: Mathieu Desnoyers @ 2021-09-24 20:22 UTC (permalink / raw)
  To: Alan Stern
  Cc: Peter Zijlstra, Linus Torvalds, Will Deacon, paulmck,
	Andrea Parri, Boqun Feng, Nicholas Piggin, David Howells,
	j alglave, luc maranget, akiyks, linux-kernel, linux-toolchains,
	linux-arch

----- On Sep 24, 2021, at 3:52 PM, Alan Stern stern@rowland.harvard.edu wrote:

> On Fri, Sep 24, 2021 at 02:38:58PM -0400, Mathieu Desnoyers wrote:
>> Hi,
>> 
>> Following the LPC2021 BoF about control dependency, I re-read the kernel
>> documentation about control dependency, and ended up thinking that what
>> we have now is utterly fragile.
>> 
>> Considering that the goal here is to prevent the compiler from being able to
>> optimize a conditional branch into something which lacks the control
>> dependency, while letting the compiler choose the best conditional
>> branch in each case, how about the following approach ?
>> 
>> #define ctrl_dep_eval(x)        ({ BUILD_BUG_ON(__builtin_constant_p((_Bool)
>> x)); x; })
>> #define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm volatile goto
>> ("" : : : "cc", "memory" : l_dummy); (x); })
>> #define ctrl_dep_if(x)          if ((ctrl_dep_eval(x) && ctrl_dep_emit_loop(1))
>> || ctrl_dep_emit_loop(0))
>> 
>> The idea is to forbid the compiler from considering the two branches as
>> identical by adding a dummy loop in each branch with an empty asm goto.
>> Considering that the compiler should not assume anything about the
>> contents of the asm goto (it's been designed so the generated assembly
>> can be modified at runtime), then the compiler can hardly know whether
>> each branch will trigger an infinite loop or not, which should prevent
>> unwanted optimisations.
>> 
>> With this approach, the following code now keeps the control dependency:
>> 
>> 	z = READ_ONCE(var1);
>>         ctrl_dep_if (z)
>>                 WRITE_ONCE(var2, 5);
>>         else
>>                 WRITE_ONCE(var2, 5);
>> 
>> And the ctrl_dep_eval() checking the constant triggers a build error
>> for:
>> 
>>         y = READ_ONCE(var1);
>>         ctrl_dep_if (y % 1)
>>                 WRITE_ONCE(var2, 5);
>>         else
>>                 WRITE_ONCE(var2, 6);
>> 
>> Which is good to have to ensure the compiler don't end up removing the
>> conditional branch because the resulting evaluation ends up evaluating a
>> constant.
>> 
>> Thoughts ?
> 
> As I remember the earlier discussion, Linus felt that the kernel doesn't
> really need any sort of explicit control dependency (although we called
> it "volatile if").  In many cases there is an actual semantic
> dependency, so it doesn't matter what the compiler does -- the hardware
> will enforce the actual dependency.  In other cases, we can work around
> the issue by using acquire loads or release stores.

IMHO, having to chase down what the instruction selection does on every
architecture for every supported compiler for each control dependency in
order to confirm that the control dependency is still present in the resulting
assembly is fragile. If the kernel really doesn't need explicit control
dependency, then maybe the whole notion of "control dependency"-based
ordering should be removed in favor of explicit acquire loads/release stores
and barriers.

> In fact, Linus's biggest wish was to have a weak form of compiler
> barrier, one which would block the compiler from reordering accesses
> across the barrier but wouldn't invalidate the compiler's knowledge
> about the values of earlier reads (which barrier() would do).

Then maybe we could simply tweak the ctrl_dep_emit_loop() implementation and
remove the "memory" clobber. Then its only effect is to prevent the compiler
from knowing whether there is an infinite loop, thus preventing reordering
accesses across the conditional branch without requiring the compiler to
discard earlier reads:

#define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm_volatile_goto ("" : : : : l_dummy); (x); })

and for the record, the ctrl_dep_eval(x) implementation needs extra parentheses:

#define ctrl_dep_eval(x)        ({ BUILD_BUG_ON(__builtin_constant_p((_Bool) (x))); (x); })

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-09-24 19:55   ` Segher Boessenkool
@ 2021-09-24 20:39     ` Mathieu Desnoyers
  2021-09-24 22:07       ` Mathieu Desnoyers
  0 siblings, 1 reply; 127+ messages in thread
From: Mathieu Desnoyers @ 2021-09-24 20:39 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Peter Zijlstra, Linus Torvalds, Will Deacon, paulmck, Alan Stern,
	Andrea Parri, Boqun Feng, Nicholas Piggin, David Howells,
	j alglave, luc maranget, akiyks, linux-kernel, linux-toolchains,
	linux-arch

----- On Sep 24, 2021, at 3:55 PM, Segher Boessenkool segher@kernel.crashing.org wrote:

> Hi!
> 
> On Fri, Sep 24, 2021 at 02:38:58PM -0400, Mathieu Desnoyers wrote:
>> Following the LPC2021 BoF about control dependency, I re-read the kernel
>> documentation about control dependency, and ended up thinking that what
>> we have now is utterly fragile.
>> 
>> Considering that the goal here is to prevent the compiler from being able to
>> optimize a conditional branch into something which lacks the control
>> dependency, while letting the compiler choose the best conditional
>> branch in each case, how about the following approach ?
>> 
>> #define ctrl_dep_eval(x)        ({ BUILD_BUG_ON(__builtin_constant_p((_Bool)
>> x)); x; })
>> #define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm volatile goto
>> ("" : : : "cc", "memory" : l_dummy); (x); })
>> #define ctrl_dep_if(x)          if ((ctrl_dep_eval(x) && ctrl_dep_emit_loop(1))
>> || ctrl_dep_emit_loop(0))
> 
> [The "cc" clobber only pessimises things: the asm doesn't actually
> clobber the default condition code register (which is what "cc" means),
> and you can have conditional branches using other condition code
> registers, or on other registers even (general purpose registers are
> common).]

I'm currently considering removing both "memory" and "cc" clobbers from
the asm goto.

> 
>> The idea is to forbid the compiler from considering the two branches as
>> identical by adding a dummy loop in each branch with an empty asm goto.
>> Considering that the compiler should not assume anything about the
>> contents of the asm goto (it's been designed so the generated assembly
>> can be modified at runtime), then the compiler can hardly know whether
>> each branch will trigger an infinite loop or not, which should prevent
>> unwanted optimisations.
> 
> The compiler looks if the code is identical, nothing more, nothing less.
> There are no extra guarantees.  In principle the compiler could see both
> copies are empty asms looping to self, and so consider them equal.

I would expect the compiler not to attempt combining asm goto based on their
similarity because it has been made clear starting from the original requirements
from the kernel community to the gcc developers that one major use-case of asm
goto involves self-modifying code (patching between nops and jumps).

If this happens to be a real possibility, then we may need to work around this for
other uses of asm goto as well.

If there is indeed a scenario where the compiler can combine similar asm goto statements,
then I suspect we may want to emit unique dummy code in the assembly which gets placed in a
discarded section, e.g.:

#define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm goto (       \
                ".pushsection .discard.ctrl_dep\n\t"                            \
                ".long " __stringify(__COUNTER__) "\n\t"                        \
                ".popsection\n\t"                                               \
                "" : : : : l_dummy); (x); })

But then a similar trick would be needed for jump labels as well.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 127+ messages in thread

* Re: [RFC] LKMM: Add volatile_if()
  2021-09-24 20:39     ` Mathieu Desnoyers
@ 2021-09-24 22:07       ` Mathieu Desnoyers
  0 siblings, 0 replies; 127+ messages in thread
From: Mathieu Desnoyers @ 2021-09-24 22:07 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Peter Zijlstra, Linus Torvalds, Will Deacon, paulmck, Alan Stern,
	Andrea Parri, Boqun Feng, Nicholas Piggin, David Howells,
	j alglave, luc maranget, akiyks, linux-kernel, linux-toolchains,
	linux-arch

----- On Sep 24, 2021, at 4:39 PM, Mathieu Desnoyers mathieu.desnoyers@efficios.com wrote:

> ----- On Sep 24, 2021, at 3:55 PM, Segher Boessenkool segher@kernel.crashing.org
> wrote:
> 
>> Hi!
>> 
>> On Fri, Sep 24, 2021 at 02:38:58PM -0400, Mathieu Desnoyers wrote:
>>> Following the LPC2021 BoF about control dependency, I re-read the kernel
>>> documentation about control dependency, and ended up thinking that what
>>> we have now is utterly fragile.
>>> 
>>> Considering that the goal here is to prevent the compiler from being able to
>>> optimize a conditional branch into something which lacks the control
>>> dependency, while letting the compiler choose the best conditional
>>> branch in each case, how about the following approach ?
>>> 
>>> #define ctrl_dep_eval(x)        ({ BUILD_BUG_ON(__builtin_constant_p((_Bool)
>>> x)); x; })
>>> #define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm volatile goto
>>> ("" : : : "cc", "memory" : l_dummy); (x); })
>>> #define ctrl_dep_if(x)          if ((ctrl_dep_eval(x) && ctrl_dep_emit_loop(1))
>>> || ctrl_dep_emit_loop(0))
>> 
>> [The "cc" clobber only pessimises things: the asm doesn't actually
>> clobber the default condition code register (which is what "cc" means),
>> and you can have conditional branches using other condition code
>> registers, or on other registers even (general purpose registers are
>> common).]
> 
> I'm currently considering removing both "memory" and "cc" clobbers from
> the asm goto.
> 
>> 
>>> The idea is to forbid the compiler from considering the two branches as
>>> identical by adding a dummy loop in each branch with an empty asm goto.
>>> Considering that the compiler should not assume anything about the
>>> contents of the asm goto (it's been designed so the generated assembly
>>> can be modified at runtime), then the compiler can hardly know whether
>>> each branch will trigger an infinite loop or not, which should prevent
>>> unwanted optimisations.
>> 
>> The compiler looks if the code is identical, nothing more, nothing less.
>> There are no extra guarantees.  In principle the compiler could see both
>> copies are empty asms looping to self, and so consider them equal.
> 
> I would expect the compiler not to attempt combining asm goto based on their
> similarity because it has been made clear starting from the original
> requirements
> from the kernel community to the gcc developers that one major use-case of asm
> goto involves self-modifying code (patching between nops and jumps).
> 
> If this happens to be a real possibility, then we may need to work-around this
> for
> other uses of asm goto as well.

Now that I page this stuff back into my brain (I last looked at it in detail some
12 years ago), I recall that letting compilers combine (CSE) asm goto statements which
happen to match was actually something we wanted to permit, because we don't care
about editing the nops into jumps for each individual asm goto if they happen
to have the same effect when modified.

> 
> If there is indeed a scenario where the compiler can combine similar asm goto
> statements,
> then I suspect we may want to emit unique dummy code in the assembly which gets
> placed in a
> discarded section, e.g.:
> 
> #define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm goto (
> \
>                ".pushsection .discard.ctrl_dep\n\t"                            \
>                ".long " __stringify(__COUNTER__) "\n\t"                        \
>                ".popsection\n\t"                                               \
>                "" : : : : l_dummy); (x); })
> 

So I think your point is very much valid: we need some way to make the content of the asm goto
different between the two branches. I think the __COUNTER__ approach is overkill though:
we don't care about making each of the asm goto loops unique within the entire file;
we just don't want them to match between the two legs of the branch.

So something like this should be enough:

#define ctrl_dep_emit_loop(x)   ({ __label__ l_dummy; l_dummy: asm goto (       \
                ".pushsection .discard.ctrl_dep\n\t"                            \
                ".long " __stringify(x) "\n\t"                                  \
                ".popsection\n\t"                                               \
                "" : : : : l_dummy); (x); })

So we emit respectively a 0 and 1 into the discarded section.

Thoughts ?

Thanks,

Mathieu


> But then a similar trick would be needed for jump labels as well.
> 
> Thanks,
> 
> Mathieu
> 
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 127+ messages in thread

end of thread, other threads:[~2021-09-24 22:07 UTC | newest]

Thread overview: 127+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-04 10:12 [RFC] LKMM: Add volatile_if() Peter Zijlstra
2021-06-04 10:44 ` Will Deacon
2021-06-04 11:13   ` Will Deacon
2021-06-04 11:31   ` Peter Zijlstra
2021-06-04 13:44     ` Will Deacon
2021-06-04 13:56       ` Peter Zijlstra
2021-06-04 15:13         ` Will Deacon
2021-06-04 15:22           ` Peter Zijlstra
2021-06-04 15:36             ` Alan Stern
2021-06-04 15:42             ` Peter Zijlstra
2021-06-04 15:51               ` Alan Stern
2021-06-04 16:17                 ` Peter Zijlstra
2021-06-04 18:27                   ` Alan Stern
2021-06-04 19:09                     ` Linus Torvalds
2021-06-04 19:18                       ` Linus Torvalds
2021-06-04 20:56                         ` Paul E. McKenney
2021-06-04 21:27                           ` Linus Torvalds
2021-06-04 21:40                             ` Paul E. McKenney
2021-06-04 22:19                               ` Linus Torvalds
2021-06-05 14:57                                 ` Alan Stern
2021-06-06  0:14                                   ` Paul E. McKenney
2021-06-06  1:29                                     ` Alan Stern
2021-06-06  3:41                                       ` Linus Torvalds
2021-06-06  4:43                                         ` Paul E. McKenney
2021-06-06 13:17                                           ` Segher Boessenkool
2021-06-06 19:07                                             ` Paul E. McKenney
2021-06-06 12:59                                         ` Segher Boessenkool
2021-06-06 13:47                                           ` Alan Stern
2021-06-06 17:13                                             ` Segher Boessenkool
2021-06-06 18:25                                           ` Linus Torvalds
2021-06-06 19:19                                             ` Segher Boessenkool
2021-06-06 18:41                                         ` Alan Stern
2021-06-06 18:59                                         ` Jakub Jelinek
2021-06-06 19:15                                           ` Paul E. McKenney
2021-06-06 19:22                                           ` Linus Torvalds
2021-06-06 20:11                                             ` Segher Boessenkool
2021-06-06 21:19                                             ` Alexander Monakov
2021-06-06 22:38                                               ` Linus Torvalds
2021-06-06 23:39                                                 ` Rasmus Villemoes
2021-06-06 23:44                                                   ` Rasmus Villemoes
2021-06-07  8:01                                                 ` Alexander Monakov
2021-06-07  8:27                                                   ` Marco Elver
2021-06-07 15:28                                                     ` Paul E. McKenney
2021-06-07 17:04                                                       ` Marco Elver
2021-06-08  9:30                                                         ` Marco Elver
2021-06-08 11:22                                                           ` Peter Zijlstra
2021-06-08 15:28                                                             ` Segher Boessenkool
2021-06-09 12:44                                                               ` Marco Elver
2021-06-09 15:31                                                                 ` Segher Boessenkool
2021-06-09 16:13                                                                   ` Marco Elver
2021-06-09 17:14                                                                     ` Segher Boessenkool
2021-06-09 17:31                                                                       ` Nick Desaulniers
2021-06-09 20:24                                                                         ` Segher Boessenkool
2021-06-09 18:25                                                                     ` Linus Torvalds
2021-06-07 17:52                                                   ` Segher Boessenkool
2021-06-07 18:07                                                     ` Alexander Monakov
2021-06-07 18:18                                                       ` Segher Boessenkool
2021-06-07 17:42                                                 ` Segher Boessenkool
2021-06-07 20:31                                                   ` Linus Torvalds
2021-06-07 22:54                                                     ` Segher Boessenkool
2021-06-06 11:53                                       ` Segher Boessenkool
2021-06-06 13:45                                         ` Alan Stern
2021-06-06 18:04                                         ` Linus Torvalds
2021-06-06 18:22                                           ` Alan Stern
2021-06-06 18:43                                             ` Linus Torvalds
2021-06-07 10:43                                               ` Peter Zijlstra
2021-06-07 11:52                                                 ` Will Deacon
2021-06-07 15:25                                                   ` Paul E. McKenney
2021-06-07 16:02                                                     ` Will Deacon
2021-06-07 18:08                                                       ` Paul E. McKenney
     [not found]                                                   ` <20210730172020.GA32396@knuckles.cs.ucl.ac.uk>
2021-07-30 20:35                                                     ` Alan Stern
2021-08-02 21:18                                                     ` Alan Stern
2021-08-02 23:31                                                     ` Paul E. McKenney
2021-08-04 20:09                                                       ` Alan Stern
2021-08-05 19:47                                                     ` Alan Stern
2021-08-07  0:51                                                     ` Alan Stern
2021-06-06 18:40                                           ` Segher Boessenkool
2021-06-06 18:48                                             ` Linus Torvalds
2021-06-06 18:53                                               ` Linus Torvalds
2021-06-06 19:52                                               ` Segher Boessenkool
2021-06-06 20:11                                                 ` Linus Torvalds
2021-06-06 20:26                                                   ` Segher Boessenkool
2021-06-06 23:37                                                     ` Paul E. McKenney
2021-06-07 14:12                                                       ` Segher Boessenkool
2021-06-07 15:27                                                         ` Paul E. McKenney
2021-06-07 18:23                                                           ` Segher Boessenkool
2021-06-07 19:51                                                             ` Alan Stern
2021-06-07 20:16                                                               ` Paul E. McKenney
2021-06-07 22:40                                                                 ` Segher Boessenkool
2021-06-07 23:26                                                                   ` Paul E. McKenney
2021-06-07 10:52                                                     ` Peter Zijlstra
2021-06-07 14:16                                                       ` Segher Boessenkool
2021-06-04 22:05                             ` Peter Zijlstra
2021-06-05  3:14                       ` Alan Stern
2021-06-05 16:24                         ` Linus Torvalds
2021-06-04 15:50         ` Segher Boessenkool
2021-06-04 15:47     ` Segher Boessenkool
2021-06-04 11:44 ` Peter Zijlstra
2021-06-04 14:13   ` Paul E. McKenney
2021-06-04 15:35   ` Segher Boessenkool
2021-06-04 16:10     ` Peter Zijlstra
2021-06-04 16:40       ` Segher Boessenkool
2021-06-04 18:55         ` Paul E. McKenney
2021-06-04 19:53           ` Segher Boessenkool
2021-06-04 20:40             ` Paul E. McKenney
2021-06-06 11:36               ` Segher Boessenkool
2021-06-06 19:01                 ` Paul E. McKenney
2021-06-04 14:25 ` Alan Stern
2021-06-04 16:09 ` Segher Boessenkool
2021-06-04 16:33   ` Peter Zijlstra
2021-06-04 16:30 ` Linus Torvalds
2021-06-04 16:37   ` Peter Zijlstra
2021-06-04 16:52     ` Segher Boessenkool
2021-06-04 17:10     ` Linus Torvalds
2021-06-04 17:24       ` Segher Boessenkool
2021-06-04 17:38         ` Linus Torvalds
2021-06-04 18:25           ` Segher Boessenkool
2021-06-04 19:17         ` Peter Zijlstra
2021-06-04 20:43           ` Paul E. McKenney
2021-06-04 18:23       ` Alan Stern
2021-06-08 12:48 ` David Laight
2021-09-24 18:38 ` Mathieu Desnoyers
2021-09-24 19:52   ` Alan Stern
2021-09-24 20:22     ` Mathieu Desnoyers
2021-09-24 19:55   ` Segher Boessenkool
2021-09-24 20:39     ` Mathieu Desnoyers
2021-09-24 22:07       ` Mathieu Desnoyers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).