linux-toolchains.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* static_branch/jump_label vs branch merging
@ 2021-04-08 16:52 Peter Zijlstra
  2021-04-09  9:57 ` Ard Biesheuvel
  2021-04-09 13:03 ` Segher Boessenkool
  0 siblings, 2 replies; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-08 16:52 UTC (permalink / raw)
  To: linux-toolchains, linux-kernel; +Cc: jpoimboe, jbaron, rostedt, ardb

Hi!

Given code like:

DEFINE_STATIC_KEY_FALSE(sched_schedstats);

#define   schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
#define   schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
#define __schedstat_set(var, val)	do { var = (val); } while (0)

void foo(void)
{
	struct task_struct *p = current;

	schedstat_set(p->se.statistics.wait_start,  0);
	schedstat_set(p->se.statistics.sleep_start, 0);
	schedstat_set(p->se.statistics.block_start, 0);
}

Where the static_branch_unlikely() ends up being:

static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
{
	asm_volatile_goto("1:"
		".byte " __stringify(BYTES_NOP5) "\n\t"
		".pushsection __jump_table,  \"aw\" \n\t"
		_ASM_ALIGN "\n\t"
		".long 1b - ., %l[l_yes] - . \n\t"
		_ASM_PTR "%c0 + %c1 - .\n\t"
		".popsection \n\t"
		: :  "i" (key), "i" (branch) : : l_yes);

	return false;
l_yes:
	return true;
}

The compiler gives us code like:

000000000000a290 <foo>:
    a290:       65 48 8b 04 25 00 00 00 00      mov    %gs:0x0,%rax     a295: R_X86_64_32S      current_task
    a299:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
    a29e:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
    a2a3:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
    a2a8:       c3                      retq
    a2a9:       48 c7 80 28 01 00 00 00 00 00 00        movq   $0x0,0x128(%rax)
    a2b4:       eb e8                   jmp    a29e <foo+0xe>
    a2b6:       48 c7 80 58 01 00 00 00 00 00 00        movq   $0x0,0x158(%rax)
    a2c1:       eb e0                   jmp    a2a3 <foo+0x13>
    a2c3:       48 c7 80 70 01 00 00 00 00 00 00        movq   $0x0,0x170(%rax)
    a2ce:       c3                      retq


Now, in this case I can easily rewrite foo like:

void foo2(void)
{
	struct task_struct *p = current;

	if (schedstat_enabled()) {
		__schedstat_set(p->se.statistics.wait_start,  0);
		__schedstat_set(p->se.statistics.sleep_start, 0);
		__schedstat_set(p->se.statistics.block_start, 0);
	}
}

Which gives the far more reasonable:

000000000000a2d0 <foo2>:
    a2d0:       65 48 8b 04 25 00 00 00 00      mov    %gs:0x0,%rax     a2d5: R_X86_64_32S      current_task
    a2d9:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
    a2de:       c3                      retq
    a2df:       48 c7 80 28 01 00 00 00 00 00 00        movq   $0x0,0x128(%rax)
    a2ea:       48 c7 80 58 01 00 00 00 00 00 00        movq   $0x0,0x158(%rax)
    a2f5:       48 c7 80 70 01 00 00 00 00 00 00        movq   $0x0,0x170(%rax)
    a300:       c3                      retq

But I've found a few sites where this isn't so trivial.

Is there *any* way in which we can have the compiler recognise that the
asm_goto only depends on its arguments and have it merge the branches
itself?

I do realize that asm-goto being volatile this is a fairly huge ask, but
I figured I should at least raise the issue, if only to raise awareness.



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-08 16:52 static_branch/jump_label vs branch merging Peter Zijlstra
@ 2021-04-09  9:57 ` Ard Biesheuvel
  2021-04-09 10:55   ` Florian Weimer
  2021-04-09 11:12   ` Peter Zijlstra
  2021-04-09 13:03 ` Segher Boessenkool
  1 sibling, 2 replies; 25+ messages in thread
From: Ard Biesheuvel @ 2021-04-09  9:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-toolchains, Linux Kernel Mailing List, Josh Poimboeuf,
	Jason Baron, Steven Rostedt (VMware)

On Thu, 8 Apr 2021 at 18:53, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Hi!
>
> Given code like:
>
> DEFINE_STATIC_KEY_FALSE(sched_schedstats);
>
> #define   schedstat_enabled()           static_branch_unlikely(&sched_schedstats)
> #define   schedstat_set(var, val)       do { if (schedstat_enabled()) { var = (val); } } while (0)
> #define __schedstat_set(var, val)       do { var = (val); } while (0)
>
> void foo(void)
> {
>         struct task_struct *p = current;
>
>         schedstat_set(p->se.statistics.wait_start,  0);
>         schedstat_set(p->se.statistics.sleep_start, 0);
>         schedstat_set(p->se.statistics.block_start, 0);
> }
>
> Where the static_branch_unlikely() ends up being:
>
> static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
> {
>         asm_volatile_goto("1:"
>                 ".byte " __stringify(BYTES_NOP5) "\n\t"
>                 ".pushsection __jump_table,  \"aw\" \n\t"
>                 _ASM_ALIGN "\n\t"
>                 ".long 1b - ., %l[l_yes] - . \n\t"
>                 _ASM_PTR "%c0 + %c1 - .\n\t"
>                 ".popsection \n\t"
>                 : :  "i" (key), "i" (branch) : : l_yes);
>
>         return false;
> l_yes:
>         return true;
> }
>
> The compiler gives us code like:
>
> 000000000000a290 <foo>:
>     a290:       65 48 8b 04 25 00 00 00 00      mov    %gs:0x0,%rax     a295: R_X86_64_32S      current_task
>     a299:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
>     a29e:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
>     a2a3:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
>     a2a8:       c3                      retq
>     a2a9:       48 c7 80 28 01 00 00 00 00 00 00        movq   $0x0,0x128(%rax)
>     a2b4:       eb e8                   jmp    a29e <foo+0xe>
>     a2b6:       48 c7 80 58 01 00 00 00 00 00 00        movq   $0x0,0x158(%rax)
>     a2c1:       eb e0                   jmp    a2a3 <foo+0x13>
>     a2c3:       48 c7 80 70 01 00 00 00 00 00 00        movq   $0x0,0x170(%rax)
>     a2ce:       c3                      retq
>
>
> Now, in this case I can easily rewrite foo like:
>
> void foo2(void)
> {
>         struct task_struct *p = current;
>
>         if (schedstat_enabled()) {
>                 __schedstat_set(p->se.statistics.wait_start,  0);
>                 __schedstat_set(p->se.statistics.sleep_start, 0);
>                 __schedstat_set(p->se.statistics.block_start, 0);
>         }
> }
>
> Which gives the far more reasonable:
>
> 000000000000a2d0 <foo2>:
>     a2d0:       65 48 8b 04 25 00 00 00 00      mov    %gs:0x0,%rax     a2d5: R_X86_64_32S      current_task
>     a2d9:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
>     a2de:       c3                      retq
>     a2df:       48 c7 80 28 01 00 00 00 00 00 00        movq   $0x0,0x128(%rax)
>     a2ea:       48 c7 80 58 01 00 00 00 00 00 00        movq   $0x0,0x158(%rax)
>     a2f5:       48 c7 80 70 01 00 00 00 00 00 00        movq   $0x0,0x170(%rax)
>     a300:       c3                      retq
>
> But I've found a few sites where this isn't so trivial.
>
> Is there *any* way in which we can have the compiler recognise that the
> asm_goto only depends on its arguments and have it merge the branches
> itself?
>
> I do realize that asm-goto being volatile this is a fairly huge ask, but
> I figured I should at least raise the issue, if only to raise awareness.
>

Wouldn't that require the compiler to interpret the contents of the asm() block?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09  9:57 ` Ard Biesheuvel
@ 2021-04-09 10:55   ` Florian Weimer
  2021-04-09 11:16     ` Peter Zijlstra
  2021-04-09 11:12   ` Peter Zijlstra
  1 sibling, 1 reply; 25+ messages in thread
From: Florian Weimer @ 2021-04-09 10:55 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Peter Zijlstra, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

* Ard Biesheuvel:

> Wouldn't that require the compiler to interpret the contents of the
> asm() block?

Yes and no.  It would require proper toolchain support, so in this case
a new ELF relocation type, with compiler, assembler, and linker support
to generate those relocations and process them.  As far as I understand
it, the kernel doesn't do things this way.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09  9:57 ` Ard Biesheuvel
  2021-04-09 10:55   ` Florian Weimer
@ 2021-04-09 11:12   ` Peter Zijlstra
  2021-04-09 11:55     ` David Malcolm
  1 sibling, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-09 11:12 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: linux-toolchains, Linux Kernel Mailing List, Josh Poimboeuf,
	Jason Baron, Steven Rostedt (VMware)

On Fri, Apr 09, 2021 at 11:57:22AM +0200, Ard Biesheuvel wrote:
> On Thu, 8 Apr 2021 at 18:53, Peter Zijlstra <peterz@infradead.org> wrote:

> > Is there *any* way in which we can have the compiler recognise that the
> > asm_goto only depends on its arguments and have it merge the branches
> > itself?
> >
> > I do realize that asm-goto being volatile this is a fairly huge ask, but
> > I figured I should at least raise the issue, if only to raise awareness.
> >
> 
> Wouldn't that require the compiler to interpret the contents of the asm() block?

Yeah, this is more or less asking for ponies :-) One option would be
some annotation that conveys the desired semantics without it having to
untangle the mess in the asm block.

The thing the compiler needs to know is that the branch is constant for
any @key, and hence allow the obvious optimizations. I'm not sure if
this is something compiler folks would be even willing to consider, but
I figured asking never hurts.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 10:55   ` Florian Weimer
@ 2021-04-09 11:16     ` Peter Zijlstra
  2021-04-09 19:33       ` Nick Desaulniers
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-09 11:16 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, Apr 09, 2021 at 12:55:18PM +0200, Florian Weimer wrote:
> * Ard Biesheuvel:
> 
> > Wouldn't that require the compiler to interpret the contents of the
> > asm() block?
> 
> Yes and no.  It would require proper toolchain support, so in this case
> a new ELF relocation type, with compiler, assembler, and linker support
> to generate those relocations and process them.  As far as I understand
> it, the kernel doesn't do things this way.

I don't think that all is needed. All we need is for the compiler to
recognise that:

	if (cond) {
		stmt-A;
	}
	if (cond) {
		stmt-B;
	}

both cond are equivalent and hence can merge the blocks like:

	if (cond) {
		stmt-A;
		stmt-B;
	}

But because @cond is some super opaque asm crap, the compiler throws up
it's imaginry hands and says it cannot possibly tell and leaves them as
is.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 11:12   ` Peter Zijlstra
@ 2021-04-09 11:55     ` David Malcolm
  2021-04-09 12:03       ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: David Malcolm @ 2021-04-09 11:55 UTC (permalink / raw)
  To: Peter Zijlstra, Ard Biesheuvel
  Cc: linux-toolchains, Linux Kernel Mailing List, Josh Poimboeuf,
	Jason Baron, Steven Rostedt (VMware)

On Fri, 2021-04-09 at 13:12 +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 11:57:22AM +0200, Ard Biesheuvel wrote:
> > On Thu, 8 Apr 2021 at 18:53, Peter Zijlstra <peterz@infradead.org>
> > wrote:
> 
> > > Is there *any* way in which we can have the compiler recognise
> > > that the
> > > asm_goto only depends on its arguments and have it merge the
> > > branches
> > > itself?
> > > 
> > > I do realize that asm-goto being volatile this is a fairly huge
> > > ask, but
> > > I figured I should at least raise the issue, if only to raise
> > > awareness.
> > > 
> > 
> > Wouldn't that require the compiler to interpret the contents of the
> > asm() block?
> 
> Yeah, this is more or less asking for ponies :-) One option would be
> some annotation that conveys the desired semantics without it having
> to
> untangle the mess in the asm block.
> 
> The thing the compiler needs to know is that the branch is constant
> for
> any @key, and hence allow the obvious optimizations. I'm not sure if
> this is something compiler folks would be even willing to consider,
> but
> I figured asking never hurts.
> 

Sorry if this is a dumb question, but does the function attribute:
  __attribute__ ((pure)) 
help here?  It's meant to allow multiple calls to a predicate to be
merged - though I'd be nervous of using it here, the predicate isn't
100% pure, since AIUI the whole point of what you've built is for
predicates that very rarely change - but can change occasionally.

Hope this is constructive
Dave



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 11:55     ` David Malcolm
@ 2021-04-09 12:03       ` Peter Zijlstra
  2021-04-09 13:01         ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-09 12:03 UTC (permalink / raw)
  To: David Malcolm
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, Apr 09, 2021 at 07:55:42AM -0400, David Malcolm wrote:
> On Fri, 2021-04-09 at 13:12 +0200, Peter Zijlstra wrote:
> > On Fri, Apr 09, 2021 at 11:57:22AM +0200, Ard Biesheuvel wrote:
> > > On Thu, 8 Apr 2021 at 18:53, Peter Zijlstra <peterz@infradead.org>
> > > wrote:
> > 
> > > > Is there *any* way in which we can have the compiler recognise
> > > > that the
> > > > asm_goto only depends on its arguments and have it merge the
> > > > branches
> > > > itself?
> > > > 
> > > > I do realize that asm-goto being volatile this is a fairly huge
> > > > ask, but
> > > > I figured I should at least raise the issue, if only to raise
> > > > awareness.
> > > > 
> > > 
> > > Wouldn't that require the compiler to interpret the contents of the
> > > asm() block?
> > 
> > Yeah, this is more or less asking for ponies :-) One option would be
> > some annotation that conveys the desired semantics without it having
> > to
> > untangle the mess in the asm block.
> > 
> > The thing the compiler needs to know is that the branch is constant
> > for
> > any @key, and hence allow the obvious optimizations. I'm not sure if
> > this is something compiler folks would be even willing to consider,
> > but
> > I figured asking never hurts.
> > 
> 
> Sorry if this is a dumb question, but does the function attribute:
>   __attribute__ ((pure)) 
> help here?  It's meant to allow multiple calls to a predicate to be
> merged - though I'd be nervous of using it here, the predicate isn't
> 100% pure, since AIUI the whole point of what you've built is for
> predicates that very rarely change - but can change occasionally.

I actually tried that, but it doesn't seem to work. Given the function
arguments are all compile time constants it should DTRT AFAICT, but
alas.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 12:03       ` Peter Zijlstra
@ 2021-04-09 13:01         ` Peter Zijlstra
  2021-04-09 13:13           ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-09 13:01 UTC (permalink / raw)
  To: David Malcolm
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, Apr 09, 2021 at 02:03:46PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 07:55:42AM -0400, David Malcolm wrote:

> > Sorry if this is a dumb question, but does the function attribute:
> >   __attribute__ ((pure)) 
> > help here?  It's meant to allow multiple calls to a predicate to be
> > merged - though I'd be nervous of using it here, the predicate isn't
> > 100% pure, since AIUI the whole point of what you've built is for
> > predicates that very rarely change - but can change occasionally.
> 
> I actually tried that, but it doesn't seem to work. Given the function
> arguments are all compile time constants it should DTRT AFAICT, but
> alas.

FWIW, I tried the below patch and GCC-10.2.1 on current tip/master.

---
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 610a05374c02..704438d07bc8 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -14,7 +14,7 @@
 #include <linux/stringify.h>
 #include <linux/types.h>
 
-static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
+static __always_inline __pure bool arch_static_branch(struct static_key * const key, const bool branch)
 {
 	asm_volatile_goto("1:"
 		".byte " __stringify(BYTES_NOP5) "\n\t"
@@ -30,7 +30,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, co
 	return true;
 }
 
-static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
+static __always_inline __pure bool arch_static_branch_jump(struct static_key * const key, const bool branch)
 {
 	asm_volatile_goto("1:"
 		".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 05f5554d860f..834086663c26 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -195,12 +195,12 @@ struct module;
 #define JUMP_TYPE_LINKED	2UL
 #define JUMP_TYPE_MASK		3UL
 
-static __always_inline bool static_key_false(struct static_key *key)
+static __always_inline __pure bool static_key_false(struct static_key * const key)
 {
 	return arch_static_branch(key, false);
 }
 
-static __always_inline bool static_key_true(struct static_key *key)
+static __always_inline __pure bool static_key_true(struct static_key * const key)
 {
 	return !arch_static_branch(key, true);
 }

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-08 16:52 static_branch/jump_label vs branch merging Peter Zijlstra
  2021-04-09  9:57 ` Ard Biesheuvel
@ 2021-04-09 13:03 ` Segher Boessenkool
  1 sibling, 0 replies; 25+ messages in thread
From: Segher Boessenkool @ 2021-04-09 13:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-toolchains, linux-kernel, jpoimboe, jbaron, rostedt, ardb

On Thu, Apr 08, 2021 at 06:52:18PM +0200, Peter Zijlstra wrote:
> Is there *any* way in which we can have the compiler recognise that the
> asm_goto only depends on its arguments and have it merge the branches
> itself?
> 
> I do realize that asm-goto being volatile this is a fairly huge ask, but
> I figured I should at least raise the issue, if only to raise awareness.

"volatile" should not be an impediment to this at all, volatile means
there is an unspecified side effect, nothing more, nothing less.  But
yes this currently does not work with GCC:

void f(int x)
{       if (x)
                asm volatile("ojee %0" :: "r"(x));
        else
                asm volatile("ojee %0" :: "r"(x));
}

or even

static inline void h(int x)
{
        asm volatile("ojee %0" :: "r"(x));
}

void f1(int x)
{       if (x)
                h(x);
        else
                h(x);
}

which both emit silly machine code.


Segher

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 13:01         ` Peter Zijlstra
@ 2021-04-09 13:13           ` Peter Zijlstra
  2021-04-09 13:48             ` David Malcolm
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-09 13:13 UTC (permalink / raw)
  To: David Malcolm
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, Apr 09, 2021 at 03:01:50PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 02:03:46PM +0200, Peter Zijlstra wrote:
> > On Fri, Apr 09, 2021 at 07:55:42AM -0400, David Malcolm wrote:
> 
> > > Sorry if this is a dumb question, but does the function attribute:
> > >   __attribute__ ((pure)) 
> > > help here?  It's meant to allow multiple calls to a predicate to be
> > > merged - though I'd be nervous of using it here, the predicate isn't
> > > 100% pure, since AIUI the whole point of what you've built is for
> > > predicates that very rarely change - but can change occasionally.
> > 
> > I actually tried that, but it doesn't seem to work. Given the function
> > arguments are all compile time constants it should DTRT AFAICT, but
> > alas.
> 
> FWIW, I tried the below patch and GCC-10.2.1 on current tip/master.

I also just tried __attribute__((__const__)), which is stronger still
than __pure__ and that's also not working :/

I then also tried to replace the __buildin_types_compatible_p() magic in
static_branch_unlikely() with _Generic(), but still no joy.


---
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 610a05374c02..f14c6863b911 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -14,7 +14,7 @@
 #include <linux/stringify.h>
 #include <linux/types.h>
 
-static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
+static __always_inline __attribute_const__ bool arch_static_branch(struct static_key * const key, const bool branch)
 {
 	asm_volatile_goto("1:"
 		".byte " __stringify(BYTES_NOP5) "\n\t"
@@ -30,7 +30,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, co
 	return true;
 }
 
-static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
+static __always_inline __attribute_const__ bool arch_static_branch_jump(struct static_key * const key, const bool branch)
 {
 	asm_volatile_goto("1:"
 		".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 05f5554d860f..2c250d8b9a02 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -195,12 +195,12 @@ struct module;
 #define JUMP_TYPE_LINKED	2UL
 #define JUMP_TYPE_MASK		3UL
 
-static __always_inline bool static_key_false(struct static_key *key)
+static __always_inline __attribute_const__ bool static_key_false(struct static_key * const key)
 {
 	return arch_static_branch(key, false);
 }
 
-static __always_inline bool static_key_true(struct static_key *key)
+static __always_inline __attribute_const__ bool static_key_true(struct static_key * const key)
 {
 	return !arch_static_branch(key, true);
 }
@@ -466,6 +466,18 @@ extern bool ____wrong_branch_error(void);
  * See jump_label_type() / jump_label_init_type().
  */
 
+#if 1
+
+#define static_branch_likely(x)	_Generic(*(x),					\
+	struct static_key_true:	!arch_static_branch(&(x)->key, true),		\
+	struct static_key_false: !arch_static_branch_jump(&(x)->key, true))
+
+#define static_branch_unlikely(x) _Generic(*(x),				\
+	struct static_key_true:	arch_static_branch_jump(&(x)->key, false),	\
+	struct static_key_false: arch_static_branch(&(x)->key, false))
+
+#else
+
 #define static_branch_likely(x)							\
 ({										\
 	bool branch;								\
@@ -490,6 +502,8 @@ extern bool ____wrong_branch_error(void);
 	unlikely_notrace(branch);							\
 })
 
+#endif
+
 #else /* !CONFIG_JUMP_LABEL */
 
 #define static_branch_likely(x)		likely_notrace(static_key_enabled(&(x)->key))

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 13:13           ` Peter Zijlstra
@ 2021-04-09 13:48             ` David Malcolm
  2021-04-09 18:40               ` Peter Zijlstra
  2021-04-10 12:44               ` David Laight
  0 siblings, 2 replies; 25+ messages in thread
From: David Malcolm @ 2021-04-09 13:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, 2021-04-09 at 15:13 +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 03:01:50PM +0200, Peter Zijlstra wrote:
> > On Fri, Apr 09, 2021 at 02:03:46PM +0200, Peter Zijlstra wrote:
> > > On Fri, Apr 09, 2021 at 07:55:42AM -0400, David Malcolm wrote:
> > 
> > > > Sorry if this is a dumb question, but does the function
> > > > attribute:
> > > >   __attribute__ ((pure)) 
> > > > help here?  It's meant to allow multiple calls to a predicate
> > > > to be
> > > > merged - though I'd be nervous of using it here, the predicate
> > > > isn't
> > > > 100% pure, since AIUI the whole point of what you've built is
> > > > for
> > > > predicates that very rarely change - but can change
> > > > occasionally.
> > > 
> > > I actually tried that, but it doesn't seem to work. Given the
> > > function
> > > arguments are all compile time constants it should DTRT AFAICT,
> > > but
> > > alas.
> > 
> > FWIW, I tried the below patch and GCC-10.2.1 on current tip/master.
> 
> I also just tried __attribute__((__const__)), which is stronger still
> than __pure__ and that's also not working :/
> 
> I then also tried to replace the __buildin_types_compatible_p() magic
> in
> static_branch_unlikely() with _Generic(), but still no joy.
> 

[..snip...]

You tried __pure on arch_static_branch; did you try it on
static_branch_unlikely?

With the caveat that my knowledge of GCC's middle-end is mostly about
implementing warnings, rather than optimization, I did some
experimentation, with gcc trunk on x86_64 FWIW.

Given:

int __attribute__((pure)) foo(void);

int t(void)
{
  int a;
  if (foo())
    a++;
  if (foo())
    a++;
  if (foo())
    a++;
  return a;
}

At -O1 and above this is optimized to a single call to foo, returning 0
or 3 accordingly.

-fdump-tree-all shows that it's the "fre1" pass that eliminates the
subsequent calls to foo, replacing them with reuses of the result of
the first call.

This is in gcc/tree-ssa-sccvn.c, a value-numbering pass.

I think you want to somehow "teach" the compiler that:
  static_branch_unlikely(&sched_schedstats)
is "pure-ish", that for some portion of the surrounding code that you
want the result to be treated as pure - though I suspect compiler
maintainers with more experience than me are thinking "but which
portion? what is it safe to assume, and what will users be annoyed
about if we optimize away? what if t itself is inlined somewhere?" and
similar concerns.

Or maybe the asm stmt itself could somehow be marked as pure??? (with
similar concerns about semantics as above)

Hope this is constructive
Dave



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 13:48             ` David Malcolm
@ 2021-04-09 18:40               ` Peter Zijlstra
  2021-04-09 19:21                 ` David Malcolm
  2021-04-10 12:44               ` David Laight
  1 sibling, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-09 18:40 UTC (permalink / raw)
  To: David Malcolm
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, Apr 09, 2021 at 09:48:33AM -0400, David Malcolm wrote:
> You tried __pure on arch_static_branch; did you try it on
> static_branch_unlikely?

static_branch_unlikely() is a CPP macro that expands to a statement
expression, or as with the later patch, a _Generic(). I'm not sure how
to apply the attribute to either of them since it is a function
attribute.

I was hoping the attribute would percolate through, so to speak.

> With the caveat that my knowledge of GCC's middle-end is mostly about
> implementing warnings, rather than optimization, I did some
> experimentation, with gcc trunk on x86_64 FWIW.
> 
> Given:
> 
> int __attribute__((pure)) foo(void);
> 
> int t(void)
> {
>   int a;
>   if (foo())
>     a++;
>   if (foo())
>     a++;
>   if (foo())
>     a++;
>   return a;
> }
> 
> At -O1 and above this is optimized to a single call to foo, returning 0
> or 3 accordingly.
> 
> -fdump-tree-all shows that it's the "fre1" pass that eliminates the
> subsequent calls to foo, replacing them with reuses of the result of
> the first call.
> 
> This is in gcc/tree-ssa-sccvn.c, a value-numbering pass.
> 
> I think you want to somehow "teach" the compiler that:
>   static_branch_unlikely(&sched_schedstats)
> is "pure-ish", that for some portion of the surrounding code that you
> want the result to be treated as pure - though I suspect compiler
> maintainers with more experience than me are thinking "but which
> portion? what is it safe to assume, and what will users be annoyed
> about if we optimize away? what if t itself is inlined somewhere?" and
> similar concerns.

Right, pure or even const. As to the scope, as wide as possible. It
literally is a global constant, the value returned is the same
everywhere.

All we need GCC to do for the static_branch construct is to emit both
branches; that is, it must not treat the result as a constant and elide
the other branches. But it can consider consecutive calls (as far and
wide as it wants) to return the same value.

> Or maybe the asm stmt itself could somehow be marked as pure??? (with
> similar concerns about semantics as above)

Yeah, not sure, someone with more clue will have to inform us what, if
anything more than marking it either pure or const is required. Perhaps
that attribute is sufficient and the compiler just isn't optimizing for
an unrelated reason.

Regards,

Peter

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 18:40               ` Peter Zijlstra
@ 2021-04-09 19:21                 ` David Malcolm
  2021-04-09 20:09                   ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: David Malcolm @ 2021-04-09 19:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, 2021-04-09 at 20:40 +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 09:48:33AM -0400, David Malcolm wrote:
> > You tried __pure on arch_static_branch; did you try it on
> > static_branch_unlikely?
> 
> static_branch_unlikely() is a CPP macro that expands to a statement
> expression, or as with the later patch, a _Generic(). I'm not sure
> how
> to apply the attribute to either of them since it is a function
> attribute.
> 
> I was hoping the attribute would percolate through, so to speak.
> 
> > With the caveat that my knowledge of GCC's middle-end is mostly
> > about
> > implementing warnings, rather than optimization, I did some
> > experimentation, with gcc trunk on x86_64 FWIW.
> > 
> > Given:
> > 
> > int __attribute__((pure)) foo(void);
> > 
> > int t(void)
> > {
> >   int a;
> >   if (foo())
> >     a++;
> >   if (foo())
> >     a++;
> >   if (foo())
> >     a++;
> >   return a;
> > }
> > 
> > At -O1 and above this is optimized to a single call to foo,
> > returning 0
> > or 3 accordingly.
> > 
> > -fdump-tree-all shows that it's the "fre1" pass that eliminates the
> > subsequent calls to foo, replacing them with reuses of the result
> > of
> > the first call.
> > 
> > This is in gcc/tree-ssa-sccvn.c, a value-numbering pass.
> > 
> > I think you want to somehow "teach" the compiler that:
> >   static_branch_unlikely(&sched_schedstats)
> > is "pure-ish", that for some portion of the surrounding code that
> > you
> > want the result to be treated as pure - though I suspect compiler
> > maintainers with more experience than me are thinking "but which
> > portion? what is it safe to assume, and what will users be annoyed
> > about if we optimize away? what if t itself is inlined somewhere?"
> > and
> > similar concerns.
> 
> Right, pure or even const. As to the scope, as wide as possible. It
> literally is a global constant, the value returned is the same
> everywhere.

[Caveat: I'm a gcc developer, not a kernel expert]

But it's not *quite* a global constant, or presumably you would be
simply using a global constant, right?  As the optimizer gets smarter,
you don't want to have it one day decide that actually it really is
constant, and optimize away everything at compile-time (e.g. when LTO
is turned on, or whatnot).

I get the impression that you're resorting to assembler because you're
pushing beyond what the C language can express.  Taking things to a
slightly higher level, am I right in thinking that what you're trying
to achieve is a control flow construct that almost always takes one of
the given branches, but which can (very rarely) be switched to
permanently take one of the other branches, and that you want the
lowest possible overhead for the common case where the control flow
hasn't been touched yet?  (and presumably little overhead for when it
has been?)... and that you want to be able to merge repeated such
conditionals.  Perhaps a __builtin_ to hint that a conditional should
work that way (analogous to __builtin_expect)?  I can imagine a way of
implementing such a construct in gcc's gimple and RTL representations,
but it would be a ton of work (and I have a full plate already)

Or maybe another way of thinking about it is that you're reading a
value and you would like the compiler to amortize away repeated reads
of the value (perhaps just within the current function).

It's kind of the opposite of "volatile" - something that the user is
happy for the compiler to treat as not changing much, as opposed to
something the user is warning the compiler about changing from under
it.  A "const-ish" value?

Sorry if I'm being incoherent; I'm kind of thinking aloud here.

Hope this is constructive
Dave


> 
> All we need GCC to do for the static_branch construct is to emit both
> branches; that is, it must not treat the result as a constant and
> elide
> the other branches. But it can consider consecutive calls (as far and
> wide as it wants) to return the same value.
> 
> > Or maybe the asm stmt itself could somehow be marked as pure???
> > (with
> > similar concerns about semantics as above)
> 
> Yeah, not sure, someone with more clue will have to inform us what,
> if
> anything more than marking it either pure or const is required.
> Perhaps
> that attribute is sufficient and the compiler just isn't optimizing
> for
> an unrelated reason.
> 
> Regards,
> 
> Peter
> 



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 11:16     ` Peter Zijlstra
@ 2021-04-09 19:33       ` Nick Desaulniers
  2021-04-09 20:11         ` Peter Zijlstra
  2021-04-10 17:02         ` Segher Boessenkool
  0 siblings, 2 replies; 25+ messages in thread
From: Nick Desaulniers @ 2021-04-09 19:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Florian Weimer, Ard Biesheuvel, linux-toolchains,
	Linux Kernel Mailing List, Josh Poimboeuf, Jason Baron,
	Steven Rostedt (VMware),
	Segher Boessenkool, dmalcolm

On Fri, Apr 9, 2021 at 4:18 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Apr 09, 2021 at 12:55:18PM +0200, Florian Weimer wrote:
> > * Ard Biesheuvel:
> >
> > > Wouldn't that require the compiler to interpret the contents of the
> > > asm() block?
> >
> > Yes and no.  It would require proper toolchain support, so in this case
> > a new ELF relocation type, with compiler, assembler, and linker support
> > to generate those relocations and process them.  As far as I understand
> > it, the kernel doesn't do things this way.
>
> I don't think that all is needed. All we need is for the compiler to
> recognise that:
>
>         if (cond) {
>                 stmt-A;
>         }
>         if (cond) {
>                 stmt-B;
>         }
>
> both cond are equivalent and hence can merge the blocks like:
>
>         if (cond) {
>                 stmt-A;
>                 stmt-B;
>         }
>
> But because @cond is some super opaque asm crap, the compiler throws up
> it's imaginry hands and says it cannot possibly tell and leaves them as
> is.

Right, because if `cond` has side effects (such as is implied by asm
statements that are volatile qualified), suddenly those side effects
would only occur once whereas previously they occurred twice.

Since asm goto is implicitly volatile qualified, it sounds like
removing the implicit volatile qualifier from asm goto might help?
Then if there were side effects but you forgot to inform the compiler
that there were via an explicit volatile qualifier, and it performed
the suggested merge, oh well.
-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 19:21                 ` David Malcolm
@ 2021-04-09 20:09                   ` Peter Zijlstra
  2021-04-09 21:07                     ` David Malcolm
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-09 20:09 UTC (permalink / raw)
  To: David Malcolm
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, Apr 09, 2021 at 03:21:49PM -0400, David Malcolm wrote:
> [Caveat: I'm a gcc developer, not a kernel expert]
> 
> But it's not *quite* a global constant, or presumably you would be
> simply using a global constant, right?  As the optimizer gets smarter,
> you don't want to have it one day decide that actually it really is
> constant, and optimize away everything at compile-time (e.g. when LTO
> is turned on, or whatnot).

Right; as I said, the result is not a constant, but any invocation ever,
will return the same result. Small but subtle difference :-)

> I get the impression that you're resorting to assembler because you're
> pushing beyond what the C language can express. 

Of course :-) I tend to always push waaaaay past what C considers sane.
Lets say I'm firmly in the C-as-Optimizing-Assembler camp :-)

> Taking things to a slightly higher level, am I right in thinking that
> what you're trying to achieve is a control flow construct that almost
> always takes one of the given branches, but which can (very rarely) be
> switched to permanently take one of the other branches, and that you
> want the lowest possible overhead for the common case where the
> control flow hasn't been touched yet? 

Correct, that's what it is. We do runtime code patching to flip the
branch if/when needed. We've been doing this for many many years now.

The issue of today is all this clever stuff defeating some simple
optimizations.

> (and presumably little overhead for when it
> has been?)... and that you want to be able to merge repeated such
> conditionals. 

This.. So the 'static' branches have been upstream and in use ever since
GCC added asm-goto, it was in fact the driving force to get asm-goto
implemented. This was 2010 according to git history.

So we emit, using asm goto, either a "NOP5" or "JMP.d32" (x86 speaking),
and a special section entry into which we encode the key address and the
instruction address and the jump target.

GCC, not knowing what the asm does, only sees the 2 edges and all is
well.

Then, at runtime, when we decide we want the other edge for a given key,
we iterate our section and rewrite the code to either nop5 or jmp.d32
with the correct jump target.

> It's kind of the opposite of "volatile" - something that the user is
> happy for the compiler to treat as not changing much, as opposed to
> something the user is warning the compiler about changing from under
> it.  A "const-ish" value?

Just so. Encoded in text, not data.

> Sorry if I'm being incoherent; I'm kind of thinking aloud here.

No problem, we're way outside of what is generally considered normal,
and I did somewhat assume people were familiar with our 'dodgy'
construct (some on this list are more than others).

I hope it's all a little clearer now.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 19:33       ` Nick Desaulniers
@ 2021-04-09 20:11         ` Peter Zijlstra
  2021-04-10 17:02         ` Segher Boessenkool
  1 sibling, 0 replies; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-09 20:11 UTC (permalink / raw)
  To: Nick Desaulniers
  Cc: Florian Weimer, Ard Biesheuvel, linux-toolchains,
	Linux Kernel Mailing List, Josh Poimboeuf, Jason Baron,
	Steven Rostedt (VMware),
	Segher Boessenkool, dmalcolm

On Fri, Apr 09, 2021 at 12:33:29PM -0700, Nick Desaulniers wrote:
> On Fri, Apr 9, 2021 at 4:18 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Fri, Apr 09, 2021 at 12:55:18PM +0200, Florian Weimer wrote:
> > > * Ard Biesheuvel:
> > >
> > > > Wouldn't that require the compiler to interpret the contents of the
> > > > asm() block?
> > >
> > > Yes and no.  It would require proper toolchain support, so in this case
> > > a new ELF relocation type, with compiler, assembler, and linker support
> > > to generate those relocations and process them.  As far as I understand
> > > it, the kernel doesn't do things this way.
> >
> > I don't think that all is needed. All we need is for the compiler to
> > recognise that:
> >
> >         if (cond) {
> >                 stmt-A;
> >         }
> >         if (cond) {
> >                 stmt-B;
> >         }
> >
> > both cond are equivalent and hence can merge the blocks like:
> >
> >         if (cond) {
> >                 stmt-A;
> >                 stmt-B;
> >         }
> >
> > But because @cond is some super opaque asm crap, the compiler throws up
> > it's imaginry hands and says it cannot possibly tell and leaves them as
> > is.
> 
> Right, because if `cond` has side effects (such as is implied by asm
> statements that are volatile qualified), suddenly those side effects
> would only occur once whereas previously they occurred twice.
> 
> Since asm goto is implicitly volatile qualified, it sounds like
> removing the implicit volatile qualifier from asm goto might help?
> Then if there were side effects but you forgot to inform the compiler
> that there were via an explicit volatile qualifier, and it performed
> the suggested merge, oh well.

So, as mentioned elsewhere in this thread, it would be nice if either
the pure or const function attribute could over-ride/constrain that
volatile side effect.

I'm fine with things going side-ways if we get it wrong, that's more or
less the game we're playing anyway ;-)

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 20:09                   ` Peter Zijlstra
@ 2021-04-09 21:07                     ` David Malcolm
  2021-04-09 21:39                       ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: David Malcolm @ 2021-04-09 21:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, 2021-04-09 at 22:09 +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 03:21:49PM -0400, David Malcolm wrote:
> > [Caveat: I'm a gcc developer, not a kernel expert]
> > 
> > But it's not *quite* a global constant, or presumably you would be
> > simply using a global constant, right?  As the optimizer gets
> > smarter,
> > you don't want to have it one day decide that actually it really is
> > constant, and optimize away everything at compile-time (e.g. when
> > LTO
> > is turned on, or whatnot).
> 
> Right; as I said, the result is not a constant, but any invocation
> ever,
> will return the same result. Small but subtle difference :-)
> 
> > I get the impression that you're resorting to assembler because
> > you're
> > pushing beyond what the C language can express. 
> 
> Of course :-) I tend to always push waaaaay past what C considers
> sane.
> Lets say I'm firmly in the C-as-Optimizing-Assembler camp :-)

Yeah, I got that :)

> > Taking things to a slightly higher level, am I right in thinking
> > that
> > what you're trying to achieve is a control flow construct that
> > almost
> > always takes one of the given branches, but which can (very rarely)
> > be
> > switched to permanently take one of the other branches, and that
> > you
> > want the lowest possible overhead for the common case where the
> > control flow hasn't been touched yet? 
> 
> Correct, that's what it is. We do runtime code patching to flip the
> branch if/when needed. We've been doing this for many many years now.
> 
> The issue of today is all this clever stuff defeating some simple
> optimizations.

It's certainly clever - though, if you'll forgive me, that's not always
a good thing :)

> > (and presumably little overhead for when it
> > has been?)... and that you want to be able to merge repeated such
> > conditionals. 
> 
> This.. So the 'static' branches have been upstream and in use ever
> since
> GCC added asm-goto, it was in fact the driving force to get asm-goto
> implemented. This was 2010 according to git history.
> 
> So we emit, using asm goto, either a "NOP5" or "JMP.d32" (x86
> speaking),
> and a special section entry into which we encode the key address and
> the
> instruction address and the jump target.
> 
> GCC, not knowing what the asm does, only sees the 2 edges and all is
> well.
> 
> Then, at runtime, when we decide we want the other edge for a given
> key,
> we iterate our section and rewrite the code to either nop5 or jmp.d32
> with the correct jump target.
> 
> > It's kind of the opposite of "volatile" - something that the user
> > is
> > happy for the compiler to treat as not changing much, as opposed to
> > something the user is warning the compiler about changing from
> > under
> > it.  A "const-ish" value?
> 
> Just so. Encoded in text, not data.
> 
> > Sorry if I'm being incoherent; I'm kind of thinking aloud here.
> 
> No problem, we're way outside of what is generally considered normal,
> and I did somewhat assume people were familiar with our 'dodgy'
> construct (some on this list are more than others).
> 
> I hope it's all a little clearer now.

Yeah.  This is actually on two mailing lists; I'm only subscribed to
linux-toolchains, which AIUI is about sharing ideas between Linux and
the toolchains.

You've built a very specific thing out of asm-goto to fulfil the tough
requirements you outlined above - as well as the nops, there's a thing
in another section to contend with.

How to merge these asm-goto constructs?

Doing so feels very special-case to the kernel and not something that
other GCC users would find useful.

I can imagine a GCC plugin that implemented a custom optimization pass
for that - basically something that spots the asm-gotos in the gimple
IR and optimizes away duplicates by replacing them with jumps, but
having read about Linus's feelings about GCC plugins recently:
  https://lwn.net/Articles/851090/
I suspect that that isn't going to fly (and if you're going down the
route of adding an optimization pass via a plugin, there's probably a
way to do that that doesn't involve asm).  In theory, something to
optimize the asm-gotos could be relatively simple, but that said, we
don't really have a GCC plugin API; all of our internal APIs are
exposed, and are liable to change from release to release, which I know
is a pain (I've managed to break one of my own plugins with one of my
own API changes at least once).

Hope this is constructive
Dave



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 21:07                     ` David Malcolm
@ 2021-04-09 21:39                       ` Peter Zijlstra
  2021-04-22 11:48                         ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-09 21:39 UTC (permalink / raw)
  To: David Malcolm
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

On Fri, Apr 09, 2021 at 05:07:15PM -0400, David Malcolm wrote:

> You've built a very specific thing out of asm-goto to fulfil the tough
> requirements you outlined above - as well as the nops, there's a thing
> in another section to contend with.
> 
> How to merge these asm-goto constructs?

By calling the function less, you emit less of them. Which then brings
us back to the whole pure/const thing.

> Doing so feels very special-case to the kernel and not something that
> other GCC users would find useful.

Doesn't it boil down to 'fixing' the pure/const vs asm-goto interaction?
I could imagine that having that interaction fixed could allow other
creative uses.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: static_branch/jump_label vs branch merging
  2021-04-09 13:48             ` David Malcolm
  2021-04-09 18:40               ` Peter Zijlstra
@ 2021-04-10 12:44               ` David Laight
  1 sibling, 0 replies; 25+ messages in thread
From: David Laight @ 2021-04-10 12:44 UTC (permalink / raw)
  To: 'David Malcolm', Peter Zijlstra
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware)

From: David Malcolm
> Sent: 09 April 2021 14:49
...
> With the caveat that my knowledge of GCC's middle-end is mostly about
> implementing warnings, rather than optimization, I did some
> experimentation, with gcc trunk on x86_64 FWIW.
> 
> Given:
> 
> int __attribute__((pure)) foo(void);
> 
> int t(void)
> {
>   int a;
           = 0;
>   if (foo())
>     a++;
>   if (foo())
>     a++;
>   if (foo())
>     a++;
>   return a;
> }
> 
> At -O1 and above this is optimized to a single call to foo, returning 0
> or 3 accordingly.

Interesting horrid idea.
The code generated for the above should be:
	call	foo
	jz	label
So objtool could find the relocation entries for 'foo'
and use that information to replace the call instruction
(and maybe the jz) with a suitable alternate instruction
sequence.

Although that might end up with a game of 'whack-a-mole'
on the perverse instruction sequences the compiler
generates.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 19:33       ` Nick Desaulniers
  2021-04-09 20:11         ` Peter Zijlstra
@ 2021-04-10 17:02         ` Segher Boessenkool
  1 sibling, 0 replies; 25+ messages in thread
From: Segher Boessenkool @ 2021-04-10 17:02 UTC (permalink / raw)
  To: Nick Desaulniers
  Cc: Peter Zijlstra, Florian Weimer, Ard Biesheuvel, linux-toolchains,
	Linux Kernel Mailing List, Josh Poimboeuf, Jason Baron,
	Steven Rostedt (VMware),
	dmalcolm

On Fri, Apr 09, 2021 at 12:33:29PM -0700, Nick Desaulniers wrote:
> Since asm goto is implicitly volatile qualified, it sounds like
> removing the implicit volatile qualifier from asm goto might help?
> Then if there were side effects but you forgot to inform the compiler
> that there were via an explicit volatile qualifier, and it performed
> the suggested merge, oh well.

"asm goto" without outputs is always volatile, just like any other asm
without outputs (if it wasn't, the compiler would always delete every
asm without outputs!)


Segher

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-09 21:39                       ` Peter Zijlstra
@ 2021-04-22 11:48                         ` Peter Zijlstra
  2021-04-22 17:08                           ` Segher Boessenkool
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-22 11:48 UTC (permalink / raw)
  To: David Malcolm
  Cc: Ard Biesheuvel, linux-toolchains, Linux Kernel Mailing List,
	Josh Poimboeuf, Jason Baron, Steven Rostedt (VMware),
	yuanzhaoxiong

On Fri, Apr 09, 2021 at 11:39:49PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 05:07:15PM -0400, David Malcolm wrote:
> 
> > You've built a very specific thing out of asm-goto to fulfil the tough
> > requirements you outlined above - as well as the nops, there's a thing
> > in another section to contend with.
> > 
> > How to merge these asm-goto constructs?
> 
> By calling the function less, you emit less of them. Which then brings
> us back to the whole pure/const thing.
> 
> > Doing so feels very special-case to the kernel and not something that
> > other GCC users would find useful.
> 
> Doesn't it boil down to 'fixing' the pure/const vs asm-goto interaction?
> I could imagine that having that interaction fixed could allow other
> creative uses.

Here is another variant:

  https://lore.kernel.org/lkml/830177B0-45E0-4768-80AB-A99B85D3A52F@baidu.com/

Can we please have a __pure__ attribute that is prescriptive and not a
hint the compiler is free to ignore for $raisins ?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-22 11:48                         ` Peter Zijlstra
@ 2021-04-22 17:08                           ` Segher Boessenkool
  2021-04-22 17:49                             ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Segher Boessenkool @ 2021-04-22 17:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Malcolm, Ard Biesheuvel, linux-toolchains,
	Linux Kernel Mailing List, Josh Poimboeuf, Jason Baron,
	Steven Rostedt (VMware),
	yuanzhaoxiong

On Thu, Apr 22, 2021 at 01:48:39PM +0200, Peter Zijlstra wrote:
> Can we please have a __pure__ attribute that is prescriptive and not a
> hint the compiler is free to ignore for $raisins ?

What does that mean?  What actual semantics do you want it to have?

The "pure" attribute means the compiler can assume this function does
not have side effects.  But in general (and in practice in many cases)
there is no way the compiler can actually check that, if that is what
you were asking for.

And any such checking will depend on optimisation level and related
flags, making that a no-go anyway.


Segher

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-22 17:08                           ` Segher Boessenkool
@ 2021-04-22 17:49                             ` Peter Zijlstra
  2021-04-22 18:31                               ` Segher Boessenkool
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-22 17:49 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: David Malcolm, Ard Biesheuvel, linux-toolchains,
	Linux Kernel Mailing List, Josh Poimboeuf, Jason Baron,
	Steven Rostedt (VMware),
	yuanzhaoxiong

On Thu, Apr 22, 2021 at 12:08:20PM -0500, Segher Boessenkool wrote:
> On Thu, Apr 22, 2021 at 01:48:39PM +0200, Peter Zijlstra wrote:
> > Can we please have a __pure__ attribute that is prescriptive and not a
> > hint the compiler is free to ignore for $raisins ?
> 
> What does that mean?  What actual semantics do you want it to have?

I want a function marked as pure to be treated as such, unconditionally.

> The "pure" attribute means the compiler can assume this function does
> not have side effects.  But in general (and in practice in many cases)
> there is no way the compiler can actually check that, if that is what
> you were asking for.

Right, so currently the pure attribute gets ignored by the compiler
because of various reasons, one of them being an asm volatile ("") being
present somewhere inside it (AFAIU).

This means the compiler will emit multiple calls to the function, which
casuses loss in optimization possibilities; we've had
branch-eleminiation and hoisting as practical examples now.

That is; AFAICT the compiler sees the pure attribute and decides it was
wrongly applied because it cannot tell what the asm is doing; I want
that reversed such that it will assume the asm abides by the pure.

Does this mean we can have invalid code generation when we faultily
mark things pure? Yes, but then it's our own damn fault for sticking on
pure in the first place.

In short; I want pure (or really_pure if you want a second attribute) to
be a do-what-I-tell-you-already and not a
only-if-you-think-you-can-prove-I-didn't-make-a-mistake kinda knob. A
little bit like inline vs always_inline.

> And any such checking will depend on optimisation level and related
> flags, making that a no-go anyway.

Realistically I'm only bothered about -O2 and up since that's what we
build the kernel with. Obviously one doesn't care about optimizations
being lost when build with -O0.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-22 17:49                             ` Peter Zijlstra
@ 2021-04-22 18:31                               ` Segher Boessenkool
  2021-04-26 17:13                                 ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Segher Boessenkool @ 2021-04-22 18:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Malcolm, Ard Biesheuvel, linux-toolchains,
	Linux Kernel Mailing List, Josh Poimboeuf, Jason Baron,
	Steven Rostedt (VMware),
	yuanzhaoxiong

On Thu, Apr 22, 2021 at 07:49:23PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 22, 2021 at 12:08:20PM -0500, Segher Boessenkool wrote:
> > On Thu, Apr 22, 2021 at 01:48:39PM +0200, Peter Zijlstra wrote:
> > > Can we please have a __pure__ attribute that is prescriptive and not a
> > > hint the compiler is free to ignore for $raisins ?
> > 
> > What does that mean?  What actual semantics do you want it to have?
> 
> I want a function marked as pure to be treated as such, unconditionally.
> 
> > The "pure" attribute means the compiler can assume this function does
> > not have side effects.  But in general (and in practice in many cases)
> > there is no way the compiler can actually check that, if that is what
> > you were asking for.
> 
> Right, so currently the pure attribute gets ignored by the compiler
> because of various reasons, one of them being an asm volatile ("") being
> present somewhere inside it (AFAIU).

In general, the compiler only sees the *declaration* of the function, so
it cannot do such a thing.

> Does this mean we can have invalid code generation when we faultily
> mark things pure? Yes, but then it's our own damn fault for sticking on
> pure in the first place.

Nope, you have undefined behaviour in that case, and you get to keep
all N pieces, the compiler cannot do anything wrong in such a case :-)

> In short; I want pure (or really_pure if you want a second attribute) to

You cannot make the meaning of "pure" different from what it has been
historically, because existing programs will no longer build (or worse,
start behaving differently).

> be a do-what-I-tell-you-already and not a
> only-if-you-think-you-can-prove-I-didn't-make-a-mistake kinda knob. A
> little bit like inline vs always_inline.

It sounds like you want it to behave like attribute((pure)) already is
documented as doing.  Please open a PR?  https://gcc.gnu.org/bugs.html
(We need buildable stand-alone example code, with what flags to use, and
something like what should happen and what did happen).

> > And any such checking will depend on optimisation level and related
> > flags, making that a no-go anyway.
> 
> Realistically I'm only bothered about -O2 and up since that's what we
> build the kernel with. Obviously one doesn't care about optimizations
> being lost when build with -O0.

GCC is used for other things as well, not just for building Linux ;-)


Segher

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: static_branch/jump_label vs branch merging
  2021-04-22 18:31                               ` Segher Boessenkool
@ 2021-04-26 17:13                                 ` Peter Zijlstra
  0 siblings, 0 replies; 25+ messages in thread
From: Peter Zijlstra @ 2021-04-26 17:13 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: David Malcolm, Ard Biesheuvel, linux-toolchains,
	Linux Kernel Mailing List, Josh Poimboeuf, Jason Baron,
	Steven Rostedt (VMware),
	yuanzhaoxiong

On Thu, Apr 22, 2021 at 01:31:51PM -0500, Segher Boessenkool wrote:
> It sounds like you want it to behave like attribute((pure)) already is
> documented as doing.  Please open a PR?  https://gcc.gnu.org/bugs.html
> (We need buildable stand-alone example code, with what flags to use, and
> something like what should happen and what did happen).


Something like so?

---

gcc (Debian 10.2.1-6) 10.2.1 20210110

Clear fail on both counts, the first emits _3_ nops, where 1 is
expected, and the second emits the nop _inside_ the loop.

0000000000001180 <elide_branches>:
1180:       48 83 ec 08             sub    $0x8,%rsp
1184:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
1189:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
118e:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
1193:       48 83 c4 08             add    $0x8,%rsp
1197:       c3                      retq
1198:       0f 1f 84 00 00 00 00 00         nopl   0x0(%rax,%rax,1)
11a0:       48 8d 3d 5d 0e 00 00    lea    0xe5d(%rip),%rdi        # 2004 <_IO_stdin_used+0x4>
11a7:       31 c0                   xor    %eax,%eax
11a9:       e8 a2 fe ff ff          callq  1050 <printf@plt>
11ae:       eb d9                   jmp    1189 <elide_branches+0x9>
11b0:       48 8d 3d 5a 0e 00 00    lea    0xe5a(%rip),%rdi        # 2011 <_IO_stdin_used+0x11>
11b7:       48 83 c4 08             add    $0x8,%rsp
11bb:       e9 80 fe ff ff          jmpq   1040 <puts@plt>
11c0:       48 8d 3d 45 0e 00 00    lea    0xe45(%rip),%rdi        # 200c <_IO_stdin_used+0xc>
11c7:       31 c0                   xor    %eax,%eax
11c9:       e8 82 fe ff ff          callq  1050 <printf@plt>
11ce:       eb be                   jmp    118e <elide_branches+0xe>

00000000000011d0 <hoist>:
11d0:       53                      push   %rbx
11d1:       31 db                   xor    %ebx,%ebx
11d3:       eb 16                   jmp    11eb <hoist+0x1b>
11d5:       0f 1f 00                nopl   (%rax)
11d8:       89 de                   mov    %ebx,%esi
11da:       48 8d 3d 36 0e 00 00    lea    0xe36(%rip),%rdi        # 2017 <_IO_stdin_used+0x17>
11e1:       31 c0                   xor    %eax,%eax
11e3:       83 c3 01                add    $0x1,%ebx
11e6:       e8 65 fe ff ff          callq  1050 <printf@plt>
11eb:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
11f0:       b8 0a 00 00 00          mov    $0xa,%eax
11f5:       39 c3                   cmp    %eax,%ebx
11f7:       7c df                   jl     11d8 <hoist+0x8>
11f9:       bf 0a 00 00 00          mov    $0xa,%edi
11fe:       5b                      pop    %rbx
11ff:       e9 2c fe ff ff          jmpq   1030 <putchar@plt>
1204:       0f 1f 40 00             nopl   0x0(%rax)
1208:       b8 05 00 00 00          mov    $0x5,%eax
120d:       eb e6                   jmp    11f5 <hoist+0x25>
120f:       90                      nop

---

/* gcc -O2 -o pure-fail pure-fail.c */

#include <stdio.h>
#include <stdbool.h>

#define __pure		                __attribute__((__pure__))

#define __stringify_1(x...)	#x
#define __stringify(x...)	__stringify_1(x)

#ifndef __x86_64__
#define BYTES_NOP4	0x8d,0x74,0x26,0x00
#define BYTES_NOP5	0x3e,BYTES_NOP4
#else
#define BYTES_NOP5	0x0f,0x1f,0x44,0x00,0x00
#endif

# define __ASM_FORM(x)	" " __stringify(x) " "
# define __ASM_FORM_RAW(x)     __stringify(x)
# define __ASM_FORM_COMMA(x) " " __stringify(x) ","

#ifndef __x86_64__
/* 32 bit */
# define __ASM_SEL(a,b) __ASM_FORM(a)
# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
#else
/* 64 bit */
# define __ASM_SEL(a,b) __ASM_FORM(b)
# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
#endif

#define __ASM_SIZE(inst, ...)	__ASM_SEL(inst##l##__VA_ARGS__, \
					  inst##q##__VA_ARGS__)
#define __ASM_REG(reg)         __ASM_SEL_RAW(e##reg, r##reg)

#define _ASM_PTR	__ASM_SEL(.long, .quad)
#define _ASM_ALIGN	__ASM_SEL(.balign 4, .balign 8)

#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)

/* --- */

struct static_key {
	int enabled;
};

#define STATIC_KEY_INIT_TRUE	{ .enabled = 1, }
#define STATIC_KEY_INIT_FALSE	{ .enabled = 0, }

static __always_inline __pure bool arch_static_branch(struct static_key * const key, const bool branch)
{
	asm_volatile_goto("1:"
		".byte " __stringify(BYTES_NOP5) "\n\t"
		".pushsection __jump_table,  \"aw\" \n\t"
		_ASM_ALIGN "\n\t"
		".long 1b - ., %l[l_yes] - . \n\t"
		_ASM_PTR "%c0 + %c1 - .\n\t"
		".popsection \n\t"
		: :  "i" (key), "i" (branch) : : l_yes);

	return false;
l_yes:
	return true;
}


static __always_inline __pure bool static_key_false(struct static_key * const key)
{
	return arch_static_branch(key, false);
}

static __always_inline __pure bool static_key_true(struct static_key * const key)
{
	return !arch_static_branch(key, true);
}



/* --- */


static struct static_key key_A = STATIC_KEY_INIT_FALSE;

/*
 * Expect:
 *
 *	if (static_key_false(&key_A)) {
 *		printf("ponies ");
 *		printf("are ");
 *		printf("small\n");
 *	}
 */
void elide_branches(void)
{
	if (static_key_false(&key_A))
		printf("ponies ");
	if (static_key_false(&key_A))
		printf("are ");
	if (static_key_false(&key_A))
		printf("small\n");
}


/* --- */

static struct static_key key_B = STATIC_KEY_INIT_TRUE;

static __pure int count(void)
{
	if (static_key_true(&key_B))
		return 10;
	return 5;
}

/*
 * Expect:
 *
 *	tmp = count();
 *	for (i = 0; i < tmp; i++)
 *		printf("%d, ", i);
 *	printf("\n");
 */
void hoist(void)
{
	int i;

	for (i = 0; i < count(); i++)
		printf("%d, ", i);
	printf("\n");
}


/* --- */

int main(int argc, char **argv)
{
	elide_branches();
	hoist();

	return 0;
}

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2021-04-26 17:14 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-08 16:52 static_branch/jump_label vs branch merging Peter Zijlstra
2021-04-09  9:57 ` Ard Biesheuvel
2021-04-09 10:55   ` Florian Weimer
2021-04-09 11:16     ` Peter Zijlstra
2021-04-09 19:33       ` Nick Desaulniers
2021-04-09 20:11         ` Peter Zijlstra
2021-04-10 17:02         ` Segher Boessenkool
2021-04-09 11:12   ` Peter Zijlstra
2021-04-09 11:55     ` David Malcolm
2021-04-09 12:03       ` Peter Zijlstra
2021-04-09 13:01         ` Peter Zijlstra
2021-04-09 13:13           ` Peter Zijlstra
2021-04-09 13:48             ` David Malcolm
2021-04-09 18:40               ` Peter Zijlstra
2021-04-09 19:21                 ` David Malcolm
2021-04-09 20:09                   ` Peter Zijlstra
2021-04-09 21:07                     ` David Malcolm
2021-04-09 21:39                       ` Peter Zijlstra
2021-04-22 11:48                         ` Peter Zijlstra
2021-04-22 17:08                           ` Segher Boessenkool
2021-04-22 17:49                             ` Peter Zijlstra
2021-04-22 18:31                               ` Segher Boessenkool
2021-04-26 17:13                                 ` Peter Zijlstra
2021-04-10 12:44               ` David Laight
2021-04-09 13:03 ` Segher Boessenkool

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).