All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86/build: Use new .nops directive when available
@ 2018-08-15 17:57 Andrew Cooper
  2018-08-16  9:55 ` Roger Pau Monné
  2018-08-17 12:45 ` Jan Beulich
  0 siblings, 2 replies; 15+ messages in thread
From: Andrew Cooper @ 2018-08-15 17:57 UTC (permalink / raw)
  To: Xen-devel; +Cc: Andrew Cooper, Roger Pau Monné, Wei Liu, Jan Beulich

Newer versions of binutils are capable of emitting an exact number bytes worth
of optimised nops, which are P6 nops.  Use this in preference to .skip when
available.

Check at boot time whether the toolchain nops are the correct for the running
hardware, andskip optimising nops entirely when possible.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wei.liu2@citrix.com>
---
 xen/arch/x86/Rules.mk                 |  4 ++++
 xen/arch/x86/alternative.c            | 20 +++++++++++++++++++-
 xen/include/asm-x86/alternative-asm.h | 12 +++++++++++-
 xen/include/asm-x86/alternative.h     | 11 +++++++++--
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk
index ac585a3..c84ed20 100644
--- a/xen/arch/x86/Rules.mk
+++ b/xen/arch/x86/Rules.mk
@@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
 $(call as-option-add,CFLAGS,CC,\
     ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
 
+# Check to see whether the assmbler supports the .nop directive.
+$(call as-option-add,CFLAGS,CC,\
+    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)
+
 CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables
 
 # Xen doesn't use SSE interally.  If the compiler supports it, also skip the
diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
index 0ef7a8b..2c844d6 100644
--- a/xen/arch/x86/alternative.c
+++ b/xen/arch/x86/alternative.c
@@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons
 
 static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;
 
+#ifdef HAVE_AS_NOP_DIRECTIVE
+
+/* Nops in .init.rodata to compare against the runtime ideal nops. */
+asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
+      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
+      ".popsection\n\t");
+extern char toolchain_nops[ASM_NOP_MAX];
+static bool __read_mostly toolchain_nops_are_ideal;
+
+#else
+# define toolchain_nops_are_ideal false
+#endif
+
 static void __init arch_init_ideal_nops(void)
 {
     switch ( boot_cpu_data.x86_vendor )
@@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void)
             ideal_nops = k8_nops;
         break;
     }
+
+#ifdef HAVE_AS_NOP_DIRECTIVE
+    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
+        toolchain_nops_are_ideal = true;
+#endif
 }
 
 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
@@ -209,7 +227,7 @@ void init_or_livepatch apply_alternatives(struct alt_instr *start,
             base->priv = 1;
 
             /* Nothing useful to do? */
-            if ( a->pad_len <= 1 )
+            if ( toolchain_nops_are_ideal || a->pad_len <= 1 )
                 continue;
 
             add_nops(buf, a->pad_len);
diff --git a/xen/include/asm-x86/alternative-asm.h b/xen/include/asm-x86/alternative-asm.h
index 0b61516..0d6fb4b 100644
--- a/xen/include/asm-x86/alternative-asm.h
+++ b/xen/include/asm-x86/alternative-asm.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_ALTERNATIVE_ASM_H_
 #define _ASM_X86_ALTERNATIVE_ASM_H_
 
+#include <asm/nops.h>
+
 #ifdef __ASSEMBLY__
 
 /*
@@ -19,6 +21,14 @@
     .byte 0 /* priv */
 .endm
 
+.macro mknops nr_bytes
+#ifdef HAVE_AS_NOP_DIRECTIVE
+    .nops \nr_bytes, ASM_NOP_MAX
+#else
+    .skip \nr_bytes, 0x90
+#endif
+.endm
+
 /* GAS's idea of true is -1, while Clang's idea is 1. */
 #ifdef HAVE_AS_NEGATIVE_TRUE
 # define as_true(x) (-(x))
@@ -29,7 +39,7 @@
 #define decl_orig(insn, padding)                  \
  .L\@_orig_s: insn; .L\@_orig_e:                  \
  .L\@_diff = padding;                             \
- .skip as_true(.L\@_diff > 0) * .L\@_diff, 0x90;  \
+ mknops (as_true(.L\@_diff > 0) * .L\@_diff);     \
  .L\@_orig_p:
 
 #define orig_len               (.L\@_orig_e       -     .L\@_orig_s)
diff --git a/xen/include/asm-x86/alternative.h b/xen/include/asm-x86/alternative.h
index 619472e..84b4854 100644
--- a/xen/include/asm-x86/alternative.h
+++ b/xen/include/asm-x86/alternative.h
@@ -2,7 +2,6 @@
 #define __X86_ALTERNATIVE_H__
 
 #include <asm/alternative-asm.h>
-#include <asm/nops.h>
 
 #ifndef __ASSEMBLY__
 #include <xen/stringify.h>
@@ -27,6 +26,14 @@ extern void add_nops(void *insns, unsigned int len);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 extern void alternative_instructions(void);
 
+asm ( ".macro mknops nr_bytes\n\t"
+#ifdef HAVE_AS_NOP_DIRECTIVE
+      ".nops \\nr_bytes, " __stringify(ASM_NOP_MAX) "\n\t"
+#else
+      ".skip \\nr_bytes, 0x90\n\t"
+#endif
+      ".endm\n\t" );
+
 #define alt_orig_len       "(.LXEN%=_orig_e - .LXEN%=_orig_s)"
 #define alt_pad_len        "(.LXEN%=_orig_p - .LXEN%=_orig_e)"
 #define alt_total_len      "(.LXEN%=_orig_p - .LXEN%=_orig_s)"
@@ -46,7 +53,7 @@ extern void alternative_instructions(void);
 #define OLDINSTR(oldinstr, padding)                              \
     ".LXEN%=_orig_s:\n\t" oldinstr "\n .LXEN%=_orig_e:\n\t"      \
     ".LXEN%=_diff = " padding "\n\t"                             \
-    ".skip "AS_TRUE"(.LXEN%=_diff > 0) * .LXEN%=_diff, 0x90\n\t" \
+    "mknops ("AS_TRUE"(.LXEN%=_diff > 0) * .LXEN%=_diff)\n\t"    \
     ".LXEN%=_orig_p:\n\t"
 
 #define OLDINSTR_1(oldinstr, n1)                                 \
-- 
2.1.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-15 17:57 [PATCH] x86/build: Use new .nops directive when available Andrew Cooper
@ 2018-08-16  9:55 ` Roger Pau Monné
  2018-08-16 10:18   ` Jan Beulich
  2018-08-16 10:42   ` Andrew Cooper
  2018-08-17 12:45 ` Jan Beulich
  1 sibling, 2 replies; 15+ messages in thread
From: Roger Pau Monné @ 2018-08-16  9:55 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Wei Liu, Jan Beulich, Xen-devel

On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
> Newer versions of binutils are capable of emitting an exact number bytes worth
> of optimised nops, which are P6 nops.  Use this in preference to .skip when
> available.
> 
> Check at boot time whether the toolchain nops are the correct for the running
> hardware, andskip optimising nops entirely when possible.
               ^ missing space.

TBH I'm not sure I see the benefit of using .nops over using .skip.
Xen needs to do a memcmp in order to check whether the resulting nops
are what Xen considers the more optimized instructions for the CPU
currently running on. Xen can avoid the memcpy by using skip, because
in that case Xen knows exactly the current instructions and there's no
need to memcmp.

I guess the reason is that the memcmp will be done only once, and
hopefully in most cases the assembler generated nops will be the most
optimized version.

> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Wei Liu <wei.liu2@citrix.com>
> ---
>  xen/arch/x86/Rules.mk                 |  4 ++++
>  xen/arch/x86/alternative.c            | 20 +++++++++++++++++++-
>  xen/include/asm-x86/alternative-asm.h | 12 +++++++++++-
>  xen/include/asm-x86/alternative.h     | 11 +++++++++--
>  4 files changed, 43 insertions(+), 4 deletions(-)
> 
> diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk
> index ac585a3..c84ed20 100644
> --- a/xen/arch/x86/Rules.mk
> +++ b/xen/arch/x86/Rules.mk
> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
>  $(call as-option-add,CFLAGS,CC,\
>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
>  
> +# Check to see whether the assmbler supports the .nop directive.
> +$(call as-option-add,CFLAGS,CC,\
> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)

I think I remember commenting on an earlier version of this about the
usage of the CONTROL parameter. I would expect the assembler to
use the most optimized version by default, is that not the case?

> +
>  CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables
>  
>  # Xen doesn't use SSE interally.  If the compiler supports it, also skip the
> diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
> index 0ef7a8b..2c844d6 100644
> --- a/xen/arch/x86/alternative.c
> +++ b/xen/arch/x86/alternative.c
> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons
>  
>  static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;
>  
> +#ifdef HAVE_AS_NOP_DIRECTIVE
> +
> +/* Nops in .init.rodata to compare against the runtime ideal nops. */
> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
> +      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
> +      ".popsection\n\t");
> +extern char toolchain_nops[ASM_NOP_MAX];
> +static bool __read_mostly toolchain_nops_are_ideal;
> +
> +#else
> +# define toolchain_nops_are_ideal false
> +#endif
> +
>  static void __init arch_init_ideal_nops(void)
>  {
>      switch ( boot_cpu_data.x86_vendor )
> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void)
>              ideal_nops = k8_nops;
>          break;
>      }
> +
> +#ifdef HAVE_AS_NOP_DIRECTIVE
> +    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
> +        toolchain_nops_are_ideal = true;
> +#endif

You are only comparing that the biggest nop instruction (9 bytes
AFAICT) generated by the assembler is what Xen believes to be the more
optimized version. What about shorter nops?

I also see a chance that maybe newer assembler versions will at some
point generate more optimized nops, but Xen will replace them with not
so optimized versions if the Xen logic is not so up to date.

>  }
>  
>  /* Use this to add nops to a buffer, then text_poke the whole buffer. */
> @@ -209,7 +227,7 @@ void init_or_livepatch apply_alternatives(struct alt_instr *start,
>              base->priv = 1;
>  
>              /* Nothing useful to do? */
> -            if ( a->pad_len <= 1 )
> +            if ( toolchain_nops_are_ideal || a->pad_len <= 1 )
>                  continue;
>  
>              add_nops(buf, a->pad_len);
> diff --git a/xen/include/asm-x86/alternative-asm.h b/xen/include/asm-x86/alternative-asm.h
> index 0b61516..0d6fb4b 100644
> --- a/xen/include/asm-x86/alternative-asm.h
> +++ b/xen/include/asm-x86/alternative-asm.h
> @@ -1,6 +1,8 @@
>  #ifndef _ASM_X86_ALTERNATIVE_ASM_H_
>  #define _ASM_X86_ALTERNATIVE_ASM_H_
>  
> +#include <asm/nops.h>
> +
>  #ifdef __ASSEMBLY__
>  
>  /*
> @@ -19,6 +21,14 @@
>      .byte 0 /* priv */
>  .endm
>  
> +.macro mknops nr_bytes
> +#ifdef HAVE_AS_NOP_DIRECTIVE
> +    .nops \nr_bytes, ASM_NOP_MAX
> +#else
> +    .skip \nr_bytes, 0x90

Use P6_NOP1 instead of open coding 0x90? Or have a

#define DEFAULT_NOP P6_NOP1

And use it instead.

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16  9:55 ` Roger Pau Monné
@ 2018-08-16 10:18   ` Jan Beulich
  2018-08-16 11:57     ` Roger Pau Monné
  2018-08-16 10:42   ` Andrew Cooper
  1 sibling, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2018-08-16 10:18 UTC (permalink / raw)
  To: Roger Pau Monne; +Cc: Andrew Cooper, Wei Liu, Xen-devel

>>> On 16.08.18 at 11:55, <roger.pau@citrix.com> wrote:
> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
>> --- a/xen/arch/x86/Rules.mk
>> +++ b/xen/arch/x86/Rules.mk
>> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
>>  $(call as-option-add,CFLAGS,CC,\
>>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
>>  
>> +# Check to see whether the assmbler supports the .nop directive.
>> +$(call as-option-add,CFLAGS,CC,\
>> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)
> 
> I think I remember commenting on an earlier version of this about the
> usage of the CONTROL parameter. I would expect the assembler to
> use the most optimized version by default, is that not the case?

How could it, without knowing what the target hardware is? And even
if it could, what is considered "most optimized" tends to change every
once in a while (or else we wouldn't have different NOP flavors to
begin with).

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16  9:55 ` Roger Pau Monné
  2018-08-16 10:18   ` Jan Beulich
@ 2018-08-16 10:42   ` Andrew Cooper
  2018-08-16 11:34     ` Jan Beulich
  2018-08-16 14:31     ` Roger Pau Monné
  1 sibling, 2 replies; 15+ messages in thread
From: Andrew Cooper @ 2018-08-16 10:42 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: Wei Liu, Jan Beulich, Xen-devel

On 16/08/18 10:55, Roger Pau Monné wrote:
> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
>> Newer versions of binutils are capable of emitting an exact number bytes worth
>> of optimised nops, which are P6 nops.  Use this in preference to .skip when
>> available.
>>
>> Check at boot time whether the toolchain nops are the correct for the running
>> hardware, andskip optimising nops entirely when possible.
>                ^ missing space.
>
> TBH I'm not sure I see the benefit of using .nops over using .skip.

In this case, or in general?

In general, so we don't need to self/cross modify the alternatives
points which aren't patched.

In this case, because it is the .nops directive we're using to insert nops.

> Xen needs to do a memcmp in order to check whether the resulting nops
> are what Xen considers the more optimized instructions for the CPU
> currently running on. Xen can avoid the memcpy by using skip, because
> in that case Xen knows exactly the current instructions and there's no
> need to memcmp.

I'm afraid I don't understand what point you are attempting to make here.

> I guess the reason is that the memcmp will be done only once, and
> hopefully in most cases the assembler generated nops will be the most
> optimized version.

The memcmp() is once during init, and you've got to be on very ancient
hardware for the toolchain nops to not be the correct ones.  I'm going
to conservatively estimate that 98% of hardware running Xen will have P6
nops as ideal.

>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
>> CC: Roger Pau Monné <roger.pau@citrix.com>
>> CC: Wei Liu <wei.liu2@citrix.com>
>> ---
>>  xen/arch/x86/Rules.mk                 |  4 ++++
>>  xen/arch/x86/alternative.c            | 20 +++++++++++++++++++-
>>  xen/include/asm-x86/alternative-asm.h | 12 +++++++++++-
>>  xen/include/asm-x86/alternative.h     | 11 +++++++++--
>>  4 files changed, 43 insertions(+), 4 deletions(-)
>>
>> diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk
>> index ac585a3..c84ed20 100644
>> --- a/xen/arch/x86/Rules.mk
>> +++ b/xen/arch/x86/Rules.mk
>> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
>>  $(call as-option-add,CFLAGS,CC,\
>>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
>>  
>> +# Check to see whether the assmbler supports the .nop directive.
>> +$(call as-option-add,CFLAGS,CC,\
>> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)
> I think I remember commenting on an earlier version of this about the
> usage of the CONTROL parameter. I would expect the assembler to
> use the most optimized version by default, is that not the case?

Again, I don't understand what you're trying to say.

This expression is like this, because that's how we actually use it.

>
>> +
>>  CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables
>>  
>>  # Xen doesn't use SSE interally.  If the compiler supports it, also skip the
>> diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
>> index 0ef7a8b..2c844d6 100644
>> --- a/xen/arch/x86/alternative.c
>> +++ b/xen/arch/x86/alternative.c
>> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons
>>  
>>  static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;
>>  
>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>> +
>> +/* Nops in .init.rodata to compare against the runtime ideal nops. */
>> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
>> +      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
>> +      ".popsection\n\t");
>> +extern char toolchain_nops[ASM_NOP_MAX];
>> +static bool __read_mostly toolchain_nops_are_ideal;
>> +
>> +#else
>> +# define toolchain_nops_are_ideal false
>> +#endif
>> +
>>  static void __init arch_init_ideal_nops(void)
>>  {
>>      switch ( boot_cpu_data.x86_vendor )
>> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void)
>>              ideal_nops = k8_nops;
>>          break;
>>      }
>> +
>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>> +    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
>> +        toolchain_nops_are_ideal = true;
>> +#endif
> You are only comparing that the biggest nop instruction (9 bytes
> AFAICT) generated by the assembler is what Xen believes to be the more
> optimized version. What about shorter nops?

They are all variations on a theme.

For P6 nops, its the 0f 1f root which is important, which takes a modrm
byte.  Traditionally, its always encoded with eax and uses redundant
memory encodings for longer instructions.

I can't think of any way of detecting if the optimised nops if the
toolchain starts using alternative registers in the encoding, but I
expect this case won't happen in practice.

> I also see a chance that maybe newer assembler versions will at some
> point generate more optimized nops, but Xen will replace them with not
> so optimized versions if the Xen logic is not so up to date.

The nops which are the most optimised are a property of the pipeline. 
Its fixed for released hardware, and vendors would have to have a very
good reason to change it moving forwards, considering how much use the
current nops get in optimised code.

>
>>  }
>>  
>>  /* Use this to add nops to a buffer, then text_poke the whole buffer. */
>> @@ -209,7 +227,7 @@ void init_or_livepatch apply_alternatives(struct alt_instr *start,
>>              base->priv = 1;
>>  
>>              /* Nothing useful to do? */
>> -            if ( a->pad_len <= 1 )
>> +            if ( toolchain_nops_are_ideal || a->pad_len <= 1 )
>>                  continue;
>>  
>>              add_nops(buf, a->pad_len);
>> diff --git a/xen/include/asm-x86/alternative-asm.h b/xen/include/asm-x86/alternative-asm.h
>> index 0b61516..0d6fb4b 100644
>> --- a/xen/include/asm-x86/alternative-asm.h
>> +++ b/xen/include/asm-x86/alternative-asm.h
>> @@ -1,6 +1,8 @@
>>  #ifndef _ASM_X86_ALTERNATIVE_ASM_H_
>>  #define _ASM_X86_ALTERNATIVE_ASM_H_
>>  
>> +#include <asm/nops.h>
>> +
>>  #ifdef __ASSEMBLY__
>>  
>>  /*
>> @@ -19,6 +21,14 @@
>>      .byte 0 /* priv */
>>  .endm
>>  
>> +.macro mknops nr_bytes
>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>> +    .nops \nr_bytes, ASM_NOP_MAX
>> +#else
>> +    .skip \nr_bytes, 0x90
> Use P6_NOP1 instead of open coding 0x90? Or have a

0x90 is the nop instruction, which IIRC has existed since the 8080.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16 10:42   ` Andrew Cooper
@ 2018-08-16 11:34     ` Jan Beulich
  2018-08-16 11:48       ` Andrew Cooper
  2018-08-16 14:31     ` Roger Pau Monné
  1 sibling, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2018-08-16 11:34 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Xen-devel, Wei Liu, Roger Pau Monne

>>> On 16.08.18 at 12:42, <andrew.cooper3@citrix.com> wrote:
> On 16/08/18 10:55, Roger Pau Monné wrote:
>> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
>>> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void)
>>>              ideal_nops = k8_nops;
>>>          break;
>>>      }
>>> +
>>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>>> +    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
>>> +        toolchain_nops_are_ideal = true;
>>> +#endif
>> You are only comparing that the biggest nop instruction (9 bytes
>> AFAICT) generated by the assembler is what Xen believes to be the more
>> optimized version. What about shorter nops?
> 
> They are all variations on a theme.
> 
> For P6 nops, its the 0f 1f root which is important, which takes a modrm
> byte.  Traditionally, its always encoded with eax and uses redundant
> memory encodings for longer instructions.
> 
> I can't think of any way of detecting if the optimised nops if the
> toolchain starts using alternative registers in the encoding, but I
> expect this case won't happen in practice.

It's not just the register encoding, but also the maximum single-insn
length that gets generated. Recall that until not very long ago we
had up to 8-byte NOP insns only? The view on the mod (as in ModRM)
usage may vary over time, as may the view on which or how many
prefixes are reasonable to have.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16 11:34     ` Jan Beulich
@ 2018-08-16 11:48       ` Andrew Cooper
  2018-08-16 12:43         ` Jan Beulich
  0 siblings, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2018-08-16 11:48 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Xen-devel, Wei Liu, Roger Pau Monne

On 16/08/18 12:34, Jan Beulich wrote:
>>>> On 16.08.18 at 12:42, <andrew.cooper3@citrix.com> wrote:
>> On 16/08/18 10:55, Roger Pau Monné wrote:
>>> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
>>>> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void)
>>>>              ideal_nops = k8_nops;
>>>>          break;
>>>>      }
>>>> +
>>>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>>>> +    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
>>>> +        toolchain_nops_are_ideal = true;
>>>> +#endif
>>> You are only comparing that the biggest nop instruction (9 bytes
>>> AFAICT) generated by the assembler is what Xen believes to be the more
>>> optimized version. What about shorter nops?
>> They are all variations on a theme.
>>
>> For P6 nops, its the 0f 1f root which is important, which takes a modrm
>> byte.  Traditionally, its always encoded with eax and uses redundant
>> memory encodings for longer instructions.
>>
>> I can't think of any way of detecting if the optimised nops if the
>> toolchain starts using alternative registers in the encoding, but I
>> expect this case won't happen in practice.
> It's not just the register encoding, but also the maximum single-insn
> length that gets generated. Recall that until not very long ago we
> had up to 8-byte NOP insns only? The view on the mod (as in ModRM)
> usage may vary over time, as may the view on which or how many
> prefixes are reasonable to have.

Strictly speaking, the ORM says "encode the least-recently live
register", because all the hint nops are still subject to reg/reg
dependencies.

However, we definitely can't take advantage of this, nor can the
assembler.  Compilers can't either, because the exact length of the nop
depends on other relocations.  Furthermore, the perf improvement from
doing this would be fractional.

IOW, I don't expect we'll realistically see encodings other than the
exact byte layout described in the ORM.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16 10:18   ` Jan Beulich
@ 2018-08-16 11:57     ` Roger Pau Monné
  2018-08-16 12:39       ` Jan Beulich
  0 siblings, 1 reply; 15+ messages in thread
From: Roger Pau Monné @ 2018-08-16 11:57 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Wei Liu, Xen-devel

On Thu, Aug 16, 2018 at 04:18:03AM -0600, Jan Beulich wrote:
> >>> On 16.08.18 at 11:55, <roger.pau@citrix.com> wrote:
> > On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
> >> --- a/xen/arch/x86/Rules.mk
> >> +++ b/xen/arch/x86/Rules.mk
> >> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
> >>  $(call as-option-add,CFLAGS,CC,\
> >>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
> >>  
> >> +# Check to see whether the assmbler supports the .nop directive.
> >> +$(call as-option-add,CFLAGS,CC,\
> >> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)
> > 
> > I think I remember commenting on an earlier version of this about the
> > usage of the CONTROL parameter. I would expect the assembler to
> > use the most optimized version by default, is that not the case?
> 
> How could it, without knowing what the target hardware is? And even
> if it could, what is considered "most optimized" tends to change every
> once in a while (or else we wouldn't have different NOP flavors to
> begin with).

I think I haven't explained myself well. By using the CONTROL
parameter we limit the max length of a nop instruction to 9 bytes
instead of using the maximum supported by the assembler (11 bytes IIRC
for 64bit). Is this done because there are issues with using nops > 9
bytes?

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16 11:57     ` Roger Pau Monné
@ 2018-08-16 12:39       ` Jan Beulich
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2018-08-16 12:39 UTC (permalink / raw)
  To: Roger Pau Monne; +Cc: Andrew Cooper, Wei Liu, Xen-devel

>>> On 16.08.18 at 13:57, <roger.pau@citrix.com> wrote:
> On Thu, Aug 16, 2018 at 04:18:03AM -0600, Jan Beulich wrote:
>> >>> On 16.08.18 at 11:55, <roger.pau@citrix.com> wrote:
>> > On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
>> >> --- a/xen/arch/x86/Rules.mk
>> >> +++ b/xen/arch/x86/Rules.mk
>> >> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid 
> (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
>> >>  $(call as-option-add,CFLAGS,CC,\
>> >>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
>> >>  
>> >> +# Check to see whether the assmbler supports the .nop directive.
>> >> +$(call as-option-add,CFLAGS,CC,\
>> >> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)
>> > 
>> > I think I remember commenting on an earlier version of this about the
>> > usage of the CONTROL parameter. I would expect the assembler to
>> > use the most optimized version by default, is that not the case?
>> 
>> How could it, without knowing what the target hardware is? And even
>> if it could, what is considered "most optimized" tends to change every
>> once in a while (or else we wouldn't have different NOP flavors to
>> begin with).
> 
> I think I haven't explained myself well. By using the CONTROL
> parameter we limit the max length of a nop instruction to 9 bytes
> instead of using the maximum supported by the assembler (11 bytes IIRC
> for 64bit). Is this done because there are issues with using nops > 9
> bytes?

There the problems start that I've hinted at towards Andrew: gas
supports up to 11-byte NOPs despite the SDM saying otherwise at
present. But ORM and SDM disagree as well, for the 3- and 6-byte
variants. Otoh AMD recommends up to 11-byte NOPs for Fam15
and earlier, and suggests even using up to 15-byte ones on Fam17.

Besides that I also wonder whether in 64-bit mode REX prefixes
wouldn't be preferable over operand size override ones.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16 11:48       ` Andrew Cooper
@ 2018-08-16 12:43         ` Jan Beulich
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2018-08-16 12:43 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Xen-devel, Wei Liu, Roger Pau Monne

>>> On 16.08.18 at 13:48, <andrew.cooper3@citrix.com> wrote:
> On 16/08/18 12:34, Jan Beulich wrote:
>>>>> On 16.08.18 at 12:42, <andrew.cooper3@citrix.com> wrote:
>>> On 16/08/18 10:55, Roger Pau Monné wrote:
>>>> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
>>>>> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void)
>>>>>              ideal_nops = k8_nops;
>>>>>          break;
>>>>>      }
>>>>> +
>>>>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>>>>> +    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 
> )
>>>>> +        toolchain_nops_are_ideal = true;
>>>>> +#endif
>>>> You are only comparing that the biggest nop instruction (9 bytes
>>>> AFAICT) generated by the assembler is what Xen believes to be the more
>>>> optimized version. What about shorter nops?
>>> They are all variations on a theme.
>>>
>>> For P6 nops, its the 0f 1f root which is important, which takes a modrm
>>> byte.  Traditionally, its always encoded with eax and uses redundant
>>> memory encodings for longer instructions.
>>>
>>> I can't think of any way of detecting if the optimised nops if the
>>> toolchain starts using alternative registers in the encoding, but I
>>> expect this case won't happen in practice.
>> It's not just the register encoding, but also the maximum single-insn
>> length that gets generated. Recall that until not very long ago we
>> had up to 8-byte NOP insns only? The view on the mod (as in ModRM)
>> usage may vary over time, as may the view on which or how many
>> prefixes are reasonable to have.
> 
> Strictly speaking, the ORM says "encode the least-recently live
> register", because all the hint nops are still subject to reg/reg
> dependencies.
> 
> However, we definitely can't take advantage of this, nor can the
> assembler.

Well, _we_ could, at least when tail padding patched in insns: I very
much hope we know what we've patched in, and hence at least what
registers were used most recently. This is not readily available today,
but could be made so.

>  Compilers can't either, because the exact length of the nop
> depends on other relocations.  Furthermore, the perf improvement from
> doing this would be fractional.

True.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16 10:42   ` Andrew Cooper
  2018-08-16 11:34     ` Jan Beulich
@ 2018-08-16 14:31     ` Roger Pau Monné
  2018-08-16 15:56       ` Andrew Cooper
  1 sibling, 1 reply; 15+ messages in thread
From: Roger Pau Monné @ 2018-08-16 14:31 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Wei Liu, Jan Beulich, Xen-devel

On Thu, Aug 16, 2018 at 11:42:56AM +0100, Andrew Cooper wrote:
> On 16/08/18 10:55, Roger Pau Monné wrote:
> > On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
> >> diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk
> >> index ac585a3..c84ed20 100644
> >> --- a/xen/arch/x86/Rules.mk
> >> +++ b/xen/arch/x86/Rules.mk
> >> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
> >>  $(call as-option-add,CFLAGS,CC,\
> >>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
> >>  
> >> +# Check to see whether the assmbler supports the .nop directive.
> >> +$(call as-option-add,CFLAGS,CC,\
> >> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)
> > I think I remember commenting on an earlier version of this about the
> > usage of the CONTROL parameter. I would expect the assembler to
> > use the most optimized version by default, is that not the case?
> 
> Again, I don't understand what you're trying to say.
> 
> This expression is like this, because that's how we actually use it.

As mentioned in another email, I was wondering why we choose to not
use nop instructions > 9 bytes. The assembler will by default use
nop instructions up to 11 bytes for 64bit code.

> >
> >> +
> >>  CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables
> >>  
> >>  # Xen doesn't use SSE interally.  If the compiler supports it, also skip the
> >> diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
> >> index 0ef7a8b..2c844d6 100644
> >> --- a/xen/arch/x86/alternative.c
> >> +++ b/xen/arch/x86/alternative.c
> >> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons
> >>  
> >>  static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;
> >>  
> >> +#ifdef HAVE_AS_NOP_DIRECTIVE
> >> +
> >> +/* Nops in .init.rodata to compare against the runtime ideal nops. */
> >> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
> >> +      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
> >> +      ".popsection\n\t");
> >> +extern char toolchain_nops[ASM_NOP_MAX];
> >> +static bool __read_mostly toolchain_nops_are_ideal;
> >> +
> >> +#else
> >> +# define toolchain_nops_are_ideal false
> >> +#endif
> >> +
> >>  static void __init arch_init_ideal_nops(void)
> >>  {
> >>      switch ( boot_cpu_data.x86_vendor )
> >> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void)
> >>              ideal_nops = k8_nops;
> >>          break;
> >>      }
> >> +
> >> +#ifdef HAVE_AS_NOP_DIRECTIVE
> >> +    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
> >> +        toolchain_nops_are_ideal = true;
> >> +#endif
> > You are only comparing that the biggest nop instruction (9 bytes
> > AFAICT) generated by the assembler is what Xen believes to be the more
> > optimized version. What about shorter nops?
> 
> They are all variations on a theme.
> 
> For P6 nops, its the 0f 1f root which is important, which takes a modrm
> byte.  Traditionally, its always encoded with eax and uses redundant
> memory encodings for longer instructions.
> 
> I can't think of any way of detecting if the optimised nops if the
> toolchain starts using alternative registers in the encoding, but I
> expect this case won't happen in practice.

I would rather do:

toolchain_nops:
	.nops 1
	.nops 2
	.nops 3
	...
	.nops 9

And then build an assembler_nops[ASM_NOP_MAX+1] like it's done for the
other nops. Then you could do:

toolchain_nops_are_ideal = true;
for ( i = 1; i < ASM_NOP_MAX+1; i++ )
	if ( memcmp(ideal_nops[i], assembler_nops[i], i) )
	{
		toolchain_nops_are_ideal = false;
		break;
	}

In order to make sure all the possible nop sizes are using the
expected optimized version.

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16 14:31     ` Roger Pau Monné
@ 2018-08-16 15:56       ` Andrew Cooper
  2018-08-16 16:36         ` Roger Pau Monné
  0 siblings, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2018-08-16 15:56 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: Wei Liu, Jan Beulich, Xen-devel

On 16/08/18 15:31, Roger Pau Monné wrote:
> On Thu, Aug 16, 2018 at 11:42:56AM +0100, Andrew Cooper wrote:
>> On 16/08/18 10:55, Roger Pau Monné wrote:
>>> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
>>>> diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk
>>>> index ac585a3..c84ed20 100644
>>>> --- a/xen/arch/x86/Rules.mk
>>>> +++ b/xen/arch/x86/Rules.mk
>>>> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
>>>>  $(call as-option-add,CFLAGS,CC,\
>>>>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
>>>>  
>>>> +# Check to see whether the assmbler supports the .nop directive.
>>>> +$(call as-option-add,CFLAGS,CC,\
>>>> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)
>>> I think I remember commenting on an earlier version of this about the
>>> usage of the CONTROL parameter. I would expect the assembler to
>>> use the most optimized version by default, is that not the case?
>> Again, I don't understand what you're trying to say.
>>
>> This expression is like this, because that's how we actually use it.
> As mentioned in another email, I was wondering why we choose to not
> use nop instructions > 9 bytes. The assembler will by default use
> nop instructions up to 11 bytes for 64bit code.

Using more than 9 bytes is suboptimal on some hardware.

Toolchains use long nops (exclusively?) for basic block alignment,
whereas we use use them for executing through because its still faster
than a branch.

>
>>>> +
>>>>  CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables
>>>>  
>>>>  # Xen doesn't use SSE interally.  If the compiler supports it, also skip the
>>>> diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
>>>> index 0ef7a8b..2c844d6 100644
>>>> --- a/xen/arch/x86/alternative.c
>>>> +++ b/xen/arch/x86/alternative.c
>>>> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons
>>>>  
>>>>  static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;
>>>>  
>>>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>>>> +
>>>> +/* Nops in .init.rodata to compare against the runtime ideal nops. */
>>>> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
>>>> +      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
>>>> +      ".popsection\n\t");
>>>> +extern char toolchain_nops[ASM_NOP_MAX];
>>>> +static bool __read_mostly toolchain_nops_are_ideal;
>>>> +
>>>> +#else
>>>> +# define toolchain_nops_are_ideal false
>>>> +#endif
>>>> +
>>>>  static void __init arch_init_ideal_nops(void)
>>>>  {
>>>>      switch ( boot_cpu_data.x86_vendor )
>>>> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void)
>>>>              ideal_nops = k8_nops;
>>>>          break;
>>>>      }
>>>> +
>>>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>>>> +    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
>>>> +        toolchain_nops_are_ideal = true;
>>>> +#endif
>>> You are only comparing that the biggest nop instruction (9 bytes
>>> AFAICT) generated by the assembler is what Xen believes to be the more
>>> optimized version. What about shorter nops?
>> They are all variations on a theme.
>>
>> For P6 nops, its the 0f 1f root which is important, which takes a modrm
>> byte.  Traditionally, its always encoded with eax and uses redundant
>> memory encodings for longer instructions.
>>
>> I can't think of any way of detecting if the optimised nops if the
>> toolchain starts using alternative registers in the encoding, but I
>> expect this case won't happen in practice.
> I would rather do:
>
> toolchain_nops:
> 	.nops 1
> 	.nops 2
> 	.nops 3
> 	...
> 	.nops 9
>
> And then build an assembler_nops[ASM_NOP_MAX+1] like it's done for the
> other nops. Then you could do:
>
> toolchain_nops_are_ideal = true;
> for ( i = 1; i < ASM_NOP_MAX+1; i++ )
> 	if ( memcmp(ideal_nops[i], assembler_nops[i], i) )
> 	{
> 		toolchain_nops_are_ideal = false;
> 		break;
> 	}
>
> In order to make sure all the possible nop sizes are using the
> expected optimized version.

What is the point?  Other than the 0f 1f, it really doesn't matter, and
small variations won't invalidate them as ideal nops.

This check needs to be just good enough to tell whether the toolchain
used P6 or K8 nops, and unless you explicitly built with -march=k8, the
answer is going to be P6.

It does not need to check every variation byte for byte.  Going back to
my original argument for not even doing this basic check, if we happen
to be wrong and the toolchain wrote the other kind of long nops, you
probably won't be able to measure the difference.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-16 15:56       ` Andrew Cooper
@ 2018-08-16 16:36         ` Roger Pau Monné
  0 siblings, 0 replies; 15+ messages in thread
From: Roger Pau Monné @ 2018-08-16 16:36 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Wei Liu, Jan Beulich, Xen-devel

On Thu, Aug 16, 2018 at 04:56:04PM +0100, Andrew Cooper wrote:
> On 16/08/18 15:31, Roger Pau Monné wrote:
> > On Thu, Aug 16, 2018 at 11:42:56AM +0100, Andrew Cooper wrote:
> >> On 16/08/18 10:55, Roger Pau Monné wrote:
> >>> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote:
> >>>> diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk
> >>>> index ac585a3..c84ed20 100644
> >>>> --- a/xen/arch/x86/Rules.mk
> >>>> +++ b/xen/arch/x86/Rules.mk
> >>>> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
> >>>>  $(call as-option-add,CFLAGS,CC,\
> >>>>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
> >>>>  
> >>>> +# Check to see whether the assmbler supports the .nop directive.
> >>>> +$(call as-option-add,CFLAGS,CC,\
> >>>> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)
> >>> I think I remember commenting on an earlier version of this about the
> >>> usage of the CONTROL parameter. I would expect the assembler to
> >>> use the most optimized version by default, is that not the case?
> >> Again, I don't understand what you're trying to say.
> >>
> >> This expression is like this, because that's how we actually use it.
> > As mentioned in another email, I was wondering why we choose to not
> > use nop instructions > 9 bytes. The assembler will by default use
> > nop instructions up to 11 bytes for 64bit code.
> 
> Using more than 9 bytes is suboptimal on some hardware.

OK. But using the same argument isn't it also suboptimal to use 9
bytes on some hardware then also?

What I mean by this is that it would be good to add a comment
somewhere that notes why nop instructions are limited to 9 bytes,
because that's likely to generate the more optimized code on a wide
variety of hardware.

At least it would have helped me.

> Toolchains use long nops (exclusively?) for basic block alignment,
> whereas we use use them for executing through because its still faster
> than a branch.
> 
> >
> >>>> +
> >>>>  CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables
> >>>>  
> >>>>  # Xen doesn't use SSE interally.  If the compiler supports it, also skip the
> >>>> diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
> >>>> index 0ef7a8b..2c844d6 100644
> >>>> --- a/xen/arch/x86/alternative.c
> >>>> +++ b/xen/arch/x86/alternative.c
> >>>> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons
> >>>>  
> >>>>  static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;
> >>>>  
> >>>> +#ifdef HAVE_AS_NOP_DIRECTIVE
> >>>> +
> >>>> +/* Nops in .init.rodata to compare against the runtime ideal nops. */
> >>>> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
> >>>> +      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
> >>>> +      ".popsection\n\t");
> >>>> +extern char toolchain_nops[ASM_NOP_MAX];
> >>>> +static bool __read_mostly toolchain_nops_are_ideal;
> >>>> +
> >>>> +#else
> >>>> +# define toolchain_nops_are_ideal false
> >>>> +#endif
> >>>> +
> >>>>  static void __init arch_init_ideal_nops(void)
> >>>>  {
> >>>>      switch ( boot_cpu_data.x86_vendor )
> >>>> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void)
> >>>>              ideal_nops = k8_nops;
> >>>>          break;
> >>>>      }
> >>>> +
> >>>> +#ifdef HAVE_AS_NOP_DIRECTIVE
> >>>> +    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
> >>>> +        toolchain_nops_are_ideal = true;
> >>>> +#endif
> >>> You are only comparing that the biggest nop instruction (9 bytes
> >>> AFAICT) generated by the assembler is what Xen believes to be the more
> >>> optimized version. What about shorter nops?
> >> They are all variations on a theme.
> >>
> >> For P6 nops, its the 0f 1f root which is important, which takes a modrm
> >> byte.  Traditionally, its always encoded with eax and uses redundant
> >> memory encodings for longer instructions.
> >>
> >> I can't think of any way of detecting if the optimised nops if the
> >> toolchain starts using alternative registers in the encoding, but I
> >> expect this case won't happen in practice.
> > I would rather do:
> >
> > toolchain_nops:
> > 	.nops 1
> > 	.nops 2
> > 	.nops 3
> > 	...
> > 	.nops 9
> >
> > And then build an assembler_nops[ASM_NOP_MAX+1] like it's done for the
> > other nops. Then you could do:
> >
> > toolchain_nops_are_ideal = true;
> > for ( i = 1; i < ASM_NOP_MAX+1; i++ )
> > 	if ( memcmp(ideal_nops[i], assembler_nops[i], i) )
> > 	{
> > 		toolchain_nops_are_ideal = false;
> > 		break;
> > 	}
> >
> > In order to make sure all the possible nop sizes are using the
> > expected optimized version.
> 
> What is the point?  Other than the 0f 1f, it really doesn't matter, and
> small variations won't invalidate them as ideal nops.
> 
> This check needs to be just good enough to tell whether the toolchain
> used P6 or K8 nops, and unless you explicitly built with -march=k8, the
> answer is going to be P6.
> 
> It does not need to check every variation byte for byte.  Going back to
> my original argument for not even doing this basic check, if we happen
> to be wrong and the toolchain wrote the other kind of long nops, you
> probably won't be able to measure the difference.

I have to admit I don't know that much about nop instruction length or
encoding, but again I think it would be nice to add the reasoning
above to the commit as a comment on why only instruction of length 9
is tested.

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-15 17:57 [PATCH] x86/build: Use new .nops directive when available Andrew Cooper
  2018-08-16  9:55 ` Roger Pau Monné
@ 2018-08-17 12:45 ` Jan Beulich
  2018-08-28 17:58   ` Andrew Cooper
  1 sibling, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2018-08-17 12:45 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Xen-devel, Wei Liu, Roger Pau Monne

>>> On 15.08.18 at 19:57, <andrew.cooper3@citrix.com> wrote:
> --- a/xen/arch/x86/Rules.mk
> +++ b/xen/arch/x86/Rules.mk
> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid 
> (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
>  $(call as-option-add,CFLAGS,CC,\
>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
>  
> +# Check to see whether the assmbler supports the .nop directive.
> +$(call as-option-add,CFLAGS,CC,\
> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)

Can this please be HAVE_AS_NOPS_DIRECTIVE?

> --- a/xen/arch/x86/alternative.c
> +++ b/xen/arch/x86/alternative.c
> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] 
> init_or_livepatch_cons
>  
>  static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;
>  
> +#ifdef HAVE_AS_NOP_DIRECTIVE
> +
> +/* Nops in .init.rodata to compare against the runtime ideal nops. */
> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
> +      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
> +      ".popsection\n\t");

Any particular reason not to put them in .init.text?

> @@ -27,6 +26,14 @@ extern void add_nops(void *insns, unsigned int len);
>  extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
>  extern void alternative_instructions(void);
>  
> +asm ( ".macro mknops nr_bytes\n\t"
> +#ifdef HAVE_AS_NOP_DIRECTIVE
> +      ".nops \\nr_bytes, " __stringify(ASM_NOP_MAX) "\n\t"
> +#else
> +      ".skip \\nr_bytes, 0x90\n\t"
> +#endif
> +      ".endm\n\t" );

Did you take a look at "x86/alternatives: allow using assembler macros
in favor of C ones" yet? Perhaps this redundant macro definition could
be avoided that way? I'm not going to exclude though that some more
work might be needed to get there.

Anyway, I'm in principle fine with the change, irrespective of the other
discussion we've had, so with the name change and possibly the other
two remarks taken care of
Acked-by: Jan Beulich <jbeulich@suse.com>

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-17 12:45 ` Jan Beulich
@ 2018-08-28 17:58   ` Andrew Cooper
  2018-08-29  6:31     ` Jan Beulich
  0 siblings, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2018-08-28 17:58 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Xen-devel, Wei Liu, Roger Pau Monne

On 17/08/18 13:45, Jan Beulich wrote:
>>>> On 15.08.18 at 19:57, <andrew.cooper3@citrix.com> wrote:
>> --- a/xen/arch/x86/Rules.mk
>> +++ b/xen/arch/x86/Rules.mk
>> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid 
>> (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
>>  $(call as-option-add,CFLAGS,CC,\
>>      ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE)
>>  
>> +# Check to see whether the assmbler supports the .nop directive.
>> +$(call as-option-add,CFLAGS,CC,\
>> +    ".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE)
> Can this please be HAVE_AS_NOPS_DIRECTIVE?

Ok.

>
>> --- a/xen/arch/x86/alternative.c
>> +++ b/xen/arch/x86/alternative.c
>> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] 
>> init_or_livepatch_cons
>>  
>>  static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;
>>  
>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>> +
>> +/* Nops in .init.rodata to compare against the runtime ideal nops. */
>> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
>> +      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
>> +      ".popsection\n\t");
> Any particular reason not to put them in .init.text?

Because its data, not executable code.

>
>> @@ -27,6 +26,14 @@ extern void add_nops(void *insns, unsigned int len);
>>  extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
>>  extern void alternative_instructions(void);
>>  
>> +asm ( ".macro mknops nr_bytes\n\t"
>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>> +      ".nops \\nr_bytes, " __stringify(ASM_NOP_MAX) "\n\t"
>> +#else
>> +      ".skip \\nr_bytes, 0x90\n\t"
>> +#endif
>> +      ".endm\n\t" );
> Did you take a look at "x86/alternatives: allow using assembler macros
> in favor of C ones" yet? Perhaps this redundant macro definition could
> be avoided that way? I'm not going to exclude though that some more
> work might be needed to get there.

No - not looked at that yet.  That does look like we can get a large net
cleanup, and can avoid using always_inline to work around the same problem.

>
> Anyway, I'm in principle fine with the change, irrespective of the other
> discussion we've had, so with the name change and possibly the other
> two remarks taken care of
> Acked-by: Jan Beulich <jbeulich@suse.com>

Thanks,

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/build: Use new .nops directive when available
  2018-08-28 17:58   ` Andrew Cooper
@ 2018-08-29  6:31     ` Jan Beulich
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2018-08-29  6:31 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Xen-devel, Wei Liu, Roger Pau Monne

>>> On 28.08.18 at 19:58, <andrew.cooper3@citrix.com> wrote:
> On 17/08/18 13:45, Jan Beulich wrote:
>>>>> On 15.08.18 at 19:57, <andrew.cooper3@citrix.com> wrote:
>>> --- a/xen/arch/x86/alternative.c
>>> +++ b/xen/arch/x86/alternative.c
>>> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] 
> 
>>> init_or_livepatch_cons
>>>  
>>>  static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;
>>>  
>>> +#ifdef HAVE_AS_NOP_DIRECTIVE
>>> +
>>> +/* Nops in .init.rodata to compare against the runtime ideal nops. */
>>> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
>>> +      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
>>> +      ".popsection\n\t");
>> Any particular reason not to put them in .init.text?
> 
> Because its data, not executable code.

Well, depends. My view is that any insn is (potentially) executable
code. In fact I like gas'es behavior on ARM where insns and data
directives are separated by emitting "artificial" labels as markers.
But I'm fine either way.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2018-08-29  6:31 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-15 17:57 [PATCH] x86/build: Use new .nops directive when available Andrew Cooper
2018-08-16  9:55 ` Roger Pau Monné
2018-08-16 10:18   ` Jan Beulich
2018-08-16 11:57     ` Roger Pau Monné
2018-08-16 12:39       ` Jan Beulich
2018-08-16 10:42   ` Andrew Cooper
2018-08-16 11:34     ` Jan Beulich
2018-08-16 11:48       ` Andrew Cooper
2018-08-16 12:43         ` Jan Beulich
2018-08-16 14:31     ` Roger Pau Monné
2018-08-16 15:56       ` Andrew Cooper
2018-08-16 16:36         ` Roger Pau Monné
2018-08-17 12:45 ` Jan Beulich
2018-08-28 17:58   ` Andrew Cooper
2018-08-29  6:31     ` Jan Beulich

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.