On 17.07.22 23:54, Thomas Gleixner wrote: > On Sun, Jul 17 2022 at 22:13, Thomas Gleixner wrote: >> On Sun, Jul 17 2022 at 22:08, Thomas Gleixner wrote: >>> On Sun, Jul 17 2022 at 21:08, Thomas Gleixner wrote: >>> loadsegment_simple() was a red herring. The gs segment is already zero. >>> >>> So what explodes here is the early boot when switching from early per >>> CPU to the real per CPU area. >>> >>> start_kernel() >>> ..... >>> setup_per_cpu_areas(); >>> smp_prepare_boot_cpu() >> >> Bah. switch_to_new_gdt() is already invoked from setup_per_cpu_areas() >> and then again in smp_prepare_boot_cpu() and once more in cpu_init(), >> >> What a mess. > > So the below builds and boots at least on 64bit. I'll stare at it some > more tomorrow. I have no idea whether native_load_gdt() works with > XEN_PV. It should, but what do I know. No, shouldn't work. But ... > > Thanks, > > tglx > --- > --- a/arch/x86/include/asm/desc.h > +++ b/arch/x86/include/asm/desc.h > @@ -205,7 +205,7 @@ static inline void native_set_ldt(const > } > } > > -static inline void native_load_gdt(const struct desc_ptr *dtr) > +static __always_inline void native_load_gdt(const struct desc_ptr *dtr) > { > asm volatile("lgdt %0"::"m" (*dtr)); > } > --- a/arch/x86/include/asm/processor.h > +++ b/arch/x86/include/asm/processor.h > @@ -670,10 +670,9 @@ extern int sysenter_setup(void); > /* Defined in head.S */ > extern struct desc_ptr early_gdt_descr; > > -extern void switch_to_new_gdt(int); > +extern void switch_to_real_gdt(int); > extern void load_direct_gdt(int); > extern void load_fixmap_gdt(int); > -extern void load_percpu_segment(int); > extern void cpu_init(void); > extern void cpu_init_secondary(void); > extern void cpu_init_exception_handling(void); > --- a/arch/x86/kernel/cpu/Makefile > +++ b/arch/x86/kernel/cpu/Makefile > @@ -7,20 +7,24 @@ > ifdef CONFIG_FUNCTION_TRACER > CFLAGS_REMOVE_common.o = -pg > CFLAGS_REMOVE_perf_event.o = -pg > +CFLAGS_REMOVE_switch_gdt.o = -pg > endif > > # If these files are instrumented, boot hangs during the first second. > KCOV_INSTRUMENT_common.o := n > KCOV_INSTRUMENT_perf_event.o := n > +KCOV_INSTRUMENT_switch_gdt.o := n > > # As above, instrumenting secondary CPU boot code causes boot hangs. > KCSAN_SANITIZE_common.o := n > +KCSAN_SANITIZE_switch_gdt.o := n > > -# Make sure load_percpu_segment has no stackprotector > -CFLAGS_common.o := -fno-stack-protector > +# Make sure that switching the GDT and the per CPU segment > +# does not have stack protector enabled. > +CFLAGS_switch_gdt.o := -fno-stack-protector > > obj-y := cacheinfo.o scattered.o topology.o > -obj-y += common.o > +obj-y += common.o switch_gdt.o > obj-y += rdrand.o > obj-y += match.o > obj-y += bugs.o > --- a/arch/x86/kernel/cpu/common.c > +++ b/arch/x86/kernel/cpu/common.c > @@ -701,16 +701,6 @@ static const char *table_lookup_model(st > __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); > __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); > > -void load_percpu_segment(int cpu) > -{ > -#ifdef CONFIG_X86_32 > - loadsegment(fs, __KERNEL_PERCPU); > -#else > - __loadsegment_simple(gs, 0); > - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); > -#endif > -} > - > #ifdef CONFIG_X86_32 > /* The 32-bit entry code needs to find cpu_entry_area. */ > DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); > @@ -738,18 +728,6 @@ void load_fixmap_gdt(int cpu) > } > EXPORT_SYMBOL_GPL(load_fixmap_gdt); > > -/* > - * Current gdt points %fs at the "master" per-cpu area: after this, > - * it's on the real one. > - */ > -void switch_to_new_gdt(int cpu) > -{ > - /* Load the original GDT */ > - load_direct_gdt(cpu); > - /* Reload the per-cpu base */ > - load_percpu_segment(cpu); > -} > - > static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; > > static void get_model_name(struct cpuinfo_x86 *c) > @@ -2228,12 +2206,6 @@ void cpu_init(void) > boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) > cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); > > - /* > - * Initialize the per-CPU GDT with the boot GDT, > - * and set up the GDT descriptor: > - */ > - switch_to_new_gdt(cpu); > - > if (IS_ENABLED(CONFIG_X86_64)) { > loadsegment(fs, 0); > memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); > --- /dev/null > +++ b/arch/x86/kernel/cpu/switch_gdt.c > @@ -0,0 +1,31 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +#include > +#include > +#include > + > +/* > + * Invoked during early boot to switch from early GDT and early per CPU > + * (%fs on 32bit, GS_BASE on 64bit) to the real GDT and the runtime per CPU > + * area. > + * > + * This has to be done atomic because after switching from early GDT to > + * the real one any per cpu variable access is going to fault because > + * %fs resp. GS_BASE is not yet pointing to the real per CPU data. > + * > + * As a consequence this uses the native variants of load_gdt() and > + * wrmsrl(). So XEN_PV has to take the fault and emulate. > + */ > +void __init switch_to_real_gdt(int cpu) > +{ > + struct desc_ptr gdt_descr; > + > + gdt_descr.address = (long)get_cpu_gdt_rw(cpu); > + gdt_descr.size = GDT_SIZE - 1; > + native_load_gdt(&gdt_descr); > + > +#ifdef CONFIG_X86_32 > + loadsegment(fs, __KERNEL_PERCPU); > +#else > + native_wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); > +#endif > +} > --- a/arch/x86/kernel/setup_percpu.c > +++ b/arch/x86/kernel/setup_percpu.c > @@ -211,7 +211,7 @@ void __init setup_per_cpu_areas(void) > * area. Reload any changed state for the boot CPU. > */ > if (!cpu) > - switch_to_new_gdt(cpu); > + switch_to_real_gdt(cpu); > } > > /* indicate the early static arrays will soon be gone */ > --- a/arch/x86/kernel/smpboot.c > +++ b/arch/x86/kernel/smpboot.c > @@ -1457,7 +1457,11 @@ void arch_thaw_secondary_cpus_end(void) > void __init native_smp_prepare_boot_cpu(void) > { > int me = smp_processor_id(); > - switch_to_new_gdt(me); > + > + /* SMP invokes this from setup_per_cpu_areas() */ > + if (!IS_ENABLED(CONFIG_SMP)) > + switch_to_real_gdt(me); > + > /* already set me in cpu_online_mask in boot_cpu_init() */ > cpumask_set_cpu(me, cpu_callout_mask); > cpu_set_state_online(me); > --- a/arch/x86/xen/enlighten_pv.c > +++ b/arch/x86/xen/enlighten_pv.c > @@ -1164,7 +1164,7 @@ static void __init xen_setup_gdt(int cpu > pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; > pv_ops.cpu.load_gdt = xen_load_gdt_boot; > > - switch_to_new_gdt(cpu); > + switch_to_real_gdt(cpu); ... can't you use the paravirt variant of load_gdt in switch_to_real_gdt() ? > > pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; > pv_ops.cpu.load_gdt = xen_load_gdt; > Juergen