* [PATCH] use x86 cpu park to speedup smp_init in kexec situation
@ 2020-12-15 14:46 shenkai (D)
  2020-12-15 16:31 ` Andy Lutomirski
  0 siblings, 1 reply; 21+ messages in thread
From: shenkai (D) @ 2020-12-15 14:46 UTC (permalink / raw)
  To: linux-kernel, tglx, mingo, bp, x86, hpa
  Cc: hewenliang4, hushiyuan, luolongjun, hejingxian

From: shenkai <shenkai8@huawei.com>
Date: Tue, 15 Dec 2020 01:58:06 +0000
Subject: [PATCH] use x86 cpu park to speedup smp_init in kexec situation

In kexec reboot on an x86 machine, APs will be halted and then woken up
by the APIC INIT and SIPI interrupts. Here we can let APs spin instead
of being halted, and boot the APs by writing to a specific address. In
this way we can accelerate the smp_init procedure because we don't need
to pull the APs up from a deep C-state.

This is meaningful in many situations where users are sensitive to reboot
time cost.

On a 72-core x86 (Intel Xeon Gold 6154 CPU @ 3.00GHz) machine with
Linux-5.10.0, the time cost of smp_init is 80ms with cpu park and 210ms
without, and the difference is more significant when there are more
cores.

Implementation is basically as follows:
1. Reserve some space for the cpu park code and data.
2. When APs get the reboot IPI and cpu_park is enabled, they turn the MMU
    off and execute the cpu park code, where the APs spin and wait on an
    address. (APs reuse the pgtable which is used by the BSP in the kexec
    procedure to turn the MMU off.)
3. After the BSP has rebooted successfully, it wakes the APs up by writing
    an entry address to the spin-read address so that the APs can jump to
    the normal bootup procedure (see the sketch after this list).
4. The hyperthreaded twin processor of the BSP can be specified so that the
    twin processor halts as in the normal procedure and will not compete
    with the BSP during booting.
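
In rough C, the park/unpark protocol described above looks like this (an
illustrative sketch only; the real code in the patch below is written in
assembly, runs with paging disabled and operates on physical addresses):

#define PARK_MAGIC 0x7061726b                /* ASCII "park" */

struct park_slot {                           /* one slot per CPU in the reserved area */
	volatile unsigned long exit;         /* 0: keep spinning, else: entry address */
	volatile unsigned long magic;        /* PARK_MAGIC: this CPU is parked */
};

/* AP side, entered from the reboot IPI in the old kernel */
static void ap_park(struct park_slot *slot)
{
	slot->magic = PARK_MAGIC;            /* tell the new kernel we are parked */
	while (slot->exit == 0)              /* spin until the new BSP releases us */
		;
	((void (*)(void))slot->exit)();      /* jump to the 32-bit startup stub */
}

/* BSP side, in the new kernel, instead of sending INIT/SIPI */
static int bsp_unpark(struct park_slot *slot, unsigned long entry)
{
	if (slot->magic != PARK_MAGIC)       /* not parked, boot it the normal way */
		return -1;
	slot->exit = entry;                  /* release the spinning AP */
	return 0;
}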

Signed-off-by: shenkai <shenkai8@huawei.com>
Co-Developed-by: Luo Longjun <luolongjun@huawei.com>
---
  arch/x86/Kconfig                     | 12 ++++
  arch/x86/include/asm/kexec.h         | 43 ++++++++++++++
  arch/x86/include/asm/realmode.h      |  3 +
  arch/x86/kernel/Makefile             |  1 +
  arch/x86/kernel/cpu-park.S           | 87 +++++++++++++++++++++++++++
  arch/x86/kernel/machine_kexec_64.c   | 16 +++++
  arch/x86/kernel/process.c            | 13 ++++
  arch/x86/kernel/reboot.c             |  6 ++
  arch/x86/kernel/relocate_kernel_64.S | 51 ++++++++++++++++
  arch/x86/kernel/setup.c              | 67 +++++++++++++++++++++
  arch/x86/kernel/smp.c                | 89 ++++++++++++++++++++++++++++
  arch/x86/kernel/smpboot.c            | 14 +++++
  arch/x86/realmode/rm/header.S        |  3 +
  arch/x86/realmode/rm/trampoline_64.S |  5 ++
  kernel/smp.c                         |  6 ++
  15 files changed, 416 insertions(+)
  create mode 100644 arch/x86/kernel/cpu-park.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fbf26e0f7a6a..8dedd0e62eb2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2406,6 +2406,18 @@ config MODIFY_LDT_SYSCALL
        surface.  Disabling it removes the modify_ldt(2) system call.

        Saying 'N' here may make sense for embedded or server kernels.
+config X86_CPU_PARK
+    bool "Support CPU PARK on kexec"
+    default n
+    depends on SMP
+    depends on KEXEC_CORE
+    help
+      This enables support for the CPU PARK feature, which
+      saves the time needed to bring CPUs from offline to
+      online. Across kexec, parked CPUs spin in a loop
+      instead of dying, and jump out of the loop straight
+      to the new kernel entry during smp_init.

  source "kernel/livepatch/Kconfig"

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 6802c59e8252..3801df240f5f 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -197,6 +197,49 @@ typedef void crash_vmclear_fn(void);
  extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
  extern void kdump_nmi_shootdown_cpus(void);

+#ifdef CONFIG_X86_CPU_PARK
+#define PARK_MAGIC 0x7061726b
+#define PARK_SECTION_SIZE PAGE_ALIGN(sizeof(struct cpu_park_section) + 2 * PAGE_SIZE - 1)
+extern unsigned long park_start;
+extern unsigned long park_len;
+extern bool cpu_park_enable;
+extern int boot_core_sibling;
+extern void cpu_park(unsigned int cpu);
+extern void __do_cpu_park(unsigned long exit);
+extern int write_park_exit(unsigned int cpu);
+extern unsigned long park_cpu(unsigned long pgd_addr,
+            unsigned long park_code_addr,
+            unsigned long exit_addr);
+extern int install_cpu_park(void);
+extern int uninstall_cpu_park(void);
+
+struct cpu_park_section {
+    struct {
+        unsigned long exit;    /* entry address to exit loop */
+        unsigned long magic;    /* magic indicates park state */
+    } para[NR_CPUS];
+    char text[0];            /* text section of park */
+};
+
+static inline unsigned long *cpu_park_exit_addr(struct cpu_park_section *sec, unsigned int cpu)
+{
+    return &sec->para[cpu].exit;
+}
+static inline unsigned long *cpu_park_magic_addr(struct cpu_park_section *sec, unsigned int cpu)
+{
+    return &sec->para[cpu].magic;
+}
+static inline struct cpu_park_section *cpu_park_section_virt(void)
+{
+    return (struct cpu_park_section *)phys_to_virt(park_start);
+
+}
+static inline struct cpu_park_section *cpu_park_section_phy(void)
+{
+    return (struct cpu_park_section *)park_start;
+}
+#endif
+
  #endif /* __ASSEMBLY__ */

  #endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 5db5d083c873..117f82f5c602 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -37,6 +37,9 @@ struct real_mode_header {
  #ifdef CONFIG_X86_64
      u32    machine_real_restart_seg;
  #endif
+#ifdef CONFIG_X86_CPU_PARK
+    u32    park_startup_32;
+#endif
  };

  /* This must match data at realmode/rm/trampoline_{32,64}.S */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 68608bd892c0..5dab6772ddf7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -149,6 +149,7 @@ obj-$(CONFIG_X86_UMIP)            += umip.o
  obj-$(CONFIG_UNWINDER_ORC)        += unwind_orc.o
  obj-$(CONFIG_UNWINDER_FRAME_POINTER)    += unwind_frame.o
  obj-$(CONFIG_UNWINDER_GUESS)        += unwind_guess.o
+obj-$(CONFIG_X86_CPU_PARK)        += cpu-park.o

  obj-$(CONFIG_AMD_MEM_ENCRYPT)        += sev-es.o
  ###
diff --git a/arch/x86/kernel/cpu-park.S b/arch/x86/kernel/cpu-park.S
new file mode 100644
index 000000000000..fc2cc675519c
--- /dev/null
+++ b/arch/x86/kernel/cpu-park.S
@@ -0,0 +1,87 @@
+#include <asm/kexec.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+
+#define PARK_MAGIC 0x7061726b
+
+    .text
+    .align PAGE_SIZE
+SYM_CODE_START_NOALIGN(__do_cpu_park)
+    .code64
+
+    /* %rdi park exit addr */
+
+    leaq    gdt(%rip), %rax
+    /* gdt address will be updated with the same value several times */
+    movq    %rax, (gdt_meta + 0x2)(%rip)
+    lgdt    gdt_meta(%rip)
+
+    movl    $0x18, %eax    /* data segment */
+    movl    %eax, %ds
+    movl    %eax, %es
+    movl    %eax, %ss
+    movl    %eax, %fs
+    movl    %eax, %gs
+
+    /* set stack */
+    leaq    stack_init(%rip), %rsp
+    leaq    __enter_protection(%rip), %rax
+    /* stack will be written with the same value several times */
+    pushq    $0x08 /* CS */
+    pushq    %rax
+    lretq
+
+    .code32
+__enter_protection:
+    /* Disable paging */
+    movl    %cr0, %eax
+    andl    $~0x80000000, %eax
+    movl    %eax, %cr0
+
+    /* Disable long mode */
+    movl    $0xC0000080, %ecx
+    rdmsr
+    andl    $~0x00000100, %eax
+    wrmsr
+
+    /* Disable PAE */
+    xorl    %eax, %eax
+    movl    %eax, %cr4
+
+    mov    $PARK_MAGIC, %eax
+    mov    %eax, 0x8(%edi)
+0:
+    mov    (%edi), %eax
+    test    %eax, %eax
+    jz    0b
+
+    mov    $0x18, %edx
+    jmp    *%eax
+
+    .balign 16
+gdt:
+    /* 0x00 unusable segment
+     */
+    .quad    0
+
+    /* 0x08 32 4GB flat code segment */
+    .word    0xFFFF, 0x0000, 0x9B00, 0x00CF /* protect mode */
+
+    /* 0x10 4GB flat code segment */
+    .word    0xFFFF, 0x0000, 0x9B00, 0x00AF /* long mode */
+
+    /* 0x18 4GB flat data segment */
+    .word    0xFFFF, 0x0000, 0x9300, 0x00CF
+
+gdt_end:
+gdt_meta:
+    .word    gdt_end - gdt - 1
+    .quad    gdt
+    .word    0, 0, 0
+stack:    .quad    0, 0
+stack_init:
+    .fill    PAGE_SIZE - (. - __do_cpu_park), 1, 0
+SYM_CODE_END(__do_cpu_park)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index a29a44a98e5b..95b9a17f7a3b 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -241,6 +241,20 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
              return result;
      }

+#ifdef CONFIG_X86_CPU_PARK
+    if (cpu_park_enable) {
+        void *control_page;
+
+        control_page = page_address(image->control_code_page) + PAGE_SIZE;
+        memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+
+        result = kernel_ident_mapping_init(&info, level4p, park_start, park_start + park_len);
+        if (result) {
+            cpu_park_enable = false;
+            return result;
+        }
+    }
+#endif
      /*
       * Prepare EFI systab and ACPI tables for kexec kernel since they are
       * not covered by pfn_mapped.
@@ -353,7 +367,9 @@ void machine_kexec(struct kimage *image)
      }

      control_page = page_address(image->control_code_page) + PAGE_SIZE;
+#ifndef CONFIG_X86_CPU_PARK
      memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+#endif

      page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
      page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..15c52036bbba 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -46,6 +46,9 @@

  #include "process.h"

+#ifdef CONFIG_X86_CPU_PARK
+#include <linux/kexec.h>
+#endif
  /*
   * per-CPU TSS segments. Threads are completely 'soft' on Linux,
   * no more per-task TSS's. The TSS size is kept cacheline-aligned
@@ -723,6 +726,16 @@ void stop_this_cpu(void *dummy)
       */
      if (boot_cpu_has(X86_FEATURE_SME))
          native_wbinvd();
+#ifdef CONFIG_X86_CPU_PARK
+    /*
+     * Go to cpu park state.
+     * Otherwise go to cpu die.
+     */
+    if (kexec_in_progress && cpu_park_enable) {
+        if (smp_processor_id() != boot_core_sibling)
+            cpu_park(smp_processor_id());
+    }
+#endif
      for (;;) {
          /*
           * Use native_halt() so that memory contents don't change
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index db115943e8bd..a34b975efa9f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -32,6 +32,9 @@
  #include <asm/realmode.h>
  #include <asm/x86_init.h>
  #include <asm/efi.h>
+#ifdef CONFIG_X86_CPU_PARK
+#include <linux/kexec.h>
+#endif

  /*
   * Power off function, if any
@@ -705,6 +708,9 @@ void native_machine_shutdown(void)
       * scheduler's load balance.
       */
      local_irq_disable();
+#ifdef CONFIG_X86_CPU_PARK
+    install_cpu_park();
+#endif
      stop_other_cpus();
  #endif

diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index a4d9a261425b..d794b0aefaf3 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -107,6 +107,57 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
      ret
  SYM_CODE_END(relocate_kernel)

+SYM_CODE_START_NOALIGN(park_cpu)
+    /*
+     * %rdi pgd addr
+     * %rsi __do_cpu_park addr
+     * %rdx park exit addr
+     */
+
+    /* get physical address of page table now too */
+    movq    %rdi, %r9
+    /* Switch to the identity mapped page tables */
+    movq    %r9, %cr3
+
+    movq    %cr4, %rax
+    movq    %rax, %r13
+
+    /*
+     * Set cr0 to a known state:
+     *  - Paging enabled
+     *  - Alignment check disabled
+     *  - Write protect disabled
+     *  - No task switch
+     *  - Don't do FP software emulation.
+     *  - Protected mode enabled
+     */
+    movq    %cr0, %rax
+    andq    $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
+    orl        $(X86_CR0_PG | X86_CR0_PE), %eax
+    movq    %rax, %cr0
+
+    /*
+     * Set cr4 to a known state:
+     *  - physical address extension enabled
+     *  - 5-level paging, if it was enabled before
+     */
+    movl    $X86_CR4_PAE, %eax
+    testq    $X86_CR4_LA57, %r13
+    jz    1f
+    orl    $X86_CR4_LA57, %eax
+1:
+    movq    %rax, %cr4
+
+    jmp 1f
+1:
+
+    mov    %rdx, %rdi
+    mov    %rsi, %rax
+    jmp    *%rax
+
+    ret    /* never */
+SYM_CODE_END(park_cpu)
+
  SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
      UNWIND_HINT_EMPTY
      /* set return address to 0 if not preserving context */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 84f581c91db4..dfe854c1ecf8 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -195,6 +195,70 @@ static inline void __init copy_edd(void)
  }
  #endif

+#ifdef CONFIG_X86_CPU_PARK
+/* Physical address of reserved park memory. */
+unsigned long park_start;
+/* Virtual address of reserved park memory. */
+unsigned long park_start_virt;
+/* park reserve mem len should be PAGE_SIZE * NR_CPUS */
+unsigned long park_len = PARK_SECTION_SIZE;
+bool cpu_park_enable;
+int boot_core_sibling;
+
+static int __init parse_boot_core_sibling(char *p)
+{
+    if (!p)
+        return 0;
+    get_option(&p, &boot_core_sibling);
+    return 0;
+}
+early_param("bootcoresibling", parse_boot_core_sibling);
+
+static int __init parse_park_mem(char *p)
+{
+    if (!p)
+        return 0;
+
+    park_start = PAGE_ALIGN(memparse(p, NULL));
+    if (park_start == 0)
+        pr_info("cpu park mem params[%s]", p);
+
+    return 0;
+}
+early_param("cpuparkmem", parse_park_mem);
+
+static int __init reserve_park_mem(void)
+{
+    if (park_start == 0 || park_len == 0)
+        return 0;
+
+    park_start = PAGE_ALIGN(park_start);
+    park_len = PAGE_ALIGN(park_len);
+
+    if (!memblock_is_region_memory(park_start, park_len)) {
+        pr_warn("cannot reserve park mem: region is not memory!\n");
+        goto out;
+    }
+
+    if (memblock_is_region_reserved(park_start, park_len)) {
+        pr_warn("cannot reserve park mem: region overlaps reserved 
memory!\n");
+        goto out;
+    }
+
+    memblock_reserve(park_start, park_len);
+    pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%ld KB)\n",
+        park_start, park_start + park_len, park_len >> 10);
+
+    cpu_park_enable = true;
+    return 0;
+out:
+    park_start = 0;
+    park_len = 0;
+    cpu_park_enable = false;
+    return -EINVAL;
+}
+#endif
+
  void * __init extend_brk(size_t size, size_t align)
  {
      size_t mask = align - 1;
@@ -1154,6 +1218,9 @@ void __init setup_arch(char **cmdline_p)
       * won't consume hotpluggable memory.
       */
      reserve_crashkernel();
+#ifdef CONFIG_X86_CPU_PARK
+    reserve_park_mem();
+#endif

      memblock_find_dma_reserve();

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index eff4ce3b10da..19297fd5cdc2 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -34,6 +34,10 @@
  #include <asm/kexec.h>
  #include <asm/virtext.h>

+#ifdef CONFIG_X86_CPU_PARK
+#include <linux/kexec.h>
+#include <asm/realmode.h>
+#endif
  /*
   *    Some notes on x86 processor bugs affecting SMP operation:
   *
@@ -128,6 +132,91 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
      return NMI_HANDLED;
  }

+#ifdef CONFIG_X86_CPU_PARK
+/*
+ * Write the secondary_entry to exit section of park state.
+ * Then the secondary cpu will jump straight into the kernel
+ * by the secondary_entry.
+ */
+int write_park_exit(unsigned int cpu)
+{
+    struct cpu_park_section *park_section;
+    unsigned long *park_exit;
+    unsigned long *park_magic;
+
+    if (!cpu_park_enable)
+        return  -EPERM;
+
+    park_section = cpu_park_section_virt();
+    park_exit = cpu_park_exit_addr(park_section, cpu);
+    park_magic = cpu_park_magic_addr(park_section, cpu);
+
+    /*
+     * Test first 8 bytes to determine
+     * whether needs to write cpu park exit.
+     */
+    if (*park_magic == PARK_MAGIC) {
+        cpumask_clear_cpu(cpu, cpu_initialized_mask);
+        smp_mb();
+
+        *park_exit = real_mode_header->park_startup_32;
+        pr_info("park cpu %d up", cpu);
+        return 0;
+    }
+    pr_info("normal cpu %d up", cpu);
+    return -EPERM;
+}
+
+/* Install cpu park sections for the specific cpu. */
+int install_cpu_park(void)
+{
+    struct cpu_park_section *park_section;
+    unsigned long park_text_len;
+
+    park_section = cpu_park_section_virt();
+    memset((void *)park_section, 0, PARK_SECTION_SIZE);
+
+    park_text_len = PAGE_SIZE;
+    memcpy((void *)park_section->text, __do_cpu_park, park_text_len);
+
+    return 0;
+}
+
+int uninstall_cpu_park(void)
+{
+    struct cpu_park_section *park_section;
+
+    if (!cpu_park_enable)
+        return -EPERM;
+
+    park_section = cpu_park_section_virt();
+    memset((void *)park_section, 0, PARK_SECTION_SIZE);
+    pr_info("clear park area 0x%lx - 0x%lx", (unsigned 
long)cpu_park_section_phy(),
+                    (unsigned long)cpu_park_section_phy() + 
PARK_SECTION_SIZE);
+
+    return 0;
+}
+
+void cpu_park(unsigned int cpu)
+{
+    struct cpu_park_section *park_section_p;
+    unsigned long park_exit_phy;
+    unsigned long do_park;
+    unsigned long pgd;
+
+    park_section_p = cpu_park_section_phy();
+    park_exit_phy = (unsigned long)cpu_park_exit_addr(park_section_p, cpu);
+    do_park = (unsigned long)&park_section_p->text;
+    pgd = (unsigned long)__pa(page_address(kexec_image->control_code_page));
+
+    hw_breakpoint_disable();
+
+    park_cpu(pgd, do_park, park_exit_phy);
+
+    unreachable();
+}
+#endif
+
  /*
   * this function calls the 'stop' function on all other CPUs in the system.
   */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index de776b2e6046..4d8f7ac9f9ed 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -82,6 +82,10 @@
  #include <asm/hw_irq.h>
  #include <asm/stackprotector.h>

+#ifdef CONFIG_X86_CPU_PARK
+#include <linux/kexec.h>
+#endif
+
  /* representing HT siblings of each logical CPU */
  DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
@@ -1048,6 +1052,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
       * the targeted processor.
       */

+#ifdef CONFIG_X86_CPU_PARK
+    if (cpu != boot_core_sibling && write_park_exit(cpu) == 0) {
+        cpumask_set_cpu(cpu, cpu_callout_mask);
+        goto unpark;
+    }
+#endif
+
      if (x86_platform.legacy.warm_reset) {

          pr_debug("Setting warm reset code and vector.\n");
@@ -1102,6 +1113,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
          }
      }

+#ifdef CONFIG_X86_CPU_PARK
+unpark:
+#endif
      if (!boot_error) {
          /*
           * Wait till AP completes initial initialization
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index 8c1db5bf5d78..8c786c13a7e0 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -36,6 +36,9 @@ SYM_DATA_START(real_mode_header)
  #ifdef CONFIG_X86_64
      .long    __KERNEL32_CS
  #endif
+#ifdef CONFIG_X86_CPU_PARK
+    .long    pa_park_startup_32
+#endif
  SYM_DATA_END(real_mode_header)

      /* End signature, used to verify integrity */
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 84c5d1b33d10..2f25d414844b 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -161,6 +161,11 @@ SYM_CODE_START(startup_32)
      ljmpl    $__KERNEL_CS, $pa_startup_64
  SYM_CODE_END(startup_32)

+SYM_CODE_START(park_startup_32)
+    movl    $rm_stack_end, %esp
+    jmp    startup_32
+SYM_CODE_END(park_startup_32)
+
      .section ".text64","ax"
      .code64
      .balign 4
diff --git a/kernel/smp.c b/kernel/smp.c
index 4d17501433be..fc0b9c488618 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -23,6 +23,9 @@
  #include <linux/sched/clock.h>
  #include <linux/nmi.h>
  #include <linux/sched/debug.h>
+#ifdef CONFIG_X86_CPU_PARK
+#include <linux/kexec.h>
+#endif

  #include "smpboot.h"
  #include "sched/smp.h"
@@ -817,6 +820,9 @@ void __init smp_init(void)

      /* Any cleanup work */
      smp_cpus_done(setup_max_cpus);
+#ifdef CONFIG_X86_CPU_PARK
+    uninstall_cpu_park();
+#endif
  }

  /*
-- 
2.23.0




* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2020-12-15 14:46 [PATCH] use x86 cpu park to speedup smp_init in kexec situation shenkai (D)
@ 2020-12-15 16:31 ` Andy Lutomirski
  2020-12-15 21:20   ` Thomas Gleixner
  0 siblings, 1 reply; 21+ messages in thread
From: Andy Lutomirski @ 2020-12-15 16:31 UTC (permalink / raw)
  To: shenkai (D)
  Cc: LKML, Thomas Gleixner, Ingo Molnar, Borislav Petkov, X86 ML,
	H. Peter Anvin, hewenliang4, hushiyuan, luolongjun, hejingxian

On Tue, Dec 15, 2020 at 6:46 AM shenkai (D) <shenkai8@huawei.com> wrote:
>
> From: shenkai <shenkai8@huawei.com>
> Date: Tue, 15 Dec 2020 01:58:06 +0000
> Subject: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
>
> In kexec reboot on x86 machine, APs will be halted and then waked up
> by the apic INIT and SIPI interrupt. Here we can let APs spin instead
> of being halted and boot APs by writing to specific address. In this way
> we can accelerate smp_init procedure for we don't need to pull APs up
> from a deep C-state.
>
> This is meaningful in many situations where users are sensitive to reboot
> time cost.

I like the concept.

>
> On 72-core x86(Intel Xeon Gold 6154 CPU @ 3.00GHz) machine with
> Linux-5.10.0, time cost of smp_init is 80ms with cpu park while 210ms
> without, and the difference is more significant when there are more
> cores.
>
> Implementation is basicly as follow:
> 1. reserve some space for cpu park code and data
> 2. when APs get reboot IPI and cpu_park is enabled, they will turn MMU
>     off and excute cpu park code where APs will spin and wait on an
>     address.(APs will reuse the pgtable which is used by BSP in kexec
>     procedure to turn MMU off)
> 3. after BSP reboot successfully, it will wake up APs by writing an entry
>     address to the spin-read address so that APs can jmp to normal bootup
>     procedure.
> 4. The hyperthreaded twin processor of BSP can be specified so that the
>     twin processor can halt as normal procedure and will not compete with
>     BSP during booting.
>
> Signed-off-by: shenkai <shenkai8@huawei.com>
> Co-Developed-by: Luo Longjun <luolongjun@huawei.com>
> ---
>   arch/x86/Kconfig                     | 12 ++++
>   arch/x86/include/asm/kexec.h         | 43 ++++++++++++++
>   arch/x86/include/asm/realmode.h      |  3 +
>   arch/x86/kernel/Makefile             |  1 +
>   arch/x86/kernel/cpu-park.S           | 87 +++++++++++++++++++++++++++
>   arch/x86/kernel/machine_kexec_64.c   | 16 +++++
>   arch/x86/kernel/process.c            | 13 ++++
>   arch/x86/kernel/reboot.c             |  6 ++
>   arch/x86/kernel/relocate_kernel_64.S | 51 ++++++++++++++++
>   arch/x86/kernel/setup.c              | 67 +++++++++++++++++++++
>   arch/x86/kernel/smp.c                | 89 ++++++++++++++++++++++++++++
>   arch/x86/kernel/smpboot.c            | 14 +++++
>   arch/x86/realmode/rm/header.S        |  3 +
>   arch/x86/realmode/rm/trampoline_64.S |  5 ++
>   kernel/smp.c                         |  6 ++
>   15 files changed, 416 insertions(+)
>   create mode 100644 arch/x86/kernel/cpu-park.S
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index fbf26e0f7a6a..8dedd0e62eb2 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -2406,6 +2406,18 @@ config MODIFY_LDT_SYSCALL
>         surface.  Disabling it removes the modify_ldt(2) system call.
>
>         Saying 'N' here may make sense for embedded or server kernels.
> +config X86_CPU_PARK
> +    bool "Support CPU PARK on kexec"
> +    default n
> +    depends on SMP
> +    depends on KEXEC_CORE
> +    help
> +      This enables support for CPU PARK feature in
> +      order to save time of cpu down to up.
> +      CPU park is a state through kexec, spin loop
> +      instead of cpu die before jumping to new kernel,
> +      jumping out from loop to new kernel entry in
> +      smp_init.

Why does this need to be configurable?

>
>   source "kernel/livepatch/Kconfig"
>
> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
> index 6802c59e8252..3801df240f5f 100644
> --- a/arch/x86/include/asm/kexec.h
> +++ b/arch/x86/include/asm/kexec.h
> @@ -197,6 +197,49 @@ typedef void crash_vmclear_fn(void);
>   extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
>   extern void kdump_nmi_shootdown_cpus(void);
>
> +#ifdef CONFIG_X86_CPU_PARK
> +#define PARK_MAGIC 0x7061726b
> +#define PARK_SECTION_SIZE PAGE_ALIGN(sizeof(struct cpu_park_section) +
> 2 * PAGE_SIZE - 1)
> +extern unsigned long park_start;
> +extern unsigned long park_len;
> +extern bool cpu_park_enable;
> +extern int boot_core_sibling;
> +extern void cpu_park(unsigned int cpu);
> +extern void __do_cpu_park(unsigned long exit);
> +extern int write_park_exit(unsigned int cpu);
> +extern unsigned long park_cpu(unsigned long pgd_addr,
> +            unsigned long park_code_addr,
> +            unsigned long exit_addr);
> +extern int install_cpu_park(void);
> +extern int uninstall_cpu_park(void);
> +
> +struct cpu_park_section {
> +    struct {
> +        unsigned long exit;    /* entry address to exit loop */
> +        unsigned long magic;    /* maigc indicates park state */

No magic please.  If you actually need a flag, give it a descriptive
name and document what the values are.  But I think you could get away
with just a single field per cpu:

u32 jump_address;

0 means keep spinning.

But I think you could do even better -- just have a single field that
you write to wake all parked CPUs.  You'll also want some indication
of how many CPUs are parked so that the new kernel can reliably tell
when it has unparked all of them.

> +    } para[NR_CPUS];

What does "para" mean?

> +    char text[0];            /* text section of park */

text[0] is useless.  Either give it a real size or just omit it
entirely.  Or use text[] if you absolutely must.
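
For illustration, the simplified layout Andy is describing might look
something like this (a hypothetical sketch, not code from the patch or
from the kernel):

struct cpu_park_section {
	u32  num_parked;     /* how many CPUs are spinning, so the new kernel
			      * can tell when it has unparked all of them */
	u32  jump_address;   /* 0: keep spinning, else: 32-bit entry point
			      * that every parked CPU jumps to */
	char text[];         /* park code copied behind the header */
};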


> +#define PARK_MAGIC 0x7061726b

What is this for?

> +
> +    .text
> +    .align PAGE_SIZE
> +SYM_CODE_START_NOALIGN(__do_cpu_park)
> +    .code64
> +
> +    /* %rdi park exit addr */
> +
> +    leaq    gdt(%rip), %rax
> +    /* gdt address will be updated with the same value several times */

Why do you need to change the GDT at all?  You're spinning at CPL0,
and you don't have a usable IDT, and any NMI or MCE that comes in is
going to kill the system entirely.  So either you need a full working
GDT, IDT, and NMI/MCE handler, or you could just not have a valid GDT
at all.

In any case, these comments are useless.  Why would you update the
address with the same value several times?  Why are you sticking a
pointer to "gdt" into "gdt_meta".  (Yes, I figured out that you are
building a pointer to feed lgdt.  Just use the stack for it and make
the code less bizarre.)

> +    movq    %rax, (gdt_meta + 0x2)(%rip)
> +    lgdt    gdt_meta(%rip)
> +
> +    movl    $0x18, %eax    /* data segment */
> +    movl    %eax, %ds
> +    movl    %eax, %es
> +    movl    %eax, %ss
> +    movl    %eax, %fs
> +    movl    %eax, %gs
> +
> +    /* set stack */
> +    leaq    stack_init(%rip), %rsp
> +    leaq    __enter_protection(%rip), %rax
> +    /* stack will be writted with the same value several times */

"written"

But this comment makes no sense at all.  How about "switch to compat mode"?

> +    pushq    $0x08 /* CS */
> +    pushq    %rax
> +    lretq
> +
> +    .code32
> +__enter_protection:
> +    /* Disable paging */
> +    movl    %cr0, %eax
> +    andl    $~0x80000000, %eax
> +    movl    %eax, %cr0
> +
> +    /* Disable long mode */
> +    movl    $0xC0000080, %ecx
> +    rdmsr
> +    andl    $~0x00000100, %eax
> +    wrmsr

I don't think you really need to be in compat mode to exit long mode.
I admit that the semantics would be more bizarre if you just exited
directly.

Also, no magic numbers please.

> +
> +    /* Disable PAE */
> +    xorl    %eax, %eax
> +    movl    %eax, %cr4
> +
> +    mov    $PARK_MAGIC, %eax
> +    mov    %eax, 0x8(%edi)
> +0:
> +    mov    (%edi), %eax
> +    test    %eax, %eax
> +    jz    0b

mwait, please.

> +
> +    mov    $0x18, %edx

What's that for?

>
> diff --git a/arch/x86/kernel/relocate_kernel_64.S
> b/arch/x86/kernel/relocate_kernel_64.S
> index a4d9a261425b..d794b0aefaf3 100644
> --- a/arch/x86/kernel/relocate_kernel_64.S
> +++ b/arch/x86/kernel/relocate_kernel_64.S
> @@ -107,6 +107,57 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
>       ret
>   SYM_CODE_END(relocate_kernel)
>
> +SYM_CODE_START_NOALIGN(park_cpu)
> +    /*
> +     * %rdi pgd addr
> +     * %rsi __do_cpu_park addr
> +     * %rdx park exit addr
> +     */
> +
> +    /* get physical address of page table now too */
> +    movq    %rdi, %r9
> +    /* Switch to the identity mapped page tables */
> +    movq    %r9, %cr3
> +
> +    movq    %cr4, %rax
> +    movq    %rax, %r13
> +
> +    /*
> +     * Set cr0 to a known state:
> +     *  - Paging enabled
> +     *  - Alignment check disabled
> +     *  - Write protect disabled
> +     *  - No task switch
> +     *  - Don't do FP software emulation.
> +     *  - Proctected mode enabled
> +     */
> +    movq    %cr0, %rax
> +    andq    $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
> +    orl        $(X86_CR0_PG | X86_CR0_PE), %eax
> +    movq    %rax, %cr0
> +
> +    /*
> +     * Set cr4 to a known state:
> +     *  - physical address extension enabled
> +     *  - 5-level paging, if it was enabled before
> +     */
> +    movl    $X86_CR4_PAE, %eax
> +    testq    $X86_CR4_LA57, %r13
> +    jz    1f
> +    orl    $X86_CR4_LA57, %eax
> +1:
> +    movq    %rax, %cr4
> +
> +    jmp 1f
> +1:
> +
> +    mov    %rdx, %rdi
> +    mov    %rsi, %rax
> +    jmp    *%rax
> +
> +    ret    /* never */
> +SYM_CODE_END(park_cpu)

What is the purpose of this function?


> +
>   SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
>       UNWIND_HINT_EMPTY
>       /* set return address to 0 if not preserving context */
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 84f581c91db4..dfe854c1ecf8 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -195,6 +195,70 @@ static inline void __init copy_edd(void)
>   }
>   #endif
>
> +#ifdef CONFIG_X86_CPU_PARK
> +/* Physical address of reserved park memory. */
> +unsigned long park_start;
> +/* Virtual address of reserved park memory. */
> +unsigned long park_start_virt;
> +/* park reserve mem len should be PAGE_SIZE * NR_CPUS */
> +unsigned long park_len = PARK_SECTION_SIZE;
> +bool cpu_park_enable;
> +int boot_core_sibling;
> +
> +static int __init parse_boot_core_sibling(char *p)
> +{
> +    if (!p)
> +        return 0;
> +    get_option(&p, &boot_core_sibling);
> +    return 0;
> +}
> +early_param("bootcoresibling", parse_boot_core_sibling);

What is this for?

> +
> +static int __init parse_park_mem(char *p)
> +{
> +    if (!p)
> +        return 0;
> +
> +    park_start = PAGE_ALIGN(memparse(p, NULL));
> +    if (park_start == 0)
> +        pr_info("cpu park mem params[%s]", p);
> +
> +    return 0;
> +}
> +early_param("cpuparkmem", parse_park_mem);
> +
> +static int __init reserve_park_mem(void)
> +{
> +    if (park_start == 0 || park_len == 0)
> +        return 0;
> +
> +    park_start = PAGE_ALIGN(park_start);
> +    park_len = PAGE_ALIGN(park_len);
> +
> +    if (!memblock_is_region_memory(park_start, park_len)) {
> +        pr_warn("cannot reserve park mem: region is not memory!\n");
> +        goto out;
> +    }
> +
> +    if (memblock_is_region_reserved(park_start, park_len)) {
> +        pr_warn("cannot reserve park mem: region overlaps reserved
> memory!\n");
> +        goto out;
> +    }
> +
> +    memblock_reserve(park_start, park_len);
> +    pr_info("cpu park mem reserved: 0x%016lx - 0x%016lx (%ld KB)\n",
> +        park_start, park_start + park_len, park_len >> 10);
> +
> +    cpu_park_enable = true;
> +    return 0;
> +out:
> +    park_start = 0;
> +    park_len = 0;
> +    cpu_park_enable = false;
> +    return -EINVAL;
> +}
> +#endif
> +
>   void * __init extend_brk(size_t size, size_t align)
>   {
>       size_t mask = align - 1;
> @@ -1154,6 +1218,9 @@ void __init setup_arch(char **cmdline_p)
>        * won't consume hotpluggable memory.
>        */
>       reserve_crashkernel();
> +#ifdef CONFIG_X86_CPU_PARK
> +    reserve_park_mem();
> +#endif
>
>       memblock_find_dma_reserve();
>
> diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
> index eff4ce3b10da..19297fd5cdc2 100644
> --- a/arch/x86/kernel/smp.c
> +++ b/arch/x86/kernel/smp.c
> @@ -34,6 +34,10 @@
>   #include <asm/kexec.h>
>   #include <asm/virtext.h>
>
> +#ifdef CONFIG_X86_CPU_PARK
> +#include <linux/kexec.h>
> +#include <asm/realmode.h>
> +#endif
>   /*
>    *    Some notes on x86 processor bugs affecting SMP operation:
>    *
> @@ -128,6 +132,91 @@ static int smp_stop_nmi_callback(unsigned int val,
> struct pt_regs *regs)
>       return NMI_HANDLED;
>   }
>
> +#ifdef CONFIG_X86_CPU_PARK
> +/*
> + * Write the secondary_entry to exit section of park state.
> + * Then the secondary cpu will jump straight into the kernel
> + * by the secondary_entry.
> + */
> +int write_park_exit(unsigned int cpu)
> +{
> +    struct cpu_park_section *park_section;
> +    unsigned long *park_exit;
> +    unsigned long *park_magic;
> +
> +    if (!cpu_park_enable)
> +        return  -EPERM;
> +
> +    park_section = cpu_park_section_virt();
> +    park_exit = cpu_park_exit_addr(park_section, cpu);
> +    park_magic = cpu_park_magic_addr(park_section, cpu);
> +
> +    /*
> +     * Test first 8 bytes to determine
> +     * whether needs to write cpu park exit.
> +     */
> +    if (*park_magic == PARK_MAGIC) {
> +        cpumask_clear_cpu(cpu, cpu_initialized_mask);
> +        smp_mb();
> +
> +        *park_exit = real_mode_header->park_startup_32;
> +        pr_info("park cpu %d up", cpu);
> +        return 0;
> +    }
> +    pr_info("normal cpu %d up", cpu);
> +    return -EPERM;
> +}
> +
> +/* Install cpu park sections for the specific cpu. */
> +int install_cpu_park(void)
> +{
> +    struct cpu_park_section *park_section;
> +    unsigned long park_text_len;
> +
> +    park_section = cpu_park_section_virt();
> +    memset((void *)park_section, 0, PARK_SECTION_SIZE);
> +
> +    park_text_len = PAGE_SIZE;
> +    memcpy((void *)park_section->text, __do_cpu_park, park_text_len);
> +
> +    return 0;
> +}
> +
> +int uninstall_cpu_park(void)
> +{
> +    struct cpu_park_section *park_section;
> +
> +    if (!cpu_park_enable)
> +        return -EPERM;
> +
> +    park_section = cpu_park_section_virt();
> +    memset((void *)park_section, 0, PARK_SECTION_SIZE);
> +    pr_info("clear park area 0x%lx - 0x%lx", (unsigned
> long)cpu_park_section_phy(),
> +                    (unsigned long)cpu_park_section_phy() +
> PARK_SECTION_SIZE);
> +
> +    return 0;
> +}
> +
> +void cpu_park(unsigned int cpu)
> +{
> +    struct cpu_park_section *park_section_p;
> +    unsigned long park_exit_phy;
> +    unsigned long do_park;
> +    unsigned long pgd;
> +
> +    park_section_p = cpu_park_section_phy();
> +    park_exit_phy = (unsigned long)cpu_park_exit_addr(park_section_p, cpu);
> +    do_park = (unsigned long)&park_section_p->text;
> +    pgd = (unsigned
> long)__pa(page_address(kexec_image->control_code_page));
> +
> +    hw_breakpoint_disable();
> +
> +    park_cpu(pgd, do_park, park_exit_phy);
> +
> +    unreachable();

A __noreturn would make more sense.


* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2020-12-15 16:31 ` Andy Lutomirski
@ 2020-12-15 21:20   ` Thomas Gleixner
  2020-12-16  8:45     ` shenkai (D)
  2021-01-19 12:12     ` David Woodhouse
  0 siblings, 2 replies; 21+ messages in thread
From: Thomas Gleixner @ 2020-12-15 21:20 UTC (permalink / raw)
  To: Andy Lutomirski, shenkai (D)
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

On Tue, Dec 15 2020 at 08:31, Andy Lutomirski wrote:
> On Tue, Dec 15, 2020 at 6:46 AM shenkai (D) <shenkai8@huawei.com> wrote:
>>
>> From: shenkai <shenkai8@huawei.com>
>> Date: Tue, 15 Dec 2020 01:58:06 +0000
>> Subject: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
>>
>> In kexec reboot on x86 machine, APs will be halted and then waked up
>> by the apic INIT and SIPI interrupt. Here we can let APs spin instead
>> of being halted and boot APs by writing to specific address. In this way
>> we can accelerate smp_init procedure for we don't need to pull APs up
>> from a deep C-state.
>>
>> This is meaningful in many situations where users are sensitive to reboot
>> time cost.
>
> I like the concept.

No. This is the wrong thing to do. We are not optimizing for _one_
special case.

We can optimize it for all operations where all the non-boot CPUs have
to be brought up, be it cold boot, hibernation resume or kexec.

Aside of that this is not a magic X86 special problem. Pretty much all
architectures have the same issue and it can be solved very simple,
which has been discussed before and I outlined the solution years ago,
but nobody sat down and actually made it work.

Since the rewrite of the CPU hotplug infrastructure to a state machine
it's pretty obvious that the bringup of APs can be changed from the fully
serialized:

     for_each_present_cpu(cpu) {
     	if (!cpu_online(cpu))
           cpu_up(cpu, CPUHP_ONLINE);
     }

to

     for_each_present_cpu(cpu) {
     	if (!cpu_online(cpu))
           cpu_up(cpu, CPUHP_KICK_CPU);
     }

     for_each_present_cpu(cpu) {
     	if (!cpu_active(cpu))
           cpu_up(cpu, CPUHP_ONLINE);
     }

The CPUHP_KICK_CPU state does not exist today, but it's just the logical
consequence of the state machine. It's basically splitting __cpu_up()
into:

__cpu_kick()
{
    prepare();
    arch_kick_remote_cpu();     -> Send IPI/NMI, Firmware call .....
}
    
__cpu_wait_online()
{
    wait_until_cpu_online();
    do_further_stuff();
}

There is some more to it than just blindly splitting it up at the
architecture level.

All __cpu_up() implementations across arch/ have a lot of needlessly
duplicated and pointlessly differently implemented code which can move
completely into the core.

So actually we want to split this further up:

   CPUHP_PREPARE_CPU_UP:	Generic preparation step where all
                                the magic cruft which is duplicated
                                across architectures goes to

   CPUHP_KICK_CPU:		Architecture specific prepare and kick

   CPUHP_WAIT_ONLINE:           Generic wait function for CPU coming
                                online: wait_for_completion_timeout()
                                which releases the upcoming CPU and
                                invokes an optional arch_sync_cpu_up()
                                function which finalizes the bringup.
and on the AP side:

   CPU comes up, does all the low level setup, sets online, calls
   complete() and the spinwaits for release.

Once the control CPU comes out of the completion it releases the
spinwait.

That works for all bringup situations and not only for kexec and the
simple trick is that by the time the last CPU has been kicked in the
first step, the first kicked CPU is already spinwaiting for release.

By the time the first kicked CPU has completed the process, i.e. reached
the active state, then the next CPU is spinwaiting and so on.
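
A minimal sketch of that handshake, for illustration only (the cpuhp_*
helpers below and the arch_kick_remote_cpu()/arch_sync_cpu_up() hooks
named above are made up for this outline and do not exist in mainline):

static DEFINE_PER_CPU(struct completion, cpu_up_done);
static DEFINE_PER_CPU(int, cpu_released);

/* Control CPU: CPUHP_KICK_CPU step */
static int cpuhp_kick_cpu(unsigned int cpu)
{
	init_completion(&per_cpu(cpu_up_done, cpu));
	/* send INIT/SIPI, NMI, firmware call ... */
	return arch_kick_remote_cpu(cpu);
}

/* Control CPU: CPUHP_WAIT_ONLINE step */
static int cpuhp_wait_online(unsigned int cpu)
{
	if (!wait_for_completion_timeout(&per_cpu(cpu_up_done, cpu), 10 * HZ))
		return -EIO;
	arch_sync_cpu_up(cpu);                          /* optional, finalizes bringup */
	WRITE_ONCE(per_cpu(cpu_released, cpu), 1);      /* release the spinwaiting AP */
	return 0;
}

/* AP side, at the end of its low level bringup */
static void cpuhp_ap_sync_online(unsigned int cpu)
{
	set_cpu_online(cpu, true);
	complete(&per_cpu(cpu_up_done, cpu));
	while (!READ_ONCE(per_cpu(cpu_released, cpu)))  /* spinwait for release */
		cpu_relax();
}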

If you look at the provided time saving:

   Mainline:		210ms
   Patched:		 80ms
-----------------------------
   Delta                130ms

i.e. it takes ~ 1.8ms to kick and wait for each AP to come up (130ms saved
across ~71 APs) and ~ 1.1ms per CPU for the whole bringup (80ms / ~71 APs).
It does not completely add up, but it has a clear benefit for everything.

Also the changelog says that the delay is related to CPUs in deep
C-states. If CPUs are brought down for kexec then it's trivial enough to
limit the C-states or just not use mwait() at all.

It would be interesting to see the numbers just with play_dead() using
hlt() or mwait(eax=0, 0) for the kexec case and no other change at all.
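
For that experiment, the change would be roughly the following sketch
against the mainline native_play_dead() in arch/x86/kernel/smpboot.c
(illustrative only):

void native_play_dead(void)
{
	play_dead_common();
	tboot_shutdown(TB_SHUTDOWN_WFS);

	/*
	 * Experiment: skip mwait_play_dead(), so the parked AP sits in
	 * hlt (the most shallow state) instead of the deepest C-state.
	 */
	hlt_play_dead();
}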

Thanks,

        tglx






* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2020-12-15 21:20   ` Thomas Gleixner
@ 2020-12-16  8:45     ` shenkai (D)
  2020-12-16 10:12       ` Thomas Gleixner
  2021-01-19 12:12     ` David Woodhouse
  1 sibling, 1 reply; 21+ messages in thread
From: shenkai (D) @ 2020-12-16  8:45 UTC (permalink / raw)
  To: Thomas Gleixner, Andy Lutomirski
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

On 2020/12/16 5:20, Thomas Gleixner wrote:
> On Tue, Dec 15 2020 at 08:31, Andy Lutomirski wrote:
>> On Tue, Dec 15, 2020 at 6:46 AM shenkai (D) <shenkai8@huawei.com> wrote:
>>> From: shenkai <shenkai8@huawei.com>
>>> Date: Tue, 15 Dec 2020 01:58:06 +0000
>>> Subject: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
>>>
>>> In kexec reboot on x86 machine, APs will be halted and then waked up
>>> by the apic INIT and SIPI interrupt. Here we can let APs spin instead
>>> of being halted and boot APs by writing to specific address. In this way
>>> we can accelerate smp_init procedure for we don't need to pull APs up
>>> from a deep C-state.
>>>
>>> This is meaningful in many situations where users are sensitive to reboot
>>> time cost.
>> I like the concept.
> No. This is the wrong thing to do. We are not optimizing for _one_
> special case.
>
> We can optimize it for all operations where all the non boot CPUs have
> to brought up, be it cold boot, hibernation resume or kexec.
>
> Aside of that this is not a magic X86 special problem. Pretty much all
> architectures have the same issue and it can be solved very simple,
> which has been discussed before and I outlined the solution years ago,
> but nobody sat down and actually made it work.
>
> Since the rewrite of the CPU hotplug infrastructure to a state machine
> it's pretty obvious that the bringup of APs can changed from the fully
> serialized:
>
>       for_each_present_cpu(cpu) {
>       	if (!cpu_online(cpu))
>             cpu_up(cpu, CPUHP_ONLINE);
>       }
>
> to
>
>       for_each_present_cpu(cpu) {
>       	if (!cpu_online(cpu))
>             cpu_up(cpu, CPUHP_KICK_CPU);
>       }
>
>       for_each_present_cpu(cpu) {
>       	if (!cpu_active(cpu))
>             cpu_up(cpu, CPUHP_ONLINE);
>       }
>
> The CPUHP_KICK_CPU state does not exist today, but it's just the logical
> consequence of the state machine. It's basically splitting __cpu_up()
> into:
>
> __cpu_kick()
> {
>      prepare();
>      arch_kick_remote_cpu();     -> Send IPI/NMI, Firmware call .....
> }
>      
> __cpu_wait_online()
> {
>      wait_until_cpu_online();
>      do_further_stuff();
> }
>
> There is some more to it than just blindly splitting it up at the
> architecture level.
>
> All __cpu_up() implementations across arch/ have a lot of needlessly
> duplicated and pointlessly differently implemented code which can move
> completely into the core.
>
> So actually we want to split this further up:
>
>     CPUHP_PREPARE_CPU_UP:	Generic preparation step where all
>                                  the magic cruft which is duplicated
>                                  across architectures goes to
>
>     CPUHP_KICK_CPU:		Architecture specific prepare and kick
>
>     CPUHP_WAIT_ONLINE:           Generic wait function for CPU coming
>                                  online: wait_for_completion_timeout()
>                                  which releases the upcoming CPU and
>                                  invokes an optional arch_sync_cpu_up()
>                                  function which finalizes the bringup.
> and on the AP side:
>
>     CPU comes up, does all the low level setup, sets online, calls
>     complete() and the spinwaits for release.
>
> Once the control CPU comes out of the completion it releases the
> spinwait.
>
> That works for all bringup situations and not only for kexec and the
> simple trick is that by the time the last CPU has been kicked in the
> first step, the first kicked CPU is already spinwaiting for release.
>
> By the time the first kicked CPU has completed the process, i.e. reached
> the active state, then the next CPU is spinwaiting and so on.
>
> If you look at the provided time saving:
>
>     Mainline:		210ms
>     Patched:		 80ms
> -----------------------------
>     Delta                130ms
>
> i.e. it takes ~ 1.8ms to kick and wait for the AP to come up and ~ 1.1ms
> per CPU for the whole bringup. It does not completly add up, but it has
> a clear benefit for everything.
>
> Also the changelog says that the delay is related to CPUs in deep
> C-states. If CPUs are brought down for kexec then it's trivial enough to
> limit the C-states or just not use mwait() at all.
>
> It would be interesting to see the numbers just with play_dead() using
> hlt() or mwait(eax=0, 0) for the kexec case and no other change at all.
>
> Thanks,
>
>          tglx
>
Thanks for your and Andy's precious comments. I would like to take a try at
reconstructing this patch to make it more decent and generic.


Thanks again

Kai


* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2020-12-16  8:45     ` shenkai (D)
@ 2020-12-16 10:12       ` Thomas Gleixner
  2020-12-16 14:18         ` shenkai (D)
  0 siblings, 1 reply; 21+ messages in thread
From: Thomas Gleixner @ 2020-12-16 10:12 UTC (permalink / raw)
  To: shenkai (D), Andy Lutomirski
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

Kai,

On Wed, Dec 16 2020 at 16:45, shenkai wrote:
> On 2020/12/16 5:20, Thomas Gleixner wrote:
>>
>>
> Thanks for your and Andy's precious comments. I would like to take a try on
>
> reconstructing this patch to make it more decent and generic.

>> It would be interesting to see the numbers just with play_dead() using
>> hlt() or mwait(eax=0, 0) for the kexec case and no other change at all.

Can you please as a first step look into this and check if the time
changes?

Thanks,

        tglx


* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2020-12-16 10:12       ` Thomas Gleixner
@ 2020-12-16 14:18         ` shenkai (D)
  2020-12-16 15:31           ` Thomas Gleixner
  0 siblings, 1 reply; 21+ messages in thread
From: shenkai (D) @ 2020-12-16 14:18 UTC (permalink / raw)
  To: Thomas Gleixner, Andy Lutomirski
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

On 2020/12/16 18:12, Thomas Gleixner wrote:
> Kai,
>
> On Wed, Dec 16 2020 at 16:45, shenkai wrote:
>> On 2020/12/16 5:20, Thomas Gleixner wrote:
>>>
>> Thanks for your and Andy's precious comments. I would like to take a try on
>>
>> reconstructing this patch to make it more decent and generic.
>>> It would be interesting to see the numbers just with play_dead() using
>>> hlt() or mwait(eax=0, 0) for the kexec case and no other change at all.
> Can you please as a first step look into this and check if the time
> changes?
>
> Thanks,
>
>          tglx
> .

After some tests, the conclusion that the time cost comes from a deep
C-state turns out to be wrong.

Sorry for that.

Here is what I did:

In the kexec case, I first let the APs spinwait like in that patch, but
woke the APs up by sending the APIC INIT and SIPI interrupts as in the
normal procedure instead of writing to the spin address, and there is
no acceleration (the time cost is still 210ms).

So can we say that the main time cost comes from the APIC INIT and SIPI
interrupts and the handling of them, rather than from a deep C-state?

I didn't test with play_dead() because in the kexec case a new kernel
will be started and the APs can't be woken up by normal interrupts like
in the hibernate case, since the irq vectors are gone with the old
kernel.

Or maybe I didn't get the point correctly?


Best regards

Kai





* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2020-12-16 14:18         ` shenkai (D)
@ 2020-12-16 15:31           ` Thomas Gleixner
  2020-12-17 14:53             ` shenkai (D)
  2021-01-07 15:18             ` David Woodhouse
  0 siblings, 2 replies; 21+ messages in thread
From: Thomas Gleixner @ 2020-12-16 15:31 UTC (permalink / raw)
  To: shenkai (D), Andy Lutomirski
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

Kai,

On Wed, Dec 16 2020 at 22:18, shenkai wrote:
> After some tests, the conclusion that time cost is from deep C-state 
> turns out to be wrong
>
> Sorry for that.

No problem.

> In kexec case, first let APs spinwait like what I did  in that patch,
> but wake APs up by sending apic INIT and SIPI  interrupts as normal
> procedure instead of writing to some address and there is no
> acceleration (time cost is still 210ms).

Ok.

> So can we say that the main time cost is from apic INIT and SIPI
> interrupts and the handling of them instead of deep C-state?

That's a fair conclusion.

> I didn't test with play_dead() because in kexec case, one new kernel
> will be started and APs can't be waken up by normal interrupts like in
> hibernate case for the irq vectors are gone with the old kernel.
>
> Or maybe I didn't get the point correctly?

Not exactly, but your experiment answered the question already.

My point was that the regular kexec unplugs the APs which then end up in
play_dead() and trying to use the deepest C-state via mwait(). So if the
overhead would be related to getting them out of a deep C-state then
forcing that play_dead() to use the HLT instruction or the most shallow
C-state with mwait() would have brought an improvement, right?

But obviously the C-state in which the APs are waiting is not really
relevant, as you demonstrated that the cost is due to INIT/SIPI even
with spinwait, which is what I suspected.

OTOH, the advantage of INIT/SIPI is that the AP comes up in a well known
state.

Thanks,

        tglx




* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2020-12-16 15:31           ` Thomas Gleixner
@ 2020-12-17 14:53             ` shenkai (D)
  2021-01-07 15:18             ` David Woodhouse
  1 sibling, 0 replies; 21+ messages in thread
From: shenkai (D) @ 2020-12-17 14:53 UTC (permalink / raw)
  To: Thomas Gleixner, Andy Lutomirski
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

On 2020/12/16 23:31, Thomas Gleixner wrote:
> OTOH, the advantage of INIT/SIPI is that the AP comes up in a well known
> state.

We can set the APs to a known state explicitly, like the BSP does in the
kexec case (which is what we also tried to do in the patch). Maybe it is
not a big problem?

Best regards

Kai



* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2020-12-16 15:31           ` Thomas Gleixner
  2020-12-17 14:53             ` shenkai (D)
@ 2021-01-07 15:18             ` David Woodhouse
  1 sibling, 0 replies; 21+ messages in thread
From: David Woodhouse @ 2021-01-07 15:18 UTC (permalink / raw)
  To: Thomas Gleixner, shenkai (D), Andy Lutomirski, Schander, Johanna Amelie
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian


On Wed, 2020-12-16 at 16:31 +0100, Thomas Gleixner wrote:
> But obviously the C-state in which the APs are waiting is not really
> relevant, as you demonstrated that the cost is due to INIT/SIPI even
> with spinwait, which is what I suspected.
> 
> OTOH, the advantage of INIT/SIPI is that the AP comes up in a well known
> state.

And once we parallelise the bringup we basically only incur the latency
of *one* INIT/SIPI instead of multiplying it by the number of CPUs, so
it isn't clear that there's any *disadvantage* to it. It's certainly a
lot simpler.

I think we should definitely start by implementing the parallel bringup
as you described it, and then see if there's still a problem left to be
solved.

We were working on a SIPI-avoiding patch set which is similar to the
above, which Johanna had just about got working the night before this
one was posted. But it looks like we should go back to the drawing
board anyway instead of bothering to compare the details of the two.



* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2020-12-15 21:20   ` Thomas Gleixner
  2020-12-16  8:45     ` shenkai (D)
@ 2021-01-19 12:12     ` David Woodhouse
  2021-01-21 14:55       ` Thomas Gleixner
  1 sibling, 1 reply; 21+ messages in thread
From: David Woodhouse @ 2021-01-19 12:12 UTC (permalink / raw)
  To: Thomas Gleixner, Andy Lutomirski, shenkai (D),
	Schander, Johanna 'Mimoja' Amelie
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian


On Tue, 2020-12-15 at 22:20 +0100, Thomas Gleixner wrote:
> Since the rewrite of the CPU hotplug infrastructure to a state machine
> it's pretty obvious that the bringup of APs can changed from the fully
> serialized:
> 
>      for_each_present_cpu(cpu) {
>         if (!cpu_online(cpu))
>            cpu_up(cpu, CPUHP_ONLINE);
>      }
> 
> to
> 
>      for_each_present_cpu(cpu) {
>         if (!cpu_online(cpu))
>            cpu_up(cpu, CPUHP_KICK_CPU);
>      }
> 
>      for_each_present_cpu(cpu) {
>         if (!cpu_active(cpu))
>            cpu_up(cpu, CPUHP_ONLINE);
>      }
> 
> The CPUHP_KICK_CPU state does not exist today, but it's just the logical
> consequence of the state machine. It's basically splitting __cpu_up()
> into:
> 
> __cpu_kick()
> {
>     prepare();
>     arch_kick_remote_cpu();     -> Send IPI/NMI, Firmware call .....
> }
>     
> __cpu_wait_online()
> {
>     wait_until_cpu_online();
>     do_further_stuff();
> }
> 
> There is some more to it than just blindly splitting it up at the
> architecture level.

We've been playing with this a little. There's a proof-of-concept hack
below; don't look too hard because it's only really for figuring out
the timing etc.

Basically we ripped out the 'wait' parts of the x86 do_boot_cpu() into
a separate function do_wait_cpu(). There are four phases to the wait
(a rough sketch of them follows the list).

 • Wait for the AP to turn up in cpu_initialized_mask, set its bit in
   cpu_callout_mask to allow it to run the AP thread.
 • Wait for it to finish init and show up in cpu_callin_mask.
 • check_tsc_sync_source()
 • Wait for cpu_online(cpu)
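
For reference, instrumentation of roughly this shape in do_wait_cpu()
would produce the per-phase cycle counts shown in the logs further down
(a sketch, not the actual proof-of-concept patch):

static void do_wait_cpu(unsigned int cpu)
{
	u64 t0 = get_cycles(), t1, t2, t3, t4;

	/* 1) AP shows up in cpu_initialized_mask, let it run the AP thread */
	while (!cpumask_test_cpu(cpu, cpu_initialized_mask))
		cpu_relax();
	cpumask_set_cpu(cpu, cpu_callout_mask);
	t1 = get_cycles();

	/* 2) AP finishes init and calls in */
	while (!cpumask_test_cpu(cpu, cpu_callin_mask))
		cpu_relax();
	t2 = get_cycles();

	/* 3) TSC synchronization */
	check_tsc_sync_source(cpu);
	t3 = get_cycles();

	/* 4) AP goes online */
	while (!cpu_online(cpu))
		cpu_relax();
	t4 = get_cycles();

	pr_info("CPU#%d up in %10llu, %10llu, %10llu, %10llu (%10llu)\n",
		cpu, t1 - t0, t2 - t1, t3 - t2, t4 - t3, t4 - t0);
}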

There's an EARLY_INIT macro which controls whether the early bringup
call actually *does* anything, or whether it's left until bringup_cpu()
as the current code does. It allows a simple comparison of the two.

First we tested under qemu (on a Skylake EC2 c5.metal instance). The
do_boot_cpu() actually sending the IPIs took ~300k cycles itself.
Without EARLY_INIT we see timing for the four wait phases along the
lines of:

[    0.285312] CPU#10 up in     192950,    952898,  60014786,        28 (  61160662)
[    0.288311] CPU#11 up in     181092,    962704,  60010432,        30 (  61154258)
[    0.291312] CPU#12 up in     386080,    970416,  60013956,        28 (  61370480)
[    0.294311] CPU#13 up in     372782,    964506,  60010564,        28 (  61347880)
[    0.297312] CPU#14 up in     389602,    976280,  60013046,        28 (  61378956)
[    0.300312] CPU#15 up in     213132,    968148,  60012138,        28 (  61193446)

If we define EARLY_INIT then that first phase of waiting for the CPU
add itself is fairly much instantaneous, which is precisely what we
were hoping for. We also seem to save about 300k cycles on the AP
bringup too. It's just that it *all* pales into insignificance with
whatever it's doing to synchronise the TSC for 60M cycles.

[    0.338829] CPU#10 up in        600,    689054,  60025522,        28 (  60715204)
[    0.341829] CPU#11 up in        610,    635346,  60019390,        28 (  60655374)
[    0.343829] CPU#12 up in        632,    619352,  60020728,        28 (  60640740)
[    0.346829] CPU#13 up in        602,    514234,  60025402,        26 (  60540264)
[    0.348830] CPU#14 up in        608,    621058,  60025952,        26 (  60647644)
[    0.351829] CPU#15 up in        600,    624690,  60021526,       410 (  60647226)
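
For scale, rough arithmetic (assuming the guest TSC ticks at about
3GHz, which isn't stated above):

   60,000,000 cycles / 3.0e9 cycles/s ≈ 20 ms per CPU in the TSC sync phase

versus well under a microsecond for the kick phase, so that's where
essentially all of the remaining time goes.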

Testing on real hardware has been more interesting and less useful so
far. We started with the CPUHP_BRINGUP_KICK_CPU state being
*immediately* before CPUHP_BRINGUP_CPU. On my 28-thread Haswell box,
that didn't come up at all even without actually *doing* anything in
the pre-bringup phase. Merely bringing all the AP threads up through
the various CPUHP_PREPARE_foo stages before actually bringing them
online was enough to break it. I have no serial port on this box, so we
haven't yet worked out why; I've resorted to putting the
CPUHP_BRINGUP_KICK_CPU state before CPUHP_WORKQUEUE_PREP instead.

That lets it boot without the EARLY_INIT at least (so it's basically a
no-op), and I get these timings. Looks like there's 3-4M cycles to be
had by the parallel SIPI/INIT, but the *first* thread of each core is
also taking another 8M cycles and it might be worth doing *those* in
parallel too. And Thomas I think that waiting for the AP to bring
itself up is the part you meant was pointlessly differently
reimplemented across architectures? So the way forward there might be
to offer a generic CPUHP state for that, for architectures to plug into
and ditch their own tracking.

[    0.311581] CPU#1 up in    4057008,   8820492,      1828,       808 (  12880136)
[    0.316802] CPU#2 up in    3885080,   8738092,      1792,       904 (  12625868)
[    0.321674] CPU#3 up in    3468276,   8244880,      1724,       860 (  11715740)
[    0.326609] CPU#4 up in    3565216,   8357876,      1808,       984 (  11925884)
[    0.331565] CPU#5 up in    3566916,   8367340,      1836,       708 (  11936800)
[    0.336446] CPU#6 up in    3465324,   8249512,      1756,       796 (  11717388)
[    0.341337] CPU#7 up in    3518268,   8313476,      1572,      1072 (  11834388)
[    0.346196] CPU#8 up in    3479444,   8260244,      1648,       608 (  11741944)
[    0.351068] CPU#9 up in    3475692,   8269092,      1568,       908 (  11747260)
[    0.355968] CPU#10 up in    3534648,   8336648,      1488,       864 (  11873648)
[    0.361306] CPU#11 up in    4028288,   8932216,      1632,       692 (  12962828)
[    0.366657] CPU#12 up in    4046256,   8941736,      1624,      1164 (  12990780)
[    0.371985] CPU#13 up in    4012912,   8922192,      1700,       964 (  12937768)
[    0.373813] CPU#14 up in    3794196,    300948,      1520,      1300 (   4097964)
[    0.374936] CPU#15 up in    3853616,    265080,      1428,       784 (   4120908)
[    0.376843] CPU#16 up in    3841572,    261448,      1428,       528 (   4104976)
[    0.378597] CPU#17 up in    3420856,    258888,      1272,       872 (   3681888)
[    0.380403] CPU#18 up in    3516220,    259840,      2152,       648 (   3778860)
[    0.382210] CPU#19 up in    3503316,    262876,      1720,       500 (   3768412)
[    0.383975] CPU#20 up in    3421752,    263248,      1472,       764 (   3687236)
[    0.385747] CPU#21 up in    3434744,    272240,      1352,       716 (   3709052)
[    0.387516] CPU#22 up in    3427700,    273900,      1260,       820 (   3703680)
[    0.389300] CPU#23 up in    3457724,    269708,      1328,       816 (   3729576)
[    0.391089] CPU#24 up in    3466012,    269136,      1296,       824 (   3737268)
[    0.393067] CPU#25 up in    3970568,    279256,      1432,       892 (   4252148)
[    0.395042] CPU#26 up in    3977228,    283956,      1656,       772 (   4263612)
[    0.397020] CPU#27 up in    3946448,    288852,      1600,       648 (   4237548)
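
(That per-core asymmetry shows up directly in the second column;
assuming the second half of the CPU numbers are the HT siblings of the
first half, e.g.:

   8,922,192 - 288,852 ≈ 8.6M cycles      (CPU#13 vs CPU#27, second wait phase)

which is the roughly 8M extra cycles per first-thread mentioned above.)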

When I enabled EARLY_INIT it didn't boot; I need to hook up some box
with a serial port to make more meaningful progress there, but figured
it was worth sharing the findings so far.

Here's the hack we're testing with, for reference. It's kind of ugly
but you can see where it's going. Note that the CMOS mangling for the
warm reset vector is going to need to be lifted out of the per-cpu
loop, and done *once* at startup and torn down once in smp_cpus_done.
Except that it also needs to be done before/after a hotplug cpu up;
we'll have to come back to that but we've just shifted it to
native_smp_cpus_done() for testing for now.


diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7f57ede3cb8e..99d1fa254921 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -308,7 +308,7 @@ static void kvm_register_steal_time(void)
 		return;
 
 	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
-	pr_info("stealtime: cpu %d, msr %llx\n", cpu,
+	if (0)	pr_info("stealtime: cpu %d, msr %llx\n", cpu,
 		(unsigned long long) slow_virt_to_phys(st));
 }
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index aa593743acf6..79a5c26c376e 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -108,7 +108,7 @@ static inline void kvm_sched_clock_init(bool stable)
 	kvm_sched_clock_offset = kvm_clock_read();
 	pv_ops.time.sched_clock = kvm_sched_clock_read;
 
-	pr_info("kvm-clock: using sched offset of %llu cycles",
+	if (0) pr_info("kvm-clock: using sched offset of %llu cycles",
 		kvm_sched_clock_offset);
 
 	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
@@ -184,7 +184,7 @@ static void kvm_register_clock(char *txt)
 
 	pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
 	wrmsrl(msr_kvm_system_time, pa);
-	pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
+	if (0)	pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
 }
 
 static void kvm_save_sched_clock_state(void)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index de776b2e6046..42f479979b52 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -360,7 +360,7 @@ int topology_update_die_map(unsigned int die, unsigned int cpu)
 		goto found;
 
 	new = logical_die++;
-	if (new != die) {
+	if (0 && new != die) {
 		pr_info("CPU %u Converting physical %u to logical die %u\n",
 			cpu, die, new);
 	}
@@ -1028,9 +1028,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 {
 	/* start_ip had better be page-aligned! */
 	unsigned long start_ip = real_mode_header->trampoline_start;
-
 	unsigned long boot_error = 0;
-	unsigned long timeout;
 
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
@@ -1083,55 +1081,71 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 		boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
 						     cpu0_nmi_registered);
 
-	if (!boot_error) {
-		/*
-		 * Wait 10s total for first sign of life from AP
-		 */
-		boot_error = -1;
-		timeout = jiffies + 10*HZ;
-		while (time_before(jiffies, timeout)) {
-			if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
-				/*
-				 * Tell AP to proceed with initialization
-				 */
-				cpumask_set_cpu(cpu, cpu_callout_mask);
-				boot_error = 0;
-				break;
-			}
-			schedule();
-		}
-	}
+	return boot_error;
+}
 
-	if (!boot_error) {
-		/*
-		 * Wait till AP completes initial initialization
-		 */
-		while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
+int do_wait_cpu(unsigned int cpu)
+{
+	unsigned long flags;
+	unsigned long timeout;
+	cycles_t t1 = get_cycles(), t2, t3, t4, t5;
+	/*
+	 * Wait 10s total for first sign of life from AP
+	 */
+	int err = -1;
+	timeout = jiffies + 10*HZ;
+	while (time_before(jiffies, timeout)) {
+		if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
 			/*
-			 * Allow other tasks to run while we wait for the
-			 * AP to come online. This also gives a chance
-			 * for the MTRR work(triggered by the AP coming online)
-			 * to be completed in the stop machine context.
+			 * Tell AP to proceed with initialization
 			 */
-			schedule();
+			cpumask_set_cpu(cpu, cpu_callout_mask);
+			err = 0;
+			break;
 		}
+		schedule();
 	}
 
-	if (x86_platform.legacy.warm_reset) {
+	if (err)
+		return err;
+	t2 = get_cycles();
+	/*
+	 * Wait till AP completes initial initialization
+	 */
+	while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
 		/*
-		 * Cleanup possible dangling ends...
+		 * Allow other tasks to run while we wait for the
+		 * AP to come online. This also gives a chance
+		 * for the MTRR work(triggered by the AP coming online)
+		 * to be completed in the stop machine context.
 		 */
-		smpboot_restore_warm_reset_vector();
+		schedule();
 	}
 
-	return boot_error;
+	/*
+	 * Check TSC synchronization with the AP (keep irqs disabled
+	 * while doing so):
+	 */
+	t3 = get_cycles();
+	local_irq_save(flags);
+	check_tsc_sync_source(cpu);
+	local_irq_restore(flags);
+	t4 = get_cycles();
+	while (!cpu_online(cpu)) {
+		cpu_relax();
+		touch_nmi_watchdog();
+	}
+	t5 = get_cycles();
+
+	printk("CPU#%d up in %10lld,%10lld,%10lld,%10lld (%10lld)\n", cpu,
+	       t2-t1, t3-t2, t4-t3, t5-t4, t5-t1);
+	return 0;
 }
 
-int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
+int do_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int apicid = apic->cpu_present_to_apicid(cpu);
 	int cpu0_nmi_registered = 0;
-	unsigned long flags;
 	int err, ret = 0;
 
 	lockdep_assert_irqs_enabled();
@@ -1178,19 +1192,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 		goto unreg_nmi;
 	}
 
-	/*
-	 * Check TSC synchronization with the AP (keep irqs disabled
-	 * while doing so):
-	 */
-	local_irq_save(flags);
-	check_tsc_sync_source(cpu);
-	local_irq_restore(flags);
-
-	while (!cpu_online(cpu)) {
-		cpu_relax();
-		touch_nmi_watchdog();
-	}
-
 unreg_nmi:
 	/*
 	 * Clean up the nmi handler. Do this after the callin and callout sync
@@ -1202,6 +1203,31 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	return ret;
 }
 
+#define EARLY_INIT
+
+int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+	int ret;
+
+#ifndef EARLY_INIT
+	ret = do_cpu_up(cpu, tidle);
+	if (ret)
+		return ret;
+#endif
+	ret = do_wait_cpu(cpu);
+	return ret;
+}
+
+int __cpu_init(unsigned int cpu, struct task_struct *tidle)
+{
+	int ret = 0;
+
+#ifdef EARLY_INIT
+	ret = do_cpu_up(cpu, tidle);
+#endif
+	return ret;
+}
+
 /**
  * arch_disable_smp_support() - disables SMP support for x86 at runtime
  */
@@ -1415,6 +1441,13 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 {
 	pr_debug("Boot done\n");
 
+	if (x86_platform.legacy.warm_reset) {
+		/*
+		 * Cleanup possible dangling ends...
+		 */
+		smpboot_restore_warm_reset_vector();
+	}
+
 	calculate_max_logical_packages();
 
 	if (x86_has_numa_in_package)
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index bc56287a1ed1..cdea060b1009 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -61,6 +61,7 @@ enum cpuhp_state {
 	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
+	CPUHP_BRINGUP_KICK_CPU,		/* Asynchronously kick/wake/INIT CPU */
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_POWER_NUMA_PREPARE,
 	CPUHP_HRTIMERS_PREPARE,
@@ -92,7 +93,7 @@ enum cpuhp_state {
 	CPUHP_MIPS_SOC_PREPARE,
 	CPUHP_BP_PREPARE_DYN,
 	CPUHP_BP_PREPARE_DYN_END		= CPUHP_BP_PREPARE_DYN + 20,
-	CPUHP_BRINGUP_CPU,
+	CPUHP_BRINGUP_CPU,		/* Wait for CPU to actually respond */
 	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
 	CPUHP_AP_SCHED_STARTING,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2b8d7a5db383..17881f836de6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -545,11 +545,36 @@ static int bringup_wait_for_ap(unsigned int cpu)
 	return cpuhp_kick_ap(st, st->target);
 }
 
-static int bringup_cpu(unsigned int cpu)
+extern int __cpu_init(unsigned int cpu, struct task_struct *tidle);
+static int bringup_kick_cpu(unsigned int cpu)
 {
 	struct task_struct *idle = idle_thread_get(cpu);
 	int ret;
+	cycles_t t = get_cycles();
+	/*
+	 * Some architectures have to walk the irq descriptors to
+	 * setup the vector space for the cpu which comes online.
+	 * Prevent irq alloc/free across the bringup.
+	 */
+	irq_lock_sparse();
+
+	/* Arch-specific enabling code. */
+	ret = __cpu_init(cpu, idle);
+	irq_unlock_sparse();
+
+	t = get_cycles() - t;
+	printk("bringup_kick_cpu %d in %ld cycles\n", cpu, t);
+	if (ret)
+		return ret;
+	return 0;
+}
 
+static int bringup_cpu(unsigned int cpu)
+{
+	struct task_struct *idle = idle_thread_get(cpu);
+	int ret;
+	cycles_t t2, t = get_cycles();
+	
 	/*
 	 * Some architectures have to walk the irq descriptors to
 	 * setup the vector space for the cpu which comes online.
@@ -562,7 +587,12 @@ static int bringup_cpu(unsigned int cpu)
 	irq_unlock_sparse();
 	if (ret)
 		return ret;
-	return bringup_wait_for_ap(cpu);
+	t2 = get_cycles() - t;
+	ret = bringup_wait_for_ap(cpu);
+	t = get_cycles() - t;
+	printk("bringup_cpu %d in %ld,%ld cycles\n", cpu, t2, t -t2);
+
+	return ret;
 }
 
 static int finish_cpu(unsigned int cpu)
@@ -1336,6 +1366,13 @@ void bringup_nonboot_cpus(unsigned int setup_max_cpus)
 {
 	unsigned int cpu;
 
+	for_each_present_cpu(cpu) {
+		if (num_online_cpus() >= setup_max_cpus)
+			break;
+		if (!cpu_online(cpu))
+			cpu_up(cpu, CPUHP_BRINGUP_KICK_CPU);
+	}
+
 	for_each_present_cpu(cpu) {
 		if (num_online_cpus() >= setup_max_cpus)
 			break;
@@ -1565,7 +1602,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.startup.single		= timers_prepare_cpu,
 		.teardown.single	= timers_dead_cpu,
 	},
-	/* Kicks the plugged cpu into life */
+	/* Asynchronously kicks the plugged cpu into life */
+	[CPUHP_BRINGUP_KICK_CPU] = {
+		.name			= "cpu:kick",
+		.startup.single		= bringup_kick_cpu,
+		.cant_stop		= true,
+	},
+	/* Wait for woken CPU to be responding */
 	[CPUHP_BRINGUP_CPU] = {
 		.name			= "cpu:bringup",
 		.startup.single		= bringup_cpu,
diff --git a/kernel/smp.c b/kernel/smp.c
index 4d17501433be..2d07d1c42789 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -807,6 +807,8 @@ void __init smp_init(void)
 
 	pr_info("Bringing up secondary CPUs ...\n");
 
+	//	smp_cpus_start(setup_max_cpus);
+
 	bringup_nonboot_cpus(setup_max_cpus);
 
 	num_nodes = num_online_nodes();


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2021-01-19 12:12     ` David Woodhouse
@ 2021-01-21 14:55       ` Thomas Gleixner
  2021-01-21 15:42         ` David Woodhouse
                           ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Thomas Gleixner @ 2021-01-21 14:55 UTC (permalink / raw)
  To: David Woodhouse, Andy Lutomirski, shenkai (D),
	Schander, Johanna 'Mimoja' Amelie
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

David,

On Tue, Jan 19 2021 at 12:12, David Woodhouse wrote:
> On Tue, 2020-12-15 at 22:20 +0100, Thomas Gleixner wrote:
> We've been playing with this a little. There's a proof-of-concept hack
> below; don't look too hard because it's only really for figuring out
> the timing etc.
>
> Basically we ripped out the 'wait' parts of the x86 do_boot_cpu() into
> a separate function do_wait_cpu(). There are four phases to the wait.
>
>  • Wait for the AP to turn up in cpu_initialized_mask, set its bit in
>    cpu_callout_mask to allow it to run the AP thread.
>  • Wait for it to finish init and show up in cpu_callin_mask.
>  • check_tsc_sync_source()
>  • Wait for cpu_online(cpu)
>
> There's an EARLY_INIT macro which controls whether the early bringup
> call actually *does* anything, or whether it's left until bringup_cpu()
> as the current code does. It allows a simple comparison of the two.
>
> First we tested under qemu (on a Skylake EC2 c5.metal instance). The
> do_boot_cpu() actually sending the IPIs took ~300k cycles itself.
> Without EARLY_INIT we see timing for the four wait phases along the
> lines of:
>
> [    0.285312] CPU#10 up in     192950,    952898,  60014786,        28 (  61160662)
> [    0.288311] CPU#11 up in     181092,    962704,  60010432,        30 (  61154258)
> [    0.291312] CPU#12 up in     386080,    970416,  60013956,        28 (  61370480)
> [    0.294311] CPU#13 up in     372782,    964506,  60010564,        28 (  61347880)
> [    0.297312] CPU#14 up in     389602,    976280,  60013046,        28 (  61378956)
> [    0.300312] CPU#15 up in     213132,    968148,  60012138,        28 (  61193446)
>
> If we define EARLY_INIT then that first phase of waiting for the CPU
> to add itself is pretty much instantaneous, which is precisely what we
> were hoping for. We also seem to save about 300k cycles on the AP
> bringup too. It's just that it *all* pales into insignificance with
> whatever it's doing to synchronise the TSC for 60M cycles.

Yes, that's annoying, but it can be avoided. The host could tell the
guest that the TSC is perfectly synced.

> [    0.338829] CPU#10 up in        600,    689054,  60025522,        28 (  60715204)
> [    0.341829] CPU#11 up in        610,    635346,  60019390,        28 (  60655374)
> [    0.343829] CPU#12 up in        632,    619352,  60020728,        28 (  60640740)
> [    0.346829] CPU#13 up in        602,    514234,  60025402,        26 (  60540264)
> [    0.348830] CPU#14 up in        608,    621058,  60025952,        26 (  60647644)
> [    0.351829] CPU#15 up in        600,    624690,  60021526,       410 (  60647226)
>
> Testing on real hardware has been more interesting and less useful so
> far. We started with the CPUHP_BRINGUP_KICK_CPU state being
> *immediately* before CPUHP_BRINGUP_CPU. On my 28-thread Haswell box,
> that didn't come up at all even without actually *doing* anything in
> the pre-bringup phase. Merely bringing all the AP threads up through
> the various CPUHP_PREPARE_foo stages before actually bringing them
> online was enough to break it. I have no serial port on this box, so we
> haven't yet worked out why; I've resorted to putting the
> CPUHP_BRINGUP_KICK_CPU state before CPUHP_WORKQUEUE_PREP instead.

Hrm.

> That lets it boot without the EARLY_INIT at least (so it's basically a
> no-op), and I get these timings. Looks like there's 3-4M cycles to be
> had by the parallel SIPI/INIT, but the *first* thread of each core is
> also taking another 8M cycles and it might be worth doing *those* in
> parallel too. And Thomas I think that waiting for the AP to bring
> itself up is the part you meant was pointlessly differently
> reimplemented across architectures? So the way forward there might be
> to offer a generic CPUHP state for that, for architectures to plug into
> and ditch their own tracking.

Yes. The whole wait for alive and callin and online can be generic.

> When I enabled EARLY_INIT it didn't boot; I need to hook up some box
> with a serial port to make more meaningful progress there, but figured
> it was worth sharing the findings so far.
>
> Here's the hack we're testing with, for reference. It's kind of ugly
> but you can see where it's going. Note that the CMOS mangling for the
> warm reset vector is going to need to be lifted out of the per-cpu
> loop, and done *once* at startup and torn down once in smp_cpus_done.
> Except that it also needs to be done before/after a hotplug cpu up;
> we'll have to come back to that but we've just shifted it to
> native_smp_cpus_done() for testing for now.

Right. It's at least a start.

Thanks,

        tglx



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2021-01-21 14:55       ` Thomas Gleixner
@ 2021-01-21 15:42         ` David Woodhouse
  2021-01-21 17:34           ` David Woodhouse
  2021-01-21 19:59         ` [PATCH] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
  2021-02-01 10:36         ` [PATCH] use x86 cpu park to speedup smp_init in kexec situation David Woodhouse
  2 siblings, 1 reply; 21+ messages in thread
From: David Woodhouse @ 2021-01-21 15:42 UTC (permalink / raw)
  To: Thomas Gleixner, Andy Lutomirski, shenkai (D),
	Schander, Johanna 'Mimoja' Amelie
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

On Thu, 2021-01-21 at 15:55 +0100, Thomas Gleixner wrote:
> > Testing on real hardware has been more interesting and less useful so
> > far. We started with the CPUHP_BRINGUP_KICK_CPU state being
> > *immediately* before CPUHP_BRINGUP_CPU. On my 28-thread Haswell box,
> > that didn't come up at all even without actually *doing* anything in
> > the pre-bringup phase. Merely bringing all the AP threads up through
> > the various CPUHP_PREPARE_foo stages before actually bringing them
> > online was enough to break it. I have no serial port on this box, so we
> > haven't yet worked out why; I've resorted to putting the
> > CPUHP_BRINGUP_KICK_CPU state before CPUHP_WORKQUEUE_PREP instead.
> 
> Hrm.

Aha, I managed to reproduce in qemu. It's CPUHP_X2APIC_PREPARE, which
is only used in x2apic *cluster* mode not physical mode. So I actually
need to give the guest an IOMMU with IRQ remapping before I see it.


$ git diff
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index bc56287a1ed1..f503e66b4718 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -92,6 +92,7 @@ enum cpuhp_state {
        CPUHP_MIPS_SOC_PREPARE,
        CPUHP_BP_PREPARE_DYN,
        CPUHP_BP_PREPARE_DYN_END                = CPUHP_BP_PREPARE_DYN + 20,
+       CPUHP_BRINGUP_WAKE_CPU,
        CPUHP_BRINGUP_CPU,
        CPUHP_AP_IDLE_DEAD,
        CPUHP_AP_OFFLINE,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2b8d7a5db383..6c6f2986bfdb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1336,6 +1336,12 @@ void bringup_nonboot_cpus(unsigned int setup_max_cpus)
 {
        unsigned int cpu;
 
+       for_each_present_cpu(cpu) {
+               if (num_online_cpus() >= setup_max_cpus)
+                       break;
+               if (!cpu_online(cpu))
+                       cpu_up(cpu, CPUHP_BRINGUP_WAKE_CPU);
+       }
        for_each_present_cpu(cpu) {
                if (num_online_cpus() >= setup_max_cpus)
                        break;
$ qemu-system-x86_64 -kernel arch/x86/boot/bzImage -append "console=ttyS0  trace_event=cpuhp tp_printk" -display none -serial mon:stdio  -m 2G -M q35,accel=kvm,kernel-irqchip=split -device intel-iommu,intremap=on -smp 40
...
[    0.349968] smp: Bringing up secondary CPUs ...
[    0.350281] cpuhp_enter: cpu: 0001 target:  42 step:   1 (smpboot_create_threads)
[    0.351421] cpuhp_exit:  cpu: 0001  state:   1 step:   1 ret: 0
[    0.352074] cpuhp_enter: cpu: 0001 target:  42 step:   2 (perf_event_init_cpu)
[    0.352276] cpuhp_exit:  cpu: 0001  state:   2 step:   2 ret: 0
[    0.353273] cpuhp_enter: cpu: 0001 target:  42 step:  37 (workqueue_prepare_cpu)
[    0.354377] cpuhp_exit:  cpu: 0001  state:  37 step:  37 ret: 0
[    0.355273] cpuhp_enter: cpu: 0001 target:  42 step:  39 (hrtimers_prepare_cpu)
[    0.356271] cpuhp_exit:  cpu: 0001  state:  39 step:  39 ret: 0
[    0.356937] cpuhp_enter: cpu: 0001 target:  42 step:  41 (x2apic_prepare_cpu)
[    0.357277] cpuhp_exit:  cpu: 0001  state:  41 step:  41 ret: 0
[    0.358278] cpuhp_enter: cpu: 0002 target:  42 step:   1 (smpboot_create_threads)
...
[    0.614278] cpuhp_enter: cpu: 0032 target:  42 step:   1 (smpboot_create_threads)
[    0.615610] cpuhp_exit:  cpu: 0032  state:   1 step:   1 ret: 0
[    0.616274] cpuhp_enter: cpu: 0032 target:  42 step:   2 (perf_event_init_cpu)
[    0.617271] cpuhp_exit:  cpu: 0032  state:   2 step:   2 ret: 0
[    0.618272] cpuhp_enter: cpu: 0032 target:  42 step:  37 (workqueue_prepare_cpu)
[    0.619388] cpuhp_exit:  cpu: 0032  state:  37 step:  37 ret: 0
[    0.620273] cpuhp_enter: cpu: 0032 target:  42 step:  39 (hrtimers_prepare_cpu)
[    0.621270] cpuhp_exit:  cpu: 0032  state:  39 step:  39 ret: 0
[    0.622009] cpuhp_enter: cpu: 0032 target:  42 step:  41 (x2apic_prepare_cpu)
[    0.622275] cpuhp_exit:  cpu: 0032  state:  41 step:  41 ret: 0
...
[    0.684272] cpuhp_enter: cpu: 0039 target:  42 step:  41 (x2apic_prepare_cpu)
[    0.685277] cpuhp_exit:  cpu: 0039  state:  41 step:  41 ret: 0
[    0.685979] cpuhp_enter: cpu: 0001 target: 217 step:  43 (smpcfd_prepare_cpu)
[    0.686283] cpuhp_exit:  cpu: 0001  state:  43 step:  43 ret: 0
[    0.687274] cpuhp_enter: cpu: 0001 target: 217 step:  44 (relay_prepare_cpu)
[    0.688274] cpuhp_exit:  cpu: 0001  state:  44 step:  44 ret: 0
[    0.689274] cpuhp_enter: cpu: 0001 target: 217 step:  47 (rcutree_prepare_cpu)
[    0.690271] cpuhp_exit:  cpu: 0001  state:  47 step:  47 ret: 0
[    0.690982] cpuhp_multi_enter: cpu: 0001 target: 217 step:  59 (trace_rb_cpu_prepare)
[    0.691281] cpuhp_exit:  cpu: 0001  state:  59 step:  59 ret: 0
[    0.692272] cpuhp_multi_enter: cpu: 0001 target: 217 step:  59 (trace_rb_cpu_prepare)
[    0.694640] cpuhp_exit:  cpu: 0001  state:  59 step:  59 ret: 0
[    0.695272] cpuhp_multi_enter: cpu: 0001 target: 217 step:  59 (trace_rb_cpu_prepare)
[    0.696280] cpuhp_exit:  cpu: 0001  state:  59 step:  59 ret: 0
[    0.697279] cpuhp_enter: cpu: 0001 target: 217 step:  65 (timers_prepare_cpu)
[    0.698168] cpuhp_exit:  cpu: 0001  state:  65 step:  65 ret: 0
[    0.698272] cpuhp_enter: cpu: 0001 target: 217 step:  67 (kvmclock_setup_percpu)
[    0.699270] cpuhp_exit:  cpu: 0001  state:  67 step:  67 ret: 0
[    0.700272] cpuhp_enter: cpu: 0001 target: 217 step:  88 (bringup_cpu)
[    0.701312] x86: Booting SMP configuration:
[    0.702270] .... node  #0, CPUs:        #1
[    0.127218] kvm-clock: cpu 1, msr 59401041, secondary cpu clock
[    0.127218] smpboot: CPU 1 Converting physical 0 to logical die 1
[    0.709281] cpuhp_enter: cpu: 0001 target: 217 step: 147 (smpboot_unpark_threads)
[    0.712294] cpuhp_exit:  cpu: 0001  state: 147 step: 147 ret: 0
[    0.714283] cpuhp_enter: cpu: 0001 target: 217 step: 149 (irq_affinity_online_cpu)
[    0.717292] cpuhp_exit:  cpu: 0001  state: 149 step: 149 ret: 0
[    0.719283] cpuhp_enter: cpu: 0001 target: 217 step: 153 (perf_event_init_cpu)
[    0.721279] cpuhp_exit:  cpu: 0001  state: 153 step: 153 ret: 0
[    0.724285] cpuhp_enter: cpu: 0001 target: 217 step: 179 (lockup_detector_online_cpu)
[    0.727279] cpuhp_exit:  cpu: 0001  state: 179 step: 179 ret: 0
[    0.729279] cpuhp_enter: cpu: 0001 target: 217 step: 180 (workqueue_online_cpu)
[    0.731309] cpuhp_exit:  cpu: 0001  state: 180 step: 180 ret: 0
[    0.733281] cpuhp_enter: cpu: 0001 target: 217 step: 181 (rcutree_online_cpu)
[    0.735276] cpuhp_exit:  cpu: 0001  state: 181 step: 181 ret: 0
[    0.737278] cpuhp_enter: cpu: 0001 target: 217 step: 183 (kvm_cpu_online)
[    0.739286] kvm-guest: stealtime: cpu 1, msr 7d46c080
[    0.740274] cpuhp_exit:  cpu: 0001  state: 183 step: 183 ret: 0
[    0.742278] cpuhp_enter: cpu: 0001 target: 217 step: 184 (page_writeback_cpu_online)
[    0.744275] cpuhp_exit:  cpu: 0001  state: 184 step: 184 ret: 0
[    0.745277] cpuhp_enter: cpu: 0001 target: 217 step: 185 (vmstat_cpu_online)
[    0.747276] cpuhp_exit:  cpu: 0001  state: 185 step: 185 ret: 0
[    0.749280] cpuhp_enter: cpu: 0001 target: 217 step: 216 (sched_cpu_activate)
[    0.750275] cpuhp_exit:  cpu: 0001  state: 216 step: 216 ret: 0
[    0.752273] cpuhp_exit:  cpu: 0001  state: 217 step:  88 ret: 0
[    0.753030] cpuhp_enter: cpu: 0002 target: 217 step:  43 (smpcfd_prepare_cpu)
...
[    2.311273] cpuhp_exit:  cpu: 0031  state: 217 step:  88 ret: 0
[    2.312278] cpuhp_enter: cpu: 0032 target: 217 step:  43 (smpcfd_prepare_cpu)
[    2.313119] cpuhp_exit:  cpu: 0032  state:  43 step:  43 ret: 0
[    2.313277] cpuhp_enter: cpu: 0032 target: 217 step:  44 (relay_prepare_cpu)
[    2.314275] cpuhp_exit:  cpu: 0032  state:  44 step:  44 ret: 0
[    2.315274] cpuhp_enter: cpu: 0032 target: 217 step:  47 (rcutree_prepare_cpu)
[    2.316104] cpuhp_exit:  cpu: 0032  state:  47 step:  47 ret: 0
[    2.316273] cpuhp_multi_enter: cpu: 0032 target: 217 step:  59 (trace_rb_cpu_prepare)
[    2.317292] cpuhp_exit:  cpu: 0032  state:  59 step:  59 ret: 0
[    2.318275] cpuhp_multi_enter: cpu: 0032 target: 217 step:  59 (trace_rb_cpu_prepare)
[    2.320401] cpuhp_exit:  cpu: 0032  state:  59 step:  59 ret: 0
[    2.321111] cpuhp_multi_enter: cpu: 0032 target: 217 step:  59 (trace_rb_cpu_prepare)
[    2.321286] cpuhp_exit:  cpu: 0032  state:  59 step:  59 ret: 0
[    2.322273] cpuhp_enter: cpu: 0032 target: 217 step:  65 (timers_prepare_cpu)
[    2.323271] cpuhp_exit:  cpu: 0032  state:  65 step:  65 ret: 0
[    2.324272] cpuhp_enter: cpu: 0032 target: 217 step:  67 (kvmclock_setup_percpu)
[    2.325133] cpuhp_exit:  cpu: 0032  state:  67 step:  67 ret: 0
[    2.325273] cpuhp_enter: cpu: 0032 target: 217 step:  88 (bringup_cpu)
[    2.326292]  #32
[    2.289283] kvm-clock: cpu 32, msr 59401801, secondary cpu clock
[    2.289283] BUG: kernel NULL pointer dereference, address: 0000000000000000
[    2.289283] #PF: supervisor write access in kernel mode
[    2.289283] #PF: error_code(0x0002) - not-present page
[    2.289283] PGD 0 P4D 0 
[    2.289283] Oops: 0002 [#1] SMP PTI
[    2.289283] CPU: 32 PID: 0 Comm: swapper/32 Not tainted 5.10.0+ #745
[    2.289283] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1.fc33 04/01/2014
[    2.289283] RIP: 0010:init_x2apic_ldr+0xa0/0xb0
[    2.289283] Code: 89 2d 9c 81 fb 72 65 8b 15 cd 12 fb 72 89 d2 f0 48 0f ab 50 08 5b 5d c3 48 8b 05 a3 7b 09 02 48 c7 05 98 7b 09 02 00 00 00 00 <89> 18 eb cd 66 66 2e 0f 1f 84 00 00 00 00 00 90 0f 1f 44 00 00 89
[    2.289283] RSP: 0000:ffffb15e8016fec0 EFLAGS: 00010046
[    2.289283] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000040
[    2.289283] RDX: 00000000ffffffff RSI: 0000000000000000 RDI: 0000000000000028
[    2.289283] RBP: 0000000000018428 R08: 0000000000000000 R09: 0000000000000028
[    2.289283] R10: ffffb15e8016fd78 R11: ffff88ca7ff28368 R12: 0000000000000200
[    2.289283] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000000
[    2.289283] FS:  0000000000000000(0000) GS:ffff88ca7dc00000(0000) knlGS:0000000000000000
[    2.289283] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    2.289283] CR2: 0000000000000000 CR3: 0000000058610000 CR4: 00000000000006a0
[    2.289283] Call Trace:
[    2.289283]  setup_local_APIC+0x88/0x320
[    2.289283]  ? printk+0x48/0x4a
[    2.289283]  apic_ap_setup+0xa/0x20
[    2.289283]  start_secondary+0x2f/0x130
[    2.289283]  secondary_startup_64_no_verify+0xc2/0xcb
[    2.289283] Modules linked in:
[    2.289283] CR2: 0000000000000000
[    2.289283] ---[ end trace 676dcdbf63e55075 ]---
[    2.289283] RIP: 0010:init_x2apic_ldr+0xa0/0xb0
[    2.289283] Code: 89 2d 9c 81 fb 72 65 8b 15 cd 12 fb 72 89 d2 f0 48 0f ab 50 08 5b 5d c3 48 8b 05 a3 7b 09 02 48 c7 05 98 7b 09 02 00 00 00 00 <89> 18 eb cd 66 66 2e 0f 1f 84 00 00 00 00 00 90 0f 1f 44 00 00 89
[    2.289283] RSP: 0000:ffffb15e8016fec0 EFLAGS: 00010046
[    2.289283] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000040
[    2.289283] RDX: 00000000ffffffff RSI: 0000000000000000 RDI: 0000000000000028
[    2.289283] RBP: 0000000000018428 R08: 0000000000000000 R09: 0000000000000028
[    2.289283] R10: ffffb15e8016fd78 R11: ffff88ca7ff28368 R12: 0000000000000200
[    2.289283] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000000
[    2.289283] FS:  0000000000000000(0000) GS:ffff88ca7dc00000(0000) knlGS:0000000000000000
[    2.289283] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    2.289283] CR2: 0000000000000000 CR3: 0000000058610000 CR4: 00000000000006a0
[    2.289283] Kernel panic - not syncing: Attempted to kill the idle task!
[    2.289283] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2021-01-21 15:42         ` David Woodhouse
@ 2021-01-21 17:34           ` David Woodhouse
  0 siblings, 0 replies; 21+ messages in thread
From: David Woodhouse @ 2021-01-21 17:34 UTC (permalink / raw)
  To: Thomas Gleixner, Andy Lutomirski, shenkai (D),
	Schander, Johanna 'Mimoja' Amelie
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

On Thu, 2021-01-21 at 15:42 +0000, David Woodhouse wrote:
> [    2.289283] BUG: kernel NULL pointer dereference, address: 0000000000000000
> [    2.289283] #PF: supervisor write access in kernel mode
> [    2.289283] #PF: error_code(0x0002) - not-present page
> [    2.289283] PGD 0 P4D 0 
> [    2.289283] Oops: 0002 [#1] SMP PTI
> [    2.289283] CPU: 32 PID: 0 Comm: swapper/32 Not tainted 5.10.0+ #745
> [    2.289283] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1.fc33 04/01/2014
> [    2.289283] RIP: 0010:init_x2apic_ldr+0xa0/0xb0


OK... in alloc_clustermask() for each CPU we were preallocating a
cluster_mask and storing it in the global cluster_hotplug_mask.

Then later for each CPU we were taking the preallocated cluster_mask
and setting cluster_hotplug_mask to NULL.

That doesn't parallelise well :)

So... ditch the global variable, let alloc_clustermask() install the
appropriate cluster_mask *directly* into the target CPU's per_cpu data
before it's running. And since we have to calculate the logical APIC ID
for the cluster ID, we might as well set x86_cpu_to_logical_apicid at
the same time.

Now all that init_x2apic_ldr() actually *does* on the target CPU is set
that CPU's bit in the pre-existing cluster_mask.

To reduce the number of loops over all (present or online) CPUs, I've
made it set the per_cpu cluster_mask for *all* CPUs in the cluster in
one pass at boot time. I think the case for later hotplug is also sane;
will have to test that.

But it passes that qemu boot test it was failing earlier, at least...
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index b0889c48a2ac..74bb4cae8b5b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -18,7 +18,6 @@ struct cluster_mask {
 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
 static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
 static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
-static struct cluster_mask *cluster_hotplug_mask;
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -98,54 +97,61 @@ static u32 x2apic_calc_apicid(unsigned int cpu)
 static void init_x2apic_ldr(void)
 {
 	struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
-	u32 cluster, apicid = apic_read(APIC_LDR);
-	unsigned int cpu;
 
-	this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+	BUG_ON(!cmsk);
 
-	if (cmsk)
-		goto update;
-
-	cluster = apicid >> 16;
-	for_each_online_cpu(cpu) {
-		cmsk = per_cpu(cluster_masks, cpu);
-		/* Matching cluster found. Link and update it. */
-		if (cmsk && cmsk->clusterid == cluster)
-			goto update;
-	}
-	cmsk = cluster_hotplug_mask;
-	cmsk->clusterid = cluster;
-	cluster_hotplug_mask = NULL;
-update:
-	this_cpu_write(cluster_masks, cmsk);
 	cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
 }
 
-static int alloc_clustermask(unsigned int cpu, int node)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
 {
+	struct cluster_mask *cmsk = NULL;
+	u32 apicid;
+
 	if (per_cpu(cluster_masks, cpu))
 		return 0;
-	/*
-	 * If a hotplug spare mask exists, check whether it's on the right
-	 * node. If not, free it and allocate a new one.
+
+	/* For the hotplug case, don't always allocate a new one */
+	for_each_online_cpu(cpu) {
+		apicid = apic->cpu_present_to_apicid(cpu);
+		if (apicid != BAD_APICID && apicid >> 4 == cluster) {
+			cmsk = per_cpu(cluster_masks, cpu);
+			if (cmsk)
+				break;
+		}
+	}
+	if (!cmsk)
+		cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+	if (!cmsk)
+		return -ENOMEM;
+
+	cmsk->node = node;
+	cmsk->clusterid = cluster;
+
+        /*
+	 * As an optimisation during boot, set the cluster_mask for *all*
+	 * present CPUs at once, which will include 'cpu'.
 	 */
-	if (cluster_hotplug_mask) {
-		if (cluster_hotplug_mask->node == node)
-			return 0;
-		kfree(cluster_hotplug_mask);
+	if (system_state < SYSTEM_RUNNING) {
+		for_each_present_cpu(cpu) {
+			u32 apicid = apic->cpu_present_to_apicid(cpu);
+			if (apicid != BAD_APICID && apicid >> 4 == cluster)
+				per_cpu(cluster_masks, cpu) = cmsk;
+		}
 	}
 
-	cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
-					    GFP_KERNEL, node);
-	if (!cluster_hotplug_mask)
-		return -ENOMEM;
-	cluster_hotplug_mask->node = node;
 	return 0;
 }
 
 static int x2apic_prepare_cpu(unsigned int cpu)
 {
-	if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+	u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+	u32 cluster = phys_apicid >> 4;
+	u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+
+	per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_apicid;
+
+	if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
 		return -ENOMEM;
 	if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
 		return -ENOMEM;

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH] x86/apic/x2apic: Fix parallel handling of cluster_mask
  2021-01-21 14:55       ` Thomas Gleixner
  2021-01-21 15:42         ` David Woodhouse
@ 2021-01-21 19:59         ` David Woodhouse
  2021-02-01 10:36         ` [PATCH] use x86 cpu park to speedup smp_init in kexec situation David Woodhouse
  2 siblings, 0 replies; 21+ messages in thread
From: David Woodhouse @ 2021-01-21 19:59 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, shenkai, Schander, Johanna Amelie, LKML,
	Ingo Molnar, Borislav Petkov, X86 ML, H . Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

From: David Woodhouse <dwmw@amazon.co.uk>

For each CPU being brought up, the alloc_clustermask() function
allocates a new struct cluster_mask just in case it's needed. Then the
target CPU actually runs, and in init_x2apic_ldr() it either uses a
cluster_mask from a previous CPU in the same cluster, or consumes the
"spare" one and sets the global pointer to NULL.

That isn't going to parallelise stunningly well.

Ditch the global variable, let alloc_clustermask() install the struct
*directly* in the per_cpu data for the CPU being brought up. As an
optimisation, actually make it do so for *all* present CPUs in the same
cluster, which means only one iteration over for_each_present_cpu()
instead of doing so repeatedly, once for each CPU.

This was a harmless "bug" while CPU bringup wasn't actually happening in
parallel. It's about to become less harmless...

Fixes: 023a611748fd5 ("x86/apic/x2apic: Simplify cluster management")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 82 ++++++++++++++++-----------
 1 file changed, 49 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index b0889c48a2ac..ee5a9a438780 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -18,7 +18,6 @@ struct cluster_mask {
 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
 static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
 static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
-static struct cluster_mask *cluster_hotplug_mask;
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -98,54 +97,71 @@ static u32 x2apic_calc_apicid(unsigned int cpu)
 static void init_x2apic_ldr(void)
 {
 	struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
-	u32 cluster, apicid = apic_read(APIC_LDR);
-	unsigned int cpu;
 
-	this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+	BUG_ON(!cmsk);
 
-	if (cmsk)
-		goto update;
-
-	cluster = apicid >> 16;
-	for_each_online_cpu(cpu) {
-		cmsk = per_cpu(cluster_masks, cpu);
-		/* Matching cluster found. Link and update it. */
-		if (cmsk && cmsk->clusterid == cluster)
-			goto update;
-	}
-	cmsk = cluster_hotplug_mask;
-	cmsk->clusterid = cluster;
-	cluster_hotplug_mask = NULL;
-update:
-	this_cpu_write(cluster_masks, cmsk);
 	cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
 }
 
-static int alloc_clustermask(unsigned int cpu, int node)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
 {
+	struct cluster_mask *cmsk = NULL;
+	unsigned int cpu_i;
+	u32 apicid;
+
 	if (per_cpu(cluster_masks, cpu))
 		return 0;
-	/*
-	 * If a hotplug spare mask exists, check whether it's on the right
-	 * node. If not, free it and allocate a new one.
+
+	/* For the hotplug case, don't always allocate a new one */
+	for_each_present_cpu(cpu_i) {
+		apicid = apic->cpu_present_to_apicid(cpu_i);
+		if (apicid != BAD_APICID && apicid >> 4 == cluster) {
+			cmsk = per_cpu(cluster_masks, cpu_i);
+			if (cmsk)
+				break;
+		}
+	}
+	if (!cmsk) {
+		cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+	}
+	if (!cmsk)
+		return -ENOMEM;
+
+	cmsk->node = node;
+	cmsk->clusterid = cluster;
+
+	per_cpu(cluster_masks, cpu) = cmsk;
+
+        /*
+	 * As an optimisation during boot, set the cluster_mask for *all*
+	 * present CPUs at once, to prevent *each* of them having to iterate
+	 * over the others to find the existing cluster_mask.
 	 */
-	if (cluster_hotplug_mask) {
-		if (cluster_hotplug_mask->node == node)
-			return 0;
-		kfree(cluster_hotplug_mask);
+	if (system_state < SYSTEM_RUNNING) {
+		for_each_present_cpu(cpu) {
+			u32 apicid = apic->cpu_present_to_apicid(cpu);
+			if (apicid != BAD_APICID && apicid >> 4 == cluster) {
+				struct cluster_mask **cpu_cmsk = &per_cpu(cluster_masks, cpu);
+				if (*cpu_cmsk)
+					BUG_ON(*cpu_cmsk != cmsk);
+				else
+					*cpu_cmsk = cmsk;
+			}
+		}
 	}
 
-	cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
-					    GFP_KERNEL, node);
-	if (!cluster_hotplug_mask)
-		return -ENOMEM;
-	cluster_hotplug_mask->node = node;
 	return 0;
 }
 
 static int x2apic_prepare_cpu(unsigned int cpu)
 {
-	if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+	u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+	u32 cluster = phys_apicid >> 4;
+	u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+
+	per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_apicid;
+
+	if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
 		return -ENOMEM;
 	if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
 		return -ENOMEM;
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] use x86 cpu park to speedup smp_init in kexec situation
  2021-01-21 14:55       ` Thomas Gleixner
  2021-01-21 15:42         ` David Woodhouse
  2021-01-21 19:59         ` [PATCH] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
@ 2021-02-01 10:36         ` David Woodhouse
  2021-02-01 10:38           ` [PATCH 1/6] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
  2 siblings, 1 reply; 21+ messages in thread
From: David Woodhouse @ 2021-02-01 10:36 UTC (permalink / raw)
  To: Thomas Gleixner, Andy Lutomirski, shenkai (D),
	Schander, Johanna 'Mimoja' Amelie
  Cc: LKML, Ingo Molnar, Borislav Petkov, X86 ML, H. Peter Anvin,
	hewenliang4, hushiyuan, luolongjun, hejingxian

On Thu, 2021-01-21 at 15:55 +0100, Thomas Gleixner wrote:
> > Here's the hack we're testing with, for reference. It's kind of ugly
> > but you can see where it's going. Note that the CMOS mangling for the
> > warm reset vector is going to need to be lifted out of the per-cpu
> > loop, and done *once* at startup and torn down once in smp_cpus_done.
> > Except that it also needs to be done before/after a hotplug cpu up;
> > we'll have to come back to that but we've just shifted it to
> > native_smp_cpus_done() for testing for now.
> 
> Right. It's at least a start.

Here's what we have now.

I've refcounted the warm reset vector thing which should fix the
hotplug case, although I need to check it gets torn down in the error
cases correctly.

With the X2APIC global variable thing fixed, the new states can be
immediately before CPUHP_BRINGUP_CPU as we originally wanted. I've
fixed up the bringup_nonboot_cpus() loop to bring an appropriate number
of CPUs to those "CPUHP_BP_PARALLEL_DYN" dynamic parallel pre-bringup
states in parallel.

We spent a while vacillating about how to add the new states, because
of the existing special-case hackery in bringup_cpu() for the
CPUHP_BRINGUP_CPU case.

The irq_lock_sparse() and the idle_thread_get() from there might
actually be needed in *earlier* states for platforms which do parallel
bringup.... so do we add similar wrappers in kernel/cpu.c for *all* of
the pre-bringup states, having hard-coded them? Then let the arch
provide a config symbol for whether it really wants them or not? That
seemed kind of horrid, so I went for the simple option of just letting
the arch register the CPUHP_BP_PARALLEL_DYN states the normal way with
its own functions to be called directly, and the loop in
bringup_nonboot_cpus() can then operate directly on whether they exist
in the state table or not, for which there is precedent already.
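
On the arch side that registration then ends up looking roughly like
this; sketch only, since the x86 half isn't spelled out in this mail
and native_cpu_kick() is a made-up name for the "send INIT/SIPI" part:

	ret = cpuhp_setup_state_nocalls(CPUHP_BP_PARALLEL_DYN, "x86/cpu:kick",
					native_cpu_kick, NULL);
	if (ret < 0)
		pr_err("Failed to register parallel bringup state\n");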

That means I needed to export idle_thread_get() for the pre-bringup
state functions to use too. I'll also want to add the irq_lock_sparse()
into my final patch but frankly, that's the least of my worries about
that patch right now.

It's also pretty much a no-brainer to split up the x86
native_cpu_up() into the four separate phases that I had got separate
timings for previously. We can do that just as a "cleanup" with no
functional change.

So I'm relatively happy at least that far, as preparatory work...

David Woodhouse (6):
      x86/apic/x2apic: Fix parallel handling of cluster_mask
      cpu/hotplug: Add dynamic states before CPUHP_BRINGUP_CPU for parallel bringup
      x86/smpboot: Reference count on smpboot_setup_warm_reset_vector()
      x86/smpboot: Split up native_cpu_up into separate phases
      cpu/hotplug: Move idle_thread_get() to <linux/smpboot.h>

 arch/x86/kernel/apic/x2apic_cluster.c |  82 +++++++++++++++++------------
 arch/x86/kernel/smpboot.c             | 159 ++++++++++++++++++++++++++++++++++----------------------
 include/linux/cpuhotplug.h            |   2 +
 include/linux/smpboot.h               |   7 +++
 kernel/cpu.c                          |  27 +++++++++-
 kernel/smpboot.h                      |   2 -
 6 files changed, 180 insertions(+), 99 deletions(-)

That's the generic part mostly done, and the fun part is where we turn
back to x86 and actually try to split out those four phases of
native_cpu_up() to happen in parallel.

We store initial_stack and initial_gs for "the" AP that is coming up,
in global variables. It turns out that the APs don't like all sharing
the same stack as they come up in parallel, and weird behaviour ensues.

I think the only thing the AP has that can disambiguate it from other
APs is its CPUID, which it can get in its full 32-bit glory from
CPUID.0BH:EDX (and I think we can say we'll do parallel bringup *only*
if that leaf exists on the boot CPU).
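
Roughly, the C-level equivalent of what each AP would need to do first
(the real thing has to live in the trampoline/startup asm; sketch only):

	unsigned int eax = 0x0b, ebx = 0, ecx = 0, edx = 0;

	native_cpuid(&eax, &ebx, &ecx, &edx);
	/* EDX of leaf 0Bh is the full 32-bit x2APIC ID of this CPU */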

So the trampoline code would need to find the logical CPU# and thus the
idle thread stack and per-cpu data with a lookup based on its APICID.
Perhaps just by trawling the various per-cpu data until it finds one
with the right apicid, much like default_cpu_present_to_apicid() does.
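
Conceptually something like the below, though again the real version
can't be plain C like this because it runs before the AP has a usable
stack or %gs; x86_cpu_to_apicid is the existing early per-cpu map:

/* Sketch: map an APIC ID back to its logical CPU number */
static int apicid_to_cpu(u32 apicid)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		if (per_cpu(x86_cpu_to_apicid, cpu) == apicid)
			return cpu;
	}
	return -1;
}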

Oh, and ideally it needs to do this without using a real-mode stack,
because they won't like sharing that *either*.

(Actually they don't seem to mind in practice right now because the
only thing they all use it for is a 'call verify_cpu' and they all
place the *same* return address at the same place on the stack, but it
would be horrid to rely on that on *purpose* :)

So we'll continue to work on that in order to enable the parallel
bringup on x86, unless anyone has any cleverer ideas.

After that we'll get to the TSC sync, which is also not working in
parallel.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/6] x86/apic/x2apic: Fix parallel handling of cluster_mask
  2021-02-01 10:36         ` [PATCH] use x86 cpu park to speedup smp_init in kexec situation David Woodhouse
@ 2021-02-01 10:38           ` David Woodhouse
  2021-02-01 10:38             ` [PATCH 2/6] cpu/hotplug: Add dynamic states before CPUHP_BRINGUP_CPU for parallel bringup David Woodhouse
                               ` (4 more replies)
  0 siblings, 5 replies; 21+ messages in thread
From: David Woodhouse @ 2021-02-01 10:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, shenkai (D),
	mimoja, LKML, Ingo Molnar, Borislav Petkov, X86 ML,
	H . Peter Anvin, hewenliang4, hushiyuan, luolongjun, hejingxian

For each CPU being brought up, the alloc_clustermask() function
allocates a new struct cluster_mask just in case it's needed. Then the
target CPU actually runs, and in init_x2apic_ldr() it either uses a
cluster_mask from a previous CPU in the same cluster, or consumes the
"spare" one and sets the global pointer to NULL.

That isn't going to parallelise stunningly well.

Ditch the global variable, let alloc_clustermask() install the struct
*directly* in the per_cpu data for the CPU being brought up. As an
optimisation, actually make it do so for *all* present CPUs in the same
cluster, which means only one iteration over for_each_present_cpu()
instead of doing so repeatedly, once for each CPU.

This was a harmless "bug" while CPU bringup wasn't actually happening in
parallel. It's about to become less harmless...

Fixes: 023a611748fd5 ("x86/apic/x2apic: Simplify cluster management")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 82 ++++++++++++++++-----------
 1 file changed, 49 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index df6adc5674c9..2afa4609496f 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -18,7 +18,6 @@ struct cluster_mask {
 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
 static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
 static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
-static struct cluster_mask *cluster_hotplug_mask;
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -98,54 +97,71 @@ static u32 x2apic_calc_apicid(unsigned int cpu)
 static void init_x2apic_ldr(void)
 {
 	struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
-	u32 cluster, apicid = apic_read(APIC_LDR);
-	unsigned int cpu;
 
-	this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+	BUG_ON(!cmsk);
 
-	if (cmsk)
-		goto update;
-
-	cluster = apicid >> 16;
-	for_each_online_cpu(cpu) {
-		cmsk = per_cpu(cluster_masks, cpu);
-		/* Matching cluster found. Link and update it. */
-		if (cmsk && cmsk->clusterid == cluster)
-			goto update;
-	}
-	cmsk = cluster_hotplug_mask;
-	cmsk->clusterid = cluster;
-	cluster_hotplug_mask = NULL;
-update:
-	this_cpu_write(cluster_masks, cmsk);
 	cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
 }
 
-static int alloc_clustermask(unsigned int cpu, int node)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
 {
+	struct cluster_mask *cmsk = NULL;
+	unsigned int cpu_i;
+	u32 apicid;
+
 	if (per_cpu(cluster_masks, cpu))
 		return 0;
-	/*
-	 * If a hotplug spare mask exists, check whether it's on the right
-	 * node. If not, free it and allocate a new one.
+
+	/* For the hotplug case, don't always allocate a new one */
+	for_each_present_cpu(cpu_i) {
+		apicid = apic->cpu_present_to_apicid(cpu_i);
+		if (apicid != BAD_APICID && apicid >> 4 == cluster) {
+			cmsk = per_cpu(cluster_masks, cpu_i);
+			if (cmsk)
+				break;
+		}
+	}
+	if (!cmsk) {
+		cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+	}
+	if (!cmsk)
+		return -ENOMEM;
+
+	cmsk->node = node;
+	cmsk->clusterid = cluster;
+
+	per_cpu(cluster_masks, cpu) = cmsk;
+
+        /*
+	 * As an optimisation during boot, set the cluster_mask for *all*
+	 * present CPUs at once, to prevent *each* of them having to iterate
+	 * over the others to find the existing cluster_mask.
 	 */
-	if (cluster_hotplug_mask) {
-		if (cluster_hotplug_mask->node == node)
-			return 0;
-		kfree(cluster_hotplug_mask);
+	if (system_state < SYSTEM_RUNNING) {
+		for_each_present_cpu(cpu) {
+			u32 apicid = apic->cpu_present_to_apicid(cpu);
+			if (apicid != BAD_APICID && apicid >> 4 == cluster) {
+				struct cluster_mask **cpu_cmsk = &per_cpu(cluster_masks, cpu);
+				if (*cpu_cmsk)
+					BUG_ON(*cpu_cmsk != cmsk);
+				else
+					*cpu_cmsk = cmsk;
+			}
+		}
 	}
 
-	cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
-					    GFP_KERNEL, node);
-	if (!cluster_hotplug_mask)
-		return -ENOMEM;
-	cluster_hotplug_mask->node = node;
 	return 0;
 }
 
 static int x2apic_prepare_cpu(unsigned int cpu)
 {
-	if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+	u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+	u32 cluster = phys_apicid >> 4;
+	u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+
+	per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_apicid;
+
+	if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
 		return -ENOMEM;
 	if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
 		return -ENOMEM;
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/6] cpu/hotplug: Add dynamic states before CPUHP_BRINGUP_CPU for parallel bringup
  2021-02-01 10:38           ` [PATCH 1/6] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
@ 2021-02-01 10:38             ` David Woodhouse
  2021-02-01 10:38             ` [PATCH 3/6] x86/smpboot: Reference count on smpboot_setup_warm_reset_vector() David Woodhouse
                               ` (3 subsequent siblings)
  4 siblings, 0 replies; 21+ messages in thread
From: David Woodhouse @ 2021-02-01 10:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, shenkai (D),
	mimoja, LKML, Ingo Molnar, Borislav Petkov, X86 ML,
	H . Peter Anvin, hewenliang4, hushiyuan, luolongjun, hejingxian

If the platform registers these states, bring all CPUs to each registered
state in parallel, before the final bringup to CPUHP_BRINGUP_CPU.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 include/linux/cpuhotplug.h |  2 ++
 kernel/cpu.c               | 27 +++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0042ef362511..ae07cd1dafe3 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -92,6 +92,8 @@ enum cpuhp_state {
 	CPUHP_MIPS_SOC_PREPARE,
 	CPUHP_BP_PREPARE_DYN,
 	CPUHP_BP_PREPARE_DYN_END		= CPUHP_BP_PREPARE_DYN + 20,
+	CPUHP_BP_PARALLEL_DYN,
+	CPUHP_BP_PARALLEL_DYN_END		= CPUHP_BP_PARALLEL_DYN + 4,
 	CPUHP_BRINGUP_CPU,
 	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 4e11e91010e1..11f9504b4845 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1335,6 +1335,24 @@ int bringup_hibernate_cpu(unsigned int sleep_cpu)
 void bringup_nonboot_cpus(unsigned int setup_max_cpus)
 {
 	unsigned int cpu;
+	int n = setup_max_cpus - num_online_cpus();
+
+	/* ∀ parallel pre-bringup state, bring N CPUs to it */
+	if (n > 0) {
+		enum cpuhp_state st = CPUHP_BP_PARALLEL_DYN;
+
+		while (st <= CPUHP_BP_PARALLEL_DYN_END &&
+		       cpuhp_hp_states[st].name) {
+			int i = n;
+
+			for_each_present_cpu(cpu) {
+				cpu_up(cpu, st);
+				if (!--i)
+					break;
+			}
+			st++;
+		}
+	}
 
 	for_each_present_cpu(cpu) {
 		if (num_online_cpus() >= setup_max_cpus)
@@ -1702,6 +1720,10 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
 		step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
 		end = CPUHP_BP_PREPARE_DYN_END;
 		break;
+	case CPUHP_BP_PARALLEL_DYN:
+		step = cpuhp_hp_states + CPUHP_BP_PARALLEL_DYN;
+		end = CPUHP_BP_PARALLEL_DYN_END;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1726,14 +1748,15 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
 	/*
 	 * If name is NULL, then the state gets removed.
 	 *
-	 * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on
+	 * CPUHP_AP_ONLINE_DYN and CPUHP_BP_xxxxx_DYN are handed out on
 	 * the first allocation from these dynamic ranges, so the removal
 	 * would trigger a new allocation and clear the wrong (already
 	 * empty) state, leaving the callbacks of the to be cleared state
 	 * dangling, which causes wreckage on the next hotplug operation.
 	 */
 	if (name && (state == CPUHP_AP_ONLINE_DYN ||
-		     state == CPUHP_BP_PREPARE_DYN)) {
+		     state == CPUHP_BP_PREPARE_DYN ||
+		     state == CPUHP_BP_PARALLEL_DYN)) {
 		ret = cpuhp_reserve_state(state);
 		if (ret < 0)
 			return ret;
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 3/6] x86/smpboot: Reference count on smpboot_setup_warm_reset_vector()
  2021-02-01 10:38           ` [PATCH 1/6] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
  2021-02-01 10:38             ` [PATCH 2/6] cpu/hotplug: Add dynamic states before CPUHP_BRINGUP_CPU for parallel bringup David Woodhouse
@ 2021-02-01 10:38             ` David Woodhouse
  2021-02-01 10:38             ` [PATCH 4/6] x86/smpboot: Split up native_cpu_up into separate phases David Woodhouse
                               ` (2 subsequent siblings)
  4 siblings, 0 replies; 21+ messages in thread
From: David Woodhouse @ 2021-02-01 10:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, shenkai (D),
	mimoja, LKML, Ingo Molnar, Borislav Petkov, X86 ML,
	H . Peter Anvin, hewenliang4, hushiyuan, luolongjun, hejingxian

If we want to do parallel CPU bringup, we're going to need to set this up
and leave it until all CPUs are done. Might as well use the RTC spinlock
to protect the refcount, as we need to take it anyway.
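
The nesting this buys under parallel bringup, distilled into a runnable
sketch (the printf() calls stand in for the CMOS and trampoline writes;
the rtc_lock protection is omitted here):

	#include <stdio.h>

	static unsigned int warm_reset_users;

	static void setup_vector(void)
	{
		if (!warm_reset_users++)
			printf("program warm reset vector\n");
	}

	static void restore_vector(void)
	{
		if (!--warm_reset_users)
			printf("clear warm reset vector\n");
	}

	int main(void)
	{
		setup_vector();		/* CPU 1 kicked */
		setup_vector();		/* CPU 2 kicked */
		restore_vector();	/* CPU 1 done: vector must stay   */
		restore_vector();	/* CPU 2 done: vector cleared now */
		return 0;
	}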

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kernel/smpboot.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 117e24fbfd8a..bec0059d3d3d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -125,17 +125,22 @@ int arch_update_cpu_topology(void)
 	return retval;
 }
 
+
+static unsigned int smpboot_warm_reset_vector_count;
+
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&rtc_lock, flags);
-	CMOS_WRITE(0xa, 0xf);
+	if (!smpboot_warm_reset_vector_count++) {
+		CMOS_WRITE(0xa, 0xf);
+		*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+			start_eip >> 4;
+		*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+			start_eip & 0xf;
+	}
 	spin_unlock_irqrestore(&rtc_lock, flags);
-	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
-							start_eip >> 4;
-	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
-							start_eip & 0xf;
 }
 
 static inline void smpboot_restore_warm_reset_vector(void)
@@ -147,10 +152,12 @@ static inline void smpboot_restore_warm_reset_vector(void)
 	 * to default values.
 	 */
 	spin_lock_irqsave(&rtc_lock, flags);
-	CMOS_WRITE(0, 0xf);
-	spin_unlock_irqrestore(&rtc_lock, flags);
+	if (!--smpboot_warm_reset_vector_count) {
+		CMOS_WRITE(0, 0xf);
 
-	*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
+		*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
+	}
+	spin_unlock_irqrestore(&rtc_lock, flags);
 }
 
 static void init_freq_invariance(bool secondary, bool cppc_ready);
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 4/6] x86/smpboot: Split up native_cpu_up into separate phases
  2021-02-01 10:38           ` [PATCH 1/6] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
  2021-02-01 10:38             ` [PATCH 2/6] cpu/hotplug: Add dynamic states before CPUHP_BRINGUP_CPU for parallel bringup David Woodhouse
  2021-02-01 10:38             ` [PATCH 3/6] x86/smpboot: Reference count on smpboot_setup_warm_reset_vector() David Woodhouse
@ 2021-02-01 10:38             ` David Woodhouse
  2021-02-01 10:38             ` [PATCH 5/6] cpu/hotplug: Move idle_thread_get() to <linux/smpboot.h> David Woodhouse
  2021-02-01 10:38             ` [PATCH 6/6] pre states for x86 David Woodhouse
  4 siblings, 0 replies; 21+ messages in thread
From: David Woodhouse @ 2021-02-01 10:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, shenkai (D),
	mimoja, LKML, Ingo Molnar, Borislav Petkov, X86 ML,
	H . Peter Anvin, hewenliang4, hushiyuan, luolongjun, hejingxian

There are four logical parts to what native_cpu_up() does.

First, it actually wakes the AP.

Second, it waits for the AP to make it as far as wait_for_master_cpu(),
which sets that CPU's bit in cpu_initialized_mask; the waiting side
then sets the bit in cpu_callout_mask to let the AP proceed through
cpu_init().

Then, it waits for the AP to finish cpu_init() and get as far as the
smp_callin() call, which sets that CPU's bit in cpu_callin_mask.

Finally, it does the TSC synchronization and waits for the AP to actually
mark itself online in cpu_online_mask.

This commit should have no behavioural change; it merely splits those
phases out into separate functions so that future commits can make them
happen in parallel for all APs.
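
With the phases split out like this, a follow-up change can kick every
AP first and only reap them afterwards, so the INIT/SIPI and startup
delays overlap. Roughly (sketch only; error handling and the
setup_max_cpus limit are omitted, and idle_thread_get() is only exposed
to arch code in a later patch):

	for_each_present_cpu(cpu)
		do_cpu_up(cpu, idle_thread_get(cpu));

	for_each_present_cpu(cpu)
		do_wait_cpu_initialized(cpu);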

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kernel/smpboot.c | 136 +++++++++++++++++++++++---------------
 1 file changed, 82 insertions(+), 54 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bec0059d3d3d..649b8236309b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1039,9 +1039,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 {
 	/* start_ip had better be page-aligned! */
 	unsigned long start_ip = real_mode_header->trampoline_start;
-
 	unsigned long boot_error = 0;
-	unsigned long timeout;
 
 	idle->thread.sp = (unsigned long)task_pt_regs(idle);
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
@@ -1094,55 +1092,70 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 		boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
 						     cpu0_nmi_registered);
 
-	if (!boot_error) {
-		/*
-		 * Wait 10s total for first sign of life from AP
-		 */
-		boot_error = -1;
-		timeout = jiffies + 10*HZ;
-		while (time_before(jiffies, timeout)) {
-			if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
-				/*
-				 * Tell AP to proceed with initialization
-				 */
-				cpumask_set_cpu(cpu, cpu_callout_mask);
-				boot_error = 0;
-				break;
-			}
-			schedule();
-		}
-	}
+	return boot_error;
+}
 
-	if (!boot_error) {
-		/*
-		 * Wait till AP completes initial initialization
-		 */
-		while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
-			/*
-			 * Allow other tasks to run while we wait for the
-			 * AP to come online. This also gives a chance
-			 * for the MTRR work(triggered by the AP coming online)
-			 * to be completed in the stop machine context.
-			 */
-			schedule();
-		}
+static int do_wait_cpu_cpumask(unsigned int cpu, const struct cpumask *mask)
+{
+	unsigned long timeout;
+
+	/*
+	 * Wait up to 10s for the CPU to report in.
+	 */
+	timeout = jiffies + 10*HZ;
+	while (time_before(jiffies, timeout)) {
+		if (cpumask_test_cpu(cpu, mask))
+			return 0;
+
+		schedule();
 	}
+	return -1;
+}
 
-	if (x86_platform.legacy.warm_reset) {
-		/*
-		 * Cleanup possible dangling ends...
-		 */
-		smpboot_restore_warm_reset_vector();
+static int do_wait_cpu_initialized(unsigned int cpu)
+{
+	/*
+	 * Wait for first sign of life from AP.
+	 */
+	if (do_wait_cpu_cpumask(cpu, cpu_initialized_mask))
+		return -1;
+
+	cpumask_set_cpu(cpu, cpu_callout_mask);
+	return 0;
+}
+
+static int do_wait_cpu_callin(unsigned int cpu)
+{
+	/*
+	 * Wait till AP completes initial initialization.
+	 */
+	return do_wait_cpu_cpumask(cpu, cpu_callin_mask);
+}
+
+static int do_wait_cpu_online(unsigned int cpu)
+{
+	unsigned long flags;
+
+	/*
+	 * Check TSC synchronization with the AP (keep irqs disabled
+	 * while doing so):
+	 */
+	local_irq_save(flags);
+	check_tsc_sync_source(cpu);
+	local_irq_restore(flags);
+
+	while (!cpu_online(cpu)) {
+		cpu_relax();
+		touch_nmi_watchdog();
 	}
 
-	return boot_error;
+	return 0;
 }
 
-int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
+int do_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int apicid = apic->cpu_present_to_apicid(cpu);
 	int cpu0_nmi_registered = 0;
-	unsigned long flags;
 	int err, ret = 0;
 
 	lockdep_assert_irqs_enabled();
@@ -1189,19 +1202,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 		goto unreg_nmi;
 	}
 
-	/*
-	 * Check TSC synchronization with the AP (keep irqs disabled
-	 * while doing so):
-	 */
-	local_irq_save(flags);
-	check_tsc_sync_source(cpu);
-	local_irq_restore(flags);
-
-	while (!cpu_online(cpu)) {
-		cpu_relax();
-		touch_nmi_watchdog();
-	}
-
 unreg_nmi:
 	/*
 	 * Clean up the nmi handler. Do this after the callin and callout sync
@@ -1213,6 +1213,34 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	return ret;
 }
 
+int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+	int ret;
+
+	ret = do_cpu_up(cpu, tidle);
+	if (ret)
+		return ret;
+
+	ret = do_wait_cpu_initialized(cpu);
+	if (ret)
+		return ret;
+
+	ret = do_wait_cpu_callin(cpu);
+	if (ret)
+		return ret;
+
+	ret = do_wait_cpu_online(cpu);
+
+	if (x86_platform.legacy.warm_reset) {
+		/*
+		 * Cleanup possible dangling ends...
+		 */
+		smpboot_restore_warm_reset_vector();
+	}
+
+	return ret;
+}
+
 /**
  * arch_disable_smp_support() - disables SMP support for x86 at runtime
  */
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 5/6] cpu/hotplug: Move idle_thread_get() to <linux/smpboot.h>
  2021-02-01 10:38           ` [PATCH 1/6] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
                               ` (2 preceding siblings ...)
  2021-02-01 10:38             ` [PATCH 4/6] x86/smpboot: Split up native_cpu_up into separate phases David Woodhouse
@ 2021-02-01 10:38             ` David Woodhouse
  2021-02-01 10:38             ` [PATCH 6/6] pre states for x86 David Woodhouse
  4 siblings, 0 replies; 21+ messages in thread
From: David Woodhouse @ 2021-02-01 10:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, shenkai (D),
	mimoja, LKML, Ingo Molnar, Borislav Petkov, X86 ML,
	H . Peter Anvin, hewenliang4, hushiyuan, luolongjun, hejingxian

Instead of relying purely on the special-case wrapper in bringup_cpu()
to pass the idle thread to __cpu_up(), expose idle_thread_get() so that
the architecture code can obtain it directly when necessary.

This will be useful when the existing __cpu_up() is split into multiple
phases, only *one* of which will actually need the idle thread.

If the architecture code is to register its new pre-bringup states with
the cpuhp core, having a special-case wrapper to pass extra arguments is
non-trivial; it's easier just to let the arch register a function
pointer that is invoked with the standard cpuhp callback API.
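
The last patch in this series then uses it from a plain cpuhp callback,
roughly:

	int native_cpu_kick(unsigned int cpu)
	{
		return do_cpu_up(cpu, idle_thread_get(cpu));
	}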

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 include/linux/smpboot.h | 7 +++++++
 kernel/smpboot.h        | 2 --
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index 9d1bc65d226c..3862addcaa34 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -5,6 +5,13 @@
 #include <linux/types.h>
 
 struct task_struct;
+
+#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
+struct task_struct *idle_thread_get(unsigned int cpu);
+#else
+static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
+#endif
+
 /* Cookie handed to the thread_fn*/
 struct smpboot_thread_data;
 
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 34dd3d7ba40b..60c609318ad6 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -5,11 +5,9 @@
 struct task_struct;
 
 #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
-struct task_struct *idle_thread_get(unsigned int cpu);
 void idle_thread_set_boot_cpu(void);
 void idle_threads_init(void);
 #else
-static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
 static inline void idle_thread_set_boot_cpu(void) { }
 static inline void idle_threads_init(void) { }
 #endif
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 6/6] pre states for x86
  2021-02-01 10:38           ` [PATCH 1/6] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
                               ` (3 preceding siblings ...)
  2021-02-01 10:38             ` [PATCH 5/6] cpu/hotplug: Move idle_thread_get() to <linux/smpboot.h> David Woodhouse
@ 2021-02-01 10:38             ` David Woodhouse
  4 siblings, 0 replies; 21+ messages in thread
From: David Woodhouse @ 2021-02-01 10:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, shenkai (D),
	mimoja, LKML, Ingo Molnar, Borislav Petkov, X86 ML,
	H . Peter Anvin, hewenliang4, hushiyuan, luolongjun, hejingxian

Utterly broken because the bringup uses global initial_stack and initial_gs
variables, and the TSC sync is similarly hosed (I should probably do one
at a time).

The bringup is going to be the most fun to fix, because the AP coming up
doesn't actually have a lot that it *can* use to disambiguate itself from
the others. Starting them at different locations is mostly a non-starter
as we can only specify a page address under 1MiB and there are only 256
of those even if we could allocate them *all* to start different CPUs.
I think we need to get them to find their own logical CPU# from their
APICID, perhaps by trawling the per_cpu x86_cpu_to_apicid or some other
means, then find their initial stack / %gs from that.

We also need to work out whether we can eliminate the real-mode stack
for the trampoline, or perhaps do some locking if we really must share
it.
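
A C-level sketch of that lookup (illustration only; the real thing has
to run in the startup/trampoline path before a stack or per-CPU %gs
exists, so it would be assembly, and it assumes the AP has already read
its own APIC ID, e.g. via CPUID):

	static int find_cpu_by_apicid(unsigned int apicid)
	{
		int cpu;

		for_each_possible_cpu(cpu) {
			if (early_per_cpu(x86_cpu_to_apicid, cpu) == apicid)
				return cpu;
		}
		return -1;
	}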
---
 arch/x86/kernel/smpboot.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 649b8236309b..03f63027fdad 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -57,6 +57,7 @@
 #include <linux/pgtable.h>
 #include <linux/overflow.h>
 #include <linux/syscore_ops.h>
+#include <linux/smpboot.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -1217,14 +1218,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int ret;
 
-	ret = do_cpu_up(cpu, tidle);
-	if (ret)
-		return ret;
-
-	ret = do_wait_cpu_initialized(cpu);
-	if (ret)
-		return ret;
-
 	ret = do_wait_cpu_callin(cpu);
 	if (ret)
 		return ret;
@@ -1241,6 +1234,16 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	return ret;
 }
 
+int native_cpu_kick(unsigned int cpu)
+{
+	return do_cpu_up(cpu, idle_thread_get(cpu));
+}
+
+int native_cpu_wait_init(unsigned int cpu)
+{
+	return do_wait_cpu_initialized(cpu);
+}
+
 /**
  * arch_disable_smp_support() - disables SMP support for x86 at runtime
  */
@@ -1412,6 +1415,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	smp_quirk_init_udelay();
 
 	speculative_store_bypass_ht_init();
+
+	cpuhp_setup_state_nocalls(CPUHP_BP_PARALLEL_DYN, "x86/cpu:kick",
+				  native_cpu_kick, NULL);
+	cpuhp_setup_state_nocalls(CPUHP_BP_PARALLEL_DYN, "x86/cpu:wait-init",
+				  native_cpu_wait_init, NULL);
 }
 
 void arch_thaw_secondary_cpus_begin(void)
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2021-02-01 10:43 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-15 14:46 [PATCH] use x86 cpu park to speedup smp_init in kexec situation shenkai (D)
2020-12-15 16:31 ` Andy Lutomirski
2020-12-15 21:20   ` Thomas Gleixner
2020-12-16  8:45     ` shenkai (D)
2020-12-16 10:12       ` Thomas Gleixner
2020-12-16 14:18         ` shenkai (D)
2020-12-16 15:31           ` Thomas Gleixner
2020-12-17 14:53             ` shenkai (D)
2021-01-07 15:18             ` David Woodhouse
2021-01-19 12:12     ` David Woodhouse
2021-01-21 14:55       ` Thomas Gleixner
2021-01-21 15:42         ` David Woodhouse
2021-01-21 17:34           ` David Woodhouse
2021-01-21 19:59         ` [PATCH] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
2021-02-01 10:36         ` [PATCH] use x86 cpu park to speedup smp_init in kexec situation David Woodhouse
2021-02-01 10:38           ` [PATCH 1/6] x86/apic/x2apic: Fix parallel handling of cluster_mask David Woodhouse
2021-02-01 10:38             ` [PATCH 2/6] cpu/hotplug: Add dynamic states before CPUHP_BRINGUP_CPU for parallel bringup David Woodhouse
2021-02-01 10:38             ` [PATCH 3/6] x86/smpboot: Reference count on smpboot_setup_warm_reset_vector() David Woodhouse
2021-02-01 10:38             ` [PATCH 4/6] x86/smpboot: Split up native_cpu_up into separate phases David Woodhouse
2021-02-01 10:38             ` [PATCH 5/6] cpu/hotplug: Move idle_thread_get() to <linux/smpboot.h> David Woodhouse
2021-02-01 10:38             ` [PATCH 6/6] pre states for x86 David Woodhouse
