From: "Jan Beulich" <JBeulich@suse.com>
To: xen-devel <xen-devel@lists.xenproject.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>,
	Wei Liu <wei.liu2@citrix.com>,
	George Dunlap <George.Dunlap@eu.citrix.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Ian Jackson <Ian.Jackson@eu.citrix.com>, Tim Deegan <tim@xen.org>,
	Julien Grall <julien.grall@arm.com>
Subject: [PATCH 2/8] x86: distinguish CPU offlining from CPU removal
Date: Wed, 11 Jul 2018 06:06:04 -0600
Message-ID: <5B45F2AC02000078001D3133@prv1-mh.provo.novell.com>
In-Reply-To: <5B45F01B02000078001D30FF@prv1-mh.provo.novell.com>

In order to be able to service #MC on offlined CPUs, GDT, IDT, stack,
and per-CPU data (which includes the TSS) need to be kept allocated.
They should only be freed upon CPU removal (which we currently don't
support, so some code is becoming effectively dead for the moment).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

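Note: the notifier adjustments below all follow one pattern, sketched here
for clarity. This sketch is illustrative only and not part of the patch;
free_resources() is a hypothetical stand-in for the respective subsystem's
cleanup routine:

    switch ( action )
    {
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        /* Parked CPUs keep their resources (e.g. for servicing #MC). */
        if ( !park_offline_cpus )
            free_resources(cpu);
        break;

    case CPU_REMOVE:
        /* Physical removal: resources can finally be freed. */
        if ( park_offline_cpus )
            free_resources(cpu);
        break;
    }

x2apic's update_clusterinfo() folds the two cases into a single early exit,
park_offline_cpus == (action != CPU_REMOVE), which holds (and hence skips the
freeing) exactly when offline CPUs are parked at CPU_UP_CANCELED/CPU_DEAD
time, or not parked at CPU_REMOVE time.
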
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -692,12 +692,15 @@ static void cpu_bank_free(unsigned int c
 
     mcabanks_free(poll);
     mcabanks_free(clr);
+
+    per_cpu(poll_bankmask, cpu) = NULL;
+    per_cpu(mce_clear_banks, cpu) = NULL;
 }
 
 static int cpu_bank_alloc(unsigned int cpu)
 {
-    struct mca_banks *poll = mcabanks_alloc();
-    struct mca_banks *clr = mcabanks_alloc();
+    struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc();
+    struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc();
 
     if ( !poll || !clr )
     {
@@ -725,7 +728,13 @@ static int cpu_callback(
 
     case CPU_UP_CANCELED:
     case CPU_DEAD:
-        cpu_bank_free(cpu);
+        if ( !park_offline_cpus )
+            cpu_bank_free(cpu);
+        break;
+
+    case CPU_REMOVE:
+        if ( park_offline_cpus )
+            cpu_bank_free(cpu);
         break;
     }
 
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -107,10 +107,11 @@ static void play_dead(void)
     local_irq_disable();
 
     /*
-     * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
-     * as they may be freed at any time. In this case, heap corruption or
-     * #PF can occur (when heap debugging is enabled). For example, even
-     * printk() can involve tasklet scheduling, which touches per-cpu vars.
+     * NOTE: After cpu_exit_clear, per-cpu variables may no longer be accessible,
+     * as they may be freed at any time if offline CPUs don't get parked. In
+     * this case, heap corruption or #PF can occur (when heap debugging is
+     * enabled). For example, even printk() can involve tasklet scheduling,
+     * which touches per-cpu vars.
      * 
      * Consider very carefully when adding code to *dead_idle. Most hypervisor
      * subsystems are unsafe to call.
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -201,18 +201,25 @@ static int update_clusterinfo(
         if ( !cluster_cpus_spare )
             cluster_cpus_spare = xzalloc(cpumask_t);
         if ( !cluster_cpus_spare ||
-             !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
+             !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
             err = -ENOMEM;
         break;
     case CPU_UP_CANCELED:
     case CPU_DEAD:
+    case CPU_REMOVE:
+        if ( park_offline_cpus == (action != CPU_REMOVE) )
+            break;
         if ( per_cpu(cluster_cpus, cpu) )
         {
             cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
             if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
+            {
                 xfree(per_cpu(cluster_cpus, cpu));
+                per_cpu(cluster_cpus, cpu) = NULL;
+            }
         }
         free_cpumask_var(per_cpu(scratch_mask, cpu));
+        clear_cpumask_var(&per_cpu(scratch_mask, cpu));
         break;
     }
 
--- a/xen/arch/x86/percpu.c
+++ b/xen/arch/x86/percpu.c
@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int
     char *p;
 
     if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA )
-        return -EBUSY;
+        return 0;
 
     if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL )
         return -ENOMEM;
@@ -76,9 +76,12 @@ static int cpu_percpu_callback(
         break;
     case CPU_UP_CANCELED:
     case CPU_DEAD:
-        free_percpu_area(cpu);
+        if ( !park_offline_cpus )
+            free_percpu_area(cpu);
         break;
-    default:
+    case CPU_REMOVE:
+        if ( park_offline_cpus )
+            free_percpu_area(cpu);
         break;
     }
 
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask;
 cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
+bool __read_mostly park_offline_cpus;
+
 unsigned int __read_mostly nr_sockets;
 cpumask_t **__read_mostly socket_cpumask;
 static cpumask_t *secondary_socket_cpumask;
@@ -887,7 +889,7 @@ static void cleanup_cpu_root_pgt(unsigne
     }
 }
 
-static void cpu_smpboot_free(unsigned int cpu)
+static void cpu_smpboot_free(unsigned int cpu, bool all)
 {
     unsigned int order, socket = cpu_to_socket(cpu);
     struct cpuinfo_x86 *c = cpu_data;
@@ -898,15 +900,24 @@ static void cpu_smpboot_free(unsigned in
         socket_cpumask[socket] = NULL;
     }
 
-    c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
-    c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
-    c[cpu].compute_unit_id = INVALID_CUID;
     cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);
 
-    free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
-    free_cpumask_var(per_cpu(cpu_core_mask, cpu));
-    if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
-        free_cpumask_var(per_cpu(scratch_cpumask, cpu));
+    if ( all )
+    {
+        c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
+        c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
+        c[cpu].compute_unit_id = INVALID_CUID;
+
+        free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
+        clear_cpumask_var(&per_cpu(cpu_sibling_mask, cpu));
+        free_cpumask_var(per_cpu(cpu_core_mask, cpu));
+        clear_cpumask_var(&per_cpu(cpu_core_mask, cpu));
+        if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
+        {
+            free_cpumask_var(per_cpu(scratch_cpumask, cpu));
+            clear_cpumask_var(&per_cpu(scratch_cpumask, cpu));
+        }
+    }
 
     cleanup_cpu_root_pgt(cpu);
 
@@ -928,19 +939,26 @@ static void cpu_smpboot_free(unsigned in
     }
 
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
-    free_xenheap_pages(per_cpu(gdt_table, cpu), order);
+    if ( all )
+    {
+        free_xenheap_pages(per_cpu(gdt_table, cpu), order);
+        per_cpu(gdt_table, cpu) = NULL;
+    }
 
     free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order);
 
-    order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
-    free_xenheap_pages(idt_tables[cpu], order);
-    idt_tables[cpu] = NULL;
-
-    if ( stack_base[cpu] != NULL )
+    if ( all )
     {
-        memguard_unguard_stack(stack_base[cpu]);
-        free_xenheap_pages(stack_base[cpu], STACK_ORDER);
-        stack_base[cpu] = NULL;
+        order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
+        free_xenheap_pages(idt_tables[cpu], order);
+        idt_tables[cpu] = NULL;
+
+        if ( stack_base[cpu] != NULL )
+        {
+            memguard_unguard_stack(stack_base[cpu]);
+            free_xenheap_pages(stack_base[cpu], STACK_ORDER);
+            stack_base[cpu] = NULL;
+        }
     }
 }
 
@@ -955,15 +973,19 @@ static int cpu_smpboot_alloc(unsigned in
     if ( node != NUMA_NO_NODE )
         memflags = MEMF_node(node);
 
-    stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
+    if ( stack_base[cpu] == NULL )
+        stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
     if ( stack_base[cpu] == NULL )
         goto out;
     memguard_guard_stack(stack_base[cpu]);
 
     order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
-    per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
+    gdt = per_cpu(gdt_table, cpu);
+    if ( gdt == NULL )
+        gdt = alloc_xenheap_pages(order, memflags);
     if ( gdt == NULL )
         goto out;
+    per_cpu(gdt_table, cpu) = gdt;
     memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
     BUILD_BUG_ON(NR_CPUS > 0x10000);
     gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
@@ -975,7 +997,8 @@ static int cpu_smpboot_alloc(unsigned in
     gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
 
     order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
-    idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
+    if ( idt_tables[cpu] == NULL )
+        idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
     if ( idt_tables[cpu] == NULL )
         goto out;
     memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
@@ -1003,16 +1026,16 @@ static int cpu_smpboot_alloc(unsigned in
          (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
         goto out;
 
-    if ( !(zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
-           zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
-           alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
+    if ( !(cond_zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
+           cond_zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
+           cond_alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu))) )
         goto out;
 
     rc = 0;
 
  out:
     if ( rc )
-        cpu_smpboot_free(cpu);
+        cpu_smpboot_free(cpu, true);
 
     return rc;
 }
@@ -1030,9 +1053,10 @@ static int cpu_smpboot_callback(
         break;
     case CPU_UP_CANCELED:
     case CPU_DEAD:
-        cpu_smpboot_free(cpu);
+        cpu_smpboot_free(cpu, !park_offline_cpus);
         break;
-    default:
+    case CPU_REMOVE:
+        cpu_smpboot_free(cpu, true);
         break;
     }
 
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -26,6 +26,8 @@ DECLARE_PER_CPU(cpumask_var_t, cpu_sibli
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
 DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
 
+extern bool park_offline_cpus;
+
 void smp_send_nmi_allbutself(void);
 
 void send_IPI_mask(const cpumask_t *, int vector);
--- a/xen/include/xen/cpu.h
+++ b/xen/include/xen/cpu.h
@@ -47,6 +47,8 @@ void register_cpu_notifier(struct notifi
 #define CPU_DYING        (0x0007 | NOTIFY_REVERSE)
 /* CPU_DEAD: CPU is dead. */
 #define CPU_DEAD         (0x0008 | NOTIFY_REVERSE)
+/* CPU_REMOVE: CPU was removed. */
+#define CPU_REMOVE       (0x0009 | NOTIFY_REVERSE)
 
 /* Perform CPU hotplug. May return -EAGAIN. */
 int cpu_down(unsigned int cpu);
--- a/xen/include/xen/cpumask.h
+++ b/xen/include/xen/cpumask.h
@@ -351,16 +351,37 @@ static inline bool_t alloc_cpumask_var(c
 	return *mask != NULL;
 }
 
+static inline bool cond_alloc_cpumask_var(cpumask_var_t *mask)
+{
+	if (*mask == NULL)
+		*mask = _xmalloc(nr_cpumask_bits / 8, sizeof(long));
+	return *mask != NULL;
+}
+
 static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
 {
 	*(void **)mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
 	return *mask != NULL;
 }
 
+static inline bool cond_zalloc_cpumask_var(cpumask_var_t *mask)
+{
+	if (*mask == NULL)
+		*mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long));
+	else
+		cpumask_clear(*mask);
+	return *mask != NULL;
+}
+
 static inline void free_cpumask_var(cpumask_var_t mask)
 {
 	xfree(mask);
 }
+
+static inline void clear_cpumask_var(cpumask_var_t *mask)
+{
+	*mask = NULL;
+}
 #else
 typedef cpumask_t cpumask_var_t[1];
 
@@ -368,16 +389,22 @@ static inline bool_t alloc_cpumask_var(c
 {
 	return 1;
 }
+#define cond_alloc_cpumask_var alloc_cpumask_var
 
 static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
 {
 	cpumask_clear(*mask);
 	return 1;
 }
+#define cond_zalloc_cpumask_var zalloc_cpumask_var
 
 static inline void free_cpumask_var(cpumask_var_t mask)
 {
 }
+
+static inline void clear_cpumask_var(cpumask_var_t *mask)
+{
+}
 #endif
 
 #if NR_CPUS > 1





Thread overview: 39+ messages
2018-07-11 11:55 [PATCH 0/8] x86: (allow to) suppress use of hyper-threading Jan Beulich
2018-07-11 12:04 ` [PATCH 1/8] cpupools: fix state when downing a CPU failed Jan Beulich
2018-07-11 12:06 ` Jan Beulich [this message]
2018-07-12 10:53   ` [PATCH 2/8] x86: distinguish CPU offlining from CPU removal Wei Liu
2018-07-12 11:48     ` Jan Beulich
2018-07-13  8:39       ` Wei Liu
2018-07-12 12:42   ` Andrew Cooper
2018-07-11 12:06 ` [PATCH 3/8] allow cpu_down() to be called earlier Jan Beulich
2018-07-12 10:55   ` Wei Liu
2018-07-12 12:44   ` Andrew Cooper
2018-07-11 12:07 ` [PATCH 4/8] x86/AMD: distinguish compute units from hyper-threads Jan Beulich
2018-07-11 18:11   ` Brian Woods
2018-07-12 13:02   ` Andrew Cooper
2018-07-12 14:22     ` Jan Beulich
2018-07-11 12:09 ` [PATCH 5/8] x86: bring up all CPUs even if not all are supposed to be used Jan Beulich
2018-07-12 15:38   ` Andrew Cooper
2018-07-13  8:11     ` Jan Beulich
2018-07-11 12:10 ` [PATCH 6/8] x86: (command line option to) avoid use of secondary hyper-threads Jan Beulich
2018-07-12 15:45   ` Andrew Cooper
2018-07-13  8:13     ` Jan Beulich
2018-07-16 12:37       ` Andrew Cooper
2018-07-16 12:53         ` Jan Beulich
2018-07-16 13:01           ` Andrew Cooper
2018-07-11 12:11 ` [PATCH 7/8] x86/shim: fully ignore "nosmp" and "maxcpus=" Jan Beulich
2018-07-11 12:23   ` Andrew Cooper
2018-07-11 15:18   ` Roger Pau Monné
2018-07-11 16:02   ` Wei Liu
2018-07-11 12:12 ` [PATCH 8/8] cpumask: tidy {,z}alloc_cpumask_var() Jan Beulich
2018-07-11 12:20   ` Andrew Cooper
2018-07-12 15:13   ` Wei Liu
     [not found] ` <5B45F26A02000078001D312F@suse.com>
2018-07-13  9:02   ` [PATCH 1/8] cpupools: fix state when downing a CPU failed Juergen Gross
2018-07-16  9:17     ` Jan Beulich
     [not found]     ` <5B4C629002000078001D4346@suse.com>
2018-07-16 11:47       ` Juergen Gross
2018-07-16 12:19         ` Jan Beulich
     [not found]         ` <5B4C8D3702000078001D45EA@suse.com>
2018-07-16 12:47           ` Juergen Gross
2018-07-16 13:01             ` Jan Beulich
     [not found]             ` <5B4C973D02000078001D4693@suse.com>
2018-07-16 14:21               ` Juergen Gross
2018-07-16 14:26                 ` Jan Beulich
     [not found]                 ` <5B4CAB1202000078001D47BC@suse.com>
2018-07-16 14:53                   ` Juergen Gross
