From: Denys Vlasenko <dvlasenk@redhat.com>
To: Ingo Molnar <mingo@kernel.org>
Cc: Denys Vlasenko <dvlasenk@redhat.com>,
	"H. Peter Anvin" <hpa@zytor.com>,
	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>,
	Boris Ostrovsky <boris.ostrovsky@oracle.com>,
	David Vrabel <david.vrabel@citrix.com>,
	Joerg Roedel <joro@8bytes.org>, Gleb Natapov <gleb@kernel.org>,
	Paolo Bonzini <pbonzini@redhat.com>,
	kvm@vger.kernel.org, x86@kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH] x86: Use entire page for the per-cpu GDT only if paravirt-enabled
Date: Sat, 26 Sep 2015 20:00:40 +0200
Message-ID: <1443290440-14930-1-git-send-email-dvlasenk@redhat.com>

We have our GDT in a page-sized per-cpu structure, gdt_page.

On an x86_64 kernel, the GDT is only 128 bytes - roughly 3% of that page
is actually used.

It is page-sized because of paravirt. Hypervisors need to know when
the GDT is changed, so they remap it read-only and handle the resulting
write faults. If the GDT did not have a page of its own, writes to
nearby per-cpu data would trigger those faults as well.

In other words, the GDT needs to live in a separate page
only if CONFIG_HYPERVISOR_GUEST=y.

This patch reduces the GDT's alignment to cacheline alignment
when CONFIG_HYPERVISOR_GUEST is not set.

The patch also renames gdt_page to cpu_gdt (mimicking the naming of the
existing cpu_tss per-cpu variable), since it is no longer always a full page.
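
As an aside (not part of the patch itself), the effect of the conditional
alignment attribute on a type's footprint can be shown with a small
standalone userspace program; the names and constants below are
illustrative stand-ins, not the kernel's actual definitions:

    /*
     * A struct's size is rounded up to a multiple of its alignment, so
     * page alignment inflates a 128-byte GDT image to 4096 bytes, while
     * cacheline alignment leaves it at 128 bytes.
     */
    #include <stdio.h>
    #include <stdalign.h>

    #define DEMO_PAGE_SIZE      4096
    #define DEMO_CACHELINE_SIZE 64

    struct demo_gdt {
            unsigned long long entry[16];   /* 16 descriptors * 8 bytes = 128 */
    }
    #ifdef DEMO_HYPERVISOR_GUEST
    __attribute__((aligned(DEMO_PAGE_SIZE)));       /* sizeof() becomes 4096 */
    #else
    __attribute__((aligned(DEMO_CACHELINE_SIZE)));  /* sizeof() stays 128 */
    #endif

    int main(void)
    {
            printf("sizeof=%zu alignof=%zu\n",
                   sizeof(struct demo_gdt), alignof(struct demo_gdt));
            return 0;
    }

Built with -DDEMO_HYPERVISOR_GUEST it reports sizeof=4096, without it
sizeof=128 - matching the 0x1000 vs 0x80 per-cpu footprints visible in
the objdump output below.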

    $ objdump -x vmlinux | grep .data..percpu | sort
Before:
    (offset)                                (size)
    0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
    0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
    0000000000009000  w    O .data..percpu  0000000000001000 gdt_page  <<< HERE
    000000000000a000  w    O .data..percpu  0000000000000008 espfix_waddr
    000000000000a008  w    O .data..percpu  0000000000000008 espfix_stack
    ...
    0000000000019398 g       .data..percpu  0000000000000000 __per_cpu_end
After:
    0000000000000000  w    O .data..percpu  0000000000004000 irq_stack_union
    0000000000004000  w    O .data..percpu  0000000000005000 exception_stacks
    0000000000009000  w    O .data..percpu  0000000000000008 espfix_waddr
    0000000000009008  w    O .data..percpu  0000000000000008 espfix_stack
    ...
    0000000000013c80  w    O .data..percpu  0000000000000040 cyc2ns
    0000000000013cc0  w    O .data..percpu  00000000000022c0 cpu_tss
    0000000000015f80  w    O .data..percpu  0000000000000080 cpu_gdt  <<< HERE
    0000000000016000  w    O .data..percpu  0000000000000018 cpu_tlbstate
    ...
    0000000000018418 g       .data..percpu  0000000000000000 __per_cpu_end
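
(For reference, arithmetic from the two listings above: cpu_gdt shrinks
from 0x1000 to 0x80 bytes - GDT_ENTRIES (16 on x86_64) * 8 bytes = 128 -
and __per_cpu_end drops from 0x19398 to 0x18418, i.e. 0xf80 = 3968 bytes
of per-cpu data saved per CPU; on the 144-CPU test machine mentioned
below that is roughly 144 * 3968 bytes ~= 558 KB in total.)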

Run-tested on a 144 CPU machine.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Ingo Molnar <mingo@kernel.org>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
CC: Boris Ostrovsky <boris.ostrovsky@oracle.com>
CC: David Vrabel <david.vrabel@citrix.com>
CC: Joerg Roedel <joro@8bytes.org>
CC: Gleb Natapov <gleb@kernel.org>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: kvm@vger.kernel.org
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/entry/entry_32.S        |  2 +-
 arch/x86/include/asm/desc.h      | 16 +++++++++++-----
 arch/x86/kernel/cpu/common.c     | 10 ++++++++--
 arch/x86/kernel/cpu/perf_event.c |  2 +-
 arch/x86/kernel/head_32.S        |  4 ++--
 arch/x86/kernel/head_64.S        |  2 +-
 arch/x86/kernel/vmlinux.lds.S    |  2 +-
 arch/x86/tools/relocs.c          |  2 +-
 arch/x86/xen/enlighten.c         |  4 ++--
 9 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b2909bf..bc6ae1c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -429,7 +429,7 @@ ldt_ss:
  * compensating for the offset by changing to the ESPFIX segment with
  * a base address that matches for the difference.
  */
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_SS PER_CPU_VAR(cpu_gdt) + (GDT_ENTRY_ESPFIX_SS * 8)
 	mov	%esp, %edx			/* load kernel esp */
 	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
 	mov	%dx, %ax			/* eax: new kernel esp */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4e10d73..76de300 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -39,15 +39,21 @@ extern gate_desc idt_table[];
 extern struct desc_ptr debug_idt_descr;
 extern gate_desc debug_idt_table[];
 
-struct gdt_page {
+struct cpu_gdt {
 	struct desc_struct gdt[GDT_ENTRIES];
-} __attribute__((aligned(PAGE_SIZE)));
-
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+}
+#ifdef CONFIG_HYPERVISOR_GUEST
+/* Xen et al want GDT to have its own page. They remap it read-only */
+__attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt);
+#else
+____cacheline_aligned;
+DECLARE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt);
+#endif
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
-	return per_cpu(gdt_page, cpu).gdt;
+	return per_cpu(cpu_gdt, cpu).gdt;
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index de22ea7..6b90785 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -92,7 +92,13 @@ static const struct cpu_dev default_cpu = {
 
 static const struct cpu_dev *this_cpu = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#ifdef CONFIG_HYPERVISOR_GUEST
+/* Xen et al want GDT to have its own page. They remap it read-only */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt) =
+#else
+DEFINE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt) =
+#endif
+{ .gdt = {
 #ifdef CONFIG_X86_64
 	/*
 	 * We need valid kernel segments for data and code in long mode too
@@ -144,7 +150,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	GDT_STACK_CANARY_INIT
 #endif
 } };
-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt);
 
 static int __init x86_mpx_setup(char *s)
 {
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 66dd3fe9..41ebc94 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2198,7 +2198,7 @@ static unsigned long get_segment_base(unsigned int segment)
 		if (idx > GDT_ENTRIES)
 			return 0;
 
-		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+		desc = raw_cpu_ptr(cpu_gdt.gdt) + idx;
 	}
 
 	return get_desc_base(desc);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0e2d96f..33c9d10 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -521,7 +521,7 @@ setup_once:
 	 * relocation.  Manually set base address in stack canary
 	 * segment descriptor.
 	 */
-	movl $gdt_page,%eax
+	movl $cpu_gdt,%eax
 	movl $stack_canary,%ecx
 	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
 	shrl $16, %ecx
@@ -758,7 +758,7 @@ idt_descr:
 	.word 0				# 32 bit align gdt_desc.address
 ENTRY(early_gdt_descr)
 	.word GDT_ENTRIES*8-1
-	.long gdt_page			/* Overwritten for secondary CPUs */
+	.long cpu_gdt			/* Overwritten for secondary CPUs */
 
 /*
  * The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 1d40ca8..5a80e8c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -510,7 +510,7 @@ NEXT_PAGE(level1_fixmap_pgt)
 early_gdt_descr:
 	.word	GDT_ENTRIES*8-1
 early_gdt_descr_base:
-	.quad	INIT_PER_CPU_VAR(gdt_page)
+	.quad	INIT_PER_CPU_VAR(cpu_gdt)
 
 ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 74e4bf1..198e46b 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -348,7 +348,7 @@ SECTIONS
  * for the boot processor.
  */
 #define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(cpu_gdt);
 INIT_PER_CPU(irq_stack_union);
 
 /*
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 0c2fae8..5a1da82 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -736,7 +736,7 @@ static void percpu_init(void)
  *
  * The "gold" linker incorrectly associates:
  *	init_per_cpu__irq_stack_union
- *	init_per_cpu__gdt_page
+ *	init_per_cpu__cpu_gdt
  */
 static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
 {
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 30d12af..baecfc6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1592,10 +1592,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
 
 	/*
 	 * The only reliable way to retain the initial address of the
-	 * percpu gdt_page is to remember it here, so we can go and
+	 * percpu cpu_gdt is to remember it here, so we can go and
 	 * mark it RW later, when the initial percpu area is freed.
 	 */
-	xen_initial_gdt = &per_cpu(gdt_page, 0);
+	xen_initial_gdt = &per_cpu(cpu_gdt, 0);
 
 	xen_smp_init();
 
-- 
1.8.1.4



Thread overview: 17+ messages
2015-09-26 18:00 Denys Vlasenko [this message]
2015-09-26 19:50 ` [PATCH] x86: Use entire page for the per-cpu GDT only if paravirt-enabled H. Peter Anvin
2015-09-26 20:38   ` Denys Vlasenko
2015-09-28  7:58     ` Ingo Molnar
2015-09-28 12:45       ` Denys Vlasenko
2015-09-29  9:01         ` Ingo Molnar
2015-09-29 17:35           ` Andy Lutomirski
2015-09-29 17:50             ` Linus Torvalds
2015-09-29 18:02               ` Andy Lutomirski
2015-09-29 20:30                 ` H. Peter Anvin
2015-09-30  1:20               ` Eric W. Biederman
2015-09-30  1:31                 ` H. Peter Anvin
2015-09-30  2:11                   ` Eric W. Biederman
2015-09-30  2:26                     ` H. Peter Anvin
2015-09-29 18:18             ` H. Peter Anvin
2015-09-29 18:22               ` Andy Lutomirski
2015-09-29 18:27                 ` H. Peter Anvin
