All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
@ 2017-06-12 17:26 Andy Lutomirski
  2017-06-13  6:49 ` Ingo Molnar
                   ` (5 more replies)
  0 siblings, 6 replies; 12+ messages in thread
From: Andy Lutomirski @ 2017-06-12 17:26 UTC (permalink / raw)
  To: x86
  Cc: linux-kernel, Borislav Petkov, Andy Lutomirski, Tom Lendacky,
	Juergen Gross, xen-devel, Boris Ostrovsky

The kernel has several code paths that read CR3.  Most of them assume that
CR3 contains the PGD's physical address, whereas some of them awkwardly
use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers.
This will keep them from breaking when PCID is enabled.

Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: xen-devel <xen-devel@lists.xen.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---

Hi Ingo-

I broke this out because Tom's SME series and my PCID series both need it.
Please consider applying it to tip:x86/mm.

I'll send PCID v2 soon.  It'll apply to x86/mm + sched/urgent + this patch.

Thanks,
Andy

 arch/x86/boot/compressed/pagetable.c   |  2 +-
 arch/x86/include/asm/efi.h             |  2 +-
 arch/x86/include/asm/mmu_context.h     |  4 ++--
 arch/x86/include/asm/paravirt.h        |  2 +-
 arch/x86/include/asm/processor-flags.h | 36 ++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h       |  8 ++++++++
 arch/x86/include/asm/special_insns.h   | 10 +++++++---
 arch/x86/include/asm/tlbflush.h        |  4 ++--
 arch/x86/kernel/head64.c               |  3 ++-
 arch/x86/kernel/paravirt.c             |  2 +-
 arch/x86/kernel/process_32.c           |  2 +-
 arch/x86/kernel/process_64.c           |  2 +-
 arch/x86/kvm/vmx.c                     |  2 +-
 arch/x86/mm/fault.c                    | 10 +++++-----
 arch/x86/mm/ioremap.c                  |  2 +-
 arch/x86/platform/efi/efi_64.c         |  4 ++--
 arch/x86/platform/olpc/olpc-xo1-pm.c   |  2 +-
 arch/x86/power/cpu.c                   |  2 +-
 arch/x86/power/hibernate_64.c          |  3 ++-
 arch/x86/xen/mmu_pv.c                  |  6 +++---
 20 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..8e69df96492e 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -92,7 +92,7 @@ void initialize_identity_maps(void)
 	 * and we must append to the existing area instead of entirely
 	 * overwriting it.
 	 */
-	level4p = read_cr3();
+	level4p = read_cr3_pa();
 	if (level4p == (unsigned long)_pgtable) {
 		debug_putstr("booted via startup_32()\n");
 		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
 	__kernel_fpu_begin();						\
 									\
 	if (efi_scratch.use_pgd) {					\
-		efi_scratch.prev_cr3 = read_cr3();			\
+		efi_scratch.prev_cr3 = __read_cr3();			\
 		write_cr3((unsigned long)efi_scratch.efi_pgt);		\
 		__flush_tlb_all();					\
 	}								\
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5a93f6261302..cfe6034ebfc6 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -269,7 +269,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 
 /*
  * This can be used from process context to figure out what the value of
- * CR3 is without needing to do a (slow) read_cr3().
+ * CR3 is without needing to do a (slow) __read_cr3().
  *
  * It's intended to be used for code like KVM that sneakily changes CR3
  * and needs to restore it.  It needs to be used very carefully.
@@ -281,7 +281,7 @@ static inline unsigned long __get_current_cr3_fast(void)
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || !in_atomic());
 
-	VM_BUG_ON(cr3 != read_cr3());
+	VM_BUG_ON(cr3 != __read_cr3());
 	return cr3;
 }
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9a15739d9f4b..a63e77f8eb41 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
 	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
 }
 
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
 {
 	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
 }
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
 #else
 #define X86_VM_MASK	0 /* No VM86 support */
 #endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID bits. */
+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+#define CR3_PCID_MASK 0xFFFull
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#endif
+
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..9de02c985aa4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
 native_cpuid_reg(ecx)
 native_cpuid_reg(edx)
 
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+	return __read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
 	write_cr3(__pa(pgdir));
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
 	asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
 }
 
-static inline unsigned long native_read_cr3(void)
+static inline unsigned long __native_read_cr3(void)
 {
 	unsigned long val;
 	asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
 	native_write_cr2(x);
 }
 
-static inline unsigned long read_cr3(void)
+/*
+ * Careful!  CR3 contains more than just an address.  You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
 {
-	return native_read_cr3();
+	return __native_read_cr3();
 }
 
 static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 388c2463fde6..5f78c6a77578 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -156,7 +156,7 @@ static inline void __native_flush_tlb(void)
 	 * back:
 	 */
 	preempt_disable();
-	native_write_cr3(native_read_cr3());
+	native_write_cr3(__native_read_cr3());
 	preempt_enable();
 }
 
@@ -264,7 +264,7 @@ static inline void reset_lazy_tlbstate(void)
 	this_cpu_write(cpu_tlbstate.state, 0);
 	this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);
 
-	WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir));
+	WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir));
 }
 
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..794e8f517a81 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -55,7 +55,8 @@ int __init early_make_pgtable(unsigned long address)
 	pmdval_t pmd, *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
-	if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+	if (physaddr >= MAXMEM ||
+	    read_cr3_pa() != __pa_nodebug(early_level4_pgt))
 		return -1;
 
 again:
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
 	.read_cr2 = native_read_cr2,
 	.write_cr2 = native_write_cr2,
-	.read_cr3 = native_read_cr3,
+	.read_cr3 = __native_read_cr3,
 	.write_cr3 = native_write_cr3,
 
 	.flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae818aa7a..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 	printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
 			cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c39ab8bcc41..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 
 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 19cde555d73f..d143dd397dc9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5024,7 +5024,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	 * Save the most likely value for this task's CR3 in the VMCS.
 	 * We can't use __get_current_cr3_fast() because we're not atomic.
 	 */
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
 	vmx->host_state.vmcs_host_cr3 = cr3;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a01cbc8..2a1fa10c6a98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	 * Do _not_ use "current" here. We might be inside
 	 * an interrupt in the middle of a task switch..
 	 */
-	pgd_paddr = read_cr3();
+	pgd_paddr = read_cr3_pa();
 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 	if (!pmd_k)
 		return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(address)];
 	p4d_t *p4d;
 	pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	 * happen within a race in page table update. In the later
 	 * case just flush:
 	 */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
+	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
 	pgd_ref = pgd_offset_k(address);
 	if (pgd_none(*pgd_ref))
 		return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = base + pgd_index(address);
 	p4d_t *p4d;
 	pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		pgd_t *pgd;
 		pte_t *pte;
 
-		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+		pgd = __va(read_cr3_pa());
 		pgd += pgd_index(address);
 
 		pte = lookup_address_in_pgd(pgd, address, &level);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
 	/* Don't assume we're using swapper_pg_dir at this point */
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(addr)];
 	p4d_t *p4d = p4d_offset(pgd, addr);
 	pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index eb8dff15a7f6..f40bf6230480 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
 	int n_pgds, i, j;
 
 	if (!efi_enabled(EFI_OLD_MEMMAP)) {
-		save_pgd = (pgd_t *)read_cr3();
+		save_pgd = (pgd_t *)__read_cr3();
 		write_cr3((unsigned long)efi_scratch.efi_pgt);
 		goto out;
 	}
@@ -646,7 +646,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
 	efi_sync_low_kernel_mappings();
 	local_irq_save(flags);
 
-	efi_scratch.prev_cr3 = read_cr3();
+	efi_scratch.prev_cr3 = __read_cr3();
 	write_cr3((unsigned long)efi_scratch.efi_pgt);
 	__flush_tlb_all();
 
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
 
 asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
 {
-	void *pgd_addr = __va(read_cr3());
+	void *pgd_addr = __va(read_cr3_pa());
 
 	/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
 	outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
 	 */
 	ctxt->cr0 = read_cr0();
 	ctxt->cr2 = read_cr2();
-	ctxt->cr3 = read_cr3();
+	ctxt->cr3 = __read_cr3();
 	ctxt->cr4 = __read_cr4();
 #ifdef CONFIG_X86_64
 	ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fee22ea..e3e62c8a8e70 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
 	memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
 
 	/* Make the page containing the relocated code executable */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
+	pgd = (pgd_t *)__va(read_cr3_pa()) +
+		pgd_index(relocated_restore_code);
 	p4d = p4d_offset(pgd, relocated_restore_code);
 	if (p4d_large(*p4d)) {
 		set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 21beb37114b7..4f638309deea 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2017,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
 	pmd_t pmd;
 	pte_t pte;
 
-	pa = read_cr3();
+	pa = read_cr3_pa();
 	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
 						       sizeof(pgd)));
 	if (!pgd_present(pgd))
@@ -2097,7 +2097,7 @@ void __init xen_relocate_p2m(void)
 	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
 	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
 
-	pgd = __va(read_cr3());
+	pgd = __va(read_cr3_pa());
 	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
 	idx_p4d = 0;
 	save_pud = n_pud;
@@ -2204,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
 {
 	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
 
-	BUG_ON(read_cr3() != __pa(initial_page_table));
+	BUG_ON(read_cr3_pa() != __pa(initial_page_table));
 	BUG_ON(cr3 != __pa(swapper_pg_dir));
 
 	/*
-- 
2.9.4

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-12 17:26 [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3() Andy Lutomirski
  2017-06-13  6:49 ` Ingo Molnar
@ 2017-06-13  6:49 ` Ingo Molnar
  2017-06-13  9:26 ` Borislav Petkov
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 12+ messages in thread
From: Ingo Molnar @ 2017-06-13  6:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: x86, linux-kernel, Borislav Petkov, Tom Lendacky, Juergen Gross,
	xen-devel, Boris Ostrovsky


* Andy Lutomirski <luto@kernel.org> wrote:

> The kernel has several code paths that read CR3.  Most of them assume that
> CR3 contains the PGD's physical address, whereas some of them awkwardly
> use PHYSICAL_PAGE_MASK to mask off low bits.
> 
> Add explicit mask macros for CR3 and convert all of the CR3 readers.
> This will keep them from breaking when PCID is enabled.
> 
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: Juergen Gross <jgross@suse.com>
> Cc: xen-devel <xen-devel@lists.xen.org>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
> 
> Hi Ingo-
> 
> I broke this out because Tom's SME series and my PCID series both need it.
> Please consider applying it to tip:x86/mm.
> 
> I'll send PCID v2 soon.  It'll apply to x86/mm + sched/urgent + this patch.

Ok, thanks - will push it out after a bit of testing.

Note that I also merged sched/urgent into x86/mm, to create a better base tree for 
the PCID changes by picking up 252d2a4117bc that PCID depends on.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-12 17:26 [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3() Andy Lutomirski
@ 2017-06-13  6:49 ` Ingo Molnar
  2017-06-13  6:49 ` Ingo Molnar
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 12+ messages in thread
From: Ingo Molnar @ 2017-06-13  6:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Juergen Gross, Tom Lendacky, x86, linux-kernel, xen-devel,
	Borislav Petkov, Boris Ostrovsky


* Andy Lutomirski <luto@kernel.org> wrote:

> The kernel has several code paths that read CR3.  Most of them assume that
> CR3 contains the PGD's physical address, whereas some of them awkwardly
> use PHYSICAL_PAGE_MASK to mask off low bits.
> 
> Add explicit mask macros for CR3 and convert all of the CR3 readers.
> This will keep them from breaking when PCID is enabled.
> 
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: Juergen Gross <jgross@suse.com>
> Cc: xen-devel <xen-devel@lists.xen.org>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
> 
> Hi Ingo-
> 
> I broke this out because Tom's SME series and my PCID series both need it.
> Please consider applying it to tip:x86/mm.
> 
> I'll send PCID v2 soon.  It'll apply to x86/mm + sched/urgent + this patch.

Ok, thanks - will push it out after a bit of testing.

Note that I also merged sched/urgent into x86/mm, to create a better base tree for 
the PCID changes by picking up 252d2a4117bc that PCID depends on.

Thanks,

	Ingo

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-12 17:26 [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3() Andy Lutomirski
  2017-06-13  6:49 ` Ingo Molnar
  2017-06-13  6:49 ` Ingo Molnar
@ 2017-06-13  9:26 ` Borislav Petkov
  2017-06-13 16:00   ` Andy Lutomirski
  2017-06-13 16:00   ` Andy Lutomirski
  2017-06-13  9:26 ` Borislav Petkov
                   ` (2 subsequent siblings)
  5 siblings, 2 replies; 12+ messages in thread
From: Borislav Petkov @ 2017-06-13  9:26 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: x86, linux-kernel, Tom Lendacky, Juergen Gross, xen-devel,
	Boris Ostrovsky

On Mon, Jun 12, 2017 at 10:26:14AM -0700, Andy Lutomirski wrote:
> The kernel has several code paths that read CR3.  Most of them assume that
> CR3 contains the PGD's physical address, whereas some of them awkwardly
> use PHYSICAL_PAGE_MASK to mask off low bits.
> 
> Add explicit mask macros for CR3 and convert all of the CR3 readers.
> This will keep them from breaking when PCID is enabled.

...

> +/*
> + * CR3's layout varies depending on several things.
> + *
> + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
> + * If PAE is enabled, then CR3[11:5] is part of the PDPT address
> + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
> + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
> + * CR3[2:0] and CR3[11:5] are ignored.
> + *
> + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
> + *
> + * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
> + * written as 1 to prevent the write to CR3 from flushing the TLB.
> + *
> + * On systems with SME, one bit (in a variable position!) is stolen to indicate
> + * that the top-level paging structure is encrypted.
> + *
> + * All of the remaining bits indicate the physical address of the top-level
> + * paging structure.
> + *
> + * CR3_ADDR_MASK is the mask used by read_cr3_pa().
> + */
> +#ifdef CONFIG_X86_64
> +/* Mask off the address space ID bits. */
> +#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
> +#define CR3_PCID_MASK 0xFFFull
> +#else
> +/*
> + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
> + * a tiny bit of code size by setting all the bits.
> + */
> +#define CR3_ADDR_MASK 0xFFFFFFFFull
> +#define CR3_PCID_MASK 0ull

All those can do GENMASK_ULL for better readability.

> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 3586996fc50d..bc0a849589bb 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
>  
>  	.read_cr2 = native_read_cr2,
>  	.write_cr2 = native_write_cr2,
> -	.read_cr3 = native_read_cr3,
> +	.read_cr3 = __native_read_cr3,
>  	.write_cr3 = native_write_cr3,
>  
>  	.flush_tlb_user = native_flush_tlb,
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index ffeae818aa7a..c6d6dc5f8bb2 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
>  
>  	cr0 = read_cr0();
>  	cr2 = read_cr2();
> -	cr3 = read_cr3();
> +	cr3 = __read_cr3();
>  	cr4 = __read_cr4();

This is a good example for my confusion. So we have __read_cr4() - with
the underscores - but not read_cr4().

Now we get __read_cr3 *and* read_cr3_pa(). So __read_cr3() could just as
well lose the "__", right?

Or are the PCID series bringing a read_cr3() without "__" too?

Oh, and to confuse me even more, there's __native_read_cr3() which is
*finally* accessing %cr3 :-) But there's native_write_cr3() without
underscores.

So can we make those names a bit more balanced please?

Thanks.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-12 17:26 [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3() Andy Lutomirski
                   ` (2 preceding siblings ...)
  2017-06-13  9:26 ` Borislav Petkov
@ 2017-06-13  9:26 ` Borislav Petkov
  2017-06-13 10:04 ` [tip:x86/mm] " tip-bot for Andy Lutomirski
  2017-06-13 10:04 ` tip-bot for Andy Lutomirski
  5 siblings, 0 replies; 12+ messages in thread
From: Borislav Petkov @ 2017-06-13  9:26 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Juergen Gross, Tom Lendacky, x86, linux-kernel, xen-devel,
	Boris Ostrovsky

On Mon, Jun 12, 2017 at 10:26:14AM -0700, Andy Lutomirski wrote:
> The kernel has several code paths that read CR3.  Most of them assume that
> CR3 contains the PGD's physical address, whereas some of them awkwardly
> use PHYSICAL_PAGE_MASK to mask off low bits.
> 
> Add explicit mask macros for CR3 and convert all of the CR3 readers.
> This will keep them from breaking when PCID is enabled.

...

> +/*
> + * CR3's layout varies depending on several things.
> + *
> + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
> + * If PAE is enabled, then CR3[11:5] is part of the PDPT address
> + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
> + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
> + * CR3[2:0] and CR3[11:5] are ignored.
> + *
> + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
> + *
> + * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
> + * written as 1 to prevent the write to CR3 from flushing the TLB.
> + *
> + * On systems with SME, one bit (in a variable position!) is stolen to indicate
> + * that the top-level paging structure is encrypted.
> + *
> + * All of the remaining bits indicate the physical address of the top-level
> + * paging structure.
> + *
> + * CR3_ADDR_MASK is the mask used by read_cr3_pa().
> + */
> +#ifdef CONFIG_X86_64
> +/* Mask off the address space ID bits. */
> +#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
> +#define CR3_PCID_MASK 0xFFFull
> +#else
> +/*
> + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
> + * a tiny bit of code size by setting all the bits.
> + */
> +#define CR3_ADDR_MASK 0xFFFFFFFFull
> +#define CR3_PCID_MASK 0ull

All those can do GENMASK_ULL for better readability.

> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 3586996fc50d..bc0a849589bb 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
>  
>  	.read_cr2 = native_read_cr2,
>  	.write_cr2 = native_write_cr2,
> -	.read_cr3 = native_read_cr3,
> +	.read_cr3 = __native_read_cr3,
>  	.write_cr3 = native_write_cr3,
>  
>  	.flush_tlb_user = native_flush_tlb,
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index ffeae818aa7a..c6d6dc5f8bb2 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
>  
>  	cr0 = read_cr0();
>  	cr2 = read_cr2();
> -	cr3 = read_cr3();
> +	cr3 = __read_cr3();
>  	cr4 = __read_cr4();

This is a good example for my confusion. So we have __read_cr4() - with
the underscores - but not read_cr4().

Now we get __read_cr3 *and* read_cr3_pa(). So __read_cr3() could just as
well lose the "__", right?

Or are the PCID series bringing a read_cr3() without "__" too?

Oh, and to confuse me even more, there's __native_read_cr3() which is
*finally* accessing %cr3 :-) But there's native_write_cr3() without
underscores.

So can we make those names a bit more balanced please?

Thanks.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [tip:x86/mm] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-12 17:26 [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3() Andy Lutomirski
                   ` (4 preceding siblings ...)
  2017-06-13 10:04 ` [tip:x86/mm] " tip-bot for Andy Lutomirski
@ 2017-06-13 10:04 ` tip-bot for Andy Lutomirski
  5 siblings, 0 replies; 12+ messages in thread
From: tip-bot for Andy Lutomirski @ 2017-06-13 10:04 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, jpoimboe, peterz, tglx, jgross, bp, brgerst,
	thomas.lendacky, luto, torvalds, mingo, boris.ostrovsky,
	dvlasenk, xen-devel

Commit-ID:  6c690ee1039b251e583fc65b28da30e97d6a7385
Gitweb:     http://git.kernel.org/tip/6c690ee1039b251e583fc65b28da30e97d6a7385
Author:     Andy Lutomirski <luto@kernel.org>
AuthorDate: Mon, 12 Jun 2017 10:26:14 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 13 Jun 2017 08:48:09 +0200

x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()

The kernel has several code paths that read CR3.  Most of them assume that
CR3 contains the PGD's physical address, whereas some of them awkwardly
use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers.
This will keep them from breaking when PCID is enabled.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: xen-devel <xen-devel@lists.xen.org>
Link: http://lkml.kernel.org/r/883f8fb121f4616c1c1427ad87350bb2f5ffeca1.1497288170.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/pagetable.c   |  2 +-
 arch/x86/include/asm/efi.h             |  2 +-
 arch/x86/include/asm/mmu_context.h     |  4 ++--
 arch/x86/include/asm/paravirt.h        |  2 +-
 arch/x86/include/asm/processor-flags.h | 36 ++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h       |  8 ++++++++
 arch/x86/include/asm/special_insns.h   | 10 +++++++---
 arch/x86/include/asm/tlbflush.h        |  4 ++--
 arch/x86/kernel/head64.c               |  3 ++-
 arch/x86/kernel/paravirt.c             |  2 +-
 arch/x86/kernel/process_32.c           |  2 +-
 arch/x86/kernel/process_64.c           |  2 +-
 arch/x86/kvm/vmx.c                     |  2 +-
 arch/x86/mm/fault.c                    | 10 +++++-----
 arch/x86/mm/ioremap.c                  |  2 +-
 arch/x86/platform/efi/efi_64.c         |  4 ++--
 arch/x86/platform/olpc/olpc-xo1-pm.c   |  2 +-
 arch/x86/power/cpu.c                   |  2 +-
 arch/x86/power/hibernate_64.c          |  3 ++-
 arch/x86/xen/mmu_pv.c                  |  6 +++---
 20 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f17..8e69df9 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -92,7 +92,7 @@ void initialize_identity_maps(void)
 	 * and we must append to the existing area instead of entirely
 	 * overwriting it.
 	 */
-	level4p = read_cr3();
+	level4p = read_cr3_pa();
 	if (level4p == (unsigned long)_pgtable) {
 		debug_putstr("booted via startup_32()\n");
 		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bce..d2ff779 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
 	__kernel_fpu_begin();						\
 									\
 	if (efi_scratch.use_pgd) {					\
-		efi_scratch.prev_cr3 = read_cr3();			\
+		efi_scratch.prev_cr3 = __read_cr3();			\
 		write_cr3((unsigned long)efi_scratch.efi_pgt);		\
 		__flush_tlb_all();					\
 	}								\
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5a93f62..cfe6034 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -269,7 +269,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 
 /*
  * This can be used from process context to figure out what the value of
- * CR3 is without needing to do a (slow) read_cr3().
+ * CR3 is without needing to do a (slow) __read_cr3().
  *
  * It's intended to be used for code like KVM that sneakily changes CR3
  * and needs to restore it.  It needs to be used very carefully.
@@ -281,7 +281,7 @@ static inline unsigned long __get_current_cr3_fast(void)
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || !in_atomic());
 
-	VM_BUG_ON(cr3 != read_cr3());
+	VM_BUG_ON(cr3 != __read_cr3());
 	return cr3;
 }
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9a15739..a63e77f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
 	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
 }
 
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
 {
 	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
 }
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618..79aa2f9 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
 #else
 #define X86_VM_MASK	0 /* No VM86 support */
 #endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID bits. */
+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+#define CR3_PCID_MASK 0xFFFull
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#endif
+
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada99..9de02c9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
 native_cpuid_reg(ecx)
 native_cpuid_reg(edx)
 
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+	return __read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
 	write_cr3(__pa(pgdir));
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e3..9efaabf 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
 	asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
 }
 
-static inline unsigned long native_read_cr3(void)
+static inline unsigned long __native_read_cr3(void)
 {
 	unsigned long val;
 	asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
 	native_write_cr2(x);
 }
 
-static inline unsigned long read_cr3(void)
+/*
+ * Careful!  CR3 contains more than just an address.  You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
 {
-	return native_read_cr3();
+	return __native_read_cr3();
 }
 
 static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 388c246..5f78c6a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -156,7 +156,7 @@ static inline void __native_flush_tlb(void)
 	 * back:
 	 */
 	preempt_disable();
-	native_write_cr3(native_read_cr3());
+	native_write_cr3(__native_read_cr3());
 	preempt_enable();
 }
 
@@ -264,7 +264,7 @@ static inline void reset_lazy_tlbstate(void)
 	this_cpu_write(cpu_tlbstate.state, 0);
 	this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);
 
-	WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir));
+	WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir));
 }
 
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002..794e8f5 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -55,7 +55,8 @@ int __init early_make_pgtable(unsigned long address)
 	pmdval_t pmd, *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
-	if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+	if (physaddr >= MAXMEM ||
+	    read_cr3_pa() != __pa_nodebug(early_level4_pgt))
 		return -1;
 
 again:
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996..bc0a849 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
 	.read_cr2 = native_read_cr2,
 	.write_cr2 = native_write_cr2,
-	.read_cr3 = native_read_cr3,
+	.read_cr3 = __native_read_cr3,
 	.write_cr3 = native_write_cr3,
 
 	.flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae81..c6d6dc5 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 	printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
 			cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c39ab8..c3169be 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 
 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 19cde55..d143dd3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5024,7 +5024,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	 * Save the most likely value for this task's CR3 in the VMCS.
 	 * We can't use __get_current_cr3_fast() because we're not atomic.
 	 */
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
 	vmx->host_state.vmcs_host_cr3 = cr3;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a0..2a1fa10c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	 * Do _not_ use "current" here. We might be inside
 	 * an interrupt in the middle of a task switch..
 	 */
-	pgd_paddr = read_cr3();
+	pgd_paddr = read_cr3_pa();
 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 	if (!pmd_k)
 		return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(address)];
 	p4d_t *p4d;
 	pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	 * happen within a race in page table update. In the later
 	 * case just flush:
 	 */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
+	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
 	pgd_ref = pgd_offset_k(address);
 	if (pgd_none(*pgd_ref))
 		return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = base + pgd_index(address);
 	p4d_t *p4d;
 	pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		pgd_t *pgd;
 		pte_t *pte;
 
-		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+		pgd = __va(read_cr3_pa());
 		pgd += pgd_index(address);
 
 		pte = lookup_address_in_pgd(pgd, address, &level);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b..4c1b5fd 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
 	/* Don't assume we're using swapper_pg_dir at this point */
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(addr)];
 	p4d_t *p4d = p4d_offset(pgd, addr);
 	pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index eb8dff1..f40bf62 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
 	int n_pgds, i, j;
 
 	if (!efi_enabled(EFI_OLD_MEMMAP)) {
-		save_pgd = (pgd_t *)read_cr3();
+		save_pgd = (pgd_t *)__read_cr3();
 		write_cr3((unsigned long)efi_scratch.efi_pgt);
 		goto out;
 	}
@@ -646,7 +646,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
 	efi_sync_low_kernel_mappings();
 	local_irq_save(flags);
 
-	efi_scratch.prev_cr3 = read_cr3();
+	efi_scratch.prev_cr3 = __read_cr3();
 	write_cr3((unsigned long)efi_scratch.efi_pgt);
 	__flush_tlb_all();
 
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd..0668aaf 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
 
 asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
 {
-	void *pgd_addr = __va(read_cr3());
+	void *pgd_addr = __va(read_cr3_pa());
 
 	/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
 	outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a92..78459a6 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
 	 */
 	ctxt->cr0 = read_cr0();
 	ctxt->cr2 = read_cr2();
-	ctxt->cr3 = read_cr3();
+	ctxt->cr3 = __read_cr3();
 	ctxt->cr4 = __read_cr4();
 #ifdef CONFIG_X86_64
 	ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fe..e3e62c8 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
 	memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
 
 	/* Make the page containing the relocated code executable */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
+	pgd = (pgd_t *)__va(read_cr3_pa()) +
+		pgd_index(relocated_restore_code);
 	p4d = p4d_offset(pgd, relocated_restore_code);
 	if (p4d_large(*p4d)) {
 		set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 21beb37..4f63830 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2017,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
 	pmd_t pmd;
 	pte_t pte;
 
-	pa = read_cr3();
+	pa = read_cr3_pa();
 	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
 						       sizeof(pgd)));
 	if (!pgd_present(pgd))
@@ -2097,7 +2097,7 @@ void __init xen_relocate_p2m(void)
 	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
 	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
 
-	pgd = __va(read_cr3());
+	pgd = __va(read_cr3_pa());
 	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
 	idx_p4d = 0;
 	save_pud = n_pud;
@@ -2204,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
 {
 	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
 
-	BUG_ON(read_cr3() != __pa(initial_page_table));
+	BUG_ON(read_cr3_pa() != __pa(initial_page_table));
 	BUG_ON(cr3 != __pa(swapper_pg_dir));
 
 	/*

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [tip:x86/mm] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-12 17:26 [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3() Andy Lutomirski
                   ` (3 preceding siblings ...)
  2017-06-13  9:26 ` Borislav Petkov
@ 2017-06-13 10:04 ` tip-bot for Andy Lutomirski
  2017-06-13 10:04 ` tip-bot for Andy Lutomirski
  5 siblings, 0 replies; 12+ messages in thread
From: tip-bot for Andy Lutomirski @ 2017-06-13 10:04 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: jgross, thomas.lendacky, dvlasenk, peterz, brgerst,
	boris.ostrovsky, hpa, linux-kernel, xen-devel, bp, luto,
	jpoimboe, tglx, torvalds, mingo

Commit-ID:  6c690ee1039b251e583fc65b28da30e97d6a7385
Gitweb:     http://git.kernel.org/tip/6c690ee1039b251e583fc65b28da30e97d6a7385
Author:     Andy Lutomirski <luto@kernel.org>
AuthorDate: Mon, 12 Jun 2017 10:26:14 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 13 Jun 2017 08:48:09 +0200

x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()

The kernel has several code paths that read CR3.  Most of them assume that
CR3 contains the PGD's physical address, whereas some of them awkwardly
use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers.
This will keep them from breaking when PCID is enabled.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: xen-devel <xen-devel@lists.xen.org>
Link: http://lkml.kernel.org/r/883f8fb121f4616c1c1427ad87350bb2f5ffeca1.1497288170.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/pagetable.c   |  2 +-
 arch/x86/include/asm/efi.h             |  2 +-
 arch/x86/include/asm/mmu_context.h     |  4 ++--
 arch/x86/include/asm/paravirt.h        |  2 +-
 arch/x86/include/asm/processor-flags.h | 36 ++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h       |  8 ++++++++
 arch/x86/include/asm/special_insns.h   | 10 +++++++---
 arch/x86/include/asm/tlbflush.h        |  4 ++--
 arch/x86/kernel/head64.c               |  3 ++-
 arch/x86/kernel/paravirt.c             |  2 +-
 arch/x86/kernel/process_32.c           |  2 +-
 arch/x86/kernel/process_64.c           |  2 +-
 arch/x86/kvm/vmx.c                     |  2 +-
 arch/x86/mm/fault.c                    | 10 +++++-----
 arch/x86/mm/ioremap.c                  |  2 +-
 arch/x86/platform/efi/efi_64.c         |  4 ++--
 arch/x86/platform/olpc/olpc-xo1-pm.c   |  2 +-
 arch/x86/power/cpu.c                   |  2 +-
 arch/x86/power/hibernate_64.c          |  3 ++-
 arch/x86/xen/mmu_pv.c                  |  6 +++---
 20 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f17..8e69df9 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -92,7 +92,7 @@ void initialize_identity_maps(void)
 	 * and we must append to the existing area instead of entirely
 	 * overwriting it.
 	 */
-	level4p = read_cr3();
+	level4p = read_cr3_pa();
 	if (level4p == (unsigned long)_pgtable) {
 		debug_putstr("booted via startup_32()\n");
 		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bce..d2ff779 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
 	__kernel_fpu_begin();						\
 									\
 	if (efi_scratch.use_pgd) {					\
-		efi_scratch.prev_cr3 = read_cr3();			\
+		efi_scratch.prev_cr3 = __read_cr3();			\
 		write_cr3((unsigned long)efi_scratch.efi_pgt);		\
 		__flush_tlb_all();					\
 	}								\
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5a93f62..cfe6034 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -269,7 +269,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 
 /*
  * This can be used from process context to figure out what the value of
- * CR3 is without needing to do a (slow) read_cr3().
+ * CR3 is without needing to do a (slow) __read_cr3().
  *
  * It's intended to be used for code like KVM that sneakily changes CR3
  * and needs to restore it.  It needs to be used very carefully.
@@ -281,7 +281,7 @@ static inline unsigned long __get_current_cr3_fast(void)
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || !in_atomic());
 
-	VM_BUG_ON(cr3 != read_cr3());
+	VM_BUG_ON(cr3 != __read_cr3());
 	return cr3;
 }
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9a15739..a63e77f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
 	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
 }
 
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
 {
 	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
 }
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618..79aa2f9 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
 #else
 #define X86_VM_MASK	0 /* No VM86 support */
 #endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID bits. */
+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+#define CR3_PCID_MASK 0xFFFull
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#endif
+
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada99..9de02c9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
 native_cpuid_reg(ecx)
 native_cpuid_reg(edx)
 
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+	return __read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
 	write_cr3(__pa(pgdir));
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e3..9efaabf 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
 	asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
 }
 
-static inline unsigned long native_read_cr3(void)
+static inline unsigned long __native_read_cr3(void)
 {
 	unsigned long val;
 	asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
 	native_write_cr2(x);
 }
 
-static inline unsigned long read_cr3(void)
+/*
+ * Careful!  CR3 contains more than just an address.  You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
 {
-	return native_read_cr3();
+	return __native_read_cr3();
 }
 
 static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 388c246..5f78c6a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -156,7 +156,7 @@ static inline void __native_flush_tlb(void)
 	 * back:
 	 */
 	preempt_disable();
-	native_write_cr3(native_read_cr3());
+	native_write_cr3(__native_read_cr3());
 	preempt_enable();
 }
 
@@ -264,7 +264,7 @@ static inline void reset_lazy_tlbstate(void)
 	this_cpu_write(cpu_tlbstate.state, 0);
 	this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);
 
-	WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir));
+	WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir));
 }
 
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002..794e8f5 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -55,7 +55,8 @@ int __init early_make_pgtable(unsigned long address)
 	pmdval_t pmd, *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
-	if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+	if (physaddr >= MAXMEM ||
+	    read_cr3_pa() != __pa_nodebug(early_level4_pgt))
 		return -1;
 
 again:
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996..bc0a849 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
 	.read_cr2 = native_read_cr2,
 	.write_cr2 = native_write_cr2,
-	.read_cr3 = native_read_cr3,
+	.read_cr3 = __native_read_cr3,
 	.write_cr3 = native_write_cr3,
 
 	.flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae81..c6d6dc5 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 	printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
 			cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c39ab8..c3169be 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 
 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 19cde55..d143dd3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5024,7 +5024,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	 * Save the most likely value for this task's CR3 in the VMCS.
 	 * We can't use __get_current_cr3_fast() because we're not atomic.
 	 */
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
 	vmx->host_state.vmcs_host_cr3 = cr3;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a0..2a1fa10c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	 * Do _not_ use "current" here. We might be inside
 	 * an interrupt in the middle of a task switch..
 	 */
-	pgd_paddr = read_cr3();
+	pgd_paddr = read_cr3_pa();
 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 	if (!pmd_k)
 		return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(address)];
 	p4d_t *p4d;
 	pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	 * happen within a race in page table update. In the later
 	 * case just flush:
 	 */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
+	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
 	pgd_ref = pgd_offset_k(address);
 	if (pgd_none(*pgd_ref))
 		return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = base + pgd_index(address);
 	p4d_t *p4d;
 	pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		pgd_t *pgd;
 		pte_t *pte;
 
-		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+		pgd = __va(read_cr3_pa());
 		pgd += pgd_index(address);
 
 		pte = lookup_address_in_pgd(pgd, address, &level);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b..4c1b5fd 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
 	/* Don't assume we're using swapper_pg_dir at this point */
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(addr)];
 	p4d_t *p4d = p4d_offset(pgd, addr);
 	pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index eb8dff1..f40bf62 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
 	int n_pgds, i, j;
 
 	if (!efi_enabled(EFI_OLD_MEMMAP)) {
-		save_pgd = (pgd_t *)read_cr3();
+		save_pgd = (pgd_t *)__read_cr3();
 		write_cr3((unsigned long)efi_scratch.efi_pgt);
 		goto out;
 	}
@@ -646,7 +646,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
 	efi_sync_low_kernel_mappings();
 	local_irq_save(flags);
 
-	efi_scratch.prev_cr3 = read_cr3();
+	efi_scratch.prev_cr3 = __read_cr3();
 	write_cr3((unsigned long)efi_scratch.efi_pgt);
 	__flush_tlb_all();
 
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd..0668aaf 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
 
 asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
 {
-	void *pgd_addr = __va(read_cr3());
+	void *pgd_addr = __va(read_cr3_pa());
 
 	/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
 	outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a92..78459a6 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
 	 */
 	ctxt->cr0 = read_cr0();
 	ctxt->cr2 = read_cr2();
-	ctxt->cr3 = read_cr3();
+	ctxt->cr3 = __read_cr3();
 	ctxt->cr4 = __read_cr4();
 #ifdef CONFIG_X86_64
 	ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fe..e3e62c8 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
 	memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
 
 	/* Make the page containing the relocated code executable */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
+	pgd = (pgd_t *)__va(read_cr3_pa()) +
+		pgd_index(relocated_restore_code);
 	p4d = p4d_offset(pgd, relocated_restore_code);
 	if (p4d_large(*p4d)) {
 		set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 21beb37..4f63830 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2017,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
 	pmd_t pmd;
 	pte_t pte;
 
-	pa = read_cr3();
+	pa = read_cr3_pa();
 	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
 						       sizeof(pgd)));
 	if (!pgd_present(pgd))
@@ -2097,7 +2097,7 @@ void __init xen_relocate_p2m(void)
 	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
 	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
 
-	pgd = __va(read_cr3());
+	pgd = __va(read_cr3_pa());
 	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
 	idx_p4d = 0;
 	save_pud = n_pud;
@@ -2204,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
 {
 	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
 
-	BUG_ON(read_cr3() != __pa(initial_page_table));
+	BUG_ON(read_cr3_pa() != __pa(initial_page_table));
 	BUG_ON(cr3 != __pa(swapper_pg_dir));
 
 	/*

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-13  9:26 ` Borislav Petkov
  2017-06-13 16:00   ` Andy Lutomirski
@ 2017-06-13 16:00   ` Andy Lutomirski
  2017-06-13 16:18     ` Borislav Petkov
  2017-06-13 16:18     ` Borislav Petkov
  1 sibling, 2 replies; 12+ messages in thread
From: Andy Lutomirski @ 2017-06-13 16:00 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andy Lutomirski, X86 ML, linux-kernel, Tom Lendacky,
	Juergen Gross, xen-devel, Boris Ostrovsky

On Tue, Jun 13, 2017 at 2:26 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jun 12, 2017 at 10:26:14AM -0700, Andy Lutomirski wrote:
>> The kernel has several code paths that read CR3.  Most of them assume that
>> CR3 contains the PGD's physical address, whereas some of them awkwardly
>> use PHYSICAL_PAGE_MASK to mask off low bits.
>>
>> Add explicit mask macros for CR3 and convert all of the CR3 readers.
>> This will keep them from breaking when PCID is enabled.
>
> ...
>
>> +/*
>> + * CR3's layout varies depending on several things.
>> + *
>> + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
>> + * If PAE is enabled, then CR3[11:5] is part of the PDPT address
>> + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
>> + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
>> + * CR3[2:0] and CR3[11:5] are ignored.
>> + *
>> + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
>> + *
>> + * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
>> + * written as 1 to prevent the write to CR3 from flushing the TLB.
>> + *
>> + * On systems with SME, one bit (in a variable position!) is stolen to indicate
>> + * that the top-level paging structure is encrypted.
>> + *
>> + * All of the remaining bits indicate the physical address of the top-level
>> + * paging structure.
>> + *
>> + * CR3_ADDR_MASK is the mask used by read_cr3_pa().
>> + */
>> +#ifdef CONFIG_X86_64
>> +/* Mask off the address space ID bits. */
>> +#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
>> +#define CR3_PCID_MASK 0xFFFull
>> +#else
>> +/*
>> + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
>> + * a tiny bit of code size by setting all the bits.
>> + */
>> +#define CR3_ADDR_MASK 0xFFFFFFFFull
>> +#define CR3_PCID_MASK 0ull
>
> All those can do GENMASK_ULL for better readability.
>

Hmm.  I'll look at that.

>> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
>> index 3586996fc50d..bc0a849589bb 100644
>> --- a/arch/x86/kernel/paravirt.c
>> +++ b/arch/x86/kernel/paravirt.c
>> @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
>>
>>       .read_cr2 = native_read_cr2,
>>       .write_cr2 = native_write_cr2,
>> -     .read_cr3 = native_read_cr3,
>> +     .read_cr3 = __native_read_cr3,
>>       .write_cr3 = native_write_cr3,
>>
>>       .flush_tlb_user = native_flush_tlb,
>> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
>> index ffeae818aa7a..c6d6dc5f8bb2 100644
>> --- a/arch/x86/kernel/process_32.c
>> +++ b/arch/x86/kernel/process_32.c
>> @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
>>
>>       cr0 = read_cr0();
>>       cr2 = read_cr2();
>> -     cr3 = read_cr3();
>> +     cr3 = __read_cr3();
>>       cr4 = __read_cr4();
>
> This is a good example for my confusion. So we have __read_cr4() - with
> the underscores - but not read_cr4().
>
> Now we get __read_cr3 *and* read_cr3_pa(). So __read_cr3() could just as
> well lose the "__", right?
>
> Or are the PCID series bringing a read_cr3() without "__" too?

The intent was twofold:

1. Make sure that every read_cr3() instance got converted.  I didn't
want a mid-air collision with someone else's patch in which it would
appear to apply and compile but the result would randomly fail on PCID
systems.

2. Make users realize that CR3 ain't what it used to be.  __read_cr3()
means "return this complicated register value -- I know what I'm
doing" and read_cr3_pa() means "give me the PA".

Maybe we could rename __read_cr3() to read_cr3_raw()?  If we really
wanted lots of clarity, __read_cr4() could become read_cr4_noshadow(),
I suppose.

What do you think?  My general preference is to clean this up after
the rest of the big patchsets (SME and PCID) land.

>
> Oh, and to confuse me even more, there's __native_read_cr3() which is
> *finally* accessing %cr3 :-) But there's native_write_cr3() without
> underscores.
>
> So can we make those names a bit more balanced please?

write_cr3() was less widespread, so I worried about it less.

--Andy

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-13  9:26 ` Borislav Petkov
@ 2017-06-13 16:00   ` Andy Lutomirski
  2017-06-13 16:00   ` Andy Lutomirski
  1 sibling, 0 replies; 12+ messages in thread
From: Andy Lutomirski @ 2017-06-13 16:00 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Juergen Gross, Tom Lendacky, X86 ML, linux-kernel, xen-devel,
	Andy Lutomirski, Boris Ostrovsky

On Tue, Jun 13, 2017 at 2:26 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jun 12, 2017 at 10:26:14AM -0700, Andy Lutomirski wrote:
>> The kernel has several code paths that read CR3.  Most of them assume that
>> CR3 contains the PGD's physical address, whereas some of them awkwardly
>> use PHYSICAL_PAGE_MASK to mask off low bits.
>>
>> Add explicit mask macros for CR3 and convert all of the CR3 readers.
>> This will keep them from breaking when PCID is enabled.
>
> ...
>
>> +/*
>> + * CR3's layout varies depending on several things.
>> + *
>> + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
>> + * If PAE is enabled, then CR3[11:5] is part of the PDPT address
>> + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
>> + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
>> + * CR3[2:0] and CR3[11:5] are ignored.
>> + *
>> + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
>> + *
>> + * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
>> + * written as 1 to prevent the write to CR3 from flushing the TLB.
>> + *
>> + * On systems with SME, one bit (in a variable position!) is stolen to indicate
>> + * that the top-level paging structure is encrypted.
>> + *
>> + * All of the remaining bits indicate the physical address of the top-level
>> + * paging structure.
>> + *
>> + * CR3_ADDR_MASK is the mask used by read_cr3_pa().
>> + */
>> +#ifdef CONFIG_X86_64
>> +/* Mask off the address space ID bits. */
>> +#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
>> +#define CR3_PCID_MASK 0xFFFull
>> +#else
>> +/*
>> + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
>> + * a tiny bit of code size by setting all the bits.
>> + */
>> +#define CR3_ADDR_MASK 0xFFFFFFFFull
>> +#define CR3_PCID_MASK 0ull
>
> All those can do GENMASK_ULL for better readability.
>

Hmm.  I'll look at that.

>> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
>> index 3586996fc50d..bc0a849589bb 100644
>> --- a/arch/x86/kernel/paravirt.c
>> +++ b/arch/x86/kernel/paravirt.c
>> @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
>>
>>       .read_cr2 = native_read_cr2,
>>       .write_cr2 = native_write_cr2,
>> -     .read_cr3 = native_read_cr3,
>> +     .read_cr3 = __native_read_cr3,
>>       .write_cr3 = native_write_cr3,
>>
>>       .flush_tlb_user = native_flush_tlb,
>> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
>> index ffeae818aa7a..c6d6dc5f8bb2 100644
>> --- a/arch/x86/kernel/process_32.c
>> +++ b/arch/x86/kernel/process_32.c
>> @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
>>
>>       cr0 = read_cr0();
>>       cr2 = read_cr2();
>> -     cr3 = read_cr3();
>> +     cr3 = __read_cr3();
>>       cr4 = __read_cr4();
>
> This is a good example for my confusion. So we have __read_cr4() - with
> the underscores - but not read_cr4().
>
> Now we get __read_cr3 *and* read_cr3_pa(). So __read_cr3() could just as
> well lose the "__", right?
>
> Or are the PCID series bringing a read_cr3() without "__" too?

The intent was twofold:

1. Make sure that every read_cr3() instance got converted.  I didn't
want a mid-air collision with someone else's patch in which it would
appear to apply and compile but the result would randomly fail on PCID
systems.

2. Make users realize that CR3 ain't what it used to be.  __read_cr3()
means "return this complicated register value -- I know what I'm
doing" and read_cr3_pa() means "give me the PA".

Maybe we could rename __read_cr3() to read_cr3_raw()?  If we really
wanted lots of clarity, __read_cr4() could become read_cr4_noshadow(),
I suppose.

What do you think?  My general preference is to clean this up after
the rest of the big patchsets (SME and PCID) land.

>
> Oh, and to confuse me even more, there's __native_read_cr3() which is
> *finally* accessing %cr3 :-) But there's native_write_cr3() without
> underscores.
>
> So can we make those names a bit more balanced please?

write_cr3() was less widespread, so I worried about it less.

--Andy

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-13 16:00   ` Andy Lutomirski
@ 2017-06-13 16:18     ` Borislav Petkov
  2017-06-13 16:18     ` Borislav Petkov
  1 sibling, 0 replies; 12+ messages in thread
From: Borislav Petkov @ 2017-06-13 16:18 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: X86 ML, linux-kernel, Tom Lendacky, Juergen Gross, xen-devel,
	Boris Ostrovsky

On Tue, Jun 13, 2017 at 09:00:01AM -0700, Andy Lutomirski wrote:
> 1. Make sure that every read_cr3() instance got converted.  I didn't
> want a mid-air collision with someone else's patch in which it would
> appear to apply and compile but the result would randomly fail on PCID
> systems.

Right.

> 2. Make users realize that CR3 ain't what it used to be.  __read_cr3()
> means "return this complicated register value -- I know what I'm
> doing" and read_cr3_pa() means "give me the PA".

Agreed with the _pa thing.

> Maybe we could rename __read_cr3() to read_cr3_raw()?  If we really
> wanted lots of clarity, __read_cr4() could become read_cr4_noshadow(),
> I suppose.

Yeah, both make sense to me. I like the _raw thing and the _noshadow
too, as they actually say what the function *really* does.

In any case, the __ variant is less descriptive than having:

read_crX_pa
read_crX_raw
read_crX_noshadow

and so on which actually say what they each do and when you wonder which
to use, you know.

> What do you think?  My general preference is to clean this up after
> the rest of the big patchsets (SME and PCID) land.

Of course.

Thanks.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
  2017-06-13 16:00   ` Andy Lutomirski
  2017-06-13 16:18     ` Borislav Petkov
@ 2017-06-13 16:18     ` Borislav Petkov
  1 sibling, 0 replies; 12+ messages in thread
From: Borislav Petkov @ 2017-06-13 16:18 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Juergen Gross, Tom Lendacky, X86 ML, linux-kernel, xen-devel,
	Boris Ostrovsky

On Tue, Jun 13, 2017 at 09:00:01AM -0700, Andy Lutomirski wrote:
> 1. Make sure that every read_cr3() instance got converted.  I didn't
> want a mid-air collision with someone else's patch in which it would
> appear to apply and compile but the result would randomly fail on PCID
> systems.

Right.

> 2. Make users realize that CR3 ain't what it used to be.  __read_cr3()
> means "return this complicated register value -- I know what I'm
> doing" and read_cr3_pa() means "give me the PA".

Agreed with the _pa thing.

> Maybe we could rename __read_cr3() to read_cr3_raw()?  If we really
> wanted lots of clarity, __read_cr4() could become read_cr4_noshadow(),
> I suppose.

Yeah, both make sense to me. I like the _raw thing and the _noshadow
too, as they actually say what the function *really* does.

In any case, the __ variant is less descriptive than having:

read_crX_pa
read_crX_raw
read_crX_noshadow

and so on which actually say what they each do and when you wonder which
to use, you know.

> What do you think?  My general preference is to clean this up after
> the rest of the big patchsets (SME and PCID) land.

Of course.

Thanks.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()
@ 2017-06-12 17:26 Andy Lutomirski
  0 siblings, 0 replies; 12+ messages in thread
From: Andy Lutomirski @ 2017-06-12 17:26 UTC (permalink / raw)
  To: x86
  Cc: Juergen Gross, Tom Lendacky, linux-kernel, xen-devel,
	Borislav Petkov, Andy Lutomirski, Boris Ostrovsky

The kernel has several code paths that read CR3.  Most of them assume that
CR3 contains the PGD's physical address, whereas some of them awkwardly
use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers.
This will keep them from breaking when PCID is enabled.

Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: xen-devel <xen-devel@lists.xen.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---

Hi Ingo-

I broke this out because Tom's SME series and my PCID series both need it.
Please consider applying it to tip:x86/mm.

I'll send PCID v2 soon.  It'll apply to x86/mm + sched/urgent + this patch.

Thanks,
Andy

 arch/x86/boot/compressed/pagetable.c   |  2 +-
 arch/x86/include/asm/efi.h             |  2 +-
 arch/x86/include/asm/mmu_context.h     |  4 ++--
 arch/x86/include/asm/paravirt.h        |  2 +-
 arch/x86/include/asm/processor-flags.h | 36 ++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h       |  8 ++++++++
 arch/x86/include/asm/special_insns.h   | 10 +++++++---
 arch/x86/include/asm/tlbflush.h        |  4 ++--
 arch/x86/kernel/head64.c               |  3 ++-
 arch/x86/kernel/paravirt.c             |  2 +-
 arch/x86/kernel/process_32.c           |  2 +-
 arch/x86/kernel/process_64.c           |  2 +-
 arch/x86/kvm/vmx.c                     |  2 +-
 arch/x86/mm/fault.c                    | 10 +++++-----
 arch/x86/mm/ioremap.c                  |  2 +-
 arch/x86/platform/efi/efi_64.c         |  4 ++--
 arch/x86/platform/olpc/olpc-xo1-pm.c   |  2 +-
 arch/x86/power/cpu.c                   |  2 +-
 arch/x86/power/hibernate_64.c          |  3 ++-
 arch/x86/xen/mmu_pv.c                  |  6 +++---
 20 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..8e69df96492e 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -92,7 +92,7 @@ void initialize_identity_maps(void)
 	 * and we must append to the existing area instead of entirely
 	 * overwriting it.
 	 */
-	level4p = read_cr3();
+	level4p = read_cr3_pa();
 	if (level4p == (unsigned long)_pgtable) {
 		debug_putstr("booted via startup_32()\n");
 		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
 	__kernel_fpu_begin();						\
 									\
 	if (efi_scratch.use_pgd) {					\
-		efi_scratch.prev_cr3 = read_cr3();			\
+		efi_scratch.prev_cr3 = __read_cr3();			\
 		write_cr3((unsigned long)efi_scratch.efi_pgt);		\
 		__flush_tlb_all();					\
 	}								\
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5a93f6261302..cfe6034ebfc6 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -269,7 +269,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 
 /*
  * This can be used from process context to figure out what the value of
- * CR3 is without needing to do a (slow) read_cr3().
+ * CR3 is without needing to do a (slow) __read_cr3().
  *
  * It's intended to be used for code like KVM that sneakily changes CR3
  * and needs to restore it.  It needs to be used very carefully.
@@ -281,7 +281,7 @@ static inline unsigned long __get_current_cr3_fast(void)
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || !in_atomic());
 
-	VM_BUG_ON(cr3 != read_cr3());
+	VM_BUG_ON(cr3 != __read_cr3());
 	return cr3;
 }
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9a15739d9f4b..a63e77f8eb41 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
 	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
 }
 
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
 {
 	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
 }
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
 #else
 #define X86_VM_MASK	0 /* No VM86 support */
 #endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID bits. */
+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+#define CR3_PCID_MASK 0xFFFull
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#endif
+
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..9de02c985aa4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
 native_cpuid_reg(ecx)
 native_cpuid_reg(edx)
 
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+	return __read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
 	write_cr3(__pa(pgdir));
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
 	asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
 }
 
-static inline unsigned long native_read_cr3(void)
+static inline unsigned long __native_read_cr3(void)
 {
 	unsigned long val;
 	asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
 	native_write_cr2(x);
 }
 
-static inline unsigned long read_cr3(void)
+/*
+ * Careful!  CR3 contains more than just an address.  You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
 {
-	return native_read_cr3();
+	return __native_read_cr3();
 }
 
 static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 388c2463fde6..5f78c6a77578 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -156,7 +156,7 @@ static inline void __native_flush_tlb(void)
 	 * back:
 	 */
 	preempt_disable();
-	native_write_cr3(native_read_cr3());
+	native_write_cr3(__native_read_cr3());
 	preempt_enable();
 }
 
@@ -264,7 +264,7 @@ static inline void reset_lazy_tlbstate(void)
 	this_cpu_write(cpu_tlbstate.state, 0);
 	this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);
 
-	WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir));
+	WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir));
 }
 
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..794e8f517a81 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -55,7 +55,8 @@ int __init early_make_pgtable(unsigned long address)
 	pmdval_t pmd, *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
-	if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+	if (physaddr >= MAXMEM ||
+	    read_cr3_pa() != __pa_nodebug(early_level4_pgt))
 		return -1;
 
 again:
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
 	.read_cr2 = native_read_cr2,
 	.write_cr2 = native_write_cr2,
-	.read_cr3 = native_read_cr3,
+	.read_cr3 = __native_read_cr3,
 	.write_cr3 = native_write_cr3,
 
 	.flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae818aa7a..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 	printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
 			cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c39ab8bcc41..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 
 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 19cde555d73f..d143dd397dc9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5024,7 +5024,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	 * Save the most likely value for this task's CR3 in the VMCS.
 	 * We can't use __get_current_cr3_fast() because we're not atomic.
 	 */
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
 	vmx->host_state.vmcs_host_cr3 = cr3;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a01cbc8..2a1fa10c6a98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	 * Do _not_ use "current" here. We might be inside
 	 * an interrupt in the middle of a task switch..
 	 */
-	pgd_paddr = read_cr3();
+	pgd_paddr = read_cr3_pa();
 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 	if (!pmd_k)
 		return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(address)];
 	p4d_t *p4d;
 	pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
 	 * happen within a race in page table update. In the later
 	 * case just flush:
 	 */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
+	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
 	pgd_ref = pgd_offset_k(address);
 	if (pgd_none(*pgd_ref))
 		return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = base + pgd_index(address);
 	p4d_t *p4d;
 	pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		pgd_t *pgd;
 		pte_t *pte;
 
-		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+		pgd = __va(read_cr3_pa());
 		pgd += pgd_index(address);
 
 		pte = lookup_address_in_pgd(pgd, address, &level);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
 	/* Don't assume we're using swapper_pg_dir at this point */
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(addr)];
 	p4d_t *p4d = p4d_offset(pgd, addr);
 	pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index eb8dff15a7f6..f40bf6230480 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
 	int n_pgds, i, j;
 
 	if (!efi_enabled(EFI_OLD_MEMMAP)) {
-		save_pgd = (pgd_t *)read_cr3();
+		save_pgd = (pgd_t *)__read_cr3();
 		write_cr3((unsigned long)efi_scratch.efi_pgt);
 		goto out;
 	}
@@ -646,7 +646,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
 	efi_sync_low_kernel_mappings();
 	local_irq_save(flags);
 
-	efi_scratch.prev_cr3 = read_cr3();
+	efi_scratch.prev_cr3 = __read_cr3();
 	write_cr3((unsigned long)efi_scratch.efi_pgt);
 	__flush_tlb_all();
 
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
 
 asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
 {
-	void *pgd_addr = __va(read_cr3());
+	void *pgd_addr = __va(read_cr3_pa());
 
 	/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
 	outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
 	 */
 	ctxt->cr0 = read_cr0();
 	ctxt->cr2 = read_cr2();
-	ctxt->cr3 = read_cr3();
+	ctxt->cr3 = __read_cr3();
 	ctxt->cr4 = __read_cr4();
 #ifdef CONFIG_X86_64
 	ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fee22ea..e3e62c8a8e70 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
 	memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
 
 	/* Make the page containing the relocated code executable */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
+	pgd = (pgd_t *)__va(read_cr3_pa()) +
+		pgd_index(relocated_restore_code);
 	p4d = p4d_offset(pgd, relocated_restore_code);
 	if (p4d_large(*p4d)) {
 		set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 21beb37114b7..4f638309deea 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2017,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
 	pmd_t pmd;
 	pte_t pte;
 
-	pa = read_cr3();
+	pa = read_cr3_pa();
 	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
 						       sizeof(pgd)));
 	if (!pgd_present(pgd))
@@ -2097,7 +2097,7 @@ void __init xen_relocate_p2m(void)
 	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
 	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
 
-	pgd = __va(read_cr3());
+	pgd = __va(read_cr3_pa());
 	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
 	idx_p4d = 0;
 	save_pud = n_pud;
@@ -2204,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
 {
 	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
 
-	BUG_ON(read_cr3() != __pa(initial_page_table));
+	BUG_ON(read_cr3_pa() != __pa(initial_page_table));
 	BUG_ON(cr3 != __pa(swapper_pg_dir));
 
 	/*
-- 
2.9.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2017-06-13 16:19 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-12 17:26 [PATCH] x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3() Andy Lutomirski
2017-06-13  6:49 ` Ingo Molnar
2017-06-13  6:49 ` Ingo Molnar
2017-06-13  9:26 ` Borislav Petkov
2017-06-13 16:00   ` Andy Lutomirski
2017-06-13 16:00   ` Andy Lutomirski
2017-06-13 16:18     ` Borislav Petkov
2017-06-13 16:18     ` Borislav Petkov
2017-06-13  9:26 ` Borislav Petkov
2017-06-13 10:04 ` [tip:x86/mm] " tip-bot for Andy Lutomirski
2017-06-13 10:04 ` tip-bot for Andy Lutomirski
  -- strict thread matches above, loose matches on Subject: below --
2017-06-12 17:26 [PATCH] " Andy Lutomirski

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.