From: Petr Tesarik <petrtesarik@huaweicloud.com>
To: Jonathan Corbet <corbet@lwn.net>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	x86@kernel.org (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)),
	"H. Peter Anvin" <hpa@zytor.com>,
	Andy Lutomirski <luto@kernel.org>,
	Oleg Nesterov <oleg@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>, Xin Li <xin3.li@intel.com>,
	Arnd Bergmann <arnd@arndb.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	Rick Edgecombe <rick.p.edgecombe@intel.com>,
	Kees Cook <keescook@chromium.org>,
	"Masami Hiramatsu (Google)" <mhiramat@kernel.org>,
	Pengfei Xu <pengfei.xu@intel.com>,
	Josh Poimboeuf <jpoimboe@kernel.org>,
	Ze Gao <zegao2021@gmail.com>,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
	Kai Huang <kai.huang@intel.com>,
	David Woodhouse <dwmw@amazon.co.uk>,
	Brian Gerst <brgerst@gmail.com>, Jason Gunthorpe <jgg@ziepe.ca>,
	Joerg Roedel <jroedel@suse.de>,
	"Mike Rapoport (IBM)" <rppt@kernel.org>,
	Tina Zhang <tina.zhang@intel.com>,
	Jacob Pan <jacob.jun.pan@linux.intel.com>,
	linux-doc@vger.kernel.org (open list:DOCUMENTATION),
	linux-kernel@vger.kernel.org (open list)
Cc: Roberto Sassu <roberto.sassu@huaweicloud.com>,
	petr@tesarici.cz,
	Petr Tesarik <petr.tesarik1@huawei-partners.com>
Subject: [PATCH v1 8/8] sbm: x86: lazy TLB flushing
Date: Wed, 14 Feb 2024 12:35:16 +0100	[thread overview]
Message-ID: <20240214113516.2307-9-petrtesarik@huaweicloud.com> (raw)
In-Reply-To: <20240214113516.2307-1-petrtesarik@huaweicloud.com>

From: Petr Tesarik <petr.tesarik1@huawei-partners.com>

Implement lazy TLB flushing for sandbox mode and keep CR4.PGE enabled.
Toggling CR4.PGE on every mode transition flushes the entire TLB,
including global entries, which is expensive; with lazy flushing the
transitions only rely on the CR3 write plus a single INVLPG.

For the transition from sandbox mode to kernel mode:

1. All user page translations (sandbox code and data) are flushed from the
   TLB by the CR3 write, because their page protection bits do not include
   _PAGE_GLOBAL.

2. Kernel page translations remain valid across the transition. The SBM
   state page is the one exception: it is mapped without _PAGE_GLOBAL in
   the sandbox page tables, so its translation is also flushed by the CR3
   write (see the sketch after this list).
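
A minimal sketch of how the state page mapping drops the global bit; it
mirrors the pgprot_nonglobal() helper added in the core.c hunk below and
is shown here only for illustration:

	/* Strip _PAGE_GLOBAL so the translation does not survive a
	 * CR3 switch. */
	static inline pgprot_t pgprot_nonglobal(pgprot_t prot)
	{
		return __pgprot(pgprot_val(prot) & ~_PAGE_GLOBAL);
	}

	/* used as: map_page(..., pgprot_nonglobal(PAGE_KERNEL)) */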

For the transition from kernel mode to sandbox mode:

1. Kernel page translations become stale. However, any access to them by
   code running in sandbox mode (at CPL 3) causes a protection violation.
   Handle these spurious page faults by lazily replacing the stale TLB
   entries (a condensed sketch of this check follows the note below).

2. Any user page translations still in the TLB before the switch to
   sandbox mode are flushed by the CR3 write, because their page
   protection bits do not include _PAGE_GLOBAL. This ensures that sandbox
   mode cannot access user mode pages.

3. The SBM state page is read at CPL 0 by the exception and NMI entry
   code while the sandbox CR3 is active, so a stale translation of it
   cannot be repaired lazily. Since the kernel maps this page with
   _PAGE_GLOBAL, its translation survives the CR3 write; invalidate it
   explicitly with INVLPG right after loading the sandbox CR3.

Note that the TLB may keep kernel page translations for addresses which are
never accessed by sandbox mode. They remain valid after returning to kernel
mode.
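
In rough pseudo-C, the lazy refresh implemented by spurious_sbm_fault()
below amounts to the following. Note that sbm_walk_allows() is only a
placeholder for the walk down to the leaf page table entry and its W/NX
checks; it is not a real helper in this patch:

	static bool stale_tlb_entry(struct x86_sbm_state *state,
				    unsigned long error_code,
				    unsigned long address)
	{
		/* Only CPL-3 protection faults can be spurious. */
		if ((error_code & ~(X86_PF_WRITE | X86_PF_INSTR)) !=
		    (X86_PF_USER | X86_PF_PROT))
			return false;

		/*
		 * If the sandbox page tables allow the access, the fault
		 * was caused by a stale kernel translation in the TLB.
		 * Returning from the fault is enough: the retried access
		 * re-walks the sandbox page tables and caches the fresh
		 * translation.
		 */
		return sbm_walk_allows(state, error_code, address);
	}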

Signed-off-by: Petr Tesarik <petr.tesarik1@huawei-partners.com>
---
 arch/x86/entry/entry_64.S     |  17 +-----
 arch/x86/kernel/sbm/call_64.S |   5 +-
 arch/x86/kernel/sbm/core.c    | 100 +++++++++++++++++++++++++++++++++-
 3 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e1364115408a..4ba3eea38102 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -632,10 +632,8 @@ SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
 	movq	PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
 	movq	TASK_sbm_state(%rcx), %rcx
 	movq	SBM_sbm_cr3(%rcx), %rcx
-	movq	%cr4, %rax
-	andb	$~X86_CR4_PGE, %al
-	movq	%rax, %cr4
 	movq	%rcx, %cr3
+	invlpg	x86_sbm_state
 	orb	$3, CS(%rsp)
 #endif
 
@@ -897,9 +895,6 @@ SYM_CODE_START(paranoid_entry)
 
 	movq	%cr3, %r14
 	andb	$~3, CS+8(%rsp)
-	movq	%cr4, %rax
-	orb	$X86_CR4_PGE, %al
-	movq	%rax, %cr4
 	movq	%rcx, %cr3
 	jmp	.Lparanoid_gsbase
 #endif
@@ -1073,9 +1068,6 @@ SYM_CODE_START(error_entry)
 	jrcxz	.Lerror_swapgs
 
 	andb	$~3, CS+8(%rsp)
-	movq	%cr4, %rax
-	orb	$X86_CR4_PGE, %al
-	movq	%rax, %cr4
 	movq	%rcx, %cr3
 	jmp	.Lerror_entry_done_lfence
 #endif
@@ -1281,9 +1273,6 @@ SYM_CODE_START(asm_exc_nmi)
 	 * stack. The code is similar to NMI from user mode.
 	 */
 	andb	$~3, CS-RIP+8(%rsp)
-	movq	%cr4, %rdx
-	orb	$X86_CR4_PGE, %dl
-	movq	%rdx, %cr4
 	movq	x86_sbm_state + SBM_kernel_cr3, %rdx
 	movq	%rdx, %cr3
 
@@ -1533,10 +1522,8 @@ end_repeat_nmi:
 	movq	TASK_sbm_state(%rcx), %rcx
 	jrcxz	nmi_no_sbm
 
-	movq	%cr4, %rax
-	andb	$~X86_CR4_PGE, %al
-	movq	%rax, %cr4
 	movq	%r14, %cr3
+	invlpg	x86_sbm_state
 #endif
 
 nmi_no_sbm:
diff --git a/arch/x86/kernel/sbm/call_64.S b/arch/x86/kernel/sbm/call_64.S
index 8b2b524c5b46..21edce5666bc 100644
--- a/arch/x86/kernel/sbm/call_64.S
+++ b/arch/x86/kernel/sbm/call_64.S
@@ -10,7 +10,6 @@
 #include <linux/linkage.h>
 #include <asm/nospec-branch.h>
 #include <asm/percpu.h>
-#include <asm/processor-flags.h>
 #include <asm/segment.h>
 
 .code64
@@ -75,12 +74,10 @@ SYM_FUNC_START(x86_sbm_exec)
 	 * The NMI handler takes extra care to restore CR3 and CR4.
 	 */
 	mov	SBM_sbm_cr3(%rdi), %r11
-	mov	%cr4, %rax
-	and	$~X86_CR4_PGE, %al
 	mov	%rdx, %rdi	/* args */
 	cli
-	mov	%rax, %cr4
 	mov	%r11, %cr3
+	invlpg	x86_sbm_state
 	iretq
 
 SYM_INNER_LABEL(x86_sbm_return, SYM_L_GLOBAL)
diff --git a/arch/x86/kernel/sbm/core.c b/arch/x86/kernel/sbm/core.c
index 0ea193550a83..296f1fde3c22 100644
--- a/arch/x86/kernel/sbm/core.c
+++ b/arch/x86/kernel/sbm/core.c
@@ -33,6 +33,11 @@ union {
 	char page[PAGE_SIZE];
 } x86_sbm_state __page_aligned_bss;
 
+static inline pgprot_t pgprot_nonglobal(pgprot_t prot)
+{
+	return __pgprot(pgprot_val(prot) & ~_PAGE_GLOBAL);
+}
+
 static inline phys_addr_t page_to_ptval(struct page *page)
 {
 	return PFN_PHYS(page_to_pfn(page)) | _PAGE_TABLE;
@@ -287,7 +292,7 @@ int arch_sbm_init(struct sbm *sbm)
 
 	BUILD_BUG_ON(sizeof(x86_sbm_state) != PAGE_SIZE);
 	err = map_page(state, (unsigned long)&x86_sbm_state,
-		       PHYS_PFN(__pa(state)), PAGE_KERNEL);
+		       PHYS_PFN(__pa(state)), pgprot_nonglobal(PAGE_KERNEL));
 	if (err < 0)
 		return err;
 
@@ -379,11 +384,104 @@ int arch_sbm_exec(struct sbm *sbm, sbm_func func, void *args)
 	return err;
 }
 
+static bool spurious_sbm_fault_check(unsigned long error_code, pte_t *pte)
+{
+	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
+		return false;
+
+	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
+		return false;
+
+	return true;
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permissions than the page table entry.  Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ *
+ * Returns true if a spurious fault was handled, false otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
+ */
+static bool
+spurious_sbm_fault(struct x86_sbm_state *state, unsigned long error_code,
+		   unsigned long address)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	bool ret;
+
+	if ((error_code & ~(X86_PF_WRITE | X86_PF_INSTR)) !=
+	    (X86_PF_USER | X86_PF_PROT))
+		return false;
+
+	pgd = (pgd_t *)__va(state->sbm_cr3 & CR3_ADDR_MASK) + pgd_index(address);
+	if (!pgd_present(*pgd))
+		return false;
+
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		return false;
+
+	if (p4d_large(*p4d))
+		return spurious_sbm_fault_check(error_code, (pte_t *)p4d);
+
+	pud = pud_offset(p4d, address);
+	if (!pud_present(*pud))
+		return false;
+
+	if (pud_large(*pud))
+		return spurious_sbm_fault_check(error_code, (pte_t *)pud);
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return false;
+
+	if (pmd_large(*pmd))
+		return spurious_sbm_fault_check(error_code, (pte_t *)pmd);
+
+	pte = pte_offset_kernel(pmd, address);
+	if (!pte_present(*pte))
+		return false;
+
+	ret = spurious_sbm_fault_check(error_code, pte);
+	if (!ret)
+		return false;
+
+	/*
+	 * Make sure we have permissions in PMD.
+	 * If not, then there's a bug in the page tables:
+	 */
+	ret = spurious_sbm_fault_check(error_code, (pte_t *)pmd);
+	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
+
+	return ret;
+}
+
 void handle_sbm_fault(struct pt_regs *regs, unsigned long error_code,
 		      unsigned long address)
 {
 	struct x86_sbm_state *state = current_thread_info()->sbm_state;
 
+	if (spurious_sbm_fault(state, error_code, address))
+		return;
+
 	/*
 	 * Force -EFAULT unless the fault was due to a user-mode instruction
 	 * fetch from the designated return address.
-- 
2.34.1



Thread overview: 63+ messages
2024-02-14 11:35 [PATCH v1 0/8] x86_64 SandBox Mode arch hooks Petr Tesarik
2024-02-14 11:35 ` [PATCH v1 1/8] sbm: x86: page table " Petr Tesarik
2024-02-14 11:35 ` [PATCH v1 2/8] sbm: x86: execute target function on sandbox mode stack Petr Tesarik
2024-02-14 11:35 ` [PATCH v1 3/8] sbm: x86: map system data structures into the sandbox Petr Tesarik
2024-02-14 11:35 ` [PATCH v1 4/8] sbm: x86: allocate and map an exception stack Petr Tesarik
2024-02-14 11:35 ` [PATCH v1 5/8] sbm: x86: handle sandbox mode faults Petr Tesarik
2024-02-14 11:35 ` [PATCH v1 6/8] sbm: x86: switch to sandbox mode pages in arch_sbm_exec() Petr Tesarik
2024-02-14 11:35 ` [PATCH v1 7/8] sbm: documentation of the x86-64 SandBox Mode implementation Petr Tesarik
2024-02-14 18:37   ` Xin Li
2024-02-14 19:16     ` Petr Tesařík
2024-02-14 11:35 ` Petr Tesarik [this message]
2024-02-14 14:52 ` [PATCH v1 0/8] x86_64 SandBox Mode arch hooks Dave Hansen
2024-02-14 15:28   ` H. Peter Anvin
2024-02-14 16:41     ` Petr Tesařík
2024-02-14 17:29       ` H. Peter Anvin
2024-02-14 19:14         ` Petr Tesařík
2024-02-14 18:14       ` Edgecombe, Rick P
2024-02-14 18:32         ` Petr Tesařík
2024-02-14 19:19           ` Edgecombe, Rick P
2024-02-14 19:35             ` Petr Tesařík
2024-02-14 18:22   ` Petr Tesařík
2024-02-14 18:42     ` Dave Hansen
2024-02-14 19:33       ` Petr Tesařík
2024-02-14 20:16         ` Dave Hansen
2024-02-16 15:24           ` [RFC 0/8] PGP key parser using SandBox Mode Petr Tesarik
2024-02-16 15:24             ` [RFC 1/8] mpi: Introduce mpi_key_length() Petr Tesarik
2024-02-16 15:24             ` [RFC 2/8] rsa: add parser of raw format Petr Tesarik
2024-02-16 15:24             ` [RFC 3/8] PGPLIB: PGP definitions (RFC 4880) Petr Tesarik
2024-02-16 15:24             ` [RFC 4/8] PGPLIB: Basic packet parser Petr Tesarik
2024-02-16 15:24             ` [RFC 5/8] PGPLIB: Signature parser Petr Tesarik
2024-02-16 15:24             ` [RFC 6/8] KEYS: PGP data parser Petr Tesarik
2024-02-16 16:44               ` Matthew Wilcox
2024-02-16 16:53                 ` Roberto Sassu
2024-02-16 17:08                   ` H. Peter Anvin
2024-02-16 17:13                     ` Roberto Sassu
2024-02-20 10:55                     ` Petr Tesarik
2024-02-21 14:02                       ` H. Peter Anvin
2024-02-22  7:53                         ` Petr Tesařík
2024-02-16 18:44                   ` Matthew Wilcox
2024-02-16 19:54                     ` Roberto Sassu
2024-02-28 17:58                       ` Roberto Sassu
2024-02-16 15:24             ` [RFC 7/8] KEYS: Run PGP key parser in a sandbox Petr Tesarik
2024-02-18  6:07               ` kernel test robot
2024-02-18  8:02               ` kernel test robot
2024-02-16 15:24             ` [RFC 8/8] KEYS: Add intentional fault injection Petr Tesarik
2024-02-16 15:38             ` [RFC 0/8] PGP key parser using SandBox Mode Dave Hansen
2024-02-16 16:08               ` Petr Tesařík
2024-02-16 17:21                 ` Jonathan Corbet
2024-02-16 18:24                   ` Roberto Sassu
2024-02-22 13:12           ` [RFC 0/5] PoC: convert AppArmor parser to " Petr Tesarik
2024-02-22 13:12             ` [RFC 1/5] sbm: x86: fix SBM error entry path Petr Tesarik
2024-02-22 13:12             ` [RFC 2/5] sbm: enhance buffer mapping API Petr Tesarik
2024-02-22 13:12             ` [RFC 3/5] sbm: x86: infrastructure to fix up sandbox faults Petr Tesarik
2024-02-22 13:12             ` [RFC 4/5] sbm: fix up calls to dynamic memory allocators Petr Tesarik
2024-02-22 15:51               ` Dave Hansen
2024-02-22 17:57                 ` Petr Tesařík
2024-02-22 18:03                   ` Dave Hansen
2024-02-22 13:12             ` [RFC 5/5] apparmor: parse profiles in sandbox mode Petr Tesarik
2024-02-14 18:52     ` [PATCH v1 0/8] x86_64 SandBox Mode arch hooks Xin Li
2024-02-15  6:59       ` Petr Tesařík
2024-02-15  8:16         ` H. Peter Anvin
2024-02-15  9:30           ` Petr Tesařík
2024-02-15  9:37             ` Roberto Sassu
