linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Nadav Amit <nadav.amit@gmail.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Hugh Dickins <hughd@google.com>,
	Andy Lutomirski <luto@kernel.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	Nadav Amit <namit@vmware.com>,
	Sean Christopherson <seanjc@google.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	x86@kernel.org
Subject: [RFC 3/6] x86/vdso: introduce page_prefetch()
Date: Wed, 24 Feb 2021 23:29:07 -0800	[thread overview]
Message-ID: <20210225072910.2811795-4-namit@vmware.com> (raw)
In-Reply-To: <20210225072910.2811795-1-namit@vmware.com>

From: Nadav Amit <namit@vmware.com>

Introduce a new vDSO function, page_prefetch(), to be used when certain
memory, which might be paged out, is expected to be used soon. The
function prefetches the page if needed. It returns zero if the page is
accessible after the call and -1 otherwise.

page_prefetch() is intended to be very lightweight both when the page is
already present and when the page is prefetched.

The implementation leverages the new vDSO exception-tables mechanism.
page_prefetch() accesses the page for read and has a corresponding vDSO
exception-table entry that indicates that a #PF might occur and that in
such a case the page should be brought in asynchronously. If a #PF
indeed occurs, the page-fault handler sets the FAULT_FLAG_RETRY_NOWAIT flag.

If the page-fault is not resolved, the page-fault handler does not
retry, and instead jumps to the new IP that is marked in the exception
table. The vDSO code then returns the appropriate return value.

Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: x86@kernel.org
Signed-off-by: Nadav Amit <namit@vmware.com>
---
 arch/x86/Kconfig                |  1 +
 arch/x86/entry/vdso/Makefile    |  1 +
 arch/x86/entry/vdso/extable.c   | 59 +++++++++++++++++++++++++--------
 arch/x86/entry/vdso/vdso.lds.S  |  1 +
 arch/x86/entry/vdso/vprefetch.S | 39 ++++++++++++++++++++++
 arch/x86/include/asm/vdso.h     | 38 +++++++++++++++++++--
 arch/x86/mm/fault.c             | 11 ++++--
 lib/vdso/Kconfig                |  5 +++
 8 files changed, 136 insertions(+), 19 deletions(-)
 create mode 100644 arch/x86/entry/vdso/vprefetch.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 21f851179ff0..86a4c265e8af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -136,6 +136,7 @@ config X86
 	select GENERIC_TIME_VSYSCALL
 	select GENERIC_GETTIMEOFDAY
 	select GENERIC_VDSO_TIME_NS
+	select GENERIC_VDSO_PREFETCH
 	select GUP_GET_PTE_LOW_HIGH		if X86_PAE
 	select HARDIRQS_SW_RESEND
 	select HARDLOCKUP_CHECK_TIMESTAMP	if X86_64
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 02e3e42f380b..e32ca1375b84 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -28,6 +28,7 @@ vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
 vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
 vobjs32-y += vdso32/vclock_gettime.o
 vobjs-$(CONFIG_X86_SGX)	+= vsgx.o
+vobjs-$(CONFIG_GENERIC_VDSO_PREFETCH) += vprefetch.o
 
 # files to link into kernel
 obj-y				+= vma.o extable.o
diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c
index 93fb37bd32ad..e821887112ce 100644
--- a/arch/x86/entry/vdso/extable.c
+++ b/arch/x86/entry/vdso/extable.c
@@ -4,36 +4,67 @@
 #include <asm/current.h>
 #include <asm/traps.h>
 #include <asm/vdso.h>
+#include "extable.h"
 
 struct vdso_exception_table_entry {
 	int insn, fixup;
 	unsigned int mask, flags;
 };
 
-bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
-			  unsigned long error_code, unsigned long fault_addr)
+static unsigned long
+get_vdso_exception_table_entry(const struct pt_regs *regs, int trapnr,
+			       unsigned int *flags)
 {
 	const struct vdso_image *image = current->mm->context.vdso_image;
 	const struct vdso_exception_table_entry *extable;
 	unsigned int nr_entries, i;
 	unsigned long base;
+	unsigned long ip = regs->ip;
+	unsigned long vdso_base = (unsigned long)current->mm->context.vdso;
 
-	if (!current->mm->context.vdso)
-		return false;
-
-	base =  (unsigned long)current->mm->context.vdso + image->extable_base;
+	base = vdso_base + image->extable_base;
 	nr_entries = image->extable_len / (sizeof(*extable));
 	extable = image->extable;
 
 	for (i = 0; i < nr_entries; i++, base += sizeof(*extable)) {
-		if (regs->ip == base + extable[i].insn) {
-			regs->ip = base + extable[i].fixup;
-			regs->di = trapnr;
-			regs->si = error_code;
-			regs->dx = fault_addr;
-			return true;
-		}
+		if (ip != base + extable[i].insn)
+			continue;
+
+		if (!((1u << trapnr) & extable[i].mask))
+			continue;
+
+		/* found */
+		if (flags)
+			*flags = extable[i].flags;
+		return base + extable[i].fixup;
 	}
 
-	return false;
+	return 0;
+}
+
+bool __fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+			    unsigned long error_code, unsigned long fault_addr)
+{
+	unsigned long new_ip;
+
+	new_ip = get_vdso_exception_table_entry(regs, trapnr, NULL);
+	if (!new_ip)
+		return false;
+
+	instruction_pointer_set(regs, new_ip);
+	regs->di = trapnr;
+	regs->si = error_code;
+	regs->dx = fault_addr;
+	return true;
+}
+
+__attribute_const__ bool __is_async_vdso_exception(struct pt_regs *regs,
+						   int trapnr)
+{
+	unsigned long new_ip;
+	unsigned int flags;
+
+	new_ip = get_vdso_exception_table_entry(regs, trapnr, &flags);
+
+	return new_ip && (flags & ASM_VDSO_ASYNC_FLAGS);
 }
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index 4bf48462fca7..fd4ba24571c8 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -28,6 +28,7 @@ VERSION {
 		clock_getres;
 		__vdso_clock_getres;
 		__vdso_sgx_enter_enclave;
+		__vdso_prefetch_page;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vdso/vprefetch.S b/arch/x86/entry/vdso/vprefetch.S
new file mode 100644
index 000000000000..a0fcafb7d546
--- /dev/null
+++ b/arch/x86/entry/vdso/vprefetch.S
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+#include <asm/errno.h>
+#include <asm/enclu.h>
+
+#include "extable.h"
+
+.code64
+.section .text, "ax"
+
+SYM_FUNC_START(__vdso_prefetch_page)
+	/* Prolog */
+	.cfi_startproc
+	push	%rbp
+	.cfi_adjust_cfa_offset	8
+	.cfi_rel_offset		%rbp, 0
+	mov	%rsp, %rbp
+	.cfi_def_cfa_register	%rbp
+
+	xor	%rax, %rax
+.Laccess_page:
+	movb	(%rdi), %dil
+.Lout:
+
+	/* Epilog */
+	pop	%rbp
+	.cfi_def_cfa		%rsp, 8
+	ret
+
+.Lhandle_exception:
+	mov	$-1ll, %rax
+	jmp	.Lout
+	.cfi_endproc
+ASM_VDSO_EXTABLE_HANDLE .Laccess_page, .Lhandle_exception,		\
+			(1<<X86_TRAP_PF), ASM_VDSO_ASYNC_FLAGS
+
+SYM_FUNC_END(__vdso_prefetch_page)
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 98aa103eb4ab..ee47660fcd0d 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -9,6 +9,7 @@
 #ifndef __ASSEMBLER__
 
 #include <linux/mm_types.h>
+#include <linux/sched.h>
 
 struct vdso_image {
 	void *data;
@@ -49,9 +50,40 @@ extern void __init init_vdso_image(const struct vdso_image *image);
 
 extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
 
-extern bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
-				 unsigned long error_code,
-				 unsigned long fault_addr);
+extern bool __fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+				   unsigned long error_code,
+				   unsigned long fault_addr);
+
+extern __attribute_const__ bool __is_async_vdso_exception(struct pt_regs *regs,
+							  int trapnr);
+
+static inline bool is_exception_in_vdso(struct pt_regs *regs)
+{
+	const struct vdso_image *image = current->mm->context.vdso_image;
+	unsigned long vdso_base = (unsigned long)current->mm->context.vdso;
+
+	return regs->ip >= vdso_base && regs->ip < vdso_base + image->size &&
+		vdso_base != 0;
+}
+
+static inline bool is_async_vdso_exception(struct pt_regs *regs, int trapnr)
+{
+	if (!is_exception_in_vdso(regs))
+		return false;
+
+	return __is_async_vdso_exception(regs, trapnr);
+}
+
+static inline bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+				   unsigned long error_code,
+				   unsigned long fault_addr)
+{
+	if (is_exception_in_vdso(regs))
+		return __fixup_vdso_exception(regs, trapnr, error_code,
+					      fault_addr);
+	return false;
+}
+
 #endif /* __ASSEMBLER__ */
 
 #endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f1f1b5a0956a..87d8ae46510c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1289,6 +1289,10 @@ void do_user_addr_fault(struct pt_regs *regs,
 	if (user_mode(regs)) {
 		local_irq_enable();
 		flags |= FAULT_FLAG_USER;
+		if (IS_ENABLED(CONFIG_GENERIC_VDSO_PREFETCH) &&
+		    is_async_vdso_exception(regs, X86_TRAP_PF))
+			flags |= FAULT_FLAG_ALLOW_RETRY |
+				 FAULT_FLAG_RETRY_NOWAIT;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
 			local_irq_enable();
@@ -1407,8 +1411,11 @@ void do_user_addr_fault(struct pt_regs *regs,
 	 */
 	if (unlikely((fault & VM_FAULT_RETRY) &&
 		     (flags & FAULT_FLAG_ALLOW_RETRY))) {
-		flags |= FAULT_FLAG_TRIED;
-		goto retry;
+		if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			flags |= FAULT_FLAG_TRIED;
+			goto retry;
+		}
+		fixup_vdso_exception(regs, X86_TRAP_PF, hw_error_code, address);
 	}
 
 	mmap_read_unlock(mm);
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index d883ac299508..a64d2b08b6f4 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -30,4 +30,9 @@ config GENERIC_VDSO_TIME_NS
 	  Selected by architectures which support time namespaces in the
 	  VDSO
 
+config GENERIC_VDSO_PREFETCH
+	bool
+	help
+	  Selected by architectures which support page prefetch VDSO
+
 endif
-- 
2.25.1


  parent reply	other threads:[~2021-02-25  7:36 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-02-25  7:29 [RFC 0/6] x86: prefetch_page() vDSO call Nadav Amit
2021-02-25  7:29 ` [RFC 1/6] vdso/extable: fix calculation of base Nadav Amit
2021-02-25 21:16   ` Sean Christopherson
2021-02-26 17:24     ` Nadav Amit
2021-02-26 17:47       ` Sean Christopherson
2021-02-28  9:20         ` Nadav Amit
2021-02-25  7:29 ` [RFC 2/6] x86/vdso: add mask and flags to extable Nadav Amit
2021-02-25  7:29 ` Nadav Amit [this message]
2021-02-25  7:29 ` [RFC 4/6] mm/swap_state: respect FAULT_FLAG_RETRY_NOWAIT Nadav Amit
2021-02-25  7:29 ` [RFC 5/6] mm: use lightweight reclaim on FAULT_FLAG_RETRY_NOWAIT Nadav Amit
2021-02-25  7:29 ` [PATCH 6/6] testing/selftest: test vDSO prefetch_page() Nadav Amit
2021-02-25  8:40 ` [RFC 0/6] x86: prefetch_page() vDSO call Peter Zijlstra
2021-02-25  8:52   ` Nadav Amit
2021-02-25  9:32     ` Nadav Amit
2021-02-25  9:55       ` Peter Zijlstra
2021-02-25 12:16 ` Matthew Wilcox
2021-02-25 16:56   ` Nadav Amit
2021-02-25 17:32     ` Matthew Wilcox
2021-02-25 17:53       ` Nadav Amit

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210225072910.2811795-4-namit@vmware.com \
    --to=nadav.amit@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=bp@alien8.de \
    --cc=hughd@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=mingo@redhat.com \
    --cc=namit@vmware.com \
    --cc=peterz@infradead.org \
    --cc=seanjc@google.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).