* [POC 02/12] init/main.c: call update_rai_access()
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 03/12] arch/Kconfig: add ARCH_HAS_RAI symbol Rasmus Villemoes
` (9 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
I hope that one can actually interchange the order of these calls a bit
so that they read
mark_readonly();
update_rai_access();
free_initmem();
because there will be some metadata associated with each rai_* macro
invocation that might as well live in __initdata. But for now, we will
live with that wasted space.
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
init/main.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/init/main.c b/init/main.c
index a664246450d1..39709ca33316 100644
--- a/init/main.c
+++ b/init/main.c
@@ -92,6 +92,7 @@
#include <linux/rodata_test.h>
#include <linux/jump_label.h>
#include <linux/mem_encrypt.h>
+#include <linux/rai.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -1066,6 +1067,7 @@ static int __ref kernel_init(void *unused)
ftrace_free_init_mem();
free_initmem();
mark_readonly();
+ update_rai_access();
/*
* Kernel mappings are now finalized - update the userspace page-table
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 03/12] arch/Kconfig: add ARCH_HAS_RAI symbol
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
2018-10-17 22:33 ` [POC 02/12] init/main.c: call update_rai_access() Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 04/12] vmlinux.lds.h: handle various rai sections Rasmus Villemoes
` (8 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
arch/Kconfig | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/Kconfig b/arch/Kconfig
index 9d329608913e..160893bd6a5c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -275,6 +275,9 @@ config ARCH_THREAD_STACK_ALLOCATOR
config ARCH_WANTS_DYNAMIC_TASK_STRUCT
bool
+config ARCH_HAS_RAI
+ bool
+
config HAVE_REGS_AND_STACK_ACCESS_API
bool
help
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 04/12] vmlinux.lds.h: handle various rai sections
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
2018-10-17 22:33 ` [POC 02/12] init/main.c: call update_rai_access() Rasmus Villemoes
2018-10-17 22:33 ` [POC 03/12] arch/Kconfig: add ARCH_HAS_RAI symbol Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 05/12] x86-64: initial ro-after-init patching support Rasmus Villemoes
` (7 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
include/asm-generic/vmlinux.lds.h | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f09ee3c544bc..f38510c6bfcc 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -257,6 +257,7 @@
__start___verbose = .; \
KEEP(*(__verbose)) \
__stop___verbose = .; \
+ KEEP(*(.rai_templ)) \
LIKELY_PROFILE() \
BRANCH_PROFILE() \
TRACE_PRINTKS() \
@@ -328,6 +329,9 @@
__start___tracepoints_ptrs = .; \
KEEP(*(__tracepoints_ptrs)) /* Tracepoints: pointer array */ \
__stop___tracepoints_ptrs = .; \
+ __start_rai_data = .; \
+ KEEP(*(.rai_data)) \
+ __stop_rai_data = .; \
*(__tracepoints_strings)/* Tracepoints: strings */ \
} \
\
@@ -494,6 +498,7 @@
#define TEXT_TEXT \
ALIGN_FUNCTION(); \
*(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \
+ *(.text.rai_thunk) \
*(.text..refcount) \
*(.ref.text) \
MEM_KEEP(init.text*) \
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 05/12] x86-64: initial ro-after-init patching support
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
` (2 preceding siblings ...)
2018-10-17 22:33 ` [POC 04/12] vmlinux.lds.h: handle various rai sections Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 06/12] ugly ugly hack Rasmus Villemoes
` (6 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
This just sets things up so that the ARCH_HAS_RAI symbol gets selected,
and prepares the arch-specific headers and support functions.
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/rai.S | 18 ++++++++++++++++++
arch/x86/include/asm/rai.h | 25 ++++++++++++++++++++++++
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/macros.S | 1 +
arch/x86/kernel/rai.c | 39 ++++++++++++++++++++++++++++++++++++++
6 files changed, 85 insertions(+)
create mode 100644 arch/x86/include/asm/rai.S
create mode 100644 arch/x86/include/asm/rai.h
create mode 100644 arch/x86/kernel/rai.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5136a1281870..3f1679f258c9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -62,6 +62,7 @@ config X86
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_RAI if X86_64
select ARCH_HAS_REFCOUNT
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE
diff --git a/arch/x86/include/asm/rai.S b/arch/x86/include/asm/rai.S
new file mode 100644
index 000000000000..253d27453416
--- /dev/null
+++ b/arch/x86/include/asm/rai.S
@@ -0,0 +1,18 @@
+#ifdef __ASSEMBLY__
+
+.macro rai_entry type instr instr_end templ templ_end thunk
+ .long \type
+ .long \instr - .
+ .long \instr_end - \instr
+ .long \templ - .
+ .long \templ_end - \templ
+ .long \thunk - .
+.endm
+
+.macro rai_entry_pad start end
+ .ifgt STRUCT_RAI_ENTRY_SIZE-(\end-\start)
+ .skip STRUCT_RAI_ENTRY_SIZE-(\end-\start), 0x00
+ .endif
+.endm
+
+#endif
diff --git a/arch/x86/include/asm/rai.h b/arch/x86/include/asm/rai.h
new file mode 100644
index 000000000000..269d696255b0
--- /dev/null
+++ b/arch/x86/include/asm/rai.h
@@ -0,0 +1,25 @@
+#ifndef _ASM_X86_RAI_H
+#define _ASM_X86_RAI_H
+
+#define STRUCT_RAI_ENTRY_SIZE 24
+
+/* Put the asm macros in a separate file for easier editing. */
+#include <asm/rai.S>
+
+#ifndef __ASSEMBLY__
+
+struct rai_entry {
+ int type; /* RAI_xxx constant */
+ s32 instr_offset; /* member-relative offset to instructions-to-be-patched */
+ s32 instr_len; /* size of area, >= templ_len */
+ s32 templ_offset; /* member-relative offset to template */
+ s32 templ_len; /* length of template */
+ s32 thunk_offset; /* member-relative offset to ool thunk */
+ /* type-specific data follows */
+};
+_Static_assert(sizeof(struct rai_entry) == STRUCT_RAI_ENTRY_SIZE,
+ "please update STRUCT_RAI_ENTRY_SIZE");
+
+#endif /* !__ASSEMBLY */
+
+#endif /* _ASM_X86_RAI_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8824d01c0c35..b4dea4e72081 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o hw_breakpoint.o
+obj-$(CONFIG_ARCH_HAS_RAI) += rai.o
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
diff --git a/arch/x86/kernel/macros.S b/arch/x86/kernel/macros.S
index 161c95059044..af5672a302d4 100644
--- a/arch/x86/kernel/macros.S
+++ b/arch/x86/kernel/macros.S
@@ -14,3 +14,4 @@
#include <asm/asm.h>
#include <asm/cpufeature.h>
#include <asm/jump_label.h>
+#include <asm/rai.h>
diff --git a/arch/x86/kernel/rai.c b/arch/x86/kernel/rai.c
new file mode 100644
index 000000000000..2c6ff06f7a34
--- /dev/null
+++ b/arch/x86/kernel/rai.c
@@ -0,0 +1,39 @@
+#include <linux/memory.h>
+#include <linux/mutex.h>
+#include <linux/rai.h>
+#include <asm/text-patching.h>
+
+extern struct rai_entry __start_rai_data[];
+extern struct rai_entry __stop_rai_data[];
+
+static void
+rai_patch_one(const struct rai_entry *r)
+{
+ u8 *instr = (u8*)&r->instr_offset + r->instr_offset;
+ u8 *templ = (u8*)&r->templ_offset + r->templ_offset;
+ u8 *thunk = (u8*)&r->thunk_offset + r->thunk_offset;
+
+ switch (r->type) {
+ default:
+ WARN_ONCE(1, "unhandled RAI type %d\n", r->type);
+ return;
+ }
+ text_poke_bp(instr, templ, r->templ_len, thunk);
+}
+
+static void
+rai_patch(const struct rai_entry *start, const struct rai_entry *stop)
+{
+ const struct rai_entry *r;
+
+ for (r = start; r < stop; ++r)
+ rai_patch_one(r);
+}
+
+void
+update_rai_access(void)
+{
+ mutex_lock(&text_mutex);
+ rai_patch(__start_rai_data, __stop_rai_data);
+ mutex_unlock(&text_mutex);
+}
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 06/12] ugly ugly hack
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
` (3 preceding siblings ...)
2018-10-17 22:33 ` [POC 05/12] x86-64: initial ro-after-init patching support Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 07/12] x86-64: rai: implement _rai_load Rasmus Villemoes
` (5 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
Just to have a quick way of seeing that rai-patching works (i.e., once
we implement rai_load, we'd not expect the output to change). Also,
inside virtme we can do a quick "gdb vmlinux /proc/kcore" and
disassemble rai_proc_show to see how the patched function looks.
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
arch/x86/kernel/rai.c | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/arch/x86/kernel/rai.c b/arch/x86/kernel/rai.c
index 2c6ff06f7a34..819d03a025e3 100644
--- a/arch/x86/kernel/rai.c
+++ b/arch/x86/kernel/rai.c
@@ -37,3 +37,43 @@ update_rai_access(void)
rai_patch(__start_rai_data, __stop_rai_data);
mutex_unlock(&text_mutex);
}
+
+#if 1
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static int one, two;
+static long three;
+
+static int
+rai_proc_show(struct seq_file *m, void *v) {
+ seq_printf(m, "one: %d, two: %d, three: %ld\n",
+ rai_load(one), rai_load(two), rai_load(three));
+ one = two = three = -1;
+
+ return 0;
+}
+
+static int
+rai_proc_open(struct inode *inode, struct file *file) {
+ return single_open(file, rai_proc_show, NULL);
+}
+
+static const struct file_operations rai_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = rai_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rai_proc_init(void) {
+ one = 1;
+ two = 2;
+ three = 3;
+
+ proc_create("rai", 0, NULL, &rai_proc_fops);
+ return 0;
+}
+late_initcall(rai_proc_init);
+#endif
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 07/12] x86-64: rai: implement _rai_load
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
` (4 preceding siblings ...)
2018-10-17 22:33 ` [POC 06/12] ugly ugly hack Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 08/12] fs/dcache.c: access dentry_cache via rai_load Rasmus Villemoes
` (4 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
This implements the simplest of the rai_* operations, loading a
value. For load of an 8-byte value, I believe we do need to keep room
for a movabs, since there's no guarantee the final value can be loaded
as an imm32 or using a %rip-relative leaq.
It wouldn't hurt to add some sanity checking in rai_patch_one, e.g. at
least check that the immediate we are replacing is the dummy 0x12345678
we used in the .rai_templ section.
That the patching works can be seen in a quick virtme session. gdb on
vmlinux and /proc/kcore shows
(gdb) x/16i rai_proc_show
0xffffffff8108c120 <rai_proc_show>: mov $0xffffffff81fd9ad4,%rsi
0xffffffff8108c127 <rai_proc_show+7>: jmpq 0xffffffff819652e9
0xffffffff8108c12c <rai_proc_show+12>: nop
0xffffffff8108c12d <rai_proc_show+13>: nop
0xffffffff8108c12e <rai_proc_show+14>: nop
0xffffffff8108c12f <rai_proc_show+15>: nop
0xffffffff8108c130 <rai_proc_show+16>: nop
0xffffffff8108c131 <rai_proc_show+17>: jmpq 0xffffffff819652f5
0xffffffff8108c136 <rai_proc_show+22>: jmpq 0xffffffff81965300
0xffffffff8108c13b <rai_proc_show+27>: callq 0xffffffff81238bb0 <seq_printf>
0xffffffff8108c140 <rai_proc_show+32>: mov $0xffffffffffffffff,%rax
0xffffffff8108c147 <rai_proc_show+39>: mov %rax,0x17b228a(%rip) # 0xffffffff8283e3d8 <three>
0xffffffff8108c14e <rai_proc_show+46>: mov %eax,0x17b228c(%rip) # 0xffffffff8283e3e0 <two>
0xffffffff8108c154 <rai_proc_show+52>: mov %eax,0x17b228a(%rip) # 0xffffffff8283e3e4 <one>
0xffffffff8108c15a <rai_proc_show+58>: xor %eax,%eax
0xffffffff8108c15c <rai_proc_show+60>: retq
(gdb) x/16i 0xffffffff96e8c120
0xffffffff96e8c120: mov $0xffffffff97dd9ad4,%rsi
0xffffffff96e8c127: movabs $0x3,%r8
0xffffffff96e8c131: mov $0x2,%ecx
0xffffffff96e8c136: mov $0x1,%edx
0xffffffff96e8c13b: callq 0xffffffff97038bb0
0xffffffff96e8c140: mov $0xffffffffffffffff,%rax
0xffffffff96e8c147: mov %rax,0x17b228a(%rip) # 0xffffffff9863e3d8
0xffffffff96e8c14e: mov %eax,0x17b228c(%rip) # 0xffffffff9863e3e0
0xffffffff96e8c154: mov %eax,0x17b228a(%rip) # 0xffffffff9863e3e4
0xffffffff96e8c15a: xor %eax,%eax
0xffffffff96e8c15c: retq
0xffffffff96e8c15d: nopl (%rax)
0xffffffff96e8c160: push %rbx
0xffffffff96e8c161: mov $0xffffffff9804c240,%rdi
0xffffffff96e8c168: mov $0xffffffff97e9fccc,%rbx
0xffffffff96e8c16f: callq 0xffffffff9776b230
where we also see that gcc chooses the destination registers rather
intelligently. As expected, repeated "cat /proc/rai" continues to print
"one: 1, two: 2, three: 3".
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
arch/x86/include/asm/rai.S | 42 +++++++++++++++++++++++++++++++++++++-
arch/x86/include/asm/rai.h | 30 ++++++++++++++++++++++++++-
arch/x86/kernel/rai.c | 18 ++++++++++++++++
3 files changed, 88 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/rai.S b/arch/x86/include/asm/rai.S
index 253d27453416..f42cdd8db876 100644
--- a/arch/x86/include/asm/rai.S
+++ b/arch/x86/include/asm/rai.S
@@ -8,11 +8,51 @@
.long \templ_end - \templ
.long \thunk - .
.endm
-
+
.macro rai_entry_pad start end
.ifgt STRUCT_RAI_ENTRY_SIZE-(\end-\start)
.skip STRUCT_RAI_ENTRY_SIZE-(\end-\start), 0x00
.endif
.endm
+.macro rai_load dst, var, type
+ .pushsection .rai_templ, "aw"
+10:
+ .ifeq \type - RAI_LOAD_8
+ movabs $0x1234567812345678, \dst
+ .else
+ mov $0x12345678, \dst
+ .endif
+11:
+ .popsection
+
+ /* Even if the mov \var, \dst is short enough to fit in the
+ * space we reserve in .text, we still need the thunk for when
+ * we do the immediate patching. */
+ .pushsection .text.rai_thunk, "ax"
+20:
+ mov \var(%rip), \dst
+ jmp 32f
+21:
+ .popsection
+
+ /* The part that goes into .text */
+30:
+ /* silence objtool by actually using the thunk for now */
+ jmp 20b
+ /* mov \var(%rip), \dst */
+31:
+ .skip -(((11b - 10b)-(31b - 30b)) > 0)*((11b - 10b)-(31b - 30b)), 0x90
+32:
+
+ .pushsection .rai_data, "a"
+40:
+ rai_entry \type 30b 32b 10b 11b 20b
+ .quad \var /* .load.addr */
+41:
+ rai_entry_pad 40b 41b
+ .popsection
+.endm /* rai_load */
+
+
#endif
diff --git a/arch/x86/include/asm/rai.h b/arch/x86/include/asm/rai.h
index 269d696255b0..b57494c98d0f 100644
--- a/arch/x86/include/asm/rai.h
+++ b/arch/x86/include/asm/rai.h
@@ -1,7 +1,10 @@
#ifndef _ASM_X86_RAI_H
#define _ASM_X86_RAI_H
-#define STRUCT_RAI_ENTRY_SIZE 24
+#define RAI_LOAD_4 0
+#define RAI_LOAD_8 1
+
+#define STRUCT_RAI_ENTRY_SIZE 32
/* Put the asm macros in a separate file for easier editing. */
#include <asm/rai.S>
@@ -16,10 +19,35 @@ struct rai_entry {
s32 templ_len; /* length of template */
s32 thunk_offset; /* member-relative offset to ool thunk */
/* type-specific data follows */
+ union {
+ struct {
+ void *addr;
+ } load;
+ };
};
_Static_assert(sizeof(struct rai_entry) == STRUCT_RAI_ENTRY_SIZE,
"please update STRUCT_RAI_ENTRY_SIZE");
+#define _rai_load(var) ({ \
+ typeof(var) ret__; \
+ switch(sizeof(var)) { \
+ case 4: \
+ asm("rai_load %0, %c1, %c2" \
+ : "=r" (ret__) \
+ : "i" (&(var)), "i" (RAI_LOAD_4)); \
+ break; \
+ case 8: \
+ asm("rai_load %0, %c1, %c2" \
+ : "=r" (ret__) \
+ : "i" (&(var)), "i" (RAI_LOAD_8)); \
+ break; \
+ default: \
+ ret__ = _rai_load_fallback(var); \
+ break; \
+ } \
+ ret__; \
+ })
+
#endif /* !__ASSEMBLY */
#endif /* _ASM_X86_RAI_H */
diff --git a/arch/x86/kernel/rai.c b/arch/x86/kernel/rai.c
index 819d03a025e3..e55e85f11a2e 100644
--- a/arch/x86/kernel/rai.c
+++ b/arch/x86/kernel/rai.c
@@ -14,6 +14,24 @@ rai_patch_one(const struct rai_entry *r)
u8 *thunk = (u8*)&r->thunk_offset + r->thunk_offset;
switch (r->type) {
+ case RAI_LOAD_4: {
+ const u32 *imm = r->load.addr;
+ /*
+ * The immediate is the last 4 bytes of the template,
+ * regardless of the operand encoding.
+ */
+ memcpy(templ + r->templ_len - sizeof(*imm), imm, sizeof(*imm));
+ break;
+ }
+ case RAI_LOAD_8: {
+ const u64 *imm = r->load.addr;
+ /*
+ * The immediate is the last 8 bytes of the template,
+ * regardless of the operand encoding.
+ */
+ memcpy(templ + r->templ_len - sizeof(*imm), imm, sizeof(*imm));
+ break;
+ }
default:
WARN_ONCE(1, "unhandled RAI type %d\n", r->type);
return;
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 08/12] fs/dcache.c: access dentry_cache via rai_load
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
` (5 preceding siblings ...)
2018-10-17 22:33 ` [POC 07/12] x86-64: rai: implement _rai_load Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 09/12] fs/inode.c: access inode_cachep " Rasmus Villemoes
` (3 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
This avoids a cacheline access to get the value of the dentry_cache
pointer in the places that do a kmem_cache_*(dentry_cache, ...);
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
fs/dcache.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 2e7e8d85e9b4..1d54dfb38c9d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -30,6 +30,7 @@
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
+#include <linux/rai.h>
#include "internal.h"
#include "mount.h"
@@ -76,7 +77,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
-static struct kmem_cache *dentry_cache __read_mostly;
+static struct kmem_cache *__dentry_cache __read_mostly;
+#define dentry_cache rai_load(__dentry_cache)
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
@@ -3088,7 +3090,7 @@ static void __init dcache_init(void)
* but it is probably not worth it because of the cache nature
* of the dcache.
*/
- dentry_cache = KMEM_CACHE_USERCOPY(dentry,
+ __dentry_cache = KMEM_CACHE_USERCOPY(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
d_iname);
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 09/12] fs/inode.c: access inode_cachep via rai_load
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
` (6 preceding siblings ...)
2018-10-17 22:33 ` [POC 08/12] fs/dcache.c: access dentry_cache via rai_load Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 10/12] hack: /proc/rai: add rai_bucket_shift use Rasmus Villemoes
` (2 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
This avoids a cacheline access to get the value of the inode_cachep
pointer in the places that do a kmem_cache_*(inode_cachep, ...);
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
fs/inode.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 42f6d25f32a5..f1e9f548494e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -19,6 +19,7 @@
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
+#include <linux/rai.h>
#include <trace/events/writeback.h>
#include "internal.h"
@@ -74,7 +75,8 @@ struct inodes_stat_t inodes_stat;
static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);
-static struct kmem_cache *inode_cachep __read_mostly;
+static struct kmem_cache *__inode_cachep __read_mostly;
+#define inode_cachep rai_load(__inode_cachep)
static long get_nr_inodes(void)
{
@@ -1951,7 +1953,7 @@ void __init inode_init_early(void)
void __init inode_init(void)
{
/* inode slab cache */
- inode_cachep = kmem_cache_create("inode_cache",
+ __inode_cachep = kmem_cache_create("inode_cache",
sizeof(struct inode),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 10/12] hack: /proc/rai: add rai_bucket_shift use
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
` (7 preceding siblings ...)
2018-10-17 22:33 ` [POC 09/12] fs/inode.c: access inode_cachep " Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 11/12] x86-64: implement _rai_bucket_shift Rasmus Villemoes
2018-10-17 22:33 ` [POC 12/12] fs/dcache.c: use rai_bucket_shift for dentry hashtable Rasmus Villemoes
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
arch/x86/kernel/rai.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/arch/x86/kernel/rai.c b/arch/x86/kernel/rai.c
index e55e85f11a2e..c4284ce7478f 100644
--- a/arch/x86/kernel/rai.c
+++ b/arch/x86/kernel/rai.c
@@ -63,11 +63,21 @@ update_rai_access(void)
static int one, two;
static long three;
+static struct hlist_head *ht1;
+static unsigned shift1;
+
static int
rai_proc_show(struct seq_file *m, void *v) {
+ unsigned hash = 0xdeadbeef;
+
seq_printf(m, "one: %d, two: %d, three: %ld\n",
rai_load(one), rai_load(two), rai_load(three));
+ seq_printf(m, "ht1: %016lx, bucket 0x%08x: %016lx\n",
+ (long)rai_load(ht1), hash, (long)rai_bucket_shift(ht1, shift1, hash));
+
one = two = three = -1;
+ ht1 = NULL;
+ shift1 = 2;
return 0;
}
@@ -89,6 +99,8 @@ static int __init rai_proc_init(void) {
one = 1;
two = 2;
three = 3;
+ ht1 = (void*)0xffffffffabcd0000UL;
+ shift1 = 26;
proc_create("rai", 0, NULL, &rai_proc_fops);
return 0;
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 11/12] x86-64: implement _rai_bucket_shift
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
` (8 preceding siblings ...)
2018-10-17 22:33 ` [POC 10/12] hack: /proc/rai: add rai_bucket_shift use Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
2018-10-17 22:33 ` [POC 12/12] fs/dcache.c: use rai_bucket_shift for dentry hashtable Rasmus Villemoes
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
The only slightly tricky issue is that for implementing the thunk, we
need some temporary registers (with %ecx being one of them), and we
don't know whether the hash input and/or destination register collide
with whichever we choose. One _could_ attempt text parsing in asm in
order to find a safe set of temps, but they would need to be restored
anyway.
So instead, just pick %edx and %ecx, and start by pushing them on the
stack. Then compute the result we need, push that to the stack, restore
%edx and %ecx, and finally pop the result into the destination
register (which may be %rdx or %rcx or any other) and adjust the stack
pointer.
The patched code does need to do a shr, so I don't think there's a way
around the cc clobber.
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
arch/x86/include/asm/rai.S | 59 ++++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/rai.h | 21 +++++++++++++-
arch/x86/kernel/rai.c | 13 +++++++++
3 files changed, 92 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/rai.S b/arch/x86/include/asm/rai.S
index f42cdd8db876..144697e146b6 100644
--- a/arch/x86/include/asm/rai.S
+++ b/arch/x86/include/asm/rai.S
@@ -54,5 +54,64 @@
.popsection
.endm /* rai_load */
+ /*
+ * For convenience, and because it should not cause that much
+ * worse code gen, we tie the hash to an output register, to
+ * avoid it being given in the same register where we must
+ * place the actual output. Since the hash output is unused,
+ * gcc is free to pick that register for anything immediately
+ * afterwards.
+ */
+.macro rai_bucket_shift dst, hash, hashq, base, shift
+ .pushsection .rai_templ, "aw"
+10: movabs $0x1234567812345678, \dst
+ /*
+ * Actually, the hash output contains the shifted hash
+ * value. But I don't think there's a way to inform gcc about
+ * that, and I don't know how useful it would be anyway. So in
+ * the thunk below, we don't do anything to have the same
+ * property, though it would be doable.
+ */
+ shr $6, \hash
+ lea (\dst, \hashq, 8), \dst
+11:
+ .popsection
+
+ .pushsection .text.rai_thunk, "ax"
+20: /* dst and hash are registers, we can clobber hash */
+ push %rdx
+ push %rcx
+ mov \hash, %edx
+ mov \shift(%rip), %ecx
+ shr %cl,%edx
+ /* move the shifted value into \hash, so the below works regardless of whether \dst is %rdx or not */
+ mov %edx, \hash
+ mov \base(%rip), \dst
+ lea (\dst, \hashq, 8), \dst
+ /* We have our final value. */
+ push \dst
+ /* Now restore %rdx and %rcx, then finally restore \dst and adjust the stack pointer */
+ mov 0x8(%rsp), %rcx
+ mov 0x10(%rsp), %rdx
+ pop \dst
+ add $0x10, %rsp
+ jmp 32f
+21:
+ .popsection
+ /* The part that goes into .text */
+30: jmp 20b
+31: .skip -(((11b - 10b)-(31b - 30b)) > 0)*((11b - 10b)-(31b - 30b)), 0x90
+32:
+
+ .pushsection .rai_data, "a"
+40:
+ rai_entry RAI_BUCKET_SHIFT_8_4_4 30b 32b 10b 11b 20b
+ .quad \base /* .bucket_shift.base_addr */
+ .quad \shift /* .bucket_shift.shift_addr */
+41:
+ rai_entry_pad 40b 41b
+ .popsection
+.endm /* rai_bucket_shift */
+
#endif
diff --git a/arch/x86/include/asm/rai.h b/arch/x86/include/asm/rai.h
index b57494c98d0f..c9726d1e40ed 100644
--- a/arch/x86/include/asm/rai.h
+++ b/arch/x86/include/asm/rai.h
@@ -3,8 +3,9 @@
#define RAI_LOAD_4 0
#define RAI_LOAD_8 1
+#define RAI_BUCKET_SHIFT_8_4_4 2
-#define STRUCT_RAI_ENTRY_SIZE 32
+#define STRUCT_RAI_ENTRY_SIZE 40
/* Put the asm macros in a separate file for easier editing. */
#include <asm/rai.S>
@@ -23,6 +24,10 @@ struct rai_entry {
struct {
void *addr;
} load;
+ struct {
+ void *base_addr;
+ void *shift_addr;
+ } bucket_shift;
};
};
_Static_assert(sizeof(struct rai_entry) == STRUCT_RAI_ENTRY_SIZE,
@@ -48,6 +53,20 @@ _Static_assert(sizeof(struct rai_entry) == STRUCT_RAI_ENTRY_SIZE,
ret__; \
})
+#define _rai_bucket_shift(base, shift, hash) ({ \
+ typeof(base) ret__; \
+ typeof(hash) unused__; \
+ if (sizeof(*(base)) == 8 && sizeof(shift) == 4 \
+ && sizeof(hash) == 4) \
+ asm("rai_bucket_shift %0 %1 %q1 %c3 %c4" \
+ : "=r" (ret__), "=r" (unused__) \
+ : "1" (hash), "i" (&(base)), "i" (&(shift)) \
+ : "cc"); \
+ else \
+ ret__ = _rai_bucket_shift_fallback(base, shift, hash); \
+ ret__; \
+ })
+
#endif /* !__ASSEMBLY */
#endif /* _ASM_X86_RAI_H */
diff --git a/arch/x86/kernel/rai.c b/arch/x86/kernel/rai.c
index c4284ce7478f..3aa2e3b2c31b 100644
--- a/arch/x86/kernel/rai.c
+++ b/arch/x86/kernel/rai.c
@@ -32,6 +32,19 @@ rai_patch_one(const struct rai_entry *r)
memcpy(templ + r->templ_len - sizeof(*imm), imm, sizeof(*imm));
break;
}
+ case RAI_BUCKET_SHIFT_8_4_4: {
+ const u32 *shiftp = r->bucket_shift.shift_addr;
+ const u64 *basep = r->bucket_shift.base_addr;
+ /*
+ * This should be made more robust. For now, assume we
+ * have a 10-byte movabs followed by a 3-byte shr. And
+ * while *shiftp is 4 bytes wide, we just need the
+ * LSB.
+ */
+ memcpy(templ + 2, basep, sizeof(*basep));
+ memcpy(templ + 12, shiftp, 1);
+ break;
+ }
default:
WARN_ONCE(1, "unhandled RAI type %d\n", r->type);
return;
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [POC 12/12] fs/dcache.c: use rai_bucket_shift for dentry hashtable
2018-10-17 22:33 [POC 01/12] Accessing __ro_after_init variables as immediates Rasmus Villemoes
` (9 preceding siblings ...)
2018-10-17 22:33 ` [POC 11/12] x86-64: implement _rai_bucket_shift Rasmus Villemoes
@ 2018-10-17 22:33 ` Rasmus Villemoes
10 siblings, 0 replies; 12+ messages in thread
From: Rasmus Villemoes @ 2018-10-17 22:33 UTC (permalink / raw)
To: linux-kernel
Cc: x86, H . Peter Anvin, Ingo Molnar, Kirill A . Shutemov, Rasmus Villemoes
Before this, the disassembly of __d_lookup_rcu begins
0x2d10 <__d_lookup_rcu>: push %r15
0x2d12 <__d_lookup_rcu+2>: push %r14
0x2d14 <__d_lookup_rcu+4>: push %r13
0x2d16 <__d_lookup_rcu+6>: push %r12
0x2d18 <__d_lookup_rcu+8>: push %rbp
0x2d19 <__d_lookup_rcu+9>: push %rbx
0x2d1a <__d_lookup_rcu+10>: sub $0x18,%rsp
0x2d1e <__d_lookup_rcu+14>: mov (%rsi),%r12
0x2d21 <__d_lookup_rcu+17>: mov 0x0(%rip),%ecx # 0x2d27 <__d_lookup_rcu+23>
0x2d27 <__d_lookup_rcu+23>: mov 0x8(%rsi),%r13
0x2d2b <__d_lookup_rcu+27>: mov %r12d,%eax
0x2d2e <__d_lookup_rcu+30>: shr %cl,%eax
0x2d30 <__d_lookup_rcu+32>: mov 0x0(%rip),%rcx # 0x2d37 <__d_lookup_rcu+39>
0x2d37 <__d_lookup_rcu+39>: lea (%rcx,%rax,8),%rax
0x2d3b <__d_lookup_rcu+43>: mov (%rax),%rbx
0x2d3e <__d_lookup_rcu+46>: and $0xfffffffffffffffe,%rbx
0x2d42 <__d_lookup_rcu+50>: je 0x2df3 <__d_lookup_rcu+227>
0x2d48 <__d_lookup_rcu+56>: mov %r12,%rax
0x2d4b <__d_lookup_rcu+59>: mov %r12d,%r10d
0x2d4e <__d_lookup_rcu+62>: mov %rdx,%r15
After this, and after the patching of the run-time values of
dentry_hashtable and d_hash_shift, gdb on /proc/kcore says that we now
have
0xffffffff8902e8d0: push %r15
0xffffffff8902e8d2: push %r14
0xffffffff8902e8d4: push %r13
0xffffffff8902e8d6: push %r12
0xffffffff8902e8d8: push %rbp
0xffffffff8902e8d9: push %rbx
0xffffffff8902e8da: sub $0x18,%rsp
0xffffffff8902e8de: mov (%rsi),%r12
0xffffffff8902e8e1: mov 0x8(%rsi),%r13
0xffffffff8902e8e5: mov %r12d,%eax
0xffffffff8902e8e8: movabs $0xffff959b80007000,%rcx
0xffffffff8902e8f2: shr $0x12,%eax
0xffffffff8902e8f5: lea (%rcx,%rax,8),%rcx
0xffffffff8902e8f9: mov (%rcx),%rbx
0xffffffff8902e8fc: and $0xfffffffffffffffe,%rbx
0xffffffff8902e900: je 0xffffffff8902e9b1
0xffffffff8902e906: mov %r12,%rax
0xffffffff8902e909: mov %r12d,%r10d
0xffffffff8902e90c: mov %rdx,%r15
The shr $0x12, %eax is consistent with
[ 0.300676] Dentry cache hash table entries: 16384 (order: 5, 131072 bytes)
and nothing seems to explode.
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
fs/dcache.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 1d54dfb38c9d..226298c3a599 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -100,7 +100,7 @@ static struct hlist_bl_head *dentry_hashtable __read_mostly;
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
- return dentry_hashtable + (hash >> d_hash_shift);
+ return rai_bucket_shift(dentry_hashtable, d_hash_shift, hash);
}
#define IN_LOOKUP_SHIFT 10
--
2.19.1.6.gbde171bbf5
^ permalink raw reply related [flat|nested] 12+ messages in thread