From: Anthony Yznaga <anthony.yznaga@oracle.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: willy@infradead.org, corbet@lwn.net, tglx@linutronix.de,
mingo@redhat.com, bp@alien8.de, x86@kernel.org, hpa@zytor.com,
dave.hansen@linux.intel.com, luto@kernel.org,
peterz@infradead.org, rppt@linux.ibm.com,
akpm@linux-foundation.org, hughd@google.com,
ebiederm@xmission.com, masahiroy@kernel.org, ardb@kernel.org,
ndesaulniers@google.com, dima@golovin.in,
daniel.kiper@oracle.com, nivedita@alum.mit.edu,
rafael.j.wysocki@intel.com, dan.j.williams@intel.com,
zhenzhong.duan@oracle.com, jroedel@suse.de, bhe@redhat.com,
guro@fb.com, Thomas.Lendacky@amd.com,
andriy.shevchenko@linux.intel.com, keescook@chromium.org,
hannes@cmpxchg.org, minchan@kernel.org, mhocko@kernel.org,
ying.huang@intel.com, yang.shi@linux.alibaba.com,
gustavo@embeddedor.com, ziqian.lzq@antfin.com,
vdavydov.dev@gmail.com, jason.zeng@intel.com,
kevin.tian@intel.com, zhiyuan.lv@intel.com, lei.l.li@intel.com,
paul.c.lai@intel.com, ashok.raj@intel.com,
linux-fsdevel@vger.kernel.org, linux-doc@vger.kernel.org,
kexec@lists.infradead.org
Subject: [RFC 21/43] x86/KASLR: PKRAM: support physical kaslr
Date: Wed, 6 May 2020 17:41:47 -0700 [thread overview]
Message-ID: <1588812129-8596-22-git-send-email-anthony.yznaga@oracle.com> (raw)
In-Reply-To: <1588812129-8596-1-git-send-email-anthony.yznaga@oracle.com>
Avoid regions of memory that contain preserved pages when computing
slots used to select where to put the decompressed kernel.
Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
---
arch/x86/boot/compressed/Makefile | 3 +
arch/x86/boot/compressed/kaslr.c | 67 ++++++----
arch/x86/boot/compressed/misc.h | 19 +++
arch/x86/boot/compressed/pkram.c | 252 ++++++++++++++++++++++++++++++++++++++
4 files changed, 320 insertions(+), 21 deletions(-)
create mode 100644 arch/x86/boot/compressed/pkram.c
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 5f7c262bcc99..ba0d76c53574 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -84,6 +84,9 @@ ifdef CONFIG_X86_64
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o
vmlinux-objs-y += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
+ifdef CONFIG_RANDOMIZE_BASE
+ vmlinux-objs-$(CONFIG_PKRAM) += $(obj)/pkram.o
+endif
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index d7408af55738..3f0a6fb15ac2 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -613,31 +613,16 @@ static unsigned long slots_fetch_random(void)
return 0;
}
-static void __process_mem_region(struct mem_vector *entry,
- unsigned long minimum,
- unsigned long image_size)
+void ___process_mem_region(struct mem_vector *entry,
+ unsigned long minimum,
+ unsigned long image_size)
{
struct mem_vector region, overlap;
- unsigned long start_orig, end;
+ unsigned long start_orig;
struct mem_vector cur_entry;
- /* On 32-bit, ignore entries entirely above our maximum. */
- if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
- return;
-
- /* Ignore entries entirely below our minimum. */
- if (entry->start + entry->size < minimum)
- return;
-
- /* Ignore entries above memory limit */
- end = min(entry->size + entry->start, mem_limit);
- if (entry->start >= end)
- return;
- cur_entry.start = entry->start;
- cur_entry.size = end - entry->start;
-
- region.start = cur_entry.start;
- region.size = cur_entry.size;
+ region.start = cur_entry.start = entry->start;
+ region.size = cur_entry.size = entry->size;
/* Give up if slot area array is full. */
while (slot_area_index < MAX_SLOT_AREA) {
@@ -691,6 +676,39 @@ static void __process_mem_region(struct mem_vector *entry,
}
}
+/*
+ * Clamp a memory map entry to the range usable for the kernel image and
+ * pass what is left on for slot accounting.
+ *
+ * @entry:      candidate memory region
+ * @minimum:    lowest address acceptable for the image
+ * @image_size: size needed for the decompressed kernel
+ *
+ * Entries that are out of range, or too small once clamped to mem_limit,
+ * are dropped. The remainder goes to pkram_process_mem_region() when
+ * pkram_enabled() reports true, and to ___process_mem_region() otherwise.
+ */
+static void __process_mem_region(struct mem_vector *entry,
+				 unsigned long minimum,
+				 unsigned long image_size)
+{
+	struct mem_vector cur_entry;
+	unsigned long end;
+
+	/* On 32-bit, ignore entries entirely above our maximum. */
+	if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
+		return;
+
+	/* Ignore entries entirely below our minimum. */
+	if (entry->start + entry->size < minimum)
+		return;
+
+	/* Ignore entries above memory limit */
+	end = min(entry->size + entry->start, mem_limit);
+	if (entry->start >= end)
+		return;
+	cur_entry.start = entry->start;
+	cur_entry.size = end - entry->start;
+
+	/* Return if region can't contain decompressed kernel */
+	if (cur_entry.size < image_size)
+		return;
+
+	/* Both helpers return void, so call them as plain statements. */
+	if (pkram_enabled())
+		pkram_process_mem_region(&cur_entry, minimum, image_size);
+	else
+		___process_mem_region(&cur_entry, minimum, image_size);
+}
+
static bool process_mem_region(struct mem_vector *region,
unsigned long long minimum,
unsigned long long image_size)
@@ -902,6 +920,8 @@ void choose_random_location(unsigned long input,
return;
}
+ pkram_init();
+
#ifdef CONFIG_X86_5LEVEL
if (__read_cr4() & X86_CR4_LA57) {
__pgtable_l5_enabled = 1;
@@ -952,3 +972,8 @@ void choose_random_location(unsigned long input,
random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
*virt_addr = random_addr;
}
+
+/* Report whether slot_area_index has reached MAX_SLOT_AREA (1) or not (0). */
+int slot_areas_full(void)
+{
+	if (slot_area_index == MAX_SLOT_AREA)
+		return 1;
+
+	return 0;
+}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 726e264410ff..ca1a8ae5ebe9 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -117,6 +117,25 @@ static inline void console_init(void)
{ }
#endif
+void ___process_mem_region(struct mem_vector *entry,
+ unsigned long minimum,
+ unsigned long image_size); /* defined in kaslr.c */
+/*
+ * PKRAM hooks called from the KASLR code. Without CONFIG_PKRAM they become
+ * inline stubs: pkram_init() does nothing, pkram_enabled() returns 0 and
+ * pkram_process_mem_region() forwards to ___process_mem_region().
+ */
+#ifdef CONFIG_PKRAM
+void pkram_init(void);
+int pkram_enabled(void);
+void pkram_process_mem_region(struct mem_vector *entry,
+ unsigned long minimum,
+ unsigned long image_size);
+#else
+static inline void pkram_init(void) { }
+static inline int pkram_enabled(void) { return 0; }
+static inline void pkram_process_mem_region(struct mem_vector *entry,
+ unsigned long minimum,
+ unsigned long image_size)
+{ ___process_mem_region(entry, minimum, image_size); }
+#endif /* CONFIG_PKRAM */
+
void set_sev_encryption_mask(void);
/* acpi.c */
diff --git a/arch/x86/boot/compressed/pkram.c b/arch/x86/boot/compressed/pkram.c
new file mode 100644
index 000000000000..5fc1e26909de
--- /dev/null
+++ b/arch/x86/boot/compressed/pkram.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0
+#define __pa(x) ((unsigned long)(x)) /* plain cast, no offset applied; presumably defined before the includes so their helpers pick it up (TODO confirm) */
+#define __va(x) ((void *)((unsigned long)(x))) /* plain cast back to a pointer */
+
+#include "misc.h"
+#include <asm/pgtable.h>
+/*
+ * PKRAM super block as read by this file. Only pgd_pfn is used here.
+ * NOTE(review): the layout presumably has to match the definition used by
+ * the kernel that wrote the super block; confirm against that definition.
+ */
+struct pkram_super_block {
+ __u64 node_pfn; /* not referenced in this file */
+ __u64 pgd_pfn; /* shifted by PAGE_SHIFT in pkram_init() to obtain pkram_pgd */
+};
+/* File-scope state filled in by pkram_init(). */
+static unsigned long long pkram_sb_pfn; /* value of the "pkram" option, parsed as hex */
+static struct pkram_super_block *pkram_sb; /* pkram_sb_pfn << PAGE_SHIFT, used as a pointer */
+static pgd_t *pkram_pgd; /* walked by pkram_process_mem_region(); non-NULL means pkram_enabled() */
+/*
+ * State shared by the walk_*_level() helpers and note_page() while ranges
+ * are collected from the page table.
+ */
+struct pg_state {
+ int (*range_cb)(struct pg_state *state, unsigned long base,
+ unsigned long size); /* called by note_page() for each finished range */
+ unsigned long curr_addr; /* address of the entry being visited; set by the walkers */
+ unsigned long start_addr; /* start of the range currently being tracked */
+ unsigned long min_addr; /* entries below this are skipped; range starts are clamped up to it */
+ unsigned long max_addr; /* range ends are clamped down to it; reaching it stops the walk */
+ unsigned long min_size; /* ranges smaller than this are ignored by the callback */
+ unsigned long minimum; /* forwarded to ___process_mem_region() by the callback */
+ bool tracking; /* true while a range is open */
+ bool find_holes; /* true: collect non-present ranges instead of present ones */
+};
+
+/* Return 1 once pkram_init() has stored a page table pointer, else 0. */
+int pkram_enabled(void)
+{
+	if (pkram_pgd)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Look for the "pkram" command line option and, if it holds a valid hex
+ * number, treat it as the pfn of the PKRAM super block. The super block is
+ * then read to find the page table stored in pkram_pgd.
+ *
+ * Leaves pkram_pgd untouched (so pkram_enabled() stays 0) when the option
+ * is absent, cannot be parsed, or yields a zero super block address.
+ *
+ * NOTE(review): the address taken from the command line is dereferenced
+ * without further validation.
+ */
+void pkram_init(void)
+{
+	char arg[32];
+
+	if (cmdline_find_option("pkram", arg, sizeof(arg)) <= 0)
+		return;
+
+	if (kstrtoull(arg, 16, &pkram_sb_pfn) != 0)
+		return;
+
+	pkram_sb = (struct pkram_super_block *)(pkram_sb_pfn << PAGE_SHIFT);
+	if (!pkram_sb)
+		return;
+
+	pkram_pgd = (pgd_t *)(pkram_sb->pgd_pfn << PAGE_SHIFT);
+}
+
+/*
+ * Feed one page table entry into the range tracker.
+ *
+ * @st:      walk state; st->curr_addr is the address of the entry
+ * @present: non-zero if the entry is populated
+ *
+ * An entry is "wanted" when its present state differs from st->find_holes.
+ * The first wanted entry opens a range. The range is closed, and handed to
+ * st->range_cb(), by the first unwanted entry or by reaching st->max_addr.
+ *
+ * Returns 0 to continue the walk. Returns 1 when st->max_addr has been
+ * reached, otherwise whatever the callback returned for a closed range.
+ */
+static int note_page(struct pg_state *st, int present)
+{
+	unsigned long addr = st->curr_addr;
+	bool wanted = present ^ st->find_holes;
+	unsigned long end;
+	int cb_ret;
+
+	if (!st->tracking) {
+		if (!wanted)
+			return 0;
+
+		/* Nothing left to collect past the upper bound. */
+		if (addr >= st->max_addr)
+			return 1;
+
+		/*
+		 * The entry may begin below min_addr when it straddles the
+		 * boundary, so clamp the start of the range.
+		 */
+		st->start_addr = max(addr, st->min_addr);
+		st->tracking = true;
+		return 0;
+	}
+
+	/* Keep the range open while below the upper bound. */
+	if (wanted && addr < st->max_addr)
+		return 0;
+
+	end = min(addr, st->max_addr);
+	st->tracking = false;
+	cb_ret = st->range_cb(st, st->start_addr, end - st->start_addr);
+
+	return end == st->max_addr ? 1 : cb_ret;
+}
+
+/*
+ * Walk the bitmap that a PMD entry's value points at. Bit i describes the
+ * page at P + i * PAGE_SIZE.
+ *
+ * @st:   walk state
+ * @addr: PMD entry whose value is used as the bitmap address
+ * @P:    address covered by the first bit
+ *
+ * Pages that start below st->min_addr are skipped. Every other page is
+ * passed to note_page() with its bit as the present state.
+ *
+ * Returns 0 to continue the walk, or the first non-zero value returned by
+ * note_page().
+ */
+static int walk_pte_level(struct pg_state *st, pmd_t addr, unsigned long P)
+{
+	unsigned long *bitmap;
+	int present;
+	/*
+	 * ret must start at 0: if min_addr lies inside the last page of this
+	 * range, every iteration is skipped and note_page() is never called.
+	 */
+	int i, ret = 0;
+
+	bitmap = __va(pmd_val(addr));
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		unsigned long curr_addr = P + i * PAGE_SIZE;
+
+		if (curr_addr < st->min_addr)
+			continue;
+		st->curr_addr = curr_addr;
+		present = test_bit(i, bitmap);
+		ret = note_page(st, present);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Walk the PMD table referenced by a PUD entry.
+ *
+ * @st:   walk state
+ * @addr: PUD entry pointing at the PMD table
+ * @P:    address covered by the first PMD entry
+ *
+ * Entries that end at or below st->min_addr are skipped. Empty entries are
+ * reported to note_page() as not present, large entries as present, and
+ * all others are descended into with walk_pte_level().
+ *
+ * Returns 0 to continue the walk, or the first non-zero value from a
+ * lower level.
+ */
+static int walk_pmd_level(struct pg_state *st, pud_t addr, unsigned long P)
+{
+	pmd_t *start;
+	/* Start at 0 so the result is defined even if every entry is skipped. */
+	int i, ret = 0;
+
+	start = (pmd_t *)pud_page_vaddr(addr);
+	for (i = 0; i < PTRS_PER_PMD; i++, start++) {
+		unsigned long curr_addr = P + i * PMD_SIZE;
+
+		if (curr_addr + PMD_SIZE <= st->min_addr)
+			continue;
+		st->curr_addr = curr_addr;
+		if (!pmd_none(*start)) {
+			if (pmd_large(*start))
+				ret = note_page(st, true);
+			else
+				ret = walk_pte_level(st, *start, curr_addr);
+		} else
+			ret = note_page(st, false);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Walk the PUD table referenced by a P4D entry.
+ *
+ * @st:   walk state
+ * @addr: P4D entry pointing at the PUD table
+ * @P:    address covered by the first PUD entry
+ *
+ * Entries that end at or below st->min_addr are skipped. Empty entries are
+ * reported to note_page() as not present, large entries as present, and
+ * all others are descended into with walk_pmd_level().
+ *
+ * Returns 0 to continue the walk, or the first non-zero value from a
+ * lower level.
+ */
+static int walk_pud_level(struct pg_state *st, p4d_t addr, unsigned long P)
+{
+	pud_t *start;
+	/* Start at 0 so the result is defined even if every entry is skipped. */
+	int i, ret = 0;
+
+	start = (pud_t *)p4d_page_vaddr(addr);
+	for (i = 0; i < PTRS_PER_PUD; i++, start++) {
+		unsigned long curr_addr = P + i * PUD_SIZE;
+
+		if (curr_addr + PUD_SIZE <= st->min_addr)
+			continue;
+		st->curr_addr = curr_addr;
+		if (!pud_none(*start)) {
+			if (pud_large(*start))
+				ret = note_page(st, true);
+			else
+				ret = walk_pmd_level(st, *start, curr_addr);
+		} else
+			ret = note_page(st, false);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Walk the P4D table referenced by a PGD entry.
+ *
+ * @st:   walk state
+ * @addr: PGD entry
+ * @P:    address covered by the first P4D entry
+ *
+ * When PTRS_PER_P4D is 1 the PGD entry is reinterpreted as a P4D entry and
+ * handed straight to walk_pud_level(). Otherwise entries that end at or
+ * below st->min_addr are skipped, empty entries are reported to
+ * note_page() as not present, large entries as present, and all others
+ * are descended into with walk_pud_level().
+ *
+ * Returns 0 to continue the walk, or the first non-zero value from a
+ * lower level.
+ */
+static int walk_p4d_level(struct pg_state *st, pgd_t addr, unsigned long P)
+{
+	p4d_t *start;
+	/* Start at 0 so the result is defined even if every entry is skipped. */
+	int i, ret = 0;
+
+	if (PTRS_PER_P4D == 1)
+		return walk_pud_level(st, __p4d(pgd_val(addr)), P);
+
+	start = (p4d_t *)pgd_page_vaddr(addr);
+	for (i = 0; i < PTRS_PER_P4D; i++, start++) {
+		unsigned long curr_addr = P + i * P4D_SIZE;
+
+		if (curr_addr + P4D_SIZE <= st->min_addr)
+			continue;
+		st->curr_addr = curr_addr;
+		if (!p4d_none(*start)) {
+			if (p4d_large(*start))
+				ret = note_page(st, true);
+			else
+				ret = walk_pud_level(st, *start, curr_addr);
+		} else
+			ret = note_page(st, false);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+/*
+ * Wrap pgd_large()/pgd_none(): with 5-level paging enabled use the
+ * existing helper, otherwise test the PGD entry as if it were a P4D entry.
+ * A macro name is not expanded again inside its own replacement, so the
+ * inner pgd_large(a)/pgd_none(a) refer to the earlier definitions.
+ * NOTE(review): assumes those are functions rather than macros - confirm.
+ * Only pgd_none() is used by the code in this file.
+ */
+#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
+#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
+
+/*
+ * Top level of the page table walk.
+ *
+ * @st:  walk state
+ * @pgd: first entry of the PGD table to walk
+ *
+ * Visits each PGD entry in turn, skipping those that end at or below
+ * st->min_addr. Empty entries are reported to note_page() as not present
+ * and populated ones are descended into with walk_p4d_level().
+ *
+ * Returns 0 if the whole table was walked, or the first non-zero value
+ * returned by a lower level.
+ */
+static int walk_pgd_level(struct pg_state *st, pgd_t *pgd)
+{
+	int idx;
+
+	for (idx = 0; idx < PTRS_PER_PGD; idx++) {
+		pgd_t *entry = &pgd[idx];
+		unsigned long base = idx * PGDIR_SIZE;
+		int ret;
+
+		if (base + PGDIR_SIZE <= st->min_addr)
+			continue;
+
+		st->curr_addr = base;
+		if (pgd_none(*entry))
+			ret = note_page(st, false);
+		else
+			ret = walk_p4d_level(st, *entry, base);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+extern int slot_areas_full(void);
+
+/*
+ * Range callback installed by pkram_process_mem_region().
+ *
+ * @st:   walk state; min_size and minimum are read from it
+ * @base: start of the range found by the walk
+ * @size: length of the range
+ *
+ * Ranges shorter than st->min_size are ignored. Others are passed to
+ * ___process_mem_region() for slot accounting.
+ *
+ * Returns 1 to stop the walk once slot_areas_full() reports true,
+ * otherwise 0.
+ */
+static int pkram_process_mem_region_cb(struct pg_state *st, unsigned long base, unsigned long size)
+{
+	struct mem_vector region = {
+		.start = base,
+		.size = size,
+	};
+
+	if (size < st->min_size)
+		return 0;
+
+	___process_mem_region(&region, st->minimum, st->min_size);
+
+	if (slot_areas_full())
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Find the parts of a memory region that the pkram_pgd page table does not
+ * mark as present, and offer them for slot accounting.
+ *
+ * @entry:      memory region to examine
+ * @minimum:    lowest address acceptable for the image
+ * @image_size: size needed for the decompressed kernel
+ *
+ * The walk covers max(entry->start, minimum) up to entry->start +
+ * entry->size with find_holes set. Each hole found is passed to
+ * pkram_process_mem_region_cb(). The walk's return value is not used.
+ */
+void pkram_process_mem_region(struct mem_vector *entry,
+			      unsigned long minimum,
+			      unsigned long image_size)
+{
+	unsigned long lower = max((unsigned long)entry->start, minimum);
+	unsigned long upper = entry->start + entry->size;
+	struct pg_state st = {
+		.range_cb = pkram_process_mem_region_cb,
+		.min_addr = lower,
+		.max_addr = upper,
+		.min_size = image_size,
+		.minimum = minimum,
+		.find_holes = true,
+	};
+
+	walk_pgd_level(&st, pkram_pgd);
+}
--
2.13.3
next prev parent reply other threads:[~2020-05-07 0:45 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-05-07 0:41 [RFC 00/43] PKRAM: Preserved-over-Kexec RAM Anthony Yznaga
2020-05-07 0:41 ` [RFC 01/43] mm: add PKRAM API stubs and Kconfig Anthony Yznaga
2020-05-07 0:41 ` [RFC 02/43] mm: PKRAM: implement node load and save functions Anthony Yznaga
2020-05-07 0:41 ` [RFC 03/43] mm: PKRAM: implement object " Anthony Yznaga
2020-05-07 0:41 ` [RFC 04/43] mm: PKRAM: implement page stream operations Anthony Yznaga
2020-05-07 0:41 ` [RFC 05/43] mm: PKRAM: support preserving transparent hugepages Anthony Yznaga
2020-05-07 0:41 ` [RFC 06/43] mm: PKRAM: implement byte stream operations Anthony Yznaga
2020-05-07 0:41 ` [RFC 07/43] mm: PKRAM: link nodes by pfn before reboot Anthony Yznaga
2020-05-07 0:41 ` [RFC 08/43] mm: PKRAM: introduce super block Anthony Yznaga
2020-05-07 0:41 ` [RFC 09/43] PKRAM: build a physical mapping pagetable of pages to be preserved Anthony Yznaga
2020-05-07 0:41 ` [RFC 10/43] PKRAM: add code for walking the preserved pages pagetable Anthony Yznaga
2020-05-07 0:41 ` [RFC 11/43] PKRAM: pass the preserved pages pagetable to the next kernel Anthony Yznaga
2020-05-07 0:41 ` [RFC 12/43] mm: PKRAM: reserve preserved memory at boot Anthony Yznaga
2020-05-07 0:41 ` [RFC 13/43] mm: PKRAM: free preserved pages pagetable Anthony Yznaga
2020-05-07 0:41 ` [RFC 14/43] mm: memblock: PKRAM: prevent memblock resize from clobbering preserved pages Anthony Yznaga
2020-05-11 13:57 ` Mike Rapoport
2020-05-11 23:29 ` Anthony Yznaga
2020-05-07 0:41 ` [RFC 15/43] PKRAM: provide a way to ban pages from use by PKRAM Anthony Yznaga
2020-05-07 0:41 ` [RFC 16/43] kexec: PKRAM: prevent kexec clobbering preserved pages in some cases Anthony Yznaga
2020-05-07 0:41 ` [RFC 17/43] PKRAM: provide a way to check if a memory range has preserved pages Anthony Yznaga
2020-05-07 0:41 ` [RFC 18/43] kexec: PKRAM: avoid clobbering already " Anthony Yznaga
2020-05-07 0:41 ` [RFC 19/43] mm: PKRAM: allow preserved memory to be freed from userspace Anthony Yznaga
2020-05-07 0:41 ` [RFC 20/43] PKRAM: disable feature when running the kdump kernel Anthony Yznaga
2020-05-07 0:41 ` Anthony Yznaga [this message]
2020-05-07 17:51 ` [RFC 21/43] x86/KASLR: PKRAM: support physical kaslr Kees Cook
2020-05-07 18:41 ` Anthony Yznaga
2020-05-07 0:41 ` [RFC 22/43] mm: shmem: introduce shmem_insert_page Anthony Yznaga
2020-05-07 0:41 ` [RFC 23/43] mm: shmem: enable saving to PKRAM Anthony Yznaga
2020-05-07 0:41 ` [RFC 24/43] mm: shmem: prevent swapping of PKRAM-enabled tmpfs pages Anthony Yznaga
2020-05-07 0:41 ` [RFC 25/43] mm: shmem: specify the mm to use when inserting pages Anthony Yznaga
2020-05-07 0:41 ` [RFC 26/43] mm: shmem: when inserting, handle pages already charged to a memcg Anthony Yznaga
2020-05-07 0:41 ` [RFC 27/43] x86/mm/numa: add numa_isolate_memblocks() Anthony Yznaga
2020-05-07 0:41 ` [RFC 28/43] PKRAM: ensure memblocks with preserved pages init'd for numa Anthony Yznaga
2020-05-07 0:41 ` [RFC 29/43] memblock: PKRAM: mark memblocks that contain preserved pages Anthony Yznaga
2020-05-07 0:41 ` [RFC 30/43] memblock: add for_each_reserved_mem_range() Anthony Yznaga
2020-05-07 0:41 ` [RFC 31/43] memblock, mm: defer initialization of preserved pages Anthony Yznaga
2020-05-07 0:41 ` [RFC 32/43] shmem: PKRAM: preserve shmem files a chunk at a time Anthony Yznaga
2020-05-07 0:41 ` [RFC 33/43] PKRAM: atomically add and remove link pages Anthony Yznaga
2020-05-07 0:42 ` [RFC 34/43] shmem: PKRAM: multithread preserving and restoring shmem pages Anthony Yznaga
2020-05-07 16:30 ` Randy Dunlap
2020-05-07 17:59 ` Anthony Yznaga
2020-05-07 0:42 ` [RFC 35/43] shmem: introduce shmem_insert_pages() Anthony Yznaga
2020-05-07 0:42 ` [RFC 36/43] PKRAM: add support for loading pages in bulk Anthony Yznaga
2020-05-07 0:42 ` [RFC 37/43] shmem: PKRAM: enable bulk loading of preserved pages into shmem Anthony Yznaga
2020-05-07 0:42 ` [RFC 38/43] mm: implement splicing a list of pages to the LRU Anthony Yznaga
2020-05-07 0:42 ` [RFC 39/43] shmem: optimize adding pages to the LRU in shmem_insert_pages() Anthony Yznaga
2020-05-07 0:42 ` [RFC 40/43] shmem: initial support for adding multiple pages to pagecache Anthony Yznaga
2020-05-07 0:42 ` [RFC 41/43] XArray: add xas_export_node() and xas_import_node() Anthony Yznaga
2020-05-07 0:42 ` [RFC 42/43] shmem: reduce time holding xa_lock when inserting pages Anthony Yznaga
2020-05-07 0:42 ` [RFC 43/43] PKRAM: improve index alignment of pkram_link entries Anthony Yznaga
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1588812129-8596-22-git-send-email-anthony.yznaga@oracle.com \
--to=anthony.yznaga@oracle.com \
--cc=Thomas.Lendacky@amd.com \
--cc=akpm@linux-foundation.org \
--cc=andriy.shevchenko@linux.intel.com \
--cc=ardb@kernel.org \
--cc=ashok.raj@intel.com \
--cc=bhe@redhat.com \
--cc=bp@alien8.de \
--cc=corbet@lwn.net \
--cc=dan.j.williams@intel.com \
--cc=daniel.kiper@oracle.com \
--cc=dave.hansen@linux.intel.com \
--cc=dima@golovin.in \
--cc=ebiederm@xmission.com \
--cc=guro@fb.com \
--cc=gustavo@embeddedor.com \
--cc=hannes@cmpxchg.org \
--cc=hpa@zytor.com \
--cc=hughd@google.com \
--cc=jason.zeng@intel.com \
--cc=jroedel@suse.de \
--cc=keescook@chromium.org \
--cc=kevin.tian@intel.com \
--cc=kexec@lists.infradead.org \
--cc=lei.l.li@intel.com \
--cc=linux-doc@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=luto@kernel.org \
--cc=masahiroy@kernel.org \
--cc=mhocko@kernel.org \
--cc=minchan@kernel.org \
--cc=mingo@redhat.com \
--cc=ndesaulniers@google.com \
--cc=nivedita@alum.mit.edu \
--cc=paul.c.lai@intel.com \
--cc=peterz@infradead.org \
--cc=rafael.j.wysocki@intel.com \
--cc=rppt@linux.ibm.com \
--cc=tglx@linutronix.de \
--cc=vdavydov.dev@gmail.com \
--cc=willy@infradead.org \
--cc=x86@kernel.org \
--cc=yang.shi@linux.alibaba.com \
--cc=ying.huang@intel.com \
--cc=zhenzhong.duan@oracle.com \
--cc=zhiyuan.lv@intel.com \
--cc=ziqian.lzq@antfin.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).