From: Jason Andryuk <jason.andryuk@amd.com>
To: <xen-devel@lists.xenproject.org>
Cc: "Jan Beulich" <jbeulich@suse.com>,
"Roger Pau Monné" <roger.pau@citrix.com>,
"Andrew Cooper" <andrew.cooper3@citrix.com>,
"Juergen Gross" <jgross@suse.com>,
"Boris Ostrovsky" <boris.ostrovsky@oracle.com>,
"Jason Andryuk" <jason.andryuk@amd.com>
Subject: [LINUX PATCH] RFC: x86/pvh: Make Xen PVH entrypoint PIC
Date: Wed, 6 Mar 2024 14:31:58 -0500 [thread overview]
Message-ID: <20240306193158.104680-1-jason.andryuk@amd.com> (raw)
In-Reply-To: <20240306185032.103216-1-jason.andryuk@amd.com>
The Xen PVH entrypoint is 32bit non-PIC code running at a default load
address of 0x1000000 (16MB) (CONFIG_PHYSICAL_START). Xen loads the
kernel at that physical address inside the PVH container.
When running a PVH Dom0, the system reserved addresses are mapped 1-1
into the PVH container. There exist system firmwares (Coreboot/EDK2)
with reserved memory at 16MB. This creates a conflict where the PVH
kernel cannot be loaded at that address.
Modify the PVH entrypoint to be position-independent to allow flexibility
in load address.
Initial PVH entry runs at the physical addresses and then transitions to
the identity mapped address. While executing xen_prepare_pvh(), calls
through pv_ops function pointers transition to the high mapped
addresses. Additionally, __va() is called on some hvm_start_info
physical addresses, so the directmap address range is needed as well. So we
need to run on page tables with all of those ranges mapped.
Modifying the init_top_pgt tables ran into issues since
startup_64/__startup_64() will modify those page tables again. Use a
dedicated set of page tables - pvh_init_top_pgt - for the PVH entry to
avoid unwanted interactions.
In xen_pvh_init(), __pa() is called to find the physical address of the
hypercall page. Set phys_base temporarily before calling into
xen_prepare_pvh(), which calls xen_pvh_init(), and clear it afterwards.
__startup_64() assumes phys_base is zero and adds load_delta to it. If
phys_base is already set, the calculation results in an incorrect cr3.
TODO: The 32bit entry path needs additional work to have relocatable
pagetables.
TODO: Sync features.h from xen.git commit xxxxxxxxxx when it is
committed.
Signed-off-by: Jason Andryuk <jason.andryuk@amd.com>
---
Put this out as an example for the Xen modifications. Will split for
actual submission.
Instead of setting and clearing phys_base, add a dedicated variable?
Make __startup_64() exit if phys_base is already set to allow calling
multiple times, and use that and init_top_pgt instead of adding
additional page tables?
---
arch/x86/platform/pvh/head.S | 195 ++++++++++++++++++++++++++++---
arch/x86/xen/xen-head.S | 7 +-
include/xen/interface/features.h | 5 +
3 files changed, 190 insertions(+), 17 deletions(-)
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index f7235ef87bc3..bab857db55ef 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -50,11 +50,32 @@
#define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8)
#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
+#define rva(x) ((x) - pvh_start_xen)
+
SYM_CODE_START_LOCAL(pvh_start_xen)
UNWIND_HINT_END_OF_STACK
cld
- lgdt (_pa(gdt))
+ /*
+ * See the comment for startup_32 for more details. We need to
+ * execute a call to get the execution address to be position
+ * independent, but we don't have a stack. Save and restore the
+ * magic field of start_info in ebx, and use that as the stack.
+ */
+ mov (%ebx), %eax /* save the word the call is about to overwrite */
+ leal 4(%ebx), %esp /* the pushed return address lands on (%ebx) */
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call 1f
+1: popl %ebp /* %ebp = run-time address of 1b */
+ mov %eax, (%ebx) /* restore the saved word */
+ subl $ rva(1b), %ebp /* %ebp = run-time address of pvh_start_xen */
+ movl $0, %esp
+
+ leal rva(gdt)(%ebp), %eax
+ /* Patch the run-time address of gdt_start into the descriptor base. */
+ leal rva(gdt_start)(%ebp), %ecx
+ movl %ecx, 2(%eax)
+ lgdt (%eax)
mov $PVH_DS_SEL,%eax
mov %eax,%ds
@@ -62,14 +83,14 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
mov %eax,%ss
/* Stash hvm_start_info. */
- mov $_pa(pvh_start_info), %edi
+ leal rva(pvh_start_info)(%ebp), %edi
mov %ebx, %esi
- mov _pa(pvh_start_info_sz), %ecx
+ movl rva(pvh_start_info_sz)(%ebp), %ecx
shr $2,%ecx
rep
movsl
- mov $_pa(early_stack_end), %esp
+ leal rva(early_stack_end)(%ebp), %esp
/* Enable PAE mode. */
mov %cr4, %eax
@@ -83,29 +104,83 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
btsl $_EFER_LME, %eax
wrmsr
+ mov %ebp, %ebx /* %ebp = run-time address of pvh_start_xen */
+ subl $_pa(pvh_start_xen), %ebx /* offset = run-time - link-time address */
+ jz .Lpagetable_done
+
+ /* Fixup page-tables for relocation. */
+ leal rva(pvh_init_top_pgt)(%ebp), %edi
+ movl $512, %ecx /* walk 512 8-byte entries */
+2:
+ movl 0x00(%edi), %eax
+ orl 0x04(%edi), %eax /* both halves zero => empty entry, skip it */
+ jz 1f
+ addl %ebx, 0x00(%edi)
+1:
+ addl $8, %edi
+ decl %ecx
+ jnz 2b
+
+ /* L3 ident has a single entry. */
+ leal rva(pvh_level3_ident_pgt)(%ebp), %edi
+ addl %ebx, 0x00(%edi)
+
+ leal rva(pvh_level3_kernel_pgt)(%ebp), %edi
+ addl %ebx, (4096 - 16)(%edi) /* entry 510: pvh_level2_kernel_pgt */
+ /* Entry 511 (no fixmap) is empty and must stay zero. */
+
+ /* pvh_level2_ident_pgt is fine - large pages */
+
+ /* pvh_level2_kernel_pgt needs adjustment - large pages */
+ leal rva(pvh_level2_kernel_pgt)(%ebp), %edi
+ movl $512, %ecx /* walk 512 8-byte entries */
+2:
+ movl 0x00(%edi), %eax
+ orl 0x04(%edi), %eax /* both halves zero => empty entry, skip it */
+ jz 1f
+ addl %ebx, 0x00(%edi)
+1:
+ addl $8, %edi
+ decl %ecx
+ jnz 2b
+
+.Lpagetable_done:
/* Enable pre-constructed page tables. */
- mov $_pa(init_top_pgt), %eax
+ leal rva(pvh_init_top_pgt)(%ebp), %eax
mov %eax, %cr3
mov $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0
/* Jump to 64-bit mode. */
- ljmp $PVH_CS_SEL, $_pa(1f)
+ pushl $PVH_CS_SEL
+ leal rva(1f)(%ebp), %eax
+ pushl %eax
+ lretl
/* 64-bit entry point. */
.code64
1:
/* Set base address in stack canary descriptor. */
mov $MSR_GS_BASE,%ecx
- mov $_pa(canary), %eax
+ leal canary(%rip), %eax
xor %edx, %edx
wrmsr
+ /* Calculate the load offset (run-time minus link-time address of
+ * pvh_start_xen, which %rbp points at) and store it in phys_base.
+ * __pa() needs phys_base set to calculate the hypercall page in xen_pvh_init(). */
+ movq %rbp, %rbx
+ subq $_pa(pvh_start_xen), %rbx
+ movq %rbx, phys_base(%rip)
call xen_prepare_pvh
+ /* Clear phys_base. startup_64/__startup_64 will *add* to its value,
+ so start from 0. */
+ xor %rbx, %rbx
+ movq %rbx, phys_base(%rip)
/* startup_64 expects boot_params in %rsi. */
- mov $_pa(pvh_bootparams), %rsi
- mov $_pa(startup_64), %rax
+ lea pvh_bootparams(%rip), %rsi
+ lea startup_64(%rip), %rax
ANNOTATE_RETPOLINE_SAFE
jmp *%rax
@@ -113,20 +188,27 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
call mk_early_pgtbl_32
- mov $_pa(initial_page_table), %eax
+ leal rva(initial_page_table)(%ebp), %eax
mov %eax, %cr3
mov %cr0, %eax
or $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0
- ljmp $PVH_CS_SEL, $1f
+ pushl $PVH_CS_SEL
+ leal rva(1f)(%ebp), %eax
+ pushl %eax
+ lretl
+
1:
call xen_prepare_pvh
- mov $_pa(pvh_bootparams), %esi
+ leal rva(pvh_bootparams)(%ebp), %esi
/* startup_32 doesn't expect paging and PAE to be on. */
- ljmp $PVH_CS_SEL, $_pa(2f)
+ pushl $PVH_CS_SEL
+ leal rva(2f)(%ebp), %eax
+ pushl %eax
+ lretl
2:
mov %cr0, %eax
and $~X86_CR0_PG, %eax
@@ -135,15 +217,19 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
and $~X86_CR4_PAE, %eax
mov %eax, %cr4
- ljmp $PVH_CS_SEL, $_pa(startup_32)
+ pushl $PVH_CS_SEL
+ leal rva(startup_32)(%ebp), %eax
+ pushl %eax
+ lretl
#endif
+
SYM_CODE_END(pvh_start_xen)
.section ".init.data","aw"
.balign 8
SYM_DATA_START_LOCAL(gdt)
- .word gdt_end - gdt_start
- .long _pa(gdt_start)
+ .word gdt_end - gdt_start - 1 /* limit: table size in bytes minus one */
+ .long 0 /* base: filled in at run time by pvh_start_xen */
.word 0
SYM_DATA_END(gdt)
SYM_DATA_START_LOCAL(gdt_start)
@@ -163,5 +249,82 @@ SYM_DATA_START_LOCAL(early_stack)
.fill BOOT_STACK_SIZE, 1, 0
SYM_DATA_END_LABEL(early_stack, SYM_L_LOCAL, early_stack_end)
+/*
+ * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
+ * because we need identity-mapped pages.
+ */
+#define l4_index(x) (((x) >> 39) & 511)
+#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+
+L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
+L4_START_KERNEL = l4_index(__START_KERNEL_map)
+L3_START_KERNEL = pud_index(__START_KERNEL_map)
+
+#define SYM_DATA_START_PAGE_ALIGNED(name) \
+ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
+
+/* Automate the creation of 1 to 1 mapping pmd entries */
+#define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+ .rept (COUNT) ; \
+ .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
+ i = i + 1 ; \
+ .endr
+
+/*
+ * Xen PVH needs a set of identity mapped and kernel high mapping
+ * page tables. pvh_start_xen starts running on the identity mapped
+ * page tables, but xen_prepare_pvh calls into the high mapping.
+ * These page tables need to be relocatable and are only used until
+ * startup_64 transitions to init_top_pgt.
+ */
+SYM_DATA_START_PAGE_ALIGNED(pvh_init_top_pgt)
+ .quad pvh_level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org pvh_init_top_pgt + L4_PAGE_OFFSET*8, 0
+ .quad pvh_level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org pvh_init_top_pgt + L4_START_KERNEL*8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad pvh_level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(pvh_init_top_pgt)
+
+SYM_DATA_START_PAGE_ALIGNED(pvh_level3_ident_pgt)
+ .quad pvh_level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .fill 511, 8, 0
+SYM_DATA_END(pvh_level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(pvh_level2_ident_pgt)
+ /*
+ * Since I easily can, map the first 1G.
+ * Don't set NX because code runs from these pages.
+ *
+ * Note: This sets _PAGE_GLOBAL despite whether
+ * the CPU supports it or it is enabled. But,
+ * the CPU should ignore the bit.
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+SYM_DATA_END(pvh_level2_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(pvh_level3_kernel_pgt)
+ .fill L3_START_KERNEL,8,0
+ /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
+ .quad pvh_level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .quad 0 /* no fixmap */
+SYM_DATA_END(pvh_level3_kernel_pgt)
+
+SYM_DATA_START_PAGE_ALIGNED(pvh_level2_kernel_pgt)
+ /*
+ * Kernel high mapping.
+ *
+ * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in
+ * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled,
+ * 512 MiB otherwise.
+ *
+ * (NOTE: after that starts the module area, see MODULES_VADDR.)
+ *
+ * This table is eventually used by the kernel during normal runtime.
+ * Care must be taken to clear out undesired bits later, like _PAGE_RW
+ * or _PAGE_GLOBAL in some cases.
+ */
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
+SYM_DATA_END(pvh_level2_kernel_pgt)
+
ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
_ASM_PTR (pvh_start_xen - __START_KERNEL_map))
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index a0ea285878db..e6994aaa8cc3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -106,7 +106,12 @@ SYM_CODE_END(xen_cpu_bringup_again)
# define FEATURES_PV 0
#endif
#ifdef CONFIG_XEN_PVH
-# define FEATURES_PVH (1 << XENFEAT_linux_rsdp_unrestricted)
+# ifdef CONFIG_RELOCATABLE /* advertise relocation support only if built relocatable */
+# define FEATURES_PVH ((1 << XENFEAT_linux_rsdp_unrestricted) | \
+ (1 << XENFEAT_pvh_relocatable))
+# else /* !CONFIG_RELOCATABLE */
+# define FEATURES_PVH (1 << XENFEAT_linux_rsdp_unrestricted)
+# endif /* CONFIG_RELOCATABLE */
#else
# define FEATURES_PVH 0
#endif
diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h
index 53f760378e39..92e7da9194c8 100644
--- a/include/xen/interface/features.h
+++ b/include/xen/interface/features.h
@@ -97,6 +97,11 @@
#define XENFEAT_not_direct_mapped 16
#define XENFEAT_direct_mapped 17
+/*
+ * PVH: If set, the guest supports being relocated in physical memory on entry.
+ */
+#define XENFEAT_pvh_relocatable 20
+
#define XENFEAT_NR_SUBMAPS 1
#endif /* __XEN_PUBLIC_FEATURES_H__ */
--
2.44.0
g
next prev parent reply other threads:[~2024-03-06 19:32 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-06 18:50 [PATCH 0/3] x86/pvh: Support relocating dom0 kernel Jason Andryuk
2024-03-06 18:50 ` [PATCH 1/3] features.h: Replace hard tabs Jason Andryuk
2024-03-06 20:41 ` Stefano Stabellini
2024-03-06 18:50 ` [PATCH 2/3] xen/x86: bzImage parse kernel_alignment Jason Andryuk
2024-03-07 2:09 ` Stefano Stabellini
2024-03-07 8:26 ` Jan Beulich
2024-03-07 15:06 ` Jason Andryuk
2024-03-06 18:50 ` [PATCH 3/3] x86/PVH: Support relocatable dom0 kernels Jason Andryuk
2024-03-07 2:09 ` Stefano Stabellini
2024-03-07 16:07 ` Jason Andryuk
2024-03-07 9:30 ` Roger Pau Monné
2024-03-07 17:01 ` Jason Andryuk
2024-03-08 6:34 ` Juergen Gross
2024-03-11 16:53 ` Jan Beulich
2024-03-11 19:53 ` Jason Andryuk
2024-03-06 19:31 ` Jason Andryuk [this message]
2024-03-07 10:00 ` [PATCH 0/3] x86/pvh: Support relocating dom0 kernel Roger Pau Monné
2024-03-07 10:08 ` Jan Beulich
2024-03-07 10:20 ` Roger Pau Monné
2024-03-07 17:33 ` Jason Andryuk
2024-03-08 7:03 ` Jan Beulich
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240306193158.104680-1-jason.andryuk@amd.com \
--to=jason.andryuk@amd.com \
--cc=andrew.cooper3@citrix.com \
--cc=boris.ostrovsky@oracle.com \
--cc=jbeulich@suse.com \
--cc=jgross@suse.com \
--cc=roger.pau@citrix.com \
--cc=xen-devel@lists.xenproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.