* [PATCH v4 1/6] x86/mm/KASLR: Improve code comments about struct kaslr_memory_region
2019-03-14 9:46 [PATCH v4 0/6] Several patches to fix code bugs, improve documents and clean up Baoquan He
@ 2019-03-14 9:46 ` Baoquan He
2019-03-14 9:46 ` [PATCH v4 2/6] x86/mm/KASLR: Open code unnecessary function get_padding Baoquan He
` (4 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Baoquan He @ 2019-03-14 9:46 UTC (permalink / raw)
To: linux-kernel
Cc: mingo, keescook, kirill, yamada.masahiro, tglx, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie, Baoquan He
The old comment above kaslr_memory_region is not clear enough to explain
the concepts of memory region KASLR.
[Ingo suggested this and helped to prettify the text]
Signed-off-by: Baoquan He <bhe@redhat.com>
---
arch/x86/mm/kaslr.c | 51 +++++++++++++++++++++++++++++++++++++++++----
1 file changed, 47 insertions(+), 4 deletions(-)
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 9a8756517504..5debf82ab06a 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -42,9 +42,46 @@
static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
/*
- * Memory regions randomized by KASLR (except modules that use a separate logic
- * earlier during boot). The list is ordered based on virtual addresses. This
- * order is kept after randomization.
+ * struct kaslr_memory_region - represent continuous chunks of kernel
+ * virtual memory regions, to be randomized by KASLR.
+ *
+ * ( The exception is the module space virtual memory window which
+ * uses separate logic earlier during bootup. )
+ *
+ * Currently there are three such regions: the physical memory mapping,
+ * vmalloc and vmemmap regions.
+ *
+ * The array below has the entries ordered based on virtual addresses.
+ * The order is kept after randomization, i.e. the randomized virtual
+ * addresses of these regions are still ascending.
+ *
+ * Here are the fields:
+ *
+ * @base: points to a global variable used by the MM to get the virtual
+ * base address of any of the above regions. This allows the early
+ * KASLR code to modify these base addresses early during bootup, on a
+ * per bootup basis, without the MM code even being aware of whether it
+ * got changed and to what value.
+ *
+ * When KASLR is active then the MM code makes sure that for each region
+ * there's such a single, dynamic, global base address 'unsigned long'
+ * variable available for the KASLR code to point to and modify directly:
+ *
+ * { &page_offset_base, 0 },
+ * { &vmalloc_base, 0 },
+ * { &vmemmap_base, 1 },
+ *
+ * @size_tb: size in TB of each memory region. E.g., the sizes in 4-level
+ * paging mode are:
+ *
+ * - Physical memory mapping: (actual RAM size + 10 TB padding)
+ * - Vmalloc: 32 TB
+ * - Vmemmap: 1 TB
+ *
+ * As seen, the size of the physical memory mapping region is variable,
+ * calculated according to the actual size of system RAM in order to
+ * save more space for randomization. The rest are fixed values related
+ * to paging mode.
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
@@ -70,7 +107,13 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
}
-/* Initialize base and padding for each memory region randomized with KASLR */
+/*
+ * kernel_randomize_memory - initialize base and padding for each
+ * memory region randomized with KASLR.
+ *
+ * When randomizing the layout, the order of the regions is kept: the
+ * physical memory mapping region is handled first, then vmalloc and vmemmap.
+ */
void __init kernel_randomize_memory(void)
{
size_t i;
--
2.17.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 2/6] x86/mm/KASLR: Open code unnecessary function get_padding
2019-03-14 9:46 [PATCH v4 0/6] Several patches to fix code bugs, improve documents and clean up Baoquan He
2019-03-14 9:46 ` [PATCH v4 1/6] x86/mm/KASLR: Improve code comments about struct kaslr_memory_region Baoquan He
@ 2019-03-14 9:46 ` Baoquan He
2019-03-14 9:46 ` [PATCH v4 3/6] mm: Add build time sanity check for struct page size Baoquan He
` (3 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Baoquan He @ 2019-03-14 9:46 UTC (permalink / raw)
To: linux-kernel
Cc: mingo, keescook, kirill, yamada.masahiro, tglx, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie, Baoquan He
It's used only twice and we do bit shifts in the parent function
anyway so it's not like it's hiding some uninteresting detail.
Signed-off-by: Baoquan He <bhe@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
---
arch/x86/mm/kaslr.c | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 5debf82ab06a..d7ea6b252594 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -92,12 +92,6 @@ static __initdata struct kaslr_memory_region {
{ &vmemmap_base, 1 },
};
-/* Get size in bytes used by the memory region */
-static inline unsigned long get_padding(struct kaslr_memory_region *region)
-{
- return (region->size_tb << TB_SHIFT);
-}
-
/*
* Apply no randomization if KASLR was disabled at boot or if KASAN
* is enabled. KASAN shadow mappings rely on regions being PGD aligned.
@@ -155,7 +149,7 @@ void __init kernel_randomize_memory(void)
/* Calculate entropy available between regions */
remain_entropy = vaddr_end - vaddr_start;
for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++)
- remain_entropy -= get_padding(&kaslr_regions[i]);
+ remain_entropy -= kaslr_regions[i].size_tb << TB_SHIFT;
prandom_seed_state(&rand_state, kaslr_get_random_long("Memory"));
@@ -176,7 +170,7 @@ void __init kernel_randomize_memory(void)
* Jump the region and add a minimum padding based on
* randomization alignment.
*/
- vaddr += get_padding(&kaslr_regions[i]);
+ vaddr += kaslr_regions[i].size_tb << TB_SHIFT;
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
--
2.17.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 3/6] mm: Add build time sanity check for struct page size
2019-03-14 9:46 [PATCH v4 0/6] Several patches to fix code bugs, improve documents and clean up Baoquan He
2019-03-14 9:46 ` [PATCH v4 1/6] x86/mm/KASLR: Improve code comments about struct kaslr_memory_region Baoquan He
2019-03-14 9:46 ` [PATCH v4 2/6] x86/mm/KASLR: Open code unnecessary function get_padding Baoquan He
@ 2019-03-14 9:46 ` Baoquan He
2019-03-14 9:50 ` Baoquan He
2019-03-14 9:46 ` [PATCH v4 4/6] x86/mm/KASLR: Fix the wrong calculation of memory region initial size Baoquan He
` (2 subsequent siblings)
5 siblings, 1 reply; 11+ messages in thread
From: Baoquan He @ 2019-03-14 9:46 UTC (permalink / raw)
To: linux-kernel
Cc: mingo, keescook, kirill, yamada.masahiro, tglx, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie, Baoquan He
The size of struct page might be larger than 64 bytes if debug options
are enabled, or if fields are added for debugging intentionally. Yet an
upper limit needs to be added at build time to trigger an alert in case
the size is too big to boot up the system, warning people to check in
advance whether it was done on purpose.
Here 1/4 of PAGE_SIZE is chosen since system must have been insane
with this value. For those systems with PAGE_SIZE larger than 4KB,
1KB is simply taken.
Signed-off-by: Baoquan He <bhe@redhat.com>
Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
mm/page_alloc.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3eb01dedfb50..eddf96c137f2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -67,6 +67,7 @@
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
+#include <linux/sizes.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -7170,6 +7171,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
unsigned long start_pfn, end_pfn;
int i, nid;
+ BUILD_BUG_ON(sizeof(struct page) > min_t(size_t, SZ_1K, PAGE_SIZE/4));
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
--
2.17.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v4 3/6] mm: Add build time sanity check for struct page size
2019-03-14 9:46 ` [PATCH v4 3/6] mm: Add build time sanity check for struct page size Baoquan He
@ 2019-03-14 9:50 ` Baoquan He
2019-03-14 9:57 ` Baoquan He
0 siblings, 1 reply; 11+ messages in thread
From: Baoquan He @ 2019-03-14 9:50 UTC (permalink / raw)
To: linux-kernel
Cc: mingo, keescook, kirill, yamada.masahiro, tglx, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie
On 03/14/19 at 05:46pm, Baoquan He wrote:
> Size of struct page might be larger than 64 bytes if debug options
> enabled, or fields added for debugging intentionally. Yet an upper
> limit need be added at build time to trigger an alert in case the
> size is too big to boot up system, warning people to check if it's
> be done on purpose in advance.
>
> Here 1/4 of PAGE_SIZE is chosen since system must have been insane
> with this value. For those systems with PAGE_SIZE larger than 4KB,
> 1KB is simply taken.
>
> Signed-off-by: Baoquan He <bhe@redhat.com>
> Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Sorry, I forgot adding 'Acked-by' tag here.
> ---
> mm/page_alloc.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3eb01dedfb50..eddf96c137f2 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -67,6 +67,7 @@
> #include <linux/lockdep.h>
> #include <linux/nmi.h>
> #include <linux/psi.h>
> +#include <linux/sizes.h>
>
> #include <asm/sections.h>
> #include <asm/tlbflush.h>
> @@ -7170,6 +7171,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
> unsigned long start_pfn, end_pfn;
> int i, nid;
>
> + BUILD_BUG_ON(sizeof(struct page) > min_t(size_t, SZ_1K, PAGE_SIZE/4));
> /* Record where the zone boundaries are */
> memset(arch_zone_lowest_possible_pfn, 0,
> sizeof(arch_zone_lowest_possible_pfn));
> --
> 2.17.2
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v4 3/6] mm: Add build time sanity check for struct page size
2019-03-14 9:50 ` Baoquan He
@ 2019-03-14 9:57 ` Baoquan He
0 siblings, 0 replies; 11+ messages in thread
From: Baoquan He @ 2019-03-14 9:57 UTC (permalink / raw)
To: linux-kernel
Cc: mingo, keescook, kirill, yamada.masahiro, tglx, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie
On 03/14/19 at 05:50pm, Baoquan He wrote:
> On 03/14/19 at 05:46pm, Baoquan He wrote:
> > Size of struct page might be larger than 64 bytes if debug options
> > enabled, or fields added for debugging intentionally. Yet an upper
> > limit need be added at build time to trigger an alert in case the
> > size is too big to boot up system, warning people to check if it's
> > be done on purpose in advance.
> >
> > Here 1/4 of PAGE_SIZE is chosen since system must have been insane
> > with this value. For those systems with PAGE_SIZE larger than 4KB,
> > 1KB is simply taken.
> >
> > Signed-off-by: Baoquan He <bhe@redhat.com>
> > Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>
> Sorry, I forgot adding 'Acked-by' tag here.
Should be "Suggested-by:" tag.
>
> > ---
> > mm/page_alloc.c | 2 ++
> > 1 file changed, 2 insertions(+)
> >
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 3eb01dedfb50..eddf96c137f2 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -67,6 +67,7 @@
> > #include <linux/lockdep.h>
> > #include <linux/nmi.h>
> > #include <linux/psi.h>
> > +#include <linux/sizes.h>
> >
> > #include <asm/sections.h>
> > #include <asm/tlbflush.h>
> > @@ -7170,6 +7171,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
> > unsigned long start_pfn, end_pfn;
> > int i, nid;
> >
> > + BUILD_BUG_ON(sizeof(struct page) > min_t(size_t, SZ_1K, PAGE_SIZE/4));
> > /* Record where the zone boundaries are */
> > memset(arch_zone_lowest_possible_pfn, 0,
> > sizeof(arch_zone_lowest_possible_pfn));
> > --
> > 2.17.2
> >
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH v4 4/6] x86/mm/KASLR: Fix the wrong calculation of memory region initial size
2019-03-14 9:46 [PATCH v4 0/6] Several patches to fix code bugs, improve documents and clean up Baoquan He
` (2 preceding siblings ...)
2019-03-14 9:46 ` [PATCH v4 3/6] mm: Add build time sanity check for struct page size Baoquan He
@ 2019-03-14 9:46 ` Baoquan He
2019-03-24 20:58 ` Thomas Gleixner
2019-03-14 9:46 ` [PATCH v4 5/6] x86/mm/KASLR: Calculate the actual size of vmemmap region Baoquan He
2019-03-14 9:46 ` [PATCH v4 6/6] x86/mm/KASLR: Do not adapt the size of the direct mapping region for SGI UV system Baoquan He
5 siblings, 1 reply; 11+ messages in thread
From: Baoquan He @ 2019-03-14 9:46 UTC (permalink / raw)
To: linux-kernel
Cc: mingo, keescook, kirill, yamada.masahiro, tglx, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie, Baoquan He
In memory region KASLR, __PHYSICAL_MASK_SHIFT is taken to calculate
the initial size of the direct mapping region. This is correct in
the old code where __PHYSICAL_MASK_SHIFT was equal to MAX_PHYSMEM_BITS,
46 bits, and only 4-level mode was supported.
Later, in commit:
b83ce5ee91471d ("x86/mm/64: Make __PHYSICAL_MASK_SHIFT always 52"),
__PHYSICAL_MASK_SHIFT was changed to always be 52 bits, regardless of
whether it's 5-level or 4-level. This is wrong for 4-level paging. Then when we
adapt physical memory region size based on available memory, it
will overflow if the amount of system RAM and the padding is bigger
than 64 TB.
In fact, here MAX_PHYSMEM_BITS should be used instead. Fix it by
replacing __PHYSICAL_MASK_SHIFT with MAX_PHYSMEM_BITS.
Signed-off-by: Baoquan He <bhe@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Thomas Garnier <thgarnie@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
---
arch/x86/mm/kaslr.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index d7ea6b252594..ebf6d1d92385 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -131,7 +131,7 @@ void __init kernel_randomize_memory(void)
if (!kaslr_memory_enabled())
return;
- kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT);
+ kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
/*
--
2.17.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v4 4/6] x86/mm/KASLR: Fix the wrong calculation of memory region initial size
2019-03-14 9:46 ` [PATCH v4 4/6] x86/mm/KASLR: Fix the wrong calculation of memory region initial size Baoquan He
@ 2019-03-24 20:58 ` Thomas Gleixner
2019-03-25 3:46 ` Baoquan He
0 siblings, 1 reply; 11+ messages in thread
From: Thomas Gleixner @ 2019-03-24 20:58 UTC (permalink / raw)
To: Baoquan He
Cc: linux-kernel, mingo, keescook, kirill, yamada.masahiro, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie
On Thu, 14 Mar 2019, Baoquan He wrote:
> In memory region KASLR, __PHYSICAL_MASK_SHIFT is taken to calculate
> the initial size of the direct mapping region. This is correct in
> the old code where __PHYSICAL_MASK_SHIFT was equal to MAX_PHYSMEM_BITS,
> 46 bits, and only 4-level mode was supported.
>
> Later, in commit:
> b83ce5ee91471d ("x86/mm/64: Make __PHYSICAL_MASK_SHIFT always 52"),
> __PHYSICAL_MASK_SHIFT was changed to be always 52 bits, no matter it's
> 5-level or 4-level. This is wrong for 4-level paging. Then when we
> adapt physical memory region size based on available memory, it
> will overflow if the amount of system RAM and the padding is bigger
> than 64 TB.
I have no idea what that sentence means and what will overflow. Neither I
have the time to stare at the code to figure it out. Changelogs need to be
self explanatory. Providing a simple example with numbers or an
illustration would be helpful.
> In fact, here MAX_PHYSMEM_BITS should be used instead. Fix it by
> replacing __PHYSICAL_MASK_SHIFT with MAX_PHYSMEM_BITS.
>
> Signed-off-by: Baoquan He <bhe@redhat.com>
> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Reviewed-by: Thomas Garnier <thgarnie@google.com>
> Acked-by: Kees Cook <keescook@chromium.org>
So this is an actual bug fix, which is in the middle of this series. Bug
fixes go first and need to be independent of the rest of the series.
They also need a Fixes: tag.
> diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
> index d7ea6b252594..ebf6d1d92385 100644
> --- a/arch/x86/mm/kaslr.c
> +++ b/arch/x86/mm/kaslr.c
> @@ -131,7 +131,7 @@ void __init kernel_randomize_memory(void)
> if (!kaslr_memory_enabled())
> return;
>
> - kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT);
> + kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
> kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
That said, I surely can understand why this change needs to be done, but
the changelog needs to explain the issue so someone with less experience or
someone looking at this in a year from now doesn't have to bang his head
against the wall.
Thanks,
tglx
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v4 4/6] x86/mm/KASLR: Fix the wrong calculation of memory region initial size
2019-03-24 20:58 ` Thomas Gleixner
@ 2019-03-25 3:46 ` Baoquan He
0 siblings, 0 replies; 11+ messages in thread
From: Baoquan He @ 2019-03-25 3:46 UTC (permalink / raw)
To: Thomas Gleixner
Cc: linux-kernel, mingo, keescook, kirill, yamada.masahiro, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie
On 03/24/19 at 09:58pm, Thomas Gleixner wrote:
> On Thu, 14 Mar 2019, Baoquan He wrote:
> > In memory region KASLR, __PHYSICAL_MASK_SHIFT is taken to calculate
> > the initial size of the direct mapping region. This is correct in
> > the old code where __PHYSICAL_MASK_SHIFT was equal to MAX_PHYSMEM_BITS,
> > 46 bits, and only 4-level mode was supported.
> >
> > Later, in commit:
> > b83ce5ee91471d ("x86/mm/64: Make __PHYSICAL_MASK_SHIFT always 52"),
> > __PHYSICAL_MASK_SHIFT was changed to be always 52 bits, no matter it's
> > 5-level or 4-level. This is wrong for 4-level paging. Then when we
> > adapt physical memory region size based on available memory, it
> > will overflow if the amount of system RAM and the padding is bigger
> > than 64 TB.
>
> I have no idea what that sentence means and what will overflow. Neither I
> have the time to stare at the code to figure it out. Changelogs need to be
> self explanatory. Providing a simple example with numbers or an
> illustration would be helpful.
>
> > In fact, here MAX_PHYSMEM_BITS should be used instead. Fix it by
> > replacing __PHYSICAL_MASK_SHIFT with MAX_PHYSMEM_BITS.
> >
> > Signed-off-by: Baoquan He <bhe@redhat.com>
> > Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > Reviewed-by: Thomas Garnier <thgarnie@google.com>
> > Acked-by: Kees Cook <keescook@chromium.org>
>
> So this is an actual bug fix, which is in the middle of this series. Bug
> fixes go first and need to be independent of the rest of the series.
>
> They also need a Fixes: tag.
>
> > diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
> > index d7ea6b252594..ebf6d1d92385 100644
> > --- a/arch/x86/mm/kaslr.c
> > +++ b/arch/x86/mm/kaslr.c
> > @@ -131,7 +131,7 @@ void __init kernel_randomize_memory(void)
> > if (!kaslr_memory_enabled())
> > return;
> >
> > - kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT);
> > + kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
> > kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
>
> That said, I surely can understand why this change needs to be done, but
> the changelog needs to explain the issue so someone with less experience or
> someone looking at this in a year from now doesn't have to bang his head
> against the wall.
OK, let me add example into log and make log more understandable.
Thanks.
I will take out patch 4, 5, 6 of this series and send them out
separately. Then send patch 1, 2 ,3 as a clean up patch series.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH v4 5/6] x86/mm/KASLR: Calculate the actual size of vmemmap region
2019-03-14 9:46 [PATCH v4 0/6] Several patches to fix code bugs, improve documents and clean up Baoquan He
` (3 preceding siblings ...)
2019-03-14 9:46 ` [PATCH v4 4/6] x86/mm/KASLR: Fix the wrong calculation of memory region initial size Baoquan He
@ 2019-03-14 9:46 ` Baoquan He
2019-03-14 9:46 ` [PATCH v4 6/6] x86/mm/KASLR: Do not adapt the size of the direct mapping region for SGI UV system Baoquan He
5 siblings, 0 replies; 11+ messages in thread
From: Baoquan He @ 2019-03-14 9:46 UTC (permalink / raw)
To: linux-kernel
Cc: mingo, keescook, kirill, yamada.masahiro, tglx, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie, Baoquan He
Vmemmap region has different maximum size depending on paging mode.
Now its size is hardcoded as 1 TB in memory KASLR, which is not
right for 5-level paging mode. It will cause overflow if the vmemmap
region is randomized to be adjacent to cpu_entry_area region and
its actual size is bigger than 1 TB.
So here calculate how many TB by the actual size of vmemmap region
and align up to 1TB boundary. In 4-level the size will be 1 TB always
since the max is 1 TB. In 5-level it's variable so that space can
be saved for randomization.
Signed-off-by: Baoquan He <bhe@redhat.com>
---
arch/x86/mm/kaslr.c | 27 ++++++++++++++++++++-------
1 file changed, 20 insertions(+), 7 deletions(-)
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index ebf6d1d92385..615a79f6b701 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -69,19 +69,22 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*
* { &page_offset_base, 0 },
* { &vmalloc_base, 0 },
- * { &vmemmap_base, 1 },
+ * { &vmemmap_base, 0 },
*
* @size_tb: size in TB of each memory region. E.g., the sizes in 4-level
* paging mode are:
*
* - Physical memory mapping: (actual RAM size + 10 TB padding)
* - Vmalloc: 32 TB
- * - Vmemmap: 1 TB
+ * - Vmemmap: (needed size aligned to 1TB boundary)
*
- * As seen, the size of the physical memory mapping region is variable,
- * calculated according to the actual size of system RAM in order to
- * save more space for randomization. The rest are fixed values related
- * to paging mode.
+ * As seen, only the vmalloc region is fixed value related to paging
+ * mode. While the sizes of the physical memory mapping region and
+ * vmemmap region are variable. The size of the physical memory mapping
+ * region is calculated according to the actual size of system RAM plus
+ * padding value. And the size of vmemmap is calculated as needed and
+ * aligned to 1 TB boundary. The calculations done here is to save more
+ * space for randomization.
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
@@ -89,7 +92,7 @@ static __initdata struct kaslr_memory_region {
} kaslr_regions[] = {
{ &page_offset_base, 0 },
{ &vmalloc_base, 0 },
- { &vmemmap_base, 1 },
+ { &vmemmap_base, 0 },
};
/*
@@ -115,6 +118,7 @@ void __init kernel_randomize_memory(void)
unsigned long rand, memory_tb;
struct rnd_state rand_state;
unsigned long remain_entropy;
+ unsigned long vmemmap_size;
vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
vaddr = vaddr_start;
@@ -146,6 +150,15 @@ void __init kernel_randomize_memory(void)
if (memory_tb < kaslr_regions[0].size_tb)
kaslr_regions[0].size_tb = memory_tb;
+ /*
+ * Calculate how many TB vmemmap region needs, and align to 1 TB
+ * boundary. It's 1 TB in 4-level since the max is 1 TB, while
+ * variable in 5-level.
+ */
+ vmemmap_size = (kaslr_regions[0].size_tb << (TB_SHIFT - PAGE_SHIFT)) *
+ sizeof(struct page);
+ kaslr_regions[2].size_tb = DIV_ROUND_UP(vmemmap_size, 1UL << TB_SHIFT);
+
/* Calculate entropy available between regions */
remain_entropy = vaddr_end - vaddr_start;
for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++)
--
2.17.2
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 6/6] x86/mm/KASLR: Do not adapt the size of the direct mapping region for SGI UV system
2019-03-14 9:46 [PATCH v4 0/6] Several patches to fix code bugs, improve documents and clean up Baoquan He
` (4 preceding siblings ...)
2019-03-14 9:46 ` [PATCH v4 5/6] x86/mm/KASLR: Calculate the actual size of vmemmap region Baoquan He
@ 2019-03-14 9:46 ` Baoquan He
5 siblings, 0 replies; 11+ messages in thread
From: Baoquan He @ 2019-03-14 9:46 UTC (permalink / raw)
To: linux-kernel
Cc: mingo, keescook, kirill, yamada.masahiro, tglx, bp, hpa,
dave.hansen, luto, peterz, x86, thgarnie, Baoquan He
On SGI UV system, kernel often hangs when KASLR is enabled. Disabling
KASLR makes kernel work well.
The back trace is:
kernel BUG at arch/x86/mm/init_64.c:311!
invalid opcode: 0000 [#1] SMP
[...]
RIP: 0010:__init_extra_mapping+0x188/0x196
[...]
Call Trace:
init_extra_mapping_uc+0x13/0x15
map_high+0x67/0x75
map_mmioh_high_uv3+0x20a/0x219
uv_system_init_hub+0x12d9/0x1496
uv_system_init+0x27/0x29
native_smp_prepare_cpus+0x28d/0x2d8
kernel_init_freeable+0xdd/0x253
? rest_init+0x80/0x80
kernel_init+0xe/0x110
ret_from_fork+0x2c/0x40
This is because the SGI UV system needs to map its MMIOH region to the
direct mapping section, and the mapping happens in rest_init(), which is
much later than the call to kernel_randomize_memory() that does mm KASLR.
So mm KASLR can't account for the size of the MMIOH region when
calculating the needed size of address space for the direct mapping section.
When KASLR is disabled, there are 64TB address space for both system RAM
and the MMIOH regions to share. When KASLR is enabled, the current code
of mm KASLR only reserves the actual size of system RAM plus extra 10TB
for the direct mapping. Thus later the MMIOH mapping could go beyond
the upper bound of the direct mapping to step into VMALLOC or VMEMMAP area.
Then BUG_ON() in __init_extra_mapping() will be triggered.
E.g. on the SGI UV3 machine where this bug is reported, there are two MMIOH
regions:
[ 1.519001] UV: Map MMIOH0_HI 0xffc00000000 - 0x100000000000
[ 1.523001] UV: Map MMIOH1_HI 0x100000000000 - 0x200000000000
They are [16TB-16G, 16TB) and [16TB, 32TB). On this machine, 512G RAM are
spread out to 1TB regions. Then above two SGI MMIOH regions also will be
mapped into the direct mapping section.
To fix it, we need to check whether it's an SGI UV system by calling
is_early_uv_system() in kernel_randomize_memory(). If yes, do not adapt
the size of the direct mapping section; just keep it as 64 TB.
Signed-off-by: Baoquan He <bhe@redhat.com>
---
arch/x86/mm/kaslr.c | 60 +++++++++++++++++++++++++++++++++------------
1 file changed, 45 insertions(+), 15 deletions(-)
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 615a79f6b701..7584124fca82 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -29,6 +29,7 @@
#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/kaslr.h>
+#include <asm/uv/uv.h>
#include "mm_internal.h"
@@ -104,6 +105,46 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
}
+/*
+ * calc_direct_mapping_size - calculate the needed size of the direct
+ * mapping area.
+ *
+ * Even though a huge virtual address space is reserved for the direct
+ * mapping of physical memory, e.g. in 4-level paging mode it's 64 TB,
+ * rare systems can own enough physical memory to use it up, most have
+ * even less than 1TB. So with KASLR enabled, we adapt the size of
+ * direct mapping area to size of actual physical memory plus the
+ * configured padding CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING.
+ * The left part will be taken out to join memory randomization.
+ *
+ * Note that the UV system is an exception; its MMIOH region needs to be
+ * mapped into the direct mapping area too, but its size can't be known
+ * until rest_init() is called. Hence for the UV system, do not adapt the size
+ * of direct mapping area.
+ */
+static inline unsigned long calc_direct_mapping_size(void)
+{
+ unsigned long size_tb, memory_tb;
+
+ /*
+ * Update Physical memory mapping to available and
+ * add padding if needed (especially for memory hotplug support).
+ */
+ memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
+ CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+
+ size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
+
+ /*
+ * Adapt physical memory region size based on available memory if
+ * it's not UV system.
+ */
+ if (memory_tb < size_tb && !is_early_uv_system())
+ size_tb = memory_tb;
+
+ return size_tb;
+}
+
/*
* kernel_randomize_memory - initialize base and padding for each
* memory region randomized with KASLR.
@@ -113,12 +154,11 @@ static inline bool kaslr_memory_enabled(void)
*/
void __init kernel_randomize_memory(void)
{
- size_t i;
- unsigned long vaddr_start, vaddr;
- unsigned long rand, memory_tb;
- struct rnd_state rand_state;
+ unsigned long vaddr_start, vaddr, rand;
unsigned long remain_entropy;
unsigned long vmemmap_size;
+ struct rnd_state rand_state;
+ size_t i;
vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
vaddr = vaddr_start;
@@ -135,20 +175,10 @@ void __init kernel_randomize_memory(void)
if (!kaslr_memory_enabled())
return;
- kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
+ kaslr_regions[0].size_tb = calc_direct_mapping_size();
kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
- /*
- * Update Physical memory mapping to available and
- * add padding if needed (especially for memory hotplug support).
- */
BUG_ON(kaslr_regions[0].base != &page_offset_base);
- memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
- CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
-
- /* Adapt phyiscal memory region size based on available memory */
- if (memory_tb < kaslr_regions[0].size_tb)
- kaslr_regions[0].size_tb = memory_tb;
/*
* Calculate how many TB vmemmap region needs, and align to 1 TB
--
2.17.2
^ permalink raw reply related [flat|nested] 11+ messages in thread