linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH] rip out x86_32 NUMA remapping code
@ 2013-01-31  0:56 Dave Hansen
  2013-01-31 21:24 ` H. Peter Anvin
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Dave Hansen @ 2013-01-31  0:56 UTC (permalink / raw)
  To: linux-mm; +Cc: linux-kernel, Dave Hansen


This code was an optimization for 32-bit NUMA systems.

It has probably been the cause of a number of subtle bugs over
the years, although the conditions to excite them would have
been hard to trigger.  Essentially, we remap part of the kernel
linear mapping area, and then sometimes part of that area gets
freed back in to the bootmem allocator.  If those pages get
used by kernel data structures (say mem_map[] or a dentry),
there's no big deal.  But, if anyone ever tried to use the
linear mapping for these pages _and_ cared about their physical
address, bad things happen.

For instance, say you passed __GFP_ZERO to the page allocator
and then happened to get handed one of these pages, it would zero
the remapped page, but it would make a pte to the _old_ page.
There are probably a hundred other ways that it could screw
with things.

We don't need to hang on to performance optimizations for
these old boxes any more.  All my 32-bit NUMA systems are long
dead and buried, and I probably had access to more than most
people.

This code is causing real things to break today:

	https://lkml.org/lkml/2013/1/9/376

I looked in to actually fixing this, but it requires surgery
to way too much brittle code, as well as stuff like
per_cpu_ptr_to_phys().


---

 linux-2.6.git-dave/arch/x86/Kconfig            |    4 
 linux-2.6.git-dave/arch/x86/mm/numa.c          |    3 
 linux-2.6.git-dave/arch/x86/mm/numa_32.c       |  161 -------------------------
 linux-2.6.git-dave/arch/x86/mm/numa_internal.h |    6 
 4 files changed, 174 deletions(-)

diff -puN arch/x86/Kconfig~rip-out-i386-NUMA-remapping-code arch/x86/Kconfig
--- linux-2.6.git/arch/x86/Kconfig~rip-out-i386-NUMA-remapping-code	2013-01-30 16:25:36.792338332 -0800
+++ linux-2.6.git-dave/arch/x86/Kconfig	2013-01-30 16:25:36.800338399 -0800
@@ -1253,10 +1253,6 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
 
-config HAVE_ARCH_ALLOC_REMAP
-	def_bool y
-	depends on X86_32 && NUMA
-
 config ARCH_HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on X86_32 && DISCONTIGMEM
diff -puN arch/x86/mm/numa_32.c~rip-out-i386-NUMA-remapping-code arch/x86/mm/numa_32.c
--- linux-2.6.git/arch/x86/mm/numa_32.c~rip-out-i386-NUMA-remapping-code	2013-01-30 16:25:36.796338365 -0800
+++ linux-2.6.git-dave/arch/x86/mm/numa_32.c	2013-01-30 16:25:36.800338399 -0800
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int
 
 extern unsigned long highend_pfn, highstart_pfn;
 
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-/*
- * Remap memory allocator
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/**
- * alloc_remap - Allocate remapped memory
- * @nid: NUMA node to allocate memory from
- * @size: The size of allocation
- *
- * Allocate @size bytes from the remap area of NUMA node @nid.  The
- * size of the remap area is predetermined by init_alloc_remap() and
- * only the callers considered there should call this function.  For
- * more info, please read the comment on top of init_alloc_remap().
- *
- * The caller must be ready to handle allocation failure from this
- * function and fall back to regular memory allocator in such cases.
- *
- * CONTEXT:
- * Single CPU early boot context.
- *
- * RETURNS:
- * Pointer to the allocated memory on success, %NULL on failure.
- */
-void *alloc_remap(int nid, unsigned long size)
-{
-	void *allocation = node_remap_alloc_vaddr[nid];
-
-	size = ALIGN(size, L1_CACHE_BYTES);
-
-	if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
-		return NULL;
-
-	node_remap_alloc_vaddr[nid] += size;
-	memset(allocation, 0, size);
-
-	return allocation;
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- *                       during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
-	int node;
-
-	for_each_online_node(node) {
-		unsigned long start_va, start_pfn, nr_pages, pfn;
-
-		start_va = (unsigned long)node_remap_start_vaddr[node];
-		start_pfn = node_remap_start_pfn[node];
-		nr_pages = (node_remap_end_vaddr[node] -
-			    node_remap_start_vaddr[node]) >> PAGE_SHIFT;
-
-		printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
-		for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
-			unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
-			pgd_t *pgd = pgd_base + pgd_index(vaddr);
-			pud_t *pud = pud_offset(pgd, vaddr);
-			pmd_t *pmd = pmd_offset(pud, vaddr);
-
-			set_pmd(pmd, pfn_pmd(start_pfn + pfn,
-						PAGE_KERNEL_LARGE_EXEC));
-
-			printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
-				__func__, vaddr, start_pfn + pfn);
-		}
-	}
-}
-#endif
-
-/**
- * init_alloc_remap - Initialize remap allocator for a NUMA node
- * @nid: NUMA node to initizlie remap allocator for
- *
- * NUMA nodes may end up without any lowmem.  As allocating pgdat and
- * memmap on a different node with lowmem is inefficient, a special
- * remap allocator is implemented which can be used by alloc_remap().
- *
- * For each node, the amount of memory which will be necessary for
- * pgdat and memmap is calculated and two memory areas of the size are
- * allocated - one in the node and the other in lowmem; then, the area
- * in the node is remapped to the lowmem area.
- *
- * As pgdat and memmap must be allocated in lowmem anyway, this
- * doesn't waste lowmem address space; however, the actual lowmem
- * which gets remapped over is wasted.  The amount shouldn't be
- * problematic on machines this feature will be used.
- *
- * Initialization failure isn't fatal.  alloc_remap() is used
- * opportunistically and the callers will fall back to other memory
- * allocation mechanisms on failure.
- */
-void __init init_alloc_remap(int nid, u64 start, u64 end)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long end_pfn = end >> PAGE_SHIFT;
-	unsigned long size, pfn;
-	u64 node_pa, remap_pa;
-	void *remap_va;
-
-	/*
-	 * The acpi/srat node info can show hot-add memroy zones where
-	 * memory could be added but not currently present.
-	 */
-	printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
-	       nid, start_pfn, end_pfn);
-
-	/* calculate the necessary space aligned to large page size */
-	size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
-	size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-	size = ALIGN(size, LARGE_PAGE_BYTES);
-
-	/* allocate node memory and the lowmem remap area */
-	node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
-	if (!node_pa) {
-		pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
-			   size, nid);
-		return;
-	}
-	memblock_reserve(node_pa, size);
-
-	remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
-					  max_low_pfn << PAGE_SHIFT,
-					  size, LARGE_PAGE_BYTES);
-	if (!remap_pa) {
-		pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
-			   size, nid);
-		memblock_free(node_pa, size);
-		return;
-	}
-	memblock_reserve(remap_pa, size);
-	remap_va = phys_to_virt(remap_pa);
-
-	/* perform actual remap */
-	for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
-		set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
-			    (node_pa >> PAGE_SHIFT) + pfn,
-			    PAGE_KERNEL_LARGE);
-
-	/* initialize remap allocator parameters */
-	node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
-	node_remap_start_vaddr[nid] = remap_va;
-	node_remap_end_vaddr[nid] = remap_va + size;
-	node_remap_alloc_vaddr[nid] = remap_va;
-
-	printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
-	       nid, node_pa, node_pa + size, remap_va, remap_va + size);
-}
-
 void __init initmem_init(void)
 {
 	x86_numa_init();
diff -puN arch/x86/mm/numa.c~rip-out-i386-NUMA-remapping-code arch/x86/mm/numa.c
--- linux-2.6.git/arch/x86/mm/numa.c~rip-out-i386-NUMA-remapping-code	2013-01-30 16:25:36.796338365 -0800
+++ linux-2.6.git-dave/arch/x86/mm/numa.c	2013-01-30 16:25:36.800338399 -0800
@@ -205,9 +205,6 @@ static void __init setup_node_data(int n
 	if (end && (end - start) < NODE_MIN_SIZE)
 		return;
 
-	/* initialize remap allocator before aligning to ZONE_ALIGN */
-	init_alloc_remap(nid, start, end);
-
 	start = roundup(start, ZONE_ALIGN);
 
 	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
diff -puN arch/x86/mm/numa_internal.h~rip-out-i386-NUMA-remapping-code arch/x86/mm/numa_internal.h
--- linux-2.6.git/arch/x86/mm/numa_internal.h~rip-out-i386-NUMA-remapping-code	2013-01-30 16:25:36.796338365 -0800
+++ linux-2.6.git-dave/arch/x86/mm/numa_internal.h	2013-01-30 16:25:36.800338399 -0800
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
 
 void __init x86_numa_init(void);
 
-#ifdef CONFIG_X86_64
-static inline void init_alloc_remap(int nid, u64 start, u64 end)	{ }
-#else
-void __init init_alloc_remap(int nid, u64 start, u64 end);
-#endif
-
 #ifdef CONFIG_NUMA_EMU
 void __init numa_emulation(struct numa_meminfo *numa_meminfo,
 			   int numa_dist_cnt);
_


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] rip out x86_32 NUMA remapping code
  2013-01-31  0:56 [RFC][PATCH] rip out x86_32 NUMA remapping code Dave Hansen
@ 2013-01-31 21:24 ` H. Peter Anvin
  2013-01-31 21:51 ` H. Peter Anvin
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 12+ messages in thread
From: H. Peter Anvin @ 2013-01-31 21:24 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-mm, linux-kernel

On 01/30/2013 04:56 PM, Dave Hansen wrote:
> This code was an optimization for 32-bit NUMA systems.
> 
> It has probably been the cause of a number of subtle bugs over
> the years, although the conditions to excite them would have
> been hard to trigger.  Essentially, we remap part of the kernel
> linear mapping area, and then sometimes part of that area gets
> freed back in to the bootmem allocator.  If those pages get
> used by kernel data structures (say mem_map[] or a dentry),
> there's no big deal.  But, if anyone ever tried to use the
> linear mapping for these pages _and_ cared about their physical
> address, bad things happen.
> 
> For instance, say you passed __GFP_ZERO to the page allocator
> and then happened to get handed one of these pages, it zero the
> remapped page, but it would make a pte to the _old_ page.
> There are probably a hundred other ways that it could screw
> with things.
> 
> We don't need to hang on to performance optimizations for
> these old boxes any more.  All my 32-bit NUMA systems are long
> dead and buried, and I probably had access to more than most
> people.
> 
> This code is causing real things to break today:
> 
> 	https://lkml.org/lkml/2013/1/9/376
> 
> I looked in to actually fixing this, but it requires surgery
> to way too much brittle code, as well as stuff like
> per_cpu_ptr_to_phys().
> 

This came up because we made some changes which made us trap on this
bug.  Most likely we have been silently corrupting memory for quite some
time.  Unless someone objects strongly I will apply this patch.

	-hpa



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] rip out x86_32 NUMA remapping code
  2013-01-31  0:56 [RFC][PATCH] rip out x86_32 NUMA remapping code Dave Hansen
  2013-01-31 21:24 ` H. Peter Anvin
@ 2013-01-31 21:51 ` H. Peter Anvin
  2013-01-31 21:55   ` Yinghai Lu
  2013-01-31 22:39 ` [tip:x86/mm] x86-32, mm: Rip out x86_32 NUMA remapping code tip-bot for Dave Hansen
  2013-01-31 22:40 ` [tip:x86/mm] x86-32, mm: Remove reference to resume_map_numa_kva( ) tip-bot for H. Peter Anvin
  3 siblings, 1 reply; 12+ messages in thread
From: H. Peter Anvin @ 2013-01-31 21:51 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-mm, linux-kernel

On 01/30/2013 04:56 PM, Dave Hansen wrote:
> This code was an optimization for 32-bit NUMA systems.
> 
> It has probably been the cause of a number of subtle bugs over
> the years, although the conditions to excite them would have
> been hard to trigger.  Essentially, we remap part of the kernel
> linear mapping area, and then sometimes part of that area gets
> freed back in to the bootmem allocator.  If those pages get
> used by kernel data structures (say mem_map[] or a dentry),
> there's no big deal.  But, if anyone ever tried to use the
> linear mapping for these pages _and_ cared about their physical
> address, bad things happen.
> 
> For instance, say you passed __GFP_ZERO to the page allocator
> and then happened to get handed one of these pages, it zero the
> remapped page, but it would make a pte to the _old_ page.
> There are probably a hundred other ways that it could screw
> with things.
> 
> We don't need to hang on to performance optimizations for
> these old boxes any more.  All my 32-bit NUMA systems are long
> dead and buried, and I probably had access to more than most
> people.
> 
> This code is causing real things to break today:
> 
> 	https://lkml.org/lkml/2013/1/9/376
> 
> I looked in to actually fixing this, but it requires surgery
> to way too much brittle code, as well as stuff like
> per_cpu_ptr_to_phys().
> 

I get a build failure on i386 allyesconfig with this patch:

arch/x86/power/built-in.o: In function `swsusp_arch_resume':
(.text+0x14e4): undefined reference to `resume_map_numa_kva'

It looks trivial to fix up; I assume resume_map_numa_kva() just goes
away like it does in the non-NUMA case, but it would be nice if you
could confirm that.

	-hpa


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] rip out x86_32 NUMA remapping code
  2013-01-31 21:51 ` H. Peter Anvin
@ 2013-01-31 21:55   ` Yinghai Lu
  2013-01-31 21:59     ` H. Peter Anvin
  2013-01-31 22:41     ` [tip:x86/mm] x86-32, mm: Remove reference to alloc_remap() tip-bot for H. Peter Anvin
  0 siblings, 2 replies; 12+ messages in thread
From: Yinghai Lu @ 2013-01-31 21:55 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Dave Hansen, linux-mm, linux-kernel

On Thu, Jan 31, 2013 at 1:51 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> I get a build failure on i386 allyesconfig with this patch:
>
> arch/x86/power/built-in.o: In function `swsusp_arch_resume':
> (.text+0x14e4): undefined reference to `resume_map_numa_kva'
>
> It looks trivial to fix up; I assume resume_map_numa_kva() just goes
> away like it does in the non-NUMA case, but it would be nice if you
> could confirm that.

the patch does not seem to be complete.

at least, it does not remove

arch/x86/mm/numa.c:     nd = alloc_remap(nid, nd_size);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] rip out x86_32 NUMA remapping code
  2013-01-31 21:55   ` Yinghai Lu
@ 2013-01-31 21:59     ` H. Peter Anvin
  2013-01-31 22:01       ` Yinghai Lu
  2013-01-31 22:41     ` [tip:x86/mm] x86-32, mm: Remove reference to alloc_remap() tip-bot for H. Peter Anvin
  1 sibling, 1 reply; 12+ messages in thread
From: H. Peter Anvin @ 2013-01-31 21:59 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: Dave Hansen, linux-mm, linux-kernel

On 01/31/2013 01:55 PM, Yinghai Lu wrote:
> On Thu, Jan 31, 2013 at 1:51 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> I get a build failure on i386 allyesconfig with this patch:
>>
>> arch/x86/power/built-in.o: In function `swsusp_arch_resume':
>> (.text+0x14e4): undefined reference to `resume_map_numa_kva'
>>
>> It looks trivial to fix up; I assume resume_map_numa_kva() just goes
>> away like it does in the non-NUMA case, but it would be nice if you
>> could confirm that.
> 
> the patches does not seem to complete.
> 
> at least, it does not remove
> 
> arch/x86/mm/numa.c:     nd = alloc_remap(nid, nd_size);
> 

... which will just return NULL because alloc_remap turns into an inline
just returning NULL.  So the compiled code is correct, but the source
code is needlessly messy.

	-hpa


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] rip out x86_32 NUMA remapping code
  2013-01-31 21:59     ` H. Peter Anvin
@ 2013-01-31 22:01       ` Yinghai Lu
  2013-01-31 22:03         ` H. Peter Anvin
  0 siblings, 1 reply; 12+ messages in thread
From: Yinghai Lu @ 2013-01-31 22:01 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Dave Hansen, linux-mm, linux-kernel

On Thu, Jan 31, 2013 at 1:59 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 01/31/2013 01:55 PM, Yinghai Lu wrote:
>> On Thu, Jan 31, 2013 at 1:51 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> I get a build failure on i386 allyesconfig with this patch:
>>>
>>> arch/x86/power/built-in.o: In function `swsusp_arch_resume':
>>> (.text+0x14e4): undefined reference to `resume_map_numa_kva'
>>>
>>> It looks trivial to fix up; I assume resume_map_numa_kva() just goes
>>> away like it does in the non-NUMA case, but it would be nice if you
>>> could confirm that.
>>
>> the patches does not seem to complete.
>>
>> at least, it does not remove
>>
>> arch/x86/mm/numa.c:     nd = alloc_remap(nid, nd_size);
>>
>
> ... which will just return NULL because alloc_remap turns into an inline
> just returning NULL.  So the compiled code is correct, but the source
> code is needlessly messy.

yes...

It still left #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP there.

#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
extern void *alloc_remap(int nid, unsigned long size);
#else
static inline void *alloc_remap(int nid, unsigned long size)
{
        return NULL;
}
#endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */

should throw them all away.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] rip out x86_32 NUMA remapping code
  2013-01-31 22:01       ` Yinghai Lu
@ 2013-01-31 22:03         ` H. Peter Anvin
  2013-01-31 22:09           ` Dave Hansen
  0 siblings, 1 reply; 12+ messages in thread
From: H. Peter Anvin @ 2013-01-31 22:03 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: Dave Hansen, linux-mm, linux-kernel

On 01/31/2013 02:01 PM, Yinghai Lu wrote:
> On Thu, Jan 31, 2013 at 1:59 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 01/31/2013 01:55 PM, Yinghai Lu wrote:
>>> On Thu, Jan 31, 2013 at 1:51 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>>> I get a build failure on i386 allyesconfig with this patch:
>>>>
>>>> arch/x86/power/built-in.o: In function `swsusp_arch_resume':
>>>> (.text+0x14e4): undefined reference to `resume_map_numa_kva'
>>>>
>>>> It looks trivial to fix up; I assume resume_map_numa_kva() just goes
>>>> away like it does in the non-NUMA case, but it would be nice if you
>>>> could confirm that.
>>>
>>> the patches does not seem to complete.
>>>
>>> at least, it does not remove
>>>
>>> arch/x86/mm/numa.c:     nd = alloc_remap(nid, nd_size);
>>>
>>
>> ... which will just return NULL because alloc_remap turns into an inline
>> just returning NULL.  So the compiled code is correct, but the source
>> code is needlessly messy.
> 
> yes...
> 
> It still left #ifdefCONFIG_HAVE_ARCH_ALLOC_REMAP there.
> 
> #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
> extern void *alloc_remap(int nid, unsigned long size);
> #else
> static inline void *alloc_remap(int nid, unsigned long size)
> {
>         return NULL;
> }
> #endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */
> 
> should throw them all away.
> 

That is arch-independent code, and the tile architecture still uses it.

Makes one wonder how much it will get tested going forward, especially
with the x86-32 implementation clearly lacking in that department.

	-hpa


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] rip out x86_32 NUMA remapping code
  2013-01-31 22:03         ` H. Peter Anvin
@ 2013-01-31 22:09           ` Dave Hansen
  2013-01-31 22:12             ` H. Peter Anvin
  0 siblings, 1 reply; 12+ messages in thread
From: Dave Hansen @ 2013-01-31 22:09 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Yinghai Lu, linux-mm, linux-kernel, Chris Metcalf

On 01/31/2013 02:03 PM, H. Peter Anvin wrote:
> That is arch-independent code, and the tile architecture still uses it.
> 
> Makes one wonder how much it will get tested going forward, especially
> with the x86-32 implementation clearly lacking in that department.

Yeah, I left the tile one because it wasn't obvious how it was being
used over there.  It _probably_ has the same bugs that x86 does.

I'll refresh the patch fixing some of the compile issues and resend.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC][PATCH] rip out x86_32 NUMA remapping code
  2013-01-31 22:09           ` Dave Hansen
@ 2013-01-31 22:12             ` H. Peter Anvin
  0 siblings, 0 replies; 12+ messages in thread
From: H. Peter Anvin @ 2013-01-31 22:12 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Yinghai Lu, linux-mm, linux-kernel, Chris Metcalf

On 01/31/2013 02:09 PM, Dave Hansen wrote:
> On 01/31/2013 02:03 PM, H. Peter Anvin wrote:
>> That is arch-independent code, and the tile architecture still uses it.
>>
>> Makes one wonder how much it will get tested going forward, especially
>> with the x86-32 implementation clearly lacking in that department.
> 
> Yeah, I left the tile one because it wasn't obvious how it was being
> used over there.  It _probably_ has the same bugs that x86 does.
> 
> I'll refresh the patch fixing some of the compile issues and resend.
> 

I already have fixup patches, I think.

	-hpa


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [tip:x86/mm] x86-32, mm: Rip out x86_32 NUMA remapping code
  2013-01-31  0:56 [RFC][PATCH] rip out x86_32 NUMA remapping code Dave Hansen
  2013-01-31 21:24 ` H. Peter Anvin
  2013-01-31 21:51 ` H. Peter Anvin
@ 2013-01-31 22:39 ` tip-bot for Dave Hansen
  2013-01-31 22:40 ` [tip:x86/mm] x86-32, mm: Remove reference to resume_map_numa_kva( ) tip-bot for H. Peter Anvin
  3 siblings, 0 replies; 12+ messages in thread
From: tip-bot for Dave Hansen @ 2013-01-31 22:39 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, dave, stable, tglx, hpa

Commit-ID:  f03574f2d5b2d6229dcdf2d322848065f72953c7
Gitweb:     http://git.kernel.org/tip/f03574f2d5b2d6229dcdf2d322848065f72953c7
Author:     Dave Hansen <dave@linux.vnet.ibm.com>
AuthorDate: Wed, 30 Jan 2013 16:56:16 -0800
Committer:  H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Thu, 31 Jan 2013 14:12:30 -0800

x86-32, mm: Rip out x86_32 NUMA remapping code

This code was an optimization for 32-bit NUMA systems.

It has probably been the cause of a number of subtle bugs over
the years, although the conditions to excite them would have
been hard to trigger.  Essentially, we remap part of the kernel
linear mapping area, and then sometimes part of that area gets
freed back in to the bootmem allocator.  If those pages get
used by kernel data structures (say mem_map[] or a dentry),
there's no big deal.  But, if anyone ever tried to use the
linear mapping for these pages _and_ cared about their physical
address, bad things happen.

For instance, say you passed __GFP_ZERO to the page allocator
and then happened to get handed one of these pages, it zero the
remapped page, but it would make a pte to the _old_ page.
There are probably a hundred other ways that it could screw
with things.

We don't need to hang on to performance optimizations for
these old boxes any more.  All my 32-bit NUMA systems are long
dead and buried, and I probably had access to more than most
people.

This code is causing real things to break today:

	https://lkml.org/lkml/2013/1/9/376

I looked in to actually fixing this, but it requires surgery
to way too much brittle code, as well as stuff like
per_cpu_ptr_to_phys().

[ hpa: Cc: this for -stable, since it is a memory corruption issue.
  However, an alternative is to simply mark NUMA as depends BROKEN
  rather than EXPERIMENTAL in the X86_32 subclause... ]

Link: http://lkml.kernel.org/r/20130131005616.1C79F411@kernel.stglabs.ibm.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: <stable@vger.kernel.org>
---
 arch/x86/Kconfig            |   4 --
 arch/x86/mm/numa.c          |   3 -
 arch/x86/mm/numa_32.c       | 161 --------------------------------------------
 arch/x86/mm/numa_internal.h |   6 --
 4 files changed, 174 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 79795af..108efcb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1253,10 +1253,6 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
 
-config HAVE_ARCH_ALLOC_REMAP
-	def_bool y
-	depends on X86_32 && NUMA
-
 config ARCH_HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on X86_32 && DISCONTIGMEM
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index b2313c6..61c2b6f 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -205,9 +205,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 	if (end && (end - start) < NODE_MIN_SIZE)
 		return;
 
-	/* initialize remap allocator before aligning to ZONE_ALIGN */
-	init_alloc_remap(nid, start, end);
-
 	start = roundup(start, ZONE_ALIGN);
 
 	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a..73a6d73 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 
 extern unsigned long highend_pfn, highstart_pfn;
 
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-/*
- * Remap memory allocator
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/**
- * alloc_remap - Allocate remapped memory
- * @nid: NUMA node to allocate memory from
- * @size: The size of allocation
- *
- * Allocate @size bytes from the remap area of NUMA node @nid.  The
- * size of the remap area is predetermined by init_alloc_remap() and
- * only the callers considered there should call this function.  For
- * more info, please read the comment on top of init_alloc_remap().
- *
- * The caller must be ready to handle allocation failure from this
- * function and fall back to regular memory allocator in such cases.
- *
- * CONTEXT:
- * Single CPU early boot context.
- *
- * RETURNS:
- * Pointer to the allocated memory on success, %NULL on failure.
- */
-void *alloc_remap(int nid, unsigned long size)
-{
-	void *allocation = node_remap_alloc_vaddr[nid];
-
-	size = ALIGN(size, L1_CACHE_BYTES);
-
-	if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
-		return NULL;
-
-	node_remap_alloc_vaddr[nid] += size;
-	memset(allocation, 0, size);
-
-	return allocation;
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- *                       during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
-	int node;
-
-	for_each_online_node(node) {
-		unsigned long start_va, start_pfn, nr_pages, pfn;
-
-		start_va = (unsigned long)node_remap_start_vaddr[node];
-		start_pfn = node_remap_start_pfn[node];
-		nr_pages = (node_remap_end_vaddr[node] -
-			    node_remap_start_vaddr[node]) >> PAGE_SHIFT;
-
-		printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
-		for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
-			unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
-			pgd_t *pgd = pgd_base + pgd_index(vaddr);
-			pud_t *pud = pud_offset(pgd, vaddr);
-			pmd_t *pmd = pmd_offset(pud, vaddr);
-
-			set_pmd(pmd, pfn_pmd(start_pfn + pfn,
-						PAGE_KERNEL_LARGE_EXEC));
-
-			printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
-				__func__, vaddr, start_pfn + pfn);
-		}
-	}
-}
-#endif
-
-/**
- * init_alloc_remap - Initialize remap allocator for a NUMA node
- * @nid: NUMA node to initizlie remap allocator for
- *
- * NUMA nodes may end up without any lowmem.  As allocating pgdat and
- * memmap on a different node with lowmem is inefficient, a special
- * remap allocator is implemented which can be used by alloc_remap().
- *
- * For each node, the amount of memory which will be necessary for
- * pgdat and memmap is calculated and two memory areas of the size are
- * allocated - one in the node and the other in lowmem; then, the area
- * in the node is remapped to the lowmem area.
- *
- * As pgdat and memmap must be allocated in lowmem anyway, this
- * doesn't waste lowmem address space; however, the actual lowmem
- * which gets remapped over is wasted.  The amount shouldn't be
- * problematic on machines this feature will be used.
- *
- * Initialization failure isn't fatal.  alloc_remap() is used
- * opportunistically and the callers will fall back to other memory
- * allocation mechanisms on failure.
- */
-void __init init_alloc_remap(int nid, u64 start, u64 end)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long end_pfn = end >> PAGE_SHIFT;
-	unsigned long size, pfn;
-	u64 node_pa, remap_pa;
-	void *remap_va;
-
-	/*
-	 * The acpi/srat node info can show hot-add memroy zones where
-	 * memory could be added but not currently present.
-	 */
-	printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
-	       nid, start_pfn, end_pfn);
-
-	/* calculate the necessary space aligned to large page size */
-	size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
-	size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-	size = ALIGN(size, LARGE_PAGE_BYTES);
-
-	/* allocate node memory and the lowmem remap area */
-	node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
-	if (!node_pa) {
-		pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
-			   size, nid);
-		return;
-	}
-	memblock_reserve(node_pa, size);
-
-	remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
-					  max_low_pfn << PAGE_SHIFT,
-					  size, LARGE_PAGE_BYTES);
-	if (!remap_pa) {
-		pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
-			   size, nid);
-		memblock_free(node_pa, size);
-		return;
-	}
-	memblock_reserve(remap_pa, size);
-	remap_va = phys_to_virt(remap_pa);
-
-	/* perform actual remap */
-	for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
-		set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
-			    (node_pa >> PAGE_SHIFT) + pfn,
-			    PAGE_KERNEL_LARGE);
-
-	/* initialize remap allocator parameters */
-	node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
-	node_remap_start_vaddr[nid] = remap_va;
-	node_remap_end_vaddr[nid] = remap_va + size;
-	node_remap_alloc_vaddr[nid] = remap_va;
-
-	printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
-	       nid, node_pa, node_pa + size, remap_va, remap_va + size);
-}
-
 void __init initmem_init(void)
 {
 	x86_numa_init();
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3a..ad86ec9 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
 
 void __init x86_numa_init(void);
 
-#ifdef CONFIG_X86_64
-static inline void init_alloc_remap(int nid, u64 start, u64 end)	{ }
-#else
-void __init init_alloc_remap(int nid, u64 start, u64 end);
-#endif
-
 #ifdef CONFIG_NUMA_EMU
 void __init numa_emulation(struct numa_meminfo *numa_meminfo,
 			   int numa_dist_cnt);

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [tip:x86/mm] x86-32, mm: Remove reference to resume_map_numa_kva( )
  2013-01-31  0:56 [RFC][PATCH] rip out x86_32 NUMA remapping code Dave Hansen
                   ` (2 preceding siblings ...)
  2013-01-31 22:39 ` [tip:x86/mm] x86-32, mm: Rip out x86_32 NUMA remapping code tip-bot for Dave Hansen
@ 2013-01-31 22:40 ` tip-bot for H. Peter Anvin
  3 siblings, 0 replies; 12+ messages in thread
From: tip-bot for H. Peter Anvin @ 2013-01-31 22:40 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, dave, stable, tglx, hpa

Commit-ID:  bb112aec5ee41427e9b9726e3d57b896709598ed
Gitweb:     http://git.kernel.org/tip/bb112aec5ee41427e9b9726e3d57b896709598ed
Author:     H. Peter Anvin <hpa@linux.intel.com>
AuthorDate: Thu, 31 Jan 2013 13:53:10 -0800
Committer:  H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Thu, 31 Jan 2013 14:12:30 -0800

x86-32, mm: Remove reference to resume_map_numa_kva()

Remove reference to removed function resume_map_numa_kva().

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20130131005616.1C79F411@kernel.stglabs.ibm.com
---
 arch/x86/include/asm/mmzone_32.h | 6 ------
 arch/x86/power/hibernate_32.c    | 2 --
 2 files changed, 8 deletions(-)

diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index eb05fb3..8a9b3e2 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -14,12 +14,6 @@ extern struct pglist_data *node_data[];
 
 #include <asm/numaq.h>
 
-extern void resume_map_numa_kva(pgd_t *pgd);
-
-#else /* !CONFIG_NUMA */
-
-static inline void resume_map_numa_kva(pgd_t *pgd) {}
-
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_DISCONTIGMEM
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 74202c1..7d28c88 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -129,8 +129,6 @@ static int resume_physical_mapping_init(pgd_t *pgd_base)
 		}
 	}
 
-	resume_map_numa_kva(pgd_base);
-
 	return 0;
 }
 

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [tip:x86/mm] x86-32, mm: Remove reference to alloc_remap()
  2013-01-31 21:55   ` Yinghai Lu
  2013-01-31 21:59     ` H. Peter Anvin
@ 2013-01-31 22:41     ` tip-bot for H. Peter Anvin
  1 sibling, 0 replies; 12+ messages in thread
From: tip-bot for H. Peter Anvin @ 2013-01-31 22:41 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, yinghai, dave, stable, tglx, hpa

Commit-ID:  07f4207a305c834f528d08428df4531744e25678
Gitweb:     http://git.kernel.org/tip/07f4207a305c834f528d08428df4531744e25678
Author:     H. Peter Anvin <hpa@linux.intel.com>
AuthorDate: Thu, 31 Jan 2013 14:00:48 -0800
Committer:  H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Thu, 31 Jan 2013 14:12:30 -0800

x86-32, mm: Remove reference to alloc_remap()

We have removed the remap allocator for x86-32, and x86-64 never had
it (and doesn't need it).  Remove residual reference to it.

Reported-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/CAE9FiQVn6_QZi3fNQ-JHYiR-7jeDJ5hT0SyT_%2BzVvfOj=PzF3w@mail.gmail.com
---
 arch/x86/mm/numa.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 61c2b6f..8504f36 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -193,7 +193,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
 static void __init setup_node_data(int nid, u64 start, u64 end)
 {
 	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
-	bool remapped = false;
 	u64 nd_pa;
 	void *nd;
 	int tnid;
@@ -211,28 +210,22 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 	       nid, start, end - 1);
 
 	/*
-	 * Allocate node data.  Try remap allocator first, node-local
-	 * memory and then any node.  Never allocate in DMA zone.
+	 * Allocate node data.  Try node-local memory and then any node.
+	 * Never allocate in DMA zone.
 	 */
-	nd = alloc_remap(nid, nd_size);
-	if (nd) {
-		nd_pa = __pa_nodebug(nd);
-		remapped = true;
-	} else {
-		nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
-		if (!nd_pa) {
-			pr_err("Cannot find %zu bytes in node %d\n",
-			       nd_size, nid);
-			return;
-		}
-		nd = __va(nd_pa);
+	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+	if (!nd_pa) {
+		pr_err("Cannot find %zu bytes in node %d\n",
+		       nd_size, nid);
+		return;
 	}
+	nd = __va(nd_pa);
 
 	/* report and initialize */
-	printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
-	       nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+	printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]\n",
+	       nd_pa, nd_pa + nd_size - 1);
 	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
-	if (!remapped && tnid != nid)
+	if (tnid != nid)
 		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
 
 	node_data[nid] = nd;

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2013-01-31 22:41 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-01-31  0:56 [RFC][PATCH] rip out x86_32 NUMA remapping code Dave Hansen
2013-01-31 21:24 ` H. Peter Anvin
2013-01-31 21:51 ` H. Peter Anvin
2013-01-31 21:55   ` Yinghai Lu
2013-01-31 21:59     ` H. Peter Anvin
2013-01-31 22:01       ` Yinghai Lu
2013-01-31 22:03         ` H. Peter Anvin
2013-01-31 22:09           ` Dave Hansen
2013-01-31 22:12             ` H. Peter Anvin
2013-01-31 22:41     ` [tip:x86/mm] x86-32, mm: Remove reference to alloc_remap() tip-bot for H. Peter Anvin
2013-01-31 22:39 ` [tip:x86/mm] x86-32, mm: Rip out x86_32 NUMA remapping code tip-bot for Dave Hansen
2013-01-31 22:40 ` [tip:x86/mm] x86-32, mm: Remove reference to resume_map_numa_kva() tip-bot for H. Peter Anvin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).