* boot time node and memory limit options
@ 2004-03-16 17:07 Robert Picco
  2004-03-16 17:34 ` Randy.Dunlap
       [not found] ` <16471.48076.447058.132559@napali.hpl.hp.com>
  0 siblings, 2 replies; 15+ messages in thread
From: Robert Picco @ 2004-03-16 17:07 UTC (permalink / raw)
  To: linux-kernel; +Cc: Robert.Picco, colpatch, mbligh

This patch supports three boot line options.  mem_limit limits the amount of physical memory.
node_mem_limit limits the amount of physical memory per node on a NUMA machine.  nodes_limit
reduces the number of NUMA nodes to the value specified.  On a NUMA machine an eliminated node's 
CPU(s) are removed from the cpu_possible_map.  

The patch has been tested on an IA64 NUMA machine and uniprocessor X86 machine.
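
For illustration, a boot line using all three options (the names are those
registered by the __setup() handlers in the patch; the values here are only
examples) might look like:

	mem_limit=2G		(cap total physical memory at 2GB)
	node_mem_limit=512M	(cap each NUMA node at 512MB)
	nodes_limit=2		(use only the first two NUMA nodes)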

thanks,

Bob


--- linux-2.6.4-orig/mm/page_alloc.c	2004-03-10 21:55:22.000000000 -0500
+++ linux-2.6.4/mm/page_alloc.c	2004-03-15 12:11:35.000000000 -0500
@@ -55,6 +55,43 @@
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
+static unsigned long mem_limit  __initdata = ~0UL;
+static unsigned long total_mem __initdata;
+
+static int __init mem_setup(char *str)
+{
+	char *end;
+
+	mem_limit = memparse(str + 1, &end) - 1;
+	return 1;
+}
+
+__setup("mem_limit", mem_setup);
+
+#ifdef	CONFIG_NUMA
+static unsigned long node_mem_limit __initdata = ~0UL;
+static long node_limit __initdata = MAX_NUMNODES;
+
+static int __init node_mem_setup(char *str)
+{
+	char *end;
+
+	node_mem_limit = memparse(str + 1, &end) - 1;
+	return 1;
+}
+
+static int __init nodes_setup(char *str)
+{
+	node_limit  =  simple_strtol(str+1, NULL, 10);
+	if (!node_limit)
+		node_limit = 1;
+	return 1;
+}
+
+__setup("node_mem_limit", node_mem_setup);
+__setup("nodes_limit", nodes_setup);
+#endif
+
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
@@ -1371,6 +1408,106 @@
 	}
 }
 
+#ifdef	CONFIG_NUMA
+static void __init do_trim_cpu(int node)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		if (cpu_to_node(i) == node)
+			cpu_clear(i, cpu_possible_map);
+	return;
+}
+#endif
+
+static unsigned long __init dma_zone_top(struct pglist_data *pgdat, int *dmazones)
+{
+	unsigned long top;
+#define	DMA_SPAN_MIN	((64*1024*1024) >> PAGE_SHIFT)
+	top = 0UL;
+
+	if (pgdat->node_zones[ZONE_DMA].spanned_pages) {
+		if (*dmazones > 1) 
+			--*dmazones;			
+		else {
+			if (pgdat->node_zones[ZONE_DMA].spanned_pages > DMA_SPAN_MIN)
+				top = DMA_SPAN_MIN + pgdat->node_zones[ZONE_DMA].zone_start_pfn;
+			else
+				top = pgdat->node_zones[ZONE_DMA].zone_start_pfn + 
+					pgdat->node_zones[ZONE_DMA].spanned_pages;
+		}
+	}
+
+	return top;
+}
+
+void __init do_mem_limits(void)
+{
+	unsigned long total, alloc, free, top;
+	struct pglist_data *pgdat;
+	int dmazones;
+
+#ifdef	CONFIG_NUMA
+	if (node_limit == MAX_NUMNODES && node_mem_limit == ~0UL && mem_limit == ~0UL)
+#else
+	if (mem_limit == ~0UL)
+#endif
+		return;
+
+	dmazones = 0;
+	for_each_pgdat(pgdat) 
+		if (pgdat->node_zones[ZONE_DMA].spanned_pages) 
+			dmazones++;
+
+	for_each_pgdat(pgdat) {
+#ifdef	CONFIG_NUMA
+		if (node_limit != MAX_NUMNODES && pgdat->node_id >= node_limit) {
+			top = dma_zone_top(pgdat, &dmazones);
+			bootmem_memory_size(pgdat, &alloc, &total);
+			bootmem_memory_trim(pgdat, total - alloc, top);
+			do_trim_cpu(pgdat->node_id);
+			continue;
+		}
+#endif
+		if (mem_limit != ~0UL) {
+			unsigned long mem;
+
+			bootmem_memory_size(pgdat, &alloc, &total);
+			mem = total << PAGE_SHIFT;
+			if ((mem + total_mem) <= mem_limit) 
+				total_mem += mem;
+			else {
+				free = (mem + total_mem) - mem_limit;
+				total_mem = mem_limit;
+				top = dma_zone_top(pgdat, &dmazones);
+#ifdef	CONFIG_NUMA
+				if (free == mem) 
+					do_trim_cpu(pgdat->node_id);
+#endif
+				free >>= PAGE_SHIFT;
+				bootmem_memory_trim(pgdat, free, top);
+			}
+		}
+#ifdef	CONFIG_NUMA
+		else if (node_mem_limit != ~0UL) {
+			unsigned long mem;
+
+			bootmem_memory_size(pgdat, &alloc, &total);
+			mem = total << PAGE_SHIFT;
+
+			if (mem <= node_mem_limit)
+				continue;
+
+			top = dma_zone_top(pgdat, &dmazones);
+			free = (mem - node_mem_limit) >> PAGE_SHIFT;
+			bootmem_memory_trim(pgdat, free, top);
+		}
+#endif
+	}
+
+	return;
+}
+
 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
 		struct page *node_mem_map, unsigned long *zones_size,
 		unsigned long node_start_pfn, unsigned long *zholes_size)
@@ -1397,6 +1534,7 @@
 
 void __init free_area_init(unsigned long *zones_size)
 {
+	pgdat_list = &contig_page_data;
 	free_area_init_node(0, &contig_page_data, NULL, zones_size,
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 	mem_map = contig_page_data.node_mem_map;
--- linux-2.6.4-orig/mm/bootmem.c	2004-03-10 21:55:24.000000000 -0500
+++ linux-2.6.4/mm/bootmem.c	2004-03-15 13:07:50.000000000 -0500
@@ -384,3 +384,51 @@
 	return NULL;
 }
 
+void __init bootmem_memory_size(pg_data_t *pgdat, unsigned long *alloc, unsigned long *total)
+{
+	unsigned long ralloc, i, idx, v, m, *map;
+	bootmem_data_t *bdata;
+
+	bdata = pgdat->bdata;
+	idx =  bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	*total = idx;
+	map = bdata->node_bootmem_map;
+	for (ralloc = 0, i = 0; i < idx; ) {
+		v = map[i / BITS_PER_LONG];
+		if (v) {
+			for (m = 1; m && i < idx; m <<=  1, i++) 
+				if (v & m)
+					ralloc++;
+		} else 
+			i += BITS_PER_LONG;
+	}
+
+	*alloc = ralloc;
+	return;
+}
+
+void __init bootmem_memory_trim(pg_data_t *pgdat, unsigned long trim, unsigned long top)
+{
+	unsigned long i, t, idx, v, m, *map;
+	bootmem_data_t *bdata;
+
+	bdata = pgdat->bdata;
+	idx =  bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	if (top != 0UL)
+		top -= (bdata->node_boot_start >> PAGE_SHIFT);
+	map = bdata->node_bootmem_map;
+	for (i = idx - 1, t = 0; t < trim && i != 0 && i >= top; ) {
+		v = ~map[i / BITS_PER_LONG];
+		if (v) {
+			for (m = 1UL << (i & (BITS_PER_LONG - 1)); 
+				m && i >= top && i != 0 && t < trim; m >>=  1, i--)
+				if (v & m) {
+					t++;
+					map[i / BITS_PER_LONG] |= m;
+				}
+		} else 
+			i -= min((unsigned long) BITS_PER_LONG, i);
+	}
+
+	return;
+} 
--- linux-2.6.4-orig/init/main.c	2004-03-10 21:55:23.000000000 -0500
+++ linux-2.6.4/init/main.c	2004-03-12 14:45:37.000000000 -0500
@@ -450,6 +450,7 @@
 	}
 #endif
 	page_address_init();
+	do_mem_limits();
 	mem_init();
 	kmem_cache_init();
 	if (late_time_init)
--- linux-2.6.4-orig/include/linux/mm.h	2004-03-10 21:55:21.000000000 -0500
+++ linux-2.6.4/include/linux/mm.h	2004-03-12 14:45:38.000000000 -0500
@@ -517,6 +517,7 @@
 	return pmd_offset(pgd, address);
 }
 
+extern void do_mem_limits(void);
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
 	unsigned long * zones_size, unsigned long zone_start_pfn, 
--- linux-2.6.4-orig/include/linux/bootmem.h	2004-03-10 21:55:44.000000000 -0500
+++ linux-2.6.4/include/linux/bootmem.h	2004-03-12 14:45:38.000000000 -0500
@@ -58,6 +58,9 @@
 extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size);
 extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat);
 extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
+extern void __init bootmem_memory_size(pg_data_t *pgdat, unsigned long *alloc, unsigned long *total);
+extern void __init bootmem_memory_trim(pg_data_t *pgdat, unsigned long trim, unsigned long top);
+
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-16 17:07 boot time node and memory limit options Robert Picco
@ 2004-03-16 17:34 ` Randy.Dunlap
       [not found] ` <16471.48076.447058.132559@napali.hpl.hp.com>
  1 sibling, 0 replies; 15+ messages in thread
From: Randy.Dunlap @ 2004-03-16 17:34 UTC (permalink / raw)
  To: Robert Picco; +Cc: linux-kernel, Robert.Picco, colpatch, mbligh

On Tue, 16 Mar 2004 12:07:44 -0500 Robert Picco wrote:

| This patch supports three boot line options.  mem_limit limits the amount of physical memory.
| node_mem_limit limits the amount of physical memory per node on a NUMA machine.  nodes_limit
| reduces the number of NUMA nodes to the value specified.  On a NUMA machine an eliminated node's 
| CPU(s) are removed from the cpu_possible_map.  
| 
| The patch has been tested on an IA64 NUMA machine and uniprocessor X86 machine.

These kernel boot ("command line") parameters need to be documented
in Documentation/kernel-parameters.txt, please:

| +__setup("mem_limit", mem_setup);
| +__setup("node_mem_limit", node_mem_setup);
| +__setup("nodes_limit", nodes_setup);

--
~Randy

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
       [not found] ` <16471.48076.447058.132559@napali.hpl.hp.com>
@ 2004-03-17 18:07   ` Robert Picco
  0 siblings, 0 replies; 15+ messages in thread
From: Robert Picco @ 2004-03-17 18:07 UTC (permalink / raw)
  To: davidm; +Cc: linux-kernel, colpatch, mbligh

Hi David:

Well our IA64 "mem=" is used in efi_memmap_walk.  We could change the 
name to "max_address=".  The X86 "mem=" takes effect before the bootmem 
allocator is initialized.  My patch eliminates memory before
mem_init frees all bootmap memory.  My proposed patch doesn't have the 
same functionality as X86 "mem=".

thanks,

Bob

David Mosberger wrote:

>Hi Bob,
>
>  
>
>>>>>>On Tue, 16 Mar 2004 12:07:44 -0500, Robert Picco <Robert.Picco@hp.com> said:
>>>>>>            
>>>>>>
>
>  Bob> This patch supports three boot line options.  mem_limit limits
>  Bob> the amount of physical memory.  node_mem_limit limits the
>  Bob> amount of physical memory per node on a NUMA machine.
>  Bob> nodes_limit reduces the number of NUMA nodes to the value
>  Bob> specified.  On a NUMA machine an eliminated node's CPU(s) are
>  Bob> removed from the cpu_possible_map.
>
>  Bob> The patch has been tested on an IA64 NUMA machine and
>  Bob> uniprocessor X86 machine.
>
>Would it make sense to improve on the consistency of the "mem" option
>at the same time.  IIRC, "mem=N" on x86 means "limit amount of memory
>to N", whereas on ia64 it means "ignore memory above N".  In my
>opinion, it would make sense to change the ia64 "mem" to option to
>match the behavior on x86 and then to use "mem_limit=N" for the
>"ignore memory above N" case (which is very useful for testing
>addressing issues, such as I/O MMU issues).
>
>Thanks,
>
>	--david
>
>  
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-17 20:01             ` Robert Picco
@ 2004-03-17 20:58               ` Martin J. Bligh
  0 siblings, 0 replies; 15+ messages in thread
From: Martin J. Bligh @ 2004-03-17 20:58 UTC (permalink / raw)
  To: Robert Picco; +Cc: Jesse Barnes, linux-kernel, colpatch, haveblue

>> Yes ... that's looking very 2.7-ish to reorganise all that stuff. However,
>> for now, I still think we need to restrict memory very early on, before 
>> anything else can allocate bootmem. Are you the absolute first thing that
>> ever runs in the boot allocator?
>> 
>> M.
>>  
>> 
> All the machine-dependent initialization code could have allocated and/or reserved bootmem before the patch would claim additional memory based on boot line parameters.  The patch is called just before mem_init.  There aren't any pages on the freelist yet because mem_init hasn't been called, so I'm not the first thing that ever runs in the boot allocator.  I'm not sure my answer addresses your question.

You are, but it's not the answer I want ;-) If you can allocate stuff out
of bootmem that should have been barred by the limiter, I think that's
a bad idea ... you should be restricting earlier, IMHO.

M.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-17 19:44           ` Martin J. Bligh
  2004-03-17 20:01             ` Robert Picco
@ 2004-03-17 20:52             ` Dave Hansen
  1 sibling, 0 replies; 15+ messages in thread
From: Dave Hansen @ 2004-03-17 20:52 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Robert Picco, Jesse Barnes, Linux Kernel Mailing List, Matthew Dobson

On Wed, 2004-03-17 at 11:44, Martin J. Bligh wrote:
> Yes ... that's looking very 2.7-ish to reorganise all that stuff.
> However, for now, I still think we need to restrict memory very early
> on, before anything else can allocate bootmem. Are you the absolute
> first thing that ever runs in the boot allocator?

I definitely agree with the 2.7 target for what I posted.  We can do it
cleanly in 2.7, but for now, I think the most best solution is to do it
in each architecture.  Partly because it's the way that we already do
mem=, plus I'm not sure the boot allocator code will work with all
architectures, at least ppc64.  

It's probably an oversight in the implementation (of the early ppc64
boot code), but there is some correlation required between things
like lmb_end_of_DRAM() and how much memory is being used by the mm
structures.  I've played with it a bit, and I _think_ that you would be
required to modify the lmb structures, even with Robert's bootmem patch.

I could be wrong, so can somebody test it on a NUMA ppc64 machine?

Also, it may have been discussed before, but does the bootmem patch have
any applicability to the 32-bit NUMA platforms?  It looks like it just
deals with ZONE_DMA.

-- dave


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-17 19:44           ` Martin J. Bligh
@ 2004-03-17 20:01             ` Robert Picco
  2004-03-17 20:58               ` Martin J. Bligh
  2004-03-17 20:52             ` Dave Hansen
  1 sibling, 1 reply; 15+ messages in thread
From: Robert Picco @ 2004-03-17 20:01 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Jesse Barnes, linux-kernel, colpatch, haveblue

Martin J. Bligh wrote:

>>I agree with sizing issues at boot of hash tables.  I've seen them all recover when failing to allocate based
>>on num_physpages and then iterating at smaller allocations until successful.  All the primary initialization allocations recover but probably not all drivers.  You could have similar failure scenarios for any boot line parameter(s) implementation which reduces memory. 
>>    
>>
>>>Don't we have the same arch dependent issue with the current mem= anyway?
>>>Can we come up with something where the arch code calls back into a generic
>>>function to derive limitations, and thereby at least get the parsing done
>>>in a common routine for consistency? There aren't *that* many NUMA arches
>>>to change anyway ...
>>> 
>>>
>>>      
>>>
>>Well this is heading in the direction Dave has proposed and probably 2.7 material.  This would really solve the problem differently than my proposed patch.
>>    
>>
>
>Yes ... that's looking very 2.7-ish to reorganise all that stuff. However,
>for now, I still think we need to restrict memory very early on, before 
>anything else can allocate bootmem. Are you the absolute first thing that
>ever runs in the boot allocator?
>
>M.
>  
>
All the machine-dependent initialization code could have allocated 
and/or reserved bootmem before the patch would claim additional memory 
based on boot line parameters.  The patch is called just before 
mem_init.  There aren't any pages on the freelist yet because mem_init 
hasn't been called, so I'm not the first thing that ever runs in the 
boot allocator.  I'm not sure my answer addresses your question.

Bob

>  
>
>>thanks,
>>
>>Bob
>>
>>    
>>
>>>M.
>>>
>>> 
>>>
>>>      
>>>
>>>>Bob
>>>>Martin J. Bligh wrote:
>>>>
>>>>   
>>>>
>>>>        
>>>>
>>>>>--On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <jbarnes@sgi.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>>     
>>>>>
>>>>>          
>>>>>
>>>>>>On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>>>>>  
>>>>>>
>>>>>>       
>>>>>>
>>>>>>            
>>>>>>
>>>>>>>This patch supports three boot line options.  mem_limit limits the
>>>>>>>amount of physical memory.  node_mem_limit limits the amount of
>>>>>>>physical memory per node on a NUMA machine.  nodes_limit reduces the
>>>>>>>number of NUMA nodes to the value specified.  On a NUMA machine an
>>>>>>>eliminated node's CPU(s) are removed from the cpu_possible_map.  
>>>>>>>
>>>>>>>The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>>>>>machine.
>>>>>>>    
>>>>>>>
>>>>>>>         
>>>>>>>
>>>>>>>              
>>>>>>>
>>>>>>I think this patch will be really useful.  Matt and Martin, does it look
>>>>>>ok to you?  Given that discontiguous support is pretty platform specific
>>>>>>right now, I thought it might be less code if it was done in arch/, but
>>>>>>a platform independent version is awfully nice...
>>>>>>  
>>>>>>
>>>>>>       
>>>>>>
>>>>>>            
>>>>>>
>>>>>I haven't looked at your code yet, but I've had a similar patch in my tree
>>>>>from Dave Hansen for a while you might want to look at:
>>>>>
>>>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>>>>>--- 320-kcg/arch/i386/kernel/numaq.c	2003-10-01 11:47:33.000000000 -0700
>>>>>+++ 330-numa_mem_equals/arch/i386/kernel/numaq.c	2004-03-14 09:54:00.000000000 -0800
>>>>>@@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
>>>>>* function also increments numnodes with the number of nodes (quads)
>>>>>* present.
>>>>>*/
>>>>>+extern unsigned long max_pages_per_node;
>>>>>+extern int limit_mem_per_node;
>>>>>+
>>>>>+#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
>>>>>static void __init smp_dump_qct(void)
>>>>>{
>>>>>	int node;
>>>>>@@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
>>>>>				eq->hi_shrd_mem_start - eq->priv_mem_size);
>>>>>			node_end_pfn[node] = MB_TO_PAGES(
>>>>>				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>>>>>+			if (node_size_pages(node) > max_pages_per_node)
>>>>>+				node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
>>>>>		}
>>>>>	}
>>>>>}
>>>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>>>>>--- 320-kcg/arch/i386/kernel/setup.c	2004-03-11 14:33:36.000000000 -0800
>>>>>+++ 330-numa_mem_equals/arch/i386/kernel/setup.c	2004-03-14 09:54:00.000000000 -0800
>>>>>@@ -142,7 +142,7 @@ static void __init probe_roms(void)
>>>>>	probe_extension_roms(roms);
>>>>>}
>>>>>
>>>>>-static void __init limit_regions(unsigned long long size)
>>>>>+void __init limit_regions(unsigned long long size)
>>>>>{
>>>>>	unsigned long long current_addr = 0;
>>>>>	int i;
>>>>>@@ -478,6 +478,7 @@ static void __init setup_memory_region(v
>>>>>	print_memory_map(who);
>>>>>} /* setup_memory_region */
>>>>>
>>>>>+unsigned long max_pages_per_node = 0xFFFFFFFF; 
>>>>>
>>>>>static void __init parse_cmdline_early (char ** cmdline_p)
>>>>>{
>>>>>@@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
>>>>>				userdef=1;
>>>>>			}
>>>>>		}
>>>>>+		
>>>>>+		if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>>>>>+			unsigned long long node_size_bytes;
>>>>>+			if (to != command_line)
>>>>>+				to--;
>>>>>+			node_size_bytes = memparse(from+8, &from);
>>>>>+			max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>>>>>+		}
>>>>>
>>>>>		if (c == ' ' && !memcmp(from, "memmap=", 7)) {
>>>>>			if (to != command_line)
>>>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>>>>>--- 320-kcg/arch/i386/kernel/srat.c	2003-10-01 11:47:33.000000000 -0700
>>>>>+++ 330-numa_mem_equals/arch/i386/kernel/srat.c	2004-03-14 09:54:01.000000000 -0800
>>>>>@@ -53,6 +53,10 @@ struct node_memory_chunk_s {
>>>>>};
>>>>>static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
>>>>>
>>>>>+#define chunk_start(i)	(node_memory_chunk[i].start_pfn)
>>>>>+#define chunk_end(i)	(node_memory_chunk[i].end_pfn)
>>>>>+#define chunk_size(i) 	(chunk_end(i)-chunk_start(i))
>>>>>+
>>>>>static int num_memory_chunks;		/* total number of memory chunks */
>>>>>static int zholes_size_init;
>>>>>static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>>>>>@@ -198,6 +202,9 @@ static void __init initialize_physnode_m
>>>>>	}
>>>>>}
>>>>>
>>>>>+extern unsigned long max_pages_per_node;
>>>>>+extern int limit_mem_per_node; 
>>>>>+
>>>>>/* Parse the ACPI Static Resource Affinity Table */
>>>>>static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
>>>>>{
>>>>>@@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
>>>>>		       node_memory_chunk[j].start_pfn,
>>>>>		       node_memory_chunk[j].end_pfn);
>>>>>	}
>>>>>- 
>>>>>+
>>>>>	/*calculate node_start_pfn/node_end_pfn arrays*/
>>>>>	for (nid = 0; nid < numnodes; nid++) {
>>>>>-		int been_here_before = 0;
>>>>>+		unsigned long node_present_pages = 0;
>>>>>
>>>>>+		node_start_pfn[nid] = -1;
>>>>>		for (j = 0; j < num_memory_chunks; j++){
>>>>>-			if (node_memory_chunk[j].nid == nid) {
>>>>>-				if (been_here_before == 0) {
>>>>>-					node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>>>>>-					node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>>>-					been_here_before = 1;
>>>>>-				} else { /* We've found another chunk of memory for the node */
>>>>>-					if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>>>>>-						node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>>>-					}
>>>>>-				}
>>>>>-			}
>>>>>+			unsigned long proposed_size;
>>>>>+
>>>>>+			if (node_memory_chunk[j].nid != nid)
>>>>>+				continue;
>>>>>+
>>>>>+			proposed_size = node_present_pages + chunk_size(j);
>>>>>+			if (proposed_size > max_pages_per_node)
>>>>>+				chunk_end(j) = chunk_start(j) +	
>>>>>+					max_pages_per_node - node_present_pages;
>>>>>+			node_present_pages += chunk_size(j);
>>>>>+
>>>>>+			if (node_start_pfn[nid] == -1)
>>>>>+				node_start_pfn[nid] = chunk_start(j);
>>>>>+			node_end_pfn[nid] = chunk_end(j);
>>>>>		}
>>>>>	}
>>>>>	return 1;
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>     
>>>>>
>>>>>          
>>>>>
>>>>   
>>>>
>>>>        
>>>>
>>>
>>> 
>>>
>>>      
>>>
>>    
>>
>
>
>  
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-17 19:30         ` Robert Picco
@ 2004-03-17 19:44           ` Martin J. Bligh
  2004-03-17 20:01             ` Robert Picco
  2004-03-17 20:52             ` Dave Hansen
  0 siblings, 2 replies; 15+ messages in thread
From: Martin J. Bligh @ 2004-03-17 19:44 UTC (permalink / raw)
  To: Robert Picco; +Cc: Jesse Barnes, linux-kernel, colpatch, haveblue

> I agree with sizing issues at boot of hash tables.  I've seen them all recover when failing to allocate based
> on num_physpages and then iterating at smaller allocations until successful.  All the primary initialization allocations recover but probably not all drivers.  You could have similar failure scenarios for any boot line parameter(s) implementation which reduces memory. 
>> Don't we have the same arch dependent issue with the current mem= anyway?
>> Can we come up with something where the arch code calls back into a generic
>> function to derive limitations, and thereby at least get the parsing done
>> in a common routine for consistency? There aren't *that* many NUMA arches
>> to change anyway ...
>>  
>> 
> Well this is heading in the direction Dave has proposed and probably 2.7 material.  This would really solve the problem differently than my proposed patch.

Yes ... that's looking very 2.7-ish to reorganise all that stuff. However,
for now, I still think we need to restrict memory very early on, before 
anything else can allocate bootmem. Are you the absolute first thing that
ever runs in the boot allocator?

M.

> thanks,
> 
> Bob
> 
>> M.
>> 
>>  
>> 
>>> Bob
>>> Martin J. Bligh wrote:
>>> 
>>>    
>>> 
>>>> --On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <jbarnes@sgi.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>      
>>>> 
>>>>> On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>>>>   
>>>>> 
>>>>>        
>>>>> 
>>>>>> This patch supports three boot line options.  mem_limit limits the
>>>>>> amount of physical memory.  node_mem_limit limits the amount of
>>>>>> physical memory per node on a NUMA machine.  nodes_limit reduces the
>>>>>> number of NUMA nodes to the value specified.  On a NUMA machine an
>>>>>> eliminated node's CPU(s) are removed from the cpu_possible_map.  
>>>>>> 
>>>>>> The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>>>> machine.
>>>>>>     
>>>>>> 
>>>>>>          
>>>>>> 
>>>>> I think this patch will be really useful.  Matt and Martin, does it look
>>>>> ok to you?  Given that discontiguous support is pretty platform specific
>>>>> right now, I thought it might be less code if it was done in arch/, but
>>>>> a platform independent version is awfully nice...
>>>>>   
>>>>> 
>>>>>        
>>>>> 
>>>> I haven't looked at your code yet, but I've had a similar patch in my tree
>>>> from Dave Hansen for a while you might want to look at:
>>>> 
>>>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>>>> --- 320-kcg/arch/i386/kernel/numaq.c	2003-10-01 11:47:33.000000000 -0700
>>>> +++ 330-numa_mem_equals/arch/i386/kernel/numaq.c	2004-03-14 09:54:00.000000000 -0800
>>>> @@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
>>>> * function also increments numnodes with the number of nodes (quads)
>>>> * present.
>>>> */
>>>> +extern unsigned long max_pages_per_node;
>>>> +extern int limit_mem_per_node;
>>>> +
>>>> +#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
>>>> static void __init smp_dump_qct(void)
>>>> {
>>>>	int node;
>>>> @@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
>>>>				eq->hi_shrd_mem_start - eq->priv_mem_size);
>>>>			node_end_pfn[node] = MB_TO_PAGES(
>>>>				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>>>> +			if (node_size_pages(node) > max_pages_per_node)
>>>> +				node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
>>>>		}
>>>>	}
>>>> }
>>>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>>>> --- 320-kcg/arch/i386/kernel/setup.c	2004-03-11 14:33:36.000000000 -0800
>>>> +++ 330-numa_mem_equals/arch/i386/kernel/setup.c	2004-03-14 09:54:00.000000000 -0800
>>>> @@ -142,7 +142,7 @@ static void __init probe_roms(void)
>>>>	probe_extension_roms(roms);
>>>> }
>>>> 
>>>> -static void __init limit_regions(unsigned long long size)
>>>> +void __init limit_regions(unsigned long long size)
>>>> {
>>>>	unsigned long long current_addr = 0;
>>>>	int i;
>>>> @@ -478,6 +478,7 @@ static void __init setup_memory_region(v
>>>>	print_memory_map(who);
>>>> } /* setup_memory_region */
>>>> 
>>>> +unsigned long max_pages_per_node = 0xFFFFFFFF; 
>>>> 
>>>> static void __init parse_cmdline_early (char ** cmdline_p)
>>>> {
>>>> @@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
>>>>				userdef=1;
>>>>			}
>>>>		}
>>>> +		
>>>> +		if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>>>> +			unsigned long long node_size_bytes;
>>>> +			if (to != command_line)
>>>> +				to--;
>>>> +			node_size_bytes = memparse(from+8, &from);
>>>> +			max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>>>> +		}
>>>> 
>>>>		if (c == ' ' && !memcmp(from, "memmap=", 7)) {
>>>>			if (to != command_line)
>>>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>>>> --- 320-kcg/arch/i386/kernel/srat.c	2003-10-01 11:47:33.000000000 -0700
>>>> +++ 330-numa_mem_equals/arch/i386/kernel/srat.c	2004-03-14 09:54:01.000000000 -0800
>>>> @@ -53,6 +53,10 @@ struct node_memory_chunk_s {
>>>> };
>>>> static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
>>>> 
>>>> +#define chunk_start(i)	(node_memory_chunk[i].start_pfn)
>>>> +#define chunk_end(i)	(node_memory_chunk[i].end_pfn)
>>>> +#define chunk_size(i) 	(chunk_end(i)-chunk_start(i))
>>>> +
>>>> static int num_memory_chunks;		/* total number of memory chunks */
>>>> static int zholes_size_init;
>>>> static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>>>> @@ -198,6 +202,9 @@ static void __init initialize_physnode_m
>>>>	}
>>>> }
>>>> 
>>>> +extern unsigned long max_pages_per_node;
>>>> +extern int limit_mem_per_node; 
>>>> +
>>>> /* Parse the ACPI Static Resource Affinity Table */
>>>> static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
>>>> {
>>>> @@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
>>>>		       node_memory_chunk[j].start_pfn,
>>>>		       node_memory_chunk[j].end_pfn);
>>>>	}
>>>> - 
>>>> +
>>>>	/*calculate node_start_pfn/node_end_pfn arrays*/
>>>>	for (nid = 0; nid < numnodes; nid++) {
>>>> -		int been_here_before = 0;
>>>> +		unsigned long node_present_pages = 0;
>>>> 
>>>> +		node_start_pfn[nid] = -1;
>>>>		for (j = 0; j < num_memory_chunks; j++){
>>>> -			if (node_memory_chunk[j].nid == nid) {
>>>> -				if (been_here_before == 0) {
>>>> -					node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>>>> -					node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>> -					been_here_before = 1;
>>>> -				} else { /* We've found another chunk of memory for the node */
>>>> -					if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>>>> -						node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>> -					}
>>>> -				}
>>>> -			}
>>>> +			unsigned long proposed_size;
>>>> +
>>>> +			if (node_memory_chunk[j].nid != nid)
>>>> +				continue;
>>>> +
>>>> +			proposed_size = node_present_pages + chunk_size(j);
>>>> +			if (proposed_size > max_pages_per_node)
>>>> +				chunk_end(j) = chunk_start(j) +	
>>>> +					max_pages_per_node - node_present_pages;
>>>> +			node_present_pages += chunk_size(j);
>>>> +
>>>> +			if (node_start_pfn[nid] == -1)
>>>> +				node_start_pfn[nid] = chunk_start(j);
>>>> +			node_end_pfn[nid] = chunk_end(j);
>>>>		}
>>>>	}
>>>>	return 1;
>>>> 
>>>> 
>>>> 
>>>> 
>>>>      
>>>> 
>>>    
>>> 
>> 
>> 
>> 
>>  
>> 
> 
> 



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-17 16:36       ` Martin J. Bligh
  2004-03-17 17:09         ` Dave Hansen
@ 2004-03-17 19:30         ` Robert Picco
  2004-03-17 19:44           ` Martin J. Bligh
  1 sibling, 1 reply; 15+ messages in thread
From: Robert Picco @ 2004-03-17 19:30 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Jesse Barnes, linux-kernel, colpatch, haveblue

Martin J. Bligh wrote:

>>I did something like this before my posted patch in the IA64 
>>ACPI NUMA memory initialization code.  It wasn't posted or even 
>>reviewed by peers.  Your patch below basically trims the NUMA node 
>>memory information before the X86 discontig code calls the bootmem 
>>initialization routines.  The problem with coming up with a solution 
>>at this level is each (at least ones I've looked at) architecture 
>>handles low level memory initialization differently and there needs 
>>to be a common way to parse early boot arguments.
>>
>>The patch I posted was arrived at after some people suggested an 
>>architecture independent patch.  My patch basically allocates memory 
>>from the bootmem allocator before mem_init calls free_all_bootmem_core.  
>>It's architecture independent.  If the real goal is to limit physical 
>>memory before the bootmem allocator is initialized, then my current 
>>patch doesn't accomplish this. 
>>    
>>
>
>Mmmm. That does worry me somewhat, as it's possible to allocate large
>amounts of bootmem for hash tables, etc, IIRC. I think that's too late
>to restrict things accurately. The fact that we only have bootmem on
>node 0 on ia32 isn't going to help matters either ;-)
>  
>
I agree with sizing issues at boot of hash tables.  I've seen them all 
recover when failing to allocate based
on num_physpages and then iterating at smaller allocations until 
successful.  All the primary initialization allocations recover but 
probably not all drivers.  You could have similar failure scenarios 
for any boot line parameter(s) implementation which reduces memory. 
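
(The recovery pattern described above is roughly the following shape, as
used by several of the boot-time hash table setups; this is only a sketch
with a made-up starting order, not the actual code:)

	unsigned long table = 0;
	int order = 13;		/* in reality derived from num_physpages */

	do {
		table = __get_free_pages(GFP_ATOMIC, order);
	} while (!table && --order >= 0);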

>Don't we have the same arch dependent issue with the current mem= anyway?
>Can we come up with something where the arch code calls back into a generic
>function to derive limitations, and thereby at least get the parsing done
>in a common routine for consistency? There aren't *that* many NUMA arches
>to change anyway ...
>  
>
Well this is heading in the direction Dave has proposed and probably 2.7 
material.  This would really solve the problem differently than my 
proposed patch.

thanks,

Bob

>M.
>
>  
>
>>Bob
>>Martin J. Bligh wrote:
>>
>>    
>>
>>>--On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <jbarnes@sgi.com> wrote:
>>>
>>> 
>>>
>>>      
>>>
>>>>On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>>>   
>>>>
>>>>        
>>>>
>>>>>This patch supports three boot line options.  mem_limit limits the
>>>>>amount of physical memory.  node_mem_limit limits the amount of
>>>>>physical memory per node on a NUMA machine.  nodes_limit reduces the
>>>>>number of NUMA nodes to the value specified.  On a NUMA machine an
>>>>>eliminated node's CPU(s) are removed from the cpu_possible_map.  
>>>>>
>>>>>The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>>>machine.
>>>>>     
>>>>>
>>>>>          
>>>>>
>>>>I think this patch will be really useful.  Matt and Martin, does it look
>>>>ok to you?  Given that discontiguous support is pretty platform specific
>>>>right now, I thought it might be less code if it was done in arch/, but
>>>>a platform independent version is awfully nice...
>>>>   
>>>>
>>>>        
>>>>
>>>I haven't looked at your code yet, but I've had a similar patch in my tree
>>>from Dave Hansen for a while you might want to look at:
>>>
>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>>>--- 320-kcg/arch/i386/kernel/numaq.c	2003-10-01 11:47:33.000000000 -0700
>>>+++ 330-numa_mem_equals/arch/i386/kernel/numaq.c	2004-03-14 09:54:00.000000000 -0800
>>>@@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
>>> * function also increments numnodes with the number of nodes (quads)
>>> * present.
>>> */
>>>+extern unsigned long max_pages_per_node;
>>>+extern int limit_mem_per_node;
>>>+
>>>+#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
>>>static void __init smp_dump_qct(void)
>>>{
>>>	int node;
>>>@@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
>>>				eq->hi_shrd_mem_start - eq->priv_mem_size);
>>>			node_end_pfn[node] = MB_TO_PAGES(
>>>				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>>>+			if (node_size_pages(node) > max_pages_per_node)
>>>+				node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
>>>		}
>>>	}
>>>}
>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>>>--- 320-kcg/arch/i386/kernel/setup.c	2004-03-11 14:33:36.000000000 -0800
>>>+++ 330-numa_mem_equals/arch/i386/kernel/setup.c	2004-03-14 09:54:00.000000000 -0800
>>>@@ -142,7 +142,7 @@ static void __init probe_roms(void)
>>>	probe_extension_roms(roms);
>>>}
>>>
>>>-static void __init limit_regions(unsigned long long size)
>>>+void __init limit_regions(unsigned long long size)
>>>{
>>>	unsigned long long current_addr = 0;
>>>	int i;
>>>@@ -478,6 +478,7 @@ static void __init setup_memory_region(v
>>>	print_memory_map(who);
>>>} /* setup_memory_region */
>>>
>>>+unsigned long max_pages_per_node = 0xFFFFFFFF; 
>>>
>>>static void __init parse_cmdline_early (char ** cmdline_p)
>>>{
>>>@@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
>>>				userdef=1;
>>>			}
>>>		}
>>>+		
>>>+		if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>>>+			unsigned long long node_size_bytes;
>>>+			if (to != command_line)
>>>+				to--;
>>>+			node_size_bytes = memparse(from+8, &from);
>>>+			max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>>>+		}
>>>
>>>		if (c == ' ' && !memcmp(from, "memmap=", 7)) {
>>>			if (to != command_line)
>>>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>>>--- 320-kcg/arch/i386/kernel/srat.c	2003-10-01 11:47:33.000000000 -0700
>>>+++ 330-numa_mem_equals/arch/i386/kernel/srat.c	2004-03-14 09:54:01.000000000 -0800
>>>@@ -53,6 +53,10 @@ struct node_memory_chunk_s {
>>>};
>>>static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
>>>
>>>+#define chunk_start(i)	(node_memory_chunk[i].start_pfn)
>>>+#define chunk_end(i)	(node_memory_chunk[i].end_pfn)
>>>+#define chunk_size(i) 	(chunk_end(i)-chunk_start(i))
>>>+
>>>static int num_memory_chunks;		/* total number of memory chunks */
>>>static int zholes_size_init;
>>>static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>>>@@ -198,6 +202,9 @@ static void __init initialize_physnode_m
>>>	}
>>>}
>>>
>>>+extern unsigned long max_pages_per_node;
>>>+extern int limit_mem_per_node; 
>>>+
>>>/* Parse the ACPI Static Resource Affinity Table */
>>>static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
>>>{
>>>@@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
>>>		       node_memory_chunk[j].start_pfn,
>>>		       node_memory_chunk[j].end_pfn);
>>>	}
>>>- 
>>>+
>>>	/*calculate node_start_pfn/node_end_pfn arrays*/
>>>	for (nid = 0; nid < numnodes; nid++) {
>>>-		int been_here_before = 0;
>>>+		unsigned long node_present_pages = 0;
>>>
>>>+		node_start_pfn[nid] = -1;
>>>		for (j = 0; j < num_memory_chunks; j++){
>>>-			if (node_memory_chunk[j].nid == nid) {
>>>-				if (been_here_before == 0) {
>>>-					node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>>>-					node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>-					been_here_before = 1;
>>>-				} else { /* We've found another chunk of memory for the node */
>>>-					if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>>>-						node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>>>-					}
>>>-				}
>>>-			}
>>>+			unsigned long proposed_size;
>>>+
>>>+			if (node_memory_chunk[j].nid != nid)
>>>+				continue;
>>>+
>>>+			proposed_size = node_present_pages + chunk_size(j);
>>>+			if (proposed_size > max_pages_per_node)
>>>+				chunk_end(j) = chunk_start(j) +	
>>>+					max_pages_per_node - node_present_pages;
>>>+			node_present_pages += chunk_size(j);
>>>+
>>>+			if (node_start_pfn[nid] == -1)
>>>+				node_start_pfn[nid] = chunk_start(j);
>>>+			node_end_pfn[nid] = chunk_end(j);
>>>		}
>>>	}
>>>	return 1;
>>>
>>>
>>> 
>>>
>>>      
>>>
>>    
>>
>
>
>
>  
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-17 17:51           ` Jesse Barnes
@ 2004-03-17 18:12             ` Dave Hansen
  0 siblings, 0 replies; 15+ messages in thread
From: Dave Hansen @ 2004-03-17 18:12 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Martin J. Bligh, Robert Picco, Linux Kernel Mailing List, Matthew Dobson

On Wed, 2004-03-17 at 09:51, Jesse Barnes wrote:
> In some cases (ia64 for example) there are additional restrictions on
> each memory chunk.  For example, the EFI memory map may describe a
> contiguous chunk of memory 28MB in size, but if your kernel page size
> was set to 64MB, you'd have to throw it away as unusable.  Should that
> be dealt with in the arch independent code (i.e. is similar stuff done
> on other platforms?) or is it best to only add sections that are usable?

I was really hoping that this mechanism can be as stupid about what it
contains as possible.  It's _just_ there to store the memory layout, and
wouldn't decide or implement policy for the architecture.

The "runt" section of memory should be added to the structures and
tracked.  If, for some random reason, another 36MB of contiguous memory
got added to it later, you could start to think about coalescing it with
the runt from before.

The place to ignore the runt is in your architecture code that sets up
the page tables.  Your arch code would, of course, be reading from this
layout code.

> > What I'd like to do is present a standard way for all of these
> > architectures to store the information that they need to record at boot
> > time, plus make something flexible enough that we can use it for stuff
> > at runtime when hotplug memory is involved.
> 
> That would be great, what you have below seems sensible.

Mostly sensible.  I definitely need to make sure that it can cover all
the cases.  The "section" terminology should probably be removed so that
we can use it for CONFIG_NONLINEAR, and we need to think about what
happens when conflicting sections are added.  For instance, it might be
valid to add RAM from 0-4GB, then reserve 3.75-4GB later on for PCI
space.  Also, the code currently leaves "undefined" sections instead of
creating holes.  That can be dealt with later. 
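
(In terms of the layout.c prototype posted elsewhere in this thread, that
case would be expressed roughly as below; whether the overlap is handled
correctly is exactly the open question:)

	/* 0-4GB of RAM, then carve the top 256MB back out for PCI space */
	add_phys_section(0x000000000ULL, 0x100000000ULL, PHYS_SECTION_RAM);
	add_phys_section(0x0F0000000ULL, 0x100000000ULL, PHYS_SECTION_PCI_SPACE);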

Anyway, I'm not too attached to that code, it just realizes an idea that
I have.

> > The code I'd like to see go away from boot-time is anything that deals
> > with arch-specific structures like the e820, functions like
> > lmb_end_of_DRAM(), or any code that deals with zholes.  I'd like to get
> > it to a point where we can do a mostly arch-independent mem=.  
> 
> So what you have here would be only for boot time setup, while
> CONFIG_NONLINEAR would be used in lieu of multiple pgdats per node or a
> virtual memmap in the case of intranode discontiguous memory?

Well, I was hoping that whatever we use at boot-time could stick around
for runtime.  I'd like to get to the point where the interface for
bringing up boot-time memory is the same for hotplugging memory.  (for
2.7, of course)

Just as with the CPU hotplug code, having separate code paths for
hotplug memory is asking for trouble, because the coverage will never be
as high as the generic boot case.  

-- dave


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-17 17:09         ` Dave Hansen
@ 2004-03-17 17:51           ` Jesse Barnes
  2004-03-17 18:12             ` Dave Hansen
  0 siblings, 1 reply; 15+ messages in thread
From: Jesse Barnes @ 2004-03-17 17:51 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Martin J. Bligh, Robert Picco, Linux Kernel Mailing List, Matthew Dobson

On Wed, Mar 17, 2004 at 09:09:45AM -0800, Dave Hansen wrote:
> Every arch has its own way of describing its layout.  Some use "chunks"
> and others like ppc64 use LMB (logical memory blocks).  If each arch was
> willing to store their memory layout information in a generic way, then
> we might have a shot at doing a generic mem= or a NUMA version.  

> 
> I coded this up a few days ago to see if I could replace the x440 SRAT
> chunks with it.  I never got around to actually doing that part, but
> something like this is what we need to do *layout* manipulation in an
> architecture-agnostic way.
> 
> I started coding this before I thought *too* much about it.  What I want
> is a way to get rid of all of the crap that each architecture (and
> subarch) have to store their physical memory layout.  On normal x86 we
> have the e820 and the EFI tables and on Summit/x440, we have yet another
> way to do it.  

In some cases (ia64 for example) there are additional restrictions on
each memory chunk.  For example, the EFI memory map may describe a
contiguous chunk of memory 28MB in size, but if your kernel page size
was set to 64MB, you'd have to throw it away as unusable.  Should that
be dealt with in the arch independent code (i.e. is similar stuff done
on other platforms?) or is it best to only add sections that are usable?
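
(Concretely, the restriction amounts to rounding each chunk inward to the
kernel's mapping granule and discarding it if nothing remains; a sketch
only, with start/end as the chunk's physical bounds and a 64MB granule
picked purely as an example, not the actual ia64 EFI code:)

	unsigned long granule = 64UL << 20;	/* example granule size */
	unsigned long usable_start = (start + granule - 1) & ~(granule - 1);
	unsigned long usable_end = end & ~(granule - 1);

	if (usable_start >= usable_end)
		return;		/* e.g. a 28MB chunk: nothing usable */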

> What I'd like to do is present a standard way for all of these
> architectures to store the information that they need to record at boot
> time, plus make something flexible enough that we can use it for stuff
> at runtime when hotplug memory is involved.

That would be great, what you have below seems sensible.

> The code I'd like to see go away from boot-time is anything that deals
> with arch-specific structures like the e820, functions like
> lmb_end_of_DRAM(), or any code that deals with zholes.  I'd like to get
> it to a point where we can do a mostly arch-independent mem=.  

So what you have here would be only for boot time setup, while
CONFIG_NONLINEAR would be used in lieu of multiple pgdats per node or a
virtual memmap in the case of intranode discontiguous memory?

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-17 16:36       ` Martin J. Bligh
@ 2004-03-17 17:09         ` Dave Hansen
  2004-03-17 17:51           ` Jesse Barnes
  2004-03-17 19:30         ` Robert Picco
  1 sibling, 1 reply; 15+ messages in thread
From: Dave Hansen @ 2004-03-17 17:09 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Robert Picco, Jesse Barnes, Linux Kernel Mailing List, Matthew Dobson

[-- Attachment #1: Type: text/plain, Size: 3177 bytes --]

On Wed, 2004-03-17 at 08:36, Martin J. Bligh wrote:
> > The patch I posted was arrived at after some people suggested an 
> > architecture independent patch.  My patch basically allocates memory 
> > from the bootmem allocator before mem_init calls free_all_bootmem_core.  
> > It's architecture independent.  If the real goal is to limit physical 
> > memory before the bootmem allocator is initialized, then my current 
> > patch doesn't accomplish this. 
>
> Don't we have the same arch dependent issue with the current mem= anyway?
> Can we come up with something where the arch code calls back into a generic
> function to derive limitations, and thereby at least get the parsing done
> in a common routine for consistency? There aren't *that* many NUMA arches
> to change anyway ...

The problem with doing it in generic code is that it has to happen
_after_ the memory layout is discovered.  It's a mess to reconstruct all
of the necessary information about where holes stop and start, at least
from the current information that we store.  Then, you have to go track
down any information that might have "leaked" into the arch code before
you parsed the mem=, which includes all of the {min,max}_{high,low}_pfn
variables.  I prefer to just take care of it at its source where NUMA
information is read out of the hardware.

Every arch has its own way of describing its layout.  Some use "chunks"
and others like ppc64 use LMB (logical memory blocks).  If each arch was
willing to store their memory layout information in a generic way, then
we might have a shot at doing a generic mem= or a NUMA version.  

I coded this up a few days ago to see if I could replace the x440 SRAT
chunks with it.  I never got around to actually doing that part, but
something like this is what we need to do *layout* manipulation in an
architecture-agnostic way.

I started coding this before I thought *too* much about it.  What I want
is a way to get rid of all of the crap that each architecture (and
subarch) uses to store its physical memory layout.  On normal x86 we
have the e820 and the EFI tables and on Summit/x440, we have yet another
way to do it.  

What I'd like to do is present a standard way for all of these
architectures to store the information that they need to record at boot
time, plus make something flexible enough that we can use it for stuff
at runtime when hotplug memory is involved.

The code I'd like to see go away from boot-time is anything that deals
with arch-specific structures like the e820, functions like
lmb_end_of_DRAM(), or any code that deals with zholes.  I'd like to get
it to a point where we can do a mostly arch-independent mem=.  

So, here's a little bit of (now userspace) code that implements a very
simple way to track physical memory areas.

stuff that sucks:
- long type names/indiscriminate use of u64
- "section" is on my brain from CONFIG_NONLINEAR, probably don't want
  to use that name again
- Doesn't coalesce adjacent sections with identical attributes, only
  extends existing ones.
- could sort arrays instead of using lists for speed/space
- can leave "UNDEF" holes
- can't add new sections spanning 2 old ones


-- dave

[-- Attachment #2: layout.c --]
[-- Type: text/x-c, Size: 5504 bytes --]

#include <stdlib.h>
#include "list.h"
typedef unsigned long long u64;


#define	PHYS_SECTION_UNDEF	0
#define	PHYS_SECTION_DEF		~0ul
#define	PHYS_SECTION_RAM		1<<0
#define	PHYS_SECTION_RAM_DISABLED	1<<1
#define	PHYS_SECTION_HOLE		1<<2

#define PHYS_SECTION_ATTR_END	3

/* these would be in per-arch headers */
#define PHYS_SECTION_PCI_SPACE	1<<(PHYS_SECTION_ATTR_END+0)
#define PHYS_SECTION_RESERVED	1<<(PHYS_SECTION_ATTR_END+1)
#define PHYS_SECTION_BAR	1<<(PHYS_SECTION_ATTR_END+2)
/* this would copy any arch-private fields */
#define section_attr_copy_arch(...)	do {} while(0);
struct phys_layout_section_arch {
	u64 numa_proximity;
};
/* end of per-arch headers */

struct phys_layout_section {
	u64 start_phys_addr;
	u64 end_phys_addr;

	unsigned long attributes;
	struct phys_layout_section_arch arch;
	
	struct list_head list;
};

#define MAX_SECTIONS 128
static struct phys_layout_section sections[MAX_SECTIONS];
int sections_used = 0;

void section_attr_copy(struct phys_layout_section *dst, struct phys_layout_section *src)
{
	dst->arch = src->arch;
	section_attr_copy_arch(dst, src);
}

void add_next_section(struct phys_layout_section *new, struct phys_layout_section *head)
{
	list_add(&new->list, &head->list);
}
void add_prev_section(struct phys_layout_section *new, struct phys_layout_section *head)
{
	list_add_tail(&new->list, &head->list);
}

struct phys_layout_section *alloc_phys_section()
{
	return &sections[sections_used++];
}

int section_contains(struct phys_layout_section * section, u64 addr)
{
	return (addr >= section->start_phys_addr &&
		addr < section->end_phys_addr);
}

LIST_HEAD(section_list);

void print_section(int i, struct phys_layout_section *section)
{
	printf("section %d: %016Lx-%016Lx %08lx\n",
		i++, section->start_phys_addr,
		section->end_phys_addr,
		section->attributes);
}
void print_sections(void)
{
	struct list_head *entry = NULL;
	int i=0;
	
	list_for_each(entry, &section_list) {
		struct phys_layout_section *section;
		section = list_entry(entry, struct phys_layout_section, list);
		print_section(i++, section);
	}
	printf("--------------------------------------------\n");
}


struct phys_layout_section *add_phys_section(u64 new_start_addr, u64 new_end_addr, unsigned long attributes)
{
	struct list_head *entry = NULL;
	struct phys_layout_section *section = NULL;
	struct phys_layout_section *new_section;
	struct phys_layout_section *split_section;
	int i=0;
	u64 old_end_addr;
	
	list_for_each(entry, &section_list) {
		section = list_entry(entry, struct phys_layout_section, list);
		if (!section_contains(section, new_start_addr))
			continue;

		/* same attributes, just extend it */
		if (section->attributes == attributes) {
			if (section->end_phys_addr < new_end_addr)
				section->end_phys_addr = new_end_addr;
			break;
		}

		/* new section needed */
		new_section = alloc_phys_section();
		new_section->start_phys_addr = new_start_addr;
		new_section->end_phys_addr = new_end_addr;
		new_section->attributes = attributes;
		
		/* This started in the same place as the old one */
		if (section->start_phys_addr == new_start_addr) {
			/* shift start of the old one up */
			section->start_phys_addr = new_end_addr;
			add_prev_section(new_section, section);
			break;
		}

		/* 
		 * New section started in the middle of the old one.
		 * Truncate the old one, so that it ends at the start
		 * of the new one.
		 */
		old_end_addr = section->end_phys_addr;
		section->end_phys_addr = new_start_addr;
		add_next_section(new_section, section);

		/* New section covered the rest of the old section */
		if (old_end_addr == new_end_addr)
			break;

		/* 
		 * The new section was spanned by the old one, and the old
		 * one had to be split. Another section is needed for the 
		 * remainder of the old area.  Extend the "new_section", so
		 * that the split section can truncate it in the recursion.
		 *
		 * This will only recurse once, and terminate at the break
		 * immediately above.
		 */
		new_section->end_phys_addr = old_end_addr;
		split_section = add_phys_section(new_end_addr, old_end_addr, 
						 section->attributes);
		section_attr_copy(split_section, section);
		break;
	}
	return new_section;
}

u64 get_total_attr_size(unsigned long attr)
{
        struct list_head *entry = NULL;
	u64 result = 0;
                                                                              
	list_for_each(entry, &section_list) {
		struct phys_layout_section *section;
		section = list_entry(entry, struct phys_layout_section, list);

		if (section->attributes & attr)
			result += section->end_phys_addr - section->start_phys_addr;
	}
		
	return result;
}

int main()
{
	int i;
	sections_used = 1;
	sections[0].end_phys_addr = -1;
	sections[0].attributes = PHYS_SECTION_UNDEF;
	INIT_LIST_HEAD(&sections[0].list);
	list_add(&sections[0].list, &section_list);
	print_sections();

	add_phys_section(0x0000, 0x1000, PHYS_SECTION_HOLE);
	add_phys_section(0x1000, 0x2000, PHYS_SECTION_RAM);
	add_phys_section(0x2000, 0x9000, PHYS_SECTION_HOLE);
	add_phys_section(0x3000, 0x6000, PHYS_SECTION_RAM);
	/* disable some ram for mem= */
	add_phys_section(0x5000, 0x6000, PHYS_SECTION_RAM_DISABLED);
	print_sections();
	printf("disabled RAM size: %Ld\n", get_total_attr_size(PHYS_SECTION_RAM_DISABLED));
	printf("         RAM size: %Ld\n", get_total_attr_size(PHYS_SECTION_RAM));
	printf("        hole size: %Ld\n", get_total_attr_size(PHYS_SECTION_HOLE));
	printf("         any size: %Ld\n", get_total_attr_size(PHYS_SECTION_DEF));
}


[-- Attachment #3: list.h --]
[-- Type: text/x-c-header, Size: 18060 bytes --]

#define prefetch(X)	(X)

/*
 * These are non-NULL pointers that will result in page faults
 * under normal circumstances, used to verify that nobody uses
 * non-initialized list entries.
 */
#define LIST_POISON1  ((void *) 0x00100100)
#define LIST_POISON2  ((void *) 0x00200200)

/*
 * Simple doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
	struct list_head name = LIST_HEAD_INIT(name)

#define INIT_LIST_HEAD(ptr) do { \
	(ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)

/*
 * Insert a new entry between two known consecutive entries. 
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_add(struct list_head *new,
			      struct list_head *prev,
			      struct list_head *next)
{
	next->prev = new;
	new->next = next;
	new->prev = prev;
	prev->next = new;
}

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
	__list_add(new, head, head->next);
}

/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
	__list_add(new, head->prev, head);
}

/*
 * Insert a new entry between two known consecutive entries. 
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static __inline__ void __list_add_rcu(struct list_head * new,
	struct list_head * prev,
	struct list_head * next)
{
	new->next = next;
	new->prev = prev;
	smp_wmb();
	next->prev = new;
	prev->next = new;
}

/**
 * list_add_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static __inline__ void list_add_rcu(struct list_head *new, struct list_head *head)
{
	__list_add_rcu(new, head, head->next);
}

/**
 * list_add_tail_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 */
static __inline__ void list_add_tail_rcu(struct list_head *new, struct list_head *head)
{
	__list_add_rcu(new, head->prev, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
	next->prev = prev;
	prev->next = next;
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty on entry does not return true after this, the entry is
 * in an undefined state.
 */
static inline void list_del(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	entry->next = LIST_POISON1;
	entry->prev = LIST_POISON2;
}

/**
 * list_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * Note: list_empty on entry does not return true after this, 
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward 
 * pointers that may still be used for walking the list.
 */
static inline void list_del_rcu(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	entry->prev = LIST_POISON2;
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 */
static inline void list_del_init(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);
	INIT_LIST_HEAD(entry); 
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
        __list_del(list->prev, list->next);
        list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 */
static inline void list_move_tail(struct list_head *list,
				  struct list_head *head)
{
        __list_del(list->prev, list->next);
        list_add_tail(list, head);
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 */
static inline int list_empty(const struct list_head *head)
{
	return head->next == head;
}

/**
 * list_empty_careful - tests whether a list is
 * empty _and_ checks that no other CPU might be
 * in the process of still modifying either member
 *
 * NOTE: using list_empty_careful() without synchronization
 * can only be safe if the only activity that can happen
 * to the list entry is list_del_init(). Eg. it cannot be used
 * if another CPU could re-list_add() it.
 *
 * @head: the list to test.
 */
static inline int list_empty_careful(const struct list_head *head)
{
	struct list_head *next = head->next;
	return (next == head) && (next == head->prev);
}

static inline void __list_splice(struct list_head *list,
				 struct list_head *head)
{
	struct list_head *first = list->next;
	struct list_head *last = list->prev;
	struct list_head *at = head->next;

	first->prev = head;
	head->next = first;

	last->next = at;
	at->prev = last;
}

/**
 * list_splice - join two lists
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 */
static inline void list_splice(struct list_head *list, struct list_head *head)
{
	if (!list_empty(list))
		__list_splice(list, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The list at @list is reinitialised
 */
static inline void list_splice_init(struct list_head *list,
				    struct list_head *head)
{
	if (!list_empty(list)) {
		__list_splice(list, head);
		INIT_LIST_HEAD(list);
	}
}

#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#define container_of(ptr, type, member) ({                      \
	const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
	(type *)( (char *)__mptr - offsetof(type,member) );})


/**
 * list_entry - get the struct for this entry
 * @ptr:	the &struct list_head pointer.
 * @type:	the type of the struct this is embedded in.
 * @member:	the name of the list_struct within the struct.
 */
#define list_entry(ptr, type, member) \
	container_of(ptr, type, member)

/**
 * list_for_each	-	iterate over a list
 * @pos:	the &struct list_head to use as a loop counter.
 * @head:	the head for your list.
 */
#define list_for_each(pos, head) \
	for (pos = (head)->next, prefetch(pos->next); pos != (head); \
        	pos = pos->next, prefetch(pos->next))

/**
 * __list_for_each	-	iterate over a list
 * @pos:	the &struct list_head to use as a loop counter.
 * @head:	the head for your list.
 *
 * This variant differs from list_for_each() in that it's the
 * simplest possible list iteration code, no prefetching is done.
 * Use this for code that knows the list to be very short (empty
 * or 1 entry) most of the time.
 */
#define __list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); pos = pos->next)

/**
 * list_for_each_prev	-	iterate over a list backwards
 * @pos:	the &struct list_head to use as a loop counter.
 * @head:	the head for your list.
 */
#define list_for_each_prev(pos, head) \
	for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
        	pos = pos->prev, prefetch(pos->prev))
        	
/**
 * list_for_each_safe	-	iterate over a list safe against removal of list entry
 * @pos:	the &struct list_head to use as a loop counter.
 * @n:		another &struct list_head to use as temporary storage
 * @head:	the head for your list.
 */
#define list_for_each_safe(pos, n, head) \
	for (pos = (head)->next, n = pos->next; pos != (head); \
		pos = n, n = pos->next)

/**
 * list_for_each_entry	-	iterate over list of given type
 * @pos:	the type * to use as a loop counter.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 */
#define list_for_each_entry(pos, head, member)				\
	for (pos = list_entry((head)->next, typeof(*pos), member),	\
		     prefetch(pos->member.next);			\
	     &pos->member != (head); 					\
	     pos = list_entry(pos->member.next, typeof(*pos), member),	\
		     prefetch(pos->member.next))

/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos:	the type * to use as a loop counter.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 */
#define list_for_each_entry_reverse(pos, head, member)			\
	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
		     prefetch(pos->member.prev);			\
	     &pos->member != (head); 					\
	     pos = list_entry(pos->member.prev, typeof(*pos), member),	\
		     prefetch(pos->member.prev))

/**
 * list_prepare_entry - prepare a pos entry for use as a start point in
 *			list_for_each_entry_continue
 * @pos:	the type * to use as a start point
 * @head:	the head of the list
 * @member:	the name of the list_struct within the struct.
 */
#define list_prepare_entry(pos, head, member) \
	((pos) ? : list_entry(head, typeof(*pos), member))

/**
 * list_for_each_entry_continue -	iterate over list of given type
 *			continuing after existing point
 * @pos:	the type * to use as a loop counter.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 */
#define list_for_each_entry_continue(pos, head, member) 		\
	for (pos = list_entry(pos->member.next, typeof(*pos), member),	\
		     prefetch(pos->member.next);			\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.next, typeof(*pos), member),	\
		     prefetch(pos->member.next))

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:	the type * to use as a loop counter.
 * @n:		another type * to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 */
#define list_for_each_entry_safe(pos, n, head, member)			\
	for (pos = list_entry((head)->next, typeof(*pos), member),	\
		n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head); 					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

/**
 * list_for_each_rcu	-	iterate over an rcu-protected list
 * @pos:	the &struct list_head to use as a loop counter.
 * @head:	the head for your list.
 */
#define list_for_each_rcu(pos, head) \
	for (pos = (head)->next, prefetch(pos->next); pos != (head); \
        	pos = pos->next, ({ smp_read_barrier_depends(); 0;}), prefetch(pos->next))
        	
#define __list_for_each_rcu(pos, head) \
	for (pos = (head)->next; pos != (head); \
        	pos = pos->next, ({ smp_read_barrier_depends(); 0;}))
        	
/**
 * list_for_each_safe_rcu	-	iterate over an rcu-protected list safe
 *					against removal of list entry
 * @pos:	the &struct list_head to use as a loop counter.
 * @n:		another &struct list_head to use as temporary storage
 * @head:	the head for your list.
 */
#define list_for_each_safe_rcu(pos, n, head) \
	for (pos = (head)->next, n = pos->next; pos != (head); \
		pos = n, ({ smp_read_barrier_depends(); 0;}), n = pos->next)

/**
 * list_for_each_entry_rcu	-	iterate over rcu list of given type
 * @pos:	the type * to use as a loop counter.
 * @head:	the head for your list.
 * @member:	the name of the list_struct within the struct.
 */
#define list_for_each_entry_rcu(pos, head, member)			\
	for (pos = list_entry((head)->next, typeof(*pos), member),	\
		     prefetch(pos->member.next);			\
	     &pos->member != (head); 					\
	     pos = list_entry(pos->member.next, typeof(*pos), member),	\
		     ({ smp_read_barrier_depends(); 0;}),		\
		     prefetch(pos->member.next))


/**
 * list_for_each_continue_rcu	-	iterate over an rcu-protected list 
 *			continuing after existing point.
 * @pos:	the &struct list_head to use as a loop counter.
 * @head:	the head for your list.
 */
#define list_for_each_continue_rcu(pos, head) \
	for ((pos) = (pos)->next, prefetch((pos)->next); (pos) != (head); \
        	(pos) = (pos)->next, ({ smp_read_barrier_depends(); 0;}), prefetch((pos)->next))

/* 
 * Double linked lists with a single pointer list head. 
 * Mostly useful for hash tables where the two pointer list head is 
 * too wasteful.
 * You lose the ability to access the tail in O(1).
 */ 

struct hlist_head { 
	struct hlist_node *first; 
}; 

struct hlist_node { 
	struct hlist_node *next, **pprev; 
}; 

#define HLIST_HEAD_INIT { .first = NULL } 
#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) 
#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL)

static __inline__ int hlist_unhashed(const struct hlist_node *h) 
{ 
	return !h->pprev;
} 

static __inline__ int hlist_empty(const struct hlist_head *h) 
{ 
	return !h->first;
} 

static __inline__ void __hlist_del(struct hlist_node *n) 
{
	struct hlist_node *next = n->next;
	struct hlist_node **pprev = n->pprev;
	*pprev = next;  
	if (next) 
		next->pprev = pprev;
}  

static __inline__ void hlist_del(struct hlist_node *n)
{
	__hlist_del(n);
	n->next = LIST_POISON1;
	n->pprev = LIST_POISON2;
}

/**
 * hlist_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 */
static inline void hlist_del_rcu(struct hlist_node *n)
{
	__hlist_del(n);
	n->pprev = LIST_POISON2;
}

static __inline__ void hlist_del_init(struct hlist_node *n) 
{
	if (n->pprev)  {
		__hlist_del(n);
		INIT_HLIST_NODE(n);
	}
}  

#define hlist_del_rcu_init hlist_del_init

static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h) 
{ 
	struct hlist_node *first = h->first;
	n->next = first; 
	if (first) 
		first->pprev = &n->next;
	h->first = n; 
	n->pprev = &h->first; 
} 

static __inline__ void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) 
{ 
	struct hlist_node *first = h->first;
	n->next = first;
	n->pprev = &h->first; 
	smp_wmb();
	if (first) 
		first->pprev = &n->next;
	h->first = n; 
} 

/* next must be != NULL */
static __inline__ void hlist_add_before(struct hlist_node *n, struct hlist_node *next)
{
	n->pprev = next->pprev;
	n->next = next; 
	next->pprev = &n->next; 
	*(n->pprev) = n;
}

static __inline__ void hlist_add_after(struct hlist_node *n,
				       struct hlist_node *next)
{
	next->next	= n->next;
	*(next->pprev)	= n;
	n->next		= next;
}

#define hlist_entry(ptr, type, member) container_of(ptr,type,member)

/* Cannot easily do prefetch unfortunately */
#define hlist_for_each(pos, head) \
	for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \
	     pos = pos->next) 

#define hlist_for_each_safe(pos, n, head) \
	for (pos = (head)->first; n = pos ? pos->next : 0, pos; \
	     pos = n)

/**
 * hlist_for_each_entry	- iterate over list of given type
 * @tpos:	the type * to use as a loop counter.
 * @pos:	the &struct hlist_node to use as a loop counter.
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 */
#define hlist_for_each_entry(tpos, pos, head, member)			 \
	for (pos = (head)->first;					 \
	     pos && ({ prefetch(pos->next); 1;}) &&			 \
		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
	     pos = pos->next)

/**
 * hlist_for_each_entry_continue - iterate over a hlist continuing after existing point
 * @tpos:	the type * to use as a loop counter.
 * @pos:	the &struct hlist_node to use as a loop counter.
 * @member:	the name of the hlist_node within the struct.
 */
#define hlist_for_each_entry_continue(tpos, pos, member)		 \
	for (pos = (pos)->next;						 \
	     pos && ({ prefetch(pos->next); 1;}) &&			 \
		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
	     pos = pos->next)

/**
 * hlist_for_each_entry_from - iterate over a hlist continuing from existing point
 * @tpos:	the type * to use as a loop counter.
 * @pos:	the &struct hlist_node to use as a loop counter.
 * @member:	the name of the hlist_node within the struct.
 */
#define hlist_for_each_entry_from(tpos, pos, member)			 \
	for (; pos && ({ prefetch(pos->next); 1;}) &&			 \
		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
	     pos = pos->next)

/**
 * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @tpos:	the type * to use as a loop counter.
 * @pos:	the &struct hlist_node to use as a loop counter.
 * @n:		another &struct hlist_node to use as temporary storage
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 */
#define hlist_for_each_entry_safe(tpos, pos, n, head, member) 		 \
	for (pos = (head)->first;					 \
	     pos && ({ n = pos->next; 1; }) && 				 \
		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
	     pos = n)
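
For illustration only -- this is not part of the attached list.h -- here is a
minimal sketch of how the hlist primitives above are typically used for a
small hash table.  The structure, table size, and hash below are made up for
the example:

struct my_item {
	int key;
	struct hlist_node node;
};

#define MY_HASH_SIZE 16		/* must be a power of two for the mask below */
static struct hlist_head my_table[MY_HASH_SIZE];

static void my_insert(struct my_item *item)
{
	/* A static table is zero-initialized, i.e. first == NULL for every
	 * bucket, so no explicit INIT_HLIST_HEAD() is needed here. */
	hlist_add_head(&item->node, &my_table[item->key & (MY_HASH_SIZE - 1)]);
}

static struct my_item *my_lookup(int key)
{
	struct hlist_head *bucket = &my_table[key & (MY_HASH_SIZE - 1)];
	struct hlist_node *pos;
	struct my_item *item;

	hlist_for_each_entry(item, pos, bucket, node)
		if (item->key == key)
			return item;
	return NULL;
}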

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-17 16:15     ` Robert Picco
@ 2004-03-17 16:36       ` Martin J. Bligh
  2004-03-17 17:09         ` Dave Hansen
  2004-03-17 19:30         ` Robert Picco
  0 siblings, 2 replies; 15+ messages in thread
From: Martin J. Bligh @ 2004-03-17 16:36 UTC (permalink / raw)
  To: Robert Picco; +Cc: Jesse Barnes, linux-kernel, colpatch, haveblue

> I did something like this, before my posted patch, in the IA64 
> ACPI NUMA memory initialization code.  It wasn't posted or even 
> reviewed by peers.  Your patch below basically trims the NUMA node 
> memory information before the X86 discontig code calls the bootmem 
> initialization routines.  The problem with a solution at this level 
> is that each architecture (at least the ones I've looked at) handles 
> low-level memory initialization differently, and there needs to be a 
> common way to parse early boot arguments.
> 
> The patch I posted came about after some people suggested an 
> architecture-independent approach.  My patch basically allocates memory 
> from the bootmem allocator before mem_init calls free_all_bootmem_core.  
> It's architecture independent.  If the real goal is to limit physical 
> memory before the bootmem allocator is initialized, then my current 
> patch doesn't accomplish this.

Mmmm. That does worry me somewhat, as it's possible to allocate large
amounts of bootmem for hash tables, etc., IIRC. I think that's too late
to restrict things accurately. The fact that we only have bootmem on
node 0 on ia32 isn't going to help matters either ;-)

Don't we have the same arch-dependent issue with the current mem= anyway?
Can we come up with something where the arch code calls back into a generic
function to derive limitations, and thereby at least get the parsing done
in a common routine for consistency? There aren't *that* many NUMA arches
to change anyway ...
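
As a rough sketch of what such a common routine could look like -- the names
and placement below are hypothetical and untested, just to illustrate the
shape of it:

/* Generic code (somewhere like mm/page_alloc.c) parses the option once: */
static unsigned long node_mem_limit __initdata = ~0UL;

static int __init node_mem_limit_setup(char *str)
{
	node_mem_limit = memparse(str, &str);
	return 1;
}
__setup("node_mem_limit=", node_mem_limit_setup);

/* ... and each NUMA arch, while building its node map, clamps with: */
unsigned long __init numa_limit_node_pages(unsigned long spanned_pages)
{
	unsigned long limit_pages = node_mem_limit >> PAGE_SHIFT;

	return spanned_pages < limit_pages ? spanned_pages : limit_pages;
}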

M.

> Bob
> Martin J. Bligh wrote:
> 
>> --On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <jbarnes@sgi.com> wrote:
>> 
>>  
>> 
>>> On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>>    
>>> 
>>>> This patch supports three boot line options.  mem_limit limits the
>>>> amount of physical memory.  node_mem_limit limits the amount of
>>>> physical memory per node on a NUMA machine.  nodes_limit reduces the
>>>> number of NUMA nodes to the value specified.  On a NUMA machine an
>>>> eliminated node's CPU(s) are removed from the cpu_possible_map.  
>>>> 
>>>> The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>> machine.
>>>>      
>>>> 
>>> I think this patch will be really useful.  Matt and Martin, does it look
>>> ok to you?  Given that discontiguous support is pretty platform specific
>>> right now, I thought it might be less code if it was done in arch/, but
>>> a platform independent version is awfully nice...
>>>    
>>> 
>> 
>> I haven't looked at your code yet, but I've had a similar patch in my tree
>> from Dave Hansen for a while you might want to look at:
>> 
>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>> --- 320-kcg/arch/i386/kernel/numaq.c	2003-10-01 11:47:33.000000000 -0700
>> +++ 330-numa_mem_equals/arch/i386/kernel/numaq.c	2004-03-14 09:54:00.000000000 -0800
>> @@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
>>  * function also increments numnodes with the number of nodes (quads)
>>  * present.
>>  */
>> +extern unsigned long max_pages_per_node;
>> +extern int limit_mem_per_node;
>> +
>> +#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
>> static void __init smp_dump_qct(void)
>> {
>> 	int node;
>> @@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
>> 				eq->hi_shrd_mem_start - eq->priv_mem_size);
>> 			node_end_pfn[node] = MB_TO_PAGES(
>> 				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>> +			if (node_size_pages(node) > max_pages_per_node)
>> +				node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
>> 		}
>> 	}
>> }
>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>> --- 320-kcg/arch/i386/kernel/setup.c	2004-03-11 14:33:36.000000000 -0800
>> +++ 330-numa_mem_equals/arch/i386/kernel/setup.c	2004-03-14 09:54:00.000000000 -0800
>> @@ -142,7 +142,7 @@ static void __init probe_roms(void)
>> 	probe_extension_roms(roms);
>> }
>> 
>> -static void __init limit_regions(unsigned long long size)
>> +void __init limit_regions(unsigned long long size)
>> {
>> 	unsigned long long current_addr = 0;
>> 	int i;
>> @@ -478,6 +478,7 @@ static void __init setup_memory_region(v
>> 	print_memory_map(who);
>> } /* setup_memory_region */
>> 
>> +unsigned long max_pages_per_node = 0xFFFFFFFF; 
>> 
>> static void __init parse_cmdline_early (char ** cmdline_p)
>> {
>> @@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
>> 				userdef=1;
>> 			}
>> 		}
>> +		
>> +		if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>> +			unsigned long long node_size_bytes;
>> +			if (to != command_line)
>> +				to--;
>> +			node_size_bytes = memparse(from+8, &from);
>> +			max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>> +		}
>> 
>> 		if (c == ' ' && !memcmp(from, "memmap=", 7)) {
>> 			if (to != command_line)
>> diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>> --- 320-kcg/arch/i386/kernel/srat.c	2003-10-01 11:47:33.000000000 -0700
>> +++ 330-numa_mem_equals/arch/i386/kernel/srat.c	2004-03-14 09:54:01.000000000 -0800
>> @@ -53,6 +53,10 @@ struct node_memory_chunk_s {
>> };
>> static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
>> 
>> +#define chunk_start(i)	(node_memory_chunk[i].start_pfn)
>> +#define chunk_end(i)	(node_memory_chunk[i].end_pfn)
>> +#define chunk_size(i) 	(chunk_end(i)-chunk_start(i))
>> +
>> static int num_memory_chunks;		/* total number of memory chunks */
>> static int zholes_size_init;
>> static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>> @@ -198,6 +202,9 @@ static void __init initialize_physnode_m
>> 	}
>> }
>> 
>> +extern unsigned long max_pages_per_node;
>> +extern int limit_mem_per_node; 
>> +
>> /* Parse the ACPI Static Resource Affinity Table */
>> static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
>> {
>> @@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
>> 		       node_memory_chunk[j].start_pfn,
>> 		       node_memory_chunk[j].end_pfn);
>> 	}
>> - 
>> +
>> 	/*calculate node_start_pfn/node_end_pfn arrays*/
>> 	for (nid = 0; nid < numnodes; nid++) {
>> -		int been_here_before = 0;
>> +		unsigned long node_present_pages = 0;
>> 
>> +		node_start_pfn[nid] = -1;
>> 		for (j = 0; j < num_memory_chunks; j++){
>> -			if (node_memory_chunk[j].nid == nid) {
>> -				if (been_here_before == 0) {
>> -					node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>> -					node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>> -					been_here_before = 1;
>> -				} else { /* We've found another chunk of memory for the node */
>> -					if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>> -						node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>> -					}
>> -				}
>> -			}
>> +			unsigned long proposed_size;
>> +
>> +			if (node_memory_chunk[j].nid != nid)
>> +				continue;
>> +
>> +			proposed_size = node_present_pages + chunk_size(j);
>> +			if (proposed_size > max_pages_per_node)
>> +				chunk_end(j) = chunk_start(j) +	
>> +					max_pages_per_node - node_present_pages;
>> +			node_present_pages += chunk_size(j);
>> +
>> +			if (node_start_pfn[nid] == -1)
>> +				node_start_pfn[nid] = chunk_start(j);
>> +			node_end_pfn[nid] = chunk_end(j);
>> 		}
>> 	}
>> 	return 1;
>> 
>> -
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>> 
>>  
>> 
> 
> 



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-16 19:39   ` Martin J. Bligh
@ 2004-03-17 16:15     ` Robert Picco
  2004-03-17 16:36       ` Martin J. Bligh
  0 siblings, 1 reply; 15+ messages in thread
From: Robert Picco @ 2004-03-17 16:15 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Jesse Barnes, linux-kernel, colpatch, haveblue

Hi Martin:

I did something like this, before my posted patch, in the IA64 ACPI NUMA 
memory initialization code.  It wasn't posted or even reviewed by 
peers.  Your patch below basically trims the NUMA node memory 
information before the X86 discontig code calls the bootmem 
initialization routines.  The problem with a solution at this level is 
that each architecture (at least the ones I've looked at) handles 
low-level memory initialization differently, and there needs to be a 
common way to parse early boot arguments.

The patch I posted came about after some people suggested an 
architecture-independent approach.  My patch basically allocates memory 
from the bootmem allocator before mem_init calls free_all_bootmem_core.  
It's architecture independent.  If the real goal is to limit physical 
memory before the bootmem allocator is initialized, then my current 
patch doesn't accomplish this.
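
One way to get the same effect -- illustrative only, not the posted patch,
with a made-up helper name and using reserve_bootmem_node() rather than
bootmem allocations:

/* Before free_all_bootmem_core() hands a node's pages to the buddy
 * allocator, mark everything above the per-node limit as reserved in
 * that node's bootmem map so it is never freed into the page allocator. */
static void __init trim_node_bootmem(pg_data_t *pgdat, unsigned long keep_pages)
{
	unsigned long start_pfn = pgdat->node_start_pfn + keep_pages;
	unsigned long end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (start_pfn >= end_pfn)
		return;
	reserve_bootmem_node(pgdat, start_pfn << PAGE_SHIFT,
			     (end_pfn - start_pfn) << PAGE_SHIFT);
}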

thanks,

Bob
Martin J. Bligh wrote:

>--On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <jbarnes@sgi.com> wrote:
>
>  
>
>>On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>>    
>>
>>>This patch supports three boot line options.  mem_limit limits the
>>>amount of physical memory.  node_mem_limit limits the amount of
>>>physical memory per node on a NUMA machine.  nodes_limit reduces the
>>>number of NUMA nodes to the value specified.  On a NUMA machine an
>>>eliminated node's CPU(s) are removed from the cpu_possible_map.  
>>>
>>>The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>>>machine.
>>>      
>>>
>>I think this patch will be really useful.  Matt and Martin, does it look
>>ok to you?  Given that discontiguous support is pretty platform specific
>>right now, I thought it might be less code if it was done in arch/, but
>>a platform independent version is awfully nice...
>>    
>>
>
>I haven't looked at your code yet, but I've had a similar patch in my tree
>from Dave Hansen for a while you might want to look at:
>
>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
>--- 320-kcg/arch/i386/kernel/numaq.c	2003-10-01 11:47:33.000000000 -0700
>+++ 330-numa_mem_equals/arch/i386/kernel/numaq.c	2004-03-14 09:54:00.000000000 -0800
>@@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
>  * function also increments numnodes with the number of nodes (quads)
>  * present.
>  */
>+extern unsigned long max_pages_per_node;
>+extern int limit_mem_per_node;
>+
>+#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
> static void __init smp_dump_qct(void)
> {
> 	int node;
>@@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
> 				eq->hi_shrd_mem_start - eq->priv_mem_size);
> 			node_end_pfn[node] = MB_TO_PAGES(
> 				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
>+			if (node_size_pages(node) > max_pages_per_node)
>+				node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
> 		}
> 	}
> }
>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
>--- 320-kcg/arch/i386/kernel/setup.c	2004-03-11 14:33:36.000000000 -0800
>+++ 330-numa_mem_equals/arch/i386/kernel/setup.c	2004-03-14 09:54:00.000000000 -0800
>@@ -142,7 +142,7 @@ static void __init probe_roms(void)
> 	probe_extension_roms(roms);
> }
> 
>-static void __init limit_regions(unsigned long long size)
>+void __init limit_regions(unsigned long long size)
> {
> 	unsigned long long current_addr = 0;
> 	int i;
>@@ -478,6 +478,7 @@ static void __init setup_memory_region(v
> 	print_memory_map(who);
> } /* setup_memory_region */
> 
>+unsigned long max_pages_per_node = 0xFFFFFFFF; 
> 
> static void __init parse_cmdline_early (char ** cmdline_p)
> {
>@@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
> 				userdef=1;
> 			}
> 		}
>+		
>+		if (c == ' ' && !memcmp(from, "memnode=", 8)) {
>+			unsigned long long node_size_bytes;
>+			if (to != command_line)
>+				to--;
>+			node_size_bytes = memparse(from+8, &from);
>+			max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
>+		}
> 
> 		if (c == ' ' && !memcmp(from, "memmap=", 7)) {
> 			if (to != command_line)
>diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
>--- 320-kcg/arch/i386/kernel/srat.c	2003-10-01 11:47:33.000000000 -0700
>+++ 330-numa_mem_equals/arch/i386/kernel/srat.c	2004-03-14 09:54:01.000000000 -0800
>@@ -53,6 +53,10 @@ struct node_memory_chunk_s {
> };
> static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
> 
>+#define chunk_start(i)	(node_memory_chunk[i].start_pfn)
>+#define chunk_end(i)	(node_memory_chunk[i].end_pfn)
>+#define chunk_size(i) 	(chunk_end(i)-chunk_start(i))
>+
> static int num_memory_chunks;		/* total number of memory chunks */
> static int zholes_size_init;
> static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
>@@ -198,6 +202,9 @@ static void __init initialize_physnode_m
> 	}
> }
> 
>+extern unsigned long max_pages_per_node;
>+extern int limit_mem_per_node; 
>+
> /* Parse the ACPI Static Resource Affinity Table */
> static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
> {
>@@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
> 		       node_memory_chunk[j].start_pfn,
> 		       node_memory_chunk[j].end_pfn);
> 	}
>- 
>+
> 	/*calculate node_start_pfn/node_end_pfn arrays*/
> 	for (nid = 0; nid < numnodes; nid++) {
>-		int been_here_before = 0;
>+		unsigned long node_present_pages = 0;
> 
>+		node_start_pfn[nid] = -1;
> 		for (j = 0; j < num_memory_chunks; j++){
>-			if (node_memory_chunk[j].nid == nid) {
>-				if (been_here_before == 0) {
>-					node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
>-					node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>-					been_here_before = 1;
>-				} else { /* We've found another chunk of memory for the node */
>-					if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
>-						node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
>-					}
>-				}
>-			}
>+			unsigned long proposed_size;
>+
>+			if (node_memory_chunk[j].nid != nid)
>+				continue;
>+
>+			proposed_size = node_present_pages + chunk_size(j);
>+			if (proposed_size > max_pages_per_node)
>+				chunk_end(j) = chunk_start(j) +	
>+					max_pages_per_node - node_present_pages;
>+			node_present_pages += chunk_size(j);
>+
>+			if (node_start_pfn[nid] == -1)
>+				node_start_pfn[nid] = chunk_start(j);
>+			node_end_pfn[nid] = chunk_end(j);
> 		}
> 	}
> 	return 1;
>
>-
>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
>Please read the FAQ at  http://www.tux.org/lkml/
>
>  
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
  2004-03-16 17:43 ` Jesse Barnes
@ 2004-03-16 19:39   ` Martin J. Bligh
  2004-03-17 16:15     ` Robert Picco
  0 siblings, 1 reply; 15+ messages in thread
From: Martin J. Bligh @ 2004-03-16 19:39 UTC (permalink / raw)
  To: Jesse Barnes, Robert Picco, linux-kernel; +Cc: colpatch, haveblue

--On Tuesday, March 16, 2004 09:43:29 -0800 Jesse Barnes <jbarnes@sgi.com> wrote:

> On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
>> This patch supports three boot line options.  mem_limit limits the
>> amount of physical memory.  node_mem_limit limits the amount of
>> physical memory per node on a NUMA machine.  nodes_limit reduces the
>> number of NUMA nodes to the value specified.  On a NUMA machine an
>> eliminated node's CPU(s) are removed from the cpu_possible_map.  
>> 
>> The patch has been tested on an IA64 NUMA machine and uniprocessor X86
>> machine.
> 
> I think this patch will be really useful.  Matt and Martin, does it look
> ok to you?  Given that discontiguous support is pretty platform specific
> right now, I thought it might be less code if it was done in arch/, but
> a platform independent version is awfully nice...

I haven't looked at your code yet, but I've had a similar patch in my tree
from Dave Hansen for a while you might want to look at:

diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/numaq.c 330-numa_mem_equals/arch/i386/kernel/numaq.c
--- 320-kcg/arch/i386/kernel/numaq.c	2003-10-01 11:47:33.000000000 -0700
+++ 330-numa_mem_equals/arch/i386/kernel/numaq.c	2004-03-14 09:54:00.000000000 -0800
@@ -42,6 +42,10 @@ extern long node_start_pfn[], node_end_p
  * function also increments numnodes with the number of nodes (quads)
  * present.
  */
+extern unsigned long max_pages_per_node;
+extern int limit_mem_per_node;
+
+#define node_size_pages(n) (node_end_pfn[n] - node_start_pfn[n])
 static void __init smp_dump_qct(void)
 {
 	int node;
@@ -60,6 +64,8 @@ static void __init smp_dump_qct(void)
 				eq->hi_shrd_mem_start - eq->priv_mem_size);
 			node_end_pfn[node] = MB_TO_PAGES(
 				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
+			if (node_size_pages(node) > max_pages_per_node)
+				node_end_pfn[node] = node_start_pfn[node] + max_pages_per_node;
 		}
 	}
 }
diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/setup.c 330-numa_mem_equals/arch/i386/kernel/setup.c
--- 320-kcg/arch/i386/kernel/setup.c	2004-03-11 14:33:36.000000000 -0800
+++ 330-numa_mem_equals/arch/i386/kernel/setup.c	2004-03-14 09:54:00.000000000 -0800
@@ -142,7 +142,7 @@ static void __init probe_roms(void)
 	probe_extension_roms(roms);
 }
 
-static void __init limit_regions(unsigned long long size)
+void __init limit_regions(unsigned long long size)
 {
 	unsigned long long current_addr = 0;
 	int i;
@@ -478,6 +478,7 @@ static void __init setup_memory_region(v
 	print_memory_map(who);
 } /* setup_memory_region */
 
+unsigned long max_pages_per_node = 0xFFFFFFFF; 
 
 static void __init parse_cmdline_early (char ** cmdline_p)
 {
@@ -521,6 +522,14 @@ static void __init parse_cmdline_early (
 				userdef=1;
 			}
 		}
+		
+		if (c == ' ' && !memcmp(from, "memnode=", 8)) {
+			unsigned long long node_size_bytes;
+			if (to != command_line)
+				to--;
+			node_size_bytes = memparse(from+8, &from);
+			max_pages_per_node = node_size_bytes >> PAGE_SHIFT;
+		}
 
 		if (c == ' ' && !memcmp(from, "memmap=", 7)) {
 			if (to != command_line)
diff -purN -X /home/mbligh/.diff.exclude 320-kcg/arch/i386/kernel/srat.c 330-numa_mem_equals/arch/i386/kernel/srat.c
--- 320-kcg/arch/i386/kernel/srat.c	2003-10-01 11:47:33.000000000 -0700
+++ 330-numa_mem_equals/arch/i386/kernel/srat.c	2004-03-14 09:54:01.000000000 -0800
@@ -53,6 +53,10 @@ struct node_memory_chunk_s {
 };
 static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
 
+#define chunk_start(i)	(node_memory_chunk[i].start_pfn)
+#define chunk_end(i)	(node_memory_chunk[i].end_pfn)
+#define chunk_size(i) 	(chunk_end(i)-chunk_start(i))
+
 static int num_memory_chunks;		/* total number of memory chunks */
 static int zholes_size_init;
 static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
@@ -198,6 +202,9 @@ static void __init initialize_physnode_m
 	}
 }
 
+extern unsigned long max_pages_per_node;
+extern int limit_mem_per_node; 
+
 /* Parse the ACPI Static Resource Affinity Table */
 static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 {
@@ -281,23 +288,27 @@ static int __init acpi20_parse_srat(stru
 		       node_memory_chunk[j].start_pfn,
 		       node_memory_chunk[j].end_pfn);
 	}
- 
+
 	/*calculate node_start_pfn/node_end_pfn arrays*/
 	for (nid = 0; nid < numnodes; nid++) {
-		int been_here_before = 0;
+		unsigned long node_present_pages = 0;
 
+		node_start_pfn[nid] = -1;
 		for (j = 0; j < num_memory_chunks; j++){
-			if (node_memory_chunk[j].nid == nid) {
-				if (been_here_before == 0) {
-					node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
-					node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
-					been_here_before = 1;
-				} else { /* We've found another chunk of memory for the node */
-					if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
-						node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
-					}
-				}
-			}
+			unsigned long proposed_size;
+
+			if (node_memory_chunk[j].nid != nid)
+				continue;
+
+			proposed_size = node_present_pages + chunk_size(j);
+			if (proposed_size > max_pages_per_node)
+				chunk_end(j) = chunk_start(j) +	
+					max_pages_per_node - node_present_pages;
+			node_present_pages += chunk_size(j);
+
+			if (node_start_pfn[nid] == -1)
+				node_start_pfn[nid] = chunk_start(j);
+			node_end_pfn[nid] = chunk_end(j);
 		}
 	}
 	return 1;


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: boot time node and memory limit options
       [not found] <4057392A.8000602@hp.com>
@ 2004-03-16 17:43 ` Jesse Barnes
  2004-03-16 19:39   ` Martin J. Bligh
  0 siblings, 1 reply; 15+ messages in thread
From: Jesse Barnes @ 2004-03-16 17:43 UTC (permalink / raw)
  To: Robert Picco, linux-kernel; +Cc: colpatch, mbligh

On Tue, Mar 16, 2004 at 12:28:10PM -0500, Robert Picco wrote:
> This patch supports three boot line options.  mem_limit limits the
> amount of physical memory.  node_mem_limit limits the amount of
> physical memory per node on a NUMA machine.  nodes_limit reduces the
> number of NUMA nodes to the value specified.  On a NUMA machine an
> eliminated node's CPU(s) are removed from the cpu_possible_map.  
> 
> The patch has been tested on an IA64 NUMA machine and uniprocessor X86
> machine.

I think this patch will be really useful.  Matt and Martin, does it look
ok to you?  Given that discontiguous support is pretty platform specific
right now, I thought it might be less code if it was done in arch/, but
a platform independent version is awfully nice...

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2004-03-17 20:58 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-03-16 17:07 boot time node and memory limit options Robert Picco
2004-03-16 17:34 ` Randy.Dunlap
     [not found] ` <16471.48076.447058.132559@napali.hpl.hp.com>
2004-03-17 18:07   ` Robert Picco
     [not found] <4057392A.8000602@hp.com>
2004-03-16 17:43 ` Jesse Barnes
2004-03-16 19:39   ` Martin J. Bligh
2004-03-17 16:15     ` Robert Picco
2004-03-17 16:36       ` Martin J. Bligh
2004-03-17 17:09         ` Dave Hansen
2004-03-17 17:51           ` Jesse Barnes
2004-03-17 18:12             ` Dave Hansen
2004-03-17 19:30         ` Robert Picco
2004-03-17 19:44           ` Martin J. Bligh
2004-03-17 20:01             ` Robert Picco
2004-03-17 20:58               ` Martin J. Bligh
2004-03-17 20:52             ` Dave Hansen
