linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* RE: hash table sizes
@ 2003-11-26  5:53 Zhang, Yanmin
  0 siblings, 0 replies; 32+ messages in thread
From: Zhang, Yanmin @ 2003-11-26  5:53 UTC (permalink / raw)
  To: linux-kernel

Tcp/ip have the same problem. See function tcp_init/ip_rt_init.

My colleague's testing on IA64 shows the same result as Jes's: the
tcp_establish hash table tries to allocate about 1G of memory! Actually,
it seems it wants to allocate 4G of memory at the beginning!

It's not a good idea to find a smart formula to fit into all scenarios.
We can add new command line parameters to setup the hash table size,
such as dentry_ht_size/inode_ht_size/tcp_estab_ht_size/ip_rt_ht_size. If
users don't input such parameters, then kernel can use Jes's formula to
work out the size. 


^ permalink raw reply	[flat|nested] 32+ messages in thread
* Re: hash table sizes
@ 2003-11-29 10:39 Manfred Spraul
  0 siblings, 0 replies; 32+ messages in thread
From: Manfred Spraul @ 2003-11-29 10:39 UTC (permalink / raw)
  To: Jack Steiner
  Cc: Jes Sorensen, linux-kernel, anton, Martin J. Bligh,
	William Lee Irwin III, Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 627 bytes --]

What about the attached patch?
- add command line overrides for the hash tables sizes. It's impossible 
to guess if the system is used as a HPC or as a file server. Doc update 
missing.
- limit the dcache hash to 8 mio entries, and the inode hash to 1 mio 
entries, regardless of the amount of memory in the system. The system 
admin can override the defaults at boot time if needed.
- distribute the memory allocations that happen during boot to all nodes 
- the memory will be touched by all cpus, binding all allocs to the boot 
node is wrong.

The patch compiles, but is untested due to lack of hardware.

--
    Manfred



[-- Attachment #2: patch-numa --]
[-- Type: text/plain, Size: 3591 bytes --]

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 6
//  SUBLEVEL = 0
//  EXTRAVERSION = -test11
--- 2.6/mm/page_alloc.c	2003-11-29 09:46:35.000000000 +0100
+++ build-2.6/mm/page_alloc.c	2003-11-29 11:34:04.000000000 +0100
@@ -681,6 +681,42 @@
 
 EXPORT_SYMBOL(__alloc_pages);
 
+#ifdef CONFIG_NUMA
+/* Early boot: Everything is done by one cpu, but the data structures will be
+ * used by all cpus - spread them on all nodes.
+ */
+static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order)
+{
+static int nodenr;
+	int i = nodenr;
+	struct page *page;
+
+	for (;;) {
+		if (i > nodenr + numnodes)
+			return 0;
+		if (node_present_pages(i%numnodes)) {
+			struct zone **z;
+			/* The node contains memory. Check that there is 
+			 * memory in the intended zonelist.
+			 */
+			z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones;
+			while (*z) {
+				if ( (*z)->free_pages > (1UL<<order))
+					goto found_node;
+				z++;
+			}
+		}
+		i++;
+	}
+found_node:
+	nodenr = i+1;
+	page = alloc_pages_node(i%numnodes, gfp_mask, order);
+	if (!page)
+		return 0;
+	return (unsigned long) page_address(page);
+}
+#endif
+
 /*
  * Common helper functions.
  */
@@ -688,6 +724,10 @@
 {
 	struct page * page;
 
+#ifdef CONFIG_NUMA
+	if (unlikely(!system_running))
+		return get_boot_pages(gfp_mask, order);
+#endif
 	page = alloc_pages(gfp_mask, order);
 	if (!page)
 		return 0;
--- 2.6/fs/inode.c	2003-11-29 09:46:34.000000000 +0100
+++ build-2.6/fs/inode.c	2003-11-29 10:19:21.000000000 +0100
@@ -1327,6 +1327,20 @@
 		wake_up_all(wq);
 }
 
+static __initdata int ihash_entries;
+
+static int __init set_ihash_entries(char *str)
+{
+	get_option(&str, &ihash_entries);
+	if (ihash_entries <= 0) {
+		ihash_entries = 0;
+		return 0;
+	}
+	return 1;
+}
+
+__setup("ihash_entries=", set_ihash_entries);
+
 /*
  * Initialize the waitqueues and inode hash table.
  */
@@ -1340,8 +1354,16 @@
 	for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
 		init_waitqueue_head(&i_wait_queue_heads[i].wqh);
 
-	mempages >>= (14 - PAGE_SHIFT);
-	mempages *= sizeof(struct hlist_head);
+	if (!ihash_entries) {
+		ihash_entries = mempages >> (14 - PAGE_SHIFT);
+		/* Limit inode hash size. Override for nfs servers
+		 * that handle lots of files.
+		 */
+		if (ihash_entries > 1024*1024)
+			ihash_entries = 1024*1024;
+	}
+
+	mempages = ihash_entries*sizeof(struct hlist_head);
 	for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
 		;
 
--- 2.6/fs/dcache.c	2003-11-29 09:46:34.000000000 +0100
+++ build-2.6/fs/dcache.c	2003-11-29 10:53:15.000000000 +0100
@@ -1546,6 +1546,20 @@
 	return ino;
 }
 
+static __initdata int dhash_entries;
+
+static int __init set_dhash_entries(char *str)
+{
+	get_option(&str, &dhash_entries);
+	if (dhash_entries <= 0) {
+		dhash_entries = 0;
+		return 0;
+	}
+	return 1;
+}
+
+__setup("dhash_entries=", set_dhash_entries);
+
 static void __init dcache_init(unsigned long mempages)
 {
 	struct hlist_head *d;
@@ -1571,10 +1585,18 @@
 	
 	set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
 
+	if (!dhash_entries) {
 #if PAGE_SHIFT < 13
-	mempages >>= (13 - PAGE_SHIFT);
+		mempages >>= (13 - PAGE_SHIFT);
 #endif
-	mempages *= sizeof(struct hlist_head);
+		dhash_entries = mempages;
+		/* 8 mio is enough for general purpose systems.
+		 * For file servers, override with "dhash_entries="
+		 */
+		if (dhash_entries > 8*1024*1024)
+			dhash_entries = 8*1024*1024;
+	}
+	mempages = dhash_entries*sizeof(struct hlist_head);
 	for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
 		;
 

^ permalink raw reply	[flat|nested] 32+ messages in thread
* hash table sizes
@ 2003-11-25 13:35 Jes Sorensen
  2003-11-25 13:42 ` William Lee Irwin III
  2003-11-25 20:48 ` Jack Steiner
  0 siblings, 2 replies; 32+ messages in thread
From: Jes Sorensen @ 2003-11-25 13:35 UTC (permalink / raw)
  To: Alexander Viro
  Cc: Andrew Morton, William Lee Irwin, III, linux-kernel, jbarnes, steiner

Hi,

On NUMA systems with way too much memory, the current algorithms for
determining the size of the inode and dentry hash tables end up trying
to allocate tables that are so big they may not fit within the physical
memory of a single node. I.e. on a 256 node system with 512GB of RAM with
16KB pages, it basically ends up eating up all the memory on one node
before completing a boot because of this. The inode and dentry hashes are
256MB each and the IP routing table hash is 128MB.

I have tried changing the algorithm as below and it seems to produce
reasonable results and almost identical numbers for the smaller /
mid-sized configs I looked at.

This is not meant to be a final patch, any input/opinion is welcome.

Thanks,
Jes

--- orig/linux-2.6.0-test10/fs/dcache.c	Sat Oct 25 11:42:58 2003
+++ linux-2.6.0-test10/fs/dcache.c	Tue Nov 25 05:33:04 2003
@@ -1549,9 +1549,8 @@
 static void __init dcache_init(unsigned long mempages)
 {
 	struct hlist_head *d;
-	unsigned long order;
 	unsigned int nr_hash;
-	int i;
+	int i, order;
 
 	/* 
 	 * A constructor could be added for stable state like the lists,
@@ -1571,12 +1570,17 @@
 	
 	set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
 
+#if 0
 #if PAGE_SHIFT < 13
 	mempages >>= (13 - PAGE_SHIFT);
 #endif
 	mempages *= sizeof(struct hlist_head);
 	for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
 		;
+#endif
+	mempages >>= (23 - (PAGE_SHIFT - 1));
+	order = max(2, fls(mempages));
+	order = min(12, order);
 
 	do {
 		unsigned long tmp;
@@ -1594,7 +1598,7 @@
 			__get_free_pages(GFP_ATOMIC, order);
 	} while (dentry_hashtable == NULL && --order >= 0);
 
-	printk(KERN_INFO "Dentry cache hash table entries: %d (order: %ld, %ld bytes)\n",
+	printk(KERN_INFO "Dentry cache hash table entries: %d (order: %d, %ld bytes)\n",
 			nr_hash, order, (PAGE_SIZE << order));
 
 	if (!dentry_hashtable)
--- orig/linux-2.6.0-test10/fs/inode.c	Sat Oct 25 11:44:53 2003
+++ linux-2.6.0-test10/fs/inode.c	Tue Nov 25 05:33:27 2003
@@ -1333,17 +1333,21 @@
 void __init inode_init(unsigned long mempages)
 {
 	struct hlist_head *head;
-	unsigned long order;
 	unsigned int nr_hash;
-	int i;
+	int i, order;
 
 	for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
 		init_waitqueue_head(&i_wait_queue_heads[i].wqh);
 
+#if 0
 	mempages >>= (14 - PAGE_SHIFT);
 	mempages *= sizeof(struct hlist_head);
 	for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
 		;
+#endif
+	mempages >>= (23 - (PAGE_SHIFT - 1));
+	order = max(2, fls(mempages));
+	order = min(12, order);
 
 	do {
 		unsigned long tmp;

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2003-12-01 22:59 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-11-26  5:53 hash table sizes Zhang, Yanmin
  -- strict thread matches above, loose matches on Subject: below --
2003-11-29 10:39 Manfred Spraul
2003-11-25 13:35 Jes Sorensen
2003-11-25 13:42 ` William Lee Irwin III
2003-11-25 13:54   ` Jes Sorensen
2003-11-25 16:25     ` Thomas Schlichter
2003-11-25 17:52       ` Antonio Vargas
2003-11-25 17:54         ` William Lee Irwin III
2003-11-25 20:48 ` Jack Steiner
2003-11-25 21:07   ` Andrew Morton
2003-11-25 21:14     ` Jesse Barnes
2003-11-25 21:24       ` Andrew Morton
2003-11-26  2:14         ` David S. Miller
2003-11-26  5:27         ` Matt Mackall
2003-11-28 14:15         ` Jes Sorensen
2003-11-28 14:52           ` Jack Steiner
2003-11-28 16:22             ` Jes Sorensen
2003-11-28 19:35               ` Jack Steiner
2003-11-28 21:18                 ` Jörn Engel
2003-12-01  9:46                   ` Jes Sorensen
2003-12-01 21:06     ` Anton Blanchard
2003-12-01 22:57       ` Martin J. Bligh
2003-11-25 21:16   ` Anton Blanchard
2003-11-25 23:11     ` Jack Steiner
2003-11-26  3:39       ` Rik van Riel
2003-11-26  3:59         ` William Lee Irwin III
2003-11-26  4:25           ` Andrew Morton
2003-11-26  4:23             ` William Lee Irwin III
2003-11-26  5:14           ` Martin J. Bligh
2003-11-26  9:51             ` William Lee Irwin III
2003-11-26 16:17               ` Martin J. Bligh
2003-11-26  7:25       ` Anton Blanchard

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).