* [PATCH 1/2] hugepage: Protect region tracking lists with its own spinlock
@ 2011-01-25  3:32 ` Anton Blanchard
  0 siblings, 0 replies; 24+ messages in thread
From: Anton Blanchard @ 2011-01-25  3:32 UTC (permalink / raw)
  To: dwg, mel, akpm, hughd; +Cc: linux-mm, linux-kernel


In preparation for creating a hash of spinlocks to replace the global
hugetlb_instantiation_mutex, protect the region tracking code with
its own spinlock.

Signed-off-by: Anton Blanchard <anton@samba.org> 
---

The old code locked it with either:

	down_write(&mm->mmap_sem);
or
	down_read(&mm->mmap_sem);
	mutex_lock(&hugetlb_instantiation_mutex);

I chose to keep things simple and wrap everything with a single lock.
Do we need the parallelism the old code had in the down_write case?


Index: powerpc.git/mm/hugetlb.c
===================================================================
--- powerpc.git.orig/mm/hugetlb.c	2011-01-07 12:50:52.090440484 +1100
+++ powerpc.git/mm/hugetlb.c	2011-01-07 12:52:03.922704453 +1100
@@ -56,16 +56,6 @@ static DEFINE_SPINLOCK(hugetlb_lock);
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
- *
- * The region data structures are protected by a combination of the mmap_sem
- * and the hugetlb_instantion_mutex.  To access or modify a region the caller
- * must either hold the mmap_sem for write, or the mmap_sem for read and
- * the hugetlb_instantiation mutex:
- *
- * 	down_write(&mm->mmap_sem);
- * or
- * 	down_read(&mm->mmap_sem);
- * 	mutex_lock(&hugetlb_instantiation_mutex);
  */
 struct file_region {
 	struct list_head link;
@@ -73,10 +63,14 @@ struct file_region {
 	long to;
 };
 
+static DEFINE_SPINLOCK(region_lock);
+
 static long region_add(struct list_head *head, long f, long t)
 {
 	struct file_region *rg, *nrg, *trg;
 
+	spin_lock(&region_lock);
+
 	/* Locate the region we are either in or before. */
 	list_for_each_entry(rg, head, link)
 		if (f <= rg->to)
@@ -106,6 +100,7 @@ static long region_add(struct list_head
 	}
 	nrg->from = f;
 	nrg->to = t;
+	spin_unlock(&region_lock);
 	return 0;
 }
 
@@ -114,6 +109,8 @@ static long region_chg(struct list_head
 	struct file_region *rg, *nrg;
 	long chg = 0;
 
+	spin_lock(&region_lock);
+
 	/* Locate the region we are before or in. */
 	list_for_each_entry(rg, head, link)
 		if (f <= rg->to)
@@ -124,14 +121,17 @@ static long region_chg(struct list_head
 	 * size such that we can guarantee to record the reservation. */
 	if (&rg->link == head || t < rg->from) {
 		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-		if (!nrg)
-			return -ENOMEM;
+		if (!nrg) {
+			chg = -ENOMEM;
+			goto out;
+		}
 		nrg->from = f;
 		nrg->to   = f;
 		INIT_LIST_HEAD(&nrg->link);
 		list_add(&nrg->link, rg->link.prev);
 
-		return t - f;
+		chg = t - f;
+		goto out;
 	}
 
 	/* Round our left edge to the current segment if it encloses us. */
@@ -144,7 +144,7 @@ static long region_chg(struct list_head
 		if (&rg->link == head)
 			break;
 		if (rg->from > t)
-			return chg;
+			goto out;
 
 		/* We overlap with this area, if it extends futher than
 		 * us then we must extend ourselves.  Account for its
@@ -155,6 +155,9 @@ static long region_chg(struct list_head
 		}
 		chg -= rg->to - rg->from;
 	}
+out:
+
+	spin_unlock(&region_lock);
 	return chg;
 }
 
@@ -163,12 +166,16 @@ static long region_truncate(struct list_
 	struct file_region *rg, *trg;
 	long chg = 0;
 
+	spin_lock(&region_lock);
+
 	/* Locate the region we are either in or before. */
 	list_for_each_entry(rg, head, link)
 		if (end <= rg->to)
 			break;
-	if (&rg->link == head)
-		return 0;
+	if (&rg->link == head) {
+		chg = 0;
+		goto out;
+	}
 
 	/* If we are in the middle of a region then adjust it. */
 	if (end > rg->from) {
@@ -185,6 +192,9 @@ static long region_truncate(struct list_
 		list_del(&rg->link);
 		kfree(rg);
 	}
+
+out:
+	spin_unlock(&region_lock);
 	return chg;
 }
 
@@ -193,6 +203,8 @@ static long region_count(struct list_hea
 	struct file_region *rg;
 	long chg = 0;
 
+	spin_lock(&region_lock);
+
 	/* Locate each segment we overlap with, and count that overlap. */
 	list_for_each_entry(rg, head, link) {
 		int seg_from;
@@ -209,6 +221,7 @@ static long region_count(struct list_hea
 		chg += seg_to - seg_from;
 	}
 
+	spin_unlock(&region_lock);
 	return chg;
 }
 

* [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path
  2011-01-25  3:32 ` Anton Blanchard
@ 2011-01-25  3:34   ` Anton Blanchard
  -1 siblings, 0 replies; 24+ messages in thread
From: Anton Blanchard @ 2011-01-25  3:34 UTC (permalink / raw)
  To: dwg, mel, akpm, hughd; +Cc: linux-mm, linux-kernel

From: David Gibson <dwg@au1.ibm.com>

At present, the page fault path for hugepages is serialized by a
single mutex.  This is used to avoid spurious out-of-memory conditions
when the hugepage pool is fully utilized (two processes or threads can
race to instantiate the same mapping with the last hugepage from the
pool, the race loser returning VM_FAULT_OOM).  This problem is
specific to hugepages, because it is normal to want to use every
single hugepage in the system - with normal pages we simply assume
there will always be a few spare pages which can be used temporarily
until the race is resolved.

Unfortunately this serialization also means that clearing of hugepages
cannot be parallelized across multiple CPUs, which can lead to very
long process startup times when using large numbers of hugepages.

This patch improves the situation by replacing the single mutex with a
table of mutexes, selected based on a hash of the address_space and
file offset being faulted (or mm and virtual address for MAP_PRIVATE
mappings).


From: Anton Blanchard <anton@samba.org>

Forward ported and made a few changes:

- Use the Jenkins hash to scatter the hash, better than using just the
  low bits.

- Always round num_fault_mutexes to a power of two to avoid an expensive
  modulus in the hash calculation.
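
To illustrate the second point (a standalone userspace sketch, not part of
the patch): once num_fault_mutexes is a power of two, a bucket can be
selected with a mask instead of a divide, since hash & (n - 1) equals
hash % n for any power-of-two n.

	#include <assert.h>

	int main(void)
	{
		unsigned int n = 128;		/* e.g. roundup_pow_of_two(2 * ncpus) */
		unsigned int hash = 0xdeadbeef;	/* any hash value */

		/* mask and modulus pick the same bucket when n is 2^k */
		assert((hash & (n - 1)) == (hash % n));
		return 0;
	}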

I also tested this patch on a 64 thread POWER6 box using a simple parallel
fault testcase:

http://ozlabs.org/~anton/junkcode/parallel_fault.c

Command line options:

parallel_fault <nr_threads> <size in kB> <skip in kB>

First the time taken to fault 48GB of 16MB hugepages:
# time hugectl --heap ./parallel_fault 1 50331648 16384
11.1 seconds

Now the same test with 64 concurrent threads:
# time hugectl --heap ./parallel_fault 64 50331648 16384
8.8 seconds

Hardly any speedup. Finally the 64 concurrent threads test with this patch
applied:
# time hugectl --heap ./parallel_fault 64 50331648 16384
0.7 seconds

We go from 8.8 seconds to 0.7 seconds, an improvement of 12.6x.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: powerpc.git/mm/hugetlb.c
===================================================================
--- powerpc.git.orig/mm/hugetlb.c	2011-01-25 13:20:49.311405902 +1100
+++ powerpc.git/mm/hugetlb.c	2011-01-25 13:45:54.437235053 +1100
@@ -21,6 +21,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/jhash.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -54,6 +55,13 @@ static unsigned long __initdata default_
 static DEFINE_SPINLOCK(hugetlb_lock);
 
 /*
+ * Serializes faults on the same logical page.  This is used to
+ * prevent spurious OOMs when the hugepage pool is fully utilized.
+ */
+static unsigned int num_fault_mutexes;
+static struct mutex *htlb_fault_mutex_table;
+
+/*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
  */
@@ -1764,6 +1772,8 @@ module_exit(hugetlb_exit);
 
 static int __init hugetlb_init(void)
 {
+	int i;
+
 	/* Some platform decide whether they support huge pages at boot
 	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
 	 * there is no such support
@@ -1790,6 +1800,12 @@ static int __init hugetlb_init(void)
 
 	hugetlb_register_all_nodes();
 
+	num_fault_mutexes = roundup_pow_of_two(2 * num_possible_cpus());
+	htlb_fault_mutex_table =
+		kmalloc(num_fault_mutexes * sizeof(struct mutex), GFP_KERNEL);
+	for (i = 0; i < num_fault_mutexes; i++)
+		mutex_init(&htlb_fault_mutex_table[i]);
+
 	return 0;
 }
 module_init(hugetlb_init);
@@ -2616,6 +2632,27 @@ backout_unlocked:
 	goto out;
 }
 
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+			    struct vm_area_struct *vma,
+			    struct address_space *mapping,
+			    unsigned long pagenum, unsigned long address)
+{
+	unsigned long key[2];
+	u32 hash;
+
+	if ((vma->vm_flags & VM_SHARED)) {
+		key[0] = (unsigned long)mapping;
+		key[1] = pagenum;
+	} else {
+		key[0] = (unsigned long)mm;
+		key[1] = address >> huge_page_shift(h);
+	}
+
+	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+
+	return hash & (num_fault_mutexes - 1);
+}
+
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags)
 {
@@ -2624,8 +2661,10 @@ int hugetlb_fault(struct mm_struct *mm,
 	int ret;
 	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
-	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
+	struct address_space *mapping;
+	unsigned long idx;
+	u32 hash;
 
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
@@ -2642,12 +2681,16 @@ int hugetlb_fault(struct mm_struct *mm,
 	if (!ptep)
 		return VM_FAULT_OOM;
 
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, address);
+
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	mutex_lock(&hugetlb_instantiation_mutex);
+	hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
+	mutex_lock(&htlb_fault_mutex_table[hash]);
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, flags);
@@ -2716,7 +2759,7 @@ out_page_table_lock:
 		unlock_page(page);
 
 out_mutex:
-	mutex_unlock(&hugetlb_instantiation_mutex);
+	mutex_unlock(&htlb_fault_mutex_table[hash]);
 
 	return ret;
 }

* Re: [PATCH 1/2] hugepage: Protect region tracking lists with its own spinlock
  2011-01-25  3:32 ` Anton Blanchard
@ 2011-01-25 19:43 ` Eric B Munson
  -1 siblings, 0 replies; 24+ messages in thread
From: Eric B Munson @ 2011-01-25 19:43 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: dwg, mel, akpm, hughd, linux-mm, linux-kernel

On Tue, 25 Jan 2011, Anton Blanchard wrote:

> 
> In preparation for creating a hash of spinlocks to replace the global
> hugetlb_instantiation_mutex, protect the region tracking code with
> its own spinlock.
> 
> Signed-off-by: Anton Blanchard <anton@samba.org> 

Reviewed-by: Eric B Munson <emunson@mgebm.net>

* Re: [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path
  2011-01-25  3:34   ` Anton Blanchard
@ 2011-01-25 19:44   ` Eric B Munson
  -1 siblings, 0 replies; 24+ messages in thread
From: Eric B Munson @ 2011-01-25 19:44 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: dwg, mel, akpm, hughd, linux-mm, linux-kernel

On Tue, 25 Jan 2011, Anton Blanchard wrote:

> From: David Gibson <dwg@au1.ibm.com>
> 
> At present, the page fault path for hugepages is serialized by a
> single mutex.  This is used to avoid spurious out-of-memory conditions
> when the hugepage pool is fully utilized (two processes or threads can
> race to instantiate the same mapping with the last hugepage from the
> pool, the race loser returning VM_FAULT_OOM).  This problem is
> specific to hugepages, because it is normal to want to use every
> single hugepage in the system - with normal pages we simply assume
> there will always be a few spare pages which can be used temporarily
> until the race is resolved.
> 
> Unfortunately this serialization also means that clearing of hugepages
> cannot be parallelized across multiple CPUs, which can lead to very
> long process startup times when using large numbers of hugepages.
> 
> This patch improves the situation by replacing the single mutex with a
> table of mutexes, selected based on a hash of the address_space and
> file offset being faulted (or mm and virtual address for MAP_PRIVATE
> mappings).
> 
> 
> From: Anton Blanchard <anton@samba.org>
> 
> Forward ported and made a few changes:
> 
> - Use the Jenkins hash to scatter the hash, better than using just the
>   low bits.
> 
> - Always round num_fault_mutexes to a power of two to avoid an expensive
>   modulus in the hash calculation.
> 
> I also tested this patch on a 64 thread POWER6 box using a simple parallel
> fault testcase:
> 
> http://ozlabs.org/~anton/junkcode/parallel_fault.c
> 
> Command line options:
> 
> parallel_fault <nr_threads> <size in kB> <skip in kB>
> 
> First the time taken to fault 48GB of 16MB hugepages:
> # time hugectl --heap ./parallel_fault 1 50331648 16384
> 11.1 seconds
> 
> Now the same test with 64 concurrent threads:
> # time hugectl --heap ./parallel_fault 64 50331648 16384
> 8.8 seconds
> 
> Hardly any speedup. Finally the 64 concurrent threads test with this patch
> applied:
> # time hugectl --heap ./parallel_fault 64 50331648 16384
> 0.7 seconds
> 
> We go from 8.8 seconds to 0.7 seconds, an improvement of 12.6x.
> 
> Signed-off-by: David Gibson <dwg@au1.ibm.com>
> Signed-off-by: Anton Blanchard <anton@samba.org>

Reviewed-by: Eric B Munson <emunson@mgebm.net>

* Re: [PATCH 1/2] hugepage: Protect region tracking lists with its own spinlock
  2011-01-25  3:32 ` Anton Blanchard
@ 2011-01-26  9:07   ` Mel Gorman
  -1 siblings, 0 replies; 24+ messages in thread
From: Mel Gorman @ 2011-01-26  9:07 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: dwg, akpm, hughd, linux-mm, linux-kernel

On Tue, Jan 25, 2011 at 02:32:26PM +1100, Anton Blanchard wrote:
> 
> In preparation for creating a hash of spinlocks to replace the global
> hugetlb_instantiation_mutex, protect the region tracking code with
> its own spinlock.
> 
> Signed-off-by: Anton Blanchard <anton@samba.org> 
> ---
> 
> The old code locked it with either:
> 
> 	down_write(&mm->mmap_sem);
> or
> 	down_read(&mm->mmap_sem);
> 	mutex_lock(&hugetlb_instantiation_mutex);
> 
> I chose to keep things simple and wrap everything with a single lock.
> Do we need the parallelism the old code had in the down_write case?
> 
> 
> Index: powerpc.git/mm/hugetlb.c
> ===================================================================
> --- powerpc.git.orig/mm/hugetlb.c	2011-01-07 12:50:52.090440484 +1100
> +++ powerpc.git/mm/hugetlb.c	2011-01-07 12:52:03.922704453 +1100
> @@ -56,16 +56,6 @@ static DEFINE_SPINLOCK(hugetlb_lock);
>  /*
>   * Region tracking -- allows tracking of reservations and instantiated pages
>   *                    across the pages in a mapping.
> - *
> - * The region data structures are protected by a combination of the mmap_sem
> - * and the hugetlb_instantion_mutex.  To access or modify a region the caller
> - * must either hold the mmap_sem for write, or the mmap_sem for read and
> - * the hugetlb_instantiation mutex:
> - *
> - * 	down_write(&mm->mmap_sem);
> - * or
> - * 	down_read(&mm->mmap_sem);
> - * 	mutex_lock(&hugetlb_instantiation_mutex);
>   */
>  struct file_region {
>  	struct list_head link;

A new comment is needed to explain how region_lock is protecting the
region lists.
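
Something along these lines might do as a starting point (a rough sketch
only, exact wording up to you):

	/*
	 * Region tracking -- allows tracking of reservations and instantiated
	 *                    pages across the pages in a mapping.
	 *
	 * The region lists are protected by region_lock, which is taken
	 * internally by the region_*() helpers, so callers no longer need
	 * mmap_sem or the hugetlb_instantiation_mutex for list consistency.
	 */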

Otherwise, nothing jumps out as being bad. It does not appear that
functionality or locking has really changed.  The mutex + mmap_sem
combination is still in place, so the spinlock is presumably redundant
for protecting the region lists until the next patch.

> @@ -73,10 +63,14 @@ struct file_region {
>  	long to;
>  };
>  
> +static DEFINE_SPINLOCK(region_lock);
> +
>  static long region_add(struct list_head *head, long f, long t)
>  {
>  	struct file_region *rg, *nrg, *trg;
>  
> +	spin_lock(&region_lock);
> +
>  	/* Locate the region we are either in or before. */
>  	list_for_each_entry(rg, head, link)
>  		if (f <= rg->to)
> @@ -106,6 +100,7 @@ static long region_add(struct list_head
>  	}
>  	nrg->from = f;
>  	nrg->to = t;
> +	spin_unlock(&region_lock);
>  	return 0;
>  }
>  
> @@ -114,6 +109,8 @@ static long region_chg(struct list_head
>  	struct file_region *rg, *nrg;
>  	long chg = 0;
>  
> +	spin_lock(&region_lock);
> +
>  	/* Locate the region we are before or in. */
>  	list_for_each_entry(rg, head, link)
>  		if (f <= rg->to)
> @@ -124,14 +121,17 @@ static long region_chg(struct list_head
>  	 * size such that we can guarantee to record the reservation. */
>  	if (&rg->link == head || t < rg->from) {
>  		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
> -		if (!nrg)
> -			return -ENOMEM;
> +		if (!nrg) {
> +			chg = -ENOMEM;
> +			goto out;
> +		}
>  		nrg->from = f;
>  		nrg->to   = f;
>  		INIT_LIST_HEAD(&nrg->link);
>  		list_add(&nrg->link, rg->link.prev);
>  
> -		return t - f;
> +		chg = t - f;
> +		goto out;
>  	}
>  
>  	/* Round our left edge to the current segment if it encloses us. */
> @@ -144,7 +144,7 @@ static long region_chg(struct list_head
>  		if (&rg->link == head)
>  			break;
>  		if (rg->from > t)
> -			return chg;
> +			goto out;
>  
>  		/* We overlap with this area, if it extends futher than
>  		 * us then we must extend ourselves.  Account for its
> @@ -155,6 +155,9 @@ static long region_chg(struct list_head
>  		}
>  		chg -= rg->to - rg->from;
>  	}
> +out:
> +
> +	spin_unlock(&region_lock);
>  	return chg;
>  }
>  
> @@ -163,12 +166,16 @@ static long region_truncate(struct list_
>  	struct file_region *rg, *trg;
>  	long chg = 0;
>  
> +	spin_lock(&region_lock);
> +
>  	/* Locate the region we are either in or before. */
>  	list_for_each_entry(rg, head, link)
>  		if (end <= rg->to)
>  			break;
> -	if (&rg->link == head)
> -		return 0;
> +	if (&rg->link == head) {
> +		chg = 0;
> +		goto out;
> +	}
>  
>  	/* If we are in the middle of a region then adjust it. */
>  	if (end > rg->from) {
> @@ -185,6 +192,9 @@ static long region_truncate(struct list_
>  		list_del(&rg->link);
>  		kfree(rg);
>  	}
> +
> +out:
> +	spin_unlock(&region_lock);
>  	return chg;
>  }
>  
> @@ -193,6 +203,8 @@ static long region_count(struct list_hea
>  	struct file_region *rg;
>  	long chg = 0;
>  
> +	spin_lock(&region_lock);
> +
>  	/* Locate each segment we overlap with, and count that overlap. */
>  	list_for_each_entry(rg, head, link) {
>  		int seg_from;
> @@ -209,6 +221,7 @@ static long region_count(struct list_hea
>  		chg += seg_to - seg_from;
>  	}
>  
> +	spin_unlock(&region_lock);
>  	return chg;
>  }
>  
> 

-- 
Mel Gorman
Linux Technology Center
IBM Dublin Software Lab

* Re: [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path
  2011-01-25  3:34   ` Anton Blanchard
@ 2011-01-26  9:24     ` Mel Gorman
  -1 siblings, 0 replies; 24+ messages in thread
From: Mel Gorman @ 2011-01-26  9:24 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: dwg, akpm, hughd, linux-mm, linux-kernel

On Tue, Jan 25, 2011 at 02:34:14PM +1100, Anton Blanchard wrote:
> From: David Gibson <dwg@au1.ibm.com>
> 
> At present, the page fault path for hugepages is serialized by a
> single mutex.  This is used to avoid spurious out-of-memory conditions
> when the hugepage pool is fully utilized (two processes or threads can
> race to instantiate the same mapping with the last hugepage from the
> pool, the race loser returning VM_FAULT_OOM). This problem is
> specific to hugepages, because it is normal to want to use every
> single hugepage in the system - with normal pages we simply assume
> there will always be a few spare pages which can be used temporarily
> until the race is resolved.
> 
> Unfortunately this serialization also means that clearing of hugepages
> cannot be parallelized across multiple CPUs, which can lead to very
> long process startup times when using large numbers of hugepages.
> 
> This patch improves the situation by replacing the single mutex with a
> table of mutexes, selected based on a hash of the address_space and
> file offset being faulted (or mm and virtual address for MAP_PRIVATE
> mappings).
> 
> From: Anton Blanchard <anton@samba.org>
> 
> Forward ported and made a few changes:
> 
> - Use the Jenkins hash to scatter the hash, better than using just the
>   low bits.
> 
> - Always round num_fault_mutexes to a power of two to avoid an expensive
>   modulus in the hash calculation.
> 
> I also tested this patch on a 64 thread POWER6 box using a simple parallel
> fault testcase:
> 
> http://ozlabs.org/~anton/junkcode/parallel_fault.c
> 
> Command line options:
> 
> parallel_fault <nr_threads> <size in kB> <skip in kB>
> 
> First the time taken to fault 48GB of 16MB hugepages:
> # time hugectl --heap ./parallel_fault 1 50331648 16384
> 11.1 seconds
> 
> Now the same test with 64 concurrent threads:
> # time hugectl --heap ./parallel_fault 64 50331648 16384
> 8.8 seconds
> 
> Hardly any speedup. Finally the 64 concurrent threads test with this patch
> applied:
> # time hugectl --heap ./parallel_fault 64 50331648 16384
> 0.7 seconds
> 
> We go from 8.8 seconds to 0.7 seconds, an improvement of 12.6x.
> 
> Signed-off-by: David Gibson <dwg@au1.ibm.com>
> Signed-off-by: Anton Blanchard <anton@samba.org>

I haven't tested this patch yet, but typically I would test it with multiple
parallel instances of 'make func' from libhugetlbfs, looking in particular
for counter corruption. Has something like this been done? I know
hugetlb_lock protects the counters, but the locking in there has turned
into a bit of a mess, so it's easy to miss something.

> ---
> 
> Index: powerpc.git/mm/hugetlb.c
> ===================================================================
> --- powerpc.git.orig/mm/hugetlb.c	2011-01-25 13:20:49.311405902 +1100
> +++ powerpc.git/mm/hugetlb.c	2011-01-25 13:45:54.437235053 +1100
> @@ -21,6 +21,7 @@
>  #include <linux/rmap.h>
>  #include <linux/swap.h>
>  #include <linux/swapops.h>
> +#include <linux/jhash.h>
>  
>  #include <asm/page.h>
>  #include <asm/pgtable.h>
> @@ -54,6 +55,13 @@ static unsigned long __initdata default_
>  static DEFINE_SPINLOCK(hugetlb_lock);
>  
>  /*
> + * Serializes faults on the same logical page.  This is used to
> + * prevent spurious OOMs when the hugepage pool is fully utilized.
> + */
> +static unsigned int num_fault_mutexes;
> +static struct mutex *htlb_fault_mutex_table;
> +
> +/*
>   * Region tracking -- allows tracking of reservations and instantiated pages
>   *                    across the pages in a mapping.
>   */
> @@ -1764,6 +1772,8 @@ module_exit(hugetlb_exit);
>  
>  static int __init hugetlb_init(void)
>  {
> +	int i;
> +
>  	/* Some platform decide whether they support huge pages at boot
>  	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
>  	 * there is no such support
> @@ -1790,6 +1800,12 @@ static int __init hugetlb_init(void)
>  
>  	hugetlb_register_all_nodes();
>  
> +	num_fault_mutexes = roundup_pow_of_two(2 * num_possible_cpus());
> +	htlb_fault_mutex_table =
> +		kmalloc(num_fault_mutexes * sizeof(struct mutex), GFP_KERNEL);

And if this fails? It'd be unusual, I know, but num_possible_cpus() could
conceivably be large enough to prevent kmalloc() from granting the request.
Do you need to do something similar to profile_init() here instead?
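
Something like the following might be one shape for it, as a rough sketch
only (the vmalloc() fallback and the -ENOMEM return are just one option,
untested, and would also need linux/vmalloc.h):

	num_fault_mutexes = roundup_pow_of_two(2 * num_possible_cpus());
	htlb_fault_mutex_table = kmalloc(num_fault_mutexes *
					 sizeof(struct mutex), GFP_KERNEL);
	if (!htlb_fault_mutex_table)
		/* fall back to vmalloc() for very large CPU counts */
		htlb_fault_mutex_table = vmalloc(num_fault_mutexes *
						 sizeof(struct mutex));
	if (!htlb_fault_mutex_table)
		return -ENOMEM;

	for (i = 0; i < num_fault_mutexes; i++)
		mutex_init(&htlb_fault_mutex_table[i]);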

> +	for (i = 0; i < num_fault_mutexes; i++)
> +		mutex_init(&htlb_fault_mutex_table[i]);
> +
>  	return 0;
>  }
>  module_init(hugetlb_init);
> @@ -2616,6 +2632,27 @@ backout_unlocked:
>  	goto out;
>  }
>  
> +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
> +			    struct vm_area_struct *vma,
> +			    struct address_space *mapping,
> +			    unsigned long pagenum, unsigned long address)

pagenum could be anything. Name it idx or index instead, because that makes
it easier to guess it's the result of vma_hugecache_offset().

> +{
> +	unsigned long key[2];
> +	u32 hash;
> +
> +	if ((vma->vm_flags & VM_SHARED)) {
> +		key[0] = (unsigned long)mapping;
> +		key[1] = pagenum;
> +	} else {
> +		key[0] = (unsigned long)mm;
> +		key[1] = address >> huge_page_shift(h);
> +	}
> +
> +	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
> +
> +	return hash & (num_fault_mutexes - 1);
> +}
> +
>  int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  			unsigned long address, unsigned int flags)
>  {
> @@ -2624,8 +2661,10 @@ int hugetlb_fault(struct mm_struct *mm,
>  	int ret;
>  	struct page *page = NULL;
>  	struct page *pagecache_page = NULL;
> -	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
>  	struct hstate *h = hstate_vma(vma);
> +	struct address_space *mapping;
> +	unsigned long idx;
> +	u32 hash;
>  
>  	ptep = huge_pte_offset(mm, address);
>  	if (ptep) {
> @@ -2642,12 +2681,16 @@ int hugetlb_fault(struct mm_struct *mm,
>  	if (!ptep)
>  		return VM_FAULT_OOM;
>  
> +	mapping = vma->vm_file->f_mapping;
> +	idx = vma_hugecache_offset(h, vma, address);
> +
>  	/*
>  	 * Serialize hugepage allocation and instantiation, so that we don't
>  	 * get spurious allocation failures if two CPUs race to instantiate
>  	 * the same page in the page cache.
>  	 */
> -	mutex_lock(&hugetlb_instantiation_mutex);
> +	hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
> +	mutex_lock(&htlb_fault_mutex_table[hash]);
>  	entry = huge_ptep_get(ptep);
>  	if (huge_pte_none(entry)) {
>  		ret = hugetlb_no_page(mm, vma, address, ptep, flags);
> @@ -2716,7 +2759,7 @@ out_page_table_lock:
>  		unlock_page(page);
>  
>  out_mutex:
> -	mutex_unlock(&hugetlb_instantiation_mutex);
> +	mutex_unlock(&htlb_fault_mutex_table[hash]);
>  
>  	return ret;
>  }
> 

I didn't spot anything wrong, but I'd be happier if I knew multiple parallel
runs of "make func" from libhugetlbfs had been done as well.

-- 
Mel Gorman
Linux Technology Center
IBM Dublin Software Lab

* Re: [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path
  2011-01-26  9:24     ` Mel Gorman
@ 2011-07-15  6:06       ` Anton Blanchard
  -1 siblings, 0 replies; 24+ messages in thread
From: Anton Blanchard @ 2011-07-15  6:06 UTC (permalink / raw)
  To: Mel Gorman; +Cc: dwg, akpm, hughd, linux-mm, linux-kernel


Hi Mel,

> I haven't tested this patch yet but typically how I would test it is
> multiple parallel instances of make func from libhugetlbfs. In
> particular I would be looking out for counter corruption. Has
> something like this been done? I know hugetlb_lock protects the
> counters but the locking in there has turned into a bit of a mess so
> it's easy to miss something.

Thanks for the suggestion and sorry for taking so long. Make check has
the same PASS/FAIL count before and after the patches.

I also ran 16 copies of make func on a large box with 896 HW threads.
Some of the tests that use shared memory were a bit upset, but that
seems to be because we use a static key. It seems the tests were also
fighting over the number of huge pages they wanted the system set to.

The load average got up to 13207, and the heap-overflow test consumed all
my memory, a pretty good effort considering I have over 1TB of it.

After things settled down everything was OK, apart from the fact that 20
huge pages were left unaccounted for:

HugePages_Total:   10000
HugePages_Free:     9980
HugePages_Rsvd:        0
HugePages_Surp:        0

I verified there were no shared memory segments, and no files in the
hugetlbfs filesystem (I double checked by unmounting it).

I can't see how this patch set would cause this. It seems like we can
leak huge pages, perhaps in an error path. Anyway, I'll repost the
patch set for comments.

Anton

* [PATCH 1/2] hugepage: Protect region tracking lists with its own spinlock
  2011-07-15  6:06       ` Anton Blanchard
@ 2011-07-15  6:08         ` Anton Blanchard
  -1 siblings, 0 replies; 24+ messages in thread
From: Anton Blanchard @ 2011-07-15  6:08 UTC (permalink / raw)
  To: Mel Gorman, dwg, akpm, hughd; +Cc: linux-mm, linux-kernel


In preparation for creating a hash of spinlocks to replace the global
hugetlb_instantiation_mutex, protect the region tracking code with
its own spinlock.

Signed-off-by: Anton Blanchard <anton@samba.org> 
---

The old code locked it with either:

	down_write(&mm->mmap_sem);
or
	down_read(&mm->mmap_sem);
	mutex_lock(&hugetlb_instantiation_mutex);

I chose to keep things simple and wrap everything with a single lock.
Do we need the parallelism the old code had in the down_write case?


Index: linux-2.6-work/mm/hugetlb.c
===================================================================
--- linux-2.6-work.orig/mm/hugetlb.c	2011-06-06 08:10:13.471407173 +1000
+++ linux-2.6-work/mm/hugetlb.c	2011-06-06 08:10:15.041433948 +1000
@@ -56,16 +56,6 @@ static DEFINE_SPINLOCK(hugetlb_lock);
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
- *
- * The region data structures are protected by a combination of the mmap_sem
- * and the hugetlb_instantion_mutex.  To access or modify a region the caller
- * must either hold the mmap_sem for write, or the mmap_sem for read and
- * the hugetlb_instantiation mutex:
- *
- * 	down_write(&mm->mmap_sem);
- * or
- * 	down_read(&mm->mmap_sem);
- * 	mutex_lock(&hugetlb_instantiation_mutex);
  */
 struct file_region {
 	struct list_head link;
@@ -73,10 +63,14 @@ struct file_region {
 	long to;
 };
 
+static DEFINE_SPINLOCK(region_lock);
+
 static long region_add(struct list_head *head, long f, long t)
 {
 	struct file_region *rg, *nrg, *trg;
 
+	spin_lock(&region_lock);
+
 	/* Locate the region we are either in or before. */
 	list_for_each_entry(rg, head, link)
 		if (f <= rg->to)
@@ -106,6 +100,7 @@ static long region_add(struct list_head
 	}
 	nrg->from = f;
 	nrg->to = t;
+	spin_unlock(&region_lock);
 	return 0;
 }
 
@@ -114,6 +109,8 @@ static long region_chg(struct list_head
 	struct file_region *rg, *nrg;
 	long chg = 0;
 
+	spin_lock(&region_lock);
+
 	/* Locate the region we are before or in. */
 	list_for_each_entry(rg, head, link)
 		if (f <= rg->to)
@@ -124,14 +121,17 @@ static long region_chg(struct list_head
 	 * size such that we can guarantee to record the reservation. */
 	if (&rg->link == head || t < rg->from) {
 		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-		if (!nrg)
-			return -ENOMEM;
+		if (!nrg) {
+			chg = -ENOMEM;
+			goto out;
+		}
 		nrg->from = f;
 		nrg->to   = f;
 		INIT_LIST_HEAD(&nrg->link);
 		list_add(&nrg->link, rg->link.prev);
 
-		return t - f;
+		chg = t - f;
+		goto out;
 	}
 
 	/* Round our left edge to the current segment if it encloses us. */
@@ -144,7 +144,7 @@ static long region_chg(struct list_head
 		if (&rg->link == head)
 			break;
 		if (rg->from > t)
-			return chg;
+			goto out;
 
 		/* We overlap with this area, if it extends further than
 		 * us then we must extend ourselves.  Account for its
@@ -155,6 +155,9 @@ static long region_chg(struct list_head
 		}
 		chg -= rg->to - rg->from;
 	}
+out:
+
+	spin_unlock(&region_lock);
 	return chg;
 }
 
@@ -163,12 +166,16 @@ static long region_truncate(struct list_
 	struct file_region *rg, *trg;
 	long chg = 0;
 
+	spin_lock(&region_lock);
+
 	/* Locate the region we are either in or before. */
 	list_for_each_entry(rg, head, link)
 		if (end <= rg->to)
 			break;
-	if (&rg->link == head)
-		return 0;
+	if (&rg->link == head) {
+		chg = 0;
+		goto out;
+	}
 
 	/* If we are in the middle of a region then adjust it. */
 	if (end > rg->from) {
@@ -185,6 +192,9 @@ static long region_truncate(struct list_
 		list_del(&rg->link);
 		kfree(rg);
 	}
+
+out:
+	spin_unlock(&region_lock);
 	return chg;
 }
 
@@ -193,6 +203,8 @@ static long region_count(struct list_hea
 	struct file_region *rg;
 	long chg = 0;
 
+	spin_lock(&region_lock);
+
 	/* Locate each segment we overlap with, and count that overlap. */
 	list_for_each_entry(rg, head, link) {
 		int seg_from;
@@ -209,6 +221,7 @@ static long region_count(struct list_hea
 		chg += seg_to - seg_from;
 	}
 
+	spin_unlock(&region_lock);
 	return chg;
 }
 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path
  2011-07-15  6:06       ` Anton Blanchard
@ 2011-07-15  6:10         ` Anton Blanchard
  -1 siblings, 0 replies; 24+ messages in thread
From: Anton Blanchard @ 2011-07-15  6:10 UTC (permalink / raw)
  To: Mel Gorman, dwg, akpm, hughd; +Cc: linux-mm, linux-kernel

From: David Gibson <dwg@au1.ibm.com>

At present, the page fault path for hugepages is serialized by a
single mutex.  This is used to avoid spurious out-of-memory conditions
when the hugepage pool is fully utilized (two processes or threads can
race to instantiate the same mapping with the last hugepage from the
pool, the race loser returning VM_FAULT_OOM).  This problem is
specific to hugepages, because it is normal to want to use every
single hugepage in the system - with normal pages we simply assume
there will always be a few spare pages which can be used temporarily
until the race is resolved.

Unfortunately this serialization also means that clearing of hugepages
cannot be parallelized across multiple CPUs, which can lead to very
long process startup times when using large numbers of hugepages.

This patch improves the situation by replacing the single mutex with a
table of mutexes, selected based on a hash of the address_space and
file offset being faulted (or mm and virtual address for MAP_PRIVATE
mappings).

From: Anton Blanchard <anton@samba.org>

Forward ported and made a few changes:

- Use the Jenkins hash to scatter the hash, better than using just the
  low bits.

- Always round num_fault_mutexes to a power of two to avoid an
  expensive modulus in the hash calculation.
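
As a standalone illustration of the power-of-two trick (a userspace
sketch only; round_up_pow2() and the CPU count here are made up, the
patch itself uses the kernel's roundup_pow_of_two() and jhash2()):

	#include <stdint.h>
	#include <stdio.h>

	/* Userspace stand-in for the kernel's roundup_pow_of_two(). */
	static unsigned int round_up_pow2(unsigned int x)
	{
		unsigned int p = 1;

		while (p < x)
			p <<= 1;
		return p;
	}

	int main(void)
	{
		unsigned int n = round_up_pow2(2 * 24);	/* eg 24 CPUs -> 64 mutexes */
		uint32_t hash = 0xdeadbeef;		/* stand-in for jhash2() output */

		/* With n a power of two, hash % n equals hash & (n - 1),
		 * so the fault path needs no division. */
		printf("%u %u\n", hash % n, hash & (n - 1));
		return 0;
	}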

I also tested this patch on a large POWER7 box using a simple parallel
fault testcase:

http://ozlabs.org/~anton/junkcode/parallel_fault.c

Command line options:

parallel_fault <nr_threads> <size in kB> <skip in kB>
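
Roughly, the testcase does something like the following (a simplified
sketch, not the actual parallel_fault.c; the work split between threads
and the lack of error handling are approximations):

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	static char *buf;
	static unsigned long size, skip;
	static int nr_threads;

	/* Touch one byte every "skip" bytes of this thread's slice, so that
	 * each touch faults in one hugepage. */
	static void *toucher(void *arg)
	{
		unsigned long slice = size / nr_threads;
		unsigned long start = (long)arg * slice;
		unsigned long off;

		for (off = start; off < start + slice; off += skip)
			buf[off] = 1;
		return NULL;
	}

	int main(int argc, char *argv[])
	{
		pthread_t threads[1024];
		long i;

		if (argc != 4) {
			fprintf(stderr, "usage: %s <nr_threads> <size in kB> <skip in kB>\n",
				argv[0]);
			return 1;
		}

		nr_threads = atoi(argv[1]);
		size = strtoul(argv[2], NULL, 0) * 1024;
		skip = strtoul(argv[3], NULL, 0) * 1024;

		/* Run under "hugectl --heap" so the heap is backed by hugepages. */
		buf = malloc(size);

		for (i = 0; i < nr_threads; i++)
			pthread_create(&threads[i], NULL, toucher, (void *)i);
		for (i = 0; i < nr_threads; i++)
			pthread_join(threads[i], NULL);

		return 0;
	}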


First the time taken to fault 128GB of 16MB hugepages:

# time hugectl --heap ./parallel_fault 1 134217728 16384
40.68 seconds

Now the same test with 64 concurrent threads:
# time hugectl --heap ./parallel_fault 64 134217728 16384
39.34 seconds

Hardly any speedup. Finally the 64 concurrent threads test with
this patch applied:
# time hugectl --heap ./parallel_fault 64 134217728 16384
0.85 seconds

We go from 40.68 seconds to 0.85 seconds, an improvement of 47.9x

This was tested with the libhugetlbfs test suite, and the PASS/FAIL
count was the same before and after this patch.


Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: linux-2.6-work/mm/hugetlb.c
===================================================================
--- linux-2.6-work.orig/mm/hugetlb.c	2011-07-15 09:17:23.724410080 +1000
+++ linux-2.6-work/mm/hugetlb.c	2011-07-15 09:17:24.584425505 +1000
@@ -21,6 +21,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/jhash.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -54,6 +55,13 @@ static unsigned long __initdata default_
 static DEFINE_SPINLOCK(hugetlb_lock);
 
 /*
+ * Serializes faults on the same logical page.  This is used to
+ * prevent spurious OOMs when the hugepage pool is fully utilized.
+ */
+static unsigned int num_fault_mutexes;
+static struct mutex *htlb_fault_mutex_table;
+
+/*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
  */
@@ -1772,6 +1780,8 @@ module_exit(hugetlb_exit);
 
 static int __init hugetlb_init(void)
 {
+	int i;
+
 	/* Some platform decide whether they support huge pages at boot
 	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
 	 * there is no such support
@@ -1798,6 +1808,12 @@ static int __init hugetlb_init(void)
 
 	hugetlb_register_all_nodes();
 
+	num_fault_mutexes = roundup_pow_of_two(2 * num_possible_cpus());
+	htlb_fault_mutex_table =
+		kmalloc(num_fault_mutexes * sizeof(struct mutex), GFP_KERNEL);
+	for (i = 0; i < num_fault_mutexes; i++)
+		mutex_init(&htlb_fault_mutex_table[i]);
+
 	return 0;
 }
 module_init(hugetlb_init);
@@ -2622,6 +2638,27 @@ backout_unlocked:
 	goto out;
 }
 
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+			    struct vm_area_struct *vma,
+			    struct address_space *mapping,
+			    unsigned long pagenum, unsigned long address)
+{
+	unsigned long key[2];
+	u32 hash;
+
+	if ((vma->vm_flags & VM_SHARED)) {
+		key[0] = (unsigned long)mapping;
+		key[1] = pagenum;
+	} else {
+		key[0] = (unsigned long)mm;
+		key[1] = address >> huge_page_shift(h);
+	}
+
+	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+
+	return hash & (num_fault_mutexes - 1);
+}
+
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags)
 {
@@ -2630,8 +2667,10 @@ int hugetlb_fault(struct mm_struct *mm,
 	int ret;
 	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
-	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
+	struct address_space *mapping;
+	unsigned long idx;
+	u32 hash;
 
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
@@ -2648,12 +2687,16 @@ int hugetlb_fault(struct mm_struct *mm,
 	if (!ptep)
 		return VM_FAULT_OOM;
 
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, address);
+
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	mutex_lock(&hugetlb_instantiation_mutex);
+	hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
+	mutex_lock(&htlb_fault_mutex_table[hash]);
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, flags);
@@ -2722,7 +2765,7 @@ out_page_table_lock:
 		unlock_page(page);
 
 out_mutex:
-	mutex_unlock(&hugetlb_instantiation_mutex);
+	mutex_unlock(&htlb_fault_mutex_table[hash]);
 
 	return ret;
 }

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path
  2011-01-25  3:34   ` Anton Blanchard
@ 2011-07-15  7:52     ` Andi Kleen
  -1 siblings, 0 replies; 24+ messages in thread
From: Andi Kleen @ 2011-07-15  7:52 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: dwg, mel, akpm, hughd, linux-mm, linux-kernel

Anton Blanchard <anton@samba.org> writes:


> This patch improves the situation by replacing the single mutex with a
> table of mutexes, selected based on a hash of the address_space and
> file offset being faulted (or mm and virtual address for MAP_PRIVATE
> mappings).

It's unclear to me how this solves the original OOM problem.
You can still have an early OOM over all the hugepages if they
happen to hash to different mutexes, can't you?

I think it would be better to move the clearing out of the lock,
and possibly take the lock only when the hugepages are about to
go OOM.

-Andi


-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path
  2011-07-15  7:52     ` Andi Kleen
@ 2011-07-15 13:10       ` David Gibson
  -1 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2011-07-15 13:10 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Anton Blanchard, mel, akpm, hughd, linux-mm, linux-kernel

On Fri, Jul 15, 2011 at 12:52:38AM -0700, Andi Kleen wrote:
> Anton Blanchard <anton@samba.org> writes:
> 
> 
> > This patch improves the situation by replacing the single mutex with a
> > table of mutexes, selected based on a hash of the address_space and
> > file offset being faulted (or mm and virtual address for MAP_PRIVATE
> > mappings).
> 
> It's unclear to me how this solves the original OOM problem.
> You can still have an early OOM over all the hugepages if they
> happen to hash to different mutexes, can't you?

The spurious OOM case only occurs when two processes or threads
are racing to instantiate the same page (that is, the same page within
an address_space for SHARED or the same virtual address for PRIVATE).
In other cases the OOM is correct behaviour (because we really don't
have enough hugepages to satisfy the requests).

Because of the hash's construction, we're guaranteed that in the
spurious OOM case, both processes or threads will use the same mutex.

> I think it would be better to move the clearing out of the lock,

We really can't.  The lock has to be taken before we grab a page from
the pool, and can't be released until after the page is "committed"
either by updating the address space's radix tree (SHARED) or the page
tables (PRIVATE).  I can't see any way the clearing can be moved out of
that.

> and possibly take the lock only when the hugepages are about to 
> go OOM.

This is much easier said than done.  

At one stage I did attempt a more theoretically elegant approach, which
was to keep a count of the number of "in-flight" hugepages - OOMs
should be retried if it is non-zero.  I believe that approach can
work, but it turns out to be pretty darn hairy to implement.
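
Very roughly, the idea was something along these lines (a userspace toy
model with made-up names; nothing like this exists in the kernel):

	#include <stdatomic.h>

	#define POOL_SIZE	4
	static atomic_int pool_free = POOL_SIZE;	/* free hugepages in the pool */
	static atomic_int in_flight;			/* taken but not yet committed */

	/* Returns 1 on success, 0 on a genuine OOM, -1 if the fault should be
	 * retried because a racing fault may still put its page back. */
	static int fault_one_page(void)
	{
		atomic_fetch_add(&in_flight, 1);
		if (atomic_fetch_sub(&pool_free, 1) <= 0) {
			atomic_fetch_add(&pool_free, 1);	/* undo our take */
			atomic_fetch_sub(&in_flight, 1);
			return atomic_load(&in_flight) ? -1 : 0;
		}
		/* ... clear the page and commit it to the page cache or the
		 * page tables here ... */
		atomic_fetch_sub(&in_flight, 1);
		return 1;
	}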

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/2] hugepage: Protect region tracking lists with its own spinlock
  2011-07-15  6:08         ` Anton Blanchard
  (?)
@ 2011-07-18 15:24         ` Eric B Munson
  -1 siblings, 0 replies; 24+ messages in thread
From: Eric B Munson @ 2011-07-18 15:24 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: Mel Gorman, dwg, akpm, hughd, linux-mm, linux-kernel

On Fri, 15 Jul 2011, Anton Blanchard wrote:

> 
> In preparation for creating a hash of spinlocks to replace the global
> hugetlb_instantiation_mutex, protect the region tracking code with
> its own spinlock.
> 
> Signed-off-by: Anton Blanchard <anton@samba.org> 

These work on x86_64 as well with the same test method as described by Anton.

Tested-by: Eric B Munson <emunson@mgebm.net>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path
  2011-07-15  6:10         ` Anton Blanchard
  (?)
@ 2011-07-18 15:24         ` Eric B Munson
  -1 siblings, 0 replies; 24+ messages in thread
From: Eric B Munson @ 2011-07-18 15:24 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: Mel Gorman, dwg, akpm, hughd, linux-mm, linux-kernel

On Fri, 15 Jul 2011, Anton Blanchard wrote:

> From: David Gibson <dwg@au1.ibm.com>
> 
> At present, the page fault path for hugepages is serialized by a
> single mutex.  This is used to avoid spurious out-of-memory conditions
> when the hugepage pool is fully utilized (two processes or threads can
> race to instantiate the same mapping with the last hugepage from the
> pool, the race loser returning VM_FAULT_OOM).  This problem is
> specific to hugepages, because it is normal to want to use every
> single hugepage in the system - with normal pages we simply assume
> there will always be a few spare pages which can be used temporarily
> until the race is resolved.
> 
> Unfortunately this serialization also means that clearing of hugepages
> cannot be parallelized across multiple CPUs, which can lead to very
> long process startup times when using large numbers of hugepages.
> 
> This patch improves the situation by replacing the single mutex with a
> table of mutexes, selected based on a hash of the address_space and
> file offset being faulted (or mm and virtual address for MAP_PRIVATE
> mappings).
> 
> From: Anton Blanchard <anton@samba.org>
> 
> Forward ported and made a few changes:
> 
> - Use the Jenkins hash to scatter the hash, better than using just the
>   low bits.
> 
> - Always round num_fault_mutexes to a power of two to avoid an
>   expensive modulus in the hash calculation.
> 
> I also tested this patch on a large POWER7 box using a simple parallel
> fault testcase:
> 
> http://ozlabs.org/~anton/junkcode/parallel_fault.c
> 
> Command line options:
> 
> parallel_fault <nr_threads> <size in kB> <skip in kB>
> 
> 
> First the time taken to fault 128GB of 16MB hugepages:
> 
> # time hugectl --heap ./parallel_fault 1 134217728 16384
> 40.68 seconds
> 
> Now the same test with 64 concurrent threads:
> # time hugectl --heap ./parallel_fault 64 134217728 16384
> 39.34 seconds
> 
> Hardly any speedup. Finally the 64 concurrent threads test with
> this patch applied:
> # time hugectl --heap ./parallel_fault 64 134217728 16384
> 0.85 seconds
> 
> We go from 40.68 seconds to 0.85 seconds, an improvement of 47.9x
> 
> This was tested with the libhugetlbfs test suite, and the PASS/FAIL
> count was the same before and after this patch.
> 
> 
> Signed-off-by: David Gibson <dwg@au1.ibm.com>
> Signed-off-by: Anton Blanchard <anton@samba.org>

Tested-by: Eric B Munson <emunson@mgebm.net>


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path
  2011-07-15  6:06       ` Anton Blanchard
@ 2011-07-21 10:17         ` Mel Gorman
  -1 siblings, 0 replies; 24+ messages in thread
From: Mel Gorman @ 2011-07-21 10:17 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: dwg, akpm, hughd, linux-mm, linux-kernel

On Fri, Jul 15, 2011 at 04:06:50PM +1000, Anton Blanchard wrote:
> 
> Hi Mel,
> 
> > I haven't tested this patch yet but typically how I would test it is
> > multiple parallel instances of make func from libhugetlbfs. In
> > particular I would be looking out for counter corruption. Has
> > something like this been done? I know hugetlb_lock protects the
> > counters but the locking in there has turned into a bit of a mess so
> > it's easy to miss something.
> 
> Thanks for the suggestion and sorry for taking so long. Make check has
> the same PASS/FAIL count before and after the patches.
> 
> I also ran 16 copies of make func on a large box with 896 HW threads.
> Some of the tests that use shared memory were a bit upset, but that
> seems to be because we use a static key. It seems the tests were also
> fighting over the number of huge pages they wanted the system set to.
> 
> It got up to a load average of 13207, and heap-overflow consumed all my
> memory, a pretty good effort considering I have over 1TB of it.
> 
> After things settled down, everything was OK, apart from the fact that we
> have 20 huge pages unaccounted for:
> 
> HugePages_Total:   10000
> HugePages_Free:     9980
> HugePages_Rsvd:        0
> HugePages_Surp:        0
> 
> I verified there were no shared memory segments, and no files in the
> hugetlbfs filesystem (I double checked by unmounting it).
> 
> I can't see how this patch set would cause this. It seems like we can
> leak huge pages, perhaps in an error path. Anyway, I'll repost the
> patch set for comments.
> 

I didn't see any problem with the patch either. The locking should
be functionally equivalent for both private and shared mappings.

Out of curiosity, can you trigger the bug without the patches? It
could be a race on faulting the shared regions that is causing the
leakage.  Any chance this can be debugged minimally by finding out if
this is an accounting bug or if a hugepage is actually leaked? Considering
the stress of the machine and its size, I'm guessing it's not trivially
reproducible anywhere else.

One possibility is that the bug is in the hugepage pool resizing done
by "make func". This is a scenario that does not normally occur, as
hugepage pool resizing is an administrative task that happens rarely.
See this chunk, for instance:

        spin_lock(&hugetlb_lock);
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                spin_unlock(&hugetlb_lock);
                return NULL;
        } else {
                h->nr_huge_pages++;
                h->surplus_huge_pages++;
        }
        spin_unlock(&hugetlb_lock);

        page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
                                        __GFP_REPEAT|__GFP_NOWARN,
                                        huge_page_order(h));

        if (page && arch_prepare_hugepage(page)) {
                __free_pages(page, huge_page_order(h));
                return NULL;
        }

That chunk does not undo the counter updates if arch_prepare_hugepage()
fails.  That function is normally a no-op on powerpc; it wasn't changed
for any reason, was it?
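
If that is where the pages go, the error path would presumably need to
give the counters back, something like this (untested, just to
illustrate the missing accounting):

        if (page && arch_prepare_hugepage(page)) {
                __free_pages(page, huge_page_order(h));
                spin_lock(&hugetlb_lock);
                h->nr_huge_pages--;
                h->surplus_huge_pages--;
                spin_unlock(&hugetlb_lock);
                return NULL;
        }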

Another possibility is a regression of
[4f16fc10: mm: hugetlb: fix hugepage memory leak in mincore()], so maybe
try a similar reproduction case using mincore()?

Maybe also try putting assert_spin_locked() at every point where
nr_free_pages or nr_huge_pages is updated and see if one of them triggers?
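
i.e. something like the following at each update site (illustrative
only):

        assert_spin_locked(&hugetlb_lock);	/* catch any unlocked update */
        h->nr_huge_pages++;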


-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2011-07-21 10:17 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-25  3:32 [PATCH 1/2] hugepage: Protect region tracking lists with its own spinlock Anton Blanchard
2011-01-25  3:32 ` Anton Blanchard
2011-01-25  3:34 ` [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path Anton Blanchard
2011-01-25  3:34   ` Anton Blanchard
2011-01-25 19:44   ` Eric B Munson
2011-01-26  9:24   ` Mel Gorman
2011-01-26  9:24     ` Mel Gorman
2011-07-15  6:06     ` Anton Blanchard
2011-07-15  6:06       ` Anton Blanchard
2011-07-15  6:08       ` [PATCH 1/2] hugepage: Protect region tracking lists with its own spinlock Anton Blanchard
2011-07-15  6:08         ` Anton Blanchard
2011-07-18 15:24         ` Eric B Munson
2011-07-15  6:10       ` [PATCH 2/2] hugepage: Allow parallelization of the hugepage fault path Anton Blanchard
2011-07-15  6:10         ` Anton Blanchard
2011-07-18 15:24         ` Eric B Munson
2011-07-21 10:17       ` Mel Gorman
2011-07-21 10:17         ` Mel Gorman
2011-07-15  7:52   ` Andi Kleen
2011-07-15  7:52     ` Andi Kleen
2011-07-15 13:10     ` David Gibson
2011-07-15 13:10       ` David Gibson
2011-01-25 19:43 ` [PATCH 1/2] hugepage: Protect region tracking lists with its own spinlock Eric B Munson
2011-01-26  9:07 ` Mel Gorman
2011-01-26  9:07   ` Mel Gorman
