linux-kernel.vger.kernel.org archive mirror
* [RFC PATCH] asynchronous page fault.
@ 2009-12-25  1:51 KAMEZAWA Hiroyuki
  2009-12-27  9:47 ` Minchan Kim
                   ` (3 more replies)
  0 siblings, 4 replies; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-25  1:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-mm, minchan.kim, cl


Speculative page fault v3.

This version is much simpler than the old ones and doesn't use mm_accessor;
it uses RCU instead. It is based on linux-2.6.33-rc2.

This patch is just a toy, but it shows that...
 - Once the RB-tree is RCU-aware and lock-free on the read side, we can avoid
   mmap_sem in the page fault path.
So what we need is not mm_accessor but an RCU-aware RB-tree, I think.

But yes, I may be missing something critical ;)

After the patch, the perf statistics look as follows. The test program is attached.
  
# Samples: 1331231315119
#
# Overhead          Command             Shared Object  Symbol
# ........  ...............  ........................  ......
#
    28.41%  multi-fault-all  [kernel]                  [k] clear_page_c
            |
            --- clear_page_c
                __alloc_pages_nodemask
                handle_mm_fault
                do_page_fault
                page_fault
                0x400950
               |
                --100.00%-- (nil)

    21.69%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
            |
            --- _raw_spin_lock
               |
               |--81.85%-- free_pcppages_bulk
               |          free_hot_cold_page
               |          __pagevec_free
               |          release_pages
               |          free_pages_and_swap_cache


I'll be mostly offline next week.

Minchan, in this version I didn't add a CONFIG option and some other things
that were recommended, just out of laziness. Sorry.

=
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Asynchronous page fault.

This patch avoids taking mmap_sem in the usual page fault path. When a highly
multi-threaded program takes page faults in parallel, mm->mmap_sem can burn a
lot of CPU because of false sharing (running right after fork() is a typical
case, I think). This patch uses a speculative vma lookup to reduce that cost.

Considering the vma lookup, i.e. the rb-tree walk, the only operation we do is
checking node->rb_left/rb_right; there is no complicated operation. At page
fault time there is usually no need to access the sorted vma list or the
prev/next vma: except for stack expansion, we always want the one vma that
contains the faulting address. So we can walk the vma rb-tree in a speculative
way. Even if an rb-tree rotation occurs while we walk the tree for the lookup,
we just miss the vma without an oops. In other words, we can _try_ to find the
vma in a lockless manner; if that fails, retrying is fine: we take the lock and
access the vma as usual.

For the lockless walk, this uses RCU and adds find_vma_speculative(), plus a
per-vma wait queue and reference count. The refcnt + wait queue guarantee that
no thread is still accessing the vma when we call the subsystem's unmap
functions.
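
For illustration, the intended caller pattern is roughly the sketch below. This
is an editor's illustration mirroring the arch/x86 hunk further down (the
handle_mm_fault() call is the one already in 2.6.33; error handling omitted),
not part of the patch itself:

	vma = find_vma_speculative(mm, address);
	if (vma) {
		/* lockless fast path: the vma is pinned by its refcount */
		fault = handle_mm_fault(mm, vma, address,
					write ? FAULT_FLAG_WRITE : 0);
		vma_put(vma);
	} else {
		/* racy miss: fall back to the usual locked lookup */
		down_read(&mm->mmap_sem);
		vma = find_vma(mm, address);
		/* ... normal slow path ... */
		up_read(&mm->mmap_sem);
	}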

Here is a test result of my tiny test program on an 8-core/2-socket machine.
It measures how many page faults can occur in parallel within 60 seconds.

[root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all-split 8

 Performance counter stats for './multi-fault-all-split 8' (5 runs):

       17481387  page-faults                ( +-   0.409% )
      509914595  cache-misses               ( +-   0.239% )

   60.002277793  seconds time elapsed   ( +-   0.000% )


[root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all-split 8


 Performance counter stats for './multi-fault-all-split 8' (5 runs):

       35949073  page-faults                ( +-   0.364% )
      473091100  cache-misses               ( +-   0.304% )

   60.005444117  seconds time elapsed   ( +-   0.004% )



Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 arch/x86/mm/fault.c      |   14 +++++++-
 include/linux/mm.h       |   14 ++++++++
 include/linux/mm_types.h |    5 +++
 lib/rbtree.c             |   34 ++++++++++----------
 mm/mmap.c                |   77 +++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 123 insertions(+), 21 deletions(-)

Index: linux-2.6.33-rc2/lib/rbtree.c
===================================================================
--- linux-2.6.33-rc2.orig/lib/rbtree.c
+++ linux-2.6.33-rc2/lib/rbtree.c
@@ -30,19 +30,19 @@ static void __rb_rotate_left(struct rb_n
 
 	if ((node->rb_right = right->rb_left))
 		rb_set_parent(right->rb_left, node);
-	right->rb_left = node;
+	rcu_assign_pointer(right->rb_left, node);
 
 	rb_set_parent(right, parent);
 
 	if (parent)
 	{
 		if (node == parent->rb_left)
-			parent->rb_left = right;
+			rcu_assign_pointer(parent->rb_left, right);
 		else
-			parent->rb_right = right;
+			rcu_assign_pointer(parent->rb_right, right);
 	}
 	else
-		root->rb_node = right;
+		rcu_assign_pointer(root->rb_node, right);
 	rb_set_parent(node, right);
 }
 
@@ -53,19 +53,19 @@ static void __rb_rotate_right(struct rb_
 
 	if ((node->rb_left = left->rb_right))
 		rb_set_parent(left->rb_right, node);
-	left->rb_right = node;
+	rcu_assign_pointer(left->rb_right, node);
 
 	rb_set_parent(left, parent);
 
 	if (parent)
 	{
 		if (node == parent->rb_right)
-			parent->rb_right = left;
+			rcu_assign_pointer(parent->rb_right, left);
 		else
-			parent->rb_left = left;
+			rcu_assign_pointer(parent->rb_left, left);
 	}
 	else
-		root->rb_node = left;
+		rcu_assign_pointer(root->rb_node, left);
 	rb_set_parent(node, left);
 }
 
@@ -234,11 +234,11 @@ void rb_erase(struct rb_node *node, stru
 
 		if (rb_parent(old)) {
 			if (rb_parent(old)->rb_left == old)
-				rb_parent(old)->rb_left = node;
+				rcu_assign_pointer(rb_parent(old)->rb_left, node);
 			else
-				rb_parent(old)->rb_right = node;
+				rcu_assign_pointer(rb_parent(old)->rb_right, node);
 		} else
-			root->rb_node = node;
+			rcu_assign_pointer(root->rb_node, node);
 
 		child = node->rb_right;
 		parent = rb_parent(node);
@@ -249,14 +249,14 @@ void rb_erase(struct rb_node *node, stru
 		} else {
 			if (child)
 				rb_set_parent(child, parent);
-			parent->rb_left = child;
+			rcu_assign_pointer(parent->rb_left, child);
 
-			node->rb_right = old->rb_right;
+			rcu_assign_pointer(node->rb_right, old->rb_right);
 			rb_set_parent(old->rb_right, node);
 		}
 
 		node->rb_parent_color = old->rb_parent_color;
-		node->rb_left = old->rb_left;
+		rcu_assign_pointer(node->rb_left, old->rb_left);
 		rb_set_parent(old->rb_left, node);
 
 		goto color;
@@ -270,12 +270,12 @@ void rb_erase(struct rb_node *node, stru
 	if (parent)
 	{
 		if (parent->rb_left == node)
-			parent->rb_left = child;
+			rcu_assign_pointer(parent->rb_left, child);
 		else
-			parent->rb_right = child;
+			rcu_assign_pointer(parent->rb_right, child);
 	}
 	else
-		root->rb_node = child;
+		rcu_assign_pointer(root->rb_node, child);
 
  color:
 	if (color == RB_BLACK)
Index: linux-2.6.33-rc2/include/linux/mm_types.h
===================================================================
--- linux-2.6.33-rc2.orig/include/linux/mm_types.h
+++ linux-2.6.33-rc2/include/linux/mm_types.h
@@ -11,6 +11,7 @@
 #include <linux/rwsem.h>
 #include <linux/completion.h>
 #include <linux/cpumask.h>
+#include <linux/rcupdate.h>
 #include <linux/page-debug-flags.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
@@ -180,6 +181,10 @@ struct vm_area_struct {
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
 
+	atomic_t refcnt;
+	wait_queue_head_t wait_queue;
+	struct rcu_head	rcuhead;
+
 #ifndef CONFIG_MMU
 	struct vm_region *vm_region;	/* NOMMU mapping region */
 #endif
Index: linux-2.6.33-rc2/mm/mmap.c
===================================================================
--- linux-2.6.33-rc2.orig/mm/mmap.c
+++ linux-2.6.33-rc2/mm/mmap.c
@@ -187,6 +187,24 @@ error:
 	return -ENOMEM;
 }
 
+static void __free_vma_rcu_cb(struct rcu_head *head)
+{
+	struct vm_area_struct *vma;
+	vma = container_of(head, struct vm_area_struct, rcuhead);
+	kmem_cache_free(vm_area_cachep, vma);
+}
+/* Call this if vma was linked to rb-tree */
+static void free_vma_rcu(struct vm_area_struct *vma)
+{
+	call_rcu(&vma->rcuhead, __free_vma_rcu_cb);
+}
+/* called when vma is unlinked and wait for all racy access.*/
+static void invalidate_vma_before_free(struct vm_area_struct *vma)
+{
+	atomic_dec(&vma->refcnt);
+	wait_event(vma->wait_queue, !atomic_read(&vma->refcnt));
+}
+
 /*
  * Requires inode->i_mapping->i_mmap_lock
  */
@@ -238,7 +256,7 @@ static struct vm_area_struct *remove_vma
 			removed_exe_file_vma(vma->vm_mm);
 	}
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma_rcu(vma);
 	return next;
 }
 
@@ -404,6 +422,8 @@ __vma_link_list(struct mm_struct *mm, st
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct rb_node **rb_link, struct rb_node *rb_parent)
 {
+	atomic_set(&vma->refcnt, 1);
+	init_waitqueue_head(&vma->wait_queue);
 	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
 }
@@ -614,6 +634,7 @@ again:			remove_next = 1 + (end > next->
 		 * us to remove next before dropping the locks.
 		 */
 		__vma_unlink(mm, next, vma);
+		invalidate_vma_before_free(next);
 		if (file)
 			__remove_shared_vm_struct(next, file, mapping);
 		if (next->anon_vma)
@@ -640,7 +661,7 @@ again:			remove_next = 1 + (end > next->
 		}
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma_rcu(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -1544,6 +1565,55 @@ out:
 }
 
 /*
+ * Returns vma which contains given address. This scans rb-tree in speculative
+ * way and increment a reference count if found. Even if vma exists in rb-tree,
+ * this function may return NULL in racy case. So, this function cannot be used
+ * for checking whether given address is valid or not.
+ */
+struct vm_area_struct *
+find_vma_speculative(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma = NULL;
+	struct vm_area_struct *vma_tmp;
+	struct rb_node *rb_node;
+
+	if (unlikely(!mm))
+		return NULL;;
+
+	rcu_read_lock();
+	rb_node = rcu_dereference(mm->mm_rb.rb_node);
+	vma = NULL;
+	while (rb_node) {
+		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+		if (vma_tmp->vm_end > addr) {
+			vma = vma_tmp;
+			if (vma_tmp->vm_start <= addr)
+				break;
+			rb_node = rcu_dereference(rb_node->rb_left);
+		} else
+			rb_node = rcu_dereference(rb_node->rb_right);
+	}
+	if (vma) {
+		if ((vma->vm_start <= addr) && (addr < vma->vm_end)) {
+			if (!atomic_inc_not_zero(&vma->refcnt))
+				vma = NULL;
+		} else
+			vma = NULL;
+	}
+	rcu_read_unlock();
+	return vma;
+}
+
+void vma_put(struct vm_area_struct *vma)
+{
+	if ((atomic_dec_return(&vma->refcnt) == 1) &&
+	    waitqueue_active(&vma->wait_queue))
+		wake_up(&vma->wait_queue);
+	return;
+}
+
+/*
  * Verify that the stack growth is acceptable and
  * update accounting. This is shared with both the
  * grow-up and grow-down cases.
@@ -1808,6 +1878,7 @@ detach_vmas_to_be_unmapped(struct mm_str
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
 		rb_erase(&vma->vm_rb, &mm->mm_rb);
+		invalidate_vma_before_free(vma);
 		mm->map_count--;
 		tail_vma = vma;
 		vma = vma->vm_next;
@@ -1819,7 +1890,7 @@ detach_vmas_to_be_unmapped(struct mm_str
 	else
 		addr = vma ?  vma->vm_start : mm->mmap_base;
 	mm->unmap_area(mm, addr);
-	mm->mmap_cache = NULL;		/* Kill the cache. */
+	mm->mmap_cache = NULL;	/* Kill the cache. */
 }
 
 /*
Index: linux-2.6.33-rc2/include/linux/mm.h
===================================================================
--- linux-2.6.33-rc2.orig/include/linux/mm.h
+++ linux-2.6.33-rc2/include/linux/mm.h
@@ -1235,6 +1235,20 @@ static inline struct vm_area_struct * fi
 	return vma;
 }
 
+/*
+ * Look up the vma covering the given address in a speculative way. This is a
+ * lockless lookup and may miss the vma in racy cases: if it returns NULL, you
+ * still have to retry with find_vma() under a read lock of mm->mmap_sem before
+ * concluding that the address is invalid. The returned vma's reference count
+ * is incremented to mark asynchronous access; the caller must call vma_put().
+ *
+ * Unlike find_vma(), this only returns a vma that contains the specified
+ * address; it does not return the nearest vma.
+ */
+extern struct vm_area_struct *find_vma_speculative(struct mm_struct *mm,
+	unsigned long addr);
+void vma_put(struct vm_area_struct *vma);
+
 static inline unsigned long vma_pages(struct vm_area_struct *vma)
 {
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
Index: linux-2.6.33-rc2/arch/x86/mm/fault.c
===================================================================
--- linux-2.6.33-rc2.orig/arch/x86/mm/fault.c
+++ linux-2.6.33-rc2/arch/x86/mm/fault.c
@@ -952,6 +952,7 @@ do_page_fault(struct pt_regs *regs, unsi
 	struct mm_struct *mm;
 	int write;
 	int fault;
+	int speculative = 0;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1040,6 +1041,14 @@ do_page_fault(struct pt_regs *regs, unsi
 		return;
 	}
 
+	if ((error_code & PF_USER)) {
+		vma = find_vma_speculative(mm, address);
+		if (vma) {
+			speculative = 1;
+			goto good_area;
+		}
+	}
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in
@@ -1136,5 +1145,8 @@ good_area:
 
 	check_v8086_mode(regs, address, tsk);
 
-	up_read(&mm->mmap_sem);
+	if (speculative)
+		vma_put(vma);
+	else
+		up_read(&mm->mmap_sem);
 }

[-- Attachment #2: multi-fault-all-split.c --]
[-- Type: text/x-csrc, Size: 1849 bytes --]

/*
 * multi-fault.c :: generates 60 seconds of parallel page faults from multiple threads.
 * % gcc -O2 -o multi-fault multi-fault.c -lpthread
 * % multi-fault <number of cpus>
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>
#include <stdlib.h>	/* atoi() */
#include <unistd.h>	/* sleep() */

#define NR_THREADS	8
pthread_t threads[NR_THREADS];
/*
 * To avoid contention on the page table lock, the FAULT area is
 * sparse. If FAULT_LENGTH is too large for your CPUs, decrease it.
 */
#define MMAP_LENGTH	(8 * 1024 * 1024)
#define FAULT_LENGTH	(2 * 1024 * 1024)
void *mmap_area[NR_THREADS];
#define PAGE_SIZE	4096

pthread_barrier_t barrier;
int name[NR_THREADS];

void segv_handler(int sig)
{
	sleep(100);
}
void *worker(void *data)
{
	cpu_set_t set;
	int cpu;

	cpu = *(int *)data;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);

	while (1) {
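		/*
		 * Each pass: all threads fault in their private area in
		 * lockstep (one write per page), then drop the pages with
		 * MADV_DONTNEED so the next pass faults them in again.
		 */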
		char *c;
		char *start = mmap_area[cpu];
		char *end = mmap_area[cpu] + FAULT_LENGTH;
		pthread_barrier_wait(&barrier);
		//printf("fault into %p-%p\n",start, end);

		for (c = start; c < end; c += PAGE_SIZE)
			*c = 0;
		pthread_barrier_wait(&barrier);

		madvise(start, FAULT_LENGTH, MADV_DONTNEED);
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	int i, ret;
	unsigned int num;

	if (argc < 2)
		return 0;

	num = atoi(argv[1]);
	if (num > NR_THREADS)	/* stay within the statically sized arrays */
		num = NR_THREADS;
	pthread_barrier_init(&barrier, NULL, num);

	for (i = 0; i < num; i++) {
		mmap_area[i] = mmap(NULL, MMAP_LENGTH * num,
				PROT_WRITE|PROT_READ,
				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
		if (mmap_area[i] == MAP_FAILED) {
			perror("mmap");
			return 0;
		}
		/* memory hole */
		mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	}

	for (i = 0; i < num; ++i) {
		name[i] = i;
		ret = pthread_create(&threads[i], NULL, worker, &name[i]);
		if (ret < 0) {
			perror("pthread create");
			return 0;
		}
	}
	sleep(60);
	return 0;
}


* Re: [RFC PATCH] asynchronous page fault.
  2009-12-25  1:51 [RFC PATCH] asynchronous page fault KAMEZAWA Hiroyuki
@ 2009-12-27  9:47 ` Minchan Kim
  2009-12-27 23:59   ` KAMEZAWA Hiroyuki
  2009-12-27 11:19 ` Peter Zijlstra
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 32+ messages in thread
From: Minchan Kim @ 2009-12-27  9:47 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, cl

Hi, Kame. 

KAMEZAWA Hiroyuki wrote:
> Speculative page fault v3.
> 
> This version is much simpler than old versions and doesn't use mm_accessor
> but use RCU. This is based on linux-2.6.33-rc2.
> 
> This patch is just my toy but shows...
>  - Once RB-tree is RCU-aware and no-lock in readside, we can avoid mmap_sem
>    in page fault. 
> So, what we need is not mm_accessor, but RCU-aware RB-tree, I think.
> 
> But yes, I may miss something critical ;)
> 
> After patch, statistics perf show is following. Test progam is attached.
>   
> # Samples: 1331231315119
> #
> # Overhead          Command             Shared Object  Symbol
> # ........  ...............  ........................  ......
> #
>     28.41%  multi-fault-all  [kernel]                  [k] clear_page_c
>             |
>             --- clear_page_c
>                 __alloc_pages_nodemask
>                 handle_mm_fault
>                 do_page_fault
>                 page_fault
>                 0x400950
>                |
>                 --100.00%-- (nil)
> 
>     21.69%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
>             |
>             --- _raw_spin_lock
>                |
>                |--81.85%-- free_pcppages_bulk
>                |          free_hot_cold_page
>                |          __pagevec_free
>                |          release_pages
>                |          free_pages_and_swap_cache
> 
> 
> I'll be almost offline in the next week. 
> 
> Minchan, in this version, I didn't add CONFIG and some others which was
> recommended just for my laziness. Sorry.

No problem :)
Thanks for the X-mas present.

> 
> =
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Asynchronous page fault.
> 
> This patch is for avoidng mmap_sem in usual page fault. At running highly
> multi-threaded programs, mm->mmap_sem can use much CPU because of false
> sharing when it causes page fault in parallel. (Run after fork() is a typical
> case, I think.)
> This patch uses a speculative vma lookup to reduce that cost.
> 
> Considering vma lookup, rb-tree lookup, the only operation we do is checking
> node->rb_left,rb_right. And there are no complicated operation.
> At page fault, there are no demands for accessing sorted-vma-list or access
> prev or next in many case. Except for stack-expansion, we always need a vma
> which contains page-fault address. Then, we can access vma's RB-tree in
> speculative way.
> Even if RB-tree rotation occurs while we walk tree for look-up, we just
> miss vma without oops. In other words, we can _try_ to find vma in lockless
> manner. If failed, retry is ok.... we take lock and access vma.
> 
> For lockess walking, this uses RCU and adds find_vma_speculative(). And
> per-vma wait-queue and reference count. This refcnt+wait_queue guarantees that
> there are no thread which access the vma when we call subsystem's unmap
> functions.
> 
> Test result on my tiny test program on 8core/2socket machine is here.
> This measures how many page fault can occur in 60sec in parallel.
> 
> [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all-split 8
> 
>  Performance counter stats for './multi-fault-all-split 8' (5 runs):
> 
>        17481387  page-faults                ( +-   0.409% )
>       509914595  cache-misses               ( +-   0.239% )
> 
>    60.002277793  seconds time elapsed   ( +-   0.000% )
> 
> 
> [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all-split 8
> 
> 
>  Performance counter stats for './multi-fault-all-split 8' (5 runs):
> 
>        35949073  page-faults                ( +-   0.364% )
>       473091100  cache-misses               ( +-   0.304% )
> 
>    60.005444117  seconds time elapsed   ( +-   0.004% )
> 
> 
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

<snip>


> +/* called when vma is unlinked and wait for all racy access.*/
> +static void invalidate_vma_before_free(struct vm_area_struct *vma)
> +{
> +	atomic_dec(&vma->refcnt);
> +	wait_event(vma->wait_queue, !atomic_read(&vma->refcnt));
> +}

I think we have to make sure atomicity of both (atomic_dec and wait_event).

> +
>  /*
>   * Requires inode->i_mapping->i_mmap_lock
>   */
> @@ -238,7 +256,7 @@ static struct vm_area_struct *remove_vma
>  			removed_exe_file_vma(vma->vm_mm);
>  	}
>  	mpol_put(vma_policy(vma));
> -	kmem_cache_free(vm_area_cachep, vma);
> +	free_vma_rcu(vma);
>  	return next;
>  }
>  
> @@ -404,6 +422,8 @@ __vma_link_list(struct mm_struct *mm, st
>  void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
>  		struct rb_node **rb_link, struct rb_node *rb_parent)
>  {
> +	atomic_set(&vma->refcnt, 1);
> +	init_waitqueue_head(&vma->wait_queue);
>  	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
>  	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
>  }
> @@ -614,6 +634,7 @@ again:			remove_next = 1 + (end > next->
>  		 * us to remove next before dropping the locks.
>  		 */
>  		__vma_unlink(mm, next, vma);
> +		invalidate_vma_before_free(next);
>  		if (file)
>  			__remove_shared_vm_struct(next, file, mapping);
>  		if (next->anon_vma)
> @@ -640,7 +661,7 @@ again:			remove_next = 1 + (end > next->
>  		}
>  		mm->map_count--;
>  		mpol_put(vma_policy(next));
> -		kmem_cache_free(vm_area_cachep, next);
> +		free_vma_rcu(next);
>  		/*
>  		 * In mprotect's case 6 (see comments on vma_merge),
>  		 * we must remove another next too. It would clutter
> @@ -1544,6 +1565,55 @@ out:
>  }
>  
>  /*
> + * Returns vma which contains given address. This scans rb-tree in speculative
> + * way and increment a reference count if found. Even if vma exists in rb-tree,
> + * this function may return NULL in racy case. So, this function cannot be used
> + * for checking whether given address is valid or not.
> + */
> +struct vm_area_struct *
> +find_vma_speculative(struct mm_struct *mm, unsigned long addr)
> +{
> +	struct vm_area_struct *vma = NULL;
> +	struct vm_area_struct *vma_tmp;
> +	struct rb_node *rb_node;
> +
> +	if (unlikely(!mm))
> +		return NULL;;
> +
> +	rcu_read_lock();
> +	rb_node = rcu_dereference(mm->mm_rb.rb_node);
> +	vma = NULL;
> +	while (rb_node) {
> +		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
> +
> +		if (vma_tmp->vm_end > addr) {
> +			vma = vma_tmp;
> +			if (vma_tmp->vm_start <= addr)
> +				break;
> +			rb_node = rcu_dereference(rb_node->rb_left);
> +		} else
> +			rb_node = rcu_dereference(rb_node->rb_right);
> +	}
> +	if (vma) {
> +		if ((vma->vm_start <= addr) && (addr < vma->vm_end)) {
> +			if (!atomic_inc_not_zero(&vma->refcnt))
> +				vma = NULL;
> +		} else
> +			vma = NULL;
> +	}
> +	rcu_read_unlock();
> +	return vma;
> +}
> +
> +void vma_put(struct vm_area_struct *vma)
> +{
> +	if ((atomic_dec_return(&vma->refcnt) == 1) &&
> +	    waitqueue_active(&vma->wait_queue))
> +		wake_up(&vma->wait_queue);
> +	return;
> +}
> +

Let's consider following case. 

CPU 0					CPU 1

find_vma_speculative(refcnt = 2)
					do_unmap 
					invalidate_vma_before_free(refcount = 1)
					wait_event
vma_put
refcnt = 0
skip wakeup 

Hmm.. 




* Re: [RFC PATCH] asynchronous page fault.
  2009-12-25  1:51 [RFC PATCH] asynchronous page fault KAMEZAWA Hiroyuki
  2009-12-27  9:47 ` Minchan Kim
@ 2009-12-27 11:19 ` Peter Zijlstra
  2009-12-28  0:00   ` KAMEZAWA Hiroyuki
  2009-12-28  0:57   ` Balbir Singh
  2009-12-27 12:03 ` Peter Zijlstra
  2010-01-02 21:45 ` Benjamin Herrenschmidt
  3 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-27 11:19 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Fri, 2009-12-25 at 10:51 +0900, KAMEZAWA Hiroyuki wrote:
> Index: linux-2.6.33-rc2/lib/rbtree.c
> ===================================================================
> --- linux-2.6.33-rc2.orig/lib/rbtree.c
> +++ linux-2.6.33-rc2/lib/rbtree.c
> @@ -30,19 +30,19 @@ static void __rb_rotate_left(struct rb_n
>  
>         if ((node->rb_right = right->rb_left))
>                 rb_set_parent(right->rb_left, node);
> -       right->rb_left = node;
> +       rcu_assign_pointer(right->rb_left, node);
>  
>         rb_set_parent(right, parent);
>  
>         if (parent)
>         {
>                 if (node == parent->rb_left)
> -                       parent->rb_left = right;
> +                       rcu_assign_pointer(parent->rb_left, right);
>                 else
> -                       parent->rb_right = right;
> +                       rcu_assign_pointer(parent->rb_right, right);
>         }
>         else
> -               root->rb_node = right;
> +               rcu_assign_pointer(root->rb_node, right);
>         rb_set_parent(node, right);
>  }
>  
> @@ -53,19 +53,19 @@ static void __rb_rotate_right(struct rb_
>  
>         if ((node->rb_left = left->rb_right))
>                 rb_set_parent(left->rb_right, node);
> -       left->rb_right = node;
> +       rcu_assign_pointer(left->rb_right, node);
>  
>         rb_set_parent(left, parent);
>  
>         if (parent)
>         {
>                 if (node == parent->rb_right)
> -                       parent->rb_right = left;
> +                       rcu_assign_pointer(parent->rb_right, left);
>                 else
> -                       parent->rb_left = left;
> +                       rcu_assign_pointer(parent->rb_left, left);
>         }
>         else
> -               root->rb_node = left;
> +               rcu_assign_pointer(root->rb_node, left);
>         rb_set_parent(node, left);
>  }


Consider the tree rotation:


           Q                        P
         /   \                    /   \
       P       C                A       Q
     /   \                            /   \
   A       B                        B       C


Since this comprises 3 assignments (assuming a right rotation):

  Q.left = B
  P.right = Q
  parent = P

it is non-atomic. This in turn means that any lock-less descent into the
tree will be able to miss a whole subtree or worse (imagine us being at
Q, needing to go to A, then the rotation happens, and all we can choose
from is B or C).

Your changelog states as much.

"Even if RB-tree rotation occurs while we walk tree for look-up, we just
miss vma without oops."

However, since this is the case, do we still need the
rcu_assign_pointer() conversion your patch does? All I can see it do is
slow down all RB-tree users, without any gain.



* Re: [RFC PATCH] asynchronous page fault.
  2009-12-25  1:51 [RFC PATCH] asynchronous page fault KAMEZAWA Hiroyuki
  2009-12-27  9:47 ` Minchan Kim
  2009-12-27 11:19 ` Peter Zijlstra
@ 2009-12-27 12:03 ` Peter Zijlstra
  2009-12-28  0:36   ` KAMEZAWA Hiroyuki
  2010-01-02 21:45 ` Benjamin Herrenschmidt
  3 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-27 12:03 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Fri, 2009-12-25 at 10:51 +0900, KAMEZAWA Hiroyuki wrote:
>  /*
> + * Returns vma which contains given address. This scans rb-tree in speculative
> + * way and increment a reference count if found. Even if vma exists in rb-tree,
> + * this function may return NULL in racy case. So, this function cannot be used
> + * for checking whether given address is valid or not.
> + */
> +struct vm_area_struct *
> +find_vma_speculative(struct mm_struct *mm, unsigned long addr)
> +{
> +       struct vm_area_struct *vma = NULL;
> +       struct vm_area_struct *vma_tmp;
> +       struct rb_node *rb_node;
> +
> +       if (unlikely(!mm))
> +               return NULL;;
> +
> +       rcu_read_lock();
> +       rb_node = rcu_dereference(mm->mm_rb.rb_node);
> +       vma = NULL;
> +       while (rb_node) {
> +               vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
> +
> +               if (vma_tmp->vm_end > addr) {
> +                       vma = vma_tmp;
> +                       if (vma_tmp->vm_start <= addr)
> +                               break;
> +                       rb_node = rcu_dereference(rb_node->rb_left);
> +               } else
> +                       rb_node = rcu_dereference(rb_node->rb_right);
> +       }
> +       if (vma) {
> +               if ((vma->vm_start <= addr) && (addr < vma->vm_end)) {
> +                       if (!atomic_inc_not_zero(&vma->refcnt))

And here you destroy pretty much all advantage of having done the
lockless lookup ;-)

The idea is to let the RCU lock span whatever length you need the vma
for, the easy way is to simply use PREEMPT_RCU=y for now, the hard way
is to also incorporate the drop-mmap_sem on blocking patches from a
while ago.

> +                               vma = NULL;
> +               } else
> +                       vma = NULL;
> +       }
> +       rcu_read_unlock();
> +       return vma;
> +} 



* Re: [RFC PATCH] asynchronous page fault.
  2009-12-27  9:47 ` Minchan Kim
@ 2009-12-27 23:59   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-27 23:59 UTC (permalink / raw)
  To: Minchan Kim; +Cc: linux-kernel, linux-mm, cl

On Sun, 27 Dec 2009 18:47:25 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:
> > 
> > =
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> > Asynchronous page fault.
> > 
> > This patch is for avoidng mmap_sem in usual page fault. At running highly
> > multi-threaded programs, mm->mmap_sem can use much CPU because of false
> > sharing when it causes page fault in parallel. (Run after fork() is a typical
> > case, I think.)
> > This patch uses a speculative vma lookup to reduce that cost.
> > 
> > Considering vma lookup, rb-tree lookup, the only operation we do is checking
> > node->rb_left,rb_right. And there are no complicated operation.
> > At page fault, there are no demands for accessing sorted-vma-list or access
> > prev or next in many case. Except for stack-expansion, we always need a vma
> > which contains page-fault address. Then, we can access vma's RB-tree in
> > speculative way.
> > Even if RB-tree rotation occurs while we walk tree for look-up, we just
> > miss vma without oops. In other words, we can _try_ to find vma in lockless
> > manner. If failed, retry is ok.... we take lock and access vma.
> > 
> > For lockess walking, this uses RCU and adds find_vma_speculative(). And
> > per-vma wait-queue and reference count. This refcnt+wait_queue guarantees that
> > there are no thread which access the vma when we call subsystem's unmap
> > functions.
> > 
> > Test result on my tiny test program on 8core/2socket machine is here.
> > This measures how many page fault can occur in 60sec in parallel.
> > 
> > [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all-split 8
> > 
> >  Performance counter stats for './multi-fault-all-split 8' (5 runs):
> > 
> >        17481387  page-faults                ( +-   0.409% )
> >       509914595  cache-misses               ( +-   0.239% )
> > 
> >    60.002277793  seconds time elapsed   ( +-   0.000% )
> > 
> > 
> > [root@bluextal memory]# /root/bin/perf stat -e page-faults,cache-misses --repeat 5 ./multi-fault-all-split 8
> > 
> > 
> >  Performance counter stats for './multi-fault-all-split 8' (5 runs):
> > 
> >        35949073  page-faults                ( +-   0.364% )
> >       473091100  cache-misses               ( +-   0.304% )
> > 
> >    60.005444117  seconds time elapsed   ( +-   0.004% )
> > 
> > 
> > 
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> <snip>
> 
> 
> > +/* called when vma is unlinked and wait for all racy access.*/
> > +static void invalidate_vma_before_free(struct vm_area_struct *vma)
> > +{
> > +	atomic_dec(&vma->refcnt);
> > +	wait_event(vma->wait_queue, !atomic_read(&vma->refcnt));
> > +}
> 
> I think we have to make sure atomicity of both (atomic_dec and wait_event).
> 
I'm still considering how to do this. Something like

	atomic_sub(65536, &vma->refcnt);
	wait_event(..., atomic_read(&vma->refcnt) != 65536);

etc.
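
For illustration, one way to close the window Minchan describes is to make
whoever drops the last reference issue the wakeup unconditionally. The sketch
below is an editor's illustration using atomic_dec_and_test(), assuming the
rest of the refcounting stays as in the patch; it is not the fix that was
eventually settled on in this thread:

	/* hypothetical sketch, not from the patch */
	static void invalidate_vma_before_free(struct vm_area_struct *vma)
	{
		/* drop the initial reference taken in __vma_link_rb() */
		if (!atomic_dec_and_test(&vma->refcnt))
			wait_event(vma->wait_queue, !atomic_read(&vma->refcnt));
	}

	void vma_put(struct vm_area_struct *vma)
	{
		/* the final put always wakes the waiter, whoever it is */
		if (atomic_dec_and_test(&vma->refcnt))
			wake_up(&vma->wait_queue);
	}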



> > +
> >  /*
> >   * Requires inode->i_mapping->i_mmap_lock
> >   */
> > @@ -238,7 +256,7 @@ static struct vm_area_struct *remove_vma
> >  			removed_exe_file_vma(vma->vm_mm);
> >  	}
> >  	mpol_put(vma_policy(vma));
> > -	kmem_cache_free(vm_area_cachep, vma);
> > +	free_vma_rcu(vma);
> >  	return next;
> >  }
> >  
> > @@ -404,6 +422,8 @@ __vma_link_list(struct mm_struct *mm, st
> >  void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
> >  		struct rb_node **rb_link, struct rb_node *rb_parent)
> >  {
> > +	atomic_set(&vma->refcnt, 1);
> > +	init_waitqueue_head(&vma->wait_queue);
> >  	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
> >  	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
> >  }
> > @@ -614,6 +634,7 @@ again:			remove_next = 1 + (end > next->
> >  		 * us to remove next before dropping the locks.
> >  		 */
> >  		__vma_unlink(mm, next, vma);
> > +		invalidate_vma_before_free(next);
> >  		if (file)
> >  			__remove_shared_vm_struct(next, file, mapping);
> >  		if (next->anon_vma)
> > @@ -640,7 +661,7 @@ again:			remove_next = 1 + (end > next->
> >  		}
> >  		mm->map_count--;
> >  		mpol_put(vma_policy(next));
> > -		kmem_cache_free(vm_area_cachep, next);
> > +		free_vma_rcu(next);
> >  		/*
> >  		 * In mprotect's case 6 (see comments on vma_merge),
> >  		 * we must remove another next too. It would clutter
> > @@ -1544,6 +1565,55 @@ out:
> >  }
> >  
> >  /*
> > + * Returns vma which contains given address. This scans rb-tree in speculative
> > + * way and increment a reference count if found. Even if vma exists in rb-tree,
> > + * this function may return NULL in racy case. So, this function cannot be used
> > + * for checking whether given address is valid or not.
> > + */
> > +struct vm_area_struct *
> > +find_vma_speculative(struct mm_struct *mm, unsigned long addr)
> > +{
> > +	struct vm_area_struct *vma = NULL;
> > +	struct vm_area_struct *vma_tmp;
> > +	struct rb_node *rb_node;
> > +
> > +	if (unlikely(!mm))
> > +		return NULL;;
> > +
> > +	rcu_read_lock();
> > +	rb_node = rcu_dereference(mm->mm_rb.rb_node);
> > +	vma = NULL;
> > +	while (rb_node) {
> > +		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
> > +
> > +		if (vma_tmp->vm_end > addr) {
> > +			vma = vma_tmp;
> > +			if (vma_tmp->vm_start <= addr)
> > +				break;
> > +			rb_node = rcu_dereference(rb_node->rb_left);
> > +		} else
> > +			rb_node = rcu_dereference(rb_node->rb_right);
> > +	}
> > +	if (vma) {
> > +		if ((vma->vm_start <= addr) && (addr < vma->vm_end)) {
> > +			if (!atomic_inc_not_zero(&vma->refcnt))
> > +				vma = NULL;
> > +		} else
> > +			vma = NULL;
> > +	}
> > +	rcu_read_unlock();
> > +	return vma;
> > +}
> > +
> > +void vma_put(struct vm_area_struct *vma)
> > +{
> > +	if ((atomic_dec_return(&vma->refcnt) == 1) &&
> > +	    waitqueue_active(&vma->wait_queue))
> > +		wake_up(&vma->wait_queue);
> > +	return;
> > +}
> > +
> 
> Let's consider following case. 
> 
> CPU 0					CPU 1
> 
> find_vma_speculative(refcnt = 2)
> 					do_unmap 
> 					invaliate_vma_before_free(refcount = 1)
> 					wait_event
> vma_put
> refcnt = 0
> skip wakeup 
> 
> Hmm.. 

Nice catch. I'll change this logic. Maybe some easy trick can fix this.

Thanks,
-Kame



* Re: [RFC PATCH] asynchronous page fault.
  2009-12-27 11:19 ` Peter Zijlstra
@ 2009-12-28  0:00   ` KAMEZAWA Hiroyuki
  2009-12-28  0:57   ` Balbir Singh
  1 sibling, 0 replies; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-28  0:00 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Sun, 27 Dec 2009 12:19:56 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2009-12-25 at 10:51 +0900, KAMEZAWA Hiroyuki wrote:
> > Index: linux-2.6.33-rc2/lib/rbtree.c
> > ===================================================================
> > --- linux-2.6.33-rc2.orig/lib/rbtree.c
> > +++ linux-2.6.33-rc2/lib/rbtree.c
> > @@ -30,19 +30,19 @@ static void __rb_rotate_left(struct rb_n
> >  
> >         if ((node->rb_right = right->rb_left))
> >                 rb_set_parent(right->rb_left, node);
> > -       right->rb_left = node;
> > +       rcu_assign_pointer(right->rb_left, node);
> >  
> >         rb_set_parent(right, parent);
> >  
> >         if (parent)
> >         {
> >                 if (node == parent->rb_left)
> > -                       parent->rb_left = right;
> > +                       rcu_assign_pointer(parent->rb_left, right);
> >                 else
> > -                       parent->rb_right = right;
> > +                       rcu_assign_pointer(parent->rb_right, right);
> >         }
> >         else
> > -               root->rb_node = right;
> > +               rcu_assign_pointer(root->rb_node, right);
> >         rb_set_parent(node, right);
> >  }
> >  
> > @@ -53,19 +53,19 @@ static void __rb_rotate_right(struct rb_
> >  
> >         if ((node->rb_left = left->rb_right))
> >                 rb_set_parent(left->rb_right, node);
> > -       left->rb_right = node;
> > +       rcu_assign_pointer(left->rb_right, node);
> >  
> >         rb_set_parent(left, parent);
> >  
> >         if (parent)
> >         {
> >                 if (node == parent->rb_right)
> > -                       parent->rb_right = left;
> > +                       rcu_assign_pointer(parent->rb_right, left);
> >                 else
> > -                       parent->rb_left = left;
> > +                       rcu_assign_pointer(parent->rb_left, left);
> >         }
> >         else
> > -               root->rb_node = left;
> > +               rcu_assign_pointer(root->rb_node, left);
> >         rb_set_parent(node, left);
> >  }
> 
> 
> Consider the tree rotation:
> 
> 
>            Q                        P
>          /   \                    /   \
>        P       C                A       Q
>      /   \                            /   \
>    A       B                        B       C
> 
> 
> Since this comprises of 3 assignments (assuming right rotation):
> 
>   Q.left = B
>   P.right = Q
>   parent = P
> 
> it is non-atomic. This in turn means that any lock-less decent into the
> tree will be able to miss a whole subtree or worse (imagine us being at
> Q, needing to go to A, then the rotation happens, and all we can choose
> from is B or C).
> 
> Your changelog states as much.
> 
> "Even if RB-tree rotation occurs while we walk tree for look-up, we just
> miss vma without oops."
> 
> However, since this is the case, do we still need the
> rcu_assign_pointer() conversion your patch does? All I can see it do is
> slow down all RB-tree users, without any gain.
> 

Ok, I'll remove all.

Thanks,
-Kame





* Re: [RFC PATCH] asynchronous page fault.
  2009-12-27 12:03 ` Peter Zijlstra
@ 2009-12-28  0:36   ` KAMEZAWA Hiroyuki
  2009-12-28  1:19     ` KAMEZAWA Hiroyuki
                       ` (3 more replies)
  0 siblings, 4 replies; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-28  0:36 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Sun, 27 Dec 2009 13:03:11 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2009-12-25 at 10:51 +0900, KAMEZAWA Hiroyuki wrote:
> >  /*
> > + * Returns vma which contains given address. This scans rb-tree in speculative
> > + * way and increment a reference count if found. Even if vma exists in rb-tree,
> > + * this function may return NULL in racy case. So, this function cannot be used
> > + * for checking whether given address is valid or not.
> > + */
> > +struct vm_area_struct *
> > +find_vma_speculative(struct mm_struct *mm, unsigned long addr)
> > +{
> > +       struct vm_area_struct *vma = NULL;
> > +       struct vm_area_struct *vma_tmp;
> > +       struct rb_node *rb_node;
> > +
> > +       if (unlikely(!mm))
> > +               return NULL;;
> > +
> > +       rcu_read_lock();
> > +       rb_node = rcu_dereference(mm->mm_rb.rb_node);
> > +       vma = NULL;
> > +       while (rb_node) {
> > +               vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
> > +
> > +               if (vma_tmp->vm_end > addr) {
> > +                       vma = vma_tmp;
> > +                       if (vma_tmp->vm_start <= addr)
> > +                               break;
> > +                       rb_node = rcu_dereference(rb_node->rb_left);
> > +               } else
> > +                       rb_node = rcu_dereference(rb_node->rb_right);
> > +       }
> > +       if (vma) {
> > +               if ((vma->vm_start <= addr) && (addr < vma->vm_end)) {
> > +                       if (!atomic_inc_not_zero(&vma->refcnt))
> 
> And here you destroy pretty much all advantage of having done the
> lockless lookup ;-)
> 
Hmm? For single-threaded apps? Lockless lookup is not this patch's purpose,
it's just part of the work. My purpose is avoiding false sharing.

2.6.33-rc2's score of the same test program is here.

    75.42%  multi-fault-all  [kernel]                  [k] _raw_spin_lock_irqsav
            |
            --- _raw_spin_lock_irqsave
               |
               |--49.13%-- __down_read_trylock
               |          down_read_trylock
               |          do_page_fault
               |          page_fault
               |          0x400950
               |          |
               |           --100.00%-- (nil)
               |
               |--46.92%-- __up_read
               |          up_read
               |          |
               |          |--99.99%-- do_page_fault
               |          |          page_fault
               |          |          0x400950
               |          |          (nil)
               |           --0.01%-- [...]

Most of the time is spent in up_read/down_read.

Here is a comparison, on x86-64, between
 - page faults by 8 threads on one vma
 - page faults by 8 threads on 8 vmas.

== one vma ==
# Samples: 1338964273489
#
# Overhead          Command             Shared Object  Symbol
# ........  ...............  ........................  ......
#
    26.90%  multi-fault-all  [kernel]                  [k] clear_page_c
            |
            --- clear_page_c
                __alloc_pages_nodemask
                handle_mm_fault
                do_page_fault
                page_fault
                0x400940
               |
                --100.00%-- (nil)

    20.65%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
            |
            --- _raw_spin_lock
               |
               |--85.07%-- free_pcppages_bulk
               |          free_hot_cold_page

    ....<snip>
    3.94%  multi-fault-all  [kernel]                  [k] find_vma_speculative
            |
            --- find_vma_speculative
               |
               |--99.40%-- do_page_fault
               |          page_fault
               |          0x400940
               |          |
               |           --100.00%-- (nil)
               |
                --0.60%-- page_fault
                          0x400940
                          |
                           --100.00%-- (nil)
==

== 8 vma ==
    27.98%  multi-fault-all  [kernel]                  [k] clear_page_c
            |
            --- clear_page_c
                __alloc_pages_nodemask
                handle_mm_fault
                do_page_fault
                page_fault
                0x400950
               |
                --100.00%-- (nil)

    21.91%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
            |
            --- _raw_spin_lock
               |
               |--77.01%-- free_pcppages_bulk
               |          free_hot_cold_page
               |          __pagevec_free
               |          release_pages
...<snip>

     0.21%  multi-fault-all  [kernel]                  [k] find_vma_speculative
            |
            --- find_vma_speculative
               |
               |--87.50%-- do_page_fault
               |          page_fault
               |          0x400950
               |          |
               |           --100.00%-- (nil)
               |
                --12.50%-- page_fault
                          0x400950
                          |
                           --100.00%-- (nil)
==
Yes, this atomic_inc_not_zero() adds some overhead, but it isn't as bad as the
false sharing on mmap_sem. Anyway, as Minchan pointed out, this code contains a
bug; I'll reconsider this part.


> The idea is to let the RCU lock span whatever length you need the vma
> for, the easy way is to simply use PREEMPT_RCU=y for now, 

I tried to remove this kind of reference count trick, but I can't do that
without a synchronize_rcu() somewhere in the unmap code. I don't like that, so
I use this refcnt.
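
For illustration, the refcount-free alternative referred to here would look
roughly like the sketch below. It is an editor's sketch that assumes readers
keep rcu_read_lock() held across the whole fault (as Peter suggests above); the
exact placement in the munmap path is a guess:

	/* in the munmap path, after the vma is unlinked from the rb-tree
	 * and the vma list, but before the subsystem unmap/free calls run */
	rb_erase(&vma->vm_rb, &mm->mm_rb);
	synchronize_rcu();	/* no speculative walker can still see the vma */
	/* now safe to tear down page tables and free the vma */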

> the hard way
> is to also incorporate the drop-mmap_sem on blocking patches from a
> while ago.
> 
"drop-mmap_sem if block" is no help for this false-sharing problem.

Thanks,
-Kame






* Re: [RFC PATCH] asynchronous page fault.
  2009-12-27 11:19 ` Peter Zijlstra
  2009-12-28  0:00   ` KAMEZAWA Hiroyuki
@ 2009-12-28  0:57   ` Balbir Singh
  2009-12-28  1:05     ` KAMEZAWA Hiroyuki
  2009-12-28  8:32     ` Peter Zijlstra
  1 sibling, 2 replies; 32+ messages in thread
From: Balbir Singh @ 2009-12-28  0:57 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl

* Peter Zijlstra <peterz@infradead.org> [2009-12-27 12:19:56]:

> Your changelog states as much.
> 
> "Even if RB-tree rotation occurs while we walk tree for look-up, we just
> miss vma without oops."
> 
> However, since this is the case, do we still need the
> rcu_assign_pointer() conversion your patch does? All I can see it do is
> slow down all RB-tree users, without any gain.

Don't we need the rcu_assign_pointer() on the read side primarily to
make sure the pointer is still valid and assignments (writes) are not
re-ordered? Are you suggesting that the pointer assignment paths be
completely atomic?

-- 
	Balbir


* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  0:57   ` Balbir Singh
@ 2009-12-28  1:05     ` KAMEZAWA Hiroyuki
  2009-12-28  2:58       ` Balbir Singh
  2009-12-28  8:32     ` Peter Zijlstra
  1 sibling, 1 reply; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-28  1:05 UTC (permalink / raw)
  To: balbir; +Cc: Peter Zijlstra, linux-kernel, linux-mm, minchan.kim, cl

On Mon, 28 Dec 2009 06:27:46 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> * Peter Zijlstra <peterz@infradead.org> [2009-12-27 12:19:56]:
> 
> > Your changelog states as much.
> > 
> > "Even if RB-tree rotation occurs while we walk tree for look-up, we just
> > miss vma without oops."
> > 
> > However, since this is the case, do we still need the
> > rcu_assign_pointer() conversion your patch does? All I can see it do is
> > slow down all RB-tree users, without any gain.
> 
> Don't we need the rcu_assign_pointer() on the read side primarily to
> make sure the pointer is still valid and assignments (writes) are not
> re-ordered? Are you suggesting that the pointer assignment paths be
> completely atomic?
> 
For the following reasons.
  - What we have to avoid is touching unknown memory via a broken pointer.
    This is a speculative lookup and may miss vmas, so even if the tree looks
    inconsistent there is no problem; only a pointer that points somewhere
    outside the rb-tree would be a problem.
  - An rb-tree node's rb_left and rb_right never point to memory outside the
    rb-tree (or they are NULL), and vmas are not freed/reused while we hold
    rcu_read_lock(). So we never dive into unknown memory.
  - Therefore, we can skip rcu_assign_pointer().

Thanks,
-Kame



* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  0:36   ` KAMEZAWA Hiroyuki
@ 2009-12-28  1:19     ` KAMEZAWA Hiroyuki
  2009-12-28  8:30     ` Peter Zijlstra
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-28  1:19 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Peter Zijlstra, linux-kernel, linux-mm, minchan.kim, cl

On Mon, 28 Dec 2009 09:36:06 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> == one vma ==
> # Samples: 1338964273489
> #
> # Overhead          Command             Shared Object  Symbol
> # ........  ...............  ........................  ......
> #
>     26.90%  multi-fault-all  [kernel]                  [k] clear_page_c
>             |
>             --- clear_page_c
>                 __alloc_pages_nodemask
>                 handle_mm_fault
>                 do_page_fault
>                 page_fault
>                 0x400940
>                |
>                 --100.00%-- (nil)
> 
>     20.65%  multi-fault-all  [kernel]                  [k] _raw_spin_lock
>             |
>             --- _raw_spin_lock
>                |
>                |--85.07%-- free_pcppages_bulk
>                |          free_hot_cold_page
> 
>     ....<snip>
>     3.94%  multi-fault-all  [kernel]                  [k] find_vma_speculative
>             |
>             --- find_vma_speculative
>                |
>                |--99.40%-- do_page_fault
>                |          page_fault
>                |          0x400940
>                |          |
>                |           --100.00%-- (nil)
>                |
>                 --0.60%-- page_fault
>                           0x400940
>                           |
>                            --100.00%-- (nil)
> ==
> 
A small modification that avoids atomic_add_unless() gives the following score
(I used atomic_inc() followed by atomic_read() instead).
==
     2.55%  multi-fault-all  [kernel]                  [k] vma_put
            |
            --- vma_put
               |
               |--98.87%-- do_page_fault
               |          page_fault
               |          0x400940
               |          (nil)
               |
                --1.13%-- page_fault
                          0x400940
                          (nil)
     0.65%  multi-fault-all  [kernel]                  [k] find_vma_speculative
            |
            --- find_vma_speculative
               |
               |--95.55%-- do_page_fault
               |          page_fault
               |          0x400940
               |          |
               |           --100.00%-- (nil)
               |
                --4.45%-- page_fault
                          0x400940
                          (nil)
==
Hmm... maybe worth considering further.
I hope something good pops into my mind after the new year vacation.

Regards,
-Kame







* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  1:05     ` KAMEZAWA Hiroyuki
@ 2009-12-28  2:58       ` Balbir Singh
  2009-12-28  3:13         ` KAMEZAWA Hiroyuki
  2009-12-28  8:34         ` Peter Zijlstra
  0 siblings, 2 replies; 32+ messages in thread
From: Balbir Singh @ 2009-12-28  2:58 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Peter Zijlstra, linux-kernel, linux-mm, minchan.kim, cl

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2009-12-28 10:05:14]:

> On Mon, 28 Dec 2009 06:27:46 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
> > * Peter Zijlstra <peterz@infradead.org> [2009-12-27 12:19:56]:
> > 
> > > Your changelog states as much.
> > > 
> > > "Even if RB-tree rotation occurs while we walk tree for look-up, we just
> > > miss vma without oops."
> > > 
> > > However, since this is the case, do we still need the
> > > rcu_assign_pointer() conversion your patch does? All I can see it do is
> > > slow down all RB-tree users, without any gain.
> > 
> > Don't we need the rcu_assign_pointer() on the read side primarily to
> > make sure the pointer is still valid and assignments (writes) are not
> > re-ordered? Are you suggesting that the pointer assignment paths be
> > completely atomic?
> > 
> >From following reasons.
>   - What we have to avoid is not to touch unkonwn memory via broken pointer.
>     This is speculative look up and can miss vmas. So, even if tree is broken,
>     there is no problem. Broken pointer which points to places other than rb-tree
>     is problem.

Exactly!

>   - rb-tree's rb_left and rb_right don't points to memory other than
>     rb-tree. (or NULL)  And vmas are not freed/reused while rcu_read_lock().
>     Then, we don't dive into unknown memory.
>   - Then, we can skip rcu_assign_pointer().
>

We can, but the data seen on the read side is going to be more out-of-date
than with rcu_assign_pointer(). Do we need variants like an rcu_rb_next() to
avoid the overhead for everyone else?

-- 
	Balbir


* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  2:58       ` Balbir Singh
@ 2009-12-28  3:13         ` KAMEZAWA Hiroyuki
  2009-12-28  8:34         ` Peter Zijlstra
  1 sibling, 0 replies; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-28  3:13 UTC (permalink / raw)
  To: balbir; +Cc: Peter Zijlstra, linux-kernel, linux-mm, minchan.kim, cl

On Mon, 28 Dec 2009 08:28:39 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2009-12-28 10:05:14]:

> >   - rb-tree's rb_left and rb_right don't points to memory other than
> >     rb-tree. (or NULL)  And vmas are not freed/reused while rcu_read_lock().
> >     Then, we don't dive into unknown memory.
> >   - Then, we can skip rcu_assign_pointer().
> >
> 
> We can, but the data being on read-side is going to be out-of-date
> more than without the use of rcu_assign_pointer(). Do we need variants
> like to rcu_rb_next() to avoid overheads for everyone?
> 
I myself can't tell how often out-of-date data would be seen (because I use x86).

But I feel we wouldn't see a broken tree very often, because...
  - Single-threaded apps never see a broken tree.
  - Even if rb-tree modifications happen frequently, tree rotations are not
    that frequent, and sub-trees tend to stay stable as a chunk.

Hmm, adding a barrier like this?

static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev)
{
        prev->vm_next = vma->vm_next;
        rb_erase(&vma->vm_rb, &mm->mm_rb);
        if (mm->mmap_cache == vma)
                mm->mmap_cache = prev;
	smp_wmb(); <=============================================(new)
}



Regards,
-Kame



* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  0:36   ` KAMEZAWA Hiroyuki
  2009-12-28  1:19     ` KAMEZAWA Hiroyuki
@ 2009-12-28  8:30     ` Peter Zijlstra
  2009-12-28  9:58       ` KAMEZAWA Hiroyuki
  2009-12-28  8:55     ` Peter Zijlstra
  2009-12-28 11:43     ` Peter Zijlstra
  3 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-28  8:30 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Mon, 2009-12-28 at 09:36 +0900, KAMEZAWA Hiroyuki wrote:
> 
> > The idea is to let the RCU lock span whatever length you need the vma
> > for, the easy way is to simply use PREEMPT_RCU=y for now, 
> 
> I tried to remove his kind of reference count trick but I can't do that
> without synchronize_rcu() somewhere in unmap code. I don't like that and
> use this refcnt. 

Why, because otherwise we can access page tables for an already unmapped
vma? Yeah that is the interesting bit ;-)





* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  0:57   ` Balbir Singh
  2009-12-28  1:05     ` KAMEZAWA Hiroyuki
@ 2009-12-28  8:32     ` Peter Zijlstra
  2009-12-29  9:54       ` Balbir Singh
  1 sibling, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-28  8:32 UTC (permalink / raw)
  To: balbir; +Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl

On Mon, 2009-12-28 at 06:27 +0530, Balbir Singh wrote:
> * Peter Zijlstra <peterz@infradead.org> [2009-12-27 12:19:56]:
> 
> > Your changelog states as much.
> > 
> > "Even if RB-tree rotation occurs while we walk tree for look-up, we just
> > miss vma without oops."
> > 
> > However, since this is the case, do we still need the
> > rcu_assign_pointer() conversion your patch does? All I can see it do is
> > slow down all RB-tree users, without any gain.
> 
> Don't we need the rcu_assign_pointer() on the read side primarily to
> make sure the pointer is still valid and assignments (writes) are not
> re-ordered? Are you suggesting that the pointer assignment paths be
> completely atomic?

rcu_assign_pointer() is the write side, but if you need a barrier, you
can make do with a single smp_wmb() after doing the rb-tree op. There is
no need to add multiple barriers in the tree-ops themselves.

You cannot make the assignment paths atomic (without locks) that's the
whole problem.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  2:58       ` Balbir Singh
  2009-12-28  3:13         ` KAMEZAWA Hiroyuki
@ 2009-12-28  8:34         ` Peter Zijlstra
  1 sibling, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-28  8:34 UTC (permalink / raw)
  To: balbir; +Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl

On Mon, 2009-12-28 at 08:28 +0530, Balbir Singh wrote:

> We can, but the data being on read-side is going to be out-of-date
> more than without the use of rcu_assign_pointer(). Do we need variants
> like to rcu_rb_next() to avoid overheads for everyone?

More or less doesn't matter! As long as you cannot get it atomic there's
holes and you need to deal with it.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  0:36   ` KAMEZAWA Hiroyuki
  2009-12-28  1:19     ` KAMEZAWA Hiroyuki
  2009-12-28  8:30     ` Peter Zijlstra
@ 2009-12-28  8:55     ` Peter Zijlstra
  2009-12-28 10:08       ` KAMEZAWA Hiroyuki
  2009-12-28 11:43     ` Peter Zijlstra
  3 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-28  8:55 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Mon, 2009-12-28 at 09:36 +0900, KAMEZAWA Hiroyuki wrote:
> Hmm ? for single-thread apps ? This patch's purpose is not for lockless
> lookup, it's just a part of work. My purpose is avoiding false-sharing.

False sharing in the sense of the mmap_sem cacheline containing other
variables? How could that ever be a problem for a single threaded
application?

For multi-threaded apps the contention on that cacheline is the largest
issue, and moving it to a vma cacheline doesn't seem like a big
improvement.

You want something much finer grained than vmas, there's lots of apps
working on a single (or very few) vma(s). Leaving you with pretty much
the same cacheline contention. Only now it's a different cacheline.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  8:30     ` Peter Zijlstra
@ 2009-12-28  9:58       ` KAMEZAWA Hiroyuki
  2009-12-28 10:30         ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-28  9:58 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl

Peter Zijlstra wrote:
> On Mon, 2009-12-28 at 09:36 +0900, KAMEZAWA Hiroyuki wrote:
>>
>> > The idea is to let the RCU lock span whatever length you need the vma
>> > for, the easy way is to simply use PREEMPT_RCU=y for now,
>>
>> I tried to remove this kind of reference count trick but I can't do that
>> without synchronize_rcu() somewhere in unmap code. I don't like that and
>> use this refcnt.
>
> Why, because otherwise we can access page tables for an already unmapped
> vma? Yeah that is the interesting bit ;-)
>
Without that
  vma->a_ops->fault()
and
  vma->a_ops->unmap()
can be called at the same time, and vma->vm_file can be dropped while
vma->a_ops->fault() is called, etc.
Ah, I may be missing something. I'll think about it more next year.
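
For reference, the trade-off looks roughly like this (a sketch only;
vma_get()/vma_put() stand in for the refcnt + wait-queue helpers and are
not the real names):

    /* (a) pure RCU: every munmap pays for a grace period */
    rcu_read_lock();
    vma = find_vma_speculative(mm, address);
    vma->vm_ops->fault(vma, &vmf);   /* may sleep -> needs preemptible RCU */
    rcu_read_unlock();
    /* unmap side: rb_erase(); synchronize_rcu(); ->close(); fput(vm_file); */

    /* (b) refcnt + wait-queue: unmap only waits while a fault is in flight */
    if (!vma_get(vma))               /* fails once the vma has started dying */
            goto retry_with_mmap_sem;
    vma->vm_ops->fault(vma, &vmf);
    vma_put(vma);                    /* the final put wakes up the unmapper */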

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  8:55     ` Peter Zijlstra
@ 2009-12-28 10:08       ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-28 10:08 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl

Peter Zijlstra wrote:
> On Mon, 2009-12-28 at 09:36 +0900, KAMEZAWA Hiroyuki wrote:
>> Hmm ? for single-thread apps ? This patch's purpose is not for lockless
>> lookup, it's just a part of work. My purpose is avoiding false-sharing.
>
> False sharing in the sense of the mmap_sem cacheline containing other
> variables?  How could that ever be a problem for a single threaded
> application?
>
No problem at all. I just couldn't catch what you meant.


> For multi-threaded apps the contention on that cacheline is the largest
> issue, and moving it to a vma cacheline doesn't seem like a big
> improvement.
>
I feel mmap_sem's cacheline ping-pong is worse than a simple atomic_inc().
__down_read() does:
  write (spinlock)
  write (->sem_activity)
  write (unlock)


> You want something much finer grained than vmas, there's lots of apps
> working on a single (or very few) vma(s). Leaving you with pretty much
> the same cacheline contention. Only now its a different cacheline.
>
Yeah, maybe. I hope I can find some magical scheme.
Using a per-cpu counter here, as Christoph did, may be an idea...
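
A very rough sketch of that direction (the field and helper names here
are made up, and it glosses over the cost of a per-cpu counter per vma):

    /* fault side: only ever touches a CPU-local cacheline */
    this_cpu_inc(*vma->vm_fault_count);
    smp_mb();                            /* pairs with the barrier below */
    if (vma->vm_dead) {
            this_cpu_dec(*vma->vm_fault_count);
            return VM_FAULT_RETRY;       /* fall back to the mmap_sem path */
    }
    /* ... handle the fault ... */
    this_cpu_dec(*vma->vm_fault_count);

    /* unmap side: mark the vma dead first, then drain the readers */
    vma->vm_dead = 1;
    smp_mb();
    while (vma_fault_count_sum(vma))     /* for_each_possible_cpu() sum */
            cpu_relax();                 /* or sleep on a per-vma wait-queue */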

Thanks,
-Kame




^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  9:58       ` KAMEZAWA Hiroyuki
@ 2009-12-28 10:30         ` Peter Zijlstra
  2009-12-28 10:40           ` Peter Zijlstra
  2009-12-28 10:57           ` [RFC PATCH] asynchronous " KAMEZAWA Hiroyuki
  0 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-28 10:30 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Mon, 2009-12-28 at 18:58 +0900, KAMEZAWA Hiroyuki wrote:
> Peter Zijlstra wrote:
> > On Mon, 2009-12-28 at 09:36 +0900, KAMEZAWA Hiroyuki wrote:
> >>
> >> > The idea is to let the RCU lock span whatever length you need the vma
> >> > for, the easy way is to simply use PREEMPT_RCU=y for now,
> >>
> >> I tried to remove this kind of reference count trick but I can't do that
> >> without synchronize_rcu() somewhere in unmap code. I don't like that and
> >> use this refcnt.
> >
> > Why, because otherwise we can access page tables for an already unmapped
> > vma? Yeah that is the interesting bit ;-)
> >
> Without that
>   vma->a_ops->fault()
> and
>   vma->a_ops->unmap()
> can be called at the same time. and vma->vm_file can be dropped while
> vma->a_ops->fault() is called. etc...

Right, so acquiring the PTE lock will either instantiate page tables for
a non-existing vma, leaving you with an interesting mess to clean up, or
you can also RCU free the page tables (in the same RCU domain as the
vma) which will mostly[*] avoid that issue.

[ To make life really really interesting you could even re-use the
  page-tables and abort the RCU free when the region gets re-mapped
  before the RCU callbacks happen, this will avoid a free/alloc cycle
  for fast remapping workloads. ]

Once you hold the PTE lock, you can validate the vma you looked up,
since ->unmap() syncs against it. If at that time you find the
speculative vma is dead, you fail and re-try the fault.

[*] there still is the case of faulting on an address that didn't
previously have page-tables hence the unmap page table scan will have
skipped it -- my hacks simply leaked page tables here, but the idea was
to acquire the mmap_sem for reading and cleanup properly.
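
For that validation step, the speculative fault side would do roughly the
following (sketch; vma_is_dead() is a placeholder for whatever "this vma
died" test ends up being used):

    pgd = pgd_offset(mm, address);
    if (pgd_none(*pgd))
            return VM_FAULT_RETRY;      /* no page tables: retry under mmap_sem */
    pud = pud_offset(pgd, address);
    if (pud_none(*pud))
            return VM_FAULT_RETRY;
    pmd = pmd_offset(pud, address);
    if (pmd_none(*pmd))
            return VM_FAULT_RETRY;

    pte = pte_offset_map_lock(mm, pmd, address, &ptl);
    if (vma_is_dead(vma)) {             /* ->unmap() already ran, or is running */
            pte_unmap_unlock(pte, ptl);
            return VM_FAULT_RETRY;
    }
    /* unmap has to take this ptl to zap the pte, so from here on it
       serializes behind us for this page */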


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28 10:30         ` Peter Zijlstra
@ 2009-12-28 10:40           ` Peter Zijlstra
  2010-01-02 16:14             ` Peter Zijlstra
  2009-12-28 10:57           ` [RFC PATCH] asynchronous " KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-28 10:40 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Mon, 2009-12-28 at 11:30 +0100, Peter Zijlstra wrote:
> On Mon, 2009-12-28 at 18:58 +0900, KAMEZAWA Hiroyuki wrote:
> > Peter Zijlstra wrote:
> > > On Mon, 2009-12-28 at 09:36 +0900, KAMEZAWA Hiroyuki wrote:
> > >>
> > >> > The idea is to let the RCU lock span whatever length you need the vma
> > >> > for, the easy way is to simply use PREEMPT_RCU=y for now,
> > >>
> > >> I tried to remove this kind of reference count trick but I can't do that
> > >> without synchronize_rcu() somewhere in unmap code. I don't like that and
> > >> use this refcnt.
> > >
> > > Why, because otherwise we can access page tables for an already unmapped
> > > vma? Yeah that is the interesting bit ;-)
> > >
> > Without that
> >   vma->a_ops->fault()
> > and
> >   vma->a_ops->unmap()
> > can be called at the same time. and vma->vm_file can be dropped while
> > vma->a_ops->fault() is called. etc...
> 
> Right, so acquiring the PTE lock will either instantiate page tables for
> a non-existing vma, leaving you with an interesting mess to clean up, or
> you can also RCU free the page tables (in the same RCU domain as the
> vma) which will mostly[*] avoid that issue.
> 
> [ To make live really really interesting you could even re-use the
>   page-tables and abort the RCU free when the region gets re-mapped
>   before the RCU callbacks happen, this will avoid a free/alloc cycle
>   for fast remapping workloads. ]
> 
> Once you hold the PTE lock, you can validate the vma you looked up,
> since ->unmap() syncs against it. If at that time you find the
> speculative vma is dead, you fail and re-try the fault.
> 
> [*] there still is the case of faulting on an address that didn't
> previously have page-tables hence the unmap page table scan will have
> skipped it -- my hacks simply leaked page tables here, but the idea was
> to acquire the mmap_sem for reading and cleanup properly.

Alternatively, we could mark vma's dead in some way before we do the
unmap, then whenever we hit the page-table alloc path, we check against
the speculative vma and bail if it died.
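
In rough pseudo-C (vm_dead is a made-up flag; the real marking could just
as well be RB_EMPTY_NODE() or a seqcount):

    /* unmap side, before zapping and freeing the page tables */
    vma->vm_dead = 1;
    smp_wmb();

    /* speculative fault side, before any page-table allocation it would
       otherwise do (the authoritative re-check still happens later,
       under the pte lock): */
    smp_rmb();
    if (vma->vm_dead)
            return VM_FAULT_RETRY;      /* raced with munmap, redo under mmap_sem */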

That might just work.. will need to ponder it a bit more.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28 10:30         ` Peter Zijlstra
  2009-12-28 10:40           ` Peter Zijlstra
@ 2009-12-28 10:57           ` KAMEZAWA Hiroyuki
  2009-12-28 11:06             ` Peter Zijlstra
  1 sibling, 1 reply; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-12-28 10:57 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl

Peter Zijlstra wrote:
> On Mon, 2009-12-28 at 18:58 +0900, KAMEZAWA Hiroyuki wrote:
>> Peter Zijlstra wrote:
>> > On Mon, 2009-12-28 at 09:36 +0900, KAMEZAWA Hiroyuki wrote:
>> >>
>> >> > The idea is to let the RCU lock span whatever length you need the
>> vma
>> >> > for, the easy way is to simply use PREEMPT_RCU=y for now,
>> >>
>> >> I tried to remove this kind of reference count trick but I can't do
>> that
>> >> without synchronize_rcu() somewhere in unmap code. I don't like that
>> and
>> >> use this refcnt.
>> >
>> > Why, because otherwise we can access page tables for an already
>> unmapped
>> > vma? Yeah that is the interesting bit ;-)
>> >
>> Without that
>>   vma->a_ops->fault()
>> and
>>   vma->a_ops->unmap()
>> can be called at the same time. and vma->vm_file can be dropped while
>> vma->a_ops->fault() is called. etc...
>
> Right, so acquiring the PTE lock will either instantiate page tables for
> a non-existing vma, leaving you with an interesting mess to clean up, or
> you can also RCU free the page tables (in the same RCU domain as the
> vma) which will mostly[*] avoid that issue.
>
> [ To make live really really interesting you could even re-use the
>   page-tables and abort the RCU free when the region gets re-mapped
>   before the RCU callbacks happen, this will avoid a free/alloc cycle
>   for fast remapping workloads. ]
>
> Once you hold the PTE lock, you can validate the vma you looked up,
> since ->unmap() syncs against it. If at that time you find the
> speculative vma is dead, you fail and re-try the fault.
>
My previous version did something similar but still used vma->refcnt. I'll think it over again.

> [*] there still is the case of faulting on an address that didn't
> previously have page-tables hence the unmap page table scan will have
> skipped it -- my hacks simply leaked page tables here, but the idea was
> to acquire the mmap_sem for reading and cleanup properly.
>
Hmm, thank you for the hints.

But the current implementation is the way it is for a few reasons:
  - pmd has some troubles because of quicklists... I didn't want to touch
    their free routines.
  - a pmd can be removed asynchronously while a page fault is going on.
  - I'd like to avoid modifying free_pte_range etc.

I feel the pmd/page-table-lock is a harder object to handle than expected.

I'll think about a per-thread approach, a split-vma approach, a scalable
range lock, or some synchronization without heavy atomic ops.

Anyway, I think I've shown that something can be done without modifying mmap_sem.
See you next year.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28 10:57           ` [RFC PATCH] asynchronous " KAMEZAWA Hiroyuki
@ 2009-12-28 11:06             ` Peter Zijlstra
  0 siblings, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-28 11:06 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Mon, 2009-12-28 at 19:57 +0900, KAMEZAWA Hiroyuki wrote:
>   - pmd has some troubles because of quicklists... I didn't want to touch
>     their free routines.

I really doubt the value of that quicklist horror. IIRC x86 stopped
supporting that a while ago as well.

I would suspect the page-table retention scheme possible with RCU-freed
page tables could be far more efficient than quicklists, but then that's
all speculation, since I don't know what kind of workloads we're talking
about and there's this glaring lack of an implementation to compare against.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  0:36   ` KAMEZAWA Hiroyuki
                       ` (2 preceding siblings ...)
  2009-12-28  8:55     ` Peter Zijlstra
@ 2009-12-28 11:43     ` Peter Zijlstra
  3 siblings, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2009-12-28 11:43 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Mon, 2009-12-28 at 09:36 +0900, KAMEZAWA Hiroyuki wrote:
> > the hard way
> > is to also incorporate the drop-mmap_sem on blocking patches from a
> > while ago.
> > 
> "drop-mmap_sem if block" is no help for this false-sharing problem.

No but it does help with the problem of RCU-read-lock not being able to
sleep.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28  8:32     ` Peter Zijlstra
@ 2009-12-29  9:54       ` Balbir Singh
  0 siblings, 0 replies; 32+ messages in thread
From: Balbir Singh @ 2009-12-29  9:54 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl

* Peter Zijlstra <peterz@infradead.org> [2009-12-28 09:32:53]:

> On Mon, 2009-12-28 at 06:27 +0530, Balbir Singh wrote:
> > * Peter Zijlstra <peterz@infradead.org> [2009-12-27 12:19:56]:
> > 
> > > Your changelog states as much.
> > > 
> > > "Even if RB-tree rotation occurs while we walk tree for look-up, we just
> > > miss vma without oops."
> > > 
> > > However, since this is the case, do we still need the
> > > rcu_assign_pointer() conversion your patch does? All I can see it do is
> > > slow down all RB-tree users, without any gain.
> > 
> > Don't we need the rcu_assign_pointer() on the read side primarily to
> > make sure the pointer is still valid and assignments (writes) are not
> > re-ordered? Are you suggesting that the pointer assignment paths be
> > completely atomic?
> 
> rcu_assign_pointer() is the write side, but if you need a barrier, you
> can make do with a single smp_wmb() after doing the rb-tree op. There is
> no need to add multiple in the tree-ops themselves.
>

Yes, that makes sense.
 
> You cannot make the assignment paths atomic (without locks) that's the
> whole problem.
>

True, but pre-emption can be nasty in some cases. But I am no expert
on the atomicity of operations like assignments across architectures;
I assume all word and long assignments are atomic.
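
On the read side that mostly comes down to ACCESS_ONCE() on naturally
aligned pointer loads; e.g. a lockless containing-vma walk (under
rcu_read_lock(), and find_vma_lockless() is just an illustrative name)
could look like:

    struct vm_area_struct *find_vma_lockless(struct mm_struct *mm,
                                             unsigned long address)
    {
            struct rb_node *node = ACCESS_ONCE(mm->mm_rb.rb_node);

            while (node) {
                    struct vm_area_struct *vma;

                    vma = rb_entry(node, struct vm_area_struct, vm_rb);
                    if (address < vma->vm_start)
                            node = ACCESS_ONCE(node->rb_left);  /* one aligned load */
                    else if (address >= vma->vm_end)
                            node = ACCESS_ONCE(node->rb_right);
                    else
                            return vma;     /* address is inside this vma */
            }
            return NULL;    /* not mapped, or we raced with a rotation */
    }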

-- 
	Balbir

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-28 10:40           ` Peter Zijlstra
@ 2010-01-02 16:14             ` Peter Zijlstra
  2010-01-04  3:02               ` Paul E. McKenney
  2010-01-04 13:48               ` [RFC PATCH -v2] speculative " Peter Zijlstra
  0 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2010-01-02 16:14 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, minchan.kim, cl, Paul E. McKenney,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Mon, 2009-12-28 at 11:40 +0100, Peter Zijlstra wrote:

> > Right, so acquiring the PTE lock will either instantiate page tables for
> > a non-existing vma, leaving you with an interesting mess to clean up, or
> > you can also RCU free the page tables (in the same RCU domain as the
> > vma) which will mostly[*] avoid that issue.
> > 
> > [ To make live really really interesting you could even re-use the
> >   page-tables and abort the RCU free when the region gets re-mapped
> >   before the RCU callbacks happen, this will avoid a free/alloc cycle
> >   for fast remapping workloads. ]
> > 
> > Once you hold the PTE lock, you can validate the vma you looked up,
> > since ->unmap() syncs against it. If at that time you find the
> > speculative vma is dead, you fail and re-try the fault.
> > 
> > [*] there still is the case of faulting on an address that didn't
> > previously have page-tables hence the unmap page table scan will have
> > skipped it -- my hacks simply leaked page tables here, but the idea was
> > to acquire the mmap_sem for reading and cleanup properly.
> 
> Alternatively, we could mark vma's dead in some way before we do the
> unmap, then whenever we hit the page-table alloc path, we check against
> the speculative vma and bail if it died.
> 
> That might just work.. will need to ponder it a bit more.

Right, so I don't think we need RCU page tables on x86. All we need is
some extension to the fast_gup() stuff.

Nor do we need to modify the page-table alloc paths. All we need to do
is have 2 versions of the page table walks like those in
handle_mm_fault().

What we do need is to have call_srcu() for the VMAs since all this fault
stuff can block in many ways. And we need to tag 'dead' VMAs as such
(before doing the unmap).

[ Paul, pretty please? :-) ]
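
Assuming call_srcu() grows the same shape as call_rcu() -- it doesn't
exist yet, which is the request above -- the vma side would look roughly
like:

    static struct srcu_struct vma_srcu;     /* init_srcu_struct()ed at boot */

    /* fault side: srcu_read_lock() may be held across a sleeping ->fault() */
    idx = srcu_read_lock(&vma_srcu);
    vma = find_vma(mm, address);            /* speculative lookup, no mmap_sem */
    ...
    srcu_read_unlock(&vma_srcu, idx);

    /* free side, instead of an immediate kmem_cache_free(vm_area_cachep, vma) */
    call_srcu(&vma_srcu, &vma->vm_rcu_head, free_vma_rcu);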

We also need to introduce FAULT_FLAG_SPECULATIVE to tell the rest of the
fault code about us not holding mmap_sem. And add a fault return state
like VM_FAULT_RETRY to retry the fault while holding the mmap_sem.

Then we need alternative page table walkers; currently things like
handle_mm_fault() use the p*_alloc*() like routines, but for
FAULT_FLAG_SPECULATIVE we need to use the p*_offset*() variants like in
follow_pte(). If that fails to find the pte, we return VM_FAULT_RETRY.

[ One sad consequence is that this still requires the mmap_sem for
  every page table alloc, but since a pte can hold lots of pages it
  should hopefully work out nicely ]

The above is the tricky bit since that can race with unmap, which is
where the fast_gup() stuff comes into play. fast_gup() has the exact
same problem, and already solved it for us. So we need a speculative
page table walker that does whatever fast_gup() does, which on x86 is
disabling IRQs (powerpc has RCU-freed page tables).

Now, at all sites where we actually lock the ptl we need to redo that
page table walk; failing to find the pte will again return
VM_FAULT_RETRY. Once we lock the ptl we need to check the VMA's dead
state; if it is dead we also bail with VM_FAULT_RETRY, otherwise we're
good and can continue.


Something like the below, which 'sometimes' boots on my dual core and
when it boots seems to survive building a kernel... definitely needs
more work.

---
 arch/x86/mm/fault.c      |    8 ++
 arch/x86/mm/gup.c        |   10 ++
 include/linux/mm.h       |   14 +++
 include/linux/mm_types.h |    4 +
 init/Kconfig             |   34 +++---
 kernel/sched.c           |    9 ++-
 mm/memory.c              |  293 +++++++++++++++++++++++++++++++--------------
 mm/mmap.c                |   31 +++++-
 mm/util.c                |   12 ++-
 9 files changed, 302 insertions(+), 113 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f627779..c748529 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1040,6 +1040,14 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		return;
 	}
 
+	if (error_code & PF_USER) {
+		fault = handle_speculative_fault(mm, address,
+				error_code & PF_WRITE ? FAULT_FLAG_WRITE : 0);
+
+		if (!(fault & (VM_FAULT_ERROR | VM_FAULT_RETRY)))
+			return;
+	}
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bc..6eeaef7 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -373,3 +373,13 @@ slow_irqon:
 		return ret;
 	}
 }
+
+void pin_page_tables(void)
+{
+	local_irq_disable();
+}
+
+void unpin_page_tables(void)
+{
+	local_irq_enable();
+}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2265f28..7bc94f9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -136,6 +136,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
 #define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_SPECULATIVE	0x08
 
 /*
  * This interface is used by x86 PAT code to identify a pfn mapping that is
@@ -711,6 +712,7 @@ static inline int page_mapped(struct page *page)
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
+#define VM_FAULT_RETRY  0x0400
 
 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
 
@@ -763,6 +765,14 @@ unsigned long unmap_vmas(struct mmu_gather **tlb,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
 
+static inline int vma_is_dead(struct vm_area_struct *vma, unsigned int sequence)
+{
+	int ret = RB_EMPTY_NODE(&vma->vm_rb);
+	unsigned seq = vma->vm_sequence.sequence;
+	smp_rmb();
+	return ret || (seq & 1) || seq != sequence;
+}
+
 /**
  * mm_walk - callbacks for walk_page_range
  * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
@@ -819,6 +829,8 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
+extern int handle_speculative_fault(struct mm_struct *mm,
+			unsigned long address, unsigned int flags);
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
@@ -838,6 +850,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			struct page **pages, struct vm_area_struct **vmas);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
+void pin_page_tables(void);
+void unpin_page_tables(void);
 struct page *get_dump_page(unsigned long addr);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 84a524a..0727300 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,8 @@
 #include <linux/completion.h>
 #include <linux/cpumask.h>
 #include <linux/page-debug-flags.h>
+#include <linux/seqlock.h>
+#include <linux/rcupdate.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -186,6 +188,8 @@ struct vm_area_struct {
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
+	seqcount_t vm_sequence;
+	struct rcu_head vm_rcu_head;
 };
 
 struct core_thread {
diff --git a/init/Kconfig b/init/Kconfig
index 06dab27..5edae47 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -314,19 +314,19 @@ menu "RCU Subsystem"
 
 choice
 	prompt "RCU Implementation"
-	default TREE_RCU
+	default TREE_PREEMPT_RCU
 
-config TREE_RCU
-	bool "Tree-based hierarchical RCU"
-	help
-	  This option selects the RCU implementation that is
-	  designed for very large SMP system with hundreds or
-	  thousands of CPUs.  It also scales down nicely to
-	  smaller systems.
+#config TREE_RCU
+#	bool "Tree-based hierarchical RCU"
+#	help
+#	  This option selects the RCU implementation that is
+#	  designed for very large SMP system with hundreds or
+#	  thousands of CPUs.  It also scales down nicely to
+#	  smaller systems.
 
 config TREE_PREEMPT_RCU
 	bool "Preemptable tree-based hierarchical RCU"
-	depends on PREEMPT
+#	depends on PREEMPT
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP systems with hundreds or
@@ -334,14 +334,14 @@ config TREE_PREEMPT_RCU
 	  is also required.  It also scales down nicely to
 	  smaller systems.
 
-config TINY_RCU
-	bool "UP-only small-memory-footprint RCU"
-	depends on !SMP
-	help
-	  This option selects the RCU implementation that is
-	  designed for UP systems from which real-time response
-	  is not required.  This option greatly reduces the
-	  memory footprint of RCU.
+#config TINY_RCU
+#	bool "UP-only small-memory-footprint RCU"
+#	depends on !SMP
+#	help
+#	  This option selects the RCU implementation that is
+#	  designed for UP systems from which real-time response
+#	  is not required.  This option greatly reduces the
+#	  memory footprint of RCU.
 
 endchoice
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 22c14eb..21cdc52 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9689,7 +9689,14 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+	int nested = (preempt_count() & ~PREEMPT_ACTIVE)
+		/*
+		 * remove this for we need preemptible RCU
+		 * exactly because it needs to sleep..
+		 *
+		 + rcu_preempt_depth()
+		 */
+		;
 
 	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
 }
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1b..ace6645 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1919,31 +1919,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_file_page
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page and do_no_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-				pte_t *page_table, pte_t orig_pte)
-{
-	int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spinlock_t *ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		same = pte_same(*page_table, orig_pte);
-		spin_unlock(ptl);
-	}
-#endif
-	pte_unmap(page_table);
-	return same;
-}
-
-/*
  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
  * servicing faults for write access.  In the normal case, do always want
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
@@ -1982,6 +1957,52 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		copy_user_highpage(dst, src, va, vma);
 }
 
+static int pte_map_lock(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		unsigned int seq, pte_t **ptep, spinlock_t **ptlp)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+
+	if (!(flags & FAULT_FLAG_SPECULATIVE)) {
+		*ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+		return 1;
+	}
+
+	pin_page_tables();
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto out;
+
+	if (pmd_huge(*pmd))
+		goto out;
+
+	*ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+	if (!*ptep)
+		goto out;
+
+	if (vma && vma_is_dead(vma, seq))
+		goto unlock;
+
+	unpin_page_tables();
+	return 1;
+
+unlock:
+	pte_unmap_unlock(*ptep, *ptlp);
+out:
+	unpin_page_tables();
+	return 0;
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -2002,7 +2023,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		spinlock_t *ptl, pte_t orig_pte)
+		spinlock_t *ptl, unsigned int flags, pte_t orig_pte,
+		unsigned int seq)
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
@@ -2034,8 +2056,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
 			lock_page(old_page);
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
+
+			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
+						&page_table, &ptl)) {
+				unlock_page(old_page);
+				ret = VM_FAULT_RETRY;
+				goto err;
+			}
+
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
 				page_cache_release(old_page);
@@ -2077,14 +2105,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			if (unlikely(tmp &
 					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
 				ret = tmp;
-				goto unwritable_page;
+				goto err;
 			}
 			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
 				lock_page(old_page);
 				if (!old_page->mapping) {
 					ret = 0; /* retry the fault */
 					unlock_page(old_page);
-					goto unwritable_page;
+					goto err;
 				}
 			} else
 				VM_BUG_ON(!PageLocked(old_page));
@@ -2095,8 +2123,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 * they did, we just return, as we can count on the
 			 * MMU to tell us if they didn't also make it writable.
 			 */
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
+			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
+						&page_table, &ptl)) {
+				unlock_page(old_page);
+				ret = VM_FAULT_RETRY;
+				goto err;
+			}
+
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
 				page_cache_release(old_page);
@@ -2128,17 +2161,23 @@ reuse:
 gotten:
 	pte_unmap_unlock(page_table, ptl);
 
-	if (unlikely(anon_vma_prepare(vma)))
-		goto oom;
+	if (unlikely(anon_vma_prepare(vma))) {
+		ret = VM_FAULT_OOM;
+		goto err;
+	}
 
 	if (is_zero_pfn(pte_pfn(orig_pte))) {
 		new_page = alloc_zeroed_user_highpage_movable(vma, address);
-		if (!new_page)
-			goto oom;
+		if (!new_page) {
+			ret = VM_FAULT_OOM;
+			goto err;
+		}
 	} else {
 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-		if (!new_page)
-			goto oom;
+		if (!new_page) {
+			ret = VM_FAULT_OOM;
+			goto err;
+		}
 		cow_user_page(new_page, old_page, address, vma);
 	}
 	__SetPageUptodate(new_page);
@@ -2153,13 +2192,20 @@ gotten:
 		unlock_page(old_page);
 	}
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
-		goto oom_free_new;
+	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+		ret = VM_FAULT_OOM;
+		goto err_free_new;
+	}
 
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
+		mem_cgroup_uncharge_page(new_page);
+		ret = VM_FAULT_RETRY;
+		goto err_free_new;
+	}
+
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
@@ -2258,9 +2304,9 @@ unlock:
 			file_update_time(vma->vm_file);
 	}
 	return ret;
-oom_free_new:
+err_free_new:
 	page_cache_release(new_page);
-oom:
+err:
 	if (old_page) {
 		if (page_mkwrite) {
 			unlock_page(old_page);
@@ -2268,10 +2314,6 @@ oom:
 		}
 		page_cache_release(old_page);
 	}
-	return VM_FAULT_OOM;
-
-unwritable_page:
-	page_cache_release(old_page);
 	return ret;
 }
 
@@ -2508,22 +2550,23 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		pte_t orig_pte, unsigned int seq)
 {
 	spinlock_t *ptl;
 	struct page *page;
 	swp_entry_t entry;
-	pte_t pte;
+	pte_t *page_table, pte;
 	struct mem_cgroup *ptr = NULL;
 	int ret = 0;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		goto out;
-
 	entry = pte_to_swp_entry(orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
+			if (flags & FAULT_FLAG_SPECULATIVE) {
+				ret = VM_FAULT_RETRY;
+				goto out;
+			}
 			migration_entry_wait(mm, pmd, address);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
@@ -2544,7 +2587,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 * Back out if somebody else faulted in this pte
 			 * while we released the pte lock.
 			 */
-			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
+						&page_table, &ptl)) {
+				ret = VM_FAULT_RETRY;
+				goto out;
+			}
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
 			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2581,7 +2628,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * Back out if somebody else already faulted in this pte.
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
+		ret = VM_FAULT_RETRY;
+		goto out_nolock;
+	}
+
 	if (unlikely(!pte_same(*page_table, orig_pte)))
 		goto out_nomap;
 
@@ -2622,7 +2673,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unlock_page(page);
 
 	if (flags & FAULT_FLAG_WRITE) {
-		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+		ret |= do_wp_page(mm, vma, address, page_table, pmd,
+				ptl, flags, pte, seq);
 		if (ret & VM_FAULT_ERROR)
 			ret &= VM_FAULT_ERROR;
 		goto out;
@@ -2635,8 +2687,9 @@ unlock:
 out:
 	return ret;
 out_nomap:
-	mem_cgroup_cancel_charge_swapin(ptr);
 	pte_unmap_unlock(page_table, ptl);
+out_nolock:
+	mem_cgroup_cancel_charge_swapin(ptr);
 out_page:
 	unlock_page(page);
 out_release:
@@ -2650,18 +2703,19 @@ out_release:
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags)
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		unsigned int seq)
 {
 	struct page *page;
 	spinlock_t *ptl;
-	pte_t entry;
+	pte_t entry, *page_table;
 
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
-		ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
+		if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
+					&page_table, &ptl))
+			return VM_FAULT_RETRY;
 		if (!pte_none(*page_table))
 			goto unlock;
 		goto setpte;
@@ -2684,7 +2738,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
+		mem_cgroup_uncharge_page(page);
+		page_cache_release(page);
+		return VM_FAULT_RETRY;
+	}
+
 	if (!pte_none(*page_table))
 		goto release;
 
@@ -2722,8 +2781,8 @@ oom:
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pmd_t *pmd,
-		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd, pgoff_t pgoff,
+		unsigned int flags, pte_t orig_pte, unsigned int seq)
 {
 	pte_t *page_table;
 	spinlock_t *ptl;
@@ -2823,7 +2882,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	}
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
+		ret = VM_FAULT_RETRY;
+		goto out_uncharge;
+	}
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -2856,7 +2918,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
+		pte_unmap_unlock(page_table, ptl);
 	} else {
+		pte_unmap_unlock(page_table, ptl);
+out_uncharge:
 		if (charged)
 			mem_cgroup_uncharge_page(page);
 		if (anon)
@@ -2865,8 +2930,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			anon = 1; /* no anon but release faulted_page */
 	}
 
-	pte_unmap_unlock(page_table, ptl);
-
 out:
 	if (dirty_page) {
 		struct address_space *mapping = page->mapping;
@@ -2900,14 +2963,13 @@ unwritable_page:
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd,
+		unsigned int flags, pte_t orig_pte, unsigned int seq)
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-	pte_unmap(page_table);
-	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, seq);
 }
 
 /*
@@ -2920,16 +2982,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd,
+		unsigned int flags, pte_t orig_pte, unsigned int seq)
 {
 	pgoff_t pgoff;
 
 	flags |= FAULT_FLAG_NONLINEAR;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return 0;
-
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
@@ -2939,7 +2998,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	pgoff = pte_to_pgoff(orig_pte);
-	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, seq);
 }
 
 /*
@@ -2957,37 +3016,38 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
-		pte_t *pte, pmd_t *pmd, unsigned int flags)
+		pte_t entry, pmd_t *pmd, unsigned int flags,
+		unsigned int seq)
 {
-	pte_t entry;
 	spinlock_t *ptl;
+	pte_t *pte;
 
-	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
 				if (likely(vma->vm_ops->fault))
 					return do_linear_fault(mm, vma, address,
-						pte, pmd, flags, entry);
+						pmd, flags, entry, seq);
 			}
 			return do_anonymous_page(mm, vma, address,
-						 pte, pmd, flags);
+						 pmd, flags, seq);
 		}
 		if (pte_file(entry))
 			return do_nonlinear_fault(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry, seq);
 		return do_swap_page(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry, seq);
 	}
 
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &pte, &ptl))
+		return VM_FAULT_RETRY;
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (flags & FAULT_FLAG_WRITE) {
-		if (!pte_write(entry))
+		if (!pte_write(entry)) {
 			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+					pte, pmd, ptl, flags, entry, seq);
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
@@ -3017,7 +3077,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t *pte, entry;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -3037,9 +3097,60 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte)
 		return VM_FAULT_OOM;
 
-	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+	entry = *pte;
+
+	pte_unmap(pte);
+
+	return handle_pte_fault(mm, vma, address, entry, pmd, flags, 0);
+}
+
+int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+		unsigned int flags)
+{
+	pmd_t *pmd = NULL;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	struct vm_area_struct *vma;
+	unsigned int seq;
+	int ret = VM_FAULT_RETRY;
+	int dead;
+
+	__set_current_state(TASK_RUNNING);
+	flags |= FAULT_FLAG_SPECULATIVE;
+
+	count_vm_event(PGFAULT);
+
+	rcu_read_lock();
+	if (!pte_map_lock(mm, NULL, address, pmd, flags, 0, &pte, &ptl))
+		goto out_unlock;
+
+	vma = find_vma(mm, address);
+	if (!(vma && vma->vm_end > address && vma->vm_start <= address))
+		goto out_unmap;
+
+	dead = RB_EMPTY_NODE(&vma->vm_rb);
+	seq = vma->vm_sequence.sequence;
+	smp_rmb();
+
+	if (dead || seq & 1)
+		goto out_unmap;
+
+	entry = *pte;
+
+	pte_unmap_unlock(pte, ptl);
+
+	ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
+
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+
+out_unmap:
+	pte_unmap_unlock(pte, ptl);
+	goto out_unlock;
 }
 
+
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
diff --git a/mm/mmap.c b/mm/mmap.c
index d9c77b2..024e406 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -222,6 +222,19 @@ void unlink_file_vma(struct vm_area_struct *vma)
 	}
 }
 
+static void free_vma_rcu(struct rcu_head *head)
+{
+	struct vm_area_struct *vma =
+		container_of(head, struct vm_area_struct, vm_rcu_head);
+
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+static void free_vma(struct vm_area_struct *vma)
+{
+	call_rcu(&vma->vm_rcu_head, free_vma_rcu);
+}
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -238,7 +251,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 			removed_exe_file_vma(vma->vm_mm);
 	}
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma);
 	return next;
 }
 
@@ -488,6 +501,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	prev->vm_next = vma->vm_next;
 	rb_erase(&vma->vm_rb, &mm->mm_rb);
+	smp_wmb();
+	RB_CLEAR_NODE(&vma->vm_rb);
 	if (mm->mmap_cache == vma)
 		mm->mmap_cache = prev;
 }
@@ -512,6 +527,10 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	long adjust_next = 0;
 	int remove_next = 0;
 
+	write_seqcount_begin(&vma->vm_sequence);
+	if (next)
+		write_seqcount_begin(&next->vm_sequence);
+
 	if (next && !insert) {
 		if (end >= next->vm_end) {
 			/*
@@ -640,18 +659,24 @@ again:			remove_next = 1 + (end > next->vm_end);
 		}
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
 		 * up the code too much to do both in one go.
 		 */
 		if (remove_next == 2) {
+			write_seqcount_end(&next->vm_sequence);
 			next = vma->vm_next;
+			write_seqcount_begin(&next->vm_sequence);
 			goto again;
 		}
 	}
 
+	if (next)
+		write_seqcount_end(&next->vm_sequence);
+	write_seqcount_end(&vma->vm_sequence);
+
 	validate_mm(mm);
 }
 
@@ -1808,6 +1833,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
 		rb_erase(&vma->vm_rb, &mm->mm_rb);
+		smp_wmb();
+		RB_CLEAR_NODE(&vma->vm_rb);
 		mm->map_count--;
 		tail_vma = vma;
 		vma = vma->vm_next;
diff --git a/mm/util.c b/mm/util.c
index b377ce4..1f5cfb7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -257,8 +257,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
  * callers need to carefully consider what to use. On many architectures,
  * get_user_pages_fast simply falls back to get_user_pages.
  */
-int __attribute__((weak)) get_user_pages_fast(unsigned long start,
-				int nr_pages, int write, struct page **pages)
+int __weak get_user_pages_fast(unsigned long start,
+			       int nr_pages, int write, struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
 	int ret;
@@ -272,6 +272,14 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
+void __weak pin_page_tables(void)
+{
+}
+
+void __weak unpin_page_tables(void)
+{
+}
+
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 		unsigned long, prot, unsigned long, flags,
 		unsigned long, fd, unsigned long, pgoff)



^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2009-12-25  1:51 [RFC PATCH] asynchronous page fault KAMEZAWA Hiroyuki
                   ` (2 preceding siblings ...)
  2009-12-27 12:03 ` Peter Zijlstra
@ 2010-01-02 21:45 ` Benjamin Herrenschmidt
  3 siblings, 0 replies; 32+ messages in thread
From: Benjamin Herrenschmidt @ 2010-01-02 21:45 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, minchan.kim, cl

On Fri, 2009-12-25 at 10:51 +0900, KAMEZAWA Hiroyuki wrote:
> Speculative page fault v3.
> 
> This version is much simpler than old versions and doesn't use mm_accessor
> but use RCU. This is based on linux-2.6.33-rc2.
> 
> This patch is just my toy but shows...
>  - Once RB-tree is RCU-aware and no-lock in readside, we can avoid mmap_sem
>    in page fault. 
> So, what we need is not mm_accessor, but RCU-aware RB-tree, I think.
> 
> But yes, I may miss something critical ;)
> 
> After patch, statistics perf show is following. Test progam is attached.

One concern I have with this, not that it can't be addressed but we'll
have to be extra careful, is that the mmap_sem in the page fault path
tends to protect more than just the VMA tree.

One example on powerpc is the slice map used to keep track of page
sizes. I would also need some time to convince myself that no bits of
the MMU hash code assume that holding the
mmap_sem for writing prevents a PTE from being changed from !present to
present.

I wouldn't be surprised if there were more issues around fancy users of
->fault(): things like spufs, the DRM, etc.

Cheers,
Ben.
 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2010-01-02 16:14             ` Peter Zijlstra
@ 2010-01-04  3:02               ` Paul E. McKenney
  2010-01-04  7:53                 ` Peter Zijlstra
  2010-01-04 13:48               ` [RFC PATCH -v2] speculative " Peter Zijlstra
  1 sibling, 1 reply; 32+ messages in thread
From: Paul E. McKenney @ 2010-01-04  3:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Sat, Jan 02, 2010 at 05:14:04PM +0100, Peter Zijlstra wrote:
> On Mon, 2009-12-28 at 11:40 +0100, Peter Zijlstra wrote:
> 
> > > Right, so acquiring the PTE lock will either instantiate page tables for
> > > a non-existing vma, leaving you with an interesting mess to clean up, or
> > > you can also RCU free the page tables (in the same RCU domain as the
> > > vma) which will mostly[*] avoid that issue.
> > > 
> > > [ To make live really really interesting you could even re-use the
> > >   page-tables and abort the RCU free when the region gets re-mapped
> > >   before the RCU callbacks happen, this will avoid a free/alloc cycle
> > >   for fast remapping workloads. ]
> > > 
> > > Once you hold the PTE lock, you can validate the vma you looked up,
> > > since ->unmap() syncs against it. If at that time you find the
> > > speculative vma is dead, you fail and re-try the fault.
> > > 
> > > [*] there still is the case of faulting on an address that didn't
> > > previously have page-tables hence the unmap page table scan will have
> > > skipped it -- my hacks simply leaked page tables here, but the idea was
> > > to acquire the mmap_sem for reading and cleanup properly.
> > 
> > Alternatively, we could mark vma's dead in some way before we do the
> > unmap, then whenever we hit the page-table alloc path, we check against
> > the speculative vma and bail if it died.
> > 
> > That might just work.. will need to ponder it a bit more.
> 
> Right, so I don't think we need RCU page tables on x86. All we need is
> some extension to the fast_gup() stuff.
> 
> Nor do we need to modify the page-table alloc paths. All we need to do
> is have 2 versions of the page table walks like those in
> handle_mm_fault().
> 
> What we do need is to have call_srcu() for the VMAs since all this fault
> stuff can block in many ways. And we need to tag 'dead' VMAs as such
> (before doing the unmap).
> 
> [ Paul, pretty please? :-) ]

It would not be all that hard for me to make a call_srcu(), but...

1.	How are you avoiding OOM by SRCU callback?  (I am sure you
	have this worked out, but I do have to ask!)

2.	How many srcu_struct data structures are you envisioning?
	One globally?  One per process?  One per struct vma?
	(Not necessary to know this for call_srcu(), but will be needed
	as I work out how to make SRCU scale with large numbers of CPUs.)

							Thanx, Paul

> We also need to introduce FAULT_FLAG_SPECULATIVE to tell the rest of the
> fault code about us not holding mmap_sem. And add a return fault return
> state like VM_FAULT_RETRY to retry the fault holding the mmap_sem.
> 
> Then we need alternative page table walkers, currently things like
> handle_mm_fault() use the p*_alloc*() like routines, but for
> FAULT_FLAG_SPECULATIVE we need to use the p*_offset*() variants like in
> follow_pte(). If that fails to find the pte, we return VM_FAULT_RETRY.
> 
> [ One sad consequence is that this still requires the mmap_sem for
>   every page table alloc, but since a pte can hold lots of pages it
>   should hopefully work out nicely ]
> 
> The above is the tricky bit since that can race with unmap, which is
> where the fast_gup() stuff comes into play. fast_gup() has the exact
> same problem, and already solved it for us. So we need a speculative
> page table walker that does whatever fast_gup() does, which on x86 is
> disable IRQs (powerpc has RCU freed page tables).
> 
> Now all sites where we actually lock the ptl we need to actually redo
> that page table walk, failing to find the pte will again return
> VM_FAULT_RETRY, once we lock the ptl we need to check the VMA's dead
> state, if dead we also bail with VM_FAULT_RETRY, otherwise we're good
> and can continue.
> 
> 
> Something like the below, which 'sometimes' boots on my dual core and
> when it boots seems to survive building a kernel... definitely needs
> more work.
> 
> ---
>  arch/x86/mm/fault.c      |    8 ++
>  arch/x86/mm/gup.c        |   10 ++
>  include/linux/mm.h       |   14 +++
>  include/linux/mm_types.h |    4 +
>  init/Kconfig             |   34 +++---
>  kernel/sched.c           |    9 ++-
>  mm/memory.c              |  293 +++++++++++++++++++++++++++++++--------------
>  mm/mmap.c                |   31 +++++-
>  mm/util.c                |   12 ++-
>  9 files changed, 302 insertions(+), 113 deletions(-)
> 
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index f627779..c748529 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1040,6 +1040,14 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
>  		return;
>  	}
> 
> +	if (error_code & PF_USER) {
> +		fault = handle_speculative_fault(mm, address,
> +				error_code & PF_WRITE ? FAULT_FLAG_WRITE : 0);
> +
> +		if (!(fault & (VM_FAULT_ERROR | VM_FAULT_RETRY)))
> +			return;
> +	}
> +
>  	/*
>  	 * When running in the kernel we expect faults to occur only to
>  	 * addresses in user space.  All other faults represent errors in
> diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> index 71da1bc..6eeaef7 100644
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -373,3 +373,13 @@ slow_irqon:
>  		return ret;
>  	}
>  }
> +
> +void pin_page_tables(void)
> +{
> +	local_irq_disable();
> +}
> +
> +void unpin_page_tables(void)
> +{
> +	local_irq_enable();
> +}
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2265f28..7bc94f9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -136,6 +136,7 @@ extern pgprot_t protection_map[16];
>  #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
>  #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
>  #define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
> +#define FAULT_FLAG_SPECULATIVE	0x08
> 
>  /*
>   * This interface is used by x86 PAT code to identify a pfn mapping that is
> @@ -711,6 +712,7 @@ static inline int page_mapped(struct page *page)
> 
>  #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
>  #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
> +#define VM_FAULT_RETRY  0x0400
> 
>  #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
> 
> @@ -763,6 +765,14 @@ unsigned long unmap_vmas(struct mmu_gather **tlb,
>  		unsigned long end_addr, unsigned long *nr_accounted,
>  		struct zap_details *);
> 
> +static inline int vma_is_dead(struct vm_area_struct *vma, unsigned int sequence)
> +{
> +	int ret = RB_EMPTY_NODE(&vma->vm_rb);
> +	unsigned seq = vma->vm_sequence.sequence;
> +	smp_rmb();
> +	return ret || (seq & 1) || seq != sequence;
> +}
> +
>  /**
>   * mm_walk - callbacks for walk_page_range
>   * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
> @@ -819,6 +829,8 @@ int invalidate_inode_page(struct page *page);
>  #ifdef CONFIG_MMU
>  extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  			unsigned long address, unsigned int flags);
> +extern int handle_speculative_fault(struct mm_struct *mm,
> +			unsigned long address, unsigned int flags);
>  #else
>  static inline int handle_mm_fault(struct mm_struct *mm,
>  			struct vm_area_struct *vma, unsigned long address,
> @@ -838,6 +850,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  			struct page **pages, struct vm_area_struct **vmas);
>  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  			struct page **pages);
> +void pin_page_tables(void);
> +void unpin_page_tables(void);
>  struct page *get_dump_page(unsigned long addr);
> 
>  extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 84a524a..0727300 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -12,6 +12,8 @@
>  #include <linux/completion.h>
>  #include <linux/cpumask.h>
>  #include <linux/page-debug-flags.h>
> +#include <linux/seqlock.h>
> +#include <linux/rcupdate.h>
>  #include <asm/page.h>
>  #include <asm/mmu.h>
> 
> @@ -186,6 +188,8 @@ struct vm_area_struct {
>  #ifdef CONFIG_NUMA
>  	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
>  #endif
> +	seqcount_t vm_sequence;
> +	struct rcu_head vm_rcu_head;
>  };
> 
>  struct core_thread {
> diff --git a/init/Kconfig b/init/Kconfig
> index 06dab27..5edae47 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -314,19 +314,19 @@ menu "RCU Subsystem"
> 
>  choice
>  	prompt "RCU Implementation"
> -	default TREE_RCU
> +	default TREE_PREEMPT_RCU
> 
> -config TREE_RCU
> -	bool "Tree-based hierarchical RCU"
> -	help
> -	  This option selects the RCU implementation that is
> -	  designed for very large SMP system with hundreds or
> -	  thousands of CPUs.  It also scales down nicely to
> -	  smaller systems.
> +#config TREE_RCU
> +#	bool "Tree-based hierarchical RCU"
> +#	help
> +#	  This option selects the RCU implementation that is
> +#	  designed for very large SMP system with hundreds or
> +#	  thousands of CPUs.  It also scales down nicely to
> +#	  smaller systems.
> 
>  config TREE_PREEMPT_RCU
>  	bool "Preemptable tree-based hierarchical RCU"
> -	depends on PREEMPT
> +#	depends on PREEMPT
>  	help
>  	  This option selects the RCU implementation that is
>  	  designed for very large SMP systems with hundreds or
> @@ -334,14 +334,14 @@ config TREE_PREEMPT_RCU
>  	  is also required.  It also scales down nicely to
>  	  smaller systems.
> 
> -config TINY_RCU
> -	bool "UP-only small-memory-footprint RCU"
> -	depends on !SMP
> -	help
> -	  This option selects the RCU implementation that is
> -	  designed for UP systems from which real-time response
> -	  is not required.  This option greatly reduces the
> -	  memory footprint of RCU.
> +#config TINY_RCU
> +#	bool "UP-only small-memory-footprint RCU"
> +#	depends on !SMP
> +#	help
> +#	  This option selects the RCU implementation that is
> +#	  designed for UP systems from which real-time response
> +#	  is not required.  This option greatly reduces the
> +#	  memory footprint of RCU.
> 
>  endchoice
> 
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 22c14eb..21cdc52 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -9689,7 +9689,14 @@ void __init sched_init(void)
>  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
>  static inline int preempt_count_equals(int preempt_offset)
>  {
> -	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
> +	int nested = (preempt_count() & ~PREEMPT_ACTIVE)
> +		/*
> +		 * remove this for we need preemptible RCU
> +		 * exactly because it needs to sleep..
> +		 *
> +		 + rcu_preempt_depth()
> +		 */
> +		;
> 
>  	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
>  }
> diff --git a/mm/memory.c b/mm/memory.c
> index 09e4b1b..ace6645 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1919,31 +1919,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
>  EXPORT_SYMBOL_GPL(apply_to_page_range);
> 
>  /*
> - * handle_pte_fault chooses page fault handler according to an entry
> - * which was read non-atomically.  Before making any commitment, on
> - * those architectures or configurations (e.g. i386 with PAE) which
> - * might give a mix of unmatched parts, do_swap_page and do_file_page
> - * must check under lock before unmapping the pte and proceeding
> - * (but do_wp_page is only called after already making such a check;
> - * and do_anonymous_page and do_no_page can safely check later on).
> - */
> -static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
> -				pte_t *page_table, pte_t orig_pte)
> -{
> -	int same = 1;
> -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
> -	if (sizeof(pte_t) > sizeof(unsigned long)) {
> -		spinlock_t *ptl = pte_lockptr(mm, pmd);
> -		spin_lock(ptl);
> -		same = pte_same(*page_table, orig_pte);
> -		spin_unlock(ptl);
> -	}
> -#endif
> -	pte_unmap(page_table);
> -	return same;
> -}
> -
> -/*
>   * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
>   * servicing faults for write access.  In the normal case, do always want
>   * pte_mkwrite.  But get_user_pages can cause write faults for mappings
> @@ -1982,6 +1957,52 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
>  		copy_user_highpage(dst, src, va, vma);
>  }
> 
> +static int pte_map_lock(struct mm_struct *mm, struct vm_area_struct *vma,
> +		unsigned long address, pmd_t *pmd, unsigned int flags,
> +		unsigned int seq, pte_t **ptep, spinlock_t **ptlp)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +
> +	if (!(flags & FAULT_FLAG_SPECULATIVE)) {
> +		*ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
> +		return 1;
> +	}
> +
> +	pin_page_tables();
> +
> +	pgd = pgd_offset(mm, address);
> +	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
> +		goto out;
> +
> +	pud = pud_offset(pgd, address);
> +	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
> +		goto out;
> +
> +	pmd = pmd_offset(pud, address);
> +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> +		goto out;
> +
> +	if (pmd_huge(*pmd))
> +		goto out;
> +
> +	*ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
> +	if (!*ptep)
> +		goto out;
> +
> +	if (vma && vma_is_dead(vma, seq))
> +		goto unlock;
> +
> +	unpin_page_tables();
> +	return 1;
> +
> +unlock:
> +	pte_unmap_unlock(*ptep, *ptlp);
> +out:
> +	unpin_page_tables();
> +	return 0;
> +}
> +
>  /*
>   * This routine handles present pages, when users try to write
>   * to a shared page. It is done by copying the page to a new address
> @@ -2002,7 +2023,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
>   */
>  static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  		unsigned long address, pte_t *page_table, pmd_t *pmd,
> -		spinlock_t *ptl, pte_t orig_pte)
> +		spinlock_t *ptl, unsigned int flags, pte_t orig_pte,
> +		unsigned int seq)
>  {
>  	struct page *old_page, *new_page;
>  	pte_t entry;
> @@ -2034,8 +2056,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			page_cache_get(old_page);
>  			pte_unmap_unlock(page_table, ptl);
>  			lock_page(old_page);
> -			page_table = pte_offset_map_lock(mm, pmd, address,
> -							 &ptl);
> +
> +			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
> +						&page_table, &ptl)) {
> +				unlock_page(old_page);
> +				ret = VM_FAULT_RETRY;
> +				goto err;
> +			}
> +
>  			if (!pte_same(*page_table, orig_pte)) {
>  				unlock_page(old_page);
>  				page_cache_release(old_page);
> @@ -2077,14 +2105,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			if (unlikely(tmp &
>  					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
>  				ret = tmp;
> -				goto unwritable_page;
> +				goto err;
>  			}
>  			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
>  				lock_page(old_page);
>  				if (!old_page->mapping) {
>  					ret = 0; /* retry the fault */
>  					unlock_page(old_page);
> -					goto unwritable_page;
> +					goto err;
>  				}
>  			} else
>  				VM_BUG_ON(!PageLocked(old_page));
> @@ -2095,8 +2123,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			 * they did, we just return, as we can count on the
>  			 * MMU to tell us if they didn't also make it writable.
>  			 */
> -			page_table = pte_offset_map_lock(mm, pmd, address,
> -							 &ptl);
> +			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
> +						&page_table, &ptl)) {
> +				unlock_page(old_page);
> +				ret = VM_FAULT_RETRY;
> +				goto err;
> +			}
> +
>  			if (!pte_same(*page_table, orig_pte)) {
>  				unlock_page(old_page);
>  				page_cache_release(old_page);
> @@ -2128,17 +2161,23 @@ reuse:
>  gotten:
>  	pte_unmap_unlock(page_table, ptl);
> 
> -	if (unlikely(anon_vma_prepare(vma)))
> -		goto oom;
> +	if (unlikely(anon_vma_prepare(vma))) {
> +		ret = VM_FAULT_OOM;
> +		goto err;
> +	}
> 
>  	if (is_zero_pfn(pte_pfn(orig_pte))) {
>  		new_page = alloc_zeroed_user_highpage_movable(vma, address);
> -		if (!new_page)
> -			goto oom;
> +		if (!new_page) {
> +			ret = VM_FAULT_OOM;
> +			goto err;
> +		}
>  	} else {
>  		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
> -		if (!new_page)
> -			goto oom;
> +		if (!new_page) {
> +			ret = VM_FAULT_OOM;
> +			goto err;
> +		}
>  		cow_user_page(new_page, old_page, address, vma);
>  	}
>  	__SetPageUptodate(new_page);
> @@ -2153,13 +2192,20 @@ gotten:
>  		unlock_page(old_page);
>  	}
> 
> -	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
> -		goto oom_free_new;
> +	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
> +		ret = VM_FAULT_OOM;
> +		goto err_free_new;
> +	}
> 
>  	/*
>  	 * Re-check the pte - we dropped the lock
>  	 */
> -	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> +	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
> +		mem_cgroup_uncharge_page(new_page);
> +		ret = VM_FAULT_RETRY;
> +		goto err_free_new;
> +	}
> +
>  	if (likely(pte_same(*page_table, orig_pte))) {
>  		if (old_page) {
>  			if (!PageAnon(old_page)) {
> @@ -2258,9 +2304,9 @@ unlock:
>  			file_update_time(vma->vm_file);
>  	}
>  	return ret;
> -oom_free_new:
> +err_free_new:
>  	page_cache_release(new_page);
> -oom:
> +err:
>  	if (old_page) {
>  		if (page_mkwrite) {
>  			unlock_page(old_page);
> @@ -2268,10 +2314,6 @@ oom:
>  		}
>  		page_cache_release(old_page);
>  	}
> -	return VM_FAULT_OOM;
> -
> -unwritable_page:
> -	page_cache_release(old_page);
>  	return ret;
>  }
> 
> @@ -2508,22 +2550,23 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
>   * We return with mmap_sem still held, but pte unmapped and unlocked.
>   */
>  static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
> -		unsigned long address, pte_t *page_table, pmd_t *pmd,
> -		unsigned int flags, pte_t orig_pte)
> +		unsigned long address, pmd_t *pmd, unsigned int flags,
> +		pte_t orig_pte, unsigned int seq)
>  {
>  	spinlock_t *ptl;
>  	struct page *page;
>  	swp_entry_t entry;
> -	pte_t pte;
> +	pte_t *page_table, pte;
>  	struct mem_cgroup *ptr = NULL;
>  	int ret = 0;
> 
> -	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
> -		goto out;
> -
>  	entry = pte_to_swp_entry(orig_pte);
>  	if (unlikely(non_swap_entry(entry))) {
>  		if (is_migration_entry(entry)) {
> +			if (flags & FAULT_FLAG_SPECULATIVE) {
> +				ret = VM_FAULT_RETRY;
> +				goto out;
> +			}
>  			migration_entry_wait(mm, pmd, address);
>  		} else if (is_hwpoison_entry(entry)) {
>  			ret = VM_FAULT_HWPOISON;
> @@ -2544,7 +2587,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			 * Back out if somebody else faulted in this pte
>  			 * while we released the pte lock.
>  			 */
> -			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> +			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
> +						&page_table, &ptl)) {
> +				ret = VM_FAULT_RETRY;
> +				goto out;
> +			}
>  			if (likely(pte_same(*page_table, orig_pte)))
>  				ret = VM_FAULT_OOM;
>  			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
> @@ -2581,7 +2628,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  	/*
>  	 * Back out if somebody else already faulted in this pte.
>  	 */
> -	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> +	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
> +		ret = VM_FAULT_RETRY;
> +		goto out_nolock;
> +	}
> +
>  	if (unlikely(!pte_same(*page_table, orig_pte)))
>  		goto out_nomap;
> 
> @@ -2622,7 +2673,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  	unlock_page(page);
> 
>  	if (flags & FAULT_FLAG_WRITE) {
> -		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
> +		ret |= do_wp_page(mm, vma, address, page_table, pmd,
> +				ptl, flags, pte, seq);
>  		if (ret & VM_FAULT_ERROR)
>  			ret &= VM_FAULT_ERROR;
>  		goto out;
> @@ -2635,8 +2687,9 @@ unlock:
>  out:
>  	return ret;
>  out_nomap:
> -	mem_cgroup_cancel_charge_swapin(ptr);
>  	pte_unmap_unlock(page_table, ptl);
> +out_nolock:
> +	mem_cgroup_cancel_charge_swapin(ptr);
>  out_page:
>  	unlock_page(page);
>  out_release:
> @@ -2650,18 +2703,19 @@ out_release:
>   * We return with mmap_sem still held, but pte unmapped and unlocked.
>   */
>  static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
> -		unsigned long address, pte_t *page_table, pmd_t *pmd,
> -		unsigned int flags)
> +		unsigned long address, pmd_t *pmd, unsigned int flags,
> +		unsigned int seq)
>  {
>  	struct page *page;
>  	spinlock_t *ptl;
> -	pte_t entry;
> +	pte_t entry, *page_table;
> 
>  	if (!(flags & FAULT_FLAG_WRITE)) {
>  		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
>  						vma->vm_page_prot));
> -		ptl = pte_lockptr(mm, pmd);
> -		spin_lock(ptl);
> +		if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
> +					&page_table, &ptl))
> +			return VM_FAULT_RETRY;
>  		if (!pte_none(*page_table))
>  			goto unlock;
>  		goto setpte;
> @@ -2684,7 +2738,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  	if (vma->vm_flags & VM_WRITE)
>  		entry = pte_mkwrite(pte_mkdirty(entry));
> 
> -	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> +	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
> +		mem_cgroup_uncharge_page(page);
> +		page_cache_release(page);
> +		return VM_FAULT_RETRY;
> +	}
> +
>  	if (!pte_none(*page_table))
>  		goto release;
> 
> @@ -2722,8 +2781,8 @@ oom:
>   * We return with mmap_sem still held, but pte unmapped and unlocked.
>   */
>  static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> -		unsigned long address, pmd_t *pmd,
> -		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
> +		unsigned long address, pmd_t *pmd, pgoff_t pgoff,
> +		unsigned int flags, pte_t orig_pte, unsigned int seq)
>  {
>  	pte_t *page_table;
>  	spinlock_t *ptl;
> @@ -2823,7 +2882,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> 
>  	}
> 
> -	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> +	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
> +		ret = VM_FAULT_RETRY;
> +		goto out_uncharge;
> +	}
> 
>  	/*
>  	 * This silly early PAGE_DIRTY setting removes a race
> @@ -2856,7 +2918,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> 
>  		/* no need to invalidate: a not-present page won't be cached */
>  		update_mmu_cache(vma, address, entry);
> +		pte_unmap_unlock(page_table, ptl);
>  	} else {
> +		pte_unmap_unlock(page_table, ptl);
> +out_uncharge:
>  		if (charged)
>  			mem_cgroup_uncharge_page(page);
>  		if (anon)
> @@ -2865,8 +2930,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  			anon = 1; /* no anon but release faulted_page */
>  	}
> 
> -	pte_unmap_unlock(page_table, ptl);
> -
>  out:
>  	if (dirty_page) {
>  		struct address_space *mapping = page->mapping;
> @@ -2900,14 +2963,13 @@ unwritable_page:
>  }
> 
>  static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> -		unsigned long address, pte_t *page_table, pmd_t *pmd,
> -		unsigned int flags, pte_t orig_pte)
> +		unsigned long address, pmd_t *pmd,
> +		unsigned int flags, pte_t orig_pte, unsigned int seq)
>  {
>  	pgoff_t pgoff = (((address & PAGE_MASK)
>  			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> 
> -	pte_unmap(page_table);
> -	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> +	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, seq);
>  }
> 
>  /*
> @@ -2920,16 +2982,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>   * We return with mmap_sem still held, but pte unmapped and unlocked.
>   */
>  static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> -		unsigned long address, pte_t *page_table, pmd_t *pmd,
> -		unsigned int flags, pte_t orig_pte)
> +		unsigned long address, pmd_t *pmd,
> +		unsigned int flags, pte_t orig_pte, unsigned int seq)
>  {
>  	pgoff_t pgoff;
> 
>  	flags |= FAULT_FLAG_NONLINEAR;
> 
> -	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
> -		return 0;
> -
>  	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
>  		/*
>  		 * Page table corrupted: show pte and kill process.
> @@ -2939,7 +2998,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	}
> 
>  	pgoff = pte_to_pgoff(orig_pte);
> -	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> +	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, seq);
>  }
> 
>  /*
> @@ -2957,37 +3016,38 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>   */
>  static inline int handle_pte_fault(struct mm_struct *mm,
>  		struct vm_area_struct *vma, unsigned long address,
> -		pte_t *pte, pmd_t *pmd, unsigned int flags)
> +		pte_t entry, pmd_t *pmd, unsigned int flags,
> +		unsigned int seq)
>  {
> -	pte_t entry;
>  	spinlock_t *ptl;
> +	pte_t *pte;
> 
> -	entry = *pte;
>  	if (!pte_present(entry)) {
>  		if (pte_none(entry)) {
>  			if (vma->vm_ops) {
>  				if (likely(vma->vm_ops->fault))
>  					return do_linear_fault(mm, vma, address,
> -						pte, pmd, flags, entry);
> +						pmd, flags, entry, seq);
>  			}
>  			return do_anonymous_page(mm, vma, address,
> -						 pte, pmd, flags);
> +						 pmd, flags, seq);
>  		}
>  		if (pte_file(entry))
>  			return do_nonlinear_fault(mm, vma, address,
> -					pte, pmd, flags, entry);
> +					pmd, flags, entry, seq);
>  		return do_swap_page(mm, vma, address,
> -					pte, pmd, flags, entry);
> +					pmd, flags, entry, seq);
>  	}
> 
> -	ptl = pte_lockptr(mm, pmd);
> -	spin_lock(ptl);
> +	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &pte, &ptl))
> +		return VM_FAULT_RETRY;
>  	if (unlikely(!pte_same(*pte, entry)))
>  		goto unlock;
>  	if (flags & FAULT_FLAG_WRITE) {
> -		if (!pte_write(entry))
> +		if (!pte_write(entry)) {
>  			return do_wp_page(mm, vma, address,
> -					pte, pmd, ptl, entry);
> +					pte, pmd, ptl, flags, entry, seq);
> +		}
>  		entry = pte_mkdirty(entry);
>  	}
>  	entry = pte_mkyoung(entry);
> @@ -3017,7 +3077,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	pgd_t *pgd;
>  	pud_t *pud;
>  	pmd_t *pmd;
> -	pte_t *pte;
> +	pte_t *pte, entry;
> 
>  	__set_current_state(TASK_RUNNING);
> 
> @@ -3037,9 +3097,60 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	if (!pte)
>  		return VM_FAULT_OOM;
> 
> -	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
> +	entry = *pte;
> +
> +	pte_unmap(pte);
> +
> +	return handle_pte_fault(mm, vma, address, entry, pmd, flags, 0);
> +}
> +
> +int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
> +		unsigned int flags)
> +{
> +	pmd_t *pmd = NULL;
> +	pte_t *pte, entry;
> +	spinlock_t *ptl;
> +	struct vm_area_struct *vma;
> +	unsigned int seq;
> +	int ret = VM_FAULT_RETRY;
> +	int dead;
> +
> +	__set_current_state(TASK_RUNNING);
> +	flags |= FAULT_FLAG_SPECULATIVE;
> +
> +	count_vm_event(PGFAULT);
> +
> +	rcu_read_lock();
> +	if (!pte_map_lock(mm, NULL, address, pmd, flags, 0, &pte, &ptl))
> +		goto out_unlock;
> +
> +	vma = find_vma(mm, address);
> +	if (!(vma && vma->vm_end > address && vma->vm_start <= address))
> +		goto out_unmap;
> +
> +	dead = RB_EMPTY_NODE(&vma->vm_rb);
> +	seq = vma->vm_sequence.sequence;
> +	smp_rmb();
> +
> +	if (dead || seq & 1)
> +		goto out_unmap;
> +
> +	entry = *pte;
> +
> +	pte_unmap_unlock(pte, ptl);
> +
> +	ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
> +
> +out_unlock:
> +	rcu_read_unlock();
> +	return ret;
> +
> +out_unmap:
> +	pte_unmap_unlock(pte, ptl);
> +	goto out_unlock;
>  }
> 
> +
>  #ifndef __PAGETABLE_PUD_FOLDED
>  /*
>   * Allocate page upper directory.
> diff --git a/mm/mmap.c b/mm/mmap.c
> index d9c77b2..024e406 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -222,6 +222,19 @@ void unlink_file_vma(struct vm_area_struct *vma)
>  	}
>  }
> 
> +static void free_vma_rcu(struct rcu_head *head)
> +{
> +	struct vm_area_struct *vma =
> +		container_of(head, struct vm_area_struct, vm_rcu_head);
> +
> +	kmem_cache_free(vm_area_cachep, vma);
> +}
> +
> +static void free_vma(struct vm_area_struct *vma)
> +{
> +	call_rcu(&vma->vm_rcu_head, free_vma_rcu);
> +}
> +
>  /*
>   * Close a vm structure and free it, returning the next.
>   */
> @@ -238,7 +251,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
>  			removed_exe_file_vma(vma->vm_mm);
>  	}
>  	mpol_put(vma_policy(vma));
> -	kmem_cache_free(vm_area_cachep, vma);
> +	free_vma(vma);
>  	return next;
>  }
> 
> @@ -488,6 +501,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
>  {
>  	prev->vm_next = vma->vm_next;
>  	rb_erase(&vma->vm_rb, &mm->mm_rb);
> +	smp_wmb();
> +	RB_CLEAR_NODE(&vma->vm_rb);
>  	if (mm->mmap_cache == vma)
>  		mm->mmap_cache = prev;
>  }
> @@ -512,6 +527,10 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  	long adjust_next = 0;
>  	int remove_next = 0;
> 
> +	write_seqcount_begin(&vma->vm_sequence);
> +	if (next)
> +		write_seqcount_begin(&next->vm_sequence);
> +
>  	if (next && !insert) {
>  		if (end >= next->vm_end) {
>  			/*
> @@ -640,18 +659,24 @@ again:			remove_next = 1 + (end > next->vm_end);
>  		}
>  		mm->map_count--;
>  		mpol_put(vma_policy(next));
> -		kmem_cache_free(vm_area_cachep, next);
> +		free_vma(next);
>  		/*
>  		 * In mprotect's case 6 (see comments on vma_merge),
>  		 * we must remove another next too. It would clutter
>  		 * up the code too much to do both in one go.
>  		 */
>  		if (remove_next == 2) {
> +			write_seqcount_end(&next->vm_sequence);
>  			next = vma->vm_next;
> +			write_seqcount_begin(&next->vm_sequence);
>  			goto again;
>  		}
>  	}
> 
> +	if (next)
> +		write_seqcount_end(&next->vm_sequence);
> +	write_seqcount_end(&vma->vm_sequence);
> +
>  	validate_mm(mm);
>  }
> 
> @@ -1808,6 +1833,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
>  	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
>  	do {
>  		rb_erase(&vma->vm_rb, &mm->mm_rb);
> +		smp_wmb();
> +		RB_CLEAR_NODE(&vma->vm_rb);
>  		mm->map_count--;
>  		tail_vma = vma;
>  		vma = vma->vm_next;
> diff --git a/mm/util.c b/mm/util.c
> index b377ce4..1f5cfb7 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -257,8 +257,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
>   * callers need to carefully consider what to use. On many architectures,
>   * get_user_pages_fast simply falls back to get_user_pages.
>   */
> -int __attribute__((weak)) get_user_pages_fast(unsigned long start,
> -				int nr_pages, int write, struct page **pages)
> +int __weak get_user_pages_fast(unsigned long start,
> +			       int nr_pages, int write, struct page **pages)
>  {
>  	struct mm_struct *mm = current->mm;
>  	int ret;
> @@ -272,6 +272,14 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
>  }
>  EXPORT_SYMBOL_GPL(get_user_pages_fast);
> 
> +void __weak pin_page_tables(void)
> +{
> +}
> +
> +void __weak unpin_page_tables(void)
> +{
> +}
> +
>  SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
>  		unsigned long, prot, unsigned long, flags,
>  		unsigned long, fd, unsigned long, pgoff)
> 
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2010-01-04  3:02               ` Paul E. McKenney
@ 2010-01-04  7:53                 ` Peter Zijlstra
  2010-01-04 15:55                   ` Paul E. McKenney
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2010-01-04  7:53 UTC (permalink / raw)
  To: paulmck
  Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Sun, 2010-01-03 at 19:02 -0800, Paul E. McKenney wrote:
> It would not be all that hard for me to make a call_srcu(), but...
> 
> 1.      How are you avoiding OOM by SRCU callback?  (I am sure you
>         have this worked out, but I do have to ask!)

Well, I was thinking of having SRCU force a quiescent state in
call_srcu(), much like you did for preemptible RCU.

Alternatively, we could actively throttle the call_srcu() call when we've
got too much pending work.

> 2.      How many srcu_struct data structures are you envisioning?
>         One globally?  One per process?  One per struct vma?
>         (Not necessary to know this for call_srcu(), but will be needed
>         as I work out how to make SRCU scale with large numbers of CPUs.)

For this patch in particular, one global one, covering all vmas.

One reason to keep the VMA RCU domain separate from other RCU objects is
that these VMA read-side critical sections can be rather long because of
the sleeping involved. So mixing them in with other RCU users, which have
much better defined grace periods, will just degrade everything and bring
that OOM scenario much closer.
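
For concreteness, a minimal sketch of what one global SRCU domain covering
all VMAs could look like on the read side. The vma_srcu name, the
do_speculative_fault() helper and the call_srcu() used on the free side are
all assumptions -- call_srcu() is precisely the primitive that does not
exist yet, so today the free path would have to block in synchronize_srcu().

/* Sketch only: a single global SRCU domain covering all VMAs. */
static struct srcu_struct vma_srcu;	/* init_srcu_struct(&vma_srcu) at boot */

int handle_speculative_fault_srcu(struct mm_struct *mm, unsigned long address,
				  unsigned int flags)
{
	int idx, ret;

	/* Readers may sleep (page I/O), hence SRCU rather than plain RCU. */
	idx = srcu_read_lock(&vma_srcu);
	ret = do_speculative_fault(mm, address, flags);	/* assumed helper */
	srcu_read_unlock(&vma_srcu, idx);
	return ret;
}

/* SRCU variant of the free_vma() added in the mm/mmap.c hunk. */
static void free_vma(struct vm_area_struct *vma)
{
	/* Hypothetical API: run free_vma_rcu() after an SRCU grace period. */
	call_srcu(&vma_srcu, &vma->vm_rcu_head, free_vma_rcu);
}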

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [RFC PATCH -v2] speculative page fault
  2010-01-02 16:14             ` Peter Zijlstra
  2010-01-04  3:02               ` Paul E. McKenney
@ 2010-01-04 13:48               ` Peter Zijlstra
  1 sibling, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2010-01-04 13:48 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, minchan.kim, cl, Paul E. McKenney,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Sat, 2010-01-02 at 17:14 +0100, Peter Zijlstra wrote:

> Something like the below, which 'sometimes' boots on my dual core and
> when it boots seems to survive building a kernel... definitely needs
> more work.

OK, the below seems to work much better. It appears we're flushing TLBs
while holding the ptl, which meant that my pte_map_lock(), which on x86
ended up taking the ptl with IRQs disabled, could deadlock against another
CPU holding that same ptl while trying to flush the TLBs, since it won't
service the IPIs.

I crudely hacked around that by making pte_map_lock() a spinning loop that
drops the IRQ disable between attempts... not pretty. At least it boots.
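
Stripped of the pgd/pud/pmd checks and error paths, the workaround in the
pte_map_lock() hunk below boils down to the following (simplified excerpt,
not a drop-in replacement):

/* Simplified form of the retry loop in pte_map_lock() below. */
static int pte_map_trylock_sketch(struct mm_struct *mm, pmd_t *pmd,
				  unsigned long address,
				  pte_t **ptep, spinlock_t **ptlp)
{
again:
	pin_page_tables();		/* local_irq_disable() on x86 */

	*ptlp = pte_lockptr(mm, pmd);
	*ptep = pte_offset_map(pmd, address);
	if (!spin_trylock(*ptlp)) {
		/*
		 * The ptl holder may be waiting for this CPU to ack its
		 * TLB-flush IPI; re-enable IRQs so the IPI can be
		 * serviced, then retry.
		 */
		pte_unmap(*ptep);
		unpin_page_tables();	/* local_irq_enable() */
		goto again;
	}

	unpin_page_tables();
	return 1;
}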

The added counters seem to indicate pretty much all faults end up being
speculative faults, which should be good.

The multi-fault.c test case from Kame doesn't show much (if any)
improvement on my dual-core (shared cache, etc.) but does show a
significant increase in cache misses, which I guess is attributable to
the extra page-table walking (I'll examine this more closely after I try
to get that TLB flushing crap sorted).

What do people think of the general approach?

---
 arch/x86/mm/fault.c      |   20 ++-
 arch/x86/mm/gup.c        |   10 ++
 drivers/char/sysrq.c     |    3 +
 include/linux/mm.h       |   14 ++
 include/linux/mm_types.h |    4 +
 include/linux/sched.h    |    1 +
 init/Kconfig             |   34 +++---
 kernel/sched.c           |    9 ++-
 mm/memory.c              |  301 ++++++++++++++++++++++++++++++++--------------
 mm/mmap.c                |   31 +++++-
 mm/util.c                |   12 ++-
 11 files changed, 320 insertions(+), 119 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f627779..0b7b32c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -786,8 +786,6 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
 	 * We ran out of memory, call the OOM killer, and return the userspace
 	 * (which will retry the fault, or kill us if we got oom-killed):
 	 */
-	up_read(&current->mm->mmap_sem);
-
 	pagefault_out_of_memory();
 }
 
@@ -799,8 +797,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	struct mm_struct *mm = tsk->mm;
 	int code = BUS_ADRERR;
 
-	up_read(&mm->mmap_sem);
-
 	/* Kernel mode? Handle exceptions or die: */
 	if (!(error_code & PF_USER))
 		no_context(regs, error_code, address);
@@ -1030,6 +1026,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		pgtable_bad(regs, error_code, address);
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	tsk->total_flt++;
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1040,6 +1037,16 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		return;
 	}
 
+	if (error_code & PF_USER) {
+		fault = handle_speculative_fault(mm, address,
+				error_code & PF_WRITE ? FAULT_FLAG_WRITE : 0);
+
+		if (!(fault & VM_FAULT_RETRY)) {
+			tsk->spec_flt++;
+			goto done;
+		}
+	}
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in
@@ -1119,7 +1126,10 @@ good_area:
 	 */
 	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
 
+	up_read(&mm->mmap_sem);
+done:
 	if (unlikely(fault & VM_FAULT_ERROR)) {
+		tsk->err_flt++;
 		mm_fault_error(regs, error_code, address, fault);
 		return;
 	}
@@ -1135,6 +1145,4 @@ good_area:
 	}
 
 	check_v8086_mode(regs, address, tsk);
-
-	up_read(&mm->mmap_sem);
 }
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bc..6eeaef7 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -373,3 +373,13 @@ slow_irqon:
 		return ret;
 	}
 }
+
+void pin_page_tables(void)
+{
+	local_irq_disable();
+}
+
+void unpin_page_tables(void)
+{
+	local_irq_enable();
+}
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 1ae2de7..e805b83 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -253,6 +253,9 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
 	if (regs)
 		show_regs(regs);
 	perf_event_print_debug();
+	printk(KERN_INFO "faults: %lu, err: %lu, min: %lu, maj: %lu, spec: %lu\n",
+			current->total_flt, current->err_flt, current->min_flt,
+			current->maj_flt, current->spec_flt);
 }
 static struct sysrq_key_op sysrq_showregs_op = {
 	.handler	= sysrq_handle_showregs,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2265f28..7bc94f9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -136,6 +136,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
 #define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_SPECULATIVE	0x08
 
 /*
  * This interface is used by x86 PAT code to identify a pfn mapping that is
@@ -711,6 +712,7 @@ static inline int page_mapped(struct page *page)
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
+#define VM_FAULT_RETRY  0x0400
 
 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
 
@@ -763,6 +765,14 @@ unsigned long unmap_vmas(struct mmu_gather **tlb,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
 
+static inline int vma_is_dead(struct vm_area_struct *vma, unsigned int sequence)
+{
+	int ret = RB_EMPTY_NODE(&vma->vm_rb);
+	unsigned seq = vma->vm_sequence.sequence;
+	smp_rmb();
+	return ret || (seq & 1) || seq != sequence;
+}
+
 /**
  * mm_walk - callbacks for walk_page_range
  * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
@@ -819,6 +829,8 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
+extern int handle_speculative_fault(struct mm_struct *mm,
+			unsigned long address, unsigned int flags);
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
@@ -838,6 +850,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			struct page **pages, struct vm_area_struct **vmas);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
+void pin_page_tables(void);
+void unpin_page_tables(void);
 struct page *get_dump_page(unsigned long addr);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 84a524a..0727300 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,8 @@
 #include <linux/completion.h>
 #include <linux/cpumask.h>
 #include <linux/page-debug-flags.h>
+#include <linux/seqlock.h>
+#include <linux/rcupdate.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -186,6 +188,8 @@ struct vm_area_struct {
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
+	seqcount_t vm_sequence;
+	struct rcu_head vm_rcu_head;
 };
 
 struct core_thread {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f2f842d..8763962 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1347,6 +1347,7 @@ struct task_struct {
 	struct timespec real_start_time;	/* boot based time */
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
+	unsigned long total_flt, err_flt, spec_flt;
 
 	struct task_cputime cputime_expires;
 	struct list_head cpu_timers[3];
diff --git a/init/Kconfig b/init/Kconfig
index 06dab27..5edae47 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -314,19 +314,19 @@ menu "RCU Subsystem"
 
 choice
 	prompt "RCU Implementation"
-	default TREE_RCU
+	default TREE_PREEMPT_RCU
 
-config TREE_RCU
-	bool "Tree-based hierarchical RCU"
-	help
-	  This option selects the RCU implementation that is
-	  designed for very large SMP system with hundreds or
-	  thousands of CPUs.  It also scales down nicely to
-	  smaller systems.
+#config TREE_RCU
+#	bool "Tree-based hierarchical RCU"
+#	help
+#	  This option selects the RCU implementation that is
+#	  designed for very large SMP system with hundreds or
+#	  thousands of CPUs.  It also scales down nicely to
+#	  smaller systems.
 
 config TREE_PREEMPT_RCU
 	bool "Preemptable tree-based hierarchical RCU"
-	depends on PREEMPT
+#	depends on PREEMPT
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP systems with hundreds or
@@ -334,14 +334,14 @@ config TREE_PREEMPT_RCU
 	  is also required.  It also scales down nicely to
 	  smaller systems.
 
-config TINY_RCU
-	bool "UP-only small-memory-footprint RCU"
-	depends on !SMP
-	help
-	  This option selects the RCU implementation that is
-	  designed for UP systems from which real-time response
-	  is not required.  This option greatly reduces the
-	  memory footprint of RCU.
+#config TINY_RCU
+#	bool "UP-only small-memory-footprint RCU"
+#	depends on !SMP
+#	help
+#	  This option selects the RCU implementation that is
+#	  designed for UP systems from which real-time response
+#	  is not required.  This option greatly reduces the
+#	  memory footprint of RCU.
 
 endchoice
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 22c14eb..21cdc52 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9689,7 +9689,14 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+	int nested = (preempt_count() & ~PREEMPT_ACTIVE)
+		/*
+		 * remove this for we need preemptible RCU
+		 * exactly because it needs to sleep..
+		 *
+		 + rcu_preempt_depth()
+		 */
+		;
 
 	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
 }
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1b..2b8f60c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1919,31 +1919,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_file_page
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page and do_no_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-				pte_t *page_table, pte_t orig_pte)
-{
-	int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spinlock_t *ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		same = pte_same(*page_table, orig_pte);
-		spin_unlock(ptl);
-	}
-#endif
-	pte_unmap(page_table);
-	return same;
-}
-
-/*
  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
  * servicing faults for write access.  In the normal case, do always want
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
@@ -1982,6 +1957,60 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		copy_user_highpage(dst, src, va, vma);
 }
 
+static int pte_map_lock(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		unsigned int seq, pte_t **ptep, spinlock_t **ptlp)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+
+	if (!(flags & FAULT_FLAG_SPECULATIVE)) {
+		*ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+		return 1;
+	}
+
+again:
+	pin_page_tables();
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto out;
+
+	if (pmd_huge(*pmd))
+		goto out;
+
+	*ptlp = pte_lockptr(mm, pmd);
+	*ptep = pte_offset_map(pmd, address);
+	if (!spin_trylock(*ptlp)) {
+		pte_unmap(*ptep);
+		unpin_page_tables();
+		goto again;
+	}
+
+	if (!*ptep)
+		goto out;
+
+	if (vma && vma_is_dead(vma, seq))
+		goto unlock;
+
+	unpin_page_tables();
+	return 1;
+
+unlock:
+	pte_unmap_unlock(*ptep, *ptlp);
+out:
+	unpin_page_tables();
+	return 0;
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -2002,7 +2031,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		spinlock_t *ptl, pte_t orig_pte)
+		spinlock_t *ptl, unsigned int flags, pte_t orig_pte,
+		unsigned int seq)
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
@@ -2034,8 +2064,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
 			lock_page(old_page);
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
+
+			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
+						&page_table, &ptl)) {
+				unlock_page(old_page);
+				ret = VM_FAULT_RETRY;
+				goto err;
+			}
+
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
 				page_cache_release(old_page);
@@ -2077,14 +2113,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			if (unlikely(tmp &
 					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
 				ret = tmp;
-				goto unwritable_page;
+				goto err;
 			}
 			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
 				lock_page(old_page);
 				if (!old_page->mapping) {
 					ret = 0; /* retry the fault */
 					unlock_page(old_page);
-					goto unwritable_page;
+					goto err;
 				}
 			} else
 				VM_BUG_ON(!PageLocked(old_page));
@@ -2095,8 +2131,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 * they did, we just return, as we can count on the
 			 * MMU to tell us if they didn't also make it writable.
 			 */
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
+			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
+						&page_table, &ptl)) {
+				unlock_page(old_page);
+				ret = VM_FAULT_RETRY;
+				goto err;
+			}
+
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
 				page_cache_release(old_page);
@@ -2128,17 +2169,23 @@ reuse:
 gotten:
 	pte_unmap_unlock(page_table, ptl);
 
-	if (unlikely(anon_vma_prepare(vma)))
-		goto oom;
+	if (unlikely(anon_vma_prepare(vma))) {
+		ret = VM_FAULT_OOM;
+		goto err;
+	}
 
 	if (is_zero_pfn(pte_pfn(orig_pte))) {
 		new_page = alloc_zeroed_user_highpage_movable(vma, address);
-		if (!new_page)
-			goto oom;
+		if (!new_page) {
+			ret = VM_FAULT_OOM;
+			goto err;
+		}
 	} else {
 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-		if (!new_page)
-			goto oom;
+		if (!new_page) {
+			ret = VM_FAULT_OOM;
+			goto err;
+		}
 		cow_user_page(new_page, old_page, address, vma);
 	}
 	__SetPageUptodate(new_page);
@@ -2153,13 +2200,20 @@ gotten:
 		unlock_page(old_page);
 	}
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
-		goto oom_free_new;
+	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+		ret = VM_FAULT_OOM;
+		goto err_free_new;
+	}
 
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
+		mem_cgroup_uncharge_page(new_page);
+		ret = VM_FAULT_RETRY;
+		goto err_free_new;
+	}
+
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
@@ -2258,9 +2312,9 @@ unlock:
 			file_update_time(vma->vm_file);
 	}
 	return ret;
-oom_free_new:
+err_free_new:
 	page_cache_release(new_page);
-oom:
+err:
 	if (old_page) {
 		if (page_mkwrite) {
 			unlock_page(old_page);
@@ -2268,10 +2322,6 @@ oom:
 		}
 		page_cache_release(old_page);
 	}
-	return VM_FAULT_OOM;
-
-unwritable_page:
-	page_cache_release(old_page);
 	return ret;
 }
 
@@ -2508,22 +2558,23 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		pte_t orig_pte, unsigned int seq)
 {
 	spinlock_t *ptl;
 	struct page *page;
 	swp_entry_t entry;
-	pte_t pte;
+	pte_t *page_table, pte;
 	struct mem_cgroup *ptr = NULL;
 	int ret = 0;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		goto out;
-
 	entry = pte_to_swp_entry(orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
+			if (flags & FAULT_FLAG_SPECULATIVE) {
+				ret = VM_FAULT_RETRY;
+				goto out;
+			}
 			migration_entry_wait(mm, pmd, address);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
@@ -2544,7 +2595,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 * Back out if somebody else faulted in this pte
 			 * while we released the pte lock.
 			 */
-			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+			if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
+						&page_table, &ptl)) {
+				ret = VM_FAULT_RETRY;
+				goto out;
+			}
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
 			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2581,7 +2636,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * Back out if somebody else already faulted in this pte.
 	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
+		ret = VM_FAULT_RETRY;
+		goto out_nolock;
+	}
+
 	if (unlikely(!pte_same(*page_table, orig_pte)))
 		goto out_nomap;
 
@@ -2622,7 +2681,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unlock_page(page);
 
 	if (flags & FAULT_FLAG_WRITE) {
-		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+		ret |= do_wp_page(mm, vma, address, page_table, pmd,
+				ptl, flags, pte, seq);
 		if (ret & VM_FAULT_ERROR)
 			ret &= VM_FAULT_ERROR;
 		goto out;
@@ -2635,8 +2695,9 @@ unlock:
 out:
 	return ret;
 out_nomap:
-	mem_cgroup_cancel_charge_swapin(ptr);
 	pte_unmap_unlock(page_table, ptl);
+out_nolock:
+	mem_cgroup_cancel_charge_swapin(ptr);
 out_page:
 	unlock_page(page);
 out_release:
@@ -2650,18 +2711,19 @@ out_release:
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags)
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		unsigned int seq)
 {
 	struct page *page;
 	spinlock_t *ptl;
-	pte_t entry;
+	pte_t entry, *page_table;
 
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
-		ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
+		if (!pte_map_lock(mm, vma, address, pmd, flags, seq,
+					&page_table, &ptl))
+			return VM_FAULT_RETRY;
 		if (!pte_none(*page_table))
 			goto unlock;
 		goto setpte;
@@ -2684,7 +2746,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
+		mem_cgroup_uncharge_page(page);
+		page_cache_release(page);
+		return VM_FAULT_RETRY;
+	}
+
 	if (!pte_none(*page_table))
 		goto release;
 
@@ -2722,8 +2789,8 @@ oom:
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pmd_t *pmd,
-		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd, pgoff_t pgoff,
+		unsigned int flags, pte_t orig_pte, unsigned int seq)
 {
 	pte_t *page_table;
 	spinlock_t *ptl;
@@ -2823,7 +2890,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	}
 
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &page_table, &ptl)) {
+		ret = VM_FAULT_RETRY;
+		goto out_uncharge;
+	}
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -2856,7 +2926,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
+		pte_unmap_unlock(page_table, ptl);
 	} else {
+		pte_unmap_unlock(page_table, ptl);
+out_uncharge:
 		if (charged)
 			mem_cgroup_uncharge_page(page);
 		if (anon)
@@ -2865,8 +2938,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			anon = 1; /* no anon but release faulted_page */
 	}
 
-	pte_unmap_unlock(page_table, ptl);
-
 out:
 	if (dirty_page) {
 		struct address_space *mapping = page->mapping;
@@ -2900,14 +2971,13 @@ unwritable_page:
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd,
+		unsigned int flags, pte_t orig_pte, unsigned int seq)
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-	pte_unmap(page_table);
-	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, seq);
 }
 
 /*
@@ -2920,16 +2990,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
+		unsigned long address, pmd_t *pmd,
+		unsigned int flags, pte_t orig_pte, unsigned int seq)
 {
 	pgoff_t pgoff;
 
 	flags |= FAULT_FLAG_NONLINEAR;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return 0;
-
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
@@ -2939,7 +3006,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	pgoff = pte_to_pgoff(orig_pte);
-	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, seq);
 }
 
 /*
@@ -2957,37 +3024,38 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
-		pte_t *pte, pmd_t *pmd, unsigned int flags)
+		pte_t entry, pmd_t *pmd, unsigned int flags,
+		unsigned int seq)
 {
-	pte_t entry;
 	spinlock_t *ptl;
+	pte_t *pte;
 
-	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
 				if (likely(vma->vm_ops->fault))
 					return do_linear_fault(mm, vma, address,
-						pte, pmd, flags, entry);
+						pmd, flags, entry, seq);
 			}
 			return do_anonymous_page(mm, vma, address,
-						 pte, pmd, flags);
+						 pmd, flags, seq);
 		}
 		if (pte_file(entry))
 			return do_nonlinear_fault(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry, seq);
 		return do_swap_page(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry, seq);
 	}
 
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
+	if (!pte_map_lock(mm, vma, address, pmd, flags, seq, &pte, &ptl))
+		return VM_FAULT_RETRY;
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (flags & FAULT_FLAG_WRITE) {
-		if (!pte_write(entry))
+		if (!pte_write(entry)) {
 			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+					pte, pmd, ptl, flags, entry, seq);
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
@@ -3017,7 +3085,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t *pte, entry;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -3037,9 +3105,60 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte)
 		return VM_FAULT_OOM;
 
-	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+	entry = *pte;
+
+	pte_unmap(pte);
+
+	return handle_pte_fault(mm, vma, address, entry, pmd, flags, 0);
 }
 
+int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+		unsigned int flags)
+{
+	pmd_t *pmd = NULL;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	struct vm_area_struct *vma;
+	unsigned int seq;
+	int ret = VM_FAULT_RETRY;
+	int dead;
+
+	__set_current_state(TASK_RUNNING);
+	flags |= FAULT_FLAG_SPECULATIVE;
+
+	count_vm_event(PGFAULT);
+
+	rcu_read_lock();
+	if (!pte_map_lock(mm, NULL, address, pmd, flags, 0, &pte, &ptl))
+		goto out_unlock;
+
+	vma = find_vma(mm, address);
+	if (!(vma && vma->vm_end > address && vma->vm_start <= address))
+		goto out_unmap;
+
+	dead = RB_EMPTY_NODE(&vma->vm_rb);
+	seq = vma->vm_sequence.sequence;
+	smp_rmb();
+
+	if (dead || seq & 1)
+		goto out_unmap;
+
+	entry = *pte;
+
+	pte_unmap_unlock(pte, ptl);
+
+	ret = handle_pte_fault(mm, vma, address, entry, pmd, flags, seq);
+
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+
+out_unmap:
+	pte_unmap_unlock(pte, ptl);
+	goto out_unlock;
+}
+
+
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
diff --git a/mm/mmap.c b/mm/mmap.c
index ee22989..da7d34c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -222,6 +222,19 @@ void unlink_file_vma(struct vm_area_struct *vma)
 	}
 }
 
+static void free_vma_rcu(struct rcu_head *head)
+{
+	struct vm_area_struct *vma =
+		container_of(head, struct vm_area_struct, vm_rcu_head);
+
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+static void free_vma(struct vm_area_struct *vma)
+{
+	call_rcu(&vma->vm_rcu_head, free_vma_rcu);
+}
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -238,7 +251,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 			removed_exe_file_vma(vma->vm_mm);
 	}
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma);
 	return next;
 }
 
@@ -488,6 +501,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	prev->vm_next = vma->vm_next;
 	rb_erase(&vma->vm_rb, &mm->mm_rb);
+	smp_wmb();
+	RB_CLEAR_NODE(&vma->vm_rb);
 	if (mm->mmap_cache == vma)
 		mm->mmap_cache = prev;
 }
@@ -512,6 +527,10 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	long adjust_next = 0;
 	int remove_next = 0;
 
+	write_seqcount_begin(&vma->vm_sequence);
+	if (next)
+		write_seqcount_begin(&next->vm_sequence);
+
 	if (next && !insert) {
 		if (end >= next->vm_end) {
 			/*
@@ -640,18 +659,24 @@ again:			remove_next = 1 + (end > next->vm_end);
 		}
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
 		 * up the code too much to do both in one go.
 		 */
 		if (remove_next == 2) {
+			write_seqcount_end(&next->vm_sequence);
 			next = vma->vm_next;
+			write_seqcount_begin(&next->vm_sequence);
 			goto again;
 		}
 	}
 
+	if (next)
+		write_seqcount_end(&next->vm_sequence);
+	write_seqcount_end(&vma->vm_sequence);
+
 	validate_mm(mm);
 }
 
@@ -1848,6 +1873,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
 		rb_erase(&vma->vm_rb, &mm->mm_rb);
+		smp_wmb();
+		RB_CLEAR_NODE(&vma->vm_rb);
 		mm->map_count--;
 		tail_vma = vma;
 		vma = vma->vm_next;
diff --git a/mm/util.c b/mm/util.c
index 7c35ad9..ba5350c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -253,8 +253,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
  * callers need to carefully consider what to use. On many architectures,
  * get_user_pages_fast simply falls back to get_user_pages.
  */
-int __attribute__((weak)) get_user_pages_fast(unsigned long start,
-				int nr_pages, int write, struct page **pages)
+int __weak get_user_pages_fast(unsigned long start,
+			       int nr_pages, int write, struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
 	int ret;
@@ -268,6 +268,14 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
+void __weak pin_page_tables(void)
+{
+}
+
+void __weak unpin_page_tables(void)
+{
+}
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);



^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2010-01-04  7:53                 ` Peter Zijlstra
@ 2010-01-04 15:55                   ` Paul E. McKenney
  2010-01-04 16:02                     ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Paul E. McKenney @ 2010-01-04 15:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Mon, Jan 04, 2010 at 08:53:23AM +0100, Peter Zijlstra wrote:
> On Sun, 2010-01-03 at 19:02 -0800, Paul E. McKenney wrote:
> > It would not be all that hard for me to make a call_srcu(), but...
> > 
> > 1.      How are you avoiding OOM by SRCU callback?  (I am sure you
> >         have this worked out, but I do have to ask!)
> 
> Well, I was thinking srcu to have this force quiescent state in
> call_srcu() much like you did for the preemptible rcu.

Ah, so the idea would be that you register a function with the srcu_struct
that is invoked when some readers are stuck for too long in their SRCU
read-side critical sections?  Presumably you also supply a time value for
"too long" as well.  Hmmm...  What do you do, cancel the corresponding
I/O or something?

This would not be hard once I get SRCU folded into the treercu
infrastructure.  However, at the moment, SRCU has no way of tracking
which tasks are holding things up.  So not 2.6.34 material, but definitely
doable longer term.

> Alternatively we could actively throttle the call_srcu() call when we've
> got too much pending work.

This could be done with the existing SRCU implementation, for example as
a call to a function you registered.  In theory, it would be possible
to make call_srcu() refuse to enqueue a callback when there were too
many, but that really badly violates the spirit of the call_rcu() family
of functions.
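
As a rough sketch, a throttling call_srcu() along those lines might look
like the following. This is entirely hypothetical: neither call_srcu() nor
the callback list and pending counter exist in the current srcu_struct, and
blocking in synchronize_srcu() of course restricts callers to sleepable
context.

#define SRCU_CB_THRESH	10000	/* assumed threshold */

void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
	       void (*func)(struct rcu_head *head))
{
	head->func = func;
	head->next = NULL;

	spin_lock(&sp->cb_lock);	/* assumed field */
	*sp->cb_tail = head;		/* assumed field */
	sp->cb_tail = &head->next;
	sp->cb_pending++;		/* assumed field */
	spin_unlock(&sp->cb_lock);

	/*
	 * Throttle: force a grace period once the backlog gets too deep,
	 * rather than refusing to enqueue the callback.
	 */
	if (sp->cb_pending > SRCU_CB_THRESH)
		synchronize_srcu(sp);
}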

> > 2.      How many srcu_struct data structures are you envisioning?
> >         One globally?  One per process?  One per struct vma?
> >         (Not necessary to know this for call_srcu(), but will be needed
> >         as I work out how to make SRCU scale with large numbers of CPUs.)
> 
> For this patch in particular, one global one, covering all vmas.

Whew!!!  ;-)

Then it would still be feasible for the CPU-hotplug code to scan all
SRCU structures, if need be.  (The reason that SRCU gets away without
worrying about CPU hotplug now is that it doesn't keep track of which
tasks are in read-side critical sections.)

> One reason to keep the vma RCU domain separate from other RCU objects is
> that these VMA thingies can have rather long quiescent periods due to
> this sleep stuff. So mixing that in with other RCU users which have much
> better defined periods will just degrade everything bringing that OOM
> scenario much closer.

Fair enough!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2010-01-04 15:55                   ` Paul E. McKenney
@ 2010-01-04 16:02                     ` Peter Zijlstra
  2010-01-04 16:56                       ` Paul E. McKenney
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2010-01-04 16:02 UTC (permalink / raw)
  To: paulmck
  Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Mon, 2010-01-04 at 07:55 -0800, Paul E. McKenney wrote:
> > Well, I was thinking srcu to have this force quiescent state in
> > call_srcu() much like you did for the preemptible rcu.
> 
> Ah, so the idea would be that you register a function with the srcu_struct
> that is invoked when some readers are stuck for too long in their SRCU
> read-side critical sections?  Presumably you also supply a time value for
> "too long" as well.  Hmmm...  What do you do, cancel the corresponding
> I/O or something? 

Hmm, I was more thinking along the lines of:

say IDX is the current counter idx.

if (pending > thresh) {
  flush(!IDX);
  force_flip_counter();
}

Since we explicitly hold a reference on IDX, we can actually wait for
!IDX to reach 0 and flush those callbacks.

We then force-flip the counter, so that even if all callbacks (or the
majority) were not for !IDX but part of IDX, we'd be able to flush them
on the next call_srcu() because that will then hold a ref on the new
counter index.


Or am I missing something obvious?
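
A hedged sketch of the above, in case the fragment is hard to read on its
own.  It assumes a two-sided pending-callback list keyed by the counter
index; call_srcu() did not exist at the time, and srcu_flush_side() /
srcu_force_flip() below merely stand in for the flush(!IDX) and
force_flip_counter() steps:

#include <linux/list.h>

#define SRCU_CB_THRESH	1000		/* arbitrary threshold for the sketch */

struct srcu_cb {
	struct list_head node;
	void (*func)(struct srcu_cb *cb);
};

struct srcu_cb_state {
	int		 idx;		/* current counter index, "IDX" */
	unsigned long	 nr_pending;
	struct list_head pending[2];	/* callbacks queued against each side */
};

static void srcu_flush_side(struct srcu_cb_state *st, int side)
{
	/* body omitted: wait for readers of 'side' to drain to zero, then
	 * run and drop everything on st->pending[side], decrementing
	 * st->nr_pending accordingly */
}

static void srcu_force_flip(struct srcu_cb_state *st)
{
	/* the real version would also rotate the per-cpu reader counters */
	st->idx = !st->idx;
}

/* Hypothetical call_srcu() with the throttling described above. */
static void call_srcu_sketch(struct srcu_cb_state *st, struct srcu_cb *cb,
			     void (*func)(struct srcu_cb *))
{
	if (st->nr_pending > SRCU_CB_THRESH) {
		/* New work, including this callback, references IDX, so
		 * readers of !IDX can be waited out and its callbacks run. */
		srcu_flush_side(st, !st->idx);
		/* Force-flip so callbacks queued against the old IDX become
		 * flushable by the next call_srcu(). */
		srcu_force_flip(st);
	}

	cb->func = func;
	list_add_tail(&cb->node, &st->pending[st->idx]);
	st->nr_pending++;
}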


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC PATCH] asynchronous page fault.
  2010-01-04 16:02                     ` Peter Zijlstra
@ 2010-01-04 16:56                       ` Paul E. McKenney
  0 siblings, 0 replies; 32+ messages in thread
From: Paul E. McKenney @ 2010-01-04 16:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, minchan.kim, cl,
	hugh.dickins, Nick Piggin, Ingo Molnar, Linus Torvalds

On Mon, Jan 04, 2010 at 05:02:54PM +0100, Peter Zijlstra wrote:
> On Mon, 2010-01-04 at 07:55 -0800, Paul E. McKenney wrote:
> > > Well, I was thinking srcu to have this force quiescent state in
> > > call_srcu() much like you did for the preemptible rcu.
> > 
> > Ah, so the idea would be that you register a function with the srcu_struct
> > that is invoked when some readers are stuck for too long in their SRCU
> > read-side critical sections?  Presumably you also supply a time value for
> > "too long" as well.  Hmmm...  What do you do, cancel the corresponding
> > I/O or something? 
> 
> Hmm, I was more thinking along the lines of:
> 
> say IDX is the current counter idx.
> 
> if (pending > thresh) {
>   flush(!IDX);

This flushes pending I/Os?

>   force_flip_counter();

If this is internal to SRCU, what it would do is check for CPUs being
offline or in dyntick-idle state.  Or was your thought that this is
where I invoke callbacks into your code to do whatever can be done to
wake up the sleeping readers?

> }
> 
> Since we explicitly hold a reference on IDX, we can actually wait for
> !IDX to reach 0 and flush those callbacks.

One other thing -- if I merge SRCU into the tree-based infrastructure,
I should be able to eliminate the need for srcu_read_lock() to return
the index (and thus for srcu_read_unlock() to take it as an argument).
So the index would be strictly internal, as it currently is with the
other flavors of RCU.
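
For reference, the read-side calling convention being discussed is real API;
only the index-less form in the comment is hypothetical.  The vma_srcu name
is made up for the example and would need init_srcu_struct() somewhere:

#include <linux/srcu.h>

static struct srcu_struct vma_srcu;	/* initialised elsewhere */

static void srcu_reader_example(void)
{
	int idx;

	idx = srcu_read_lock(&vma_srcu);	/* existing API returns the side */
	/* ... the speculative vma lookup would run here ... */
	srcu_read_unlock(&vma_srcu, idx);	/* and must be handed it back */
}

/*
 * What folding SRCU into the tree infrastructure would allow, with the
 * index kept internal (hypothetical at the time of this thread):
 *
 *	srcu_read_lock(&vma_srcu);
 *	...
 *	srcu_read_unlock(&vma_srcu);
 */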

> We then force-flip the counter, so that even if all callbacks (or the
> majority) were not for !IDX but part of IDX, we'd be able to flush them
> on the next call_srcu() because that will then hold a ref on the new
> counter index.

We can certainly defer callbacks to a later grace period.  What we cannot
do is advance the counter until all readers for the current grace period
have exited their SRCU read-side critical sections.

> Or am I missing something obvious?

Or maybe I am.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2010-01-04 16:57 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-12-25  1:51 [RFC PATCH] asynchronous page fault KAMEZAWA Hiroyuki
2009-12-27  9:47 ` Minchan Kim
2009-12-27 23:59   ` KAMEZAWA Hiroyuki
2009-12-27 11:19 ` Peter Zijlstra
2009-12-28  0:00   ` KAMEZAWA Hiroyuki
2009-12-28  0:57   ` Balbir Singh
2009-12-28  1:05     ` KAMEZAWA Hiroyuki
2009-12-28  2:58       ` Balbir Singh
2009-12-28  3:13         ` KAMEZAWA Hiroyuki
2009-12-28  8:34         ` Peter Zijlstra
2009-12-28  8:32     ` Peter Zijlstra
2009-12-29  9:54       ` Balbir Singh
2009-12-27 12:03 ` Peter Zijlstra
2009-12-28  0:36   ` KAMEZAWA Hiroyuki
2009-12-28  1:19     ` KAMEZAWA Hiroyuki
2009-12-28  8:30     ` Peter Zijlstra
2009-12-28  9:58       ` KAMEZAWA Hiroyuki
2009-12-28 10:30         ` Peter Zijlstra
2009-12-28 10:40           ` Peter Zijlstra
2010-01-02 16:14             ` Peter Zijlstra
2010-01-04  3:02               ` Paul E. McKenney
2010-01-04  7:53                 ` Peter Zijlstra
2010-01-04 15:55                   ` Paul E. McKenney
2010-01-04 16:02                     ` Peter Zijlstra
2010-01-04 16:56                       ` Paul E. McKenney
2010-01-04 13:48               ` [RFC PATCH -v2] speculative " Peter Zijlstra
2009-12-28 10:57           ` [RFC PATCH] asynchronous " KAMEZAWA Hiroyuki
2009-12-28 11:06             ` Peter Zijlstra
2009-12-28  8:55     ` Peter Zijlstra
2009-12-28 10:08       ` KAMEZAWA Hiroyuki
2009-12-28 11:43     ` Peter Zijlstra
2010-01-02 21:45 ` Benjamin Herrenschmidt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).