Linux-mm Archive on lore.kernel.org
 help / color / Atom feed
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Peter Zijlstra <peterz@infradead.org>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	"minchan.kim@gmail.com" <minchan.kim@gmail.com>,
	cl@linux-foundation.org,
	"hugh.dickins" <hugh.dickins@tiscali.co.uk>,
	Nick Piggin <nickpiggin@yahoo.com.au>,
	Ingo Molnar <mingo@elte.hu>,
	Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [RFC][PATCH 4/8] mm: RCU free vmas
Date: Mon, 04 Jan 2010 19:24:33 +0100
Message-ID: <20100104182813.479668508@chello.nl> (raw)
In-Reply-To: <20100104182429.833180340@chello.nl>


[-- Attachment #0: mm-foo-3.patch --]
[-- Type: text/plain, Size: 6711 bytes --]

TODO:
 - should be SRCU, lack of call_srcu()

In order to allow speculative vma lookups, RCU free the struct
vm_area_struct.

We use two means of detecting a vma is still valid:
 - firstly, we set RB_CLEAR_NODE once we remove a vma from the tree.
 - secondly, we check the vma sequence number.

These two things combined will guarantee that 1) the vma is still
present and two, it still covers the same range from when we looked it
up.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/mm.h       |   12 ++++++++++++
 include/linux/mm_types.h |    2 ++
 init/Kconfig             |   34 +++++++++++++++++-----------------
 kernel/sched.c           |    9 ++++++++-
 mm/mmap.c                |   33 +++++++++++++++++++++++++++++++--
 5 files changed, 70 insertions(+), 20 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -765,6 +765,18 @@ unsigned long unmap_vmas(struct mmu_gath
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
 
+static inline int vma_is_dead(struct vm_area_struct *vma, unsigned int sequence)
+{
+	int ret = RB_EMPTY_NODE(&vma->vm_rb);
+	unsigned seq = vma->vm_sequence.sequence;
+	/*
+	 * Matches both the wmb in write_seqlock_begin/end() and
+	 * the wmb in detach_vmas_to_be_unmapped()/__unlink_vma().
+	 */
+	smp_rmb();
+	return ret || seq != sequence;
+}
+
 /**
  * mm_walk - callbacks for walk_page_range
  * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h
+++ linux-2.6/include/linux/mm_types.h
@@ -13,6 +13,7 @@
 #include <linux/cpumask.h>
 #include <linux/page-debug-flags.h>
 #include <linux/seqlock.h>
+#include <linux/rcupdate.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -188,6 +189,7 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	seqcount_t vm_sequence;
+	struct rcu_head vm_rcu_head;
 };
 
 struct core_thread {
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -222,6 +222,19 @@ void unlink_file_vma(struct vm_area_stru
 	}
 }
 
+static void free_vma_rcu(struct rcu_head *head)
+{
+	struct vm_area_struct *vma =
+		container_of(head, struct vm_area_struct, vm_rcu_head);
+
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+static void free_vma(struct vm_area_struct *vma)
+{
+	call_rcu(&vma->vm_rcu_head, free_vma_rcu);
+}
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -238,7 +251,7 @@ static struct vm_area_struct *remove_vma
 			removed_exe_file_vma(vma->vm_mm);
 	}
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma);
 	return next;
 }
 
@@ -488,6 +501,14 @@ __vma_unlink(struct mm_struct *mm, struc
 {
 	prev->vm_next = vma->vm_next;
 	rb_erase(&vma->vm_rb, &mm->mm_rb);
+	/*
+	 * Ensure the removal is completely comitted to memory
+	 * before clearing the node.
+	 *
+	 * Matched by vma_is_dead()/handle_speculative_fault().
+	 */
+	smp_wmb();
+	RB_CLEAR_NODE(&vma->vm_rb);
 	if (mm->mmap_cache == vma)
 		mm->mmap_cache = prev;
 }
@@ -644,7 +665,7 @@ again:			remove_next = 1 + (end > next->
 		}
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -1858,6 +1879,14 @@ detach_vmas_to_be_unmapped(struct mm_str
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
 		rb_erase(&vma->vm_rb, &mm->mm_rb);
+		/*
+		 * Ensure the removal is completely comitted to memory
+		 * before clearing the node.
+		 *
+		 * Matched by vma_is_dead()/handle_speculative_fault().
+		 */
+		smp_wmb();
+		RB_CLEAR_NODE(&vma->vm_rb);
 		mm->map_count--;
 		tail_vma = vma;
 		vma = vma->vm_next;
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -314,19 +314,19 @@ menu "RCU Subsystem"
 
 choice
 	prompt "RCU Implementation"
-	default TREE_RCU
+	default TREE_PREEMPT_RCU
 
-config TREE_RCU
-	bool "Tree-based hierarchical RCU"
-	help
-	  This option selects the RCU implementation that is
-	  designed for very large SMP system with hundreds or
-	  thousands of CPUs.  It also scales down nicely to
-	  smaller systems.
+#config TREE_RCU
+#	bool "Tree-based hierarchical RCU"
+#	help
+#	  This option selects the RCU implementation that is
+#	  designed for very large SMP system with hundreds or
+#	  thousands of CPUs.  It also scales down nicely to
+#	  smaller systems.
 
 config TREE_PREEMPT_RCU
 	bool "Preemptable tree-based hierarchical RCU"
-	depends on PREEMPT
+#	depends on PREEMPT
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP systems with hundreds or
@@ -334,14 +334,14 @@ config TREE_PREEMPT_RCU
 	  is also required.  It also scales down nicely to
 	  smaller systems.
 
-config TINY_RCU
-	bool "UP-only small-memory-footprint RCU"
-	depends on !SMP
-	help
-	  This option selects the RCU implementation that is
-	  designed for UP systems from which real-time response
-	  is not required.  This option greatly reduces the
-	  memory footprint of RCU.
+#config TINY_RCU
+#	bool "UP-only small-memory-footprint RCU"
+#	depends on !SMP
+#	help
+#	  This option selects the RCU implementation that is
+#	  designed for UP systems from which real-time response
+#	  is not required.  This option greatly reduces the
+#	  memory footprint of RCU.
 
 endchoice
 
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -9689,7 +9689,14 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+	int nested = (preempt_count() & ~PREEMPT_ACTIVE)
+		/*
+		 * remove this for we need preemptible RCU
+		 * exactly because it needs to sleep..
+		 *
+		 + rcu_preempt_depth()
+		 */
+		;
 
 	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
 }

-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply index

Thread overview: 121+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-01-04 18:24 [RFC][PATCH 0/8] Speculative pagefault -v3 Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 1/8] mm: Remove pte reference from fault path Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 2/8] mm: Speculative pagefault infrastructure Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 3/8] mm: Add vma sequence count Peter Zijlstra
2010-01-04 18:24 ` Peter Zijlstra [this message]
2010-01-05  2:43   ` [RFC][PATCH 4/8] mm: RCU free vmas Paul E. McKenney
2010-01-05  8:28     ` Peter Zijlstra
2010-01-05 16:05       ` Paul E. McKenney
2010-01-04 18:24 ` [RFC][PATCH 5/8] mm: Speculative pte_map_lock() Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 6/8] mm: handle_speculative_fault() Peter Zijlstra
2010-01-05  0:25   ` KAMEZAWA Hiroyuki
2010-01-05  3:13     ` Linus Torvalds
2010-01-05  8:17       ` Peter Zijlstra
2010-01-05  8:57       ` Peter Zijlstra
2010-01-05 15:34         ` Linus Torvalds
2010-01-05 15:40           ` Al Viro
2010-01-05 16:10             ` Linus Torvalds
2010-01-06 15:41               ` Peter Zijlstra
2010-01-05  9:37       ` Peter Zijlstra
2010-01-05 23:35         ` Linus Torvalds
2010-01-05  4:29     ` Minchan Kim
2010-01-05  4:43       ` KAMEZAWA Hiroyuki
2010-01-05  5:10         ` Linus Torvalds
2010-01-05  5:30           ` KAMEZAWA Hiroyuki
2010-01-05  7:39             ` KAMEZAWA Hiroyuki
2010-01-05 15:26               ` Linus Torvalds
2010-01-05 16:14                 ` Linus Torvalds
2010-01-05 17:25                   ` Andi Kleen
2010-01-05 17:47                     ` Christoph Lameter
2010-01-05 18:00                       ` Andi Kleen
2010-01-05 17:55                     ` Linus Torvalds
2010-01-05 18:13                       ` Christoph Lameter
2010-01-05 18:25                         ` Linus Torvalds
2010-01-05 18:46                           ` Christoph Lameter
2010-01-05 18:56                             ` Linus Torvalds
2010-01-05 19:15                               ` Christoph Lameter
2010-01-05 19:28                                 ` Linus Torvalds
2010-01-05 18:55                           ` Paul E. McKenney
2010-01-05 19:08                             ` Linus Torvalds
2010-01-05 19:23                               ` Paul E. McKenney
2010-01-05 20:29                           ` Peter Zijlstra
2010-01-05 20:46                             ` Linus Torvalds
2010-01-05 21:00                               ` Linus Torvalds
2010-01-05 23:29                             ` Paul E. McKenney
2010-01-06  0:22                 ` KAMEZAWA Hiroyuki
2010-01-06  1:37                   ` Linus Torvalds
2010-01-06  2:52                     ` KAMEZAWA Hiroyuki
2010-01-06  3:27                       ` Linus Torvalds
2010-01-06  3:56                         ` KAMEZAWA Hiroyuki
2010-01-06  4:20                           ` Linus Torvalds
2010-01-06  7:06                             ` KAMEZAWA Hiroyuki
2010-01-06  7:49                               ` Minchan Kim
2010-01-06  9:39                               ` Linus Torvalds
2010-01-07  1:00                                 ` KAMEZAWA Hiroyuki
2010-01-08 16:53                             ` Peter Zijlstra
2010-01-08 17:22                               ` Linus Torvalds
2010-01-08 17:43                                 ` Christoph Lameter
2010-01-08 17:52                                   ` Linus Torvalds
2010-01-08 18:33                                     ` Christoph Lameter
2010-01-08 18:46                                   ` Andi Kleen
2010-01-08 18:56                                     ` Christoph Lameter
2010-01-08 19:10                                       ` Andi Kleen
2010-01-08 19:11                                       ` Linus Torvalds
2010-01-08 19:28                                         ` Andi Kleen
2010-01-08 19:39                                           ` Linus Torvalds
2010-01-08 19:42                                             ` Linus Torvalds
2010-01-08 21:36                                   ` Linus Torvalds
2010-01-08 21:46                                     ` Christoph Lameter
2010-01-08 22:43                                       ` Linus Torvalds
2010-01-08 22:43                                       ` Linus Torvalds
2010-01-09 14:47                               ` Ed Tomlinson
2010-01-10  5:27                                 ` Nitin Gupta
2010-01-05 15:14             ` Christoph Lameter
2010-01-05  8:18           ` Peter Zijlstra
2010-01-05  6:00         ` Minchan Kim
2010-01-05  4:48       ` Linus Torvalds
2010-01-05  6:09         ` Minchan Kim
2010-01-05  6:09           ` KAMEZAWA Hiroyuki
2010-01-05  6:24             ` Minchan Kim
2010-01-05  8:35           ` Peter Zijlstra
2010-01-05 13:45   ` Arjan van de Ven
2010-01-05 14:15     ` Andi Kleen
2010-01-05 15:17     ` Christoph Lameter
2010-01-06  3:22       ` Arjan van de Ven
2010-01-07 16:11         ` Christoph Lameter
2010-01-07 16:19           ` Linus Torvalds
2010-01-07 16:31             ` Linus Torvalds
2010-01-07 16:34             ` Paul E. McKenney
2010-01-07 16:36             ` Christoph Lameter
2010-01-08  4:49               ` Arjan van de Ven
2010-01-08  5:00                 ` Linus Torvalds
2010-01-08 15:51                 ` Christoph Lameter
2010-01-09 15:55                   ` Arjan van de Ven
2010-01-07 17:22             ` Peter Zijlstra
2010-01-07 17:36               ` Linus Torvalds
2010-01-07 17:49                 ` Linus Torvalds
2010-01-07 18:00                   ` Peter Zijlstra
2010-01-07 18:15                     ` Linus Torvalds
2010-01-07 21:49                       ` Peter Zijlstra
2010-01-07 18:44                   ` Linus Torvalds
2010-01-07 19:20                     ` Paul E. McKenney
2010-01-07 20:06                       ` Linus Torvalds
2010-01-07 20:25                         ` Paul E. McKenney
2010-01-07 19:24                     ` Christoph Lameter
2010-01-07 20:08                       ` Linus Torvalds
2010-01-07 20:13                         ` Linus Torvalds
2010-01-07 21:44                     ` Peter Zijlstra
2010-01-07 22:33                       ` Linus Torvalds
2010-01-08  0:23                         ` KAMEZAWA Hiroyuki
2010-01-08  0:25                           ` KAMEZAWA Hiroyuki
2010-01-08  0:39                           ` Linus Torvalds
2010-01-08  0:41                             ` Linus Torvalds
2010-01-07 23:51                 ` Rik van Riel
2010-01-04 18:24 ` [RFC][PATCH 7/8] mm,x86: speculative pagefault support Peter Zijlstra
2010-01-04 18:24 ` [RFC][PATCH 8/8] mm: Optimize pte_map_lock() Peter Zijlstra
2010-01-04 21:41 ` [RFC][PATCH 0/8] Speculative pagefault -v3 Rik van Riel
2010-01-04 21:46   ` Peter Zijlstra
2010-01-04 23:20     ` Rik van Riel
2010-01-04 21:59   ` Christoph Lameter
2010-01-05  0:28     ` KAMEZAWA Hiroyuki
2010-01-05  2:26 ` Minchan Kim

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100104182813.479668508@chello.nl \
    --to=a.p.zijlstra@chello.nl \
    --cc=cl@linux-foundation.org \
    --cc=hugh.dickins@tiscali.co.uk \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=minchan.kim@gmail.com \
    --cc=mingo@elte.hu \
    --cc=nickpiggin@yahoo.com.au \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=peterz@infradead.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-mm Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-mm/0 linux-mm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-mm linux-mm/ https://lore.kernel.org/linux-mm \
		linux-mm@kvack.org
	public-inbox-index linux-mm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kvack.linux-mm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git