* [RFP 0/3] Make mmu_notifier_invalidate_range_start able to sleep.
@ 2010-01-28 19:56 Robin Holt
  2010-01-28 19:56 ` [RFP 1/3] srcu Robin Holt
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Robin Holt @ 2010-01-28 19:56 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-mm

This proposed set of patches has three parts.  The first changes
mmu_notifiers over to using srcu instead of rcu, the second moves the
tlb_gather_mmu call to after mmu_notifier_invalidate_range_start, and the
last allows the truncate call into zap_page_range to work in an atomic context.

The atomic case is handled by unlocking the i_mmap_lock and then
making a second call into mmu_notifier_invalidate_range_start().
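
For reference, the interface change the series converges on looks roughly
like this (a sketch distilled from the patches below, not an additional
change):

	/* before: always void, may be called with i_mmap_lock held */
	void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end);

	/*
	 * after: the caller passes atomic != 0 when it cannot sleep
	 * (i_mmap_lock held).  Per the 3/3 changelog, a non-zero return
	 * means a notifier needs to sleep; the caller backs out, drops
	 * the lock, and repeats the callout with atomic == 0.
	 */
	int mmu_notifier_invalidate_range_start(struct mm_struct *mm,
						unsigned long start,
						unsigned long end, int atomic);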

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: Jack Steiner <steiner@sgi.com>
To: Andrea Arcangeli <aarcange@redhat.com>
Cc: linux-mm@kvack.org

---

 include/linux/mm.h           |    2 -
 include/linux/mmu_notifier.h |   20 +++++++-----
 include/linux/srcu.h         |    2 +
 mm/fremap.c                  |    2 -
 mm/hugetlb.c                 |    2 -
 mm/memory.c                  |   36 ++++++++++++++++------
 mm/mmap.c                    |    6 +--
 mm/mmu_notifier.c            |   69 ++++++++++++++++++++++++++-----------------
 mm/mprotect.c                |    2 -
 mm/mremap.c                  |    2 -
 10 files changed, 90 insertions(+), 53 deletions(-)


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFP 1/3] srcu
  2010-01-28 19:56 [RFP 0/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
@ 2010-01-28 19:56 ` Robin Holt
  2010-01-29 20:56   ` Andrew Morton
  2010-01-28 19:56 ` [RFP 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt
  2010-01-28 19:56 ` [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
  2 siblings, 1 reply; 12+ messages in thread
From: Robin Holt @ 2010-01-28 19:56 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-mm, Jack Steiner

[-- Attachment #1: mmu_notifier_srcu_v1 --]
[-- Type: text/plain, Size: 9207 bytes --]

From: Andrea Arcangeli <andrea@qumranet.com>

This converts rcu into a per-mm srcu to allow all mmu notifier methods to
schedule.
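
The core of the conversion, reduced to a sketch (this is the read side from
the patch below, with the ops != NULL checks and declarations trimmed):

	/* before: plain RCU, so the notifier method may not sleep */
	rcu_read_lock();
	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
		mn->ops->invalidate_range_start(mn, mm, start, end);
	rcu_read_unlock();

	/* after: per-mm SRCU; srcu read sections are allowed to block */
	srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
		mn->ops->invalidate_range_start(mn, mm, start, end);
	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);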

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Jack Steiner <steiner@sgi.com>
To: Andrea Arcangeli <aarcange@redhat.com>
Cc: linux-mm@kvack.org
---

 include/linux/mmu_notifier.h |    3 ++
 include/linux/srcu.h         |    2 +
 mm/mmu_notifier.c            |   59 ++++++++++++++++++++++++++-----------------
 3 files changed, 42 insertions(+), 22 deletions(-)
Index: mmu_notifiers_sleepable_v1/include/linux/mmu_notifier.h
===================================================================
--- mmu_notifiers_sleepable_v1.orig/include/linux/mmu_notifier.h	2010-01-28 10:36:39.000000000 -0600
+++ mmu_notifiers_sleepable_v1/include/linux/mmu_notifier.h	2010-01-28 10:39:10.000000000 -0600
@@ -4,6 +4,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
+#include <linux/srcu.h>
 
 struct mmu_notifier;
 struct mmu_notifier_ops;
@@ -19,6 +20,8 @@ struct mmu_notifier_ops;
 struct mmu_notifier_mm {
 	/* all mmu notifiers registerd in this mm are queued in this list */
 	struct hlist_head list;
+	/* srcu structure for this mm */
+	struct srcu_struct srcu;
 	/* to serialize the list modifications and hlist_unhashed */
 	spinlock_t lock;
 };
Index: mmu_notifiers_sleepable_v1/include/linux/srcu.h
===================================================================
--- mmu_notifiers_sleepable_v1.orig/include/linux/srcu.h	2010-01-28 10:36:39.000000000 -0600
+++ mmu_notifiers_sleepable_v1/include/linux/srcu.h	2010-01-28 10:39:10.000000000 -0600
@@ -27,6 +27,8 @@
 #ifndef _LINUX_SRCU_H
 #define _LINUX_SRCU_H
 
+#include <linux/mutex.h>
+
 struct srcu_struct_array {
 	int c[2];
 };
Index: mmu_notifiers_sleepable_v1/mm/mmu_notifier.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/mmu_notifier.c	2010-01-28 10:36:40.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/mmu_notifier.c	2010-01-28 10:39:10.000000000 -0600
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/err.h>
+#include <linux/srcu.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 
@@ -24,14 +25,15 @@
  * in parallel despite there being no task using this mm any more,
  * through the vmas outside of the exit_mmap context, such as with
  * vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to RCU and it serializes
- * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * the mmu_notifier_mm->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
  * can't go away from under us as exit_mmap holds an mm_count pin
  * itself.
  */
 void __mmu_notifier_release(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
+	int srcu;
 
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,10 +48,10 @@ void __mmu_notifier_release(struct mm_st
 		 */
 		hlist_del_init_rcu(&mn->hlist);
 		/*
-		 * RCU here will block mmu_notifier_unregister until
+		 * SRCU here will block mmu_notifier_unregister until
 		 * ->release returns.
 		 */
-		rcu_read_lock();
+		srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
 		spin_unlock(&mm->mmu_notifier_mm->lock);
 		/*
 		 * if ->release runs before mmu_notifier_unregister it
@@ -60,13 +62,13 @@ void __mmu_notifier_release(struct mm_st
 		 */
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
-		rcu_read_unlock();
+		srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 		spin_lock(&mm->mmu_notifier_mm->lock);
 	}
 	spin_unlock(&mm->mmu_notifier_mm->lock);
 
 	/*
-	 * synchronize_rcu here prevents mmu_notifier_release to
+	 * synchronize_srcu here prevents mmu_notifier_release to
 	 * return to exit_mmap (which would proceed freeing all pages
 	 * in the mm) until the ->release method returns, if it was
 	 * invoked by mmu_notifier_unregister.
@@ -74,7 +76,7 @@ void __mmu_notifier_release(struct mm_st
 	 * The mmu_notifier_mm can't go away from under us because one
 	 * mm_count is hold by exit_mmap.
 	 */
-	synchronize_rcu();
+	synchronize_srcu(&mm->mmu_notifier_mm->srcu);
 }
 
 /*
@@ -87,14 +89,14 @@ int __mmu_notifier_clear_flush_young(str
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int young = 0;
+	int young = 0, srcu;
 
-	rcu_read_lock();
+	srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->clear_flush_young)
 			young |= mn->ops->clear_flush_young(mn, mm, address);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 
 	return young;
 }
@@ -124,13 +126,14 @@ void __mmu_notifier_invalidate_page(stru
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int srcu;
 
-	rcu_read_lock();
+	srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_page)
 			mn->ops->invalidate_page(mn, mm, address);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 }
 
 void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -138,13 +141,14 @@ void __mmu_notifier_invalidate_range_sta
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int srcu;
 
-	rcu_read_lock();
+	srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_start)
 			mn->ops->invalidate_range_start(mn, mm, start, end);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 }
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -152,13 +156,14 @@ void __mmu_notifier_invalidate_range_end
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int srcu;
 
-	rcu_read_lock();
+	srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_end)
 			mn->ops->invalidate_range_end(mn, mm, start, end);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 }
 
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -175,6 +180,10 @@ static int do_mmu_notifier_register(stru
 	if (unlikely(!mmu_notifier_mm))
 		goto out;
 
+	ret = init_srcu_struct(&mmu_notifier_mm->srcu);
+	if (unlikely(ret))
+		goto out_kfree;
+
 	if (take_mmap_sem)
 		down_write(&mm->mmap_sem);
 	ret = mm_take_all_locks(mm);
@@ -205,6 +214,9 @@ static int do_mmu_notifier_register(stru
 out_cleanup:
 	if (take_mmap_sem)
 		up_write(&mm->mmap_sem);
+	if (mmu_notifier_mm)
+		cleanup_srcu_struct(&mmu_notifier_mm->srcu);
+out_kfree:
 	/* kfree() does nothing if mmu_notifier_mm is NULL */
 	kfree(mmu_notifier_mm);
 out:
@@ -245,6 +257,7 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_registe
 void __mmu_notifier_mm_destroy(struct mm_struct *mm)
 {
 	BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
+	cleanup_srcu_struct(&mm->mmu_notifier_mm->srcu);
 	kfree(mm->mmu_notifier_mm);
 	mm->mmu_notifier_mm = LIST_POISON1; /* debug */
 }
@@ -252,8 +265,8 @@ void __mmu_notifier_mm_destroy(struct mm
 /*
  * This releases the mm_count pin automatically and frees the mm
  * structure if it was the last user of it. It serializes against
- * running mmu notifiers with RCU and against mmu_notifier_unregister
- * with the unregister lock + RCU. All sptes must be dropped before
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
  * calling mmu_notifier_unregister. ->release or any other notifier
  * method may be invoked concurrently with mmu_notifier_unregister,
  * and only after mmu_notifier_unregister returned we're guaranteed
@@ -265,13 +278,15 @@ void mmu_notifier_unregister(struct mmu_
 
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	if (!hlist_unhashed(&mn->hlist)) {
+		int srcu;
+
 		hlist_del_rcu(&mn->hlist);
 
 		/*
-		 * RCU here will force exit_mmap to wait ->release to finish
+		 * SRCU here will force exit_mmap to wait ->release to finish
 		 * before freeing the pages.
 		 */
-		rcu_read_lock();
+		srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
 		spin_unlock(&mm->mmu_notifier_mm->lock);
 		/*
 		 * exit_mmap will block in mmu_notifier_release to
@@ -280,7 +295,7 @@ void mmu_notifier_unregister(struct mmu_
 		 */
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
-		rcu_read_unlock();
+		srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 	} else
 		spin_unlock(&mm->mmu_notifier_mm->lock);
 
@@ -288,7 +303,7 @@ void mmu_notifier_unregister(struct mmu_
 	 * Wait any running method to finish, of course including
 	 * ->release if it was run by mmu_notifier_relase instead of us.
 	 */
-	synchronize_rcu();
+	synchronize_srcu(&mm->mmu_notifier_mm->srcu);
 
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);
 


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFP 2/3] Fix unmap_vma() bug related to mmu_notifiers
  2010-01-28 19:56 [RFP 0/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
  2010-01-28 19:56 ` [RFP 1/3] srcu Robin Holt
@ 2010-01-28 19:56 ` Robin Holt
  2010-01-29 20:54   ` Andrew Morton
  2010-01-28 19:56 ` [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
  2 siblings, 1 reply; 12+ messages in thread
From: Robin Holt @ 2010-01-28 19:56 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-mm, Jack Steiner

[-- Attachment #1: mmu_notifier_tlb_v1 --]
[-- Type: text/plain, Size: 4327 bytes --]


unmap_vmas() can fail to correctly flush the TLB if a
callout to mmu_notifier_invalidate_range_start() sleeps.
The mmu_gather list is initialized prior to the callout. If it is reused
while the thread is sleeping, the mm field may be invalid.

If the task migrates to a different cpu, the task may use the wrong
mmu_gather.

The patch changes unmap_vmas() to initialize the mmu_gather
AFTER the mmu_notifier completes.
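
In effect the ordering becomes (sketch; in the real diff the callers stop
calling tlb_gather_mmu() themselves and pass fullmm down as an argument):

	/* before: callers grab the per-cpu mmu_gather, then unmap_vmas()
	 * makes a notifier callout that may sleep on top of it */
	tlb = tlb_gather_mmu(mm, fullmm);
	...
	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);

	/* after: let the notifier sleep first, only then take the gather */
	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	*tlbp = tlb_gather_mmu(mm, fullmm);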

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: linux-mm@kvack.org
Cc: Andrea Arcangeli <aarcange@redhat.com>

---

 include/linux/mm.h |    2 +-
 mm/memory.c        |   11 +++++++----
 mm/mmap.c          |    6 ++----
 3 files changed, 10 insertions(+), 9 deletions(-)
Index: linux/include/linux/mm.h
===================================================================
--- linux.orig/include/linux/mm.h	2010-01-25 01:45:37.000000000 -0600
+++ linux/include/linux/mm.h	2010-01-25 11:32:21.000000000 -0600
@@ -761,7 +761,7 @@ unsigned long zap_page_range(struct vm_a
 unsigned long unmap_vmas(struct mmu_gather **tlb,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
-		struct zap_details *);
+		struct zap_details *, int fullmm);
 
 /**
  * mm_walk - callbacks for walk_page_range
Index: linux/mm/memory.c
===================================================================
--- linux.orig/mm/memory.c	2010-01-25 01:45:37.000000000 -0600
+++ linux/mm/memory.c	2010-01-25 11:32:21.000000000 -0600
@@ -1010,17 +1010,21 @@ static unsigned long unmap_page_range(st
 unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		struct vm_area_struct *vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
-		struct zap_details *details)
+		struct zap_details *details, int fullmm)
 {
 	long zap_work = ZAP_BLOCK_SIZE;
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-	int fullmm = (*tlbp)->fullmm;
 	struct mm_struct *mm = vma->vm_mm;
 
+	/*
+	 * mmu_notifier_invalidate_range_start can sleep. Don't initialize
+	 * mmu_gather until it completes
+	 */
 	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+	*tlbp = tlb_gather_mmu(mm, fullmm);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
 
@@ -1108,9 +1112,8 @@ unsigned long zap_page_range(struct vm_a
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
-	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details, 0);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
 	return end;
Index: linux/mm/mmap.c
===================================================================
--- linux.orig/mm/mmap.c	2010-01-25 01:45:37.000000000 -0600
+++ linux/mm/mmap.c	2010-01-25 11:35:55.000000000 -0600
@@ -1824,9 +1824,8 @@ static void unmap_region(struct mm_struc
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
-	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL, 0);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
@@ -2168,10 +2167,9 @@ void exit_mmap(struct mm_struct *mm)
 
 	lru_add_drain();
 	flush_cache_mm(mm);
-	tlb = tlb_gather_mmu(mm, 1);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
-	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL, 1);
 	vm_unacct_memory(nr_accounted);
 
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-01-28 19:56 [RFP 0/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
  2010-01-28 19:56 ` [RFP 1/3] srcu Robin Holt
  2010-01-28 19:56 ` [RFP 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt
@ 2010-01-28 19:56 ` Robin Holt
  2010-01-28 21:31   ` Andrea Arcangeli
  2010-01-29 21:08   ` Andrew Morton
  2 siblings, 2 replies; 12+ messages in thread
From: Robin Holt @ 2010-01-28 19:56 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-mm, Jack Steiner

[-- Attachment #1: mmu_notifier_truncate_sleepable_v1 --]
[-- Type: text/plain, Size: 10044 bytes --]


Make the truncate case handle the need to sleep.  We accomplish this
by failing the mmu_notifier_invalidate_range_start(... atomic==1)
case which in turn falls back to unmap_mapping_range_vma() with the
restart_address == start_address.  In that case, we make an additional
callout to mmu_notifier_invalidate_range_start(... atomic==0) after the
i_mmap_lock has been released.
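
Roughly, the resulting flow in unmap_mapping_range_vma() is (a pseudocode
sketch of the patch below; argument and field spelling simplified):

	restart_addr = zap_page_range(vma, start_addr,
					end_addr - start_addr, details);
	/* getting start_addr back means a notifier refused the atomic callout */
	need_unlocked_invalidate = (restart_addr == start_addr);
	...
	spin_unlock(details->i_mmap_lock);
	if (need_unlocked_invalidate) {
		/* sleepable now that i_mmap_lock has been dropped */
		mmu_notifier_invalidate_range_start(mm, start_addr, end_addr, 0);
		mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
	}
	cond_resched();
	spin_lock(details->i_mmap_lock);
	return -EINTR;	/* caller loops and retries the zap */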

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: Jack Steiner <steiner@sgi.com>
To: Andrea Arcangeli <aarcange@redhat.com>
Cc: linux-mm@kvack.org

---

 include/linux/mmu_notifier.h |   17 +++++++++--------
 mm/fremap.c                  |    2 +-
 mm/hugetlb.c                 |    2 +-
 mm/memory.c                  |   25 +++++++++++++++++++------
 mm/mmu_notifier.c            |   10 ++++++----
 mm/mprotect.c                |    2 +-
 mm/mremap.c                  |    2 +-
 7 files changed, 38 insertions(+), 22 deletions(-)
Index: mmu_notifiers_sleepable_v1/include/linux/mmu_notifier.h
===================================================================
--- mmu_notifiers_sleepable_v1.orig/include/linux/mmu_notifier.h	2010-01-28 13:43:26.000000000 -0600
+++ mmu_notifiers_sleepable_v1/include/linux/mmu_notifier.h	2010-01-28 13:43:26.000000000 -0600
@@ -170,8 +170,8 @@ extern void __mmu_notifier_change_pte(st
 				      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address);
-extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end);
+extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+			    unsigned long start, unsigned long end, int atomic);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
 
@@ -203,11 +203,12 @@ static inline void mmu_notifier_invalida
 		__mmu_notifier_invalidate_page(mm, address);
 }
 
-static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline int mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+			     unsigned long start, unsigned long end, int atomic)
 {
 	if (mm_has_notifiers(mm))
-		__mmu_notifier_invalidate_range_start(mm, start, end);
+		return __mmu_notifier_invalidate_range_start(mm, start, end, atomic);
+	return 0;
 }
 
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -288,10 +289,10 @@ static inline void mmu_notifier_invalida
 					  unsigned long address)
 {
 }
-
-static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline int mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+			     unsigned long start, unsigned long end, int atomic)
 {
+	return 0;
 }
 
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
Index: mmu_notifiers_sleepable_v1/mm/fremap.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/fremap.c	2010-01-28 13:42:15.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/fremap.c	2010-01-28 13:43:26.000000000 -0600
@@ -226,7 +226,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsign
 		vma->vm_flags = saved_flags;
 	}
 
-	mmu_notifier_invalidate_range_start(mm, start, start + size);
+	mmu_notifier_invalidate_range_start(mm, start, start + size, 0);
 	err = populate_range(mm, vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
Index: mmu_notifiers_sleepable_v1/mm/hugetlb.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/hugetlb.c	2010-01-28 13:42:15.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/hugetlb.c	2010-01-28 13:43:26.000000000 -0600
@@ -2159,7 +2159,7 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
-	mmu_notifier_invalidate_range_start(mm, start, end);
+	mmu_notifier_invalidate_range_start(mm, start, end, 0);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
Index: mmu_notifiers_sleepable_v1/mm/memory.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/memory.c	2010-01-28 13:43:26.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/memory.c	2010-01-28 13:43:26.000000000 -0600
@@ -786,7 +786,7 @@ int copy_page_range(struct mm_struct *ds
 	 * is_cow_mapping() returns true.
 	 */
 	if (is_cow_mapping(vma->vm_flags))
-		mmu_notifier_invalidate_range_start(src_mm, addr, end);
+		mmu_notifier_invalidate_range_start(src_mm, addr, end, 0);
 
 	ret = 0;
 	dst_pgd = pgd_offset(dst_mm, addr);
@@ -990,7 +990,8 @@ static unsigned long unmap_page_range(st
  * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
  * @details: details of nonlinear truncation or shared cache invalidation
  *
- * Returns the end address of the unmapping (restart addr if interrupted).
+ * Returns the end address of the unmapping (restart addr if interrupted, start
+ * if the i_mmap_lock is held and mmu_notifier_range_start() needs to sleep).
  *
  * Unmap all pages in the vma list.
  *
@@ -1018,12 +1019,17 @@ unsigned long unmap_vmas(struct mmu_gath
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
 	struct mm_struct *mm = vma->vm_mm;
+	int ret;
 
 	/*
 	 * mmu_notifier_invalidate_range_start can sleep. Don't initialize
 	 * mmu_gather until it completes
 	 */
-	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+	ret = mmu_notifier_invalidate_range_start(mm, start_addr,
+					end_addr, (i_mmap_lock == NULL));
+	if (ret)
+		goto out;
+
 	*tlbp = tlb_gather_mmu(mm, fullmm);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
@@ -1107,7 +1113,7 @@ unsigned long zap_page_range(struct vm_a
 		unsigned long size, struct zap_details *details)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct mmu_gather *tlb;
+	struct mmu_gather *tlb == NULL;
 	unsigned long end = address + size;
 	unsigned long nr_accounted = 0;
 
@@ -1908,7 +1914,7 @@ int apply_to_page_range(struct mm_struct
 	int err;
 
 	BUG_ON(addr >= end);
-	mmu_notifier_invalidate_range_start(mm, start, end);
+	mmu_notifier_invalidate_range_start(mm, start, end, 0);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -2329,6 +2335,7 @@ static int unmap_mapping_range_vma(struc
 {
 	unsigned long restart_addr;
 	int need_break;
+	int need_unlocked_invalidate;
 
 	/*
 	 * files that support invalidating or truncating portions of the
@@ -2350,7 +2357,9 @@ again:
 
 	restart_addr = zap_page_range(vma, start_addr,
 					end_addr - start_addr, details);
-	need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
+	need_unlocked_invalidate = (restart_addr == start_addr);
+	need_break = need_resched() || spin_needbreak(details->i_mmap_lock) ||
+					need_unlocked_invalidate;
 
 	if (restart_addr >= end_addr) {
 		/* We have now completed this vma: mark it so */
@@ -2365,6 +2374,10 @@ again:
 	}
 
 	spin_unlock(details->i_mmap_lock);
+	if (need_unlocked_invalidate) {
+		mmu_notifier_invalidate_range_start(vma->mm, start, end, 0);
+		mmu_notifier_invalidate_range_end(vma->mm, start, end);
+	}
 	cond_resched();
 	spin_lock(details->i_mmap_lock);
 	return -EINTR;
Index: mmu_notifiers_sleepable_v1/mm/mmu_notifier.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/mmu_notifier.c	2010-01-28 13:43:26.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/mmu_notifier.c	2010-01-28 13:43:26.000000000 -0600
@@ -135,20 +135,22 @@ void __mmu_notifier_invalidate_page(stru
 	}
 	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 }
-
-void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+			     unsigned long start, unsigned long end, int atomic)
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
 	int srcu;
+	int ret = 0;
 
 	srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_start)
-			mn->ops->invalidate_range_start(mn, mm, start, end);
+			ret |= mn->ops->invalidate_range_start(mn, mm, start,
+								end, atomic);
 	}
 	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
+	return ret;
 }
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
Index: mmu_notifiers_sleepable_v1/mm/mprotect.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/mprotect.c	2010-01-28 13:42:15.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/mprotect.c	2010-01-28 13:43:26.000000000 -0600
@@ -204,7 +204,7 @@ success:
 		dirty_accountable = 1;
 	}
 
-	mmu_notifier_invalidate_range_start(mm, start, end);
+	mmu_notifier_invalidate_range_start(mm, start, end, 0);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
Index: mmu_notifiers_sleepable_v1/mm/mremap.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/mremap.c	2010-01-28 13:42:15.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/mremap.c	2010-01-28 13:43:26.000000000 -0600
@@ -82,7 +82,7 @@ static void move_ptes(struct vm_area_str
 
 	old_start = old_addr;
 	mmu_notifier_invalidate_range_start(vma->vm_mm,
-					    old_start, old_end);
+					    old_start, old_end, 0);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-01-28 19:56 ` [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
@ 2010-01-28 21:31   ` Andrea Arcangeli
  2010-01-29 12:37     ` Robin Holt
  2010-01-29 21:08   ` Andrew Morton
  1 sibling, 1 reply; 12+ messages in thread
From: Andrea Arcangeli @ 2010-01-28 21:31 UTC (permalink / raw)
  To: Robin Holt; +Cc: linux-mm, Jack Steiner

On Thu, Jan 28, 2010 at 01:56:30PM -0600, Robin Holt wrote:
> 
> Make the truncate case handle the need to sleep.  We accomplish this
> by failing the mmu_notifier_invalidate_range_start(... atomic==1)
> case which inturn falls back to unmap_mapping_range_vma() with the
> restart_address == start_address.  In that case, we make an additional
> callout to mmu_notifier_invalidate_range_start(... atomic==0) after the
> i_mmap_lock has been released.

I think this is as dirty as it can be and why Christoph's first
patchset was turned down by Andrew (rightfully). What is wrong with
MMU_NOTIFIER_SLEEPABLE=y that is automatically enabled by XPMEM=y?


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-01-28 21:31   ` Andrea Arcangeli
@ 2010-01-29 12:37     ` Robin Holt
  0 siblings, 0 replies; 12+ messages in thread
From: Robin Holt @ 2010-01-29 12:37 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Robin Holt, linux-mm, Jack Steiner, Andrea Arcangeli

On Thu, Jan 28, 2010 at 10:31:00PM +0100, Andrea Arcangeli wrote:
> On Thu, Jan 28, 2010 at 01:56:30PM -0600, Robin Holt wrote:
> > 
> > Make the truncate case handle the need to sleep.  We accomplish this
> > by failing the mmu_notifier_invalidate_range_start(... atomic==1)
> > case which inturn falls back to unmap_mapping_range_vma() with the
> > restart_address == start_address.  In that case, we make an additional
> > callout to mmu_notifier_invalidate_range_start(... atomic==0) after the
> > i_mmap_lock has been released.
> 
> I think this is as dirty as it can be and why Christoph's first
> patchset was turned down by Andrew (rightfully). What is wrong with
> MMU_NOTIFIER_SLEEPABLE=y that is automatically enabled by XPMEM=y?

Andrew, do you have objections to this method?

Thanks,
Robin

Patch 3 repeated below:
------------------------------------------------------------------------

Make the truncate case handle the need to sleep.  We accomplish this
by failing the mmu_notifier_invalidate_range_start(... atomic==1)
case which in turn falls back to unmap_mapping_range_vma() with the
restart_address == start_address.  In that case, we make an additional
callout to mmu_notifier_invalidate_range_start(... atomic==0) after the
i_mmap_lock has been released.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: Jack Steiner <steiner@sgi.com>
To: Andrea Arcangeli <aarcange@redhat.com>
Cc: linux-mm@kvack.org

---

 include/linux/mmu_notifier.h |   17 +++++++++--------
 mm/fremap.c                  |    2 +-
 mm/hugetlb.c                 |    2 +-
 mm/memory.c                  |   25 +++++++++++++++++++------
 mm/mmu_notifier.c            |   10 ++++++----
 mm/mprotect.c                |    2 +-
 mm/mremap.c                  |    2 +-
 7 files changed, 38 insertions(+), 22 deletions(-)


Index: mmu_notifiers_sleepable_v1/include/linux/mmu_notifier.h
===================================================================
--- mmu_notifiers_sleepable_v1.orig/include/linux/mmu_notifier.h	2010-01-28 13:43:26.000000000 -0600
+++ mmu_notifiers_sleepable_v1/include/linux/mmu_notifier.h	2010-01-28 13:43:26.000000000 -0600
@@ -170,8 +170,8 @@ extern void __mmu_notifier_change_pte(st
 				      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 					  unsigned long address);
-extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end);
+extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+			    unsigned long start, unsigned long end, int atomic);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
 
@@ -203,11 +203,12 @@ static inline void mmu_notifier_invalida
 		__mmu_notifier_invalidate_page(mm, address);
 }
 
-static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline int mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+			     unsigned long start, unsigned long end, int atomic)
 {
 	if (mm_has_notifiers(mm))
-		__mmu_notifier_invalidate_range_start(mm, start, end);
+		return __mmu_notifier_invalidate_range_start(mm, start, end, atomic);
+	return 0;
 }
 
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -288,10 +289,10 @@ static inline void mmu_notifier_invalida
 					  unsigned long address)
 {
 }
-
-static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+static inline int mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+			     unsigned long start, unsigned long end, int atomic)
 {
+	return 0;
 }
 
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
Index: mmu_notifiers_sleepable_v1/mm/fremap.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/fremap.c	2010-01-28 13:42:15.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/fremap.c	2010-01-28 13:43:26.000000000 -0600
@@ -226,7 +226,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsign
 		vma->vm_flags = saved_flags;
 	}
 
-	mmu_notifier_invalidate_range_start(mm, start, start + size);
+	mmu_notifier_invalidate_range_start(mm, start, start + size, 0);
 	err = populate_range(mm, vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
Index: mmu_notifiers_sleepable_v1/mm/hugetlb.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/hugetlb.c	2010-01-28 13:42:15.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/hugetlb.c	2010-01-28 13:43:26.000000000 -0600
@@ -2159,7 +2159,7 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
-	mmu_notifier_invalidate_range_start(mm, start, end);
+	mmu_notifier_invalidate_range_start(mm, start, end, 0);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
Index: mmu_notifiers_sleepable_v1/mm/memory.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/memory.c	2010-01-28 13:43:26.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/memory.c	2010-01-28 13:43:26.000000000 -0600
@@ -786,7 +786,7 @@ int copy_page_range(struct mm_struct *ds
 	 * is_cow_mapping() returns true.
 	 */
 	if (is_cow_mapping(vma->vm_flags))
-		mmu_notifier_invalidate_range_start(src_mm, addr, end);
+		mmu_notifier_invalidate_range_start(src_mm, addr, end, 0);
 
 	ret = 0;
 	dst_pgd = pgd_offset(dst_mm, addr);
@@ -990,7 +990,8 @@ static unsigned long unmap_page_range(st
  * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
  * @details: details of nonlinear truncation or shared cache invalidation
  *
- * Returns the end address of the unmapping (restart addr if interrupted).
+ * Returns the end address of the unmapping (restart addr if interrupted, start
+ * if the i_mmap_lock is held and mmu_notifier_range_start() needs to sleep).
  *
  * Unmap all pages in the vma list.
  *
@@ -1018,12 +1019,17 @@ unsigned long unmap_vmas(struct mmu_gath
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
 	struct mm_struct *mm = vma->vm_mm;
+	int ret;
 
 	/*
 	 * mmu_notifier_invalidate_range_start can sleep. Don't initialize
 	 * mmu_gather until it completes
 	 */
-	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+	ret = mmu_notifier_invalidate_range_start(mm, start_addr,
+					end_addr, (i_mmap_lock == NULL));
+	if (ret)
+		goto out;
+
 	*tlbp = tlb_gather_mmu(mm, fullmm);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
@@ -1107,7 +1113,7 @@ unsigned long zap_page_range(struct vm_a
 		unsigned long size, struct zap_details *details)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct mmu_gather *tlb;
+	struct mmu_gather *tlb == NULL;
 	unsigned long end = address + size;
 	unsigned long nr_accounted = 0;
 
@@ -1908,7 +1914,7 @@ int apply_to_page_range(struct mm_struct
 	int err;
 
 	BUG_ON(addr >= end);
-	mmu_notifier_invalidate_range_start(mm, start, end);
+	mmu_notifier_invalidate_range_start(mm, start, end, 0);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -2329,6 +2335,7 @@ static int unmap_mapping_range_vma(struc
 {
 	unsigned long restart_addr;
 	int need_break;
+	int need_unlocked_invalidate;
 
 	/*
 	 * files that support invalidating or truncating portions of the
@@ -2350,7 +2357,9 @@ again:
 
 	restart_addr = zap_page_range(vma, start_addr,
 					end_addr - start_addr, details);
-	need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
+	need_unlocked_invalidate = (restart_addr == start_addr);
+	need_break = need_resched() || spin_needbreak(details->i_mmap_lock) ||
+					need_unlocked_invalidate;
 
 	if (restart_addr >= end_addr) {
 		/* We have now completed this vma: mark it so */
@@ -2365,6 +2374,10 @@ again:
 	}
 
 	spin_unlock(details->i_mmap_lock);
+	if (need_unlocked_invalidate) {
+		mmu_notifier_invalidate_range_start(vma->mm, start, end, 0);
+		mmu_notifier_invalidate_range_end(vma->mm, start, end);
+	}
 	cond_resched();
 	spin_lock(details->i_mmap_lock);
 	return -EINTR;
Index: mmu_notifiers_sleepable_v1/mm/mmu_notifier.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/mmu_notifier.c	2010-01-28 13:43:26.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/mmu_notifier.c	2010-01-28 13:43:26.000000000 -0600
@@ -135,20 +135,22 @@ void __mmu_notifier_invalidate_page(stru
 	}
 	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 }
-
-void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+			     unsigned long start, unsigned long end, int atomic)
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
 	int srcu;
+	int ret = 0;
 
 	srcu = srcu_read_lock(&mm->mmu_notifier_mm->srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_start)
-			mn->ops->invalidate_range_start(mn, mm, start, end);
+			ret |= mn->ops->invalidate_range_start(mn, mm, start,
+								end, atomic);
 	}
 	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
+	return ret;
 }
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
Index: mmu_notifiers_sleepable_v1/mm/mprotect.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/mprotect.c	2010-01-28 13:42:15.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/mprotect.c	2010-01-28 13:43:26.000000000 -0600
@@ -204,7 +204,7 @@ success:
 		dirty_accountable = 1;
 	}
 
-	mmu_notifier_invalidate_range_start(mm, start, end);
+	mmu_notifier_invalidate_range_start(mm, start, end, 0);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
Index: mmu_notifiers_sleepable_v1/mm/mremap.c
===================================================================
--- mmu_notifiers_sleepable_v1.orig/mm/mremap.c	2010-01-28 13:42:15.000000000 -0600
+++ mmu_notifiers_sleepable_v1/mm/mremap.c	2010-01-28 13:43:26.000000000 -0600
@@ -82,7 +82,7 @@ static void move_ptes(struct vm_area_str
 
 	old_start = old_addr;
 	mmu_notifier_invalidate_range_start(vma->vm_mm,
-					    old_start, old_end);
+					    old_start, old_end, 0);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFP 2/3] Fix unmap_vma() bug related to mmu_notifiers
  2010-01-28 19:56 ` [RFP 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt
@ 2010-01-29 20:54   ` Andrew Morton
  2010-02-01 19:07     ` Robin Holt
  0 siblings, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2010-01-29 20:54 UTC (permalink / raw)
  To: Robin Holt; +Cc: Andrea Arcangeli, linux-mm, Jack Steiner

On Thu, 28 Jan 2010 13:56:29 -0600
Robin Holt <holt@sgi.com> wrote:

> 
> unmap_vmas() can fail to correctly flush the TLB if a
> callout to mmu_notifier_invalidate_range_start() sleeps.
> The mmu_gather list is initialized prior to the callout. If it is reused
> while the thread is sleeping, the mm field may be invalid.
> 
> If the task migrates to a different cpu, the task may use the wrong
> mmu_gather.

I don't think that description is complete.

There might be ways in which we can prevent this task from being
migrated to another CPU, but that doesn't fix the problem because the
mmu_gather is a per-CPU resource and might get trashed if another task
is scheduled on THIS cpu, and uses its mmu_gather.
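
For context, in the code of this era tlb_gather_mmu() hands back a per-cpu
object, roughly as follows (simplified from the generic asm-generic/tlb.h
version; exact details vary by architecture):

	static inline struct mmu_gather *
	tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
	{
		/* one mmu_gather per cpu -- not per task */
		struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);

		tlb->mm = mm;
		tlb->fullmm = full_mm_flush;
		...
		return tlb;
	}

	/* Sleeping between tlb_gather_mmu() and tlb_finish_mmu() lets another
	 * task run and reinitialize the same per-cpu mmu_gather, so both the
	 * migration case and the same-cpu case can leave tlb->mm stale. */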


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFP 1/3] srcu
  2010-01-28 19:56 ` [RFP 1/3] srcu Robin Holt
@ 2010-01-29 20:56   ` Andrew Morton
  2010-02-01 19:06     ` Robin Holt
  0 siblings, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2010-01-29 20:56 UTC (permalink / raw)
  To: Robin Holt; +Cc: Andrea Arcangeli, linux-mm, Jack Steiner


> Subject: [RFP 1/3] srcu

Well that was terse.

On Thu, 28 Jan 2010 13:56:28 -0600
Robin Holt <holt@sgi.com> wrote:

> From: Andrea Arcangeli <andrea@qumranet.com>
> 
> This converts rcu into a per-mm srcu to allow all mmu notifier methods to
> schedule.

Changelog doesn't make much sense.

> --- mmu_notifiers_sleepable_v1.orig/include/linux/srcu.h	2010-01-28 10:36:39.000000000 -0600
> +++ mmu_notifiers_sleepable_v1/include/linux/srcu.h	2010-01-28 10:39:10.000000000 -0600
> @@ -27,6 +27,8 @@
>  #ifndef _LINUX_SRCU_H
>  #define _LINUX_SRCU_H
>  
> +#include <linux/mutex.h>
> +
>  struct srcu_struct_array {
>  	int c[2];
>  };

An unchangelogged, unrelated bugfix.  I guess it's OK slipping this
into this patch.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-01-28 19:56 ` [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
  2010-01-28 21:31   ` Andrea Arcangeli
@ 2010-01-29 21:08   ` Andrew Morton
  2010-02-01 19:22     ` Robin Holt
  1 sibling, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2010-01-29 21:08 UTC (permalink / raw)
  To: Robin Holt; +Cc: Andrea Arcangeli, linux-mm, Jack Steiner

On Thu, 28 Jan 2010 13:56:30 -0600
Robin Holt <holt@sgi.com> wrote:

> 
> Make the truncate case handle the need to sleep.  We accomplish this
> by failing the mmu_notifier_invalidate_range_start(... atomic==1)
> case which inturn falls back to unmap_mapping_range_vma() with the
> restart_address == start_address.  In that case, we make an additional
> callout to mmu_notifier_invalidate_range_start(... atomic==0) after the
> i_mmap_lock has been released.

This is a mushroom patch.  This patch (and the rest of the patchset)
fails to provide any reason for making any change to anything.

I understand that it has something to do with xpmem?  That needs to be
spelled out in some detail please, so we understand the requirements
and perhaps can suggest alternatives.  If we have enough information we
can perhaps even suggest alternatives _within xpmem_.  But right now, we
have nothing.

> ===================================================================
> --- mmu_notifiers_sleepable_v1.orig/include/linux/mmu_notifier.h	2010-01-28 13:43:26.000000000 -0600
> +++ mmu_notifiers_sleepable_v1/include/linux/mmu_notifier.h	2010-01-28 13:43:26.000000000 -0600
> @@ -170,8 +170,8 @@ extern void __mmu_notifier_change_pte(st
>  				      unsigned long address, pte_t pte);
>  extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
>  					  unsigned long address);
> -extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
> -				  unsigned long start, unsigned long end);
> +extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
> +			    unsigned long start, unsigned long end, int atomic);

Perhaps `atomic' could be made bool.

> ...
>
> @@ -1018,12 +1019,17 @@ unsigned long unmap_vmas(struct mmu_gath
>  	unsigned long start = start_addr;
>  	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
>  	struct mm_struct *mm = vma->vm_mm;
> +	int ret;
>  
>  	/*
>  	 * mmu_notifier_invalidate_range_start can sleep. Don't initialize
>  	 * mmu_gather until it completes
>  	 */
> -	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
> +	ret = mmu_notifier_invalidate_range_start(mm, start_addr,
> +					end_addr, (i_mmap_lock == NULL));
> +	if (ret)
> +		goto out;
> +

afaict, `ret' doesn't get used for anything.

>  	*tlbp = tlb_gather_mmu(mm, fullmm);
>  	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
>  		unsigned long end;
> @@ -1107,7 +1113,7 @@ unsigned long zap_page_range(struct vm_a
>  		unsigned long size, struct zap_details *details)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
> -	struct mmu_gather *tlb;
> +	struct mmu_gather *tlb == NULL;

This statement doesn't do what you thought it did.  Didn't the compiler warn?

>  	unsigned long end = address + size;
>  	unsigned long nr_accounted = 0;
>  
> @@ -1908,7 +1914,7 @@ int apply_to_page_range(struct mm_struct
>  	int err;
>  
>  	BUG_ON(addr >= end);
> -	mmu_notifier_invalidate_range_start(mm, start, end);
> +	mmu_notifier_invalidate_range_start(mm, start, end, 0);
>  	pgd = pgd_offset(mm, addr);
>  	do {
>  		next = pgd_addr_end(addr, end);
> @@ -2329,6 +2335,7 @@ static int unmap_mapping_range_vma(struc
>  {
>  	unsigned long restart_addr;
>  	int need_break;
> +	int need_unlocked_invalidate;
>  
>  	/*
>  	 * files that support invalidating or truncating portions of the
> @@ -2350,7 +2357,9 @@ again:
>  
>  	restart_addr = zap_page_range(vma, start_addr,
>  					end_addr - start_addr, details);
> -	need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
> +	need_unlocked_invalidate = (restart_addr == start_addr);
> +	need_break = need_resched() || spin_needbreak(details->i_mmap_lock) ||
> +					need_unlocked_invalidate;
>  
>  	if (restart_addr >= end_addr) {
>  		/* We have now completed this vma: mark it so */
> @@ -2365,6 +2374,10 @@ again:
>  	}
>  
>  	spin_unlock(details->i_mmap_lock);
> +	if (need_unlocked_invalidate) {
> +		mmu_notifier_invalidate_range_start(vma->mm, start, end, 0);
> +		mmu_notifier_invalidate_range_end(vma->mm, start, end);
> +	}

This is the appropriate place at which to add a comment explaining to
the reader what the code is doing.
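
Something along these lines, perhaps (going by the changelog; wording is
only a suggestion):

	/*
	 * zap_page_range() returned start_addr, meaning a notifier refused
	 * to run atomically while i_mmap_lock was held.  The lock has just
	 * been dropped, so redo the range_start/range_end callout in
	 * sleepable context before returning -EINTR and retrying the zap.
	 */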

> -void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
> -				  unsigned long start, unsigned long end)
> +int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
> +			     unsigned long start, unsigned long end, int atomic)

The implementation would be considerably less ugly if we could do away
with the `atomic' thing altogether and just assume `atomic=false'
throughout?


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFP 1/3] srcu
  2010-01-29 20:56   ` Andrew Morton
@ 2010-02-01 19:06     ` Robin Holt
  0 siblings, 0 replies; 12+ messages in thread
From: Robin Holt @ 2010-02-01 19:06 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Robin Holt, Andrea Arcangeli, linux-mm, Jack Steiner

On Fri, Jan 29, 2010 at 12:56:50PM -0800, Andrew Morton wrote:
> 
> > Subject: [RFP 1/3] srcu
> 
> Well that was terse.
> 
> On Thu, 28 Jan 2010 13:56:28 -0600
> Robin Holt <holt@sgi.com> wrote:
> 
> > From: Andrea Arcangeli <andrea@qumranet.com>
> > 
> > This converts rcu into a per-mm srcu to allow all mmu notifier methods to
> > schedule.
> 
> Changelog doesn't make much sense.

I made the changelog a little more verbose and hopefully a little
more clear.

> 
> > --- mmu_notifiers_sleepable_v1.orig/include/linux/srcu.h	2010-01-28 10:36:39.000000000 -0600
> > +++ mmu_notifiers_sleepable_v1/include/linux/srcu.h	2010-01-28 10:39:10.000000000 -0600
> > @@ -27,6 +27,8 @@
> >  #ifndef _LINUX_SRCU_H
> >  #define _LINUX_SRCU_H
> >  
> > +#include <linux/mutex.h>
> > +
> >  struct srcu_struct_array {
> >  	int c[2];
> >  };
> 
> An unchangelogged, unrelated bugfix.  I guess it's OK slipping this
> into this patch.

Removed.  This does not appear to be needed, as mmu_notifier.c compiles
without warning.

Thanks,
Robin


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFP 2/3] Fix unmap_vma() bug related to mmu_notifiers
  2010-01-29 20:54   ` Andrew Morton
@ 2010-02-01 19:07     ` Robin Holt
  0 siblings, 0 replies; 12+ messages in thread
From: Robin Holt @ 2010-02-01 19:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Robin Holt, Andrea Arcangeli, linux-mm, Jack Steiner

On Fri, Jan 29, 2010 at 12:54:26PM -0800, Andrew Morton wrote:
> On Thu, 28 Jan 2010 13:56:29 -0600
> Robin Holt <holt@sgi.com> wrote:
> 
> > 
> > unmap_vmas() can fail to correctly flush the TLB if a
> > callout to mmu_notifier_invalidate_range_start() sleeps.
> > The mmu_gather list is initialized prior to the callout. If it is reused
> > while the thread is sleeping, the mm field may be invalid.
> > 
> > If the task migrates to a different cpu, the task may use the wrong
> > mmu_gather.
> 
> I don't think that description is complete.
> 
> There might be ways in which we can prevent this task from being
> migrated to another CPU, but that doesn't fix the problem because the
> mmu_gather is a per-CPU resource and might get trashed if another task
> is scheduled on THIS cpu, and uses its mmu_gather.

I couldn't reword it to make it any more clear.  The third paragraph of
Jack's original changelog says basically what you said.

Robin


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep.
  2010-01-29 21:08   ` Andrew Morton
@ 2010-02-01 19:22     ` Robin Holt
  0 siblings, 0 replies; 12+ messages in thread
From: Robin Holt @ 2010-02-01 19:22 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Robin Holt, Andrea Arcangeli, linux-mm, Jack Steiner

On Fri, Jan 29, 2010 at 01:08:20PM -0800, Andrew Morton wrote:
> On Thu, 28 Jan 2010 13:56:30 -0600
> Robin Holt <holt@sgi.com> wrote:
...
> This is a mushroom patch.  This patch (and the rest of the patchset)
> fails to provide any reason for making any change to anything.
> 
> I understand that it has something to do with xpmem?  That needs to be
> spelled out in some detail please, so we understand the requirements
> and perhaps can suggest alternatives.  If we have enough information we
> can perhaps even suggest alternatives _within xpmem_.  But right now, we
> have nothing.

I have a much better description of what XPMEM needs in the next version.

> > +extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
> > +			    unsigned long start, unsigned long end, int atomic);
> 
> Perhaps `atomic' could be made bool.

Done.

> > @@ -1018,12 +1019,17 @@ unsigned long unmap_vmas(struct mmu_gath
...
> > -	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
> > +	ret = mmu_notifier_invalidate_range_start(mm, start_addr,
> > +					end_addr, (i_mmap_lock == NULL));
> > +	if (ret)
> > +		goto out;
> > +
> 
> afaict, `ret' doesn't get used for anything.

Removed 'ret'

> > +	struct mmu_gather *tlb == NULL;
> 
> This statement doesn't do what you thought it did.  Didn't the compiler warn?

That was wrong.  I had not compiled the patchset.

> >  	spin_unlock(details->i_mmap_lock);
> > +	if (need_unlocked_invalidate) {
> > +		mmu_notifier_invalidate_range_start(vma->mm, start, end, 0);
> > +		mmu_notifier_invalidate_range_end(vma->mm, start, end);
> > +	}
> 
> This is the appropriate place at which to add a comment explaining to
> the reader what the code is doing.

Added a comment.

> > -void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
> > -				  unsigned long start, unsigned long end)
> > +int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
> > +			     unsigned long start, unsigned long end, int atomic)
> 
> The implementation would be considerably less ugly if we could do away
> with the `atomic' thing altogether and just assume `atomic=false'
> throughout?

In order to do the atomic=false, we would either need to convert the
_range_start/_range_end to multiple _page callouts, or take Andrea's
series of patches which converts the i_mmap_lock to an i_mmap_sem and
then implement a method to ensure the vma remains consistent while
sleeping in the invalidate_range_start callout.

Over the weekend, I think I got fairly close to the point in XPMEM where
I can comfortably say the callouts with the i_mmap_lock unlocked will be
unneeded.  I will still need to have the flag (currently called atomic but
should be renamed in that case) to indicate that asynchronous clearing
of the page tables is acceptable.  In that circumstance, XPMEM would
queue up the clearing much as we currently do for the _invalidate_page()
callouts and process them later with a separate xpmem kernel thread.
I need to do some more thinking and checking with the MPI library folks
to see if we can caveat that behavior.
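
For illustration only (this is not XPMEM code, and names such as
xpmem_clear_thread() are made up), the "queue it and let a kernel thread
do the clearing later" idea looks roughly like:

	struct deferred_clear {
		struct list_head list;
		struct mm_struct *mm;
		unsigned long start, end;
	};

	static LIST_HEAD(clear_list);
	static DEFINE_SPINLOCK(clear_lock);
	static DECLARE_WAIT_QUEUE_HEAD(clear_wq);

	/* called from invalidate_range_start() when atomic != 0 */
	static int queue_deferred_clear(struct mm_struct *mm,
					unsigned long start, unsigned long end)
	{
		struct deferred_clear *dc = kmalloc(sizeof(*dc), GFP_ATOMIC);

		if (!dc)
			return -EAGAIN;	/* caller falls back to the unlocked path */
		dc->mm = mm;
		dc->start = start;
		dc->end = end;
		spin_lock(&clear_lock);
		list_add_tail(&dc->list, &clear_list);
		spin_unlock(&clear_lock);
		wake_up(&clear_wq);	/* xpmem_clear_thread() drains the list */
		return 0;
	}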

Another version will be posted shortly.  I will test the synchronous
clearing and follow up with those results.

Thanks,
Robin


^ permalink raw reply	[flat|nested] 12+ messages in thread

Thread overview: 12+ messages
2010-01-28 19:56 [RFP 0/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
2010-01-28 19:56 ` [RFP 1/3] srcu Robin Holt
2010-01-29 20:56   ` Andrew Morton
2010-02-01 19:06     ` Robin Holt
2010-01-28 19:56 ` [RFP 2/3] Fix unmap_vma() bug related to mmu_notifiers Robin Holt
2010-01-29 20:54   ` Andrew Morton
2010-02-01 19:07     ` Robin Holt
2010-01-28 19:56 ` [RFP 3/3] Make mmu_notifier_invalidate_range_start able to sleep Robin Holt
2010-01-28 21:31   ` Andrea Arcangeli
2010-01-29 12:37     ` Robin Holt
2010-01-29 21:08   ` Andrew Morton
2010-02-01 19:22     ` Robin Holt
