All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-25 17:45 ` Jack Steiner
  0 siblings, 0 replies; 22+ messages in thread
From: Jack Steiner @ 2010-01-25 17:45 UTC (permalink / raw)
  To: mingo, tglx; +Cc: linux-mm, linux-kernel

unmap_vmas() can fail to correctly flush the TLB if a
callout to mmu_notifier_invalidate_range_start() sleeps.
The mmu_gather list is initialized prior to the callout. If it is reused
while the thread is sleeping, the mm field may be invalid.

If the task migrates to a different cpu, the task may use the wrong
mmu_gather.

The patch changes unmap_vmas() to initialize the mmu_gather
AFTER the mmu_notifier completes.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: <stable@kernel.org>

---
 include/linux/mm.h |    2 +-
 mm/memory.c        |   11 +++++++----
 mm/mmap.c          |    6 ++----
 3 files changed, 10 insertions(+), 9 deletions(-)

Index: linux/include/linux/mm.h
===================================================================
--- linux.orig/include/linux/mm.h	2010-01-25 01:45:37.000000000 -0600
+++ linux/include/linux/mm.h	2010-01-25 11:32:21.000000000 -0600
@@ -761,7 +761,7 @@ unsigned long zap_page_range(struct vm_a
 unsigned long unmap_vmas(struct mmu_gather **tlb,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
-		struct zap_details *);
+		struct zap_details *, int fullmm);
 
 /**
  * mm_walk - callbacks for walk_page_range
Index: linux/mm/memory.c
===================================================================
--- linux.orig/mm/memory.c	2010-01-25 01:45:37.000000000 -0600
+++ linux/mm/memory.c	2010-01-25 11:32:21.000000000 -0600
@@ -1010,17 +1010,21 @@ static unsigned long unmap_page_range(st
 unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		struct vm_area_struct *vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
-		struct zap_details *details)
+		struct zap_details *details, int fullmm)
 {
 	long zap_work = ZAP_BLOCK_SIZE;
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-	int fullmm = (*tlbp)->fullmm;
 	struct mm_struct *mm = vma->vm_mm;
 
+	/*
+	 * mmu_notifier_invalidate_range_start can sleep. Don't initialize
+	 * mmu_gather until it completes
+	 */
 	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+	*tlbp = tlb_gather_mmu(mm, fullmm);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
 
@@ -1108,9 +1112,8 @@ unsigned long zap_page_range(struct vm_a
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
-	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details, 0);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
 	return end;
Index: linux/mm/mmap.c
===================================================================
--- linux.orig/mm/mmap.c	2010-01-25 01:45:37.000000000 -0600
+++ linux/mm/mmap.c	2010-01-25 11:35:55.000000000 -0600
@@ -1824,9 +1824,8 @@ static void unmap_region(struct mm_struc
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
-	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL, 0);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
@@ -2168,10 +2167,9 @@ void exit_mmap(struct mm_struct *mm)
 
 	lru_add_drain();
 	flush_cache_mm(mm);
-	tlb = tlb_gather_mmu(mm, 1);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
-	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL, 1);
 	vm_unacct_memory(nr_accounted);
 
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-25 17:45 ` Jack Steiner
  0 siblings, 0 replies; 22+ messages in thread
From: Jack Steiner @ 2010-01-25 17:45 UTC (permalink / raw)
  To: mingo, tglx; +Cc: linux-mm, linux-kernel

unmap_vmas() can fail to correctly flush the TLB if a
callout to mmu_notifier_invalidate_range_start() sleeps.
The mmu_gather list is initialized prior to the callout. If it is reused
while the thread is sleeping, the mm field may be invalid.

If the task migrates to a different cpu, the task may use the wrong
mmu_gather.

The patch changes unmap_vmas() to initialize the mmu_gather
AFTER the mmu_notifier completes.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: <stable@kernel.org>

---
 include/linux/mm.h |    2 +-
 mm/memory.c        |   11 +++++++----
 mm/mmap.c          |    6 ++----
 3 files changed, 10 insertions(+), 9 deletions(-)

Index: linux/include/linux/mm.h
===================================================================
--- linux.orig/include/linux/mm.h	2010-01-25 01:45:37.000000000 -0600
+++ linux/include/linux/mm.h	2010-01-25 11:32:21.000000000 -0600
@@ -761,7 +761,7 @@ unsigned long zap_page_range(struct vm_a
 unsigned long unmap_vmas(struct mmu_gather **tlb,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
-		struct zap_details *);
+		struct zap_details *, int fullmm);
 
 /**
  * mm_walk - callbacks for walk_page_range
Index: linux/mm/memory.c
===================================================================
--- linux.orig/mm/memory.c	2010-01-25 01:45:37.000000000 -0600
+++ linux/mm/memory.c	2010-01-25 11:32:21.000000000 -0600
@@ -1010,17 +1010,21 @@ static unsigned long unmap_page_range(st
 unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		struct vm_area_struct *vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
-		struct zap_details *details)
+		struct zap_details *details, int fullmm)
 {
 	long zap_work = ZAP_BLOCK_SIZE;
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-	int fullmm = (*tlbp)->fullmm;
 	struct mm_struct *mm = vma->vm_mm;
 
+	/*
+	 * mmu_notifier_invalidate_range_start can sleep. Don't initialize
+	 * mmu_gather until it completes
+	 */
 	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+	*tlbp = tlb_gather_mmu(mm, fullmm);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
 
@@ -1108,9 +1112,8 @@ unsigned long zap_page_range(struct vm_a
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
-	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
+	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details, 0);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
 	return end;
Index: linux/mm/mmap.c
===================================================================
--- linux.orig/mm/mmap.c	2010-01-25 01:45:37.000000000 -0600
+++ linux/mm/mmap.c	2010-01-25 11:35:55.000000000 -0600
@@ -1824,9 +1824,8 @@ static void unmap_region(struct mm_struc
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
-	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
+	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL, 0);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
@@ -2168,10 +2167,9 @@ void exit_mmap(struct mm_struct *mm)
 
 	lru_add_drain();
 	flush_cache_mm(mm);
-	tlb = tlb_gather_mmu(mm, 1);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
-	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL, 1);
 	vm_unacct_memory(nr_accounted);
 
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-25 17:45 ` Jack Steiner
@ 2010-01-25 19:00   ` Andrea Arcangeli
  -1 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-25 19:00 UTC (permalink / raw)
  To: Jack Steiner; +Cc: mingo, tglx, linux-mm, linux-kernel

On Mon, Jan 25, 2010 at 11:45:56AM -0600, Jack Steiner wrote:
> unmap_vmas() can fail to correctly flush the TLB if a
> callout to mmu_notifier_invalidate_range_start() sleeps.

Not sure I understand: the callbacks invoked by
mmu_notifier_invalidate_range_start can't sleep, or rcu locking inside
mmu notifier will break too (first thing that should be replaced with
srcu if they were allowed to sleep).

In short there's no schedule that could be added because of those
callbacks so if this code isn't ok and schedules and screw on the
mmu_gather tlb it's probably not mmu notifier related.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-25 19:00   ` Andrea Arcangeli
  0 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-25 19:00 UTC (permalink / raw)
  To: Jack Steiner; +Cc: mingo, tglx, linux-mm, linux-kernel

On Mon, Jan 25, 2010 at 11:45:56AM -0600, Jack Steiner wrote:
> unmap_vmas() can fail to correctly flush the TLB if a
> callout to mmu_notifier_invalidate_range_start() sleeps.

Not sure I understand: the callbacks invoked by
mmu_notifier_invalidate_range_start can't sleep, or rcu locking inside
mmu notifier will break too (first thing that should be replaced with
srcu if they were allowed to sleep).

In short there's no schedule that could be added because of those
callbacks so if this code isn't ok and schedules and screw on the
mmu_gather tlb it's probably not mmu notifier related.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-25 19:00   ` Andrea Arcangeli
@ 2010-01-25 21:10     ` Jack Steiner
  -1 siblings, 0 replies; 22+ messages in thread
From: Jack Steiner @ 2010-01-25 21:10 UTC (permalink / raw)
  To: Andrea Arcangeli, Robin Holt, cl; +Cc: mingo, tglx, linux-mm, linux-kernel

On Mon, Jan 25, 2010 at 08:00:52PM +0100, Andrea Arcangeli wrote:
> On Mon, Jan 25, 2010 at 11:45:56AM -0600, Jack Steiner wrote:
> > unmap_vmas() can fail to correctly flush the TLB if a
> > callout to mmu_notifier_invalidate_range_start() sleeps.
> 
> Not sure I understand: the callbacks invoked by
> mmu_notifier_invalidate_range_start can't sleep, or rcu locking inside
> mmu notifier will break too (first thing that should be replaced with
> srcu if they were allowed to sleep).
> 
> In short there's no schedule that could be added because of those
> callbacks so if this code isn't ok and schedules and screw on the
> mmu_gather tlb it's probably not mmu notifier related.

Hmmmm. I was under the impression that the range callbacks _were_ allowed to
sleep. That was certainly in the original patches posted by christoph. I was
not aware that it had changed.

I'll let Robin provide details but the way mmu_notifiers are used by
xpmem, I believe sleeping is essential. The 'start" callout sends messages to
another OS instances to teardown cross-SSI mappings. That is difficult
to do w/o a sleep.

We'll dig deeper tomorrow & see what we can do....

--- jack

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-25 21:10     ` Jack Steiner
  0 siblings, 0 replies; 22+ messages in thread
From: Jack Steiner @ 2010-01-25 21:10 UTC (permalink / raw)
  To: Andrea Arcangeli, Robin Holt, cl; +Cc: mingo, tglx, linux-mm, linux-kernel

On Mon, Jan 25, 2010 at 08:00:52PM +0100, Andrea Arcangeli wrote:
> On Mon, Jan 25, 2010 at 11:45:56AM -0600, Jack Steiner wrote:
> > unmap_vmas() can fail to correctly flush the TLB if a
> > callout to mmu_notifier_invalidate_range_start() sleeps.
> 
> Not sure I understand: the callbacks invoked by
> mmu_notifier_invalidate_range_start can't sleep, or rcu locking inside
> mmu notifier will break too (first thing that should be replaced with
> srcu if they were allowed to sleep).
> 
> In short there's no schedule that could be added because of those
> callbacks so if this code isn't ok and schedules and screw on the
> mmu_gather tlb it's probably not mmu notifier related.

Hmmmm. I was under the impression that the range callbacks _were_ allowed to
sleep. That was certainly in the original patches posted by christoph. I was
not aware that it had changed.

I'll let Robin provide details but the way mmu_notifiers are used by
xpmem, I believe sleeping is essential. The 'start" callout sends messages to
another OS instances to teardown cross-SSI mappings. That is difficult
to do w/o a sleep.

We'll dig deeper tomorrow & see what we can do....

--- jack

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-25 21:10     ` Jack Steiner
@ 2010-01-25 21:16       ` Andrea Arcangeli
  -1 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-25 21:16 UTC (permalink / raw)
  To: Jack Steiner; +Cc: Robin Holt, cl, mingo, tglx, linux-mm, linux-kernel

On Mon, Jan 25, 2010 at 03:10:33PM -0600, Jack Steiner wrote:
> On Mon, Jan 25, 2010 at 08:00:52PM +0100, Andrea Arcangeli wrote:
> > On Mon, Jan 25, 2010 at 11:45:56AM -0600, Jack Steiner wrote:
> > > unmap_vmas() can fail to correctly flush the TLB if a
> > > callout to mmu_notifier_invalidate_range_start() sleeps.
> > 
> > Not sure I understand: the callbacks invoked by
> > mmu_notifier_invalidate_range_start can't sleep, or rcu locking inside
> > mmu notifier will break too (first thing that should be replaced with
> > srcu if they were allowed to sleep).
> > 
> > In short there's no schedule that could be added because of those
> > callbacks so if this code isn't ok and schedules and screw on the
> > mmu_gather tlb it's probably not mmu notifier related.
> 
> Hmmmm. I was under the impression that the range callbacks _were_ allowed to
> sleep. That was certainly in the original patches posted by christoph. I was
> not aware that it had changed.

There's rcu, srcu not... srcu is the first thing to add, and then
anon_vma refcounting, then we can go sleepable...

> I'll let Robin provide details but the way mmu_notifiers are used by
> xpmem, I believe sleeping is essential. The 'start" callout sends messages to
> another OS instances to teardown cross-SSI mappings. That is difficult
> to do w/o a sleep.
> 
> We'll dig deeper tomorrow & see what we can do....

The old patches are in my ftp area, they should still apply, you
should concentrate testing with those additional ones applied, then it
will work for xpmem too ;)

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-25 21:16       ` Andrea Arcangeli
  0 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-25 21:16 UTC (permalink / raw)
  To: Jack Steiner; +Cc: Robin Holt, cl, mingo, tglx, linux-mm, linux-kernel

On Mon, Jan 25, 2010 at 03:10:33PM -0600, Jack Steiner wrote:
> On Mon, Jan 25, 2010 at 08:00:52PM +0100, Andrea Arcangeli wrote:
> > On Mon, Jan 25, 2010 at 11:45:56AM -0600, Jack Steiner wrote:
> > > unmap_vmas() can fail to correctly flush the TLB if a
> > > callout to mmu_notifier_invalidate_range_start() sleeps.
> > 
> > Not sure I understand: the callbacks invoked by
> > mmu_notifier_invalidate_range_start can't sleep, or rcu locking inside
> > mmu notifier will break too (first thing that should be replaced with
> > srcu if they were allowed to sleep).
> > 
> > In short there's no schedule that could be added because of those
> > callbacks so if this code isn't ok and schedules and screw on the
> > mmu_gather tlb it's probably not mmu notifier related.
> 
> Hmmmm. I was under the impression that the range callbacks _were_ allowed to
> sleep. That was certainly in the original patches posted by christoph. I was
> not aware that it had changed.

There's rcu, srcu not... srcu is the first thing to add, and then
anon_vma refcounting, then we can go sleepable...

> I'll let Robin provide details but the way mmu_notifiers are used by
> xpmem, I believe sleeping is essential. The 'start" callout sends messages to
> another OS instances to teardown cross-SSI mappings. That is difficult
> to do w/o a sleep.
> 
> We'll dig deeper tomorrow & see what we can do....

The old patches are in my ftp area, they should still apply, you
should concentrate testing with those additional ones applied, then it
will work for xpmem too ;)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-25 21:16       ` Andrea Arcangeli
@ 2010-01-26 21:29         ` Robin Holt
  -1 siblings, 0 replies; 22+ messages in thread
From: Robin Holt @ 2010-01-26 21:29 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Robin Holt, cl, mingo, tglx, linux-mm, linux-kernel

On Mon, Jan 25, 2010 at 10:16:15PM +0100, Andrea Arcangeli wrote:
> The old patches are in my ftp area, they should still apply, you
> should concentrate testing with those additional ones applied, then it
> will work for xpmem too ;)

Andrea, could you point me at your ftp area?

Thanks,
Robin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-26 21:29         ` Robin Holt
  0 siblings, 0 replies; 22+ messages in thread
From: Robin Holt @ 2010-01-26 21:29 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jack Steiner, Robin Holt, cl, mingo, tglx, linux-mm, linux-kernel

On Mon, Jan 25, 2010 at 10:16:15PM +0100, Andrea Arcangeli wrote:
> The old patches are in my ftp area, they should still apply, you
> should concentrate testing with those additional ones applied, then it
> will work for xpmem too ;)

Andrea, could you point me at your ftp area?

Thanks,
Robin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-26 21:29         ` Robin Holt
@ 2010-01-26 21:38           ` Andrea Arcangeli
  -1 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-26 21:38 UTC (permalink / raw)
  To: Robin Holt; +Cc: Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Tue, Jan 26, 2010 at 03:29:04PM -0600, Robin Holt wrote:
> On Mon, Jan 25, 2010 at 10:16:15PM +0100, Andrea Arcangeli wrote:
> > The old patches are in my ftp area, they should still apply, you
> > should concentrate testing with those additional ones applied, then it
> > will work for xpmem too ;)
> 
> Andrea, could you point me at your ftp area?

Sure, this is the very latest version I maintained:

http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.26-rc7/mmu-notifier-v18/

Note, it may be an option to make mmu notifier sleepable through
.config, unless people uses xpmem there is no reason to add
refcounting to vmas. That is something we'd pay even if no KVM is used
and no mmu notifer is used. I think the ideal is that anon-vma lock
should be a rwspinlock and only rcu (no refcounting) with
MMU_NOTIFIER_SLEEPABLE=n, and a read-write sem + refcounting if
MMU_NOTIFIER_SLEEPABLE=y. MMU_NOTIFIER_SLEEPABLE doesn't need to be
user visible, simply XPMEM=y will automatically set
MMU_NOTIFIER_SLEEPABLE=y.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-26 21:38           ` Andrea Arcangeli
  0 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-26 21:38 UTC (permalink / raw)
  To: Robin Holt; +Cc: Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Tue, Jan 26, 2010 at 03:29:04PM -0600, Robin Holt wrote:
> On Mon, Jan 25, 2010 at 10:16:15PM +0100, Andrea Arcangeli wrote:
> > The old patches are in my ftp area, they should still apply, you
> > should concentrate testing with those additional ones applied, then it
> > will work for xpmem too ;)
> 
> Andrea, could you point me at your ftp area?

Sure, this is the very latest version I maintained:

http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.26-rc7/mmu-notifier-v18/

Note, it may be an option to make mmu notifier sleepable through
.config, unless people uses xpmem there is no reason to add
refcounting to vmas. That is something we'd pay even if no KVM is used
and no mmu notifer is used. I think the ideal is that anon-vma lock
should be a rwspinlock and only rcu (no refcounting) with
MMU_NOTIFIER_SLEEPABLE=n, and a read-write sem + refcounting if
MMU_NOTIFIER_SLEEPABLE=y. MMU_NOTIFIER_SLEEPABLE doesn't need to be
user visible, simply XPMEM=y will automatically set
MMU_NOTIFIER_SLEEPABLE=y.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-26 21:38           ` Andrea Arcangeli
@ 2010-01-28  3:18             ` Robin Holt
  -1 siblings, 0 replies; 22+ messages in thread
From: Robin Holt @ 2010-01-28  3:18 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Robin Holt, Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Tue, Jan 26, 2010 at 10:38:53PM +0100, Andrea Arcangeli wrote:
> On Tue, Jan 26, 2010 at 03:29:04PM -0600, Robin Holt wrote:
> > On Mon, Jan 25, 2010 at 10:16:15PM +0100, Andrea Arcangeli wrote:
> > > The old patches are in my ftp area, they should still apply, you
> > > should concentrate testing with those additional ones applied, then it
> > > will work for xpmem too ;)
> > 
> > Andrea, could you point me at your ftp area?
> 
> Sure, this is the very latest version I maintained:
> 
> http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.26-rc7/mmu-notifier-v18/

Let me start with what XPMEM currently has.

We adjusted xpmem so that the mmu_notifier_invalidate_page() callout
does not need to sleep.  It takes the arguments passed in and adds them
to a queue for clearing the pages.  We added a seperate kernel thread
which manages this clearing.

The mmu_notifier_invalidate_range_end() likewise does not really need
to sleep either.

That leaves the mmu_notifier_invalidate_range_start() callout.  This does
not need to drop the mm_sem.  It does need to be able to sleep waiting
for the invalidations to complete on the other process.  That other
process may be on a different SSI connected to the same Numalink fabric.

I think that with the SRCU patch, we have enough.  Is that true or have
I missed something?

Thanks,
Robin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-28  3:18             ` Robin Holt
  0 siblings, 0 replies; 22+ messages in thread
From: Robin Holt @ 2010-01-28  3:18 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Robin Holt, Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Tue, Jan 26, 2010 at 10:38:53PM +0100, Andrea Arcangeli wrote:
> On Tue, Jan 26, 2010 at 03:29:04PM -0600, Robin Holt wrote:
> > On Mon, Jan 25, 2010 at 10:16:15PM +0100, Andrea Arcangeli wrote:
> > > The old patches are in my ftp area, they should still apply, you
> > > should concentrate testing with those additional ones applied, then it
> > > will work for xpmem too ;)
> > 
> > Andrea, could you point me at your ftp area?
> 
> Sure, this is the very latest version I maintained:
> 
> http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.26-rc7/mmu-notifier-v18/

Let me start with what XPMEM currently has.

We adjusted xpmem so that the mmu_notifier_invalidate_page() callout
does not need to sleep.  It takes the arguments passed in and adds them
to a queue for clearing the pages.  We added a seperate kernel thread
which manages this clearing.

The mmu_notifier_invalidate_range_end() likewise does not really need
to sleep either.

That leaves the mmu_notifier_invalidate_range_start() callout.  This does
not need to drop the mm_sem.  It does need to be able to sleep waiting
for the invalidations to complete on the other process.  That other
process may be on a different SSI connected to the same Numalink fabric.

I think that with the SRCU patch, we have enough.  Is that true or have
I missed something?

Thanks,
Robin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-28  3:18             ` Robin Holt
@ 2010-01-28  3:49               ` Robin Holt
  -1 siblings, 0 replies; 22+ messages in thread
From: Robin Holt @ 2010-01-28  3:49 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Robin Holt, Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Wed, Jan 27, 2010 at 09:18:42PM -0600, Robin Holt wrote:
> On Tue, Jan 26, 2010 at 10:38:53PM +0100, Andrea Arcangeli wrote:
> > On Tue, Jan 26, 2010 at 03:29:04PM -0600, Robin Holt wrote:
> > > On Mon, Jan 25, 2010 at 10:16:15PM +0100, Andrea Arcangeli wrote:
> > > > The old patches are in my ftp area, they should still apply, you
> > > > should concentrate testing with those additional ones applied, then it
> > > > will work for xpmem too ;)
> > > 
> > > Andrea, could you point me at your ftp area?
> > 
> > Sure, this is the very latest version I maintained:
> > 
> > http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.26-rc7/mmu-notifier-v18/
> 
> Let me start with what XPMEM currently has.
> 
> We adjusted xpmem so that the mmu_notifier_invalidate_page() callout
> does not need to sleep.  It takes the arguments passed in and adds them
> to a queue for clearing the pages.  We added a seperate kernel thread
> which manages this clearing.
> 
> The mmu_notifier_invalidate_range_end() likewise does not really need
> to sleep either.
> 
> That leaves the mmu_notifier_invalidate_range_start() callout.  This does
> not need to drop the mm_sem.  It does need to be able to sleep waiting
> for the invalidations to complete on the other process.  That other
> process may be on a different SSI connected to the same Numalink fabric.
> 
> I think that with the SRCU patch, we have enough.  Is that true or have
> I missed something?

I wasn't quite complete in my previous email.  Your srcu patch
plus Jack's patch to move the tlb_gather_mmu to after the
mmu_notifier_invalidate_range_start().

Robin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-28  3:49               ` Robin Holt
  0 siblings, 0 replies; 22+ messages in thread
From: Robin Holt @ 2010-01-28  3:49 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Robin Holt, Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Wed, Jan 27, 2010 at 09:18:42PM -0600, Robin Holt wrote:
> On Tue, Jan 26, 2010 at 10:38:53PM +0100, Andrea Arcangeli wrote:
> > On Tue, Jan 26, 2010 at 03:29:04PM -0600, Robin Holt wrote:
> > > On Mon, Jan 25, 2010 at 10:16:15PM +0100, Andrea Arcangeli wrote:
> > > > The old patches are in my ftp area, they should still apply, you
> > > > should concentrate testing with those additional ones applied, then it
> > > > will work for xpmem too ;)
> > > 
> > > Andrea, could you point me at your ftp area?
> > 
> > Sure, this is the very latest version I maintained:
> > 
> > http://www.kernel.org/pub/linux/kernel/people/andrea/patches/v2.6/2.6.26-rc7/mmu-notifier-v18/
> 
> Let me start with what XPMEM currently has.
> 
> We adjusted xpmem so that the mmu_notifier_invalidate_page() callout
> does not need to sleep.  It takes the arguments passed in and adds them
> to a queue for clearing the pages.  We added a seperate kernel thread
> which manages this clearing.
> 
> The mmu_notifier_invalidate_range_end() likewise does not really need
> to sleep either.
> 
> That leaves the mmu_notifier_invalidate_range_start() callout.  This does
> not need to drop the mm_sem.  It does need to be able to sleep waiting
> for the invalidations to complete on the other process.  That other
> process may be on a different SSI connected to the same Numalink fabric.
> 
> I think that with the SRCU patch, we have enough.  Is that true or have
> I missed something?

I wasn't quite complete in my previous email.  Your srcu patch
plus Jack's patch to move the tlb_gather_mmu to after the
mmu_notifier_invalidate_range_start().

Robin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-28  3:49               ` Robin Holt
@ 2010-01-28 10:03                 ` Andrea Arcangeli
  -1 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-28 10:03 UTC (permalink / raw)
  To: Robin Holt; +Cc: Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Wed, Jan 27, 2010 at 09:49:44PM -0600, Robin Holt wrote:
> > I think that with the SRCU patch, we have enough.  Is that true or have
> > I missed something?
> 
> I wasn't quite complete in my previous email.  Your srcu patch
> plus Jack's patch to move the tlb_gather_mmu to after the
> mmu_notifier_invalidate_range_start().

My pmdp_clear_flush_notify with transparent hugepage will give some
trouble because it's using mmu_notifier_invalidate_range_start to
provide backwards compatible API to mmu notifier users like GRU that
may be mapping physical hugepages with 4k secondary tlb mappings
(which have to be all invalidated not only the first one). So that
would still require the full series as it's like if the rmap code
would be using mmu_notifier_invalidate_range_start. But we can
probably get away by forcing all mmu notifier methods to provide a
mmu_notifier_invalidate_hugepage.

But in addition to srcu surely you also need i_mmap_lock_to_sem for
unmap_mapping_range_vma taking i_mmap_lock, basically you missed
truncate. Which then in cascade requires free_pgtables,
rwsem-contended, unmap_vmas (the latter are for the tlb gather
required to be in atomic context to avoid scheduling to other cpus
while holding the tlb gather).

So you only avoid the need of anon-vma switching to rwsem (because
there's no range-vmtruncate but only rmap uses it on a page-by-page
basis with mmu_notifier_invalidate_page). So at that point IMHO you
can as well add a CONFIG_MMU_NOTIFIER_SLEEPABLE and allow scheduling
everywhere in mmu notifier IMHO, but if you prefer to avoid changing
anon_vma lock to rwsem and add refcounting that is ok with me too. I
just think it'd be cleaner to switch them all to sleepable code if we
have to provide for it and most of the work on the i_mmap_lock side is
mandatory anyway.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-28 10:03                 ` Andrea Arcangeli
  0 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-28 10:03 UTC (permalink / raw)
  To: Robin Holt; +Cc: Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Wed, Jan 27, 2010 at 09:49:44PM -0600, Robin Holt wrote:
> > I think that with the SRCU patch, we have enough.  Is that true or have
> > I missed something?
> 
> I wasn't quite complete in my previous email.  Your srcu patch
> plus Jack's patch to move the tlb_gather_mmu to after the
> mmu_notifier_invalidate_range_start().

My pmdp_clear_flush_notify with transparent hugepage will give some
trouble because it's using mmu_notifier_invalidate_range_start to
provide backwards compatible API to mmu notifier users like GRU that
may be mapping physical hugepages with 4k secondary tlb mappings
(which have to be all invalidated not only the first one). So that
would still require the full series as it's like if the rmap code
would be using mmu_notifier_invalidate_range_start. But we can
probably get away by forcing all mmu notifier methods to provide a
mmu_notifier_invalidate_hugepage.

But in addition to srcu surely you also need i_mmap_lock_to_sem for
unmap_mapping_range_vma taking i_mmap_lock, basically you missed
truncate. Which then in cascade requires free_pgtables,
rwsem-contended, unmap_vmas (the latter are for the tlb gather
required to be in atomic context to avoid scheduling to other cpus
while holding the tlb gather).

So you only avoid the need of anon-vma switching to rwsem (because
there's no range-vmtruncate but only rmap uses it on a page-by-page
basis with mmu_notifier_invalidate_page). So at that point IMHO you
can as well add a CONFIG_MMU_NOTIFIER_SLEEPABLE and allow scheduling
everywhere in mmu notifier IMHO, but if you prefer to avoid changing
anon_vma lock to rwsem and add refcounting that is ok with me too. I
just think it'd be cleaner to switch them all to sleepable code if we
have to provide for it and most of the work on the i_mmap_lock side is
mandatory anyway.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-28 10:03                 ` Andrea Arcangeli
@ 2010-01-28 13:25                   ` Robin Holt
  -1 siblings, 0 replies; 22+ messages in thread
From: Robin Holt @ 2010-01-28 13:25 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Robin Holt, Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Thu, Jan 28, 2010 at 11:03:27AM +0100, Andrea Arcangeli wrote:
> On Wed, Jan 27, 2010 at 09:49:44PM -0600, Robin Holt wrote:
> > > I think that with the SRCU patch, we have enough.  Is that true or have
> > > I missed something?
> > 
> > I wasn't quite complete in my previous email.  Your srcu patch
> > plus Jack's patch to move the tlb_gather_mmu to after the
> > mmu_notifier_invalidate_range_start().
> 
> My pmdp_clear_flush_notify with transparent hugepage will give some
> trouble because it's using mmu_notifier_invalidate_range_start to
> provide backwards compatible API to mmu notifier users like GRU that
> may be mapping physical hugepages with 4k secondary tlb mappings
> (which have to be all invalidated not only the first one). So that
> would still require the full series as it's like if the rmap code
> would be using mmu_notifier_invalidate_range_start. But we can
> probably get away by forcing all mmu notifier methods to provide a
> mmu_notifier_invalidate_hugepage.

The GRU is using a hardware TLB of 2MB page size when the
is_vm_hugetlb_page() indicates it is a 2MB vma.  From my reading of it,
your callout to mmu_notifier_invalidate_page() will work for gru and I
think it will work for XPMEM as well, but I am not certain of that yet.
I am certain that it can be made to work for XPMEM.  I think using the
range callouts are actually worse because now we are mixing the conceptual
uses of page and range.

> But in addition to srcu surely you also need i_mmap_lock_to_sem for
> unmap_mapping_range_vma taking i_mmap_lock, basically you missed
> truncate. Which then in cascade requires free_pgtables,

I must be missing something key here.  I thought unmap_mapping_range_vma
would percolate down to calling mmu_notifier_invalidate_page() which
xpmem can sleep in.  Based upon that assumption, I don't see the
need for the other patches.

> rwsem-contended, unmap_vmas (the latter are for the tlb gather
> required to be in atomic context to avoid scheduling to other cpus
> while holding the tlb gather).
> 
> So you only avoid the need of anon-vma switching to rwsem (because
> there's no range-vmtruncate but only rmap uses it on a page-by-page
> basis with mmu_notifier_invalidate_page). So at that point IMHO you
> can as well add a CONFIG_MMU_NOTIFIER_SLEEPABLE and allow scheduling
> everywhere in mmu notifier IMHO, but if you prefer to avoid changing
> anon_vma lock to rwsem and add refcounting that is ok with me too. I
> just think it'd be cleaner to switch them all to sleepable code if we
> have to provide for it and most of the work on the i_mmap_lock side is
> mandatory anyway.

I don't see the mandatory part here.  Maybe it is your broken english
combined with my ignorance, but I do not see what the statement
"i_mmap_lock side is mandatory" is based upon.  It looks to me like
everywhere that is calling an mmu_notifier_invalidate_* while holding
the i_mmap_lock is calling mmu_notifier_invalidate_page().  That is
currently safe for sleeping as far as XPMEM is concerned.  Is there a
place that is calling mmu_notifier_invalidate_range_*() while holding
the i_mmap_lock which I have missed?

Thanks,
Robin

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-28 13:25                   ` Robin Holt
  0 siblings, 0 replies; 22+ messages in thread
From: Robin Holt @ 2010-01-28 13:25 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Robin Holt, Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Thu, Jan 28, 2010 at 11:03:27AM +0100, Andrea Arcangeli wrote:
> On Wed, Jan 27, 2010 at 09:49:44PM -0600, Robin Holt wrote:
> > > I think that with the SRCU patch, we have enough.  Is that true or have
> > > I missed something?
> > 
> > I wasn't quite complete in my previous email.  Your srcu patch
> > plus Jack's patch to move the tlb_gather_mmu to after the
> > mmu_notifier_invalidate_range_start().
> 
> My pmdp_clear_flush_notify with transparent hugepage will give some
> trouble because it's using mmu_notifier_invalidate_range_start to
> provide backwards compatible API to mmu notifier users like GRU that
> may be mapping physical hugepages with 4k secondary tlb mappings
> (which have to be all invalidated not only the first one). So that
> would still require the full series as it's like if the rmap code
> would be using mmu_notifier_invalidate_range_start. But we can
> probably get away by forcing all mmu notifier methods to provide a
> mmu_notifier_invalidate_hugepage.

The GRU is using a hardware TLB of 2MB page size when the
is_vm_hugetlb_page() indicates it is a 2MB vma.  From my reading of it,
your callout to mmu_notifier_invalidate_page() will work for gru and I
think it will work for XPMEM as well, but I am not certain of that yet.
I am certain that it can be made to work for XPMEM.  I think using the
range callouts are actually worse because now we are mixing the conceptual
uses of page and range.

> But in addition to srcu surely you also need i_mmap_lock_to_sem for
> unmap_mapping_range_vma taking i_mmap_lock, basically you missed
> truncate. Which then in cascade requires free_pgtables,

I must be missing something key here.  I thought unmap_mapping_range_vma
would percolate down to calling mmu_notifier_invalidate_page() which
xpmem can sleep in.  Based upon that assumption, I don't see the
need for the other patches.

> rwsem-contended, unmap_vmas (the latter are for the tlb gather
> required to be in atomic context to avoid scheduling to other cpus
> while holding the tlb gather).
> 
> So you only avoid the need of anon-vma switching to rwsem (because
> there's no range-vmtruncate but only rmap uses it on a page-by-page
> basis with mmu_notifier_invalidate_page). So at that point IMHO you
> can as well add a CONFIG_MMU_NOTIFIER_SLEEPABLE and allow scheduling
> everywhere in mmu notifier IMHO, but if you prefer to avoid changing
> anon_vma lock to rwsem and add refcounting that is ok with me too. I
> just think it'd be cleaner to switch them all to sleepable code if we
> have to provide for it and most of the work on the i_mmap_lock side is
> mandatory anyway.

I don't see the mandatory part here.  Maybe it is your broken english
combined with my ignorance, but I do not see what the statement
"i_mmap_lock side is mandatory" is based upon.  It looks to me like
everywhere that is calling an mmu_notifier_invalidate_* while holding
the i_mmap_lock is calling mmu_notifier_invalidate_page().  That is
currently safe for sleeping as far as XPMEM is concerned.  Is there a
place that is calling mmu_notifier_invalidate_range_*() while holding
the i_mmap_lock which I have missed?

Thanks,
Robin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
  2010-01-28 13:25                   ` Robin Holt
@ 2010-01-28 15:20                     ` Andrea Arcangeli
  -1 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-28 15:20 UTC (permalink / raw)
  To: Robin Holt; +Cc: Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Thu, Jan 28, 2010 at 07:25:03AM -0600, Robin Holt wrote:
> The GRU is using a hardware TLB of 2MB page size when the
> is_vm_hugetlb_page() indicates it is a 2MB vma.  From my reading of it,
> your callout to mmu_notifier_invalidate_page() will work for gru and I
> think it will work for XPMEM as well, but I am not certain of that yet.
> I am certain that it can be made to work for XPMEM.  I think using the
> range callouts are actually worse because now we are mixing the conceptual
> uses of page and range.

KVM also will obviously be fine, the whole point of transparent
hugepage support is to allow mapping 2M tlb and 2M shadow pages in the
spmd... In fact I'm already calling the 4k callout for the 2M
pmdp_flush_clear_young_notify... because worst case that won't cash
but only swap less smart. However at the moment start/stop is just
safer... and gives more peace of mind and they can't schedule
anyway... but I surely would prefer a single call too, if nothing else
for performance reasons. Said that it's definitely not a fast path
worth worrying about for KVM.

Even the tlb_flush of pmdp_*flush on x86 is a range flush in
transparent huegepage support because I found errata that invlpg isn't
ok to flush PSE tlb on some cpus but then I didn't check the details,
I just wanted less risk right now, later it can be optimized (worst
case dependent on cpuid).

Note also that pmdp_splitting_flush_notify probably can drop the
_notify part. As long as there is symmetry with the "pages" returned
by gup and their respective put_page and you only use the "page" to do
put_page and get its physical address, there is no need to be notified
about a split_huge_page. At the moment it seems just a little more
paranoid but again not a requirement by design because hugepages are
stable, and only thing that can require an invalidate is a physical
relocation that only happens during swap (or similar). split_huge_page
doesn't affect _physical_ at all, and in turn there is in theory no
need to modify the secondary mmu mappings, when the primary mmu
mappings are altered. One of the reasons of altering the primary mmu
mappings is to avoid confusing the code that can't handle huge pmd
natively and would crash on them, so we virtually split the page to
show to that code an environment it won't find itself lost.

> I must be missing something key here.  I thought unmap_mapping_range_vma
> would percolate down to calling mmu_notifier_invalidate_page() which
> xpmem can sleep in.  Based upon that assumption, I don't see the
> need for the other patches.

unmap_mapping_range takes i_mmap_lock (spinlock) and then calls
zap_page_range that calls unmap_vmas under spinlock, that leads to
mmu_notifier_invalidate_range_start under i_mmap_lock. That only
happens for truncate... That was also the reason that Christioph's
first patchset had a sleep parameter in its version of
mmu_notifier_invalidate_something(sleep) (and sleep=0 was passed when
it was called inside truncate IIRC).

> I don't see the mandatory part here.  Maybe it is your broken english

Eheh, my english so bad ah... :(. And my writing is probably better
than my pronunciation ;)

> combined with my ignorance, but I do not see what the statement
> "i_mmap_lock side is mandatory" is based upon.  It looks to me like

Tried to explain again above, hope it is clearer this time.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] - Fix unmap_vma() bug related to mmu_notifiers
@ 2010-01-28 15:20                     ` Andrea Arcangeli
  0 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2010-01-28 15:20 UTC (permalink / raw)
  To: Robin Holt; +Cc: Jack Steiner, cl, mingo, tglx, linux-mm, linux-kernel

On Thu, Jan 28, 2010 at 07:25:03AM -0600, Robin Holt wrote:
> The GRU is using a hardware TLB of 2MB page size when the
> is_vm_hugetlb_page() indicates it is a 2MB vma.  From my reading of it,
> your callout to mmu_notifier_invalidate_page() will work for gru and I
> think it will work for XPMEM as well, but I am not certain of that yet.
> I am certain that it can be made to work for XPMEM.  I think using the
> range callouts are actually worse because now we are mixing the conceptual
> uses of page and range.

KVM also will obviously be fine, the whole point of transparent
hugepage support is to allow mapping 2M tlb and 2M shadow pages in the
spmd... In fact I'm already calling the 4k callout for the 2M
pmdp_flush_clear_young_notify... because worst case that won't cash
but only swap less smart. However at the moment start/stop is just
safer... and gives more peace of mind and they can't schedule
anyway... but I surely would prefer a single call too, if nothing else
for performance reasons. Said that it's definitely not a fast path
worth worrying about for KVM.

Even the tlb_flush of pmdp_*flush on x86 is a range flush in
transparent huegepage support because I found errata that invlpg isn't
ok to flush PSE tlb on some cpus but then I didn't check the details,
I just wanted less risk right now, later it can be optimized (worst
case dependent on cpuid).

Note also that pmdp_splitting_flush_notify probably can drop the
_notify part. As long as there is symmetry with the "pages" returned
by gup and their respective put_page and you only use the "page" to do
put_page and get its physical address, there is no need to be notified
about a split_huge_page. At the moment it seems just a little more
paranoid but again not a requirement by design because hugepages are
stable, and only thing that can require an invalidate is a physical
relocation that only happens during swap (or similar). split_huge_page
doesn't affect _physical_ at all, and in turn there is in theory no
need to modify the secondary mmu mappings, when the primary mmu
mappings are altered. One of the reasons of altering the primary mmu
mappings is to avoid confusing the code that can't handle huge pmd
natively and would crash on them, so we virtually split the page to
show to that code an environment it won't find itself lost.

> I must be missing something key here.  I thought unmap_mapping_range_vma
> would percolate down to calling mmu_notifier_invalidate_page() which
> xpmem can sleep in.  Based upon that assumption, I don't see the
> need for the other patches.

unmap_mapping_range takes i_mmap_lock (spinlock) and then calls
zap_page_range that calls unmap_vmas under spinlock, that leads to
mmu_notifier_invalidate_range_start under i_mmap_lock. That only
happens for truncate... That was also the reason that Christioph's
first patchset had a sleep parameter in its version of
mmu_notifier_invalidate_something(sleep) (and sleep=0 was passed when
it was called inside truncate IIRC).

> I don't see the mandatory part here.  Maybe it is your broken english

Eheh, my english so bad ah... :(. And my writing is probably better
than my pronunciation ;)

> combined with my ignorance, but I do not see what the statement
> "i_mmap_lock side is mandatory" is based upon.  It looks to me like

Tried to explain again above, hope it is clearer this time.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2010-01-28 15:21 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-25 17:45 [PATCH] - Fix unmap_vma() bug related to mmu_notifiers Jack Steiner
2010-01-25 17:45 ` Jack Steiner
2010-01-25 19:00 ` Andrea Arcangeli
2010-01-25 19:00   ` Andrea Arcangeli
2010-01-25 21:10   ` Jack Steiner
2010-01-25 21:10     ` Jack Steiner
2010-01-25 21:16     ` Andrea Arcangeli
2010-01-25 21:16       ` Andrea Arcangeli
2010-01-26 21:29       ` Robin Holt
2010-01-26 21:29         ` Robin Holt
2010-01-26 21:38         ` Andrea Arcangeli
2010-01-26 21:38           ` Andrea Arcangeli
2010-01-28  3:18           ` Robin Holt
2010-01-28  3:18             ` Robin Holt
2010-01-28  3:49             ` Robin Holt
2010-01-28  3:49               ` Robin Holt
2010-01-28 10:03               ` Andrea Arcangeli
2010-01-28 10:03                 ` Andrea Arcangeli
2010-01-28 13:25                 ` Robin Holt
2010-01-28 13:25                   ` Robin Holt
2010-01-28 15:20                   ` Andrea Arcangeli
2010-01-28 15:20                     ` Andrea Arcangeli

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.