All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-15 20:11 ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-15 20:11 UTC (permalink / raw)
  To: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm
  Cc: Andres Lagar-Cavilla

When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory has been
swapped out or is behind a filemap, this will trigger async readahead and
return immediately. The rationale is that KVM will kick back the guest with an
"async page fault" and allow for some other guest process to take over.

If async PFs are enabled the fault is retried asap from a workqueue, or
immediately if no async PFs. The retry will not relinquish the mmap semaphore
and will block on the IO. This is a bad thing, as other mmap semaphore users
now stall. The fault could take a long time, depending on swap or filemap
latency.

This patch ensures both the regular and async PF path re-enter the fault
allowing for the mmap semaphore to be relinquished in the case of IO wait.

Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>
---
 include/linux/kvm_host.h |  9 +++++++++
 include/linux/mm.h       |  1 +
 mm/gup.c                 |  4 ++++
 virt/kvm/async_pf.c      |  4 +---
 virt/kvm/kvm_main.c      | 45 ++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3addcbc..704908d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -198,6 +198,15 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+/*
+ * Retry a fault after a gup with FOLL_NOWAIT. This properly relinquishes mmap
+ * semaphore if the filemap/swap has to wait on page lock (and retries the gup
+ * to completion after that).
+ */
+int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long addr, bool write_fault,
+			    struct page **pagep);
+
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ebc5f90..13e585f7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2011,6 +2011,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
+#define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/mm/gup.c b/mm/gup.c
index 91d044b..332d1c3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -281,6 +281,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 	if (*flags & FOLL_NOWAIT)
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+	if (*flags & FOLL_TRIED) {
+		WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
+		fault_flags |= FAULT_FLAG_TRIED;
+	}
 
 	ret = handle_mm_fault(mm, vma, address, fault_flags);
 	if (ret & VM_FAULT_ERROR) {
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index d6a3d09..17b78b1 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
 
 	might_sleep();
 
-	down_read(&mm->mmap_sem);
-	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
-	up_read(&mm->mmap_sem);
+	kvm_get_user_page_retry(NULL, mm, addr, 1, NULL);
 	kvm_async_page_present_sync(vcpu, apf);
 
 	spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7ef6b48..43a9ab9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1115,6 +1115,39 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 }
 
+int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
+				unsigned long addr, bool write_fault,
+				struct page **pagep)
+{
+	int npages;
+	int locked = 1;
+	int flags = FOLL_TOUCH | FOLL_HWPOISON |
+		    (pagep ? FOLL_GET : 0) |
+		    (write_fault ? FOLL_WRITE : 0);
+
+	/*
+	 * Retrying fault, we get here *not* having allowed the filemap to wait
+	 * on the page lock. We should now allow waiting on the IO with the
+	 * mmap semaphore released.
+	 */
+	down_read(&mm->mmap_sem);
+	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
+				  &locked);
+	if (!locked) {
+		BUG_ON(npages != -EBUSY);
+		/*
+		 * The previous call has now waited on the IO. Now we can
+		 * retry and complete. Pass TRIED to ensure we do not re
+		 * schedule async IO (see e.g. filemap_fault).
+		 */
+		down_read(&mm->mmap_sem);
+		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
+					  pagep, NULL, NULL);
+	}
+	up_read(&mm->mmap_sem);
+	return npages;
+}
+
 static inline int check_user_page_hwpoison(unsigned long addr)
 {
 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 		npages = get_user_page_nowait(current, current->mm,
 					      addr, write_fault, page);
 		up_read(&current->mm->mmap_sem);
-	} else
-		npages = get_user_pages_fast(addr, 1, write_fault,
-					     page);
+	} else {
+		/*
+		 * By now we have tried gup_fast, and possible async_pf, and we
+		 * are certainly not atomic. Time to retry the gup, allowing
+		 * mmap semaphore to be relinquished in the case of IO.
+		 */
+		npages = kvm_get_user_page_retry(current, current->mm, addr,
+						 write_fault, page);
+	}
 	if (npages != 1)
 		return npages;
 
-- 
2.1.0.rc2.206.gedb03e5


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-15 20:11 ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-15 20:11 UTC (permalink / raw)
  To: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm
  Cc: Andres Lagar-Cavilla

When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory has been
swapped out or is behind a filemap, this will trigger async readahead and
return immediately. The rationale is that KVM will kick back the guest with an
"async page fault" and allow for some other guest process to take over.

If async PFs are enabled the fault is retried asap from a workqueue, or
immediately if no async PFs. The retry will not relinquish the mmap semaphore
and will block on the IO. This is a bad thing, as other mmap semaphore users
now stall. The fault could take a long time, depending on swap or filemap
latency.

This patch ensures both the regular and async PF path re-enter the fault
allowing for the mmap semaphore to be relinquished in the case of IO wait.

Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>
---
 include/linux/kvm_host.h |  9 +++++++++
 include/linux/mm.h       |  1 +
 mm/gup.c                 |  4 ++++
 virt/kvm/async_pf.c      |  4 +---
 virt/kvm/kvm_main.c      | 45 ++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3addcbc..704908d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -198,6 +198,15 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+/*
+ * Retry a fault after a gup with FOLL_NOWAIT. This properly relinquishes mmap
+ * semaphore if the filemap/swap has to wait on page lock (and retries the gup
+ * to completion after that).
+ */
+int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
+			    unsigned long addr, bool write_fault,
+			    struct page **pagep);
+
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ebc5f90..13e585f7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2011,6 +2011,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
+#define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/mm/gup.c b/mm/gup.c
index 91d044b..332d1c3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -281,6 +281,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 	if (*flags & FOLL_NOWAIT)
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+	if (*flags & FOLL_TRIED) {
+		WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
+		fault_flags |= FAULT_FLAG_TRIED;
+	}
 
 	ret = handle_mm_fault(mm, vma, address, fault_flags);
 	if (ret & VM_FAULT_ERROR) {
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index d6a3d09..17b78b1 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
 
 	might_sleep();
 
-	down_read(&mm->mmap_sem);
-	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
-	up_read(&mm->mmap_sem);
+	kvm_get_user_page_retry(NULL, mm, addr, 1, NULL);
 	kvm_async_page_present_sync(vcpu, apf);
 
 	spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7ef6b48..43a9ab9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1115,6 +1115,39 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 }
 
+int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
+				unsigned long addr, bool write_fault,
+				struct page **pagep)
+{
+	int npages;
+	int locked = 1;
+	int flags = FOLL_TOUCH | FOLL_HWPOISON |
+		    (pagep ? FOLL_GET : 0) |
+		    (write_fault ? FOLL_WRITE : 0);
+
+	/*
+	 * Retrying fault, we get here *not* having allowed the filemap to wait
+	 * on the page lock. We should now allow waiting on the IO with the
+	 * mmap semaphore released.
+	 */
+	down_read(&mm->mmap_sem);
+	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
+				  &locked);
+	if (!locked) {
+		BUG_ON(npages != -EBUSY);
+		/*
+		 * The previous call has now waited on the IO. Now we can
+		 * retry and complete. Pass TRIED to ensure we do not re
+		 * schedule async IO (see e.g. filemap_fault).
+		 */
+		down_read(&mm->mmap_sem);
+		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
+					  pagep, NULL, NULL);
+	}
+	up_read(&mm->mmap_sem);
+	return npages;
+}
+
 static inline int check_user_page_hwpoison(unsigned long addr)
 {
 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 		npages = get_user_page_nowait(current, current->mm,
 					      addr, write_fault, page);
 		up_read(&current->mm->mmap_sem);
-	} else
-		npages = get_user_pages_fast(addr, 1, write_fault,
-					     page);
+	} else {
+		/*
+		 * By now we have tried gup_fast, and possible async_pf, and we
+		 * are certainly not atomic. Time to retry the gup, allowing
+		 * mmap semaphore to be relinquished in the case of IO.
+		 */
+		npages = kvm_get_user_page_retry(current, current->mm, addr,
+						 write_fault, page);
+	}
 	if (npages != 1)
 		return npages;
 
-- 
2.1.0.rc2.206.gedb03e5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-15 20:11 ` Andres Lagar-Cavilla
@ 2014-09-16 13:51   ` Paolo Bonzini
  -1 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-16 13:51 UTC (permalink / raw)
  To: Andres Lagar-Cavilla, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
> +	if (!locked) {
> +		BUG_ON(npages != -EBUSY);

VM_BUG_ON perhaps?

> @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>  		npages = get_user_page_nowait(current, current->mm,
>  					      addr, write_fault, page);
>  		up_read(&current->mm->mmap_sem);
> -	} else
> -		npages = get_user_pages_fast(addr, 1, write_fault,
> -					     page);
> +	} else {
> +		/*
> +		 * By now we have tried gup_fast, and possible async_pf, and we
> +		 * are certainly not atomic. Time to retry the gup, allowing
> +		 * mmap semaphore to be relinquished in the case of IO.
> +		 */
> +		npages = kvm_get_user_page_retry(current, current->mm, addr,
> +						 write_fault, page);

This is a separate logical change.  Was this:

	down_read(&mm->mmap_sem);
	npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
	up_read(&mm->mmap_sem);

the intention rather than get_user_pages_fast?

I think a first patch should introduce kvm_get_user_page_retry ("Retry a
fault after a gup with FOLL_NOWAIT.") and the second would add
FOLL_TRIED ("This properly relinquishes mmap semaphore if the
filemap/swap has to wait on page lock (and retries the gup to completion
after that").

Apart from this, the patch looks good.  The mm/ parts are minimal, so I
think it's best to merge it through the KVM tree with someone's Acked-by.

Paolo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-16 13:51   ` Paolo Bonzini
  0 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-16 13:51 UTC (permalink / raw)
  To: Andres Lagar-Cavilla, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
> +	if (!locked) {
> +		BUG_ON(npages != -EBUSY);

VM_BUG_ON perhaps?

> @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>  		npages = get_user_page_nowait(current, current->mm,
>  					      addr, write_fault, page);
>  		up_read(&current->mm->mmap_sem);
> -	} else
> -		npages = get_user_pages_fast(addr, 1, write_fault,
> -					     page);
> +	} else {
> +		/*
> +		 * By now we have tried gup_fast, and possible async_pf, and we
> +		 * are certainly not atomic. Time to retry the gup, allowing
> +		 * mmap semaphore to be relinquished in the case of IO.
> +		 */
> +		npages = kvm_get_user_page_retry(current, current->mm, addr,
> +						 write_fault, page);

This is a separate logical change.  Was this:

	down_read(&mm->mmap_sem);
	npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
	up_read(&mm->mmap_sem);

the intention rather than get_user_pages_fast?

I think a first patch should introduce kvm_get_user_page_retry ("Retry a
fault after a gup with FOLL_NOWAIT.") and the second would add
FOLL_TRIED ("This properly relinquishes mmap semaphore if the
filemap/swap has to wait on page lock (and retries the gup to completion
after that").

Apart from this, the patch looks good.  The mm/ parts are minimal, so I
think it's best to merge it through the KVM tree with someone's Acked-by.

Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 13:51   ` Paolo Bonzini
  (?)
@ 2014-09-16 16:52   ` Andres Lagar-Cavilla
  2014-09-16 16:55       ` Andres Lagar-Cavilla
  2014-09-16 18:29       ` Paolo Bonzini
  -1 siblings, 2 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-16 16:52 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

[-- Attachment #1: Type: text/plain, Size: 2651 bytes --]

On Tue, Sep 16, 2014 at 6:51 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
> > +     if (!locked) {
> > +             BUG_ON(npages != -EBUSY);
>
> VM_BUG_ON perhaps?
>
Sure.


> > @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr,
> bool *async, bool write_fault,
> >               npages = get_user_page_nowait(current, current->mm,
> >                                             addr, write_fault, page);
> >               up_read(&current->mm->mmap_sem);
> > -     } else
> > -             npages = get_user_pages_fast(addr, 1, write_fault,
> > -                                          page);
> > +     } else {
> > +             /*
> > +              * By now we have tried gup_fast, and possible async_pf,
> and we
> > +              * are certainly not atomic. Time to retry the gup,
> allowing
> > +              * mmap semaphore to be relinquished in the case of IO.
> > +              */
> > +             npages = kvm_get_user_page_retry(current, current->mm,
> addr,
> > +                                              write_fault, page);
>
> This is a separate logical change.  Was this:
>
>         down_read(&mm->mmap_sem);
>         npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
>         up_read(&mm->mmap_sem);
>
> the intention rather than get_user_pages_fast?
>

Nope. The intention was to pass FAULT_FLAG_RETRY to the vma fault handler
(without _NOWAIT). And once you do that, if you come back without holding
the mmap sem, you need to call yet again.

By that point in the call chain I felt comfortable dropping the _fast. All
paths that get there have already tried _fast (and some have tried _NOWAIT).


> I think a first patch should introduce kvm_get_user_page_retry ("Retry a
> fault after a gup with FOLL_NOWAIT.") and the second would add
> FOLL_TRIED ("This properly relinquishes mmap semaphore if the
> filemap/swap has to wait on page lock (and retries the gup to completion
> after that").
>

That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is
done by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the fault
handler (e.g. filemap) know that we've been there and waited on the IO
already, so in the common case we won't need to redo the IO.

Have a look at how FAULT_FLAG_TRIED is used in e.g. arch/x86/mm/fault.c.


>
> Apart from this, the patch looks good.  The mm/ parts are minimal, so I
> think it's best to merge it through the KVM tree with someone's Acked-by.
>

Thanks!
Andres


>
> Paolo
>



-- 
Andres Lagar-Cavilla | Google Cloud Platform | andreslc@google.com |
 647-778-4380

[-- Attachment #2: Type: text/html, Size: 5021 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 16:52   ` Andres Lagar-Cavilla
@ 2014-09-16 16:55       ` Andres Lagar-Cavilla
  2014-09-16 18:29       ` Paolo Bonzini
  1 sibling, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-16 16:55 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Tue, Sep 16, 2014 at 9:52 AM, Andres Lagar-Cavilla
<andreslc@google.com> wrote:

Apologies to all. Resend as lists rejected my gmail-formatted version.
Now on plain text. Won't happen again.

> On Tue, Sep 16, 2014 at 6:51 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>> Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
>> > +     if (!locked) {
>> > +             BUG_ON(npages != -EBUSY);
>>
>> VM_BUG_ON perhaps?
>
> Sure.
>
>>
>> > @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr,
>> > bool *async, bool write_fault,
>> >               npages = get_user_page_nowait(current, current->mm,
>> >                                             addr, write_fault, page);
>> >               up_read(&current->mm->mmap_sem);
>> > -     } else
>> > -             npages = get_user_pages_fast(addr, 1, write_fault,
>> > -                                          page);
>> > +     } else {
>> > +             /*
>> > +              * By now we have tried gup_fast, and possible async_pf,
>> > and we
>> > +              * are certainly not atomic. Time to retry the gup,
>> > allowing
>> > +              * mmap semaphore to be relinquished in the case of IO.
>> > +              */
>> > +             npages = kvm_get_user_page_retry(current, current->mm,
>> > addr,
>> > +                                              write_fault, page);
>>
>> This is a separate logical change.  Was this:
>>
>>         down_read(&mm->mmap_sem);
>>         npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
>>         up_read(&mm->mmap_sem);
>>
>> the intention rather than get_user_pages_fast?
>
>
> Nope. The intention was to pass FAULT_FLAG_RETRY to the vma fault handler
> (without _NOWAIT). And once you do that, if you come back without holding
> the mmap sem, you need to call yet again.
>
> By that point in the call chain I felt comfortable dropping the _fast. All
> paths that get there have already tried _fast (and some have tried _NOWAIT).
>
>>
>> I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>> fault after a gup with FOLL_NOWAIT.") and the second would add
>> FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>> filemap/swap has to wait on page lock (and retries the gup to completion
>> after that").
>
>
> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is done
> by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the fault
> handler (e.g. filemap) know that we've been there and waited on the IO
> already, so in the common case we won't need to redo the IO.
>
> Have a look at how FAULT_FLAG_TRIED is used in e.g. arch/x86/mm/fault.c.
>
>>
>>
>> Apart from this, the patch looks good.  The mm/ parts are minimal, so I
>> think it's best to merge it through the KVM tree with someone's Acked-by.
>
>
> Thanks!
> Andres
>
>>
>>
>> Paolo
>
>
>
>
> --
> Andres Lagar-Cavilla | Google Cloud Platform | andreslc@google.com |
> 647-778-4380



-- 
Andres Lagar-Cavilla | Google Cloud Platform | andreslc@google.com |
647-778-4380

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-16 16:55       ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-16 16:55 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Tue, Sep 16, 2014 at 9:52 AM, Andres Lagar-Cavilla
<andreslc@google.com> wrote:

Apologies to all. Resend as lists rejected my gmail-formatted version.
Now on plain text. Won't happen again.

> On Tue, Sep 16, 2014 at 6:51 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>> Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
>> > +     if (!locked) {
>> > +             BUG_ON(npages != -EBUSY);
>>
>> VM_BUG_ON perhaps?
>
> Sure.
>
>>
>> > @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr,
>> > bool *async, bool write_fault,
>> >               npages = get_user_page_nowait(current, current->mm,
>> >                                             addr, write_fault, page);
>> >               up_read(&current->mm->mmap_sem);
>> > -     } else
>> > -             npages = get_user_pages_fast(addr, 1, write_fault,
>> > -                                          page);
>> > +     } else {
>> > +             /*
>> > +              * By now we have tried gup_fast, and possible async_pf,
>> > and we
>> > +              * are certainly not atomic. Time to retry the gup,
>> > allowing
>> > +              * mmap semaphore to be relinquished in the case of IO.
>> > +              */
>> > +             npages = kvm_get_user_page_retry(current, current->mm,
>> > addr,
>> > +                                              write_fault, page);
>>
>> This is a separate logical change.  Was this:
>>
>>         down_read(&mm->mmap_sem);
>>         npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
>>         up_read(&mm->mmap_sem);
>>
>> the intention rather than get_user_pages_fast?
>
>
> Nope. The intention was to pass FAULT_FLAG_RETRY to the vma fault handler
> (without _NOWAIT). And once you do that, if you come back without holding
> the mmap sem, you need to call yet again.
>
> By that point in the call chain I felt comfortable dropping the _fast. All
> paths that get there have already tried _fast (and some have tried _NOWAIT).
>
>>
>> I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>> fault after a gup with FOLL_NOWAIT.") and the second would add
>> FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>> filemap/swap has to wait on page lock (and retries the gup to completion
>> after that").
>
>
> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is done
> by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the fault
> handler (e.g. filemap) know that we've been there and waited on the IO
> already, so in the common case we won't need to redo the IO.
>
> Have a look at how FAULT_FLAG_TRIED is used in e.g. arch/x86/mm/fault.c.
>
>>
>>
>> Apart from this, the patch looks good.  The mm/ parts are minimal, so I
>> think it's best to merge it through the KVM tree with someone's Acked-by.
>
>
> Thanks!
> Andres
>
>>
>>
>> Paolo
>
>
>
>
> --
> Andres Lagar-Cavilla | Google Cloud Platform | andreslc@google.com |
> 647-778-4380



-- 
Andres Lagar-Cavilla | Google Cloud Platform | andreslc@google.com |
647-778-4380

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 16:52   ` Andres Lagar-Cavilla
@ 2014-09-16 18:29       ` Paolo Bonzini
  2014-09-16 18:29       ` Paolo Bonzini
  1 sibling, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-16 18:29 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

Il 16/09/2014 18:52, Andres Lagar-Cavilla ha scritto:
> Was this:
> 
>         down_read(&mm->mmap_sem);
>         npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
>         up_read(&mm->mmap_sem);
> 
> the intention rather than get_user_pages_fast?

I meant the intention of the original author, not yours.

> By that point in the call chain I felt comfortable dropping the _fast.
> All paths that get there have already tried _fast (and some have tried
> _NOWAIT).

Yes, understood.

>     I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>     fault after a gup with FOLL_NOWAIT.") and the second would add
>     FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>     filemap/swap has to wait on page lock (and retries the gup to completion
>     after that").
> 
> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is
> done by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the
> fault handler (e.g. filemap) know that we've been there and waited on
> the IO already, so in the common case we won't need to redo the IO.

Yes, that's not what FOLL_TRIED does.  But it's the difference between
get_user_pages and kvm_get_user_page_retry, right?

Paolo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-16 18:29       ` Paolo Bonzini
  0 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-16 18:29 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

Il 16/09/2014 18:52, Andres Lagar-Cavilla ha scritto:
> Was this:
> 
>         down_read(&mm->mmap_sem);
>         npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
>         up_read(&mm->mmap_sem);
> 
> the intention rather than get_user_pages_fast?

I meant the intention of the original author, not yours.

> By that point in the call chain I felt comfortable dropping the _fast.
> All paths that get there have already tried _fast (and some have tried
> _NOWAIT).

Yes, understood.

>     I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>     fault after a gup with FOLL_NOWAIT.") and the second would add
>     FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>     filemap/swap has to wait on page lock (and retries the gup to completion
>     after that").
> 
> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is
> done by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the
> fault handler (e.g. filemap) know that we've been there and waited on
> the IO already, so in the common case we won't need to redo the IO.

Yes, that's not what FOLL_TRIED does.  But it's the difference between
get_user_pages and kvm_get_user_page_retry, right?

Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 18:29       ` Paolo Bonzini
@ 2014-09-16 18:42         ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-16 18:42 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Tue, Sep 16, 2014 at 11:29 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 16/09/2014 18:52, Andres Lagar-Cavilla ha scritto:
>> Was this:
>>
>>         down_read(&mm->mmap_sem);
>>         npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
>>         up_read(&mm->mmap_sem);
>>
>> the intention rather than get_user_pages_fast?
>
> I meant the intention of the original author, not yours.

Yes, in all likelihood. I hope!

>
>> By that point in the call chain I felt comfortable dropping the _fast.
>> All paths that get there have already tried _fast (and some have tried
>> _NOWAIT).
>
> Yes, understood.
>
>>     I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>>     fault after a gup with FOLL_NOWAIT.") and the second would add
>>     FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>>     filemap/swap has to wait on page lock (and retries the gup to completion
>>     after that").
>>
>> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is
>> done by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the
>> fault handler (e.g. filemap) know that we've been there and waited on
>> the IO already, so in the common case we won't need to redo the IO.
>
> Yes, that's not what FOLL_TRIED does.  But it's the difference between
> get_user_pages and kvm_get_user_page_retry, right?

Unfortunately get_user_pages does not expose the param (int
*nonblocking) that __gup will use to set FAULT_FLAG_ALLOW_RETRY. So
that's one difference. The second difference is that kvm_gup_retry
will call two times if necessary (the second without _RETRY but with
_TRIED).

Thanks
Andres
>
> Paolo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-16 18:42         ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-16 18:42 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Tue, Sep 16, 2014 at 11:29 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 16/09/2014 18:52, Andres Lagar-Cavilla ha scritto:
>> Was this:
>>
>>         down_read(&mm->mmap_sem);
>>         npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
>>         up_read(&mm->mmap_sem);
>>
>> the intention rather than get_user_pages_fast?
>
> I meant the intention of the original author, not yours.

Yes, in all likelihood. I hope!

>
>> By that point in the call chain I felt comfortable dropping the _fast.
>> All paths that get there have already tried _fast (and some have tried
>> _NOWAIT).
>
> Yes, understood.
>
>>     I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>>     fault after a gup with FOLL_NOWAIT.") and the second would add
>>     FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>>     filemap/swap has to wait on page lock (and retries the gup to completion
>>     after that").
>>
>> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is
>> done by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the
>> fault handler (e.g. filemap) know that we've been there and waited on
>> the IO already, so in the common case we won't need to redo the IO.
>
> Yes, that's not what FOLL_TRIED does.  But it's the difference between
> get_user_pages and kvm_get_user_page_retry, right?

Unfortunately get_user_pages does not expose the param (int
*nonblocking) that __gup will use to set FAULT_FLAG_ALLOW_RETRY. So
that's one difference. The second difference is that kvm_gup_retry
will call two times if necessary (the second without _RETRY but with
_TRIED).

Thanks
Andres
>
> Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 13:51   ` Paolo Bonzini
  (?)
@ 2014-09-16 20:51     ` Radim Krčmář
  -1 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-16 20:51 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Andres Lagar-Cavilla, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

2014-09-15 13:11-0700, Andres Lagar-Cavilla:
> +int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,

The suffix '_retry' is not best suited for this.
On first reading, I imagined we will be retrying something from before,
possibly calling it in a loop, but we are actually doing the first and
last try in one call.

Hard to find something that conveys our lock-dropping mechanic,
'_polite' is my best candidate at the moment.

> +	int flags = FOLL_TOUCH | FOLL_HWPOISON |

(FOLL_HWPOISON wasn't used before, but it's harmless.)

2014-09-16 15:51+0200, Paolo Bonzini:
> Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
> > @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> >  		npages = get_user_page_nowait(current, current->mm,
> >  					      addr, write_fault, page);
> >  		up_read(&current->mm->mmap_sem);
> > -	} else
> > -		npages = get_user_pages_fast(addr, 1, write_fault,
> > -					     page);
> > +	} else {
> > +		/*
> > +		 * By now we have tried gup_fast, and possible async_pf, and we
                                        ^
(If we really tried get_user_pages_fast, we wouldn't be here, so I'd
 prepend two underscores here as well.)

> > +		 * are certainly not atomic. Time to retry the gup, allowing
> > +		 * mmap semaphore to be relinquished in the case of IO.
> > +		 */
> > +		npages = kvm_get_user_page_retry(current, current->mm, addr,
> > +						 write_fault, page);
> 
> This is a separate logical change.  Was this:
> 
> 	down_read(&mm->mmap_sem);
> 	npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
> 	up_read(&mm->mmap_sem);
> 
> the intention rather than get_user_pages_fast?

I believe so as well.

(Looking at get_user_pages_fast and __get_user_pages_fast made my
 abstraction detector very sad.)

> I think a first patch should introduce kvm_get_user_page_retry ("Retry a
> fault after a gup with FOLL_NOWAIT.") and the second would add
> FOLL_TRIED ("This properly relinquishes mmap semaphore if the
> filemap/swap has to wait on page lock (and retries the gup to completion
> after that").

Not sure if that would help to understand the goal ...

> Apart from this, the patch looks good.  The mm/ parts are minimal, so I
> think it's best to merge it through the KVM tree with someone's Acked-by.

I would prefer to have the last hunk in a separate patch, but still,

Acked-by: Radim Krčmář <rkrcmar@redhat.com>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-16 20:51     ` Radim Krčmář
  0 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-16 20:51 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Andres Lagar-Cavilla, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

2014-09-15 13:11-0700, Andres Lagar-Cavilla:
> +int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,

The suffix '_retry' is not best suited for this.
On first reading, I imagined we will be retrying something from before,
possibly calling it in a loop, but we are actually doing the first and
last try in one call.

Hard to find something that conveys our lock-dropping mechanic,
'_polite' is my best candidate at the moment.

> +	int flags = FOLL_TOUCH | FOLL_HWPOISON |

(FOLL_HWPOISON wasn't used before, but it's harmless.)

2014-09-16 15:51+0200, Paolo Bonzini:
> Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
> > @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> >  		npages = get_user_page_nowait(current, current->mm,
> >  					      addr, write_fault, page);
> >  		up_read(&current->mm->mmap_sem);
> > -	} else
> > -		npages = get_user_pages_fast(addr, 1, write_fault,
> > -					     page);
> > +	} else {
> > +		/*
> > +		 * By now we have tried gup_fast, and possible async_pf, and we
                                        ^
(If we really tried get_user_pages_fast, we wouldn't be here, so I'd
 prepend two underscores here as well.)

> > +		 * are certainly not atomic. Time to retry the gup, allowing
> > +		 * mmap semaphore to be relinquished in the case of IO.
> > +		 */
> > +		npages = kvm_get_user_page_retry(current, current->mm, addr,
> > +						 write_fault, page);
> 
> This is a separate logical change.  Was this:
> 
> 	down_read(&mm->mmap_sem);
> 	npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
> 	up_read(&mm->mmap_sem);
> 
> the intention rather than get_user_pages_fast?

I believe so as well.

(Looking at get_user_pages_fast and __get_user_pages_fast made my
 abstraction detector very sad.)

> I think a first patch should introduce kvm_get_user_page_retry ("Retry a
> fault after a gup with FOLL_NOWAIT.") and the second would add
> FOLL_TRIED ("This properly relinquishes mmap semaphore if the
> filemap/swap has to wait on page lock (and retries the gup to completion
> after that").

Not sure if that would help to understand the goal ...

> Apart from this, the patch looks good.  The mm/ parts are minimal, so I
> think it's best to merge it through the KVM tree with someone's Acked-by.

I would prefer to have the last hunk in a separate patch, but still,

Acked-by: Radim Krčmář <rkrcmar@redhat.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-16 20:51     ` Radim Krčmář
  0 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-16 20:51 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Andres Lagar-Cavilla, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

2014-09-15 13:11-0700, Andres Lagar-Cavilla:
> +int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,

The suffix '_retry' is not best suited for this.
On first reading, I imagined we will be retrying something from before,
possibly calling it in a loop, but we are actually doing the first and
last try in one call.

Hard to find something that conveys our lock-dropping mechanic,
'_polite' is my best candidate at the moment.

> +	int flags = FOLL_TOUCH | FOLL_HWPOISON |

(FOLL_HWPOISON wasn't used before, but it's harmless.)

2014-09-16 15:51+0200, Paolo Bonzini:
> Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
> > @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> >  		npages = get_user_page_nowait(current, current->mm,
> >  					      addr, write_fault, page);
> >  		up_read(&current->mm->mmap_sem);
> > -	} else
> > -		npages = get_user_pages_fast(addr, 1, write_fault,
> > -					     page);
> > +	} else {
> > +		/*
> > +		 * By now we have tried gup_fast, and possible async_pf, and we
                                        ^
(If we really tried get_user_pages_fast, we wouldn't be here, so I'd
 prepend two underscores here as well.)

> > +		 * are certainly not atomic. Time to retry the gup, allowing
> > +		 * mmap semaphore to be relinquished in the case of IO.
> > +		 */
> > +		npages = kvm_get_user_page_retry(current, current->mm, addr,
> > +						 write_fault, page);
> 
> This is a separate logical change.  Was this:
> 
> 	down_read(&mm->mmap_sem);
> 	npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
> 	up_read(&mm->mmap_sem);
> 
> the intention rather than get_user_pages_fast?

I believe so as well.

(Looking at get_user_pages_fast and __get_user_pages_fast made my
 abstraction detector very sad.)

> I think a first patch should introduce kvm_get_user_page_retry ("Retry a
> fault after a gup with FOLL_NOWAIT.") and the second would add
> FOLL_TRIED ("This properly relinquishes mmap semaphore if the
> filemap/swap has to wait on page lock (and retries the gup to completion
> after that").

Not sure if that would help to understand the goal ...

> Apart from this, the patch looks good.  The mm/ parts are minimal, so I
> think it's best to merge it through the KVM tree with someone's Acked-by.

I would prefer to have the last hunk in a separate patch, but still,

Acked-by: Radim Krčmář <rkrcmar@redhat.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 20:51     ` Radim Krčmář
@ 2014-09-16 21:01       ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-16 21:01 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Paolo Bonzini, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

On Tue, Sep 16, 2014 at 1:51 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
>> +int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
>
> The suffix '_retry' is not best suited for this.
> On first reading, I imagined we will be retrying something from before,
> possibly calling it in a loop, but we are actually doing the first and
> last try in one call.

We are doing ... the second and third in most scenarios. async_pf did
the first with _NOWAIT. We call this from the async pf retrier, or if
async pf couldn't be notified to the guest.

>
> Hard to find something that conveys our lock-dropping mechanic,
> '_polite' is my best candidate at the moment.

I'm at a loss towards finding a better name than '_retry'.

>
>> +     int flags = FOLL_TOUCH | FOLL_HWPOISON |
>
> (FOLL_HWPOISON wasn't used before, but it's harmless.)

Ok. Wasn't 100% sure TBH.

>
> 2014-09-16 15:51+0200, Paolo Bonzini:
>> Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
>> > @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>> >             npages = get_user_page_nowait(current, current->mm,
>> >                                           addr, write_fault, page);
>> >             up_read(&current->mm->mmap_sem);
>> > -   } else
>> > -           npages = get_user_pages_fast(addr, 1, write_fault,
>> > -                                        page);
>> > +   } else {
>> > +           /*
>> > +            * By now we have tried gup_fast, and possible async_pf, and we
>                                         ^
> (If we really tried get_user_pages_fast, we wouldn't be here, so I'd
>  prepend two underscores here as well.)

Yes, async pf tries and fails to do fast, and then we fallback to
slow, and so on.

>
>> > +            * are certainly not atomic. Time to retry the gup, allowing
>> > +            * mmap semaphore to be relinquished in the case of IO.
>> > +            */
>> > +           npages = kvm_get_user_page_retry(current, current->mm, addr,
>> > +                                            write_fault, page);
>>
>> This is a separate logical change.  Was this:
>>
>>       down_read(&mm->mmap_sem);
>>       npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
>>       up_read(&mm->mmap_sem);
>>
>> the intention rather than get_user_pages_fast?
>
> I believe so as well.
>
> (Looking at get_user_pages_fast and __get_user_pages_fast made my
>  abstraction detector very sad.)

It's clunky, but a separate battle.

>
>> I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>> fault after a gup with FOLL_NOWAIT.") and the second would add
>> FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>> filemap/swap has to wait on page lock (and retries the gup to completion
>> after that").
>
> Not sure if that would help to understand the goal ...
>
>> Apart from this, the patch looks good.  The mm/ parts are minimal, so I
>> think it's best to merge it through the KVM tree with someone's Acked-by.
>
> I would prefer to have the last hunk in a separate patch, but still,
>
> Acked-by: Radim Krčmář <rkrcmar@redhat.com>
Awesome, thanks much.

I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
else from this email should go into the recut.

Andres


-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-16 21:01       ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-16 21:01 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Paolo Bonzini, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

On Tue, Sep 16, 2014 at 1:51 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
>> +int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
>
> The suffix '_retry' is not best suited for this.
> On first reading, I imagined we will be retrying something from before,
> possibly calling it in a loop, but we are actually doing the first and
> last try in one call.

We are doing ... the second and third in most scenarios. async_pf did
the first with _NOWAIT. We call this from the async pf retrier, or if
async pf couldn't be notified to the guest.

>
> Hard to find something that conveys our lock-dropping mechanic,
> '_polite' is my best candidate at the moment.

I'm at a loss towards finding a better name than '_retry'.

>
>> +     int flags = FOLL_TOUCH | FOLL_HWPOISON |
>
> (FOLL_HWPOISON wasn't used before, but it's harmless.)

Ok. Wasn't 100% sure TBH.

>
> 2014-09-16 15:51+0200, Paolo Bonzini:
>> Il 15/09/2014 22:11, Andres Lagar-Cavilla ha scritto:
>> > @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>> >             npages = get_user_page_nowait(current, current->mm,
>> >                                           addr, write_fault, page);
>> >             up_read(&current->mm->mmap_sem);
>> > -   } else
>> > -           npages = get_user_pages_fast(addr, 1, write_fault,
>> > -                                        page);
>> > +   } else {
>> > +           /*
>> > +            * By now we have tried gup_fast, and possible async_pf, and we
>                                         ^
> (If we really tried get_user_pages_fast, we wouldn't be here, so I'd
>  prepend two underscores here as well.)

Yes, async pf tries and fails to do fast, and then we fallback to
slow, and so on.

>
>> > +            * are certainly not atomic. Time to retry the gup, allowing
>> > +            * mmap semaphore to be relinquished in the case of IO.
>> > +            */
>> > +           npages = kvm_get_user_page_retry(current, current->mm, addr,
>> > +                                            write_fault, page);
>>
>> This is a separate logical change.  Was this:
>>
>>       down_read(&mm->mmap_sem);
>>       npages = get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
>>       up_read(&mm->mmap_sem);
>>
>> the intention rather than get_user_pages_fast?
>
> I believe so as well.
>
> (Looking at get_user_pages_fast and __get_user_pages_fast made my
>  abstraction detector very sad.)

It's clunky, but a separate battle.

>
>> I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>> fault after a gup with FOLL_NOWAIT.") and the second would add
>> FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>> filemap/swap has to wait on page lock (and retries the gup to completion
>> after that").
>
> Not sure if that would help to understand the goal ...
>
>> Apart from this, the patch looks good.  The mm/ parts are minimal, so I
>> think it's best to merge it through the KVM tree with someone's Acked-by.
>
> I would prefer to have the last hunk in a separate patch, but still,
>
> Acked-by: Radim Krčmář <rkrcmar@redhat.com>
Awesome, thanks much.

I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
else from this email should go into the recut.

Andres


-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 21:01       ` Andres Lagar-Cavilla
  (?)
@ 2014-09-16 22:34         ` Radim Krčmář
  -1 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-16 22:34 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Paolo Bonzini, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, kvm, linux-kernel, linux-mm

[Emergency posting to fix the tag and couldn't find unmangled Cc list,
 so some recipients were dropped, sorry.  (I guess you are glad though).]

2014-09-16 14:01-0700, Andres Lagar-Cavilla:
> On Tue, Sep 16, 2014 at 1:51 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> > 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
> >> +int kvm_get_user_page_retry(struct task_struct *tsk, struct
> >> mm_struct *mm,
> >
> > The suffix '_retry' is not best suited for this.
> > On first reading, I imagined we will be retrying something from
> > before,
> > possibly calling it in a loop, but we are actually doing the first and
> > last try in one call.
> 
> We are doing ... the second and third in most scenarios. async_pf did
> the first with _NOWAIT. We call this from the async pf retrier, or if
> async pf couldn't be notified to the guest.

I was thinking more about what the function does, not how we currently
use it -- nothing prevents us from using it as first somewhere -- but
yeah, even comments would be off then.

> >> Apart from this, the patch looks good.  The mm/ parts are minimal, so
> >> I
> >> think it's best to merge it through the KVM tree with someone's
> >> Acked-by.
> >
> > I would prefer to have the last hunk in a separate patch, but still,
> >
> > Acked-by: Radim Krčmář <rkrcmar@redhat.com>
> 
> Awesome, thanks much.
> 
> I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
> else from this email should go into the recut.

Ah, sorry, I'm not maintaining mm ... what I meant was

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

and I had to leave before I could find a good apology for
VM_WARN_ON_ONCE(), so if you are replacing BUG_ON, you might want to
look at that one as well.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-16 22:34         ` Radim Krčmář
  0 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-16 22:34 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Paolo Bonzini, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, kvm, linux-kernel, linux-mm

[Emergency posting to fix the tag and couldn't find unmangled Cc list,
 so some recipients were dropped, sorry.  (I guess you are glad though).]

2014-09-16 14:01-0700, Andres Lagar-Cavilla:
> On Tue, Sep 16, 2014 at 1:51 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> > 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
> >> +int kvm_get_user_page_retry(struct task_struct *tsk, struct
> >> mm_struct *mm,
> >
> > The suffix '_retry' is not best suited for this.
> > On first reading, I imagined we will be retrying something from
> > before,
> > possibly calling it in a loop, but we are actually doing the first and
> > last try in one call.
> 
> We are doing ... the second and third in most scenarios. async_pf did
> the first with _NOWAIT. We call this from the async pf retrier, or if
> async pf couldn't be notified to the guest.

I was thinking more about what the function does, not how we currently
use it -- nothing prevents us from using it as first somewhere -- but
yeah, even comments would be off then.

> >> Apart from this, the patch looks good.  The mm/ parts are minimal, so
> >> I
> >> think it's best to merge it through the KVM tree with someone's
> >> Acked-by.
> >
> > I would prefer to have the last hunk in a separate patch, but still,
> >
> > Acked-by: Radim Krčmář <rkrcmar@redhat.com>
> 
> Awesome, thanks much.
> 
> I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
> else from this email should go into the recut.

Ah, sorry, I'm not maintaining mm ... what I meant was

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

and I had to leave before I could find a good apology for
VM_WARN_ON_ONCE(), so if you are replacing BUG_ON, you might want to
look at that one as well.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-16 22:34         ` Radim Krčmář
  0 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-16 22:34 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Paolo Bonzini, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, kvm, linux-kernel, linux-mm

[Emergency posting to fix the tag and couldn't find unmangled Cc list,
 so some recipients were dropped, sorry.  (I guess you are glad though).]

2014-09-16 14:01-0700, Andres Lagar-Cavilla:
> On Tue, Sep 16, 2014 at 1:51 PM, Radim KrA?mA!A? <rkrcmar@redhat.com> wrote:
> > 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
> >> +int kvm_get_user_page_retry(struct task_struct *tsk, struct
> >> mm_struct *mm,
> >
> > The suffix '_retry' is not best suited for this.
> > On first reading, I imagined we will be retrying something from
> > before,
> > possibly calling it in a loop, but we are actually doing the first and
> > last try in one call.
> 
> We are doing ... the second and third in most scenarios. async_pf did
> the first with _NOWAIT. We call this from the async pf retrier, or if
> async pf couldn't be notified to the guest.

I was thinking more about what the function does, not how we currently
use it -- nothing prevents us from using it as first somewhere -- but
yeah, even comments would be off then.

> >> Apart from this, the patch looks good.  The mm/ parts are minimal, so
> >> I
> >> think it's best to merge it through the KVM tree with someone's
> >> Acked-by.
> >
> > I would prefer to have the last hunk in a separate patch, but still,
> >
> > Acked-by: Radim Krčmář <rkrcmar@redhat.com>
> 
> Awesome, thanks much.
> 
> I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
> else from this email should go into the recut.

Ah, sorry, I'm not maintaining mm ... what I meant was

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

and I had to leave before I could find a good apology for
VM_WARN_ON_ONCE(), so if you are replacing BUG_ON, you might want to
look at that one as well.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 22:34         ` Radim Krčmář
@ 2014-09-17  4:15           ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17  4:15 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Paolo Bonzini, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, kvm, linux-kernel, linux-mm

On Tue, Sep 16, 2014 at 3:34 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> [Emergency posting to fix the tag and couldn't find unmangled Cc list,
>  so some recipients were dropped, sorry.  (I guess you are glad though).]
>
> 2014-09-16 14:01-0700, Andres Lagar-Cavilla:
>> On Tue, Sep 16, 2014 at 1:51 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>> > 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
>> >> +int kvm_get_user_page_retry(struct task_struct *tsk, struct
>> >> mm_struct *mm,
>> >
>> > The suffix '_retry' is not best suited for this.
>> > On first reading, I imagined we will be retrying something from
>> > before,
>> > possibly calling it in a loop, but we are actually doing the first and
>> > last try in one call.
>>
>> We are doing ... the second and third in most scenarios. async_pf did
>> the first with _NOWAIT. We call this from the async pf retrier, or if
>> async pf couldn't be notified to the guest.
>
> I was thinking more about what the function does, not how we currently
> use it -- nothing prevents us from using it as first somewhere -- but
> yeah, even comments would be off then.
>

Good point. Happy to expand comments. What about _complete? _io? _full?

>> >> Apart from this, the patch looks good.  The mm/ parts are minimal, so
>> >> I
>> >> think it's best to merge it through the KVM tree with someone's
>> >> Acked-by.
>> >
>> > I would prefer to have the last hunk in a separate patch, but still,
>> >
>> > Acked-by: Radim Krčmář <rkrcmar@redhat.com>
>>
>> Awesome, thanks much.
>>
>> I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
>> else from this email should go into the recut.
>
> Ah, sorry, I'm not maintaining mm ... what I meant was
>
> Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

Cool cool cool
Andres

>
> and I had to leave before I could find a good apology for
> VM_WARN_ON_ONCE(), so if you are replacing BUG_ON, you might want to
> look at that one as well.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17  4:15           ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17  4:15 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Paolo Bonzini, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, kvm, linux-kernel, linux-mm

On Tue, Sep 16, 2014 at 3:34 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> [Emergency posting to fix the tag and couldn't find unmangled Cc list,
>  so some recipients were dropped, sorry.  (I guess you are glad though).]
>
> 2014-09-16 14:01-0700, Andres Lagar-Cavilla:
>> On Tue, Sep 16, 2014 at 1:51 PM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>> > 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
>> >> +int kvm_get_user_page_retry(struct task_struct *tsk, struct
>> >> mm_struct *mm,
>> >
>> > The suffix '_retry' is not best suited for this.
>> > On first reading, I imagined we will be retrying something from
>> > before,
>> > possibly calling it in a loop, but we are actually doing the first and
>> > last try in one call.
>>
>> We are doing ... the second and third in most scenarios. async_pf did
>> the first with _NOWAIT. We call this from the async pf retrier, or if
>> async pf couldn't be notified to the guest.
>
> I was thinking more about what the function does, not how we currently
> use it -- nothing prevents us from using it as first somewhere -- but
> yeah, even comments would be off then.
>

Good point. Happy to expand comments. What about _complete? _io? _full?

>> >> Apart from this, the patch looks good.  The mm/ parts are minimal, so
>> >> I
>> >> think it's best to merge it through the KVM tree with someone's
>> >> Acked-by.
>> >
>> > I would prefer to have the last hunk in a separate patch, but still,
>> >
>> > Acked-by: Radim Krčmář <rkrcmar@redhat.com>
>>
>> Awesome, thanks much.
>>
>> I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
>> else from this email should go into the recut.
>
> Ah, sorry, I'm not maintaining mm ... what I meant was
>
> Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

Cool cool cool
Andres

>
> and I had to leave before I could find a good apology for
> VM_WARN_ON_ONCE(), so if you are replacing BUG_ON, you might want to
> look at that one as well.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 18:42         ` Andres Lagar-Cavilla
@ 2014-09-17  7:43           ` Paolo Bonzini
  -1 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-17  7:43 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

Il 16/09/2014 20:42, Andres Lagar-Cavilla ha scritto:
> On Tue, Sep 16, 2014 at 11:29 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>     I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>>>     fault after a gup with FOLL_NOWAIT.") and the second would add
>>>     FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>>>     filemap/swap has to wait on page lock (and retries the gup to completion
>>>     after that").
>>>
>>> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is
>>> done by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the
>>> fault handler (e.g. filemap) know that we've been there and waited on
>>> the IO already, so in the common case we won't need to redo the IO.
>>
>> Yes, that's not what FOLL_TRIED does.  But it's the difference between
>> get_user_pages and kvm_get_user_page_retry, right?
> 
> Unfortunately get_user_pages does not expose the param (int
> *nonblocking) that __gup will use to set FAULT_FLAG_ALLOW_RETRY. So
> that's one difference. The second difference is that kvm_gup_retry
> will call two times if necessary (the second without _RETRY but with
> _TRIED).

Yeah, that's how it is in your patch.  I can see that.

What I'm saying is that your patch is two changes in one:

1) do not use gup_fast in hva_to_pfn_slow, instead use gup as in
async_pf_execute.  This change can already introduce a function called
kvm_get_user_page_retry, and can already use it in async_pf_execute and
hva_to_pfn_slow

2) introduce the two-phase RETRY + TRIED mechanism in
kvm_get_user_page_retry, so that the mmap semaphore is relinquished
properly if the filemap or swap has to wait on the page lock.

I would prefer to split it in two patches.  Is it clearer now?

Paolo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17  7:43           ` Paolo Bonzini
  0 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-17  7:43 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

Il 16/09/2014 20:42, Andres Lagar-Cavilla ha scritto:
> On Tue, Sep 16, 2014 at 11:29 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>     I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>>>     fault after a gup with FOLL_NOWAIT.") and the second would add
>>>     FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>>>     filemap/swap has to wait on page lock (and retries the gup to completion
>>>     after that").
>>>
>>> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is
>>> done by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the
>>> fault handler (e.g. filemap) know that we've been there and waited on
>>> the IO already, so in the common case we won't need to redo the IO.
>>
>> Yes, that's not what FOLL_TRIED does.  But it's the difference between
>> get_user_pages and kvm_get_user_page_retry, right?
> 
> Unfortunately get_user_pages does not expose the param (int
> *nonblocking) that __gup will use to set FAULT_FLAG_ALLOW_RETRY. So
> that's one difference. The second difference is that kvm_gup_retry
> will call two times if necessary (the second without _RETRY but with
> _TRIED).

Yeah, that's how it is in your patch.  I can see that.

What I'm saying is that your patch is two changes in one:

1) do not use gup_fast in hva_to_pfn_slow, instead use gup as in
async_pf_execute.  This change can already introduce a function called
kvm_get_user_page_retry, and can already use it in async_pf_execute and
hva_to_pfn_slow

2) introduce the two-phase RETRY + TRIED mechanism in
kvm_get_user_page_retry, so that the mmap semaphore is relinquished
properly if the filemap or swap has to wait on the page lock.

I would prefer to split it in two patches.  Is it clearer now?

Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-15 20:11 ` Andres Lagar-Cavilla
@ 2014-09-17 10:26   ` Gleb Natapov
  -1 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-17 10:26 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Mon, Sep 15, 2014 at 01:11:25PM -0700, Andres Lagar-Cavilla wrote:
> When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory has been
> swapped out or is behind a filemap, this will trigger async readahead and
> return immediately. The rationale is that KVM will kick back the guest with an
> "async page fault" and allow for some other guest process to take over.
> 
> If async PFs are enabled the fault is retried asap from a workqueue, or
> immediately if no async PFs. The retry will not relinquish the mmap semaphore
> and will block on the IO. This is a bad thing, as other mmap semaphore users
> now stall. The fault could take a long time, depending on swap or filemap
> latency.
> 
> This patch ensures both the regular and async PF path re-enter the fault
> allowing for the mmap semaphore to be relinquished in the case of IO wait.
> 
> Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>
> ---
>  include/linux/kvm_host.h |  9 +++++++++
>  include/linux/mm.h       |  1 +
>  mm/gup.c                 |  4 ++++
>  virt/kvm/async_pf.c      |  4 +---
>  virt/kvm/kvm_main.c      | 45 ++++++++++++++++++++++++++++++++++++++++++---
>  5 files changed, 57 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3addcbc..704908d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -198,6 +198,15 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
>  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  #endif
>  
> +/*
> + * Retry a fault after a gup with FOLL_NOWAIT. This properly relinquishes mmap
> + * semaphore if the filemap/swap has to wait on page lock (and retries the gup
> + * to completion after that).
> + */
> +int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long addr, bool write_fault,
> +			    struct page **pagep);
> +
>  enum {
>  	OUTSIDE_GUEST_MODE,
>  	IN_GUEST_MODE,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ebc5f90..13e585f7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2011,6 +2011,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
>  #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
>  #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
>  #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
> +#define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
>  
>  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>  			void *data);
> diff --git a/mm/gup.c b/mm/gup.c
> index 91d044b..332d1c3 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -281,6 +281,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
>  		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
>  	if (*flags & FOLL_NOWAIT)
>  		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
> +	if (*flags & FOLL_TRIED) {
> +		WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
> +		fault_flags |= FAULT_FLAG_TRIED;
> +	}
>  
>  	ret = handle_mm_fault(mm, vma, address, fault_flags);
>  	if (ret & VM_FAULT_ERROR) {
> diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
> index d6a3d09..17b78b1 100644
> --- a/virt/kvm/async_pf.c
> +++ b/virt/kvm/async_pf.c
> @@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
>  
>  	might_sleep();
>  
> -	down_read(&mm->mmap_sem);
> -	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
> -	up_read(&mm->mmap_sem);
> +	kvm_get_user_page_retry(NULL, mm, addr, 1, NULL);
>  	kvm_async_page_present_sync(vcpu, apf);
>  
>  	spin_lock(&vcpu->async_pf.lock);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 7ef6b48..43a9ab9 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1115,6 +1115,39 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
>  	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
>  }
>  
> +int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
> +				unsigned long addr, bool write_fault,
> +				struct page **pagep)
> +{
> +	int npages;
> +	int locked = 1;
> +	int flags = FOLL_TOUCH | FOLL_HWPOISON |
> +		    (pagep ? FOLL_GET : 0) |
> +		    (write_fault ? FOLL_WRITE : 0);
> +
> +	/*
> +	 * Retrying fault, we get here *not* having allowed the filemap to wait
> +	 * on the page lock. We should now allow waiting on the IO with the
> +	 * mmap semaphore released.
> +	 */
> +	down_read(&mm->mmap_sem);
> +	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
> +				  &locked);
> +	if (!locked) {
> +		BUG_ON(npages != -EBUSY);
> +		/*
> +		 * The previous call has now waited on the IO. Now we can
> +		 * retry and complete. Pass TRIED to ensure we do not re
> +		 * schedule async IO (see e.g. filemap_fault).
> +		 */
> +		down_read(&mm->mmap_sem);
> +		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
> +					  pagep, NULL, NULL);
For async_pf_execute() you do not need to even retry. Next guest's page fault
will retry it for you.

> +	}
> +	up_read(&mm->mmap_sem);
> +	return npages;
> +}
> +
>  static inline int check_user_page_hwpoison(unsigned long addr)
>  {
>  	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
> @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>  		npages = get_user_page_nowait(current, current->mm,
>  					      addr, write_fault, page);
>  		up_read(&current->mm->mmap_sem);
> -	} else
> -		npages = get_user_pages_fast(addr, 1, write_fault,
> -					     page);
> +	} else {
> +		/*
> +		 * By now we have tried gup_fast, and possible async_pf, and we
> +		 * are certainly not atomic. Time to retry the gup, allowing
> +		 * mmap semaphore to be relinquished in the case of IO.
> +		 */
> +		npages = kvm_get_user_page_retry(current, current->mm, addr,
> +						 write_fault, page);
> +	}
>  	if (npages != 1)
>  		return npages;
>  
> -- 
> 2.1.0.rc2.206.gedb03e5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
			Gleb.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 10:26   ` Gleb Natapov
  0 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-17 10:26 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Mon, Sep 15, 2014 at 01:11:25PM -0700, Andres Lagar-Cavilla wrote:
> When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory has been
> swapped out or is behind a filemap, this will trigger async readahead and
> return immediately. The rationale is that KVM will kick back the guest with an
> "async page fault" and allow for some other guest process to take over.
> 
> If async PFs are enabled the fault is retried asap from a workqueue, or
> immediately if no async PFs. The retry will not relinquish the mmap semaphore
> and will block on the IO. This is a bad thing, as other mmap semaphore users
> now stall. The fault could take a long time, depending on swap or filemap
> latency.
> 
> This patch ensures both the regular and async PF path re-enter the fault
> allowing for the mmap semaphore to be relinquished in the case of IO wait.
> 
> Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>
> ---
>  include/linux/kvm_host.h |  9 +++++++++
>  include/linux/mm.h       |  1 +
>  mm/gup.c                 |  4 ++++
>  virt/kvm/async_pf.c      |  4 +---
>  virt/kvm/kvm_main.c      | 45 ++++++++++++++++++++++++++++++++++++++++++---
>  5 files changed, 57 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3addcbc..704908d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -198,6 +198,15 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
>  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  #endif
>  
> +/*
> + * Retry a fault after a gup with FOLL_NOWAIT. This properly relinquishes mmap
> + * semaphore if the filemap/swap has to wait on page lock (and retries the gup
> + * to completion after that).
> + */
> +int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
> +			    unsigned long addr, bool write_fault,
> +			    struct page **pagep);
> +
>  enum {
>  	OUTSIDE_GUEST_MODE,
>  	IN_GUEST_MODE,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ebc5f90..13e585f7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2011,6 +2011,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
>  #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
>  #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
>  #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
> +#define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
>  
>  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>  			void *data);
> diff --git a/mm/gup.c b/mm/gup.c
> index 91d044b..332d1c3 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -281,6 +281,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
>  		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
>  	if (*flags & FOLL_NOWAIT)
>  		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
> +	if (*flags & FOLL_TRIED) {
> +		WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
> +		fault_flags |= FAULT_FLAG_TRIED;
> +	}
>  
>  	ret = handle_mm_fault(mm, vma, address, fault_flags);
>  	if (ret & VM_FAULT_ERROR) {
> diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
> index d6a3d09..17b78b1 100644
> --- a/virt/kvm/async_pf.c
> +++ b/virt/kvm/async_pf.c
> @@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
>  
>  	might_sleep();
>  
> -	down_read(&mm->mmap_sem);
> -	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
> -	up_read(&mm->mmap_sem);
> +	kvm_get_user_page_retry(NULL, mm, addr, 1, NULL);
>  	kvm_async_page_present_sync(vcpu, apf);
>  
>  	spin_lock(&vcpu->async_pf.lock);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 7ef6b48..43a9ab9 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1115,6 +1115,39 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
>  	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
>  }
>  
> +int kvm_get_user_page_retry(struct task_struct *tsk, struct mm_struct *mm,
> +				unsigned long addr, bool write_fault,
> +				struct page **pagep)
> +{
> +	int npages;
> +	int locked = 1;
> +	int flags = FOLL_TOUCH | FOLL_HWPOISON |
> +		    (pagep ? FOLL_GET : 0) |
> +		    (write_fault ? FOLL_WRITE : 0);
> +
> +	/*
> +	 * Retrying fault, we get here *not* having allowed the filemap to wait
> +	 * on the page lock. We should now allow waiting on the IO with the
> +	 * mmap semaphore released.
> +	 */
> +	down_read(&mm->mmap_sem);
> +	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
> +				  &locked);
> +	if (!locked) {
> +		BUG_ON(npages != -EBUSY);
> +		/*
> +		 * The previous call has now waited on the IO. Now we can
> +		 * retry and complete. Pass TRIED to ensure we do not re
> +		 * schedule async IO (see e.g. filemap_fault).
> +		 */
> +		down_read(&mm->mmap_sem);
> +		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
> +					  pagep, NULL, NULL);
For async_pf_execute() you do not need to even retry. Next guest's page fault
will retry it for you.

> +	}
> +	up_read(&mm->mmap_sem);
> +	return npages;
> +}
> +
>  static inline int check_user_page_hwpoison(unsigned long addr)
>  {
>  	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
> @@ -1177,9 +1210,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>  		npages = get_user_page_nowait(current, current->mm,
>  					      addr, write_fault, page);
>  		up_read(&current->mm->mmap_sem);
> -	} else
> -		npages = get_user_pages_fast(addr, 1, write_fault,
> -					     page);
> +	} else {
> +		/*
> +		 * By now we have tried gup_fast, and possible async_pf, and we
> +		 * are certainly not atomic. Time to retry the gup, allowing
> +		 * mmap semaphore to be relinquished in the case of IO.
> +		 */
> +		npages = kvm_get_user_page_retry(current, current->mm, addr,
> +						 write_fault, page);
> +	}
>  	if (npages != 1)
>  		return npages;
>  
> -- 
> 2.1.0.rc2.206.gedb03e5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
			Gleb.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 10:26   ` Gleb Natapov
@ 2014-09-17 11:27     ` Radim Krčmář
  -1 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-17 11:27 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Andres Lagar-Cavilla, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

2014-09-17 13:26+0300, Gleb Natapov:
> For async_pf_execute() you do not need to even retry. Next guest's page fault
> will retry it for you.

Wouldn't that be a waste of vmentries?

The guest might be able to handle interrupts while we are waiting, so if
we used async-io-done notifier, this could work without CPU spinning.
(Probably with added latency.)

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 11:27     ` Radim Krčmář
  0 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-17 11:27 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Andres Lagar-Cavilla, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

2014-09-17 13:26+0300, Gleb Natapov:
> For async_pf_execute() you do not need to even retry. Next guest's page fault
> will retry it for you.

Wouldn't that be a waste of vmentries?

The guest might be able to handle interrupts while we are waiting, so if
we used async-io-done notifier, this could work without CPU spinning.
(Probably with added latency.)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-16 21:01       ` Andres Lagar-Cavilla
  (?)
@ 2014-09-17 11:35         ` Radim Krčmář
  -1 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-17 11:35 UTC (permalink / raw)
  To: kvm, linux-kernel, linux-mm

[Repost for lists, the last mail was eaten by a security troll.]

2014-09-16 14:01-0700, Andres Lagar-Cavilla:
> On Tue, Sep 16, 2014 at 1:51 PM, Radim Krčmář <rkrcmar@redhat.com>
> wrote:
> > 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
> >> +int kvm_get_user_page_retry(struct task_struct *tsk, struct
> >> mm_struct *mm,
> >
> > The suffix '_retry' is not best suited for this.
> > On first reading, I imagined we will be retrying something from
> > before,
> > possibly calling it in a loop, but we are actually doing the first
> > and
> > last try in one call.
>
> We are doing ... the second and third in most scenarios. async_pf did
> the first with _NOWAIT. We call this from the async pf retrier, or if
> async pf couldn't be notified to the guest.

I was thinking more about what the function does, not how we currently
use it -- nothing prevents us from using it as first somewhere -- but
yeah, even comments would be off then.

> >> Apart from this, the patch looks good.  The mm/ parts are minimal,
> >> so
> >> I
> >> think it's best to merge it through the KVM tree with someone's
> >> Acked-by.
> >
> > I would prefer to have the last hunk in a separate patch, but still,
> >
> > Acked-by: Radim Krčmář <rkrcmar@redhat.com>
>
> Awesome, thanks much.
>
> I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
> else from this email should go into the recut.

Ah, sorry, I'm not maintaining mm ... what I meant was

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

and I had to leave before I could find a good apology for
VM_WARN_ON_ONCE(), so if you are replacing BUG_ON, you might want to
look at that one as well.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 11:35         ` Radim Krčmář
  0 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-17 11:35 UTC (permalink / raw)
  To: kvm, linux-kernel, linux-mm

[Repost for lists, the last mail was eaten by a security troll.]

2014-09-16 14:01-0700, Andres Lagar-Cavilla:
> On Tue, Sep 16, 2014 at 1:51 PM, Radim Krčmář <rkrcmar@redhat.com>
> wrote:
> > 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
> >> +int kvm_get_user_page_retry(struct task_struct *tsk, struct
> >> mm_struct *mm,
> >
> > The suffix '_retry' is not best suited for this.
> > On first reading, I imagined we will be retrying something from
> > before,
> > possibly calling it in a loop, but we are actually doing the first
> > and
> > last try in one call.
>
> We are doing ... the second and third in most scenarios. async_pf did
> the first with _NOWAIT. We call this from the async pf retrier, or if
> async pf couldn't be notified to the guest.

I was thinking more about what the function does, not how we currently
use it -- nothing prevents us from using it as first somewhere -- but
yeah, even comments would be off then.

> >> Apart from this, the patch looks good.  The mm/ parts are minimal,
> >> so
> >> I
> >> think it's best to merge it through the KVM tree with someone's
> >> Acked-by.
> >
> > I would prefer to have the last hunk in a separate patch, but still,
> >
> > Acked-by: Radim Krčmář <rkrcmar@redhat.com>
>
> Awesome, thanks much.
>
> I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
> else from this email should go into the recut.

Ah, sorry, I'm not maintaining mm ... what I meant was

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

and I had to leave before I could find a good apology for
VM_WARN_ON_ONCE(), so if you are replacing BUG_ON, you might want to
look at that one as well.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 11:35         ` Radim Krčmář
  0 siblings, 0 replies; 70+ messages in thread
From: Radim Krčmář @ 2014-09-17 11:35 UTC (permalink / raw)
  To: kvm, linux-kernel, linux-mm

[Repost for lists, the last mail was eaten by a security troll.]

2014-09-16 14:01-0700, Andres Lagar-Cavilla:
> On Tue, Sep 16, 2014 at 1:51 PM, Radim Krčmář <rkrcmar@redhat.com>
> wrote:
> > 2014-09-15 13:11-0700, Andres Lagar-Cavilla:
> >> +int kvm_get_user_page_retry(struct task_struct *tsk, struct
> >> mm_struct *mm,
> >
> > The suffix '_retry' is not best suited for this.
> > On first reading, I imagined we will be retrying something from
> > before,
> > possibly calling it in a loop, but we are actually doing the first
> > and
> > last try in one call.
>
> We are doing ... the second and third in most scenarios. async_pf did
> the first with _NOWAIT. We call this from the async pf retrier, or if
> async pf couldn't be notified to the guest.

I was thinking more about what the function does, not how we currently
use it -- nothing prevents us from using it as first somewhere -- but
yeah, even comments would be off then.

> >> Apart from this, the patch looks good.  The mm/ parts are minimal,
> >> so
> >> I
> >> think it's best to merge it through the KVM tree with someone's
> >> Acked-by.
> >
> > I would prefer to have the last hunk in a separate patch, but still,
> >
> > Acked-by: Radim Krčmář <rkrcmar@redhat.com>
>
> Awesome, thanks much.
>
> I'll recut with the VM_BUG_ON from Paolo and your Ack. LMK if anything
> else from this email should go into the recut.

Ah, sorry, I'm not maintaining mm ... what I meant was

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

and I had to leave before I could find a good apology for
VM_WARN_ON_ONCE(), so if you are replacing BUG_ON, you might want to
look at that one as well.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 11:27     ` Radim Krčmář
@ 2014-09-17 11:42       ` Gleb Natapov
  -1 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-17 11:42 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Andres Lagar-Cavilla, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
> 2014-09-17 13:26+0300, Gleb Natapov:
> > For async_pf_execute() you do not need to even retry. Next guest's page fault
> > will retry it for you.
> 
> Wouldn't that be a waste of vmentries?
This is how it will work with or without this second gup. Page is not
mapped into a shadow page table on this path, it happens on a next fault.

--
			Gleb.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 11:42       ` Gleb Natapov
  0 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-17 11:42 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Andres Lagar-Cavilla, Gleb Natapov, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
> 2014-09-17 13:26+0300, Gleb Natapov:
> > For async_pf_execute() you do not need to even retry. Next guest's page fault
> > will retry it for you.
> 
> Wouldn't that be a waste of vmentries?
This is how it will work with or without this second gup. Page is not
mapped into a shadow page table on this path, it happens on a next fault.

--
			Gleb.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17  7:43           ` Paolo Bonzini
@ 2014-09-17 16:58             ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 16:58 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 12:43 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 16/09/2014 20:42, Andres Lagar-Cavilla ha scritto:
>> On Tue, Sep 16, 2014 at 11:29 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>>     I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>>>>     fault after a gup with FOLL_NOWAIT.") and the second would add
>>>>     FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>>>>     filemap/swap has to wait on page lock (and retries the gup to completion
>>>>     after that").
>>>>
>>>> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is
>>>> done by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the
>>>> fault handler (e.g. filemap) know that we've been there and waited on
>>>> the IO already, so in the common case we won't need to redo the IO.
>>>
>>> Yes, that's not what FOLL_TRIED does.  But it's the difference between
>>> get_user_pages and kvm_get_user_page_retry, right?
>>
>> Unfortunately get_user_pages does not expose the param (int
>> *nonblocking) that __gup will use to set FAULT_FLAG_ALLOW_RETRY. So
>> that's one difference. The second difference is that kvm_gup_retry
>> will call two times if necessary (the second without _RETRY but with
>> _TRIED).
>
> Yeah, that's how it is in your patch.  I can see that.
>
> What I'm saying is that your patch is two changes in one:
>
> 1) do not use gup_fast in hva_to_pfn_slow, instead use gup as in
> async_pf_execute.  This change can already introduce a function called
> kvm_get_user_page_retry, and can already use it in async_pf_execute and
> hva_to_pfn_slow
>
> 2) introduce the two-phase RETRY + TRIED mechanism in
> kvm_get_user_page_retry, so that the mmap semaphore is relinquished
> properly if the filemap or swap has to wait on the page lock.
>
> I would prefer to split it in two patches.  Is it clearer now?

Understood. So in patch 1, would kvm_gup_retry be ... just a wrapper
around gup? That looks thin to me, and the naming of the function will
not be accurate. Plus, considering Radim's suggestion that the naming
is not optimal.

I can have patch 1 just s/gup_fast/gup (one liner), and then patch 2
do the rest of the work.

Andres
>
> Paolo



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 16:58             ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 16:58 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 12:43 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 16/09/2014 20:42, Andres Lagar-Cavilla ha scritto:
>> On Tue, Sep 16, 2014 at 11:29 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>>     I think a first patch should introduce kvm_get_user_page_retry ("Retry a
>>>>     fault after a gup with FOLL_NOWAIT.") and the second would add
>>>>     FOLL_TRIED ("This properly relinquishes mmap semaphore if the
>>>>     filemap/swap has to wait on page lock (and retries the gup to completion
>>>>     after that").
>>>>
>>>> That's not what FOLL_TRIED does. The relinquishing of mmap semaphore is
>>>> done by this patch minus the FOLL_TRIED bits. FOLL_TRIED will let the
>>>> fault handler (e.g. filemap) know that we've been there and waited on
>>>> the IO already, so in the common case we won't need to redo the IO.
>>>
>>> Yes, that's not what FOLL_TRIED does.  But it's the difference between
>>> get_user_pages and kvm_get_user_page_retry, right?
>>
>> Unfortunately get_user_pages does not expose the param (int
>> *nonblocking) that __gup will use to set FAULT_FLAG_ALLOW_RETRY. So
>> that's one difference. The second difference is that kvm_gup_retry
>> will call two times if necessary (the second without _RETRY but with
>> _TRIED).
>
> Yeah, that's how it is in your patch.  I can see that.
>
> What I'm saying is that your patch is two changes in one:
>
> 1) do not use gup_fast in hva_to_pfn_slow, instead use gup as in
> async_pf_execute.  This change can already introduce a function called
> kvm_get_user_page_retry, and can already use it in async_pf_execute and
> hva_to_pfn_slow
>
> 2) introduce the two-phase RETRY + TRIED mechanism in
> kvm_get_user_page_retry, so that the mmap semaphore is relinquished
> properly if the filemap or swap has to wait on the page lock.
>
> I would prefer to split it in two patches.  Is it clearer now?

Understood. So in patch 1, would kvm_gup_retry be ... just a wrapper
around gup? That looks thin to me, and the naming of the function will
not be accurate. Plus, considering Radim's suggestion that the naming
is not optimal.

I can have patch 1 just s/gup_fast/gup (one liner), and then patch 2
do the rest of the work.

Andres
>
> Paolo



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 11:42       ` Gleb Natapov
@ 2014-09-17 17:00         ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 17:00 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Radim Krčmář,
	Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
> On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
>> 2014-09-17 13:26+0300, Gleb Natapov:
>> > For async_pf_execute() you do not need to even retry. Next guest's page fault
>> > will retry it for you.
>>
>> Wouldn't that be a waste of vmentries?
> This is how it will work with or without this second gup. Page is not
> mapped into a shadow page table on this path, it happens on a next fault.

The point is that the gup in the async pf completion from the work
queue will not relinquish the mmap semaphore. And it most definitely
should, given that we are likely looking at swap/filemap.

Andres

>
> --
>                         Gleb.



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 17:00         ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 17:00 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Radim Krčmář,
	Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
> On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
>> 2014-09-17 13:26+0300, Gleb Natapov:
>> > For async_pf_execute() you do not need to even retry. Next guest's page fault
>> > will retry it for you.
>>
>> Wouldn't that be a waste of vmentries?
> This is how it will work with or without this second gup. Page is not
> mapped into a shadow page table on this path, it happens on a next fault.

The point is that the gup in the async pf completion from the work
queue will not relinquish the mmap semaphore. And it most definitely
should, given that we are likely looking at swap/filemap.

Andres

>
> --
>                         Gleb.



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 17:00         ` Andres Lagar-Cavilla
@ 2014-09-17 17:08           ` Gleb Natapov
  -1 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-17 17:08 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Radim Krčmář,
	Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 10:00:32AM -0700, Andres Lagar-Cavilla wrote:
> On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
> > On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
> >> 2014-09-17 13:26+0300, Gleb Natapov:
> >> > For async_pf_execute() you do not need to even retry. Next guest's page fault
> >> > will retry it for you.
> >>
> >> Wouldn't that be a waste of vmentries?
> > This is how it will work with or without this second gup. Page is not
> > mapped into a shadow page table on this path, it happens on a next fault.
> 
> The point is that the gup in the async pf completion from the work
> queue will not relinquish the mmap semaphore. And it most definitely
> should, given that we are likely looking at swap/filemap.
> 
I get this point and the patch looks good in general, but my point is
that when _retry() is called from async_pf_execute() second gup is not
needed. In the original code gup is called to do IO and nothing else.
In your patch this is accomplished by the first gup already, so you
can skip second gup if pagep == nullptr.

--
			Gleb.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 17:08           ` Gleb Natapov
  0 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-17 17:08 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Radim Krčmář,
	Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 10:00:32AM -0700, Andres Lagar-Cavilla wrote:
> On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
> > On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
> >> 2014-09-17 13:26+0300, Gleb Natapov:
> >> > For async_pf_execute() you do not need to even retry. Next guest's page fault
> >> > will retry it for you.
> >>
> >> Wouldn't that be a waste of vmentries?
> > This is how it will work with or without this second gup. Page is not
> > mapped into a shadow page table on this path, it happens on a next fault.
> 
> The point is that the gup in the async pf completion from the work
> queue will not relinquish the mmap semaphore. And it most definitely
> should, given that we are likely looking at swap/filemap.
> 
I get this point and the patch looks good in general, but my point is
that when _retry() is called from async_pf_execute() second gup is not
needed. In the original code gup is called to do IO and nothing else.
In your patch this is accomplished by the first gup already, so you
can skip second gup if pagep == nullptr.

--
			Gleb.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 17:08           ` Gleb Natapov
@ 2014-09-17 17:13             ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 17:13 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Radim Krčmář,
	Rik van Riel, Peter Zijlstra, Mel Gorman, Andy Lutomirski,
	Andrew Morton, Andrea Arcangeli, Sasha Levin, Jianyu Zhan,
	Paul Cassella, Hugh Dickins, Peter Feiner, kvm, linux-kernel,
	linux-mm

On Wed, Sep 17, 2014 at 10:08 AM, Gleb Natapov <gleb@kernel.org> wrote:
> On Wed, Sep 17, 2014 at 10:00:32AM -0700, Andres Lagar-Cavilla wrote:
>> On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
>> > On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
>> >> 2014-09-17 13:26+0300, Gleb Natapov:
>> >> > For async_pf_execute() you do not need to even retry. Next guest's page fault
>> >> > will retry it for you.
>> >>
>> >> Wouldn't that be a waste of vmentries?
>> > This is how it will work with or without this second gup. Page is not
>> > mapped into a shadow page table on this path, it happens on a next fault.
>>
>> The point is that the gup in the async pf completion from the work
>> queue will not relinquish the mmap semaphore. And it most definitely
>> should, given that we are likely looking at swap/filemap.
>>
> I get this point and the patch looks good in general, but my point is
> that when _retry() is called from async_pf_execute() second gup is not
> needed. In the original code gup is called to do IO and nothing else.
> In your patch this is accomplished by the first gup already, so you
> can skip second gup if pagep == nullptr.

I see. However, if this function were to be used elsewhere in the
future, then the "if pagep == NULL don't retry" semantics may not
match the new caller's intention. Would you prefer an explicit flag?

Andres

>
> --
>                         Gleb.



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 17:13             ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 17:13 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Radim Krčmář,
	Rik van Riel, Peter Zijlstra, Mel Gorman, Andy Lutomirski,
	Andrew Morton, Andrea Arcangeli, Sasha Levin, Jianyu Zhan,
	Paul Cassella, Hugh Dickins, Peter Feiner, kvm, linux-kernel,
	linux-mm

On Wed, Sep 17, 2014 at 10:08 AM, Gleb Natapov <gleb@kernel.org> wrote:
> On Wed, Sep 17, 2014 at 10:00:32AM -0700, Andres Lagar-Cavilla wrote:
>> On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
>> > On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
>> >> 2014-09-17 13:26+0300, Gleb Natapov:
>> >> > For async_pf_execute() you do not need to even retry. Next guest's page fault
>> >> > will retry it for you.
>> >>
>> >> Wouldn't that be a waste of vmentries?
>> > This is how it will work with or without this second gup. Page is not
>> > mapped into a shadow page table on this path, it happens on a next fault.
>>
>> The point is that the gup in the async pf completion from the work
>> queue will not relinquish the mmap semaphore. And it most definitely
>> should, given that we are likely looking at swap/filemap.
>>
> I get this point and the patch looks good in general, but my point is
> that when _retry() is called from async_pf_execute() second gup is not
> needed. In the original code gup is called to do IO and nothing else.
> In your patch this is accomplished by the first gup already, so you
> can skip second gup if pagep == nullptr.

I see. However, if this function were to be used elsewhere in the
future, then the "if pagep == NULL don't retry" semantics may not
match the new caller's intention. Would you prefer an explicit flag?

Andres

>
> --
>                         Gleb.



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 17:13             ` Andres Lagar-Cavilla
@ 2014-09-17 17:21               ` Gleb Natapov
  -1 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-17 17:21 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Radim Krčmář,
	Rik van Riel, Peter Zijlstra, Mel Gorman, Andy Lutomirski,
	Andrew Morton, Andrea Arcangeli, Sasha Levin, Jianyu Zhan,
	Paul Cassella, Hugh Dickins, Peter Feiner, kvm, linux-kernel,
	linux-mm

On Wed, Sep 17, 2014 at 10:13:45AM -0700, Andres Lagar-Cavilla wrote:
> On Wed, Sep 17, 2014 at 10:08 AM, Gleb Natapov <gleb@kernel.org> wrote:
> > On Wed, Sep 17, 2014 at 10:00:32AM -0700, Andres Lagar-Cavilla wrote:
> >> On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
> >> > On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
> >> >> 2014-09-17 13:26+0300, Gleb Natapov:
> >> >> > For async_pf_execute() you do not need to even retry. Next guest's page fault
> >> >> > will retry it for you.
> >> >>
> >> >> Wouldn't that be a waste of vmentries?
> >> > This is how it will work with or without this second gup. Page is not
> >> > mapped into a shadow page table on this path, it happens on a next fault.
> >>
> >> The point is that the gup in the async pf completion from the work
> >> queue will not relinquish the mmap semaphore. And it most definitely
> >> should, given that we are likely looking at swap/filemap.
> >>
> > I get this point and the patch looks good in general, but my point is
> > that when _retry() is called from async_pf_execute() second gup is not
> > needed. In the original code gup is called to do IO and nothing else.
> > In your patch this is accomplished by the first gup already, so you
> > can skip second gup if pagep == nullptr.
> 
> I see. However, if this function were to be used elsewhere in the
> future, then the "if pagep == NULL don't retry" semantics may not
> match the new caller's intention. Would you prefer an explicit flag?
> 
We can add explicit flag whenever such caller will be added, if ever.

--
			Gleb.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 17:21               ` Gleb Natapov
  0 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-17 17:21 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Radim Krčmář,
	Rik van Riel, Peter Zijlstra, Mel Gorman, Andy Lutomirski,
	Andrew Morton, Andrea Arcangeli, Sasha Levin, Jianyu Zhan,
	Paul Cassella, Hugh Dickins, Peter Feiner, kvm, linux-kernel,
	linux-mm

On Wed, Sep 17, 2014 at 10:13:45AM -0700, Andres Lagar-Cavilla wrote:
> On Wed, Sep 17, 2014 at 10:08 AM, Gleb Natapov <gleb@kernel.org> wrote:
> > On Wed, Sep 17, 2014 at 10:00:32AM -0700, Andres Lagar-Cavilla wrote:
> >> On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
> >> > On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
> >> >> 2014-09-17 13:26+0300, Gleb Natapov:
> >> >> > For async_pf_execute() you do not need to even retry. Next guest's page fault
> >> >> > will retry it for you.
> >> >>
> >> >> Wouldn't that be a waste of vmentries?
> >> > This is how it will work with or without this second gup. Page is not
> >> > mapped into a shadow page table on this path, it happens on a next fault.
> >>
> >> The point is that the gup in the async pf completion from the work
> >> queue will not relinquish the mmap semaphore. And it most definitely
> >> should, given that we are likely looking at swap/filemap.
> >>
> > I get this point and the patch looks good in general, but my point is
> > that when _retry() is called from async_pf_execute() second gup is not
> > needed. In the original code gup is called to do IO and nothing else.
> > In your patch this is accomplished by the first gup already, so you
> > can skip second gup if pagep == nullptr.
> 
> I see. However, if this function were to be used elsewhere in the
> future, then the "if pagep == NULL don't retry" semantics may not
> match the new caller's intention. Would you prefer an explicit flag?
> 
We can add explicit flag whenever such caller will be added, if ever.

--
			Gleb.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 17:21               ` Gleb Natapov
@ 2014-09-17 17:41                 ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 17:41 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Radim Krčmář,
	Rik van Riel, Peter Zijlstra, Mel Gorman, Andy Lutomirski,
	Andrew Morton, Andrea Arcangeli, Sasha Levin, Jianyu Zhan,
	Paul Cassella, Hugh Dickins, Peter Feiner, kvm, linux-kernel,
	linux-mm

On Wed, Sep 17, 2014 at 10:21 AM, Gleb Natapov <gleb@kernel.org> wrote:
> On Wed, Sep 17, 2014 at 10:13:45AM -0700, Andres Lagar-Cavilla wrote:
>> On Wed, Sep 17, 2014 at 10:08 AM, Gleb Natapov <gleb@kernel.org> wrote:
>> > On Wed, Sep 17, 2014 at 10:00:32AM -0700, Andres Lagar-Cavilla wrote:
>> >> On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
>> >> > On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
>> >> >> 2014-09-17 13:26+0300, Gleb Natapov:
>> >> >> > For async_pf_execute() you do not need to even retry. Next guest's page fault
>> >> >> > will retry it for you.
>> >> >>
>> >> >> Wouldn't that be a waste of vmentries?
>> >> > This is how it will work with or without this second gup. Page is not
>> >> > mapped into a shadow page table on this path, it happens on a next fault.
>> >>
>> >> The point is that the gup in the async pf completion from the work
>> >> queue will not relinquish the mmap semaphore. And it most definitely
>> >> should, given that we are likely looking at swap/filemap.
>> >>
>> > I get this point and the patch looks good in general, but my point is
>> > that when _retry() is called from async_pf_execute() second gup is not
>> > needed. In the original code gup is called to do IO and nothing else.
>> > In your patch this is accomplished by the first gup already, so you
>> > can skip second gup if pagep == nullptr.
>>
>> I see. However, if this function were to be used elsewhere in the
>> future, then the "if pagep == NULL don't retry" semantics may not
>> match the new caller's intention. Would you prefer an explicit flag?
>>
> We can add explicit flag whenever such caller will be added, if ever.

Ok. Patch forthcoming.

Paolo, I'm not sure the split will buy anything, because the first
patch will be a one-liner (no point in the new kvm_gup_something
function if the impl won't match the intended semantics of the
function). But if you push back, I'll cut a v3.

Thanks all,
Andres
>
> --
>                         Gleb.



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 17:41                 ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 17:41 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Radim Krčmář,
	Rik van Riel, Peter Zijlstra, Mel Gorman, Andy Lutomirski,
	Andrew Morton, Andrea Arcangeli, Sasha Levin, Jianyu Zhan,
	Paul Cassella, Hugh Dickins, Peter Feiner, kvm, linux-kernel,
	linux-mm

On Wed, Sep 17, 2014 at 10:21 AM, Gleb Natapov <gleb@kernel.org> wrote:
> On Wed, Sep 17, 2014 at 10:13:45AM -0700, Andres Lagar-Cavilla wrote:
>> On Wed, Sep 17, 2014 at 10:08 AM, Gleb Natapov <gleb@kernel.org> wrote:
>> > On Wed, Sep 17, 2014 at 10:00:32AM -0700, Andres Lagar-Cavilla wrote:
>> >> On Wed, Sep 17, 2014 at 4:42 AM, Gleb Natapov <gleb@kernel.org> wrote:
>> >> > On Wed, Sep 17, 2014 at 01:27:14PM +0200, Radim Krčmář wrote:
>> >> >> 2014-09-17 13:26+0300, Gleb Natapov:
>> >> >> > For async_pf_execute() you do not need to even retry. Next guest's page fault
>> >> >> > will retry it for you.
>> >> >>
>> >> >> Wouldn't that be a waste of vmentries?
>> >> > This is how it will work with or without this second gup. Page is not
>> >> > mapped into a shadow page table on this path, it happens on a next fault.
>> >>
>> >> The point is that the gup in the async pf completion from the work
>> >> queue will not relinquish the mmap semaphore. And it most definitely
>> >> should, given that we are likely looking at swap/filemap.
>> >>
>> > I get this point and the patch looks good in general, but my point is
>> > that when _retry() is called from async_pf_execute() second gup is not
>> > needed. In the original code gup is called to do IO and nothing else.
>> > In your patch this is accomplished by the first gup already, so you
>> > can skip second gup if pagep == nullptr.
>>
>> I see. However, if this function were to be used elsewhere in the
>> future, then the "if pagep == NULL don't retry" semantics may not
>> match the new caller's intention. Would you prefer an explicit flag?
>>
> We can add explicit flag whenever such caller will be added, if ever.

Ok. Patch forthcoming.

Paolo, I'm not sure the split will buy anything, because the first
patch will be a one-liner (no point in the new kvm_gup_something
function if the impl won't match the intended semantics of the
function). But if you push back, I'll cut a v3.

Thanks all,
Andres
>
> --
>                         Gleb.



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-15 20:11 ` Andres Lagar-Cavilla
@ 2014-09-17 17:51   ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 17:51 UTC (permalink / raw)
  To: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Andrea Arcangeli, Sasha Levin, Jianyu Zhan, Paul Cassella,
	Hugh Dickins, Peter Feiner, kvm, linux-kernel, linux-mm
  Cc: Andres Lagar-Cavilla

When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory
has been swapped out or is behind a filemap, this will trigger async
readahead and return immediately. The rationale is that KVM will kick
back the guest with an "async page fault" and allow for some other
guest process to take over.

If async PFs are enabled the fault is retried asap from an async
workqueue. If not, it's retried immediately in the same code path. In
either case the retry will not relinquish the mmap semaphore and will
block on the IO. This is a bad thing, as other mmap semaphore users
now stall as a function of swap or filemap latency.

This patch ensures both the regular and async PF path re-enter the
fault allowing for the mmap semaphore to be relinquished in the case
of IO wait.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>

---
v1 -> v2

* WARN_ON_ONCE -> VM_WARN_ON_ONCE
* pagep == NULL skips the final retry
* kvm_gup_retry -> kvm_gup_io
* Comment updates throughout
---
 include/linux/kvm_host.h | 11 +++++++++++
 include/linux/mm.h       |  1 +
 mm/gup.c                 |  4 ++++
 virt/kvm/async_pf.c      |  4 +---
 virt/kvm/kvm_main.c      | 49 +++++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3addcbc..4c1991b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -198,6 +198,17 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+/*
+ * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
+ * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
+ * controls whether we retry the gup one more time to completion in that case.
+ * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
+ * handler.
+ */
+int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
+			 unsigned long addr, bool write_fault,
+			 struct page **pagep);
+
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ebc5f90..13e585f7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2011,6 +2011,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
+#define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/mm/gup.c b/mm/gup.c
index 91d044b..af7ea3e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -281,6 +281,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 	if (*flags & FOLL_NOWAIT)
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+	if (*flags & FOLL_TRIED) {
+		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
+		fault_flags |= FAULT_FLAG_TRIED;
+	}
 
 	ret = handle_mm_fault(mm, vma, address, fault_flags);
 	if (ret & VM_FAULT_ERROR) {
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index d6a3d09..5ff7f7f 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
 
 	might_sleep();
 
-	down_read(&mm->mmap_sem);
-	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
-	up_read(&mm->mmap_sem);
+	kvm_get_user_page_io(NULL, mm, addr, 1, NULL);
 	kvm_async_page_present_sync(vcpu, apf);
 
 	spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7ef6b48..fa8a565 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1115,6 +1115,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 }
 
+int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
+			 unsigned long addr, bool write_fault,
+			 struct page **pagep)
+{
+	int npages;
+	int locked = 1;
+	int flags = FOLL_TOUCH | FOLL_HWPOISON |
+		    (pagep ? FOLL_GET : 0) |
+		    (write_fault ? FOLL_WRITE : 0);
+
+	/*
+	 * If retrying the fault, we get here *not* having allowed the filemap
+	 * to wait on the page lock. We should now allow waiting on the IO with
+	 * the mmap semaphore released.
+	 */
+	down_read(&mm->mmap_sem);
+	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
+				  &locked);
+	if (!locked) {
+		VM_BUG_ON(npages != -EBUSY);
+
+		if (!pagep)
+			return 0;
+
+		/*
+		 * The previous call has now waited on the IO. Now we can
+		 * retry and complete. Pass TRIED to ensure we do not re
+		 * schedule async IO (see e.g. filemap_fault).
+		 */
+		down_read(&mm->mmap_sem);
+		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
+					  pagep, NULL, NULL);
+	}
+	up_read(&mm->mmap_sem);
+	return npages;
+}
+
 static inline int check_user_page_hwpoison(unsigned long addr)
 {
 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 		npages = get_user_page_nowait(current, current->mm,
 					      addr, write_fault, page);
 		up_read(&current->mm->mmap_sem);
-	} else
-		npages = get_user_pages_fast(addr, 1, write_fault,
-					     page);
+	} else {
+		/*
+		 * By now we have tried gup_fast, and possibly async_pf, and we
+		 * are certainly not atomic. Time to retry the gup, allowing
+		 * mmap semaphore to be relinquished in the case of IO.
+		 */
+		npages = kvm_get_user_page_io(current, current->mm, addr,
+					      write_fault, page);
+	}
 	if (npages != 1)
 		return npages;
 
-- 
2.1.0.rc2.206.gedb03e5


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 17:51   ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-17 17:51 UTC (permalink / raw)
  To: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Andrea Arcangeli, Sasha Levin, Jianyu Zhan, Paul Cassella,
	Hugh Dickins, Peter Feiner, kvm, linux-kernel, linux-mm
  Cc: Andres Lagar-Cavilla

When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory
has been swapped out or is behind a filemap, this will trigger async
readahead and return immediately. The rationale is that KVM will kick
back the guest with an "async page fault" and allow for some other
guest process to take over.

If async PFs are enabled the fault is retried asap from an async
workqueue. If not, it's retried immediately in the same code path. In
either case the retry will not relinquish the mmap semaphore and will
block on the IO. This is a bad thing, as other mmap semaphore users
now stall as a function of swap or filemap latency.

This patch ensures both the regular and async PF path re-enter the
fault allowing for the mmap semaphore to be relinquished in the case
of IO wait.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>

---
v1 -> v2

* WARN_ON_ONCE -> VM_WARN_ON_ONCE
* pagep == NULL skips the final retry
* kvm_gup_retry -> kvm_gup_io
* Comment updates throughout
---
 include/linux/kvm_host.h | 11 +++++++++++
 include/linux/mm.h       |  1 +
 mm/gup.c                 |  4 ++++
 virt/kvm/async_pf.c      |  4 +---
 virt/kvm/kvm_main.c      | 49 +++++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3addcbc..4c1991b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -198,6 +198,17 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+/*
+ * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
+ * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
+ * controls whether we retry the gup one more time to completion in that case.
+ * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
+ * handler.
+ */
+int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
+			 unsigned long addr, bool write_fault,
+			 struct page **pagep);
+
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ebc5f90..13e585f7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2011,6 +2011,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
+#define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/mm/gup.c b/mm/gup.c
index 91d044b..af7ea3e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -281,6 +281,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 	if (*flags & FOLL_NOWAIT)
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+	if (*flags & FOLL_TRIED) {
+		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
+		fault_flags |= FAULT_FLAG_TRIED;
+	}
 
 	ret = handle_mm_fault(mm, vma, address, fault_flags);
 	if (ret & VM_FAULT_ERROR) {
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index d6a3d09..5ff7f7f 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
 
 	might_sleep();
 
-	down_read(&mm->mmap_sem);
-	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
-	up_read(&mm->mmap_sem);
+	kvm_get_user_page_io(NULL, mm, addr, 1, NULL);
 	kvm_async_page_present_sync(vcpu, apf);
 
 	spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7ef6b48..fa8a565 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1115,6 +1115,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 }
 
+int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
+			 unsigned long addr, bool write_fault,
+			 struct page **pagep)
+{
+	int npages;
+	int locked = 1;
+	int flags = FOLL_TOUCH | FOLL_HWPOISON |
+		    (pagep ? FOLL_GET : 0) |
+		    (write_fault ? FOLL_WRITE : 0);
+
+	/*
+	 * If retrying the fault, we get here *not* having allowed the filemap
+	 * to wait on the page lock. We should now allow waiting on the IO with
+	 * the mmap semaphore released.
+	 */
+	down_read(&mm->mmap_sem);
+	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
+				  &locked);
+	if (!locked) {
+		VM_BUG_ON(npages != -EBUSY);
+
+		if (!pagep)
+			return 0;
+
+		/*
+		 * The previous call has now waited on the IO. Now we can
+		 * retry and complete. Pass TRIED to ensure we do not re
+		 * schedule async IO (see e.g. filemap_fault).
+		 */
+		down_read(&mm->mmap_sem);
+		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
+					  pagep, NULL, NULL);
+	}
+	up_read(&mm->mmap_sem);
+	return npages;
+}
+
 static inline int check_user_page_hwpoison(unsigned long addr)
 {
 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 		npages = get_user_page_nowait(current, current->mm,
 					      addr, write_fault, page);
 		up_read(&current->mm->mmap_sem);
-	} else
-		npages = get_user_pages_fast(addr, 1, write_fault,
-					     page);
+	} else {
+		/*
+		 * By now we have tried gup_fast, and possibly async_pf, and we
+		 * are certainly not atomic. Time to retry the gup, allowing
+		 * mmap semaphore to be relinquished in the case of IO.
+		 */
+		npages = kvm_get_user_page_io(current, current->mm, addr,
+					      write_fault, page);
+	}
 	if (npages != 1)
 		return npages;
 
-- 
2.1.0.rc2.206.gedb03e5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 16:58             ` Andres Lagar-Cavilla
@ 2014-09-17 20:01               ` Paolo Bonzini
  -1 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-17 20:01 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

Il 17/09/2014 18:58, Andres Lagar-Cavilla ha scritto:
> Understood. So in patch 1, would kvm_gup_retry be ... just a wrapper
> around gup? That looks thin to me, and the naming of the function will
> not be accurate.

Depends on how you interpret "retry" ("with retry" vs. "retry after
_fast"). :)

My point was more to make possible future bisection easier, but I'm not
going to insist.  I'll queue the patch as soon as I get the required
Acked-by.

Paolo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-17 20:01               ` Paolo Bonzini
  0 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-17 20:01 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Rik van Riel, Peter Zijlstra, Mel Gorman,
	Andy Lutomirski, Andrew Morton, Andrea Arcangeli, Sasha Levin,
	Jianyu Zhan, Paul Cassella, Hugh Dickins, Peter Feiner, kvm,
	linux-kernel, linux-mm

Il 17/09/2014 18:58, Andres Lagar-Cavilla ha scritto:
> Understood. So in patch 1, would kvm_gup_retry be ... just a wrapper
> around gup? That looks thin to me, and the naming of the function will
> not be accurate.

Depends on how you interpret "retry" ("with retry" vs. "retry after
_fast"). :)

My point was more to make possible future bisection easier, but I'm not
going to insist.  I'll queue the patch as soon as I get the required
Acked-by.

Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 17:51   ` Andres Lagar-Cavilla
@ 2014-09-18  0:29     ` Wanpeng Li
  -1 siblings, 0 replies; 70+ messages in thread
From: Wanpeng Li @ 2014-09-18  0:29 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Andrea Arcangeli, Sasha Levin, Jianyu Zhan, Paul Cassella,
	Hugh Dickins, Peter Feiner, kvm, linux-kernel, linux-mm

Hi Andres,
On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
[...]
> static inline int check_user_page_hwpoison(unsigned long addr)
> {
> 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
>@@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> 		npages = get_user_page_nowait(current, current->mm,
> 					      addr, write_fault, page);
> 		up_read(&current->mm->mmap_sem);
>-	} else
>-		npages = get_user_pages_fast(addr, 1, write_fault,
>-					     page);
>+	} else {
>+		/*
>+		 * By now we have tried gup_fast, and possibly async_pf, and we
>+		 * are certainly not atomic. Time to retry the gup, allowing
>+		 * mmap semaphore to be relinquished in the case of IO.
>+		 */
>+		npages = kvm_get_user_page_io(current, current->mm, addr,
>+					      write_fault, page);
>+	}

try_async_pf 
 gfn_to_pfn_async 
  __gfn_to_pfn  			async = false 
   __gfn_to_pfn_memslot
    hva_to_pfn 
	 hva_to_pfn_fast 
	 hva_to_pfn_slow 
	  kvm_get_user_page_io

page will always be ready after kvm_get_user_page_io returns, which
means the async page fault mechanism no longer needs to do any work.

Regards,
Wanpeng Li

> 	if (npages != 1)
> 		return npages;
> 
>-- 
>2.1.0.rc2.206.gedb03e5
>
>--
>To unsubscribe, send a message with 'unsubscribe linux-mm' in
>the body to majordomo@kvack.org.  For more info on Linux MM,
>see: http://www.linux-mm.org/ .
>Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-18  0:29     ` Wanpeng Li
  0 siblings, 0 replies; 70+ messages in thread
From: Wanpeng Li @ 2014-09-18  0:29 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Andrea Arcangeli, Sasha Levin, Jianyu Zhan, Paul Cassella,
	Hugh Dickins, Peter Feiner, kvm, linux-kernel, linux-mm

Hi Andres,
On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
[...]
> static inline int check_user_page_hwpoison(unsigned long addr)
> {
> 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
>@@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> 		npages = get_user_page_nowait(current, current->mm,
> 					      addr, write_fault, page);
> 		up_read(&current->mm->mmap_sem);
>-	} else
>-		npages = get_user_pages_fast(addr, 1, write_fault,
>-					     page);
>+	} else {
>+		/*
>+		 * By now we have tried gup_fast, and possibly async_pf, and we
>+		 * are certainly not atomic. Time to retry the gup, allowing
>+		 * mmap semaphore to be relinquished in the case of IO.
>+		 */
>+		npages = kvm_get_user_page_io(current, current->mm, addr,
>+					      write_fault, page);
>+	}

try_async_pf 
 gfn_to_pfn_async 
  __gfn_to_pfn  			async = false 
   __gfn_to_pfn_memslot
    hva_to_pfn 
	 hva_to_pfn_fast 
	 hva_to_pfn_slow 
	  kvm_get_user_page_io

page will always be ready after kvm_get_user_page_io returns, which
means the async page fault mechanism no longer needs to do any work.

Regards,
Wanpeng Li

> 	if (npages != 1)
> 		return npages;
> 
>-- 
>2.1.0.rc2.206.gedb03e5
>
>--
>To unsubscribe, send a message with 'unsubscribe linux-mm' in
>the body to majordomo@kvack.org.  For more info on Linux MM,
>see: http://www.linux-mm.org/ .
>Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-18  0:29     ` Wanpeng Li
@ 2014-09-18  6:13       ` Gleb Natapov
  -1 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-18  6:13 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: Andres Lagar-Cavilla, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Andrea Arcangeli, Sasha Levin, Jianyu Zhan, Paul Cassella,
	Hugh Dickins, Peter Feiner, kvm, linux-kernel, linux-mm

On Thu, Sep 18, 2014 at 08:29:17AM +0800, Wanpeng Li wrote:
> Hi Andres,
> On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
> [...]
> > static inline int check_user_page_hwpoison(unsigned long addr)
> > {
> > 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
> >@@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > 		npages = get_user_page_nowait(current, current->mm,
> > 					      addr, write_fault, page);
> > 		up_read(&current->mm->mmap_sem);
> >-	} else
> >-		npages = get_user_pages_fast(addr, 1, write_fault,
> >-					     page);
> >+	} else {
> >+		/*
> >+		 * By now we have tried gup_fast, and possibly async_pf, and we
> >+		 * are certainly not atomic. Time to retry the gup, allowing
> >+		 * mmap semaphore to be relinquished in the case of IO.
> >+		 */
> >+		npages = kvm_get_user_page_io(current, current->mm, addr,
> >+					      write_fault, page);
> >+	}
> 
> try_async_pf 
>  gfn_to_pfn_async 
>   __gfn_to_pfn  			async = false 
                                        *async = false

>    __gfn_to_pfn_memslot
>     hva_to_pfn 
> 	 hva_to_pfn_fast 
> 	 hva_to_pfn_slow 
hva_to_pfn_slow checks async not *async.

> 	  kvm_get_user_page_io
> 
> page will always be ready after kvm_get_user_page_io which leads to APF
> don't need to work any more.
> 
> Regards,
> Wanpeng Li
> 
> > 	if (npages != 1)
> > 		return npages;
> > 
> >-- 
> >2.1.0.rc2.206.gedb03e5
> >
> >--
> >To unsubscribe, send a message with 'unsubscribe linux-mm' in
> >the body to majordomo@kvack.org.  For more info on Linux MM,
> >see: http://www.linux-mm.org/ .
> >Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
			Gleb.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-18  6:13       ` Gleb Natapov
  0 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-18  6:13 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: Andres Lagar-Cavilla, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Andrea Arcangeli, Sasha Levin, Jianyu Zhan, Paul Cassella,
	Hugh Dickins, Peter Feiner, kvm, linux-kernel, linux-mm

On Thu, Sep 18, 2014 at 08:29:17AM +0800, Wanpeng Li wrote:
> Hi Andres,
> On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
> [...]
> > static inline int check_user_page_hwpoison(unsigned long addr)
> > {
> > 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
> >@@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > 		npages = get_user_page_nowait(current, current->mm,
> > 					      addr, write_fault, page);
> > 		up_read(&current->mm->mmap_sem);
> >-	} else
> >-		npages = get_user_pages_fast(addr, 1, write_fault,
> >-					     page);
> >+	} else {
> >+		/*
> >+		 * By now we have tried gup_fast, and possibly async_pf, and we
> >+		 * are certainly not atomic. Time to retry the gup, allowing
> >+		 * mmap semaphore to be relinquished in the case of IO.
> >+		 */
> >+		npages = kvm_get_user_page_io(current, current->mm, addr,
> >+					      write_fault, page);
> >+	}
> 
> try_async_pf 
>  gfn_to_pfn_async 
>   __gfn_to_pfn  			async = false 
                                        *async = false

>    __gfn_to_pfn_memslot
>     hva_to_pfn 
> 	 hva_to_pfn_fast 
> 	 hva_to_pfn_slow 
hva_to_pfn_slow checks async not *async.

> 	  kvm_get_user_page_io
> 
> page will always be ready after kvm_get_user_page_io which leads to APF
> don't need to work any more.
> 
> Regards,
> Wanpeng Li
> 
> > 	if (npages != 1)
> > 		return npages;
> > 
> >-- 
> >2.1.0.rc2.206.gedb03e5
> >
> >--
> >To unsubscribe, send a message with 'unsubscribe linux-mm' in
> >the body to majordomo@kvack.org.  For more info on Linux MM,
> >see: http://www.linux-mm.org/ .
> >Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
			Gleb.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 17:51   ` Andres Lagar-Cavilla
@ 2014-09-18  6:15     ` Gleb Natapov
  -1 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-18  6:15 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Radim Krcmar, Paolo Bonzini, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
> When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory
> has been swapped out or is behind a filemap, this will trigger async
> readahead and return immediately. The rationale is that KVM will kick
> back the guest with an "async page fault" and allow for some other
> guest process to take over.
> 
> If async PFs are enabled the fault is retried asap from an async
> workqueue. If not, it's retried immediately in the same code path. In
> either case the retry will not relinquish the mmap semaphore and will
> block on the IO. This is a bad thing, as other mmap semaphore users
> now stall as a function of swap or filemap latency.
> 
> This patch ensures both the regular and async PF path re-enter the
> fault allowing for the mmap semaphore to be relinquished in the case
> of IO wait.
> 
Reviewed-by: Gleb Natapov <gleb@kernel.org>

> Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
> Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>
> 
> ---
> v1 -> v2
> 
> * WARN_ON_ONCE -> VM_WARN_ON_ONCE
> * pagep == NULL skips the final retry
> * kvm_gup_retry -> kvm_gup_io
> * Comment updates throughout
> ---
>  include/linux/kvm_host.h | 11 +++++++++++
>  include/linux/mm.h       |  1 +
>  mm/gup.c                 |  4 ++++
>  virt/kvm/async_pf.c      |  4 +---
>  virt/kvm/kvm_main.c      | 49 +++++++++++++++++++++++++++++++++++++++++++++---
>  5 files changed, 63 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3addcbc..4c1991b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -198,6 +198,17 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
>  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  #endif
>  
> +/*
> + * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
> + * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
> + * controls whether we retry the gup one more time to completion in that case.
> + * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
> + * handler.
> + */
> +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
> +			 unsigned long addr, bool write_fault,
> +			 struct page **pagep);
> +
>  enum {
>  	OUTSIDE_GUEST_MODE,
>  	IN_GUEST_MODE,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ebc5f90..13e585f7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2011,6 +2011,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
>  #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
>  #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
>  #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
> +#define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
>  
>  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>  			void *data);
> diff --git a/mm/gup.c b/mm/gup.c
> index 91d044b..af7ea3e 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -281,6 +281,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
>  		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
>  	if (*flags & FOLL_NOWAIT)
>  		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
> +	if (*flags & FOLL_TRIED) {
> +		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
> +		fault_flags |= FAULT_FLAG_TRIED;
> +	}
>  
>  	ret = handle_mm_fault(mm, vma, address, fault_flags);
>  	if (ret & VM_FAULT_ERROR) {
> diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
> index d6a3d09..5ff7f7f 100644
> --- a/virt/kvm/async_pf.c
> +++ b/virt/kvm/async_pf.c
> @@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
>  
>  	might_sleep();
>  
> -	down_read(&mm->mmap_sem);
> -	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
> -	up_read(&mm->mmap_sem);
> +	kvm_get_user_page_io(NULL, mm, addr, 1, NULL);
>  	kvm_async_page_present_sync(vcpu, apf);
>  
>  	spin_lock(&vcpu->async_pf.lock);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 7ef6b48..fa8a565 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1115,6 +1115,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
>  	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
>  }
>  
> +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
> +			 unsigned long addr, bool write_fault,
> +			 struct page **pagep)
> +{
> +	int npages;
> +	int locked = 1;
> +	int flags = FOLL_TOUCH | FOLL_HWPOISON |
> +		    (pagep ? FOLL_GET : 0) |
> +		    (write_fault ? FOLL_WRITE : 0);
> +
> +	/*
> +	 * If retrying the fault, we get here *not* having allowed the filemap
> +	 * to wait on the page lock. We should now allow waiting on the IO with
> +	 * the mmap semaphore released.
> +	 */
> +	down_read(&mm->mmap_sem);
> +	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
> +				  &locked);
> +	if (!locked) {
> +		VM_BUG_ON(npages != -EBUSY);
> +
> +		if (!pagep)
> +			return 0;
> +
> +		/*
> +		 * The previous call has now waited on the IO. Now we can
> +		 * retry and complete. Pass TRIED to ensure we do not re
> +		 * schedule async IO (see e.g. filemap_fault).
> +		 */
> +		down_read(&mm->mmap_sem);
> +		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
> +					  pagep, NULL, NULL);
> +	}
> +	up_read(&mm->mmap_sem);
> +	return npages;
> +}
> +
>  static inline int check_user_page_hwpoison(unsigned long addr)
>  {
>  	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
> @@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>  		npages = get_user_page_nowait(current, current->mm,
>  					      addr, write_fault, page);
>  		up_read(&current->mm->mmap_sem);
> -	} else
> -		npages = get_user_pages_fast(addr, 1, write_fault,
> -					     page);
> +	} else {
> +		/*
> +		 * By now we have tried gup_fast, and possibly async_pf, and we
> +		 * are certainly not atomic. Time to retry the gup, allowing
> +		 * mmap semaphore to be relinquished in the case of IO.
> +		 */
> +		npages = kvm_get_user_page_io(current, current->mm, addr,
> +					      write_fault, page);
> +	}
>  	if (npages != 1)
>  		return npages;
>  
> -- 
> 2.1.0.rc2.206.gedb03e5
> 

--
			Gleb.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-18  6:15     ` Gleb Natapov
  0 siblings, 0 replies; 70+ messages in thread
From: Gleb Natapov @ 2014-09-18  6:15 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Radim Krcmar, Paolo Bonzini, Rik van Riel, Peter Zijlstra,
	Mel Gorman, Andy Lutomirski, Andrew Morton, Andrea Arcangeli,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm

On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
> When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory
> has been swapped out or is behind a filemap, this will trigger async
> readahead and return immediately. The rationale is that KVM will kick
> back the guest with an "async page fault" and allow for some other
> guest process to take over.
> 
> If async PFs are enabled the fault is retried asap from an async
> workqueue. If not, it's retried immediately in the same code path. In
> either case the retry will not relinquish the mmap semaphore and will
> block on the IO. This is a bad thing, as other mmap semaphore users
> now stall as a function of swap or filemap latency.
> 
> This patch ensures both the regular and async PF path re-enter the
> fault allowing for the mmap semaphore to be relinquished in the case
> of IO wait.
> 
Reviewed-by: Gleb Natapov <gleb@kernel.org>

> Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
> Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>
> 
> ---
> v1 -> v2
> 
> * WARN_ON_ONCE -> VM_WARN_ON_ONCE
> * pagep == NULL skips the final retry
> * kvm_gup_retry -> kvm_gup_io
> * Comment updates throughout
> ---
>  include/linux/kvm_host.h | 11 +++++++++++
>  include/linux/mm.h       |  1 +
>  mm/gup.c                 |  4 ++++
>  virt/kvm/async_pf.c      |  4 +---
>  virt/kvm/kvm_main.c      | 49 +++++++++++++++++++++++++++++++++++++++++++++---
>  5 files changed, 63 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3addcbc..4c1991b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -198,6 +198,17 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
>  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  #endif
>  
> +/*
> + * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
> + * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
> + * controls whether we retry the gup one more time to completion in that case.
> + * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
> + * handler.
> + */
> +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
> +			 unsigned long addr, bool write_fault,
> +			 struct page **pagep);
> +
>  enum {
>  	OUTSIDE_GUEST_MODE,
>  	IN_GUEST_MODE,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ebc5f90..13e585f7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2011,6 +2011,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
>  #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
>  #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
>  #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
> +#define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
>  
>  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
>  			void *data);
> diff --git a/mm/gup.c b/mm/gup.c
> index 91d044b..af7ea3e 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -281,6 +281,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
>  		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
>  	if (*flags & FOLL_NOWAIT)
>  		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
> +	if (*flags & FOLL_TRIED) {
> +		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
> +		fault_flags |= FAULT_FLAG_TRIED;
> +	}
>  
>  	ret = handle_mm_fault(mm, vma, address, fault_flags);
>  	if (ret & VM_FAULT_ERROR) {
> diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
> index d6a3d09..5ff7f7f 100644
> --- a/virt/kvm/async_pf.c
> +++ b/virt/kvm/async_pf.c
> @@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work)
>  
>  	might_sleep();
>  
> -	down_read(&mm->mmap_sem);
> -	get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL);
> -	up_read(&mm->mmap_sem);
> +	kvm_get_user_page_io(NULL, mm, addr, 1, NULL);
>  	kvm_async_page_present_sync(vcpu, apf);
>  
>  	spin_lock(&vcpu->async_pf.lock);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 7ef6b48..fa8a565 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1115,6 +1115,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
>  	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
>  }
>  
> +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
> +			 unsigned long addr, bool write_fault,
> +			 struct page **pagep)
> +{
> +	int npages;
> +	int locked = 1;
> +	int flags = FOLL_TOUCH | FOLL_HWPOISON |
> +		    (pagep ? FOLL_GET : 0) |
> +		    (write_fault ? FOLL_WRITE : 0);
> +
> +	/*
> +	 * If retrying the fault, we get here *not* having allowed the filemap
> +	 * to wait on the page lock. We should now allow waiting on the IO with
> +	 * the mmap semaphore released.
> +	 */
> +	down_read(&mm->mmap_sem);
> +	npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
> +				  &locked);
> +	if (!locked) {
> +		VM_BUG_ON(npages != -EBUSY);
> +
> +		if (!pagep)
> +			return 0;
> +
> +		/*
> +		 * The previous call has now waited on the IO. Now we can
> +		 * retry and complete. Pass TRIED to ensure we do not re
> +		 * schedule async IO (see e.g. filemap_fault).
> +		 */
> +		down_read(&mm->mmap_sem);
> +		npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
> +					  pagep, NULL, NULL);
> +	}
> +	up_read(&mm->mmap_sem);
> +	return npages;
> +}
> +
>  static inline int check_user_page_hwpoison(unsigned long addr)
>  {
>  	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
> @@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>  		npages = get_user_page_nowait(current, current->mm,
>  					      addr, write_fault, page);
>  		up_read(&current->mm->mmap_sem);
> -	} else
> -		npages = get_user_pages_fast(addr, 1, write_fault,
> -					     page);
> +	} else {
> +		/*
> +		 * By now we have tried gup_fast, and possibly async_pf, and we
> +		 * are certainly not atomic. Time to retry the gup, allowing
> +		 * mmap semaphore to be relinquished in the case of IO.
> +		 */
> +		npages = kvm_get_user_page_io(current, current->mm, addr,
> +					      write_fault, page);
> +	}
>  	if (npages != 1)
>  		return npages;
>  
> -- 
> 2.1.0.rc2.206.gedb03e5
> 

--
			Gleb.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-18  6:13       ` Gleb Natapov
@ 2014-09-19  0:32         ` Wanpeng Li
  -1 siblings, 0 replies; 70+ messages in thread
From: Wanpeng Li @ 2014-09-19  0:32 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Andres Lagar-Cavilla, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Andrea Arcangeli, Sasha Levin, Jianyu Zhan, Paul Cassella,
	Hugh Dickins, Peter Feiner, kvm, linux-kernel, linux-mm

On Thu, Sep 18, 2014 at 09:13:26AM +0300, Gleb Natapov wrote:
>On Thu, Sep 18, 2014 at 08:29:17AM +0800, Wanpeng Li wrote:
>> Hi Andres,
>> On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
>> [...]
>> > static inline int check_user_page_hwpoison(unsigned long addr)
>> > {
>> > 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
>> >@@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>> > 		npages = get_user_page_nowait(current, current->mm,
>> > 					      addr, write_fault, page);
>> > 		up_read(&current->mm->mmap_sem);
>> >-	} else
>> >-		npages = get_user_pages_fast(addr, 1, write_fault,
>> >-					     page);
>> >+	} else {
>> >+		/*
>> >+		 * By now we have tried gup_fast, and possibly async_pf, and we
>> >+		 * are certainly not atomic. Time to retry the gup, allowing
>> >+		 * mmap semaphore to be relinquished in the case of IO.
>> >+		 */
>> >+		npages = kvm_get_user_page_io(current, current->mm, addr,
>> >+					      write_fault, page);
>> >+	}
>> 
>> try_async_pf 
>>  gfn_to_pfn_async 
>>   __gfn_to_pfn  			async = false 
>                                        *async = false
>
>>    __gfn_to_pfn_memslot
>>     hva_to_pfn 
>> 	 hva_to_pfn_fast 
>> 	 hva_to_pfn_slow 
>hva_to_pfn_slow checks async not *async.

Got it, thanks for pointing that out.

Reviewed-by: Wanpeng Li <wanpeng.li@linux.intel.com>

Regards,
Wanpeng Li 

>
>> 	  kvm_get_user_page_io
>> 
>> page will always be ready after kvm_get_user_page_io, which means the
>> APF doesn't need to do any work any more.
>> 
>> Regards,
>> Wanpeng Li
>> 
>> > 	if (npages != 1)
>> > 		return npages;
>> > 
>> >-- 
>> >2.1.0.rc2.206.gedb03e5
>> >
>> >--
>> >To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> >the body to majordomo@kvack.org.  For more info on Linux MM,
>> >see: http://www.linux-mm.org/ .
>> >Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
>--
>			Gleb.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-19  0:32         ` Wanpeng Li
  0 siblings, 0 replies; 70+ messages in thread
From: Wanpeng Li @ 2014-09-19  0:32 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Andres Lagar-Cavilla, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Andrea Arcangeli, Sasha Levin, Jianyu Zhan, Paul Cassella,
	Hugh Dickins, Peter Feiner, kvm, linux-kernel, linux-mm

On Thu, Sep 18, 2014 at 09:13:26AM +0300, Gleb Natapov wrote:
>On Thu, Sep 18, 2014 at 08:29:17AM +0800, Wanpeng Li wrote:
>> Hi Andres,
>> On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
>> [...]
>> > static inline int check_user_page_hwpoison(unsigned long addr)
>> > {
>> > 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
>> >@@ -1177,9 +1214,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>> > 		npages = get_user_page_nowait(current, current->mm,
>> > 					      addr, write_fault, page);
>> > 		up_read(&current->mm->mmap_sem);
>> >-	} else
>> >-		npages = get_user_pages_fast(addr, 1, write_fault,
>> >-					     page);
>> >+	} else {
>> >+		/*
>> >+		 * By now we have tried gup_fast, and possibly async_pf, and we
>> >+		 * are certainly not atomic. Time to retry the gup, allowing
>> >+		 * mmap semaphore to be relinquished in the case of IO.
>> >+		 */
>> >+		npages = kvm_get_user_page_io(current, current->mm, addr,
>> >+					      write_fault, page);
>> >+	}
>> 
>> try_async_pf 
>>  gfn_to_pfn_async 
>>   __gfn_to_pfn  			async = false 
>                                        *async = false
>
>>    __gfn_to_pfn_memslot
>>     hva_to_pfn 
>> 	 hva_to_pfn_fast 
>> 	 hva_to_pfn_slow 
>hva_to_pfn_slow checks async not *async.

Got it, thanks for pointing that out.

Reviewed-by: Wanpeng Li <wanpeng.li@linux.intel.com>

Regards,
Wanpeng Li 

>
>> 	  kvm_get_user_page_io
>> 
>> page will always be ready after kvm_get_user_page_io, which means the
>> APF doesn't need to do any work any more.
>> 
>> Regards,
>> Wanpeng Li
>> 
>> > 	if (npages != 1)
>> > 		return npages;
>> > 
>> >-- 
>> >2.1.0.rc2.206.gedb03e5
>> >
>> >--
>> >To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> >the body to majordomo@kvack.org.  For more info on Linux MM,
>> >see: http://www.linux-mm.org/ .
>> >Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
>--
>			Gleb.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-19  0:32         ` Wanpeng Li
@ 2014-09-19  3:58           ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-19  3:58 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Andrew Morton, Andrea Arcangeli, kvm, linux-kernel, linux-mm

On Thu, Sep 18, 2014 at 5:32 PM, Wanpeng Li <wanpeng.li@linux.intel.com> wrote:
> On Thu, Sep 18, 2014 at 09:13:26AM +0300, Gleb Natapov wrote:
>>On Thu, Sep 18, 2014 at 08:29:17AM +0800, Wanpeng Li wrote:
>>> Hi Andres,
>>> On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
>>> [...]
>>> > static inline int check_user_page_hwpoison(unsigned long addr)
>>> > {
>>> >    int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
> Got it, thanks for pointing that out.
>
> Reviewed-by: Wanpeng Li <wanpeng.li@linux.intel.com>
>
> Regards,
> Wanpeng Li
>
Thanks.

Paolo, should I recut including the recent Reviewed-by's?

Thanks
Andres
ps: shrunk cc

>>
>>>        kvm_get_user_page_io
>>>
>>> page will always be ready after kvm_get_user_page_io, which means the
>>> APF doesn't need to do any work any more.
>>>
>>> Regards,
>>> Wanpeng Li
>>>
>>> >    if (npages != 1)
>>> >            return npages;
>>> >
>>> >--
>>> >2.1.0.rc2.206.gedb03e5
>>> >
>>> >--
>>> >To unsubscribe, send a message with 'unsubscribe linux-mm' in
>>> >the body to majordomo@kvack.org.  For more info on Linux MM,
>>> >see: http://www.linux-mm.org/ .
>>> >Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>>
>>--
>>                       Gleb.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-19  3:58           ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-19  3:58 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Andrew Morton, Andrea Arcangeli, kvm, linux-kernel, linux-mm

On Thu, Sep 18, 2014 at 5:32 PM, Wanpeng Li <wanpeng.li@linux.intel.com> wrote:
> On Thu, Sep 18, 2014 at 09:13:26AM +0300, Gleb Natapov wrote:
>>On Thu, Sep 18, 2014 at 08:29:17AM +0800, Wanpeng Li wrote:
>>> Hi Andres,
>>> On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
>>> [...]
>>> > static inline int check_user_page_hwpoison(unsigned long addr)
>>> > {
>>> >    int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
> Got it, thanks for pointing that out.
>
> Reviewed-by: Wanpeng Li <wanpeng.li@linux.intel.com>
>
> Regards,
> Wanpeng Li
>
Thanks.

Paolo, should I recut including the recent Reviewed-by's?

Thanks
Andres
ps: shrunk cc

>>
>>>        kvm_get_user_page_io
>>>
>>> page will always be ready after kvm_get_user_page_io, which means the
>>> APF doesn't need to do any work any more.
>>>
>>> Regards,
>>> Wanpeng Li
>>>
>>> >    if (npages != 1)
>>> >            return npages;
>>> >
>>> >--
>>> >2.1.0.rc2.206.gedb03e5
>>> >
>>> >--
>>> >To unsubscribe, send a message with 'unsubscribe linux-mm' in
>>> >the body to majordomo@kvack.org.  For more info on Linux MM,
>>> >see: http://www.linux-mm.org/ .
>>> >Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>>
>>--
>>                       Gleb.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-19  3:58           ` Andres Lagar-Cavilla
@ 2014-09-19  6:08             ` Paolo Bonzini
  -1 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-19  6:08 UTC (permalink / raw)
  To: Andres Lagar-Cavilla, Wanpeng Li
  Cc: Gleb Natapov, Radim Krcmar, Rik van Riel, Andrew Morton,
	Andrea Arcangeli, kvm, linux-kernel, linux-mm

Il 19/09/2014 05:58, Andres Lagar-Cavilla ha scritto:
> Paolo, should I recut including the recent Reviewed-by's?

No, I'll add them myself.

Paolo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-19  6:08             ` Paolo Bonzini
  0 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-19  6:08 UTC (permalink / raw)
  To: Andres Lagar-Cavilla, Wanpeng Li
  Cc: Gleb Natapov, Radim Krcmar, Rik van Riel, Andrew Morton,
	Andrea Arcangeli, kvm, linux-kernel, linux-mm

Il 19/09/2014 05:58, Andres Lagar-Cavilla ha scritto:
> Paolo, should I recut including the recent Reviewed-by's?

No, I'll add them myself.

Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-19  6:08             ` Paolo Bonzini
@ 2014-09-22 20:49               ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-22 20:49 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Wanpeng Li, Gleb Natapov, Radim Krcmar, Rik van Riel,
	Andrew Morton, Andrea Arcangeli, kvm, linux-kernel, linux-mm

On Thu, Sep 18, 2014 at 11:08 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> Il 19/09/2014 05:58, Andres Lagar-Cavilla ha scritto:
> > Paolo, should I recut including the recent Reviewed-by's?
>
> No, I'll add them myself.

Paolo, is this patch waiting for something? Is Gleb's Reviewed-by enough?

Thanks much
Andres

>
>
> Paolo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-22 20:49               ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-22 20:49 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Wanpeng Li, Gleb Natapov, Radim Krcmar, Rik van Riel,
	Andrew Morton, Andrea Arcangeli, kvm, linux-kernel, linux-mm

On Thu, Sep 18, 2014 at 11:08 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> Il 19/09/2014 05:58, Andres Lagar-Cavilla ha scritto:
> > Paolo, should I recut including the recent Reviewed-by's?
>
> No, I'll add them myself.

Paolo, is this patch waiting for something? Is Gleb's Reviewed-by enough?

Thanks much
Andres

>
>
> Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-22 20:49               ` Andres Lagar-Cavilla
@ 2014-09-22 21:32                 ` Paolo Bonzini
  -1 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-22 21:32 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Wanpeng Li, Gleb Natapov, Radim Krcmar, Rik van Riel,
	Andrew Morton, Andrea Arcangeli, kvm, linux-kernel, linux-mm

Il 22/09/2014 22:49, Andres Lagar-Cavilla ha scritto:
>>> > > Paolo, should I recut including the recent Reviewed-by's?
>> >
>> > No, I'll add them myself.
> Paolo, is this patch waiting for something? Is Gleb's Reviewed-by enough?

It's waiting for an Acked-by on the mm/ changes.

Paolo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-22 21:32                 ` Paolo Bonzini
  0 siblings, 0 replies; 70+ messages in thread
From: Paolo Bonzini @ 2014-09-22 21:32 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Wanpeng Li, Gleb Natapov, Radim Krcmar, Rik van Riel,
	Andrew Morton, Andrea Arcangeli, kvm, linux-kernel, linux-mm

Il 22/09/2014 22:49, Andres Lagar-Cavilla ha scritto:
>>> > > Paolo, should I recut including the recent Reviewed-by's?
>> >
>> > No, I'll add them myself.
> Paolo, is this patch waiting for something? Is Gleb's Reviewed-by enough?

It's waiting for an Acked-by on the mm/ changes.

Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-22 21:32                 ` Paolo Bonzini
@ 2014-09-22 21:53                   ` Andrew Morton
  -1 siblings, 0 replies; 70+ messages in thread
From: Andrew Morton @ 2014-09-22 21:53 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Andres Lagar-Cavilla, Wanpeng Li, Gleb Natapov, Radim Krcmar,
	Rik van Riel, Andrea Arcangeli, kvm, linux-kernel, linux-mm

On Mon, 22 Sep 2014 23:32:36 +0200 Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 22/09/2014 22:49, Andres Lagar-Cavilla ha scritto:
> >>> > > Paolo, should I recut including the recent Reviewed-by's?
> >> >
> >> > No, I'll add them myself.
> > Paolo, is this patch waiting for something? Is Gleb's Reviewed-by enough?
> 
> It's waiting for an Acked-by on the mm/ changes.
> 

The MM changes look good to me.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-22 21:53                   ` Andrew Morton
  0 siblings, 0 replies; 70+ messages in thread
From: Andrew Morton @ 2014-09-22 21:53 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Andres Lagar-Cavilla, Wanpeng Li, Gleb Natapov, Radim Krcmar,
	Rik van Riel, Andrea Arcangeli, kvm, linux-kernel, linux-mm

On Mon, 22 Sep 2014 23:32:36 +0200 Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 22/09/2014 22:49, Andres Lagar-Cavilla ha scritto:
> >>> > > Paolo, should I recut including the recent Reviewed-by's?
> >> >
> >> > No, I'll add them myself.
> > Paolo, is this patch waiting for something? Is Gleb's Reviewed-by enough?
> 
> It's waiting for an Acked-by on the mm/ changes.
> 

The MM changes look good to me.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-17 17:51   ` Andres Lagar-Cavilla
@ 2014-09-25 21:16     ` Andrea Arcangeli
  -1 siblings, 0 replies; 70+ messages in thread
From: Andrea Arcangeli @ 2014-09-25 21:16 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm,
	Dr. David Alan Gilbert

Hi Andres,

On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
> +	if (!locked) {
> +		VM_BUG_ON(npages != -EBUSY);
> +

Shouldn't this be VM_BUG_ON(npages)?

Alternatively we could patch gup to do:

			case -EHWPOISON:
+			case -EBUSY:
				return i ? i : ret;
-			case -EBUSY:
-				return i;

I need to fix gup_fast slow path to start with FAULT_FLAG_ALLOW_RETRY
similarly to what you did to the KVM slow path.

gup_fast is called without the mmap_sem (incidentally its whole point
is to only disable irqs and not take the locks) so the enabling of
FAULT_FLAG_ALLOW_RETRY initial pass inside gup_fast should be all self
contained. It shouldn't concern KVM which should be already fine with
your patch, but it will allow the userfaultfd to intercept all
O_DIRECT gup_fast in addition to KVM with your patch.

Eventually get_user_pages should be obsoleted in favor of
get_user_pages_locked (or whoever we decide to call it) so the
userfaultfd can intercept all kind of gups. gup_locked is same as gup
except for one more "locked" parameter at the end, I called the
parameter locked instead of nonblocking because it'd be more proper to
call "nonblocking" gup the FOLL_NOWAIT kind which is quite the
opposite (in fact as the mmap_sem cannot be dropped in the non
blocking version).

ptrace ironically is better off sticking with a NULL locked parameter
and to get a sigbus instead of risking hanging on the userfaultfd
(which would be safe as it can be killed, but it'd be annoying if
erroneously poking into a hole during a gdb session). It's still
possible to pass NULL as parameter to get_user_pages_locked to achieve
that. So the fact some callers won't block in handle_userfault because
FAULT_FLAG_ALLOW_RETRY is not set and the userfault cannot block, may
come handy.

What I'm trying to solve in this context is that the userfault cannot
safely block without FAULT_FLAG_ALLOW_RETRY because we can't allow
userland to take the mmap_sem for an unlimited amount of time without
requiring special privileges, so if handle_userfault wants to blocks
within a gup invocation, it must first release the mmap_sem hence
FAULT_FLAG_ALLOW_RETRY is always required at the first attempt for any
virtual address.

With regard to the last sentence, there's actually a race with
MADV_DONTNEED too, I'd need to change the code to always pass
FAULT_FLAG_ALLOW_RETRY (your code also would need to loop and
insisting with the __get_user_pages(locked) version to solve it). The
KVM ioctl worst case would get an -EFAULT if the race materializes for
example. It's not concerning, though, because that can be solved in
userland somehow by separating ballooning and live migration
activities.

Thanks,
Andrea

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-25 21:16     ` Andrea Arcangeli
  0 siblings, 0 replies; 70+ messages in thread
From: Andrea Arcangeli @ 2014-09-25 21:16 UTC (permalink / raw)
  To: Andres Lagar-Cavilla
  Cc: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm,
	Dr. David Alan Gilbert

Hi Andres,

On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
> +	if (!locked) {
> +		VM_BUG_ON(npages != -EBUSY);
> +

Shouldn't this be VM_BUG_ON(npages)?

Alternatively we could patch gup to do:

			case -EHWPOISON:
+			case -EBUSY:
				return i ? i : ret;
-			case -EBUSY:
-				return i;

I need to fix gup_fast slow path to start with FAULT_FLAG_ALLOW_RETRY
similarly to what you did to the KVM slow path.

gup_fast is called without the mmap_sem (incidentally its whole point
is to only disable irqs and not take the locks) so the enabling of
FAULT_FLAG_ALLOW_RETRY initial pass inside gup_fast should be all self
contained. It shouldn't concern KVM which should be already fine with
your patch, but it will allow the userfaultfd to intercept all
O_DIRECT gup_fast in addition to KVM with your patch.

Eventually get_user_pages should be obsoleted in favor of
get_user_pages_locked (or whoever we decide to call it) so the
userfaultfd can intercept all kind of gups. gup_locked is same as gup
except for one more "locked" parameter at the end, I called the
parameter locked instead of nonblocking because it'd be more proper to
call "nonblocking" gup the FOLL_NOWAIT kind which is quite the
opposite (in fact as the mmap_sem cannot be dropped in the non
blocking version).

ptrace ironically is better off sticking with a NULL locked parameter
and to get a sigbus instead of risking hanging on the userfaultfd
(which would be safe as it can be killed, but it'd be annoying if
erroneously poking into a hole during a gdb session). It's still
possible to pass NULL as parameter to get_user_pages_locked to achieve
that. So the fact some callers won't block in handle_userfault because
FAULT_FLAG_ALLOW_RETRY is not set and the userfault cannot block, may
come handy.

What I'm trying to solve in this context is that the userfault cannot
safely block without FAULT_FLAG_ALLOW_RETRY because we can't allow
userland to take the mmap_sem for an unlimited amount of time without
requiring special privileges, so if handle_userfault wants to blocks
within a gup invocation, it must first release the mmap_sem hence
FAULT_FLAG_ALLOW_RETRY is always required at the first attempt for any
virtual address.

With regard to the last sentence, there's actually a race with
MADV_DONTNEED too, I'd need to change the code to always pass
FAULT_FLAG_ALLOW_RETRY (your code also would need to loop and
insisting with the __get_user_pages(locked) version to solve it). The
KVM ioctl worst case would get an -EFAULT if the race materializes for
example. It's not concerning, though, because that can be solved in
userland somehow by separating ballooning and live migration
activities.

Thanks,
Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
  2014-09-25 21:16     ` Andrea Arcangeli
@ 2014-09-25 21:50       ` Andres Lagar-Cavilla
  -1 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-25 21:50 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm,
	Dr. David Alan Gilbert

On Thu, Sep 25, 2014 at 2:16 PM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> Hi Andres,
>
> On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
>> +     if (!locked) {
>> +             VM_BUG_ON(npages != -EBUSY);
>> +
>
> Shouldn't this be VM_BUG_ON(npages)?

Oh shoot you're right. I was confused by the introduction of -EBUSY in
the forward port.

        if (ret & VM_FAULT_RETRY) {
                if (nonblocking)
                        *nonblocking = 0;
                return -EBUSY;
        }

(gaaah!!!)

>
> Alternatively we could patch gup to do:
>
>                         case -EHWPOISON:
> +                       case -EBUSY:
>                                 return i ? i : ret;
> -                       case -EBUSY:
> -                               return i;
>

No preference. Not a lot of semantics available given that we pass 1
as the count to gup. Want to cut the patch or I can just shoot one
right away?

> I need to fix gup_fast slow path to start with FAULT_FLAG_ALLOW_RETRY
> similarly to what you did to the KVM slow path.
>
> gup_fast is called without the mmap_sem (incidentally its whole point
> is to only disable irqs and not take the locks) so the enabling of
> FAULT_FLAG_ALLOW_RETRY initial pass inside gup_fast should be all self
> contained. It shouldn't concern KVM which should be already fine with
> your patch, but it will allow the userfaultfd to intercept all
> O_DIRECT gup_fast in addition to KVM with your patch.
>
> Eventually get_user_pages should be obsoleted in favor of
> get_user_pages_locked (or whoever we decide to call it) so the
> userfaultfd can intercept all kind of gups. gup_locked is same as gup
> except for one more "locked" parameter at the end, I called the
> parameter locked instead of nonblocking because it'd be more proper to
> call "nonblocking" gup the FOLL_NOWAIT kind which is quite the
> opposite (in fact as the mmap_sem cannot be dropped in the non
> blocking version).
>

It's nearly impossible to name it right because 1) it indicates we can
relinquish 2) it returns whether we still hold the mmap semaphore.

I'd prefer it'd be called mmap_sem_hold, which conveys immediately
what this is about ("nonblocking" or "locked" could be about a whole
lot of things)

> ptrace ironically is better off sticking with a NULL locked parameter
> and to get a sigbus instead of risking hanging on the userfaultfd
> (which would be safe as it can be killed, but it'd be annoying if
> erroneously poking into a hole during a gdb session). It's still
> possible to pass NULL as parameter to get_user_pages_locked to achieve
> that. So the fact some callers won't block in handle_userfault because
> FAULT_FLAG_ALLOW_RETRY is not set and the userfault cannot block, may
> come handy.
>
> What I'm trying to solve in this context is that the userfault cannot
> safely block without FAULT_FLAG_ALLOW_RETRY because we can't allow
> userland to take the mmap_sem for an unlimited amount of time without
> requiring special privileges, so if handle_userfault wants to blocks
> within a gup invocation, it must first release the mmap_sem hence
> FAULT_FLAG_ALLOW_RETRY is always required at the first attempt for any
> virtual address.

I can see that. My background for coming into this is very similar: in
a previous life we had a file system shim that would kick up into
userspace for servicing VM memory. KVM just wouldn't let the file
system give up the mmap semaphore. We had /proc readers hanging up all
over the place while userspace was servicing. Not happy.

With KVM (now) and the standard x86 fault giving you ALLOW_RETRY, what
stands in your way? Methinks that gup_fast has no slowpath fallback
that turns on ALLOW_RETRY. What would oppose that being the global
behavior?

>
> With regard to the last sentence, there's actually a race with
> MADV_DONTNEED too, I'd need to change the code to always pass
> FAULT_FLAG_ALLOW_RETRY (your code also would need to loop and
> insisting with the __get_user_pages(locked) version to solve it). The
> KVM ioctl worst case would get an -EFAULT if the race materializes for
> example. It's non concerning though because that can be solved in
> userland somehow by separating ballooning and live migration
> activities.

Well, IIUC every code path that has ALLOW_RETRY dives in the second
time with FAULT_TRIED or similar. In the common case, you happily
blaze through the second time, but if someone raced in while all locks
were given up, one pays the price of the second time being a full
fault hogging the mmap sem. At some point you need to not keep being
polite otherwise the task starves. Presumably the risk of an extra
retry drops steeply every new gup retry. Maybe just try three times is
good enough. It makes for ugly logic though.

Thanks
Andres

>
> Thanks,
> Andrea



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v2] kvm: Faults which trigger IO release the mmap_sem
@ 2014-09-25 21:50       ` Andres Lagar-Cavilla
  0 siblings, 0 replies; 70+ messages in thread
From: Andres Lagar-Cavilla @ 2014-09-25 21:50 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Gleb Natapov, Radim Krcmar, Paolo Bonzini, Rik van Riel,
	Peter Zijlstra, Mel Gorman, Andy Lutomirski, Andrew Morton,
	Sasha Levin, Jianyu Zhan, Paul Cassella, Hugh Dickins,
	Peter Feiner, kvm, linux-kernel, linux-mm,
	Dr. David Alan Gilbert

On Thu, Sep 25, 2014 at 2:16 PM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> Hi Andres,
>
> On Wed, Sep 17, 2014 at 10:51:48AM -0700, Andres Lagar-Cavilla wrote:
>> +     if (!locked) {
>> +             VM_BUG_ON(npages != -EBUSY);
>> +
>
> Shouldn't this be VM_BUG_ON(npages)?

Oh shoot you're right. I was confused by the introduction of -EBUSY in
the forward port.

        if (ret & VM_FAULT_RETRY) {
                if (nonblocking)
                        *nonblocking = 0;
                return -EBUSY;
        }

(gaaah!!!)

>
> Alternatively we could patch gup to do:
>
>                         case -EHWPOISON:
> +                       case -EBUSY:
>                                 return i ? i : ret;
> -                       case -EBUSY:
> -                               return i;
>

No preference. Not a lot of semantics available given that we pass 1
as the count to gup. Want to cut the patch or I can just shoot one
right away?

> I need to fix gup_fast slow path to start with FAULT_FLAG_ALLOW_RETRY
> similarly to what you did to the KVM slow path.
>
> gup_fast is called without the mmap_sem (incidentally its whole point
> is to only disable irqs and not take the locks) so the enabling of
> FAULT_FLAG_ALLOW_RETRY initial pass inside gup_fast should be all self
> contained. It shouldn't concern KVM which should be already fine with
> your patch, but it will allow the userfaultfd to intercept all
> O_DIRECT gup_fast in addition to KVM with your patch.
>
> Eventually get_user_pages should be obsoleted in favor of
> get_user_pages_locked (or whoever we decide to call it) so the
> userfaultfd can intercept all kind of gups. gup_locked is same as gup
> except for one more "locked" parameter at the end, I called the
> parameter locked instead of nonblocking because it'd be more proper to
> call "nonblocking" gup the FOLL_NOWAIT kind which is quite the
> opposite (in fact as the mmap_sem cannot be dropped in the non
> blocking version).
>

It's nearly impossible to name it right because 1) it indicates we can
relinquish 2) it returns whether we still hold the mmap semaphore.

I'd prefer it'd be called mmap_sem_hold, which conveys immediately
what this is about ("nonblocking" or "locked" could be about a whole
lot of things)

> ptrace ironically is better off sticking with a NULL locked parameter
> and to get a sigbus instead of risking hanging on the userfaultfd
> (which would be safe as it can be killed, but it'd be annoying if
> erroneously poking into a hole during a gdb session). It's still
> possible to pass NULL as parameter to get_user_pages_locked to achieve
> that. So the fact some callers won't block in handle_userfault because
> FAULT_FLAG_ALLOW_RETRY is not set and the userfault cannot block, may
> come handy.
>
> What I'm trying to solve in this context is that the userfault cannot
> safely block without FAULT_FLAG_ALLOW_RETRY because we can't allow
> userland to take the mmap_sem for an unlimited amount of time without
> requiring special privileges, so if handle_userfault wants to block
> within a gup invocation, it must first release the mmap_sem hence
> FAULT_FLAG_ALLOW_RETRY is always required at the first attempt for any
> virtual address.

I can see that. My background for coming into this is very similar: in
a previous life we had a file system shim that would kick up into
userspace for servicing VM memory. KVM just wouldn't let the file
system give up the mmap semaphore. We had /proc readers hanging up all
over the place while userspace was servicing. Not happy.

With KVM (now) and the standard x86 fault giving you ALLOW_RETRY, what
stands in your way? Methinks that gup_fast has no slowpath fallback
that turns on ALLOW_RETRY. What would oppose that being the global
behavior?

>
> With regard to the last sentence, there's actually a race with
> MADV_DONTNEED too, I'd need to change the code to always pass
> FAULT_FLAG_ALLOW_RETRY (your code also would need to loop and
> insisting with the __get_user_pages(locked) version to solve it). The
> KVM ioctl worst case would get an -EFAULT if the race materializes for
> example. It's not concerning, though, because that can be solved in
> userland somehow by separating ballooning and live migration
> activities.

Well, IIUC every code path that has ALLOW_RETRY dives in the second
time with FAULT_TRIED or similar. In the common case, you happily
blaze through the second time, but if someone raced in while all locks
were given up, one pays the price of the second time being a full
fault hogging the mmap sem. At some point you need to not keep being
polite otherwise the task starves. Presumably the risk of an extra
retry drops steeply every new gup retry. Maybe just try three times is
good enough. It makes for ugly logic though.

Thanks
Andres

>
> Thanks,
> Andrea



-- 
Andres Lagar-Cavilla | Google Kernel Team | andreslc@google.com

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 70+ messages in thread

end of thread, other threads:[~2014-09-25 21:50 UTC | newest]

Thread overview: 70+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-15 20:11 [PATCH] kvm: Faults which trigger IO release the mmap_sem Andres Lagar-Cavilla
2014-09-15 20:11 ` Andres Lagar-Cavilla
2014-09-16 13:51 ` Paolo Bonzini
2014-09-16 13:51   ` Paolo Bonzini
2014-09-16 16:52   ` Andres Lagar-Cavilla
2014-09-16 16:55     ` Andres Lagar-Cavilla
2014-09-16 16:55       ` Andres Lagar-Cavilla
2014-09-16 18:29     ` Paolo Bonzini
2014-09-16 18:29       ` Paolo Bonzini
2014-09-16 18:42       ` Andres Lagar-Cavilla
2014-09-16 18:42         ` Andres Lagar-Cavilla
2014-09-17  7:43         ` Paolo Bonzini
2014-09-17  7:43           ` Paolo Bonzini
2014-09-17 16:58           ` Andres Lagar-Cavilla
2014-09-17 16:58             ` Andres Lagar-Cavilla
2014-09-17 20:01             ` Paolo Bonzini
2014-09-17 20:01               ` Paolo Bonzini
2014-09-16 20:51   ` Radim Krčmář
2014-09-16 20:51     ` Radim Krčmář
2014-09-16 20:51     ` Radim Krčmář
2014-09-16 21:01     ` Andres Lagar-Cavilla
2014-09-16 21:01       ` Andres Lagar-Cavilla
2014-09-16 22:34       ` Radim Krčmář
2014-09-16 22:34         ` Radim Krčmář
2014-09-16 22:34         ` Radim Krčmář
2014-09-17  4:15         ` Andres Lagar-Cavilla
2014-09-17  4:15           ` Andres Lagar-Cavilla
2014-09-17 11:35       ` Radim Krčmář
2014-09-17 11:35         ` Radim Krčmář
2014-09-17 11:35         ` Radim Krčmář
2014-09-17 10:26 ` Gleb Natapov
2014-09-17 10:26   ` Gleb Natapov
2014-09-17 11:27   ` Radim Krčmář
2014-09-17 11:27     ` Radim Krčmář
2014-09-17 11:42     ` Gleb Natapov
2014-09-17 11:42       ` Gleb Natapov
2014-09-17 17:00       ` Andres Lagar-Cavilla
2014-09-17 17:00         ` Andres Lagar-Cavilla
2014-09-17 17:08         ` Gleb Natapov
2014-09-17 17:08           ` Gleb Natapov
2014-09-17 17:13           ` Andres Lagar-Cavilla
2014-09-17 17:13             ` Andres Lagar-Cavilla
2014-09-17 17:21             ` Gleb Natapov
2014-09-17 17:21               ` Gleb Natapov
2014-09-17 17:41               ` Andres Lagar-Cavilla
2014-09-17 17:41                 ` Andres Lagar-Cavilla
2014-09-17 17:51 ` [PATCH v2] " Andres Lagar-Cavilla
2014-09-17 17:51   ` Andres Lagar-Cavilla
2014-09-18  0:29   ` Wanpeng Li
2014-09-18  0:29     ` Wanpeng Li
2014-09-18  6:13     ` Gleb Natapov
2014-09-18  6:13       ` Gleb Natapov
2014-09-19  0:32       ` Wanpeng Li
2014-09-19  0:32         ` Wanpeng Li
2014-09-19  3:58         ` Andres Lagar-Cavilla
2014-09-19  3:58           ` Andres Lagar-Cavilla
2014-09-19  6:08           ` Paolo Bonzini
2014-09-19  6:08             ` Paolo Bonzini
2014-09-22 20:49             ` Andres Lagar-Cavilla
2014-09-22 20:49               ` Andres Lagar-Cavilla
2014-09-22 21:32               ` Paolo Bonzini
2014-09-22 21:32                 ` Paolo Bonzini
2014-09-22 21:53                 ` Andrew Morton
2014-09-22 21:53                   ` Andrew Morton
2014-09-18  6:15   ` Gleb Natapov
2014-09-18  6:15     ` Gleb Natapov
2014-09-25 21:16   ` Andrea Arcangeli
2014-09-25 21:16     ` Andrea Arcangeli
2014-09-25 21:50     ` Andres Lagar-Cavilla
2014-09-25 21:50       ` Andres Lagar-Cavilla

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.