All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-01-24 21:23 Jeff Layton
  2017-01-25 13:32 ` [PATCH v3 0/2] " Jeff Layton
  0 siblings, 1 reply; 101+ messages in thread
From: Jeff Layton @ 2017-01-24 21:23 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer

Currently, iov_iter_get_pages_alloc will only ever operate on the first
vector that iterate_all_kinds hands back. Many of the callers however
would like to have as long a set of pages as possible, to allow for
fewer, but larger I/Os.

When the previous vector ends on a page boundary and the current one
begins on one, we can continue to add more pages.

Change the function to first scan the iov_iter to see how long an
array of pages we could create from the current position. Then,
allocate an array that large (or up to the maxsize), and fill that
many pages.

This patch would allow us to rip out a chunk of code in ceph that
tries to do the same thing, but doesn't handle it right.

For NFS, this allows the client to do larger I/Os when userland passes
down an array of page-aligned iovecs in an O_DIRECT request. I imagine
this will also make splice writes into an O_DIRECT file more efficient.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 lib/iov_iter.c | 140 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 111 insertions(+), 29 deletions(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e68604ae3ced..956d17767a3e 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -883,6 +883,58 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_gap_alignment);
 
+/**
+ * iov_iter_pvec_size - find length of page aligned iovecs in iov_iter
+ * @i: iov_iter to in which to find the size
+ *
+ * Some filesystems can stitch together multiple iovecs into a single
+ * page vector when both the previous tail and current base are page
+ * aligned. This function discovers the length that can fit in a single
+ * pagevec and returns it.
+ */
+static size_t iov_iter_pvec_size(const struct iov_iter *i)
+{
+	size_t size = i->count;
+	size_t pv_size = 0;
+	bool contig = false, first = true;
+
+	if (!size)
+		return 0;
+
+	/* Pipes are naturally aligned for this */
+	if (unlikely(i->type & ITER_PIPE))
+		return size;
+
+	/*
+	 * An iov can be page vectored when the current base and previous
+	 * tail are both page aligned. Note that we don't require that the
+	 * initial base in the first iovec also be page aligned.
+	 */
+	iterate_all_kinds(i, size, v,
+		({
+		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+			pv_size += v.iov_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+		 }; 0;
+		 }),
+		({
+		 if (first || (contig && v.bv_offset == 0)) {
+			pv_size += v.bv_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.bv_offset + v.bv_len);
+		 }
+		 }),
+		({
+		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+			pv_size += v.iov_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+		 }
+		 }))
+	return pv_size;
+}
+
 static inline size_t __pipe_get_pages(struct iov_iter *i,
 				size_t maxsize,
 				struct page **pages,
@@ -1006,47 +1058,77 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 }
 
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
-		   struct page ***pages, size_t maxsize,
-		   size_t *start)
+		   struct page ***ppages, size_t maxsize,
+		   size_t *pstart)
 {
-	struct page **p;
-
-	if (maxsize > i->count)
-		maxsize = i->count;
+	struct page **p, **pc;
+	size_t start = 0;
+	ssize_t len = 0;
+	int npages, res = 0;
+	bool first = true;
 
 	if (unlikely(i->type & ITER_PIPE))
-		return pipe_get_pages_alloc(i, pages, maxsize, start);
+		return pipe_get_pages_alloc(i, ppages, maxsize, pstart);
+
+	maxsize = min(iov_iter_pvec_size(i), maxsize);
+	npages = DIV_ROUND_UP(maxsize, PAGE_SIZE);
+	p = get_pages_array(npages);
+	if (!p)
+		return -ENOMEM;
+
+	pc = p;
 	iterate_all_kinds(i, maxsize, v, ({
 		unsigned long addr = (unsigned long)v.iov_base;
-		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+		size_t slen = v.iov_len;
 		int n;
-		int res;
 
-		addr &= ~(PAGE_SIZE - 1);
-		n = DIV_ROUND_UP(len, PAGE_SIZE);
-		p = get_pages_array(n);
-		if (!p)
-			return -ENOMEM;
-		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
-		if (unlikely(res < 0)) {
-			kvfree(p);
-			return res;
+		if (first) {
+			start = addr & (PAGE_SIZE - 1);
+			slen += start;
+			first = false;
 		}
-		*pages = p;
-		return (res == n ? len : res * PAGE_SIZE) - *start;
+
+		n = DIV_ROUND_UP(slen, PAGE_SIZE);
+		if ((pc + n) > (p + npages)) {
+			/* Did something change the iov array?!? */
+			res = -EFAULT;
+			goto out;
+		}
+		addr &= ~(PAGE_SIZE - 1);
+		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pc);
+		if (unlikely(res < 0))
+			goto out;
+		len += (res == n ? slen : res * PAGE_SIZE) - start;
+		pc += res;
 	0;}),({
-		/* can't be more than PAGE_SIZE */
-		*start = v.bv_offset;
-		*pages = p = get_pages_array(1);
-		if (!p)
-			return -ENOMEM;
-		get_page(*p = v.bv_page);
-		return v.bv_len;
+		/* bio_vecs are limited to a single page each */
+		if (first) {
+			start = v.bv_offset;
+			first = false;
+		}
+		get_page(*pc = v.bv_page);
+		len += v.bv_len;
+		++pc;
+		BUG_ON(pc > p + npages);
 	}),({
-		return -EFAULT;
+		/* FIXME: should we handle this case? */
+		res = -EFAULT;
+		goto out;
 	})
 	)
-	return 0;
+out:
+	if (unlikely(res < 0)) {
+		struct page **i;
+
+		for (i = p; i < pc; i++)
+			put_page(*i);
+		kvfree(p);
+		return res;
+	}
+
+	*ppages = p;
+	*pstart = start;
+	return len;
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
 
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 101+ messages in thread

* [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-01-24 21:23 [PATCH] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call Jeff Layton
@ 2017-01-25 13:32 ` Jeff Layton
  2017-01-25 13:32   ` [PATCH v3 1/2] " Jeff Layton
                     ` (2 more replies)
  0 siblings, 3 replies; 101+ messages in thread
From: Jeff Layton @ 2017-01-25 13:32 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer

Small respin of the patch that I sent yesterday for the same thing.

This moves the maxsize handling into iov_iter_pvec_size, so that we don't
end up iterating past the max size we'll use anyway when trying to
determine the pagevec length.

Also, a respun patch to make ceph use iov_iter_get_pages_alloc instead of
trying to do it via its own routine.

Al, if these look ok, do you want to pick these up or shall I ask
Ilya to merge them via the ceph tree?

Jeff Layton (2):
  iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per
    call
  ceph: switch DIO code to use iov_iter_get_pages_alloc

 fs/ceph/file.c |  75 ++----------------------------
 lib/iov_iter.c | 142 +++++++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 116 insertions(+), 101 deletions(-)

-- 
2.9.3

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [PATCH v3 1/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-01-25 13:32 ` [PATCH v3 0/2] " Jeff Layton
@ 2017-01-25 13:32   ` Jeff Layton
  2017-01-26 12:35     ` Jeff Layton
  2017-01-25 13:32   ` [PATCH v3 " Jeff Layton
  2017-02-02  9:51     ` Al Viro
  2 siblings, 1 reply; 101+ messages in thread
From: Jeff Layton @ 2017-01-25 13:32 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer

Currently, iov_iter_get_pages_alloc will only ever operate on the first
vector that iterate_all_kinds hands back. Most of the callers however
would like to have as long a set of pages as possible, to allow for
fewer, but larger I/Os.

When the previous vector ends on a page boundary and the current one
begins on one, we can continue to add more pages.

Change the function to first scan the iov_iter to see how long an
array of pages we could create from the current position up to the
maxsize passed in. Then, allocate an array that large and start
filling in that many pages.

The main impetus for this patch is to rip out a swath of code in ceph
that tries to do this but that doesn't handle ITER_BVEC correctly.

NFS also uses this function and this also allows the client to do larger
I/Os when userland passes down an array of page-aligned iovecs in an
O_DIRECT request. This should also make splice writes into an O_DIRECT
file on NFS use larger I/Os, since that's now done by passing down an
ITER_BVEC representing the data to be written.

I believe the other callers (lustre and 9p) may also benefit from this
change, but I don't have a great way to verify that.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 lib/iov_iter.c | 142 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 113 insertions(+), 29 deletions(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e68604ae3ced..c87ba154371a 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -883,6 +883,59 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_gap_alignment);
 
+/**
+ * iov_iter_pvec_size - find length of page aligned iovecs in iov_iter
+ * @i: iov_iter to in which to find the size
+ * @maxsize: maximum size to return
+ *
+ * Some filesystems can stitch together multiple iovecs into a single page
+ * vector when both the previous tail and current base are page aligned. This
+ * function determines how much of the remaining iov_iter can be stuffed into
+ * a single pagevec, up to the provided maxsize value.
+ */
+static size_t iov_iter_pvec_size(const struct iov_iter *i, size_t maxsize)
+{
+	size_t size = min(iov_iter_count(i), maxsize);
+	size_t pv_size = 0;
+	bool contig = false, first = true;
+
+	if (!size)
+		return 0;
+
+	/* Pipes are naturally aligned for this */
+	if (unlikely(i->type & ITER_PIPE))
+		return size;
+
+	/*
+	 * An iov can be page vectored when the current base and previous
+	 * tail are both page aligned. Note that we don't require that the
+	 * initial base in the first iovec also be page aligned.
+	 */
+	iterate_all_kinds(i, size, v,
+		({
+		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+			pv_size += v.iov_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+		 }; 0;
+		 }),
+		({
+		 if (first || (contig && v.bv_offset == 0)) {
+			pv_size += v.bv_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.bv_offset + v.bv_len);
+		 }
+		 }),
+		({
+		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+			pv_size += v.iov_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+		 }
+		 }))
+	return pv_size;
+}
+
 static inline size_t __pipe_get_pages(struct iov_iter *i,
 				size_t maxsize,
 				struct page **pages,
@@ -1006,47 +1059,78 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 }
 
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
-		   struct page ***pages, size_t maxsize,
-		   size_t *start)
+		   struct page ***ppages, size_t maxsize,
+		   size_t *pstart)
 {
-	struct page **p;
-
-	if (maxsize > i->count)
-		maxsize = i->count;
+	struct page **p, **pc;
+	size_t start = 0;
+	ssize_t len = 0;
+	int npages, res = 0;
+	bool first = true;
 
 	if (unlikely(i->type & ITER_PIPE))
-		return pipe_get_pages_alloc(i, pages, maxsize, start);
+		return pipe_get_pages_alloc(i, ppages, maxsize, pstart);
+
+	maxsize = iov_iter_pvec_size(i, maxsize);
+	npages = DIV_ROUND_UP(maxsize, PAGE_SIZE);
+	p = get_pages_array(npages);
+	if (!p)
+		return -ENOMEM;
+
+	pc = p;
 	iterate_all_kinds(i, maxsize, v, ({
 		unsigned long addr = (unsigned long)v.iov_base;
-		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+		size_t slen = v.iov_len;
 		int n;
-		int res;
 
-		addr &= ~(PAGE_SIZE - 1);
-		n = DIV_ROUND_UP(len, PAGE_SIZE);
-		p = get_pages_array(n);
-		if (!p)
-			return -ENOMEM;
-		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
-		if (unlikely(res < 0)) {
-			kvfree(p);
-			return res;
+		if (first) {
+			start = addr & (PAGE_SIZE - 1);
+			slen += start;
+			first = false;
 		}
-		*pages = p;
-		return (res == n ? len : res * PAGE_SIZE) - *start;
+
+		n = DIV_ROUND_UP(slen, PAGE_SIZE);
+		if (pc + n > p + npages) {
+			/* Did something change the iov array?!? */
+			res = -EFAULT;
+			goto out;
+		}
+		addr &= ~(PAGE_SIZE - 1);
+		res = get_user_pages_fast(addr, n,
+					  (i->type & WRITE) != WRITE, pc);
+		if (unlikely(res < 0))
+			goto out;
+		len += (res == n ? slen : res * PAGE_SIZE) - start;
+		pc += res;
 	0;}),({
-		/* can't be more than PAGE_SIZE */
-		*start = v.bv_offset;
-		*pages = p = get_pages_array(1);
-		if (!p)
-			return -ENOMEM;
-		get_page(*p = v.bv_page);
-		return v.bv_len;
+		/* bio_vecs are limited to a single page each */
+		if (first) {
+			start = v.bv_offset;
+			first = false;
+		}
+		get_page(*pc = v.bv_page);
+		len += v.bv_len;
+		++pc;
+		BUG_ON(pc > p + npages);
 	}),({
-		return -EFAULT;
+		/* FIXME: should we handle this case? */
+		res = -EFAULT;
+		goto out;
 	})
 	)
-	return 0;
+out:
+	if (unlikely(res < 0)) {
+		struct page **i;
+
+		for (i = p; i < pc; i++)
+			put_page(*i);
+		kvfree(p);
+		return res;
+	}
+
+	*ppages = p;
+	*pstart = start;
+	return len;
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
 
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 101+ messages in thread

* [PATCH v3 2/2] ceph: switch DIO code to use iov_iter_get_pages_alloc
  2017-01-25 13:32 ` [PATCH v3 0/2] " Jeff Layton
  2017-01-25 13:32   ` [PATCH v3 1/2] " Jeff Layton
@ 2017-01-25 13:32   ` Jeff Layton
  2017-02-02  9:51     ` Al Viro
  2 siblings, 0 replies; 101+ messages in thread
From: Jeff Layton @ 2017-01-25 13:32 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer

xfstest generic/095 triggers soft lockups in kcephfs. It uses fio to
drive some I/O via vmsplice ane splice. Ceph then ends up trying to
access an ITER_BVEC type iov_iter as a ITER_IOVEC one. That causes it to
pick up a wrong offset and get stuck in an infinite loop while trying to
populate the page array. dio_get_pagev_size has a similar problem.

Now that iov_iter_get_pages_alloc doesn't stop after the first vector in
the array, we can just call it instead and dump the old code that tried
to do the same thing.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/ceph/file.c | 75 +++-------------------------------------------------------
 1 file changed, 3 insertions(+), 72 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 045d30d26624..0ce79f1eabbc 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -35,75 +35,6 @@
  */
 
 /*
- * Calculate the length sum of direct io vectors that can
- * be combined into one page vector.
- */
-static size_t dio_get_pagev_size(const struct iov_iter *it)
-{
-    const struct iovec *iov = it->iov;
-    const struct iovec *iovend = iov + it->nr_segs;
-    size_t size;
-
-    size = iov->iov_len - it->iov_offset;
-    /*
-     * An iov can be page vectored when both the current tail
-     * and the next base are page aligned.
-     */
-    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
-           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
-        size += iov->iov_len;
-    }
-    dout("dio_get_pagevlen len = %zu\n", size);
-    return size;
-}
-
-/*
- * Allocate a page vector based on (@it, @nbytes).
- * The return value is the tuple describing a page vector,
- * that is (@pages, @page_align, @num_pages).
- */
-static struct page **
-dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
-		    size_t *page_align, int *num_pages)
-{
-	struct iov_iter tmp_it = *it;
-	size_t align;
-	struct page **pages;
-	int ret = 0, idx, npages;
-
-	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
-		(PAGE_SIZE - 1);
-	npages = calc_pages_for(align, nbytes);
-	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
-	if (!pages) {
-		pages = vmalloc(sizeof(*pages) * npages);
-		if (!pages)
-			return ERR_PTR(-ENOMEM);
-	}
-
-	for (idx = 0; idx < npages; ) {
-		size_t start;
-		ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
-					 npages - idx, &start);
-		if (ret < 0)
-			goto fail;
-
-		iov_iter_advance(&tmp_it, ret);
-		nbytes -= ret;
-		idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
-	}
-
-	BUG_ON(nbytes != 0);
-	*num_pages = npages;
-	*page_align = align;
-	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
-	return pages;
-fail:
-	ceph_put_page_vector(pages, idx, false);
-	return ERR_PTR(ret);
-}
-
-/*
  * Prepare an open request.  Preallocate ceph_cap to avoid an
  * inopportune ENOMEM later.
  */
@@ -923,7 +854,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	}
 
 	while (iov_iter_count(iter) > 0) {
-		u64 size = dio_get_pagev_size(iter);
+		u64 size = iov_iter_count(iter);
 		size_t start = 0;
 		ssize_t len;
 
@@ -943,13 +874,13 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			break;
 		}
 
-		len = size;
-		pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
+		len = iov_iter_get_pages_alloc(iter, &pages, size, &start);
 		if (IS_ERR(pages)) {
 			ceph_osdc_put_request(req);
 			ret = PTR_ERR(pages);
 			break;
 		}
+		num_pages = DIV_ROUND_UP(len, PAGE_SIZE);
 
 		/*
 		 * To simplify error handling, allow AIO when IO within i_size
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 1/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-01-25 13:32   ` [PATCH v3 1/2] " Jeff Layton
@ 2017-01-26 12:35     ` Jeff Layton
  2017-01-27 13:24       ` [PATCH v4 0/2] " Jeff Layton
  0 siblings, 1 reply; 101+ messages in thread
From: Jeff Layton @ 2017-01-26 12:35 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer

On Wed, 2017-01-25 at 08:32 -0500, Jeff Layton wrote:
> Currently, iov_iter_get_pages_alloc will only ever operate on the first
> vector that iterate_all_kinds hands back. Most of the callers however
> would like to have as long a set of pages as possible, to allow for
> fewer, but larger I/Os.
> 
> When the previous vector ends on a page boundary and the current one
> begins on one, we can continue to add more pages.
> 
> Change the function to first scan the iov_iter to see how long an
> array of pages we could create from the current position up to the
> maxsize passed in. Then, allocate an array that large and start
> filling in that many pages.
> 
> The main impetus for this patch is to rip out a swath of code in ceph
> that tries to do this but that doesn't handle ITER_BVEC correctly.
> 
> NFS also uses this function and this also allows the client to do larger
> I/Os when userland passes down an array of page-aligned iovecs in an
> O_DIRECT request. This should also make splice writes into an O_DIRECT
> file on NFS use larger I/Os, since that's now done by passing down an
> ITER_BVEC representing the data to be written.
> 
> I believe the other callers (lustre and 9p) may also benefit from this
> change, but I don't have a great way to verify that.
> 
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
>  lib/iov_iter.c | 142 +++++++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 113 insertions(+), 29 deletions(-)
> 
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index e68604ae3ced..c87ba154371a 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -883,6 +883,59 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
>  }
>  EXPORT_SYMBOL(iov_iter_gap_alignment);
>  
> +/**
> + * iov_iter_pvec_size - find length of page aligned iovecs in iov_iter
> + * @i: iov_iter to in which to find the size
> + * @maxsize: maximum size to return
> + *
> + * Some filesystems can stitch together multiple iovecs into a single page
> + * vector when both the previous tail and current base are page aligned. This
> + * function determines how much of the remaining iov_iter can be stuffed into
> + * a single pagevec, up to the provided maxsize value.
> + */
> +static size_t iov_iter_pvec_size(const struct iov_iter *i, size_t maxsize)
> +{
> +	size_t size = min(iov_iter_count(i), maxsize);
> +	size_t pv_size = 0;
> +	bool contig = false, first = true;
> +
> +	if (!size)
> +		return 0;
> +
> +	/* Pipes are naturally aligned for this */
> +	if (unlikely(i->type & ITER_PIPE))
> +		return size;
> +
> +	/*
> +	 * An iov can be page vectored when the current base and previous
> +	 * tail are both page aligned. Note that we don't require that the
> +	 * initial base in the first iovec also be page aligned.
> +	 */
> +	iterate_all_kinds(i, size, v,
> +		({
> +		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
> +			pv_size += v.iov_len;
> +			first = false;
> +			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
> +		 }; 0;
> +		 }),
> +		({
> +		 if (first || (contig && v.bv_offset == 0)) {
> +			pv_size += v.bv_len;
> +			first = false;
> +			contig = PAGE_ALIGNED(v.bv_offset + v.bv_len);
> +		 }
> +		 }),
> +		({
> +		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
> +			pv_size += v.iov_len;
> +			first = false;
> +			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
> +		 }
> +		 }))
> +	return pv_size;
> +}
> +
>  static inline size_t __pipe_get_pages(struct iov_iter *i,
>  				size_t maxsize,
>  				struct page **pages,
> @@ -1006,47 +1059,78 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
>  }
>  
>  ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
> -		   struct page ***pages, size_t maxsize,
> -		   size_t *start)
> +		   struct page ***ppages, size_t maxsize,
> +		   size_t *pstart)
>  {
> -	struct page **p;
> -
> -	if (maxsize > i->count)
> -		maxsize = i->count;
> +	struct page **p, **pc;
> +	size_t start = 0;
> +	ssize_t len = 0;
> +	int npages, res = 0;
> +	bool first = true;
>  
>  	if (unlikely(i->type & ITER_PIPE))
> -		return pipe_get_pages_alloc(i, pages, maxsize, start);
> +		return pipe_get_pages_alloc(i, ppages, maxsize, pstart);
> +
> +	maxsize = iov_iter_pvec_size(i, maxsize);
> +	npages = DIV_ROUND_UP(maxsize, PAGE_SIZE);

Bah, the above is wrong when the offset into the first page is non-
zero. I'll fix -- this probably also points out the need for some tests
for this. I'll see if I can cook something up for xfstests.

> +	p = get_pages_array(npages);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	pc = p;
>  	iterate_all_kinds(i, maxsize, v, ({
>  		unsigned long addr = (unsigned long)v.iov_base;
> -		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
> +		size_t slen = v.iov_len;
>  		int n;
> -		int res;
>  
> -		addr &= ~(PAGE_SIZE - 1);
> -		n = DIV_ROUND_UP(len, PAGE_SIZE);
> -		p = get_pages_array(n);
> -		if (!p)
> -			return -ENOMEM;
> -		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
> -		if (unlikely(res < 0)) {
> -			kvfree(p);
> -			return res;
> +		if (first) {
> +			start = addr & (PAGE_SIZE - 1);
> +			slen += start;
> +			first = false;
>  		}
> -		*pages = p;
> -		return (res == n ? len : res * PAGE_SIZE) - *start;
> +
> +		n = DIV_ROUND_UP(slen, PAGE_SIZE);
> +		if (pc + n > p + npages) {
> +			/* Did something change the iov array?!? */
> +			res = -EFAULT;
> +			goto out;
> +		}
> +		addr &= ~(PAGE_SIZE - 1);
> +		res = get_user_pages_fast(addr, n,
> +					  (i->type & WRITE) != WRITE, pc);
> +		if (unlikely(res < 0))
> +			goto out;
> +		len += (res == n ? slen : res * PAGE_SIZE) - start;
> +		pc += res;
>  	0;}),({
> -		/* can't be more than PAGE_SIZE */
> -		*start = v.bv_offset;
> -		*pages = p = get_pages_array(1);
> -		if (!p)
> -			return -ENOMEM;
> -		get_page(*p = v.bv_page);
> -		return v.bv_len;
> +		/* bio_vecs are limited to a single page each */
> +		if (first) {
> +			start = v.bv_offset;
> +			first = false;
> +		}
> +		get_page(*pc = v.bv_page);
> +		len += v.bv_len;
> +		++pc;
> +		BUG_ON(pc > p + npages);
>  	}),({
> -		return -EFAULT;
> +		/* FIXME: should we handle this case? */
> +		res = -EFAULT;
> +		goto out;
>  	})
>  	)
> -	return 0;
> +out:
> +	if (unlikely(res < 0)) {
> +		struct page **i;
> +
> +		for (i = p; i < pc; i++)
> +			put_page(*i);
> +		kvfree(p);
> +		return res;
> +	}
> +
> +	*ppages = p;
> +	*pstart = start;
> +	return len;
>  }
>  EXPORT_SYMBOL(iov_iter_get_pages_alloc);
>  

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [PATCH v4 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-01-26 12:35     ` Jeff Layton
@ 2017-01-27 13:24       ` Jeff Layton
  2017-01-27 13:24         ` [PATCH v4 1/2] " Jeff Layton
  2017-01-27 13:24         ` [PATCH v4 2/2] ceph: switch DIO code to use iov_iter_get_pages_alloc Jeff Layton
  0 siblings, 2 replies; 101+ messages in thread
From: Jeff Layton @ 2017-01-27 13:24 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer

v1: Initial iteration (too many iov_iter details exposed to callers)
v2: just change iov_iter_get_pages_alloc to add more pages to the array
    if the previous vector and the current one are page aligned
v3: Move maxsize handling into iov_iter_pvec_size. Add patch to make
    ceph use iov_iter_get_pages_alloc instead of doing its own thing.
v4: Fix length handling when neither start nor end of iovec is page
    aligned. Rework error handling when there is a change to iovec
    after calculating the array length. Eliminate a BUG_ON.

Currently iov_iter_get_pages_alloc doesn't actually iterate past the
first element in the vector array. If you have a long array of small
iovecs that are well aligned and you want to stitch them together into a
single I/O, you have to try to do it yourself with multiple calls to
iov_iter_get_pages.

Ceph attempts to do this, but it doesn't handle ITER_BVEC correctly,
which is necessary to handle splice writes into a file open with
O_DIRECT. That usually leads to a softlockup with the current code.

While I can't locate the report at the moment, ISTR that we've also had
people complain in the past that the NFS client doesn't handle small
iovecs well with O_DIRECT. Each iovec gets its own RPC, even when they
are page-aligned. The first patch in the series fixes that as well.

This may also silimarly help lustre and 9p in that situation as well,
but I don't have a great way to test that so I can't verify it.

Jeff Layton (2):
  iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per
    call
  ceph: switch DIO code to use iov_iter_get_pages_alloc

 fs/ceph/file.c |  75 +-----------------------
 lib/iov_iter.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 154 insertions(+), 101 deletions(-)

-- 
2.9.3

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [PATCH v4 1/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-01-27 13:24       ` [PATCH v4 0/2] " Jeff Layton
@ 2017-01-27 13:24         ` Jeff Layton
  2017-01-27 13:24         ` [PATCH v4 2/2] ceph: switch DIO code to use iov_iter_get_pages_alloc Jeff Layton
  1 sibling, 0 replies; 101+ messages in thread
From: Jeff Layton @ 2017-01-27 13:24 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer

Currently, iov_iter_get_pages_alloc will only ever operate on the first
vector that iterate_all_kinds hands back. Most of the callers however
would like to have as long a set of pages as possible, to allow for
fewer, but larger I/Os.

When the previous vector ends on a page boundary and the current one
begins on one, we can continue to add more pages.

Change the function to first scan the iov_iter to see how long an
array of pages we could create from the current position up to the
maxsize passed in. Then, allocate an array that large and start
filling in that many pages.

The main impetus for this patch is to rip out a swath of code in ceph
that tries to do this but that doesn't handle ITER_BVEC correctly.

NFS also uses this function and this also allows the client to do larger
I/Os when userland passes down an array of page-aligned iovecs in an
O_DIRECT request. This should also make splice writes into an O_DIRECT
file on NFS use larger I/Os, since that's now done by passing down an
ITER_BVEC representing the data to be written.

I believe the other callers (lustre and 9p) may also benefit from this
change, but I don't have a great way to verify that.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 lib/iov_iter.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 151 insertions(+), 29 deletions(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e68604ae3ced..e3bcb927429f 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1005,48 +1005,170 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 	return n;
 }
 
-ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
-		   struct page ***pages, size_t maxsize,
-		   size_t *start)
+/**
+ * iov_iter_pvec_size - find length of page aligned iovecs in iov_iter
+ * @i: iov_iter to in which to find the size
+ * @maxsize: maximum size to return
+ * @npages: return pointer for how many pages this I/O will span
+ *
+ * Some filesystems can stitch together multiple iovecs into a single page
+ * vector when both the previous tail and current base are page aligned. This
+ * function determines how much of the remaining iov_iter can be stuffed into
+ * a single pagevec, up to the provided maxsize value.
+ *
+ * It also calculates how many pages this I/O will span and returns that value
+ * in npages.
+ */
+static size_t iov_iter_pvec_size(const struct iov_iter *i, size_t maxsize,
+				 int *npages)
 {
-	struct page **p;
+	size_t size = min(iov_iter_count(i), maxsize);
+	size_t pv_size = 0;
+	size_t start = 0;
+	bool contig = false, first = true;
 
-	if (maxsize > i->count)
-		maxsize = i->count;
+	if (!size)
+		return 0;
 
+	/* Pipes are naturally aligned for this */
 	if (unlikely(i->type & ITER_PIPE))
-		return pipe_get_pages_alloc(i, pages, maxsize, start);
+		return size;
+
+	/*
+	 * An iov can be page vectored when the current base and previous
+	 * tail are both page aligned. Note that we don't require that the
+	 * initial base in the first iovec also be page aligned.
+	 */
+	iterate_all_kinds(i, size, v,
+		({
+		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+			if (first)
+				start = ((unsigned long)v.iov_base & (PAGE_SIZE - 1));
+			pv_size += v.iov_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+		 }; 0;
+		 }),
+		({
+		 if (first || (contig && v.bv_offset == 0)) {
+			if (first)
+				start = v.bv_offset;
+			pv_size += v.bv_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.bv_offset + v.bv_len);
+		 }
+		 }),
+		({
+		 if (first || (contig && PAGE_ALIGNED(v.iov_base))) {
+			if (first)
+				start = ((unsigned long)v.iov_base & (PAGE_SIZE - 1));
+			pv_size += v.iov_len;
+			first = false;
+			contig = PAGE_ALIGNED(v.iov_base + v.iov_len);
+		 }
+		 }))
+	*npages = DIV_ROUND_UP(pv_size + start, PAGE_SIZE);
+	return pv_size;
+}
+
+ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+		   struct page ***ppages, size_t maxsize,
+		   size_t *pstart)
+{
+	struct page **p, **pc;
+	size_t start = 0;
+	ssize_t len = 0;
+	int npages = 0, res = 0;
+	bool first = true;
+
+	if (unlikely(i->type & ITER_PIPE))
+		return pipe_get_pages_alloc(i, ppages, maxsize, pstart);
+
+	maxsize = iov_iter_pvec_size(i, maxsize, &npages);
+	p = get_pages_array(npages);
+	if (!p)
+		return -ENOMEM;
+
+	pc = p;
 	iterate_all_kinds(i, maxsize, v, ({
 		unsigned long addr = (unsigned long)v.iov_base;
-		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+		size_t slen = v.iov_len;
+		size_t sstart = 0;
 		int n;
-		int res;
 
-		addr &= ~(PAGE_SIZE - 1);
-		n = DIV_ROUND_UP(len, PAGE_SIZE);
-		p = get_pages_array(n);
-		if (!p)
-			return -ENOMEM;
-		res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
-		if (unlikely(res < 0)) {
-			kvfree(p);
-			return res;
+		if (first) {
+			start = addr & (PAGE_SIZE - 1);
+			slen += start;
+			sstart = start;
+			first = false;
 		}
-		*pages = p;
-		return (res == n ? len : res * PAGE_SIZE) - *start;
+
+		n = DIV_ROUND_UP(slen, PAGE_SIZE);
+		if (pc + n > p + npages) {
+			/*
+			 * Eek! Something changed between when we initially
+			 * measured for the page array and now. Maybe a
+			 * userland memory scribble? We haven't overrun
+			 * anything at this point, so we can safely just
+			 * return what we have, if we have gotten anything.
+			 * If this is the first pass, then just return EFAULT.
+			 */
+			if (first)
+				res = -EFAULT;
+			goto out;
+		}
+		addr &= ~(PAGE_SIZE - 1);
+		res = get_user_pages_fast(addr, n,
+					  (i->type & WRITE) != WRITE, pc);
+		if (unlikely(res < 0))
+			goto out;
+		len += (res == n ? slen : res * PAGE_SIZE) - sstart;
+		pc += res;
 	0;}),({
-		/* can't be more than PAGE_SIZE */
-		*start = v.bv_offset;
-		*pages = p = get_pages_array(1);
-		if (!p)
-			return -ENOMEM;
-		get_page(*p = v.bv_page);
-		return v.bv_len;
+		/* bio_vecs are limited to a single page each */
+		if (first) {
+			start = v.bv_offset;
+			first = false;
+		}
+		get_page(*pc = v.bv_page);
+		len += v.bv_len;
+		++pc;
+		if (pc > p + npages) {
+			/*
+			 * Should we return an error here?  This should never
+			 * happen as kernel callers had better not muck with
+			 * the array while we're iterating over it.
+			 *
+			 * At this point, we haven't overrun anything so we
+			 * can just return what we have gotten so far. Still,
+			 * it looks like something is wrong, so pop a warning
+			 * here.
+			 */
+			WARN_ONCE(true, "%s: array overrun (%p > %p + %d)\n",
+					__func__, pc, p, npages);
+			goto out;
+		}
+
+		BUG_ON(pc > p + npages);
 	}),({
-		return -EFAULT;
+		/* FIXME: should we handle this case? */
+		res = -EFAULT;
+		goto out;
 	})
 	)
-	return 0;
+out:
+	if (unlikely(res < 0)) {
+		struct page **i;
+
+		for (i = p; i < pc; i++)
+			put_page(*i);
+		kvfree(p);
+		return res;
+	}
+
+	*ppages = p;
+	*pstart = start;
+	return len;
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
 
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 101+ messages in thread

* [PATCH v4 2/2] ceph: switch DIO code to use iov_iter_get_pages_alloc
  2017-01-27 13:24       ` [PATCH v4 0/2] " Jeff Layton
  2017-01-27 13:24         ` [PATCH v4 1/2] " Jeff Layton
@ 2017-01-27 13:24         ` Jeff Layton
  2017-01-30 15:40             ` Jeff Layton
  1 sibling, 1 reply; 101+ messages in thread
From: Jeff Layton @ 2017-01-27 13:24 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer

xfstest generic/095 triggers soft lockups in kcephfs. It uses fio to
drive some I/O via vmsplice ane splice. Ceph then ends up trying to
access an ITER_BVEC type iov_iter as a ITER_IOVEC one. That causes it to
pick up a wrong offset and get stuck in an infinite loop while trying to
populate the page array. dio_get_pagev_size has a similar problem.

Now that iov_iter_get_pages_alloc doesn't stop after the first vector in
the array, we can just call it instead and dump the old code that tried
to do the same thing.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/ceph/file.c | 75 +++-------------------------------------------------------
 1 file changed, 3 insertions(+), 72 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 045d30d26624..0ce79f1eabbc 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -35,75 +35,6 @@
  */
 
 /*
- * Calculate the length sum of direct io vectors that can
- * be combined into one page vector.
- */
-static size_t dio_get_pagev_size(const struct iov_iter *it)
-{
-    const struct iovec *iov = it->iov;
-    const struct iovec *iovend = iov + it->nr_segs;
-    size_t size;
-
-    size = iov->iov_len - it->iov_offset;
-    /*
-     * An iov can be page vectored when both the current tail
-     * and the next base are page aligned.
-     */
-    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
-           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
-        size += iov->iov_len;
-    }
-    dout("dio_get_pagevlen len = %zu\n", size);
-    return size;
-}
-
-/*
- * Allocate a page vector based on (@it, @nbytes).
- * The return value is the tuple describing a page vector,
- * that is (@pages, @page_align, @num_pages).
- */
-static struct page **
-dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
-		    size_t *page_align, int *num_pages)
-{
-	struct iov_iter tmp_it = *it;
-	size_t align;
-	struct page **pages;
-	int ret = 0, idx, npages;
-
-	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
-		(PAGE_SIZE - 1);
-	npages = calc_pages_for(align, nbytes);
-	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
-	if (!pages) {
-		pages = vmalloc(sizeof(*pages) * npages);
-		if (!pages)
-			return ERR_PTR(-ENOMEM);
-	}
-
-	for (idx = 0; idx < npages; ) {
-		size_t start;
-		ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
-					 npages - idx, &start);
-		if (ret < 0)
-			goto fail;
-
-		iov_iter_advance(&tmp_it, ret);
-		nbytes -= ret;
-		idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
-	}
-
-	BUG_ON(nbytes != 0);
-	*num_pages = npages;
-	*page_align = align;
-	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
-	return pages;
-fail:
-	ceph_put_page_vector(pages, idx, false);
-	return ERR_PTR(ret);
-}
-
-/*
  * Prepare an open request.  Preallocate ceph_cap to avoid an
  * inopportune ENOMEM later.
  */
@@ -923,7 +854,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	}
 
 	while (iov_iter_count(iter) > 0) {
-		u64 size = dio_get_pagev_size(iter);
+		u64 size = iov_iter_count(iter);
 		size_t start = 0;
 		ssize_t len;
 
@@ -943,13 +874,13 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			break;
 		}
 
-		len = size;
-		pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
+		len = iov_iter_get_pages_alloc(iter, &pages, size, &start);
 		if (IS_ERR(pages)) {
 			ceph_osdc_put_request(req);
 			ret = PTR_ERR(pages);
 			break;
 		}
+		num_pages = DIV_ROUND_UP(len, PAGE_SIZE);
 
 		/*
 		 * To simplify error handling, allow AIO when IO within i_size
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 101+ messages in thread

* Re: [PATCH v4 2/2] ceph: switch DIO code to use iov_iter_get_pages_alloc
@ 2017-01-30 15:40             ` Jeff Layton
  0 siblings, 0 replies; 101+ messages in thread
From: Jeff Layton @ 2017-01-30 15:40 UTC (permalink / raw)
  To: viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer

On Fri, 2017-01-27 at 08:24 -0500, Jeff Layton wrote:
> xfstest generic/095 triggers soft lockups in kcephfs. It uses fio to
> drive some I/O via vmsplice ane splice. Ceph then ends up trying to
> access an ITER_BVEC type iov_iter as a ITER_IOVEC one. That causes it to
> pick up a wrong offset and get stuck in an infinite loop while trying to
> populate the page array. dio_get_pagev_size has a similar problem.
> 
> Now that iov_iter_get_pages_alloc doesn't stop after the first vector in
> the array, we can just call it instead and dump the old code that tried
> to do the same thing.
> 
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
>  fs/ceph/file.c | 75 +++-------------------------------------------------------
>  1 file changed, 3 insertions(+), 72 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 045d30d26624..0ce79f1eabbc 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -35,75 +35,6 @@
>   */
>  
>  /*
> - * Calculate the length sum of direct io vectors that can
> - * be combined into one page vector.
> - */
> -static size_t dio_get_pagev_size(const struct iov_iter *it)
> -{
> -    const struct iovec *iov = it->iov;
> -    const struct iovec *iovend = iov + it->nr_segs;
> -    size_t size;
> -
> -    size = iov->iov_len - it->iov_offset;
> -    /*
> -     * An iov can be page vectored when both the current tail
> -     * and the next base are page aligned.
> -     */
> -    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
> -           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
> -        size += iov->iov_len;
> -    }
> -    dout("dio_get_pagevlen len = %zu\n", size);
> -    return size;
> -}
> -
> -/*
> - * Allocate a page vector based on (@it, @nbytes).
> - * The return value is the tuple describing a page vector,
> - * that is (@pages, @page_align, @num_pages).
> - */
> -static struct page **
> -dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
> -		    size_t *page_align, int *num_pages)
> -{
> -	struct iov_iter tmp_it = *it;
> -	size_t align;
> -	struct page **pages;
> -	int ret = 0, idx, npages;
> -
> -	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
> -		(PAGE_SIZE - 1);
> -	npages = calc_pages_for(align, nbytes);
> -	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
> -	if (!pages) {
> -		pages = vmalloc(sizeof(*pages) * npages);
> -		if (!pages)
> -			return ERR_PTR(-ENOMEM);
> -	}
> -
> -	for (idx = 0; idx < npages; ) {
> -		size_t start;
> -		ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
> -					 npages - idx, &start);
> -		if (ret < 0)
> -			goto fail;
> -
> -		iov_iter_advance(&tmp_it, ret);
> -		nbytes -= ret;
> -		idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
> -	}
> -
> -	BUG_ON(nbytes != 0);
> -	*num_pages = npages;
> -	*page_align = align;
> -	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
> -	return pages;
> -fail:
> -	ceph_put_page_vector(pages, idx, false);
> -	return ERR_PTR(ret);
> -}
> -
> -/*
>   * Prepare an open request.  Preallocate ceph_cap to avoid an
>   * inopportune ENOMEM later.
>   */
> @@ -923,7 +854,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  	}
>  
>  	while (iov_iter_count(iter) > 0) {
> -		u64 size = dio_get_pagev_size(iter);
> +		u64 size = iov_iter_count(iter);
>  		size_t start = 0;
>  		ssize_t len;
>  
> @@ -943,13 +874,13 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  			break;
>  		}
>  
> -		len = size;
> -		pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
> +		len = iov_iter_get_pages_alloc(iter, &pages, size, &start);
>  		if (IS_ERR(pages)) {
>  			ceph_osdc_put_request(req);
>  			ret = PTR_ERR(pages);
>  			break;
>  		}
> +		num_pages = DIV_ROUND_UP(len, PAGE_SIZE);

Sigh, this should be:

    num_pages = DIV_ROUND_UP(len + start, PAGE_SIZE);

Also, while it is a simple thing to determine, it is rather easy to get
that wrong.

Maybe we should have iov_iter_get_pages_alloc also return the number of
pages? Not having to do a DIV_ROUND_UP on every call into it would be
nice, and all of the callers need that value anyway.

>  
>  		/*
>  		 * To simplify error handling, allow AIO when IO within i_size

-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v4 2/2] ceph: switch DIO code to use iov_iter_get_pages_alloc
@ 2017-01-30 15:40             ` Jeff Layton
  0 siblings, 0 replies; 101+ messages in thread
From: Jeff Layton @ 2017-01-30 15:40 UTC (permalink / raw)
  To: viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

On Fri, 2017-01-27 at 08:24 -0500, Jeff Layton wrote:
> xfstest generic/095 triggers soft lockups in kcephfs. It uses fio to
> drive some I/O via vmsplice ane splice. Ceph then ends up trying to
> access an ITER_BVEC type iov_iter as a ITER_IOVEC one. That causes it to
> pick up a wrong offset and get stuck in an infinite loop while trying to
> populate the page array. dio_get_pagev_size has a similar problem.
> 
> Now that iov_iter_get_pages_alloc doesn't stop after the first vector in
> the array, we can just call it instead and dump the old code that tried
> to do the same thing.
> 
> Signed-off-by: Jeff Layton <jlayton-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> ---
>  fs/ceph/file.c | 75 +++-------------------------------------------------------
>  1 file changed, 3 insertions(+), 72 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 045d30d26624..0ce79f1eabbc 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -35,75 +35,6 @@
>   */
>  
>  /*
> - * Calculate the length sum of direct io vectors that can
> - * be combined into one page vector.
> - */
> -static size_t dio_get_pagev_size(const struct iov_iter *it)
> -{
> -    const struct iovec *iov = it->iov;
> -    const struct iovec *iovend = iov + it->nr_segs;
> -    size_t size;
> -
> -    size = iov->iov_len - it->iov_offset;
> -    /*
> -     * An iov can be page vectored when both the current tail
> -     * and the next base are page aligned.
> -     */
> -    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
> -           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
> -        size += iov->iov_len;
> -    }
> -    dout("dio_get_pagevlen len = %zu\n", size);
> -    return size;
> -}
> -
> -/*
> - * Allocate a page vector based on (@it, @nbytes).
> - * The return value is the tuple describing a page vector,
> - * that is (@pages, @page_align, @num_pages).
> - */
> -static struct page **
> -dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
> -		    size_t *page_align, int *num_pages)
> -{
> -	struct iov_iter tmp_it = *it;
> -	size_t align;
> -	struct page **pages;
> -	int ret = 0, idx, npages;
> -
> -	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
> -		(PAGE_SIZE - 1);
> -	npages = calc_pages_for(align, nbytes);
> -	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
> -	if (!pages) {
> -		pages = vmalloc(sizeof(*pages) * npages);
> -		if (!pages)
> -			return ERR_PTR(-ENOMEM);
> -	}
> -
> -	for (idx = 0; idx < npages; ) {
> -		size_t start;
> -		ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
> -					 npages - idx, &start);
> -		if (ret < 0)
> -			goto fail;
> -
> -		iov_iter_advance(&tmp_it, ret);
> -		nbytes -= ret;
> -		idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
> -	}
> -
> -	BUG_ON(nbytes != 0);
> -	*num_pages = npages;
> -	*page_align = align;
> -	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
> -	return pages;
> -fail:
> -	ceph_put_page_vector(pages, idx, false);
> -	return ERR_PTR(ret);
> -}
> -
> -/*
>   * Prepare an open request.  Preallocate ceph_cap to avoid an
>   * inopportune ENOMEM later.
>   */
> @@ -923,7 +854,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  	}
>  
>  	while (iov_iter_count(iter) > 0) {
> -		u64 size = dio_get_pagev_size(iter);
> +		u64 size = iov_iter_count(iter);
>  		size_t start = 0;
>  		ssize_t len;
>  
> @@ -943,13 +874,13 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  			break;
>  		}
>  
> -		len = size;
> -		pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
> +		len = iov_iter_get_pages_alloc(iter, &pages, size, &start);
>  		if (IS_ERR(pages)) {
>  			ceph_osdc_put_request(req);
>  			ret = PTR_ERR(pages);
>  			break;
>  		}
> +		num_pages = DIV_ROUND_UP(len, PAGE_SIZE);

Sigh, this should be:

    num_pages = DIV_ROUND_UP(len + start, PAGE_SIZE);

Also, while it is a simple thing to determine, it is rather easy to get
that wrong.

Maybe we should have iov_iter_get_pages_alloc also return the number of
pages? Not having to do a DIV_ROUND_UP on every call into it would be
nice, and all of the callers need that value anyway.

>  
>  		/*
>  		 * To simplify error handling, allow AIO when IO within i_size

-- 
Jeff Layton <jlayton-vpEMnDpepFuMZCB2o+C8xQ@public.gmane.org>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02  9:51     ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-02  9:51 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer, Linus Torvalds, Jan Kara, Chris Wilson,
	Kirill A. Shutemov

On Wed, Jan 25, 2017 at 08:32:03AM -0500, Jeff Layton wrote:
> Small respin of the patch that I sent yesterday for the same thing.
> 
> This moves the maxsize handling into iov_iter_pvec_size, so that we don't
> end up iterating past the max size we'll use anyway when trying to
> determine the pagevec length.
> 
> Also, a respun patch to make ceph use iov_iter_get_pages_alloc instead of
> trying to do it via its own routine.
> 
> Al, if these look ok, do you want to pick these up or shall I ask
> Ilya to merge them via the ceph tree?

I'd rather have that kind of work go through the vfs tree; said that,
I really wonder if this is the right approach.  Most of the users of
iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
something like
	iov_iter_for_each_page(iter, size, f, data)
with int (*f)(struct page *page, size_t from, size_t size, void *data)
passed as callback.  Not everything fits that model, but there's a whole
lot of things that do.

Examples:
	* fs/direct_io.c:do_direct_IO().  We loop through the pages
returned by dio_get_page().  For each of those we find the subrange
of page (from/to) and handle IO on that range.  Then we drop the reference
to page and move on to the next one.  dio_get_page() uses dio->pages and
sdio->{head,tail,from,to} to avoid calling iov_iter_get_pages() on each
page - iov_iter_get_pages() is called for bigger chunks (up to 64 pages,
AFAICS) and results are kept in dio->pages for subsequent calls of
dio_get_page().  Unconsumed references are dropped by dio_cleanup();
AFAICS, it could've been called unconditionally right after the call
of do_direct_IO() (or from it, for that matter) - all remaining references
to pages are never looked at after do_direct_IO().  As it is, we call
it immediately on failure return from do_direct_IO() and then unconditionally
after blk_finish_plug().
	That oddity aside (and AFAICS it's really pointless - all pages
we'd done something with in do_direct_IO() won't be affected by
dio_cleanup()), there's potentially more interesting issue.  If
iov_iter_get_pages() fails on write at the moment when we have pending
mapped blocks, we treat that as write from zero page.  Once that has
happened, we remember to stop mapping new blocks and arrange for having
the error eventually treated as if it had come from IO failure.  I'm not
sure if this sucker handles all cases correctly, BTW - can we end up with
a few pages worth of pending mapped blocks?
	But aside of that, it's really a "loop through all page subranges"
kind of code.  The inner loop in do_direct_IO() could be converted into
a callback quite easily

	* nfs_direct_read_schedule_iovec(): same kind of batching, only there
we have an outer loop calling iov_iter_get_pages_alloc() and then the
inner loop goes through the page subranges, with the same work done for
each.  In this case we grab a reference inside the would-be callback and
drop all references from iov_iter_get_pages_alloc() after the inner loop
is done.  Could've gotten rid of grabbing extra refs - that would mean dropping
only the unused ones if the 'callback' (== inner loop body) has told us
to bugger off early.  IMO that would be a better model.  Incidentally, we
keep allocating/freeing the array used to store page references for each batch.

	* nfs_direct_write_schedule_iovec(): very similar to the read side.

	* zerocopy_sg_from_iter(): similar loop, batch size is MAX_SKB_FRAGS
(i.e. 16 or 17, depending upon the PAGE_SIZE; unless somebody has done
a port with 2Kb pages it shouldn't be greater than 17).  Array of references
is on stack, skb_fill_page_desc(skb, frag++, page, from, size) should
become the callback.  References are consumed by it and it can't fail, so
there's nothing left to drop.

	* af_alg_make_sg().  Looks like it might be massaged to the same
model; the tricky part is af_alg_free_sg() users.  We keep references to
pages in sgl->pages[] *and* store them in sgl->sg[...] (via sg_set_page()).
af_alg_free_sg() drops them using ->pages[] instead of sg_page(...->sg + ...).
Might or might not be a problem - I'm not familiar with that code.

	* fuse_get_user_pages().  It pretty much fills an equivalent of
bio_vec array; the difference is, array of struct page * and arrays of
(start, len) pairs are kept separately.  The only benefit is using the
first array as destination of iov_iter_get_pages(); might as well work
into a separate batching array instead - copying struct page * is noise
compared to storing (and calculating) start/len pairs we have to do there.
Again, what we do there is a pure loop over page subranges.

	* fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
is a good idea there - fuse_copy_do() could bloody well just use
copy_{to,from}_iter().

	* fs/splice.c:iter_to_pipe().  Loop over page subranges, consuming
page references.  Unused ones are dropped.

	* bio_iov_iter_get_pages().  Wants to populate bio_vec array; should've
been a loop calling iov_iter_get_pages(); gets tripped on each iovec boundary
instead.  IMO would've been better off with a loop and separate 'batching'
array; would've killed the "Deep magic" mess in there, while we are at it.

That's the majority of iov_iter_get_pages{,_alloc} callers.  There's one
I'm not sure about in lustre (looks like their O_DIRECT is complicated by
rudiments of lloop stuff), there's a mess in p9_get_mapped_pages() (with
special-casing the kvec-backed iterators using kmap_to_page() and
vmalloc_to_page(), no less), there's default_file_splice_read() and there's
ceph stuff.  Everything else is covered by the 'loop over page subranges'
stuff.

I'm massaging that code (along with a lot of RTFS); the interesting questions
related to VM side of things are
	* what are the relative costs of doing small vs. large batches?  Some
of get_user_pages_fast() instances have comments along the lines of "we ought
to limit the batch size, but then nobody's doing more than 64 pages at a time
anyway".
	* Not a question: any ->fault() that returns VM_FAULT_RETRY when
*not* passed FAULT_FLAG_ALLOW_RETRY in flags ought to be shot.  cxlflash one
sure as hell is.
	* drivers/gpu/drm/vgem/vgem_drv.c:vgem_gem_fault() is bloody odd -
shmem_read_mapping_page() can't return -EBUSY, AFAICS.  vm_insert_page()
used to (and back then vgem_gem_fault() used to be broken), but these days
it looks like dead code...
	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
it's only (ab)used there as 'not zero, but doesn't contain any error bits';
VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
right?
	* get_user_pages_fast() only returns 0 on zero size.  AFAICS, that's
true and some callers seems to rely upon that.  Correct?
	* aligning the start address passed to get_user_pages_fast() et.al.
Happens in many callers, but not all of them.  Most of the instances
forcibly align it in get_user_pages_fast() itself, but... not the fallback
one.  I'm not sure if it can be used to screw the things up, but it feels
like aligning the sucker in get_user_pages...() would be safer - callers
outnumber them and they are scattered in bad places (including drivers/staging)

	Comments?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02  9:51     ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-02  9:51 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Wed, Jan 25, 2017 at 08:32:03AM -0500, Jeff Layton wrote:
> Small respin of the patch that I sent yesterday for the same thing.
> 
> This moves the maxsize handling into iov_iter_pvec_size, so that we don't
> end up iterating past the max size we'll use anyway when trying to
> determine the pagevec length.
> 
> Also, a respun patch to make ceph use iov_iter_get_pages_alloc instead of
> trying to do it via its own routine.
> 
> Al, if these look ok, do you want to pick these up or shall I ask
> Ilya to merge them via the ceph tree?

I'd rather have that kind of work go through the vfs tree; said that,
I really wonder if this is the right approach.  Most of the users of
iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
something like
	iov_iter_for_each_page(iter, size, f, data)
with int (*f)(struct page *page, size_t from, size_t size, void *data)
passed as callback.  Not everything fits that model, but there's a whole
lot of things that do.

Examples:
	* fs/direct_io.c:do_direct_IO().  We loop through the pages
returned by dio_get_page().  For each of those we find the subrange
of page (from/to) and handle IO on that range.  Then we drop the reference
to page and move on to the next one.  dio_get_page() uses dio->pages and
sdio->{head,tail,from,to} to avoid calling iov_iter_get_pages() on each
page - iov_iter_get_pages() is called for bigger chunks (up to 64 pages,
AFAICS) and results are kept in dio->pages for subsequent calls of
dio_get_page().  Unconsumed references are dropped by dio_cleanup();
AFAICS, it could've been called unconditionally right after the call
of do_direct_IO() (or from it, for that matter) - all remaining references
to pages are never looked at after do_direct_IO().  As it is, we call
it immediately on failure return from do_direct_IO() and then unconditionally
after blk_finish_plug().
	That oddity aside (and AFAICS it's really pointless - all pages
we'd done something with in do_direct_IO() won't be affected by
dio_cleanup()), there's potentially more interesting issue.  If
iov_iter_get_pages() fails on write at the moment when we have pending
mapped blocks, we treat that as write from zero page.  Once that has
happened, we remember to stop mapping new blocks and arrange for having
the error eventually treated as if it had come from IO failure.  I'm not
sure if this sucker handles all cases correctly, BTW - can we end up with
a few pages worth of pending mapped blocks?
	But aside of that, it's really a "loop through all page subranges"
kind of code.  The inner loop in do_direct_IO() could be converted into
a callback quite easily

	* nfs_direct_read_schedule_iovec(): same kind of batching, only there
we have an outer loop calling iov_iter_get_pages_alloc() and then the
inner loop goes through the page subranges, with the same work done for
each.  In this case we grab a reference inside the would-be callback and
drop all references from iov_iter_get_pages_alloc() after the inner loop
is done.  Could've gotten rid of grabbing extra refs - that would mean dropping
only the unused ones if the 'callback' (== inner loop body) has told us
to bugger off early.  IMO that would be a better model.  Incidentally, we
keep allocating/freeing the array used to store page references for each batch.

	* nfs_direct_write_schedule_iovec(): very similar to the read side.

	* zerocopy_sg_from_iter(): similar loop, batch size is MAX_SKB_FRAGS
(i.e. 16 or 17, depending upon the PAGE_SIZE; unless somebody has done
a port with 2Kb pages it shouldn't be greater than 17).  Array of references
is on stack, skb_fill_page_desc(skb, frag++, page, from, size) should
become the callback.  References are consumed by it and it can't fail, so
there's nothing left to drop.

	* af_alg_make_sg().  Looks like it might be massaged to the same
model; the tricky part is af_alg_free_sg() users.  We keep references to
pages in sgl->pages[] *and* store them in sgl->sg[...] (via sg_set_page()).
af_alg_free_sg() drops them using ->pages[] instead of sg_page(...->sg + ...).
Might or might not be a problem - I'm not familiar with that code.

	* fuse_get_user_pages().  It pretty much fills an equivalent of
bio_vec array; the difference is, array of struct page * and arrays of
(start, len) pairs are kept separately.  The only benefit is using the
first array as destination of iov_iter_get_pages(); might as well work
into a separate batching array instead - copying struct page * is noise
compared to storing (and calculating) start/len pairs we have to do there.
Again, what we do there is a pure loop over page subranges.

	* fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
is a good idea there - fuse_copy_do() could bloody well just use
copy_{to,from}_iter().

	* fs/splice.c:iter_to_pipe().  Loop over page subranges, consuming
page references.  Unused ones are dropped.

	* bio_iov_iter_get_pages().  Wants to populate bio_vec array; should've
been a loop calling iov_iter_get_pages(); gets tripped on each iovec boundary
instead.  IMO would've been better off with a loop and separate 'batching'
array; would've killed the "Deep magic" mess in there, while we are at it.

That's the majority of iov_iter_get_pages{,_alloc} callers.  There's one
I'm not sure about in lustre (looks like their O_DIRECT is complicated by
rudiments of lloop stuff), there's a mess in p9_get_mapped_pages() (with
special-casing the kvec-backed iterators using kmap_to_page() and
vmalloc_to_page(), no less), there's default_file_splice_read() and there's
ceph stuff.  Everything else is covered by the 'loop over page subranges'
stuff.

I'm massaging that code (along with a lot of RTFS); the interesting questions
related to VM side of things are
	* what are the relative costs of doing small vs. large batches?  Some
of get_user_pages_fast() instances have comments along the lines of "we ought
to limit the batch size, but then nobody's doing more than 64 pages at a time
anyway".
	* Not a question: any ->fault() that returns VM_FAULT_RETRY when
*not* passed FAULT_FLAG_ALLOW_RETRY in flags ought to be shot.  cxlflash one
sure as hell is.
	* drivers/gpu/drm/vgem/vgem_drv.c:vgem_gem_fault() is bloody odd -
shmem_read_mapping_page() can't return -EBUSY, AFAICS.  vm_insert_page()
used to (and back then vgem_gem_fault() used to be broken), but these days
it looks like dead code...
	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
it's only (ab)used there as 'not zero, but doesn't contain any error bits';
VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
right?
	* get_user_pages_fast() only returns 0 on zero size.  AFAICS, that's
true and some callers seems to rely upon that.  Correct?
	* aligning the start address passed to get_user_pages_fast() et.al.
Happens in many callers, but not all of them.  Most of the instances
forcibly align it in get_user_pages_fast() itself, but... not the fallback
one.  I'm not sure if it can be used to screw the things up, but it feels
like aligning the sucker in get_user_pages...() would be safer - callers
outnumber them and they are scattered in bad places (including drivers/staging)

	Comments?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02  9:51     ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-02  9:51 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Wed, Jan 25, 2017 at 08:32:03AM -0500, Jeff Layton wrote:
> Small respin of the patch that I sent yesterday for the same thing.
> 
> This moves the maxsize handling into iov_iter_pvec_size, so that we don't
> end up iterating past the max size we'll use anyway when trying to
> determine the pagevec length.
> 
> Also, a respun patch to make ceph use iov_iter_get_pages_alloc instead of
> trying to do it via its own routine.
> 
> Al, if these look ok, do you want to pick these up or shall I ask
> Ilya to merge them via the ceph tree?

I'd rather have that kind of work go through the vfs tree; said that,
I really wonder if this is the right approach.  Most of the users of
iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
something like
	iov_iter_for_each_page(iter, size, f, data)
with int (*f)(struct page *page, size_t from, size_t size, void *data)
passed as callback.  Not everything fits that model, but there's a whole
lot of things that do.

Examples:
	* fs/direct_io.c:do_direct_IO().  We loop through the pages
returned by dio_get_page().  For each of those we find the subrange
of page (from/to) and handle IO on that range.  Then we drop the reference
to page and move on to the next one.  dio_get_page() uses dio->pages and
sdio->{head,tail,from,to} to avoid calling iov_iter_get_pages() on each
page - iov_iter_get_pages() is called for bigger chunks (up to 64 pages,
AFAICS) and results are kept in dio->pages for subsequent calls of
dio_get_page().  Unconsumed references are dropped by dio_cleanup();
AFAICS, it could've been called unconditionally right after the call
of do_direct_IO() (or from it, for that matter) - all remaining references
to pages are never looked at after do_direct_IO().  As it is, we call
it immediately on failure return from do_direct_IO() and then unconditionally
after blk_finish_plug().
	That oddity aside (and AFAICS it's really pointless - all pages
we'd done something with in do_direct_IO() won't be affected by
dio_cleanup()), there's potentially more interesting issue.  If
iov_iter_get_pages() fails on write at the moment when we have pending
mapped blocks, we treat that as write from zero page.  Once that has
happened, we remember to stop mapping new blocks and arrange for having
the error eventually treated as if it had come from IO failure.  I'm not
sure if this sucker handles all cases correctly, BTW - can we end up with
a few pages worth of pending mapped blocks?
	But aside of that, it's really a "loop through all page subranges"
kind of code.  The inner loop in do_direct_IO() could be converted into
a callback quite easily

	* nfs_direct_read_schedule_iovec(): same kind of batching, only there
we have an outer loop calling iov_iter_get_pages_alloc() and then the
inner loop goes through the page subranges, with the same work done for
each.  In this case we grab a reference inside the would-be callback and
drop all references from iov_iter_get_pages_alloc() after the inner loop
is done.  Could've gotten rid of grabbing extra refs - that would mean dropping
only the unused ones if the 'callback' (== inner loop body) has told us
to bugger off early.  IMO that would be a better model.  Incidentally, we
keep allocating/freeing the array used to store page references for each batch.

	* nfs_direct_write_schedule_iovec(): very similar to the read side.

	* zerocopy_sg_from_iter(): similar loop, batch size is MAX_SKB_FRAGS
(i.e. 16 or 17, depending upon the PAGE_SIZE; unless somebody has done
a port with 2Kb pages it shouldn't be greater than 17).  Array of references
is on stack, skb_fill_page_desc(skb, frag++, page, from, size) should
become the callback.  References are consumed by it and it can't fail, so
there's nothing left to drop.

	* af_alg_make_sg().  Looks like it might be massaged to the same
model; the tricky part is af_alg_free_sg() users.  We keep references to
pages in sgl->pages[] *and* store them in sgl->sg[...] (via sg_set_page()).
af_alg_free_sg() drops them using ->pages[] instead of sg_page(...->sg + ...).
Might or might not be a problem - I'm not familiar with that code.

	* fuse_get_user_pages().  It pretty much fills an equivalent of
bio_vec array; the difference is, array of struct page * and arrays of
(start, len) pairs are kept separately.  The only benefit is using the
first array as destination of iov_iter_get_pages(); might as well work
into a separate batching array instead - copying struct page * is noise
compared to storing (and calculating) start/len pairs we have to do there.
Again, what we do there is a pure loop over page subranges.

	* fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
is a good idea there - fuse_copy_do() could bloody well just use
copy_{to,from}_iter().

	* fs/splice.c:iter_to_pipe().  Loop over page subranges, consuming
page references.  Unused ones are dropped.

	* bio_iov_iter_get_pages().  Wants to populate bio_vec array; should've
been a loop calling iov_iter_get_pages(); gets tripped on each iovec boundary
instead.  IMO would've been better off with a loop and separate 'batching'
array; would've killed the "Deep magic" mess in there, while we are at it.

That's the majority of iov_iter_get_pages{,_alloc} callers.  There's one
I'm not sure about in lustre (looks like their O_DIRECT is complicated by
rudiments of lloop stuff), there's a mess in p9_get_mapped_pages() (with
special-casing the kvec-backed iterators using kmap_to_page() and
vmalloc_to_page(), no less), there's default_file_splice_read() and there's
ceph stuff.  Everything else is covered by the 'loop over page subranges'
stuff.

I'm massaging that code (along with a lot of RTFS); the interesting questions
related to VM side of things are
	* what are the relative costs of doing small vs. large batches?  Some
of get_user_pages_fast() instances have comments along the lines of "we ought
to limit the batch size, but then nobody's doing more than 64 pages at a time
anyway".
	* Not a question: any ->fault() that returns VM_FAULT_RETRY when
*not* passed FAULT_FLAG_ALLOW_RETRY in flags ought to be shot.  cxlflash one
sure as hell is.
	* drivers/gpu/drm/vgem/vgem_drv.c:vgem_gem_fault() is bloody odd -
shmem_read_mapping_page() can't return -EBUSY, AFAICS.  vm_insert_page()
used to (and back then vgem_gem_fault() used to be broken), but these days
it looks like dead code...
	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
it's only (ab)used there as 'not zero, but doesn't contain any error bits';
VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
right?
	* get_user_pages_fast() only returns 0 on zero size.  AFAICS, that's
true and some callers seems to rely upon that.  Correct?
	* aligning the start address passed to get_user_pages_fast() et.al.
Happens in many callers, but not all of them.  Most of the instances
forcibly align it in get_user_pages_fast() itself, but... not the fallback
one.  I'm not sure if it can be used to screw the things up, but it feels
like aligning the sucker in get_user_pages...() would be safer - callers
outnumber them and they are scattered in bad places (including drivers/staging)

	Comments?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 10:56       ` Christoph Hellwig
  0 siblings, 0 replies; 101+ messages in thread
From: Christoph Hellwig @ 2017-02-02 10:56 UTC (permalink / raw)
  To: Al Viro
  Cc: Jeff Layton, linux-fsdevel, linux-kernel, linux-nfs, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:
> On Wed, Jan 25, 2017 at 08:32:03AM -0500, Jeff Layton wrote:
> > Small respin of the patch that I sent yesterday for the same thing.
> > 
> > This moves the maxsize handling into iov_iter_pvec_size, so that we don't
> > end up iterating past the max size we'll use anyway when trying to
> > determine the pagevec length.
> > 
> > Also, a respun patch to make ceph use iov_iter_get_pages_alloc instead of
> > trying to do it via its own routine.
> > 
> > Al, if these look ok, do you want to pick these up or shall I ask
> > Ilya to merge them via the ceph tree?
> 
> I'd rather have that kind of work go through the vfs tree; said that,
> I really wonder if this is the right approach.  Most of the users of
> iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
> something like
> 	iov_iter_for_each_page(iter, size, f, data)
> with int (*f)(struct page *page, size_t from, size_t size, void *data)
> passed as callback.  Not everything fits that model, but there's a whole
> lot of things that do.

I was planning to do that, mostly because of the iomap dio code that
would not only get a lot cleaner with this, but also support multi-page
bvecs that we hope to have in the block layer soon.  The issue with it
is that we need to touch all the arch get_user_pages_fast
implementations, so it's going to be a relatively invasive change that I
didn't want to fix with just introducing the new direct I/O code.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 10:56       ` Christoph Hellwig
  0 siblings, 0 replies; 101+ messages in thread
From: Christoph Hellwig @ 2017-02-02 10:56 UTC (permalink / raw)
  To: Al Viro
  Cc: Jeff Layton, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:
> On Wed, Jan 25, 2017 at 08:32:03AM -0500, Jeff Layton wrote:
> > Small respin of the patch that I sent yesterday for the same thing.
> > 
> > This moves the maxsize handling into iov_iter_pvec_size, so that we don't
> > end up iterating past the max size we'll use anyway when trying to
> > determine the pagevec length.
> > 
> > Also, a respun patch to make ceph use iov_iter_get_pages_alloc instead of
> > trying to do it via its own routine.
> > 
> > Al, if these look ok, do you want to pick these up or shall I ask
> > Ilya to merge them via the ceph tree?
> 
> I'd rather have that kind of work go through the vfs tree; said that,
> I really wonder if this is the right approach.  Most of the users of
> iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
> something like
> 	iov_iter_for_each_page(iter, size, f, data)
> with int (*f)(struct page *page, size_t from, size_t size, void *data)
> passed as callback.  Not everything fits that model, but there's a whole
> lot of things that do.

I was planning to do that, mostly because of the iomap dio code that
would not only get a lot cleaner with this, but also support multi-page
bvecs that we hope to have in the block layer soon.  The issue with it
is that we need to touch all the arch get_user_pages_fast
implementations, so it's going to be a relatively invasive change that I
didn't want to fix with just introducing the new direct I/O code.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 10:56       ` Christoph Hellwig
  0 siblings, 0 replies; 101+ messages in thread
From: Christoph Hellwig @ 2017-02-02 10:56 UTC (permalink / raw)
  To: Al Viro
  Cc: Jeff Layton, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:
> On Wed, Jan 25, 2017 at 08:32:03AM -0500, Jeff Layton wrote:
> > Small respin of the patch that I sent yesterday for the same thing.
> > 
> > This moves the maxsize handling into iov_iter_pvec_size, so that we don't
> > end up iterating past the max size we'll use anyway when trying to
> > determine the pagevec length.
> > 
> > Also, a respun patch to make ceph use iov_iter_get_pages_alloc instead of
> > trying to do it via its own routine.
> > 
> > Al, if these look ok, do you want to pick these up or shall I ask
> > Ilya to merge them via the ceph tree?
> 
> I'd rather have that kind of work go through the vfs tree; said that,
> I really wonder if this is the right approach.  Most of the users of
> iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
> something like
> 	iov_iter_for_each_page(iter, size, f, data)
> with int (*f)(struct page *page, size_t from, size_t size, void *data)
> passed as callback.  Not everything fits that model, but there's a whole
> lot of things that do.

I was planning to do that, mostly because of the iomap dio code that
would not only get a lot cleaner with this, but also support multi-page
bvecs that we hope to have in the block layer soon.  The issue with it
is that we need to touch all the arch get_user_pages_fast
implementations, so it's going to be a relatively invasive change that I
didn't want to fix with just introducing the new direct I/O code.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 11:16         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-02 11:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jeff Layton, linux-fsdevel, linux-kernel, linux-nfs, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 02:56:51AM -0800, Christoph Hellwig wrote:
> > I really wonder if this is the right approach.  Most of the users of
> > iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
> > something like
> > 	iov_iter_for_each_page(iter, size, f, data)
> > with int (*f)(struct page *page, size_t from, size_t size, void *data)
> > passed as callback.  Not everything fits that model, but there's a whole
> > lot of things that do.
> 
> I was planning to do that, mostly because of the iomap dio code that
> would not only get a lot cleaner with this, but also support multi-page
> bvecs that we hope to have in the block layer soon.  The issue with it
> is that we need to touch all the arch get_user_pages_fast
> implementations, so it's going to be a relatively invasive change that I
> didn't want to fix with just introducing the new direct I/O code.

I'm not sure we need to touch any get_user_pages_fast() at all; let it
fill a medium-sized array and use that as a buffer.  In particular,
I *really* don't like the idea of having the callbacks done in an
inconsistent locking environment - sometimes under ->mmap_sem, sometimes
not.

I played with "let it fill bio_vec array", but it doesn't really fit the
users; variant with callbacks is cleaner, IMO.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 11:16         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-02 11:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jeff Layton, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 02:56:51AM -0800, Christoph Hellwig wrote:
> > I really wonder if this is the right approach.  Most of the users of
> > iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
> > something like
> > 	iov_iter_for_each_page(iter, size, f, data)
> > with int (*f)(struct page *page, size_t from, size_t size, void *data)
> > passed as callback.  Not everything fits that model, but there's a whole
> > lot of things that do.
> 
> I was planning to do that, mostly because of the iomap dio code that
> would not only get a lot cleaner with this, but also support multi-page
> bvecs that we hope to have in the block layer soon.  The issue with it
> is that we need to touch all the arch get_user_pages_fast
> implementations, so it's going to be a relatively invasive change that I
> didn't want to fix with just introducing the new direct I/O code.

I'm not sure we need to touch any get_user_pages_fast() at all; let it
fill a medium-sized array and use that as a buffer.  In particular,
I *really* don't like the idea of having the callbacks done in an
inconsistent locking environment - sometimes under ->mmap_sem, sometimes
not.

I played with "let it fill bio_vec array", but it doesn't really fit the
users; variant with callbacks is cleaner, IMO.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 11:16         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-02 11:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jeff Layton, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 02:56:51AM -0800, Christoph Hellwig wrote:
> > I really wonder if this is the right approach.  Most of the users of
> > iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
> > something like
> > 	iov_iter_for_each_page(iter, size, f, data)
> > with int (*f)(struct page *page, size_t from, size_t size, void *data)
> > passed as callback.  Not everything fits that model, but there's a whole
> > lot of things that do.
> 
> I was planning to do that, mostly because of the iomap dio code that
> would not only get a lot cleaner with this, but also support multi-page
> bvecs that we hope to have in the block layer soon.  The issue with it
> is that we need to touch all the arch get_user_pages_fast
> implementations, so it's going to be a relatively invasive change that I
> didn't want to fix with just introducing the new direct I/O code.

I'm not sure we need to touch any get_user_pages_fast() at all; let it
fill a medium-sized array and use that as a buffer.  In particular,
I *really* don't like the idea of having the callbacks done in an
inconsistent locking environment - sometimes under ->mmap_sem, sometimes
not.

I played with "let it fill bio_vec array", but it doesn't really fit the
users; variant with callbacks is cleaner, IMO.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 13:00           ` Jeff Layton
  0 siblings, 0 replies; 101+ messages in thread
From: Jeff Layton @ 2017-02-02 13:00 UTC (permalink / raw)
  To: Al Viro, Christoph Hellwig
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer, Linus Torvalds, Jan Kara, Chris Wilson,
	Kirill A. Shutemov

On Thu, 2017-02-02 at 11:16 +0000, Al Viro wrote:
> On Thu, Feb 02, 2017 at 02:56:51AM -0800, Christoph Hellwig wrote:
> > > I really wonder if this is the right approach.  Most of the users of
> > > iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
> > > something like
> > > 	iov_iter_for_each_page(iter, size, f, data)
> > > with int (*f)(struct page *page, size_t from, size_t size, void *data)
> > > passed as callback.  Not everything fits that model, but there's a whole
> > > lot of things that do.
> > 

Yeah, that does seem nicer.

My only concern is that it sounds more invasive, and I'd like to get
some sort of interim fix in for Ceph in the meantime (preferably
something we can send to stable). It really is trivial to vmsplice some
data into the kernel, and then splice write that to a DIO file --
instant softlockup in my testing.

This patch also makes vectored DIO with small iovecs on NFS a lot more
efficient, which is a nice bonus.
 
> > I was planning to do that, mostly because of the iomap dio code that
> > would not only get a lot cleaner with this, but also support multi-page
> > bvecs that we hope to have in the block layer soon.  The issue with it
> > is that we need to touch all the arch get_user_pages_fast
> > implementations, so it's going to be a relatively invasive change that I
> > didn't want to fix with just introducing the new direct I/O code.
> 
> I'm not sure we need to touch any get_user_pages_fast() at all; let it
> fill a medium-sized array and use that as a buffer.  In particular,
> I *really* don't like the idea of having the callbacks done in an
> inconsistent locking environment - sometimes under ->mmap_sem, sometimes
> not.
> 

Yeah, that might work. You could kmalloc the buffer array according to
the maxsize value. For small ones we could even consider using an on-
stack buffer.

> I played with "let it fill bio_vec array", but it doesn't really fit the
> users; variant with callbacks is cleaner, IMO.

I looked at converting this over to a bio_vec array as well, but that
really doesn't work well for Ceph. I like the callback idea better too.

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 13:00           ` Jeff Layton
  0 siblings, 0 replies; 101+ messages in thread
From: Jeff Layton @ 2017-02-02 13:00 UTC (permalink / raw)
  To: Al Viro, Christoph Hellwig
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov

On Thu, 2017-02-02 at 11:16 +0000, Al Viro wrote:
> On Thu, Feb 02, 2017 at 02:56:51AM -0800, Christoph Hellwig wrote:
> > > I really wonder if this is the right approach.  Most of the users of
> > > iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
> > > something like
> > > 	iov_iter_for_each_page(iter, size, f, data)
> > > with int (*f)(struct page *page, size_t from, size_t size, void *data)
> > > passed as callback.  Not everything fits that model, but there's a whole
> > > lot of things that do.
> > 

Yeah, that does seem nicer.

My only concern is that it sounds more invasive, and I'd like to get
some sort of interim fix in for Ceph in the meantime (preferably
something we can send to stable). It really is trivial to vmsplice some
data into the kernel, and then splice write that to a DIO file --
instant softlockup in my testing.

This patch also makes vectored DIO with small iovecs on NFS a lot more
efficient, which is a nice bonus.
 
> > I was planning to do that, mostly because of the iomap dio code that
> > would not only get a lot cleaner with this, but also support multi-page
> > bvecs that we hope to have in the block layer soon.  The issue with it
> > is that we need to touch all the arch get_user_pages_fast
> > implementations, so it's going to be a relatively invasive change that I
> > didn't want to fix with just introducing the new direct I/O code.
> 
> I'm not sure we need to touch any get_user_pages_fast() at all; let it
> fill a medium-sized array and use that as a buffer.  In particular,
> I *really* don't like the idea of having the callbacks done in an
> inconsistent locking environment - sometimes under ->mmap_sem, sometimes
> not.
> 

Yeah, that might work. You could kmalloc the buffer array according to
the maxsize value. For small ones we could even consider using an on-
stack buffer.

> I played with "let it fill bio_vec array", but it doesn't really fit the
> users; variant with callbacks is cleaner, IMO.

I looked at converting this over to a bio_vec array as well, but that
really doesn't work well for Ceph. I like the callback idea better too.

-- 
Jeff Layton <jlayton-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 14:48       ` Jan Kara
  0 siblings, 0 replies; 101+ messages in thread
From: Jan Kara @ 2017-02-02 14:48 UTC (permalink / raw)
  To: Al Viro
  Cc: Jeff Layton, linux-fsdevel, linux-kernel, linux-nfs, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov

On Thu 02-02-17 09:51:25, Al Viro wrote:
> I'm massaging that code (along with a lot of RTFS); the interesting questions
> related to VM side of things are
> 	* what are the relative costs of doing small vs. large batches?  Some
> of get_user_pages_fast() instances have comments along the lines of "we ought
> to limit the batch size, but then nobody's doing more than 64 pages at a time
> anyway".

Well, I believe it's only about the cost of setting up page table walk and
walking down those several levels of page tables. It is cheaper to copy
several PTE entries from the leaf level than doing the full walk every
time. But I didn't ever profile it to get actual numbers.

Large batches are only a problem because of possible soft-lockups and irq
latencies AFAIK. So the batch just should not be insanely large...

> 	* Not a question: any ->fault() that returns VM_FAULT_RETRY when
> *not* passed FAULT_FLAG_ALLOW_RETRY in flags ought to be shot.  cxlflash one
> sure as hell is.

Yep.

> 	* drivers/gpu/drm/vgem/vgem_drv.c:vgem_gem_fault() is bloody odd -
> shmem_read_mapping_page() can't return -EBUSY, AFAICS.  vm_insert_page()
> used to (and back then vgem_gem_fault() used to be broken), but these days
> it looks like dead code...

Looks like that.

> 	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
> it's only (ab)used there as 'not zero, but doesn't contain any error bits';
> VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
> right?

I can see only Lustre doing it and IMHO it is abuse. VM_FAULT_RETRY is used
for mmap_sem latency reduction when paging in pages and so not everybody
handles it. If a handler wants to simply retry the fault, returning
VM_FAULT_NOPAGE is a more common way to do that...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 14:48       ` Jan Kara
  0 siblings, 0 replies; 101+ messages in thread
From: Jan Kara @ 2017-02-02 14:48 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu 02-02-17 09:51:25, Al Viro wrote:
> I'm massaging that code (along with a lot of RTFS); the interesting questions
> related to VM side of things are
> 	* what are the relative costs of doing small vs. large batches?  Some
> of get_user_pages_fast() instances have comments along the lines of "we ought
> to limit the batch size, but then nobody's doing more than 64 pages at a time
> anyway".

Well, I believe it's only about the cost of setting up page table walk and
walking down those several levels of page tables. It is cheaper to copy
several PTE entries from the leaf level than doing the full walk every
time. But I didn't ever profile it to get actual numbers.

Large batches are only a problem because of possible soft-lockups and irq
latencies AFAIK. So the batch just should not be insanely large...

> 	* Not a question: any ->fault() that returns VM_FAULT_RETRY when
> *not* passed FAULT_FLAG_ALLOW_RETRY in flags ought to be shot.  cxlflash one
> sure as hell is.

Yep.

> 	* drivers/gpu/drm/vgem/vgem_drv.c:vgem_gem_fault() is bloody odd -
> shmem_read_mapping_page() can't return -EBUSY, AFAICS.  vm_insert_page()
> used to (and back then vgem_gem_fault() used to be broken), but these days
> it looks like dead code...

Looks like that.

> 	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
> it's only (ab)used there as 'not zero, but doesn't contain any error bits';
> VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
> right?

I can see only Lustre doing it and IMHO it is abuse. VM_FAULT_RETRY is used
for mmap_sem latency reduction when paging in pages and so not everybody
handles it. If a handler wants to simply retry the fault, returning
VM_FAULT_NOPAGE is a more common way to do that...

								Honza
-- 
Jan Kara <jack-IBi9RG/b67k@public.gmane.org>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 14:48       ` Jan Kara
  0 siblings, 0 replies; 101+ messages in thread
From: Jan Kara @ 2017-02-02 14:48 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu 02-02-17 09:51:25, Al Viro wrote:
> I'm massaging that code (along with a lot of RTFS); the interesting questions
> related to VM side of things are
> 	* what are the relative costs of doing small vs. large batches?  Some
> of get_user_pages_fast() instances have comments along the lines of "we ought
> to limit the batch size, but then nobody's doing more than 64 pages at a time
> anyway".

Well, I believe it's only about the cost of setting up page table walk and
walking down those several levels of page tables. It is cheaper to copy
several PTE entries from the leaf level than doing the full walk every
time. But I didn't ever profile it to get actual numbers.

Large batches are only a problem because of possible soft-lockups and irq
latencies AFAIK. So the batch just should not be insanely large...

> 	* Not a question: any ->fault() that returns VM_FAULT_RETRY when
> *not* passed FAULT_FLAG_ALLOW_RETRY in flags ought to be shot.  cxlflash one
> sure as hell is.

Yep.

> 	* drivers/gpu/drm/vgem/vgem_drv.c:vgem_gem_fault() is bloody odd -
> shmem_read_mapping_page() can't return -EBUSY, AFAICS.  vm_insert_page()
> used to (and back then vgem_gem_fault() used to be broken), but these days
> it looks like dead code...

Looks like that.

> 	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
> it's only (ab)used there as 'not zero, but doesn't contain any error bits';
> VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
> right?

I can see only Lustre doing it and IMHO it is abuse. VM_FAULT_RETRY is used
for mmap_sem latency reduction when paging in pages and so not everybody
handles it. If a handler wants to simply retry the fault, returning
VM_FAULT_NOPAGE is a more common way to do that...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 18:28         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-02 18:28 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jeff Layton, linux-fsdevel, linux-kernel, linux-nfs, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Chris Wilson,
	Kirill A. Shutemov

On Thu, Feb 02, 2017 at 03:48:17PM +0100, Jan Kara wrote:

> > 	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
> > it's only (ab)used there as 'not zero, but doesn't contain any error bits';
> > VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
> > right?
> 
> I can see only Lustre doing it and IMHO it is abuse. VM_FAULT_RETRY is used
> for mmap_sem latency reduction when paging in pages and so not everybody
> handles it. If a handler wants to simply retry the fault, returning
> VM_FAULT_NOPAGE is a more common way to do that...

/* Convert errno to return value from ->page_mkwrite() call */
static inline int block_page_mkwrite_return(int err)
{
        if (err == 0)
                return VM_FAULT_LOCKED;
        if (err == -EFAULT)
                return VM_FAULT_NOPAGE;
        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        if (err == -EAGAIN)
                return VM_FAULT_RETRY;
        /* -ENOSPC, -EDQUOT, -EIO ... */
        return VM_FAULT_SIGBUS;
}

and a bunch of ->page_mkwrite() instances using that.  However, the only
callers of ->page_mkwrite() are wp_page_shared()->do_page_mkwrite() and
do_shared_fault()->do_page_mkwrite().  do_page_mkwrite() treates
VM_FAULT_RETRY as "lock page and return VM_FAULT_RETRY|VM_FAULT_LOCKED".
Both callers do the same check -
                if (unlikely(!tmp || (tmp &
                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
and the return value if that predicate is false.  FWIW, use of VM_FAULT_RETRY
comes from your patch back in 2011 and AFAICS the same analysis used to
apply back then, except for the open-coded method calls where we use
do_page_mkwrite() these days...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 18:28         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-02 18:28 UTC (permalink / raw)
  To: Jan Kara
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 02, 2017 at 03:48:17PM +0100, Jan Kara wrote:

> > 	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
> > it's only (ab)used there as 'not zero, but doesn't contain any error bits';
> > VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
> > right?
> 
> I can see only Lustre doing it and IMHO it is abuse. VM_FAULT_RETRY is used
> for mmap_sem latency reduction when paging in pages and so not everybody
> handles it. If a handler wants to simply retry the fault, returning
> VM_FAULT_NOPAGE is a more common way to do that...

/* Convert errno to return value from ->page_mkwrite() call */
static inline int block_page_mkwrite_return(int err)
{
        if (err == 0)
                return VM_FAULT_LOCKED;
        if (err == -EFAULT)
                return VM_FAULT_NOPAGE;
        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        if (err == -EAGAIN)
                return VM_FAULT_RETRY;
        /* -ENOSPC, -EDQUOT, -EIO ... */
        return VM_FAULT_SIGBUS;
}

and a bunch of ->page_mkwrite() instances using that.  However, the only
callers of ->page_mkwrite() are wp_page_shared()->do_page_mkwrite() and
do_shared_fault()->do_page_mkwrite().  do_page_mkwrite() treates
VM_FAULT_RETRY as "lock page and return VM_FAULT_RETRY|VM_FAULT_LOCKED".
Both callers do the same check -
                if (unlikely(!tmp || (tmp &
                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
and the return value if that predicate is false.  FWIW, use of VM_FAULT_RETRY
comes from your patch back in 2011 and AFAICS the same analysis used to
apply back then, except for the open-coded method calls where we use
do_page_mkwrite() these days...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-02 18:28         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-02 18:28 UTC (permalink / raw)
  To: Jan Kara
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 02, 2017 at 03:48:17PM +0100, Jan Kara wrote:

> > 	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
> > it's only (ab)used there as 'not zero, but doesn't contain any error bits';
> > VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
> > right?
> 
> I can see only Lustre doing it and IMHO it is abuse. VM_FAULT_RETRY is used
> for mmap_sem latency reduction when paging in pages and so not everybody
> handles it. If a handler wants to simply retry the fault, returning
> VM_FAULT_NOPAGE is a more common way to do that...

/* Convert errno to return value from ->page_mkwrite() call */
static inline int block_page_mkwrite_return(int err)
{
        if (err == 0)
                return VM_FAULT_LOCKED;
        if (err == -EFAULT)
                return VM_FAULT_NOPAGE;
        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        if (err == -EAGAIN)
                return VM_FAULT_RETRY;
        /* -ENOSPC, -EDQUOT, -EIO ... */
        return VM_FAULT_SIGBUS;
}

and a bunch of ->page_mkwrite() instances using that.  However, the only
callers of ->page_mkwrite() are wp_page_shared()->do_page_mkwrite() and
do_shared_fault()->do_page_mkwrite().  do_page_mkwrite() treates
VM_FAULT_RETRY as "lock page and return VM_FAULT_RETRY|VM_FAULT_LOCKED".
Both callers do the same check -
                if (unlikely(!tmp || (tmp &
                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
and the return value if that predicate is false.  FWIW, use of VM_FAULT_RETRY
comes from your patch back in 2011 and AFAICS the same analysis used to
apply back then, except for the open-coded method calls where we use
do_page_mkwrite() these days...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03  7:29             ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-03  7:29 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Christoph Hellwig, linux-fsdevel, linux-kernel, linux-nfs,
	ceph-devel, lustre-devel, v9fs-developer, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 08:00:52AM -0500, Jeff Layton wrote:
> > I'm not sure we need to touch any get_user_pages_fast() at all; let it
> > fill a medium-sized array and use that as a buffer.  In particular,
> > I *really* don't like the idea of having the callbacks done in an
> > inconsistent locking environment - sometimes under ->mmap_sem, sometimes
> > not.
> > 
> 
> Yeah, that might work. You could kmalloc the buffer array according to
> the maxsize value. For small ones we could even consider using an on-
> stack buffer.

Yes.  FWIW, I'm not proposing to kill iov_iter_get_pages{,_alloc}()
immediately - the new primitive would initially use the damn thing.
That can be backported without any problems, and conversions would be
one-by-one.  If/when we get to the point where most of the users have
been switched to the new helper, we could try and see if what remains
could be dealt with; if that works, it might be possible to fold
iov_iter_get_pages() into it.  In particular, for ITER_BVEC and
ITER_PIPE we don't need to bother with any intermediate arrays at all, etc.
But that's only after several cycles when everyone is asked to try and use
the new primitive instead of iov_iter_get_pages().

The calling conventions of iov_iter_get_pages() are ugly and I would love to
get rid of them, but doing that in a flagday conversion is a lot of PITA
for no good reason.

Speaking of get_user_pages() and conversions:

get_user_pages() relies upon find_extend_vma() to reject kernel
addresses; the fast side of get_user_pages_fast() doesn't have anything
of that sort in case e.g. HAVE_GENERIC_RCU_GUP.  Sure, usually we have
that verified and rejected earlier anyway, but it's a fairly subtle difference
that doesn't seem to be documented anywhere.

For example, /dev/st* reads and writes (st_read()/st_write()) feed the
address of buffer to get_user_pages_unlocked().  If somebody replaced
those with get_user_pages_fast(), we'd be in trouble as soon as
some code got tricked into using kernel_write() on /dev/st*.

access_ok() in HAVE_GENERIC_RCU_GUP {__,}get_user_pages_fast() obviously
doesn't help in that scenario.  What am I missing here?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03  7:29             ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-03  7:29 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	Christoph Hellwig, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 02, 2017 at 08:00:52AM -0500, Jeff Layton wrote:
> > I'm not sure we need to touch any get_user_pages_fast() at all; let it
> > fill a medium-sized array and use that as a buffer.  In particular,
> > I *really* don't like the idea of having the callbacks done in an
> > inconsistent locking environment - sometimes under ->mmap_sem, sometimes
> > not.
> > 
> 
> Yeah, that might work. You could kmalloc the buffer array according to
> the maxsize value. For small ones we could even consider using an on-
> stack buffer.

Yes.  FWIW, I'm not proposing to kill iov_iter_get_pages{,_alloc}()
immediately - the new primitive would initially use the damn thing.
That can be backported without any problems, and conversions would be
one-by-one.  If/when we get to the point where most of the users have
been switched to the new helper, we could try and see if what remains
could be dealt with; if that works, it might be possible to fold
iov_iter_get_pages() into it.  In particular, for ITER_BVEC and
ITER_PIPE we don't need to bother with any intermediate arrays at all, etc.
But that's only after several cycles when everyone is asked to try and use
the new primitive instead of iov_iter_get_pages().

The calling conventions of iov_iter_get_pages() are ugly and I would love to
get rid of them, but doing that in a flagday conversion is a lot of PITA
for no good reason.

Speaking of get_user_pages() and conversions:

get_user_pages() relies upon find_extend_vma() to reject kernel
addresses; the fast side of get_user_pages_fast() doesn't have anything
of that sort in case e.g. HAVE_GENERIC_RCU_GUP.  Sure, usually we have
that verified and rejected earlier anyway, but it's a fairly subtle difference
that doesn't seem to be documented anywhere.

For example, /dev/st* reads and writes (st_read()/st_write()) feed the
address of buffer to get_user_pages_unlocked().  If somebody replaced
those with get_user_pages_fast(), we'd be in trouble as soon as
some code got tricked into using kernel_write() on /dev/st*.

access_ok() in HAVE_GENERIC_RCU_GUP {__,}get_user_pages_fast() obviously
doesn't help in that scenario.  What am I missing here?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03  7:29             ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-03  7:29 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	Christoph Hellwig, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 02, 2017 at 08:00:52AM -0500, Jeff Layton wrote:
> > I'm not sure we need to touch any get_user_pages_fast() at all; let it
> > fill a medium-sized array and use that as a buffer.  In particular,
> > I *really* don't like the idea of having the callbacks done in an
> > inconsistent locking environment - sometimes under ->mmap_sem, sometimes
> > not.
> > 
> 
> Yeah, that might work. You could kmalloc the buffer array according to
> the maxsize value. For small ones we could even consider using an on-
> stack buffer.

Yes.  FWIW, I'm not proposing to kill iov_iter_get_pages{,_alloc}()
immediately - the new primitive would initially use the damn thing.
That can be backported without any problems, and conversions would be
one-by-one.  If/when we get to the point where most of the users have
been switched to the new helper, we could try and see if what remains
could be dealt with; if that works, it might be possible to fold
iov_iter_get_pages() into it.  In particular, for ITER_BVEC and
ITER_PIPE we don't need to bother with any intermediate arrays at all, etc.
But that's only after several cycles when everyone is asked to try and use
the new primitive instead of iov_iter_get_pages().

The calling conventions of iov_iter_get_pages() are ugly and I would love to
get rid of them, but doing that in a flagday conversion is a lot of PITA
for no good reason.

Speaking of get_user_pages() and conversions:

get_user_pages() relies upon find_extend_vma() to reject kernel
addresses; the fast side of get_user_pages_fast() doesn't have anything
of that sort in case e.g. HAVE_GENERIC_RCU_GUP.  Sure, usually we have
that verified and rejected earlier anyway, but it's a fairly subtle difference
that doesn't seem to be documented anywhere.

For example, /dev/st* reads and writes (st_read()/st_write()) feed the
address of buffer to get_user_pages_unlocked().  If somebody replaced
those with get_user_pages_fast(), we'd be in trouble as soon as
some code got tricked into using kernel_write() on /dev/st*.

access_ok() in HAVE_GENERIC_RCU_GUP {__,}get_user_pages_fast() obviously
doesn't help in that scenario.  What am I missing here?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03  7:49             ` Christoph Hellwig
  0 siblings, 0 replies; 101+ messages in thread
From: Christoph Hellwig @ 2017-02-03  7:49 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Al Viro, Christoph Hellwig, linux-fsdevel, linux-kernel,
	linux-nfs, ceph-devel, lustre-devel, v9fs-developer,
	Linus Torvalds, Jan Kara, Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 08:00:52AM -0500, Jeff Layton wrote:
> Yeah, that might work. You could kmalloc the buffer array according to
> the maxsize value. For small ones we could even consider using an on-
> stack buffer.

For the block direct I/O code we defintively want to avoid any
allocations for small I/O a that shows up in the performance numbers.
And we'd like to reuse the on-stack bio_vec so that the defintion of
a small I/O can be as big as possible without blowing up the stack.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03  7:49             ` Christoph Hellwig
  0 siblings, 0 replies; 101+ messages in thread
From: Christoph Hellwig @ 2017-02-03  7:49 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	Christoph Hellwig, Al Viro, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 02, 2017 at 08:00:52AM -0500, Jeff Layton wrote:
> Yeah, that might work. You could kmalloc the buffer array according to
> the maxsize value. For small ones we could even consider using an on-
> stack buffer.

For the block direct I/O code we defintively want to avoid any
allocations for small I/O a that shows up in the performance numbers.
And we'd like to reuse the on-stack bio_vec so that the defintion of
a small I/O can be as big as possible without blowing up the stack.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03  7:49             ` Christoph Hellwig
  0 siblings, 0 replies; 101+ messages in thread
From: Christoph Hellwig @ 2017-02-03  7:49 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	Christoph Hellwig, Al Viro, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 02, 2017 at 08:00:52AM -0500, Jeff Layton wrote:
> Yeah, that might work. You could kmalloc the buffer array according to
> the maxsize value. For small ones we could even consider using an on-
> stack buffer.

For the block direct I/O code we defintively want to avoid any
allocations for small I/O a that shows up in the performance numbers.
And we'd like to reuse the on-stack bio_vec so that the defintion of
a small I/O can be as big as possible without blowing up the stack.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03  8:54               ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-03  8:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jeff Layton, linux-fsdevel, linux-kernel, linux-nfs, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 11:49:01PM -0800, Christoph Hellwig wrote:
> On Thu, Feb 02, 2017 at 08:00:52AM -0500, Jeff Layton wrote:
> > Yeah, that might work. You could kmalloc the buffer array according to
> > the maxsize value. For small ones we could even consider using an on-
> > stack buffer.
> 
> For the block direct I/O code we defintively want to avoid any
> allocations for small I/O a that shows up in the performance numbers.
> And we'd like to reuse the on-stack bio_vec so that the defintion of
> a small I/O can be as big as possible without blowing up the stack.

Hmm...  Reuse part is really nasty ;-/  OTOH, it might make sense to have
a "fill bio_vec array" as separate primitive - having that sucker come
from bio looks like an artificial restriction.

OK, next question, seeing that you've dealt with O_DIRECT guts more than
I have.  When we have iov_iter_get_pages() fail on do_direct_IO() write
with some blocks already allocated, we pick zero page as data source.
So far, so good, but:
	* should we bother zeroing anything unless buffer_new() is true?
	* why, in case of more than a page worth of pending allocated
blocks, do we bother with calling iov_iter_get_pages() again and again?
We *do* take care not to allocate anything else after that point, but
dio_get_page() will be calling iov_iter_get_pages() every time in that
case - there's only one page in queue.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03  8:54               ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-03  8:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jeff Layton, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 11:49:01PM -0800, Christoph Hellwig wrote:
> On Thu, Feb 02, 2017 at 08:00:52AM -0500, Jeff Layton wrote:
> > Yeah, that might work. You could kmalloc the buffer array according to
> > the maxsize value. For small ones we could even consider using an on-
> > stack buffer.
> 
> For the block direct I/O code we defintively want to avoid any
> allocations for small I/O a that shows up in the performance numbers.
> And we'd like to reuse the on-stack bio_vec so that the defintion of
> a small I/O can be as big as possible without blowing up the stack.

Hmm...  Reuse part is really nasty ;-/  OTOH, it might make sense to have
a "fill bio_vec array" as separate primitive - having that sucker come
from bio looks like an artificial restriction.

OK, next question, seeing that you've dealt with O_DIRECT guts more than
I have.  When we have iov_iter_get_pages() fail on do_direct_IO() write
with some blocks already allocated, we pick zero page as data source.
So far, so good, but:
	* should we bother zeroing anything unless buffer_new() is true?
	* why, in case of more than a page worth of pending allocated
blocks, do we bother with calling iov_iter_get_pages() again and again?
We *do* take care not to allocate anything else after that point, but
dio_get_page() will be calling iov_iter_get_pages() every time in that
case - there's only one page in queue.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03  8:54               ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-03  8:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jeff Layton, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov

On Thu, Feb 02, 2017 at 11:49:01PM -0800, Christoph Hellwig wrote:
> On Thu, Feb 02, 2017 at 08:00:52AM -0500, Jeff Layton wrote:
> > Yeah, that might work. You could kmalloc the buffer array according to
> > the maxsize value. For small ones we could even consider using an on-
> > stack buffer.
> 
> For the block direct I/O code we defintively want to avoid any
> allocations for small I/O a that shows up in the performance numbers.
> And we'd like to reuse the on-stack bio_vec so that the defintion of
> a small I/O can be as big as possible without blowing up the stack.

Hmm...  Reuse part is really nasty ;-/  OTOH, it might make sense to have
a "fill bio_vec array" as separate primitive - having that sucker come
from bio looks like an artificial restriction.

OK, next question, seeing that you've dealt with O_DIRECT guts more than
I have.  When we have iov_iter_get_pages() fail on do_direct_IO() write
with some blocks already allocated, we pick zero page as data source.
So far, so good, but:
	* should we bother zeroing anything unless buffer_new() is true?
	* why, in case of more than a page worth of pending allocated
blocks, do we bother with calling iov_iter_get_pages() again and again?
We *do* take care not to allocate anything else after that point, but
dio_get_page() will be calling iov_iter_get_pages() every time in that
case - there's only one page in queue.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 11:09                 ` Christoph Hellwig
  0 siblings, 0 replies; 101+ messages in thread
From: Christoph Hellwig @ 2017-02-03 11:09 UTC (permalink / raw)
  To: Al Viro
  Cc: Christoph Hellwig, Jeff Layton, linux-fsdevel, linux-kernel,
	linux-nfs, ceph-devel, lustre-devel, v9fs-developer,
	Linus Torvalds, Jan Kara, Chris Wilson, Kirill A. Shutemov

On Fri, Feb 03, 2017 at 08:54:15AM +0000, Al Viro wrote:
> Hmm...  Reuse part is really nasty ;-/  OTOH, it might make sense to have
> a "fill bio_vec array" as separate primitive - having that sucker come
> from bio looks like an artificial restriction.

Or just the only usecase :)  But yes, it could be generalized to take a
bio_vec without too much effort.

> OK, next question, seeing that you've dealt with O_DIRECT guts more than
> I have.  When we have iov_iter_get_pages() fail on do_direct_IO() write
> with some blocks already allocated, we pick zero page as data source.
> So far, so good, but:
> 	* should we bother zeroing anything unless buffer_new() is true?

I don't think so.

> 	* why, in case of more than a page worth of pending allocated
> blocks, do we bother with calling iov_iter_get_pages() again and again?
> We *do* take care not to allocate anything else after that point, but
> dio_get_page() will be calling iov_iter_get_pages() every time in that
> case - there's only one page in queue.

There shouldn't be a need for it.

But take it with a grain of salt - fs/direct-io.c is a hairy mess,
that's one of the reasons why I replaced it with the new iomap
code instead of trying to gradually move it to iomap API.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 11:09                 ` Christoph Hellwig
  0 siblings, 0 replies; 101+ messages in thread
From: Christoph Hellwig @ 2017-02-03 11:09 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	Christoph Hellwig, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Fri, Feb 03, 2017 at 08:54:15AM +0000, Al Viro wrote:
> Hmm...  Reuse part is really nasty ;-/  OTOH, it might make sense to have
> a "fill bio_vec array" as separate primitive - having that sucker come
> from bio looks like an artificial restriction.

Or just the only usecase :)  But yes, it could be generalized to take a
bio_vec without too much effort.

> OK, next question, seeing that you've dealt with O_DIRECT guts more than
> I have.  When we have iov_iter_get_pages() fail on do_direct_IO() write
> with some blocks already allocated, we pick zero page as data source.
> So far, so good, but:
> 	* should we bother zeroing anything unless buffer_new() is true?

I don't think so.

> 	* why, in case of more than a page worth of pending allocated
> blocks, do we bother with calling iov_iter_get_pages() again and again?
> We *do* take care not to allocate anything else after that point, but
> dio_get_page() will be calling iov_iter_get_pages() every time in that
> case - there's only one page in queue.

There shouldn't be a need for it.

But take it with a grain of salt - fs/direct-io.c is a hairy mess,
that's one of the reasons why I replaced it with the new iomap
code instead of trying to gradually move it to iomap API.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 11:09                 ` Christoph Hellwig
  0 siblings, 0 replies; 101+ messages in thread
From: Christoph Hellwig @ 2017-02-03 11:09 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	Christoph Hellwig, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Fri, Feb 03, 2017 at 08:54:15AM +0000, Al Viro wrote:
> Hmm...  Reuse part is really nasty ;-/  OTOH, it might make sense to have
> a "fill bio_vec array" as separate primitive - having that sucker come
> from bio looks like an artificial restriction.

Or just the only usecase :)  But yes, it could be generalized to take a
bio_vec without too much effort.

> OK, next question, seeing that you've dealt with O_DIRECT guts more than
> I have.  When we have iov_iter_get_pages() fail on do_direct_IO() write
> with some blocks already allocated, we pick zero page as data source.
> So far, so good, but:
> 	* should we bother zeroing anything unless buffer_new() is true?

I don't think so.

> 	* why, in case of more than a page worth of pending allocated
> blocks, do we bother with calling iov_iter_get_pages() again and again?
> We *do* take care not to allocate anything else after that point, but
> dio_get_page() will be calling iov_iter_get_pages() every time in that
> case - there's only one page in queue.

There shouldn't be a need for it.

But take it with a grain of salt - fs/direct-io.c is a hairy mess,
that's one of the reasons why I replaced it with the new iomap
code instead of trying to gradually move it to iomap API.

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 14:47           ` Jan Kara
  0 siblings, 0 replies; 101+ messages in thread
From: Jan Kara @ 2017-02-03 14:47 UTC (permalink / raw)
  To: Al Viro
  Cc: Jan Kara, Jeff Layton, linux-fsdevel, linux-kernel, linux-nfs,
	ceph-devel, lustre-devel, v9fs-developer, Linus Torvalds,
	Chris Wilson, Kirill A. Shutemov

On Thu 02-02-17 18:28:02, Al Viro wrote:
> On Thu, Feb 02, 2017 at 03:48:17PM +0100, Jan Kara wrote:
> 
> > > 	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
> > > it's only (ab)used there as 'not zero, but doesn't contain any error bits';
> > > VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
> > > right?
> > 
> > I can see only Lustre doing it and IMHO it is abuse. VM_FAULT_RETRY is used
> > for mmap_sem latency reduction when paging in pages and so not everybody
> > handles it. If a handler wants to simply retry the fault, returning
> > VM_FAULT_NOPAGE is a more common way to do that...
> 
> /* Convert errno to return value from ->page_mkwrite() call */
> static inline int block_page_mkwrite_return(int err)
> {
>         if (err == 0)
>                 return VM_FAULT_LOCKED;
>         if (err == -EFAULT)
>                 return VM_FAULT_NOPAGE;
>         if (err == -ENOMEM)
>                 return VM_FAULT_OOM;
>         if (err == -EAGAIN)
>                 return VM_FAULT_RETRY;
>         /* -ENOSPC, -EDQUOT, -EIO ... */
>         return VM_FAULT_SIGBUS;
> }
> 
> and a bunch of ->page_mkwrite() instances using that.  However, the only
> callers of ->page_mkwrite() are wp_page_shared()->do_page_mkwrite() and
> do_shared_fault()->do_page_mkwrite().  do_page_mkwrite() treates
> VM_FAULT_RETRY as "lock page and return VM_FAULT_RETRY|VM_FAULT_LOCKED".
> Both callers do the same check -
>                 if (unlikely(!tmp || (tmp &
>                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> and the return value if that predicate is false.  FWIW, use of VM_FAULT_RETRY
> comes from your patch back in 2011 and AFAICS the same analysis used to
> apply back then, except for the open-coded method calls where we use
> do_page_mkwrite() these days...

Yeah, back then I was not aware of VM_FAULT_RETRY limitations and your
analysis above just shows that its handling from do_page_mkwrite() is
simply broken (or better non-existent). Actually that VM_FAULT_RETRY
return was added by fs freeze handling patch. The freeze handling was
later changed but that change to block_page_mkwrite_return() remained.
I'll send a patch to remove it. Thanks for spotting this.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 14:47           ` Jan Kara
  0 siblings, 0 replies; 101+ messages in thread
From: Jan Kara @ 2017-02-03 14:47 UTC (permalink / raw)
  To: Al Viro
  Cc: Jan Kara, Jeff Layton, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Chris Wilson, Kirill A. Shutemov

On Thu 02-02-17 18:28:02, Al Viro wrote:
> On Thu, Feb 02, 2017 at 03:48:17PM +0100, Jan Kara wrote:
> 
> > > 	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
> > > it's only (ab)used there as 'not zero, but doesn't contain any error bits';
> > > VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
> > > right?
> > 
> > I can see only Lustre doing it and IMHO it is abuse. VM_FAULT_RETRY is used
> > for mmap_sem latency reduction when paging in pages and so not everybody
> > handles it. If a handler wants to simply retry the fault, returning
> > VM_FAULT_NOPAGE is a more common way to do that...
> 
> /* Convert errno to return value from ->page_mkwrite() call */
> static inline int block_page_mkwrite_return(int err)
> {
>         if (err == 0)
>                 return VM_FAULT_LOCKED;
>         if (err == -EFAULT)
>                 return VM_FAULT_NOPAGE;
>         if (err == -ENOMEM)
>                 return VM_FAULT_OOM;
>         if (err == -EAGAIN)
>                 return VM_FAULT_RETRY;
>         /* -ENOSPC, -EDQUOT, -EIO ... */
>         return VM_FAULT_SIGBUS;
> }
> 
> and a bunch of ->page_mkwrite() instances using that.  However, the only
> callers of ->page_mkwrite() are wp_page_shared()->do_page_mkwrite() and
> do_shared_fault()->do_page_mkwrite().  do_page_mkwrite() treates
> VM_FAULT_RETRY as "lock page and return VM_FAULT_RETRY|VM_FAULT_LOCKED".
> Both callers do the same check -
>                 if (unlikely(!tmp || (tmp &
>                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> and the return value if that predicate is false.  FWIW, use of VM_FAULT_RETRY
> comes from your patch back in 2011 and AFAICS the same analysis used to
> apply back then, except for the open-coded method calls where we use
> do_page_mkwrite() these days...

Yeah, back then I was not aware of VM_FAULT_RETRY limitations and your
analysis above just shows that its handling from do_page_mkwrite() is
simply broken (or better non-existent). Actually that VM_FAULT_RETRY
return was added by fs freeze handling patch. The freeze handling was
later changed but that change to block_page_mkwrite_return() remained.
I'll send a patch to remove it. Thanks for spotting this.

								Honza
-- 
Jan Kara <jack-IBi9RG/b67k@public.gmane.org>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 14:47           ` Jan Kara
  0 siblings, 0 replies; 101+ messages in thread
From: Jan Kara @ 2017-02-03 14:47 UTC (permalink / raw)
  To: Al Viro
  Cc: Jan Kara, Jeff Layton, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Chris Wilson, Kirill A. Shutemov

On Thu 02-02-17 18:28:02, Al Viro wrote:
> On Thu, Feb 02, 2017 at 03:48:17PM +0100, Jan Kara wrote:
> 
> > > 	* ->page_mkwrite() instances sometimes return VM_FAULT_RETRY; AFAICS,
> > > it's only (ab)used there as 'not zero, but doesn't contain any error bits';
> > > VM_FAULT_RETRY from that source does *not* reach handle_mm_fault() callers,
> > > right?
> > 
> > I can see only Lustre doing it and IMHO it is abuse. VM_FAULT_RETRY is used
> > for mmap_sem latency reduction when paging in pages and so not everybody
> > handles it. If a handler wants to simply retry the fault, returning
> > VM_FAULT_NOPAGE is a more common way to do that...
> 
> /* Convert errno to return value from ->page_mkwrite() call */
> static inline int block_page_mkwrite_return(int err)
> {
>         if (err == 0)
>                 return VM_FAULT_LOCKED;
>         if (err == -EFAULT)
>                 return VM_FAULT_NOPAGE;
>         if (err == -ENOMEM)
>                 return VM_FAULT_OOM;
>         if (err == -EAGAIN)
>                 return VM_FAULT_RETRY;
>         /* -ENOSPC, -EDQUOT, -EIO ... */
>         return VM_FAULT_SIGBUS;
> }
> 
> and a bunch of ->page_mkwrite() instances using that.  However, the only
> callers of ->page_mkwrite() are wp_page_shared()->do_page_mkwrite() and
> do_shared_fault()->do_page_mkwrite().  do_page_mkwrite() treates
> VM_FAULT_RETRY as "lock page and return VM_FAULT_RETRY|VM_FAULT_LOCKED".
> Both callers do the same check -
>                 if (unlikely(!tmp || (tmp &
>                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> and the return value if that predicate is false.  FWIW, use of VM_FAULT_RETRY
> comes from your patch back in 2011 and AFAICS the same analysis used to
> apply back then, except for the open-coded method calls where we use
> do_page_mkwrite() these days...

Yeah, back then I was not aware of VM_FAULT_RETRY limitations and your
analysis above just shows that its handling from do_page_mkwrite() is
simply broken (or better non-existent). Actually that VM_FAULT_RETRY
return was added by fs freeze handling patch. The freeze handling was
later changed but that change to block_page_mkwrite_return() remained.
I'll send a patch to remove it. Thanks for spotting this.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 18:29               ` Linus Torvalds
  0 siblings, 0 replies; 101+ messages in thread
From: Linus Torvalds @ 2017-02-03 18:29 UTC (permalink / raw)
  To: Al Viro
  Cc: Jeff Layton, Christoph Hellwig, linux-fsdevel,
	Linux Kernel Mailing List, Linux NFS Mailing List, ceph-devel,
	lustre-devel, V9FS Developers, Jan Kara, Chris Wilson,
	Kirill A. Shutemov

On Thu, Feb 2, 2017 at 11:29 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> get_user_pages() relies upon find_extend_vma() to reject kernel
> addresses; the fast side of get_user_pages_fast() doesn't have anything
> of that sort

It is *supposed* to have it.

See pte_allows_gup(), for example. In particular, it requires the
_PAGE_USER bit in the PTE (and the devpte case should require
_PAGE_BIT_DEVMAP).

So no, get_user_pages_fast() can not look up kernel pages. If it does,
that would be a bug.

              Linus

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 18:29               ` Linus Torvalds
  0 siblings, 0 replies; 101+ messages in thread
From: Linus Torvalds @ 2017-02-03 18:29 UTC (permalink / raw)
  To: Al Viro
  Cc: Linux NFS Mailing List, Jan Kara, Jeff Layton,
	Linux Kernel Mailing List, Chris Wilson, Christoph Hellwig,
	linux-fsdevel, V9FS Developers, ceph-devel, Kirill A. Shutemov,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 2, 2017 at 11:29 PM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
>
> get_user_pages() relies upon find_extend_vma() to reject kernel
> addresses; the fast side of get_user_pages_fast() doesn't have anything
> of that sort

It is *supposed* to have it.

See pte_allows_gup(), for example. In particular, it requires the
_PAGE_USER bit in the PTE (and the devpte case should require
_PAGE_BIT_DEVMAP).

So no, get_user_pages_fast() can not look up kernel pages. If it does,
that would be a bug.

              Linus

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 18:29               ` Linus Torvalds
  0 siblings, 0 replies; 101+ messages in thread
From: Linus Torvalds @ 2017-02-03 18:29 UTC (permalink / raw)
  To: Al Viro
  Cc: Linux NFS Mailing List, Jan Kara, Jeff Layton,
	Linux Kernel Mailing List, Chris Wilson, Christoph Hellwig,
	linux-fsdevel, V9FS Developers, ceph-devel, Kirill A. Shutemov,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 2, 2017 at 11:29 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> get_user_pages() relies upon find_extend_vma() to reject kernel
> addresses; the fast side of get_user_pages_fast() doesn't have anything
> of that sort

It is *supposed* to have it.

See pte_allows_gup(), for example. In particular, it requires the
_PAGE_USER bit in the PTE (and the devpte case should require
_PAGE_BIT_DEVMAP).

So no, get_user_pages_fast() can not look up kernel pages. If it does,
that would be a bug.

              Linus

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 19:08                 ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-03 19:08 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeff Layton, Christoph Hellwig, linux-fsdevel,
	Linux Kernel Mailing List, Linux NFS Mailing List, ceph-devel,
	lustre-devel, V9FS Developers, Jan Kara, Chris Wilson,
	Kirill A. Shutemov

On Fri, Feb 03, 2017 at 10:29:23AM -0800, Linus Torvalds wrote:
> On Thu, Feb 2, 2017 at 11:29 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > get_user_pages() relies upon find_extend_vma() to reject kernel
> > addresses; the fast side of get_user_pages_fast() doesn't have anything
> > of that sort
> 
> It is *supposed* to have it.
> 
> See pte_allows_gup(), for example. In particular, it requires the
> _PAGE_USER bit in the PTE (and the devpte case should require
> _PAGE_BIT_DEVMAP).

On x86 it does.  I don't see anything equivalent in mm/gup.c one, and the
only kinda-sorta similar thing (access_ok() in __get_user_pages_fast()
there) is vulnerable to e.g. access via kernel_write().  The comment in
there
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
 *      pages containing page tables.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.

doesn't look promising - access_ok() is never sufficient.  Something like
_PAGE_USER tests in x86 one solves that problem, but if anything similar
works for HAVE_GENERIC_RCU_GUP I don't see it.  Thus the question re
what am I missing here...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 19:08                 ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-03 19:08 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeff Layton, Christoph Hellwig, linux-fsdevel,
	Linux Kernel Mailing List, Linux NFS Mailing List, ceph-devel,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA, V9FS Developers, Jan Kara,
	Chris Wilson, Kirill A. Shutemov

On Fri, Feb 03, 2017 at 10:29:23AM -0800, Linus Torvalds wrote:
> On Thu, Feb 2, 2017 at 11:29 PM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> >
> > get_user_pages() relies upon find_extend_vma() to reject kernel
> > addresses; the fast side of get_user_pages_fast() doesn't have anything
> > of that sort
> 
> It is *supposed* to have it.
> 
> See pte_allows_gup(), for example. In particular, it requires the
> _PAGE_USER bit in the PTE (and the devpte case should require
> _PAGE_BIT_DEVMAP).

On x86 it does.  I don't see anything equivalent in mm/gup.c one, and the
only kinda-sorta similar thing (access_ok() in __get_user_pages_fast()
there) is vulnerable to e.g. access via kernel_write().  The comment in
there
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
 *      pages containing page tables.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.

doesn't look promising - access_ok() is never sufficient.  Something like
_PAGE_USER tests in x86 one solves that problem, but if anything similar
works for HAVE_GENERIC_RCU_GUP I don't see it.  Thus the question re
what am I missing here...
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 19:08                 ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-03 19:08 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeff Layton, Christoph Hellwig, linux-fsdevel,
	Linux Kernel Mailing List, Linux NFS Mailing List, ceph-devel,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA, V9FS Developers, Jan Kara,
	Chris Wilson, Kirill A. Shutemov

On Fri, Feb 03, 2017 at 10:29:23AM -0800, Linus Torvalds wrote:
> On Thu, Feb 2, 2017 at 11:29 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > get_user_pages() relies upon find_extend_vma() to reject kernel
> > addresses; the fast side of get_user_pages_fast() doesn't have anything
> > of that sort
> 
> It is *supposed* to have it.
> 
> See pte_allows_gup(), for example. In particular, it requires the
> _PAGE_USER bit in the PTE (and the devpte case should require
> _PAGE_BIT_DEVMAP).

On x86 it does.  I don't see anything equivalent in mm/gup.c one, and the
only kinda-sorta similar thing (access_ok() in __get_user_pages_fast()
there) is vulnerable to e.g. access via kernel_write().  The comment in
there
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
 *      pages containing page tables.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.

doesn't look promising - access_ok() is never sufficient.  Something like
_PAGE_USER tests in x86 one solves that problem, but if anything similar
works for HAVE_GENERIC_RCU_GUP I don't see it.  Thus the question re
what am I missing here...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 19:28                   ` Linus Torvalds
  0 siblings, 0 replies; 101+ messages in thread
From: Linus Torvalds @ 2017-02-03 19:28 UTC (permalink / raw)
  To: Al Viro, Steve Capper, Catalin Marinas, Hugh Dickins,
	Peter Zijlstra, Ingo Molnar
  Cc: Jeff Layton, Christoph Hellwig, linux-fsdevel,
	Linux Kernel Mailing List, Linux NFS Mailing List, ceph-devel,
	lustre-devel, V9FS Developers, Jan Kara, Chris Wilson,
	Kirill A. Shutemov

On Fri, Feb 3, 2017 at 11:08 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On x86 it does.  I don't see anything equivalent in mm/gup.c one, and the
> only kinda-sorta similar thing (access_ok() in __get_user_pages_fast()
> there) is vulnerable to e.g. access via kernel_write().

Yeah, access_ok() is bogus. It needs to just check against TASK_SIZE
or whatever.

> doesn't look promising - access_ok() is never sufficient.  Something like
> _PAGE_USER tests in x86 one solves that problem, but if anything similar
> works for HAVE_GENERIC_RCU_GUP I don't see it.  Thus the question re
> what am I missing here...

Ok, I definitely agree that it looks like __get_user_pages_fast() just
needs to get rid of the access_ok() and replace it with a proper check
for the user address space range.

Looks like arm[64] and powerpc.are the current users. Adding in some
people involved with the original submission a few years ago.

I do note that the x86 __get_user_pages_fast() thing looks dodgy too.

In particular, we do it right in the *real* get_user_pages_fast(), see
commit 7f8189068726 ("x86: don't use 'access_ok()' as a range check in
get_user_pages_fast()"). But then the same bug was re-introduced when
the "irq safe" version was merged. As well as in the GENERIC_RCU_GUP
version.

Gaah. Apparently PeterZ copied the old buggy version before the fix
when he added __get_user_pages_fast() in commit 465a454f254e ("x86,
mm: Add __get_user_pages_fast()").

I guess it could be considered a merge error (both happened during the
2.6.31 merge window).

                   Linus

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 19:28                   ` Linus Torvalds
  0 siblings, 0 replies; 101+ messages in thread
From: Linus Torvalds @ 2017-02-03 19:28 UTC (permalink / raw)
  To: Al Viro, Steve Capper, Catalin Marinas, Hugh Dickins,
	Peter Zijlstra, Ingo Molnar
  Cc: Linux NFS Mailing List, Jan Kara, Jeff Layton,
	Linux Kernel Mailing List, Chris Wilson, Christoph Hellwig,
	linux-fsdevel, V9FS Developers, ceph-devel, Kirill A. Shutemov,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Fri, Feb 3, 2017 at 11:08 AM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
>
> On x86 it does.  I don't see anything equivalent in mm/gup.c one, and the
> only kinda-sorta similar thing (access_ok() in __get_user_pages_fast()
> there) is vulnerable to e.g. access via kernel_write().

Yeah, access_ok() is bogus. It needs to just check against TASK_SIZE
or whatever.

> doesn't look promising - access_ok() is never sufficient.  Something like
> _PAGE_USER tests in x86 one solves that problem, but if anything similar
> works for HAVE_GENERIC_RCU_GUP I don't see it.  Thus the question re
> what am I missing here...

Ok, I definitely agree that it looks like __get_user_pages_fast() just
needs to get rid of the access_ok() and replace it with a proper check
for the user address space range.

Looks like arm[64] and powerpc.are the current users. Adding in some
people involved with the original submission a few years ago.

I do note that the x86 __get_user_pages_fast() thing looks dodgy too.

In particular, we do it right in the *real* get_user_pages_fast(), see
commit 7f8189068726 ("x86: don't use 'access_ok()' as a range check in
get_user_pages_fast()"). But then the same bug was re-introduced when
the "irq safe" version was merged. As well as in the GENERIC_RCU_GUP
version.

Gaah. Apparently PeterZ copied the old buggy version before the fix
when he added __get_user_pages_fast() in commit 465a454f254e ("x86,
mm: Add __get_user_pages_fast()").

I guess it could be considered a merge error (both happened during the
2.6.31 merge window).

                   Linus

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-03 19:28                   ` Linus Torvalds
  0 siblings, 0 replies; 101+ messages in thread
From: Linus Torvalds @ 2017-02-03 19:28 UTC (permalink / raw)
  To: Al Viro, Steve Capper, Catalin Marinas, Hugh Dickins,
	Peter Zijlstra, Ingo Molnar
  Cc: Linux NFS Mailing List, Jan Kara, Jeff Layton,
	Linux Kernel Mailing List, Chris Wilson, Christoph Hellwig,
	linux-fsdevel, V9FS Developers, ceph-devel, Kirill A. Shutemov,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Fri, Feb 3, 2017 at 11:08 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On x86 it does.  I don't see anything equivalent in mm/gup.c one, and the
> only kinda-sorta similar thing (access_ok() in __get_user_pages_fast()
> there) is vulnerable to e.g. access via kernel_write().

Yeah, access_ok() is bogus. It needs to just check against TASK_SIZE
or whatever.

> doesn't look promising - access_ok() is never sufficient.  Something like
> _PAGE_USER tests in x86 one solves that problem, but if anything similar
> works for HAVE_GENERIC_RCU_GUP I don't see it.  Thus the question re
> what am I missing here...

Ok, I definitely agree that it looks like __get_user_pages_fast() just
needs to get rid of the access_ok() and replace it with a proper check
for the user address space range.

Looks like arm[64] and powerpc.are the current users. Adding in some
people involved with the original submission a few years ago.

I do note that the x86 __get_user_pages_fast() thing looks dodgy too.

In particular, we do it right in the *real* get_user_pages_fast(), see
commit 7f8189068726 ("x86: don't use 'access_ok()' as a range check in
get_user_pages_fast()"). But then the same bug was re-introduced when
the "irq safe" version was merged. As well as in the GENERIC_RCU_GUP
version.

Gaah. Apparently PeterZ copied the old buggy version before the fix
when he added __get_user_pages_fast() in commit 465a454f254e ("x86,
mm: Add __get_user_pages_fast()").

I guess it could be considered a merge error (both happened during the
2.6.31 merge window).

                   Linus

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04  3:08       ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-04  3:08 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer, Linus Torvalds, Jan Kara, Chris Wilson,
	Kirill A. Shutemov, Jeff Layton

On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:

> 	* fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
> is a good idea there - fuse_copy_do() could bloody well just use
> copy_{to,from}_iter().

Miklos, could you explain why does lock_request() prohibit page faults until
the matching unlock_request()?  All it does is setting FR_LOCKED on
our request and the only thing that even looks at that is fuse_abort_conn(),
which doesn't (AFAICS) wait for anything.

Where does the deadlock come from, and if it's not a deadlock - what is
it?  Or is that comment stale since "fuse: simplify request abort"?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04  3:08       ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-04  3:08 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:

> 	* fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
> is a good idea there - fuse_copy_do() could bloody well just use
> copy_{to,from}_iter().

Miklos, could you explain why does lock_request() prohibit page faults until
the matching unlock_request()?  All it does is setting FR_LOCKED on
our request and the only thing that even looks at that is fuse_abort_conn(),
which doesn't (AFAICS) wait for anything.

Where does the deadlock come from, and if it's not a deadlock - what is
it?  Or is that comment stale since "fuse: simplify request abort"?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04  3:08       ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-04  3:08 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:

> 	* fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
> is a good idea there - fuse_copy_do() could bloody well just use
> copy_{to,from}_iter().

Miklos, could you explain why does lock_request() prohibit page faults until
the matching unlock_request()?  All it does is setting FR_LOCKED on
our request and the only thing that even looks at that is fuse_abort_conn(),
which doesn't (AFAICS) wait for anything.

Where does the deadlock come from, and if it's not a deadlock - what is
it?  Or is that comment stale since "fuse: simplify request abort"?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04 19:26         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-04 19:26 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer, Linus Torvalds, Jan Kara, Chris Wilson,
	Kirill A. Shutemov, Jeff Layton

On Sat, Feb 04, 2017 at 03:08:42AM +0000, Al Viro wrote:
> On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:
> 
> > 	* fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
> > is a good idea there - fuse_copy_do() could bloody well just use
> > copy_{to,from}_iter().
> 
> Miklos, could you explain why does lock_request() prohibit page faults until
> the matching unlock_request()?  All it does is setting FR_LOCKED on
> our request and the only thing that even looks at that is fuse_abort_conn(),
> which doesn't (AFAICS) wait for anything.
> 
> Where does the deadlock come from, and if it's not a deadlock - what is
> it?  Or is that comment stale since "fuse: simplify request abort"?

While we are at it...  How can fuse_copy_page() ever get called with
*pagep == NULL?  AFAICS, for that to happen you need either
	i < req->num_pages && req->pages[i] == NULL  
or in fuse_notify_store() have
                err = fuse_copy_page(cs, &page, offset, this_num, 0);
reached with page == NULL.  The latter is flat-out impossible - we have
                if (!page)
                        goto out_iput;

                this_num = min_t(unsigned, num, PAGE_SIZE - offset);
immediately before the call in question.  As for the former...  I don't see
any place where we would increase ->num_pages without having all additional
->pages[] elements guaranteed to be non-NULL.  Stores to ->num_pages are

in cuse_send_init():
	req->num_pages = 1;
with
        req->pages[0] = page;
shortly before that and
        if (!page)
                goto err_put_req;
earlier.

In fuse_retrieve():
                if (!page)
                        break;

                this_num = min_t(unsigned, num, PAGE_SIZE - offset);
                req->pages[req->num_pages] = page;
                req->page_descs[req->num_pages].length = this_num;
                req->num_pages++;

In fuse_readdir():
        req->num_pages = 1;
        req->pages[0] = page;
with
        if (!page) {
                fuse_put_request(fc, req);
                return -ENOMEM;
        }
several lines above.

In fuse_do_readpage():
        req->num_pages = 1;
        req->pages[0] = page;
with page dereferenced earlier.

In fuse_fill_write_pages():
                req->pages[req->num_pages] = page;
                req->page_descs[req->num_pages].length = tmp;
                req->num_pages++;
with
                if (!page)
                        break;
earlier.

In fuse_get_user_pages():
                ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
                                        *nbytesp - nbytes, 
                                        req->max_pages - req->num_pages,
                                        &start);
                if (ret < 0)
                        break;

                iov_iter_advance(ii, ret);
                nbytes += ret;

                ret += start;
                npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;

                req->page_descs[req->num_pages].offset = start;
                fuse_page_descs_length_init(req, req->num_pages, npages);

                req->num_pages += npages;
which also shouldn't leave any NULLs in the area in question.

In fuse_writepage_locked():
        req->num_pages = 1;
        req->pages[0] = tmp_page;
with
        if (!tmp_page)
                goto err_free;
done earlier.

In fuse_writepage_in_flight():
	req->num_pages = 1;
with
        BUG_ON(new_req->num_pages != 0);
earlier and
        req->pages[req->num_pages] = tmp_page;
done in the caller (which passes req as new_req).  Earlier in the caller
we have
        if (!tmp_page)
                goto out_unlock;

In fuse_writepages_fill():
		req->num_pages = 0;
is obviously OK and
        req->num_pages++;
in the end of the same function is preceded by the same
        req->pages[req->num_pages] = tmp_page;
(fuse_writepages_fill() is the caller of fuse_writepage_in_flight();
reassignment in fuse_writepage_in_flight() happens only in case when
it returns 1 and in that case we don't reach the increment in
fuse_writepages_fill() at all).

In fuse_do_ioctl():
        memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
        req->num_pages = num_pages;
and all increments of num_pages in there are
                pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
                if (!pages[num_pages])
                        goto out;
                num_pages++;
so the array we copy into req->pages has the same property wrt num_pages.

req->pages is assigned only in fuse_request_init(); that gets called
in two places - one is at request allocation time, another (from
put_reserved_req()) passes the current req->pages value, so that leaves it
unchanged.  The contents of req->pages[] is changed only in the aforementioned
places + fuse_request_init(), which zeros ->num_pages + fuse_copy_page()
called from fuse_copy_pages() with &req->pages[i] as argument.  The last
one can modify the damn thing, but only if it hits
                                err = fuse_try_move_page(cs, pagep);
and that sucker never stores a NULL in there -
                *pagep = newpage;
with get_page(newpage) upstream from that point.

What am I missing here?  Looks like those checks in fuse_copy_page() are
dead code...  They had been there since the initial merge, but AFAICS
they were just as useless in 2.6.14.  Rudiments of some prehistorical stuff
that never had been cleaned out, perhaps?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04 19:26         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-04 19:26 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Sat, Feb 04, 2017 at 03:08:42AM +0000, Al Viro wrote:
> On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:
> 
> > 	* fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
> > is a good idea there - fuse_copy_do() could bloody well just use
> > copy_{to,from}_iter().
> 
> Miklos, could you explain why does lock_request() prohibit page faults until
> the matching unlock_request()?  All it does is setting FR_LOCKED on
> our request and the only thing that even looks at that is fuse_abort_conn(),
> which doesn't (AFAICS) wait for anything.
> 
> Where does the deadlock come from, and if it's not a deadlock - what is
> it?  Or is that comment stale since "fuse: simplify request abort"?

While we are at it...  How can fuse_copy_page() ever get called with
*pagep == NULL?  AFAICS, for that to happen you need either
	i < req->num_pages && req->pages[i] == NULL  
or in fuse_notify_store() have
                err = fuse_copy_page(cs, &page, offset, this_num, 0);
reached with page == NULL.  The latter is flat-out impossible - we have
                if (!page)
                        goto out_iput;

                this_num = min_t(unsigned, num, PAGE_SIZE - offset);
immediately before the call in question.  As for the former...  I don't see
any place where we would increase ->num_pages without having all additional
->pages[] elements guaranteed to be non-NULL.  Stores to ->num_pages are

in cuse_send_init():
	req->num_pages = 1;
with
        req->pages[0] = page;
shortly before that and
        if (!page)
                goto err_put_req;
earlier.

In fuse_retrieve():
                if (!page)
                        break;

                this_num = min_t(unsigned, num, PAGE_SIZE - offset);
                req->pages[req->num_pages] = page;
                req->page_descs[req->num_pages].length = this_num;
                req->num_pages++;

In fuse_readdir():
        req->num_pages = 1;
        req->pages[0] = page;
with
        if (!page) {
                fuse_put_request(fc, req);
                return -ENOMEM;
        }
several lines above.

In fuse_do_readpage():
        req->num_pages = 1;
        req->pages[0] = page;
with page dereferenced earlier.

In fuse_fill_write_pages():
                req->pages[req->num_pages] = page;
                req->page_descs[req->num_pages].length = tmp;
                req->num_pages++;
with
                if (!page)
                        break;
earlier.

In fuse_get_user_pages():
                ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
                                        *nbytesp - nbytes, 
                                        req->max_pages - req->num_pages,
                                        &start);
                if (ret < 0)
                        break;

                iov_iter_advance(ii, ret);
                nbytes += ret;

                ret += start;
                npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;

                req->page_descs[req->num_pages].offset = start;
                fuse_page_descs_length_init(req, req->num_pages, npages);

                req->num_pages += npages;
which also shouldn't leave any NULLs in the area in question.

In fuse_writepage_locked():
        req->num_pages = 1;
        req->pages[0] = tmp_page;
with
        if (!tmp_page)
                goto err_free;
done earlier.

In fuse_writepage_in_flight():
	req->num_pages = 1;
with
        BUG_ON(new_req->num_pages != 0);
earlier and
        req->pages[req->num_pages] = tmp_page;
done in the caller (which passes req as new_req).  Earlier in the caller
we have
        if (!tmp_page)
                goto out_unlock;

In fuse_writepages_fill():
		req->num_pages = 0;
is obviously OK and
        req->num_pages++;
in the end of the same function is preceded by the same
        req->pages[req->num_pages] = tmp_page;
(fuse_writepages_fill() is the caller of fuse_writepage_in_flight();
reassignment in fuse_writepage_in_flight() happens only in case when
it returns 1 and in that case we don't reach the increment in
fuse_writepages_fill() at all).

In fuse_do_ioctl():
        memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
        req->num_pages = num_pages;
and all increments of num_pages in there are
                pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
                if (!pages[num_pages])
                        goto out;
                num_pages++;
so the array we copy into req->pages has the same property wrt num_pages.

req->pages is assigned only in fuse_request_init(); that gets called
in two places - one is at request allocation time, another (from
put_reserved_req()) passes the current req->pages value, so that leaves it
unchanged.  The contents of req->pages[] is changed only in the aforementioned
places + fuse_request_init(), which zeros ->num_pages + fuse_copy_page()
called from fuse_copy_pages() with &req->pages[i] as argument.  The last
one can modify the damn thing, but only if it hits
                                err = fuse_try_move_page(cs, pagep);
and that sucker never stores a NULL in there -
                *pagep = newpage;
with get_page(newpage) upstream from that point.

What am I missing here?  Looks like those checks in fuse_copy_page() are
dead code...  They had been there since the initial merge, but AFAICS
they were just as useless in 2.6.14.  Rudiments of some prehistorical stuff
that never had been cleaned out, perhaps?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04 19:26         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-04 19:26 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Sat, Feb 04, 2017 at 03:08:42AM +0000, Al Viro wrote:
> On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:
> 
> > 	* fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
> > is a good idea there - fuse_copy_do() could bloody well just use
> > copy_{to,from}_iter().
> 
> Miklos, could you explain why does lock_request() prohibit page faults until
> the matching unlock_request()?  All it does is setting FR_LOCKED on
> our request and the only thing that even looks at that is fuse_abort_conn(),
> which doesn't (AFAICS) wait for anything.
> 
> Where does the deadlock come from, and if it's not a deadlock - what is
> it?  Or is that comment stale since "fuse: simplify request abort"?

While we are at it...  How can fuse_copy_page() ever get called with
*pagep == NULL?  AFAICS, for that to happen you need either
	i < req->num_pages && req->pages[i] == NULL  
or in fuse_notify_store() have
                err = fuse_copy_page(cs, &page, offset, this_num, 0);
reached with page == NULL.  The latter is flat-out impossible - we have
                if (!page)
                        goto out_iput;

                this_num = min_t(unsigned, num, PAGE_SIZE - offset);
immediately before the call in question.  As for the former...  I don't see
any place where we would increase ->num_pages without having all additional
->pages[] elements guaranteed to be non-NULL.  Stores to ->num_pages are

in cuse_send_init():
	req->num_pages = 1;
with
        req->pages[0] = page;
shortly before that and
        if (!page)
                goto err_put_req;
earlier.

In fuse_retrieve():
                if (!page)
                        break;

                this_num = min_t(unsigned, num, PAGE_SIZE - offset);
                req->pages[req->num_pages] = page;
                req->page_descs[req->num_pages].length = this_num;
                req->num_pages++;

In fuse_readdir():
        req->num_pages = 1;
        req->pages[0] = page;
with
        if (!page) {
                fuse_put_request(fc, req);
                return -ENOMEM;
        }
several lines above.

In fuse_do_readpage():
        req->num_pages = 1;
        req->pages[0] = page;
with page dereferenced earlier.

In fuse_fill_write_pages():
                req->pages[req->num_pages] = page;
                req->page_descs[req->num_pages].length = tmp;
                req->num_pages++;
with
                if (!page)
                        break;
earlier.

In fuse_get_user_pages():
                ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
                                        *nbytesp - nbytes, 
                                        req->max_pages - req->num_pages,
                                        &start);
                if (ret < 0)
                        break;

                iov_iter_advance(ii, ret);
                nbytes += ret;

                ret += start;
                npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;

                req->page_descs[req->num_pages].offset = start;
                fuse_page_descs_length_init(req, req->num_pages, npages);

                req->num_pages += npages;
which also shouldn't leave any NULLs in the area in question.

In fuse_writepage_locked():
        req->num_pages = 1;
        req->pages[0] = tmp_page;
with
        if (!tmp_page)
                goto err_free;
done earlier.

In fuse_writepage_in_flight():
	req->num_pages = 1;
with
        BUG_ON(new_req->num_pages != 0);
earlier and
        req->pages[req->num_pages] = tmp_page;
done in the caller (which passes req as new_req).  Earlier in the caller
we have
        if (!tmp_page)
                goto out_unlock;

In fuse_writepages_fill():
		req->num_pages = 0;
is obviously OK and
        req->num_pages++;
in the end of the same function is preceded by the same
        req->pages[req->num_pages] = tmp_page;
(fuse_writepages_fill() is the caller of fuse_writepage_in_flight();
reassignment in fuse_writepage_in_flight() happens only in case when
it returns 1 and in that case we don't reach the increment in
fuse_writepages_fill() at all).

In fuse_do_ioctl():
        memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
        req->num_pages = num_pages;
and all increments of num_pages in there are
                pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
                if (!pages[num_pages])
                        goto out;
                num_pages++;
so the array we copy into req->pages has the same property wrt num_pages.

req->pages is assigned only in fuse_request_init(); that gets called
in two places - one is at request allocation time, another (from
put_reserved_req()) passes the current req->pages value, so that leaves it
unchanged.  The contents of req->pages[] is changed only in the aforementioned
places + fuse_request_init(), which zeros ->num_pages + fuse_copy_page()
called from fuse_copy_pages() with &req->pages[i] as argument.  The last
one can modify the damn thing, but only if it hits
                                err = fuse_try_move_page(cs, pagep);
and that sucker never stores a NULL in there -
                *pagep = newpage;
with get_page(newpage) upstream from that point.

What am I missing here?  Looks like those checks in fuse_copy_page() are
dead code...  They had been there since the initial merge, but AFAICS
they were just as useless in 2.6.14.  Rudiments of some prehistorical stuff
that never had been cleaned out, perhaps?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04 22:11         ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-04 22:11 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sat, Feb 4, 2017 at 4:08 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:
>
>>       * fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
>> is a good idea there - fuse_copy_do() could bloody well just use
>> copy_{to,from}_iter().
>
> Miklos, could you explain why does lock_request() prohibit page faults until
> the matching unlock_request()?  All it does is setting FR_LOCKED on
> our request and the only thing that even looks at that is fuse_abort_conn(),
> which doesn't (AFAICS) wait for anything.
>
> Where does the deadlock come from, and if it's not a deadlock - what is
> it?  Or is that comment stale since "fuse: simplify request abort"?

Well, it's not historical; at least not yet.  The deadlock is there
alright: mmap fuse file to addr; read byte from mapped page -> page
locked; this triggeres read request served in same process but
separate thread; write addr-headerlen to fuse dev; trying to lock same
page -> deadlock.

The deadlock can be broken by aborting or force unmounting: return
error for original read request; page unlocked; device write can get
page lock and return.

The reason we need to prohibit pagefault while copying is that when
request is aborted and the caller returns the memory in the request
may become invalid (e.g. data from stack).

Another solution would be to copy all data and keep a ref on the copy
by the request even after being aborted.  This is the plan for the
future.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04 22:11         ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-04 22:11 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sat, Feb 4, 2017 at 4:08 AM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> On Thu, Feb 02, 2017 at 09:51:25AM +0000, Al Viro wrote:
>
>>       * fuse_copy_fill().  I'm not at all sure that iov_iter_get_pages()
>> is a good idea there - fuse_copy_do() could bloody well just use
>> copy_{to,from}_iter().
>
> Miklos, could you explain why does lock_request() prohibit page faults until
> the matching unlock_request()?  All it does is setting FR_LOCKED on
> our request and the only thing that even looks at that is fuse_abort_conn(),
> which doesn't (AFAICS) wait for anything.
>
> Where does the deadlock come from, and if it's not a deadlock - what is
> it?  Or is that comment stale since "fuse: simplify request abort"?

Well, it's not historical; at least not yet.  The deadlock is there
alright: mmap fuse file to addr; read byte from mapped page -> page
locked; this triggeres read request served in same process but
separate thread; write addr-headerlen to fuse dev; trying to lock same
page -> deadlock.

The deadlock can be broken by aborting or force unmounting: return
error for original read request; page unlocked; device write can get
page lock and return.

The reason we need to prohibit pagefault while copying is that when
request is aborted and the caller returns the memory in the request
may become invalid (e.g. data from stack).

Another solution would be to copy all data and keep a ref on the copy
by the request even after being aborted.  This is the plan for the
future.

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04 22:12           ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-04 22:12 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sat, Feb 4, 2017 at 8:26 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Sat, Feb 04, 2017 at 03:08:42AM +0000, Al Viro wrote:

> What am I missing here?  Looks like those checks in fuse_copy_page() are
> dead code...  They had been there since the initial merge, but AFAICS
> they were just as useless in 2.6.14.  Rudiments of some prehistorical stuff
> that never had been cleaned out, perhaps?

Yep, that one is probably historical.  I'll double check to make sure, though.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-04 22:12           ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-04 22:12 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sat, Feb 4, 2017 at 8:26 PM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> On Sat, Feb 04, 2017 at 03:08:42AM +0000, Al Viro wrote:

> What am I missing here?  Looks like those checks in fuse_copy_page() are
> dead code...  They had been there since the initial merge, but AFAICS
> they were just as useless in 2.6.14.  Rudiments of some prehistorical stuff
> that never had been cleaned out, perhaps?

Yep, that one is probably historical.  I'll double check to make sure, though.

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05  1:51           ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05  1:51 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sat, Feb 04, 2017 at 11:11:27PM +0100, Miklos Szeredi wrote:

> Well, it's not historical; at least not yet.  The deadlock is there
> alright: mmap fuse file to addr; read byte from mapped page -> page
> locked; this triggeres read request served in same process but
> separate thread; write addr-headerlen to fuse dev; trying to lock same
> page -> deadlock.

Let me see if I got it straight - you have the same fuse file mmapped
in two processes, one of them being fuse server (either sharing the
entire address space, or the same area mapped in both).  Another process
faults the sucker in; filemap_fault() locks the page and goes
fuse_readpage() -> fuse_do_readpage() -> fuse_send_read() ->
-> fuse_request_send() -> __fuse_request_send() which puts request into
queue and goes to sleep in request_wait_answer().  Eventually, read()
on /dev/fuse (or splice(), whatever) by server picks that request and reply
is formed and fed back into /dev/fuse.  There we (in fuse_do_dev_write())
call copy_out_args(), which tries to copy into our (still locked) page
a piece of data coming from server-supplied iovec.  As it is, you
are calling get_user_pages_fast(), triggering handle_mm_fault().  Since that
malicous FPOS of a server tried to feed you the _same_ mmapped file, you
hit a deadlock.  In server's context.  Correct?

Convoluted, but possible.  But.  Why the hell do we care whether that deadlock
hits in get_user_pages_fast() or in copy_from_user()?  Put it another way,
what difference does it make whether we take that fault with or without
FR_LOCKED in req->flags?

> The deadlock can be broken by aborting or force unmounting: return
> error for original read request; page unlocked; device write can get
> page lock and return.
> 
> The reason we need to prohibit pagefault while copying is that when
> request is aborted and the caller returns the memory in the request
> may become invalid (e.g. data from stack).

???

IDGI.  Your request is marked aborted and should presumably fail, so
that when request_wait_answer() wakes up and finds it screwed, fuse_readpage()
would just return an error and filemap_fault() will return VM_FAULT_SIGBUS,
with page left not uptodate and _not_ inserted into page tables.  What's
leaking where?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05  1:51           ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05  1:51 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Sat, Feb 04, 2017 at 11:11:27PM +0100, Miklos Szeredi wrote:

> Well, it's not historical; at least not yet.  The deadlock is there
> alright: mmap fuse file to addr; read byte from mapped page -> page
> locked; this triggeres read request served in same process but
> separate thread; write addr-headerlen to fuse dev; trying to lock same
> page -> deadlock.

Let me see if I got it straight - you have the same fuse file mmapped
in two processes, one of them being fuse server (either sharing the
entire address space, or the same area mapped in both).  Another process
faults the sucker in; filemap_fault() locks the page and goes
fuse_readpage() -> fuse_do_readpage() -> fuse_send_read() ->
-> fuse_request_send() -> __fuse_request_send() which puts request into
queue and goes to sleep in request_wait_answer().  Eventually, read()
on /dev/fuse (or splice(), whatever) by server picks that request and reply
is formed and fed back into /dev/fuse.  There we (in fuse_do_dev_write())
call copy_out_args(), which tries to copy into our (still locked) page
a piece of data coming from server-supplied iovec.  As it is, you
are calling get_user_pages_fast(), triggering handle_mm_fault().  Since that
malicous FPOS of a server tried to feed you the _same_ mmapped file, you
hit a deadlock.  In server's context.  Correct?

Convoluted, but possible.  But.  Why the hell do we care whether that deadlock
hits in get_user_pages_fast() or in copy_from_user()?  Put it another way,
what difference does it make whether we take that fault with or without
FR_LOCKED in req->flags?

> The deadlock can be broken by aborting or force unmounting: return
> error for original read request; page unlocked; device write can get
> page lock and return.
> 
> The reason we need to prohibit pagefault while copying is that when
> request is aborted and the caller returns the memory in the request
> may become invalid (e.g. data from stack).

???

IDGI.  Your request is marked aborted and should presumably fail, so
that when request_wait_answer() wakes up and finds it screwed, fuse_readpage()
would just return an error and filemap_fault() will return VM_FAULT_SIGBUS,
with page left not uptodate and _not_ inserted into page tables.  What's
leaking where?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05  1:51           ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05  1:51 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Sat, Feb 04, 2017 at 11:11:27PM +0100, Miklos Szeredi wrote:

> Well, it's not historical; at least not yet.  The deadlock is there
> alright: mmap fuse file to addr; read byte from mapped page -> page
> locked; this triggeres read request served in same process but
> separate thread; write addr-headerlen to fuse dev; trying to lock same
> page -> deadlock.

Let me see if I got it straight - you have the same fuse file mmapped
in two processes, one of them being fuse server (either sharing the
entire address space, or the same area mapped in both).  Another process
faults the sucker in; filemap_fault() locks the page and goes
fuse_readpage() -> fuse_do_readpage() -> fuse_send_read() ->
-> fuse_request_send() -> __fuse_request_send() which puts request into
queue and goes to sleep in request_wait_answer().  Eventually, read()
on /dev/fuse (or splice(), whatever) by server picks that request and reply
is formed and fed back into /dev/fuse.  There we (in fuse_do_dev_write())
call copy_out_args(), which tries to copy into our (still locked) page
a piece of data coming from server-supplied iovec.  As it is, you
are calling get_user_pages_fast(), triggering handle_mm_fault().  Since that
malicous FPOS of a server tried to feed you the _same_ mmapped file, you
hit a deadlock.  In server's context.  Correct?

Convoluted, but possible.  But.  Why the hell do we care whether that deadlock
hits in get_user_pages_fast() or in copy_from_user()?  Put it another way,
what difference does it make whether we take that fault with or without
FR_LOCKED in req->flags?

> The deadlock can be broken by aborting or force unmounting: return
> error for original read request; page unlocked; device write can get
> page lock and return.
> 
> The reason we need to prohibit pagefault while copying is that when
> request is aborted and the caller returns the memory in the request
> may become invalid (e.g. data from stack).

???

IDGI.  Your request is marked aborted and should presumably fail, so
that when request_wait_answer() wakes up and finds it screwed, fuse_readpage()
would just return an error and filemap_fault() will return VM_FAULT_SIGBUS,
with page left not uptodate and _not_ inserted into page tables.  What's
leaking where?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 20:15             ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-05 20:15 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 5, 2017 at 2:51 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Sat, Feb 04, 2017 at 11:11:27PM +0100, Miklos Szeredi wrote:
>
>> Well, it's not historical; at least not yet.  The deadlock is there
>> alright: mmap fuse file to addr; read byte from mapped page -> page
>> locked; this triggeres read request served in same process but
>> separate thread; write addr-headerlen to fuse dev; trying to lock same
>> page -> deadlock.
>
> Let me see if I got it straight - you have the same fuse file mmapped
> in two processes, one of them being fuse server (either sharing the
> entire address space, or the same area mapped in both).  Another process
> faults the sucker in; filemap_fault() locks the page and goes
> fuse_readpage() -> fuse_do_readpage() -> fuse_send_read() ->
> -> fuse_request_send() -> __fuse_request_send() which puts request into
> queue and goes to sleep in request_wait_answer().  Eventually, read()
> on /dev/fuse (or splice(), whatever) by server picks that request and reply
> is formed and fed back into /dev/fuse.  There we (in fuse_do_dev_write())
> call copy_out_args(), which tries to copy into our (still locked) page
> a piece of data coming from server-supplied iovec.  As it is, you
> are calling get_user_pages_fast(), triggering handle_mm_fault().  Since that
> malicous FPOS of a server tried to feed you the _same_ mmapped file, you
> hit a deadlock.  In server's context.  Correct?

Yes.

> Convoluted, but possible.  But.  Why the hell do we care whether that deadlock
> hits in get_user_pages_fast() or in copy_from_user()?  Put it another way,
> what difference does it make whether we take that fault with or without
> FR_LOCKED in req->flags?

The difference is that if the page fault happens without FR_LOCKED,
then we can abort the request then and there (done by moving the
request to to_end1 and calling request_end() on it).

If abort happens with FR_LOCKED, we can't end the request now, because
data is possibly being copied to/from the request args.  But we are
guaranteed that the request will end shortly, because no sleeping
under FR_LOCKED is allowed.

But with copy_from_user() page fault and copy aren't separated and so
we don't know whether it's safe to abort or not.

Maybe I'm missing something obvious, but I don't see a simple way out of this.

>> The deadlock can be broken by aborting or force unmounting: return
>> error for original read request; page unlocked; device write can get
>> page lock and return.
>>
>> The reason we need to prohibit pagefault while copying is that when
>> request is aborted and the caller returns the memory in the request
>> may become invalid (e.g. data from stack).
>
> ???
>
> IDGI.  Your request is marked aborted and should presumably fail, so
> that when request_wait_answer() wakes up and finds it screwed, fuse_readpage()
> would just return an error and filemap_fault() will return VM_FAULT_SIGBUS,
> with page left not uptodate and _not_ inserted into page tables.  What's
> leaking where?

That case is fine.   But nothing guarantees that fuse_abort_conn()
won't be called (in the non-deadlock case) when data is being copied
to the request args.  Ending the request at such a point could easily
lead to use after free,

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 20:15             ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-05 20:15 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 5, 2017 at 2:51 AM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> On Sat, Feb 04, 2017 at 11:11:27PM +0100, Miklos Szeredi wrote:
>
>> Well, it's not historical; at least not yet.  The deadlock is there
>> alright: mmap fuse file to addr; read byte from mapped page -> page
>> locked; this triggeres read request served in same process but
>> separate thread; write addr-headerlen to fuse dev; trying to lock same
>> page -> deadlock.
>
> Let me see if I got it straight - you have the same fuse file mmapped
> in two processes, one of them being fuse server (either sharing the
> entire address space, or the same area mapped in both).  Another process
> faults the sucker in; filemap_fault() locks the page and goes
> fuse_readpage() -> fuse_do_readpage() -> fuse_send_read() ->
> -> fuse_request_send() -> __fuse_request_send() which puts request into
> queue and goes to sleep in request_wait_answer().  Eventually, read()
> on /dev/fuse (or splice(), whatever) by server picks that request and reply
> is formed and fed back into /dev/fuse.  There we (in fuse_do_dev_write())
> call copy_out_args(), which tries to copy into our (still locked) page
> a piece of data coming from server-supplied iovec.  As it is, you
> are calling get_user_pages_fast(), triggering handle_mm_fault().  Since that
> malicous FPOS of a server tried to feed you the _same_ mmapped file, you
> hit a deadlock.  In server's context.  Correct?

Yes.

> Convoluted, but possible.  But.  Why the hell do we care whether that deadlock
> hits in get_user_pages_fast() or in copy_from_user()?  Put it another way,
> what difference does it make whether we take that fault with or without
> FR_LOCKED in req->flags?

The difference is that if the page fault happens without FR_LOCKED,
then we can abort the request then and there (done by moving the
request to to_end1 and calling request_end() on it).

If abort happens with FR_LOCKED, we can't end the request now, because
data is possibly being copied to/from the request args.  But we are
guaranteed that the request will end shortly, because no sleeping
under FR_LOCKED is allowed.

But with copy_from_user() page fault and copy aren't separated and so
we don't know whether it's safe to abort or not.

Maybe I'm missing something obvious, but I don't see a simple way out of this.

>> The deadlock can be broken by aborting or force unmounting: return
>> error for original read request; page unlocked; device write can get
>> page lock and return.
>>
>> The reason we need to prohibit pagefault while copying is that when
>> request is aborted and the caller returns the memory in the request
>> may become invalid (e.g. data from stack).
>
> ???
>
> IDGI.  Your request is marked aborted and should presumably fail, so
> that when request_wait_answer() wakes up and finds it screwed, fuse_readpage()
> would just return an error and filemap_fault() will return VM_FAULT_SIGBUS,
> with page left not uptodate and _not_ inserted into page tables.  What's
> leaking where?

That case is fine.   But nothing guarantees that fuse_abort_conn()
won't be called (in the non-deadlock case) when data is being copied
to the request args.  Ending the request at such a point could easily
lead to use after free,

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 20:56             ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 20:56 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 05, 2017 at 01:51:49AM +0000, Al Viro wrote:

> IDGI.  Your request is marked aborted and should presumably fail, so
> that when request_wait_answer() wakes up and finds it screwed, fuse_readpage()
> would just return an error and filemap_fault() will return VM_FAULT_SIGBUS,
> with page left not uptodate and _not_ inserted into page tables.  What's
> leaking where?

Egads...  Do you mean that req->pages[] contents can be dropped by connection
abort right under fuse_copy_pages()?  Or is it that args[...].value can
end up freed under you?

<goes to look at the ->end() callbacks>

Both, apparently, plus the request initiator might have seen that
request has failed and buggered off, with args[...].value pointing to
what used to be initiator's stack frame.  Is that what you are talking about?


If so, why not mark request as "being handled by fuse_dev_do_{read,write}()"
for the duration, and leave the request_end() on such requests for
fuse_dev_do_{read,write}(), seeing that they will call request_end() for such
anyway?

Looks like your FR_LOCKED is not far from that already.  Why not stop
dropping/regaining FR_LOCKED in lock_request()/unlock_request() and simply
have your
                end_requests(fc, &to_end2);
in fuse_abort_conn() skip the actual calls of request_end()?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 20:56             ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 20:56 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Sun, Feb 05, 2017 at 01:51:49AM +0000, Al Viro wrote:

> IDGI.  Your request is marked aborted and should presumably fail, so
> that when request_wait_answer() wakes up and finds it screwed, fuse_readpage()
> would just return an error and filemap_fault() will return VM_FAULT_SIGBUS,
> with page left not uptodate and _not_ inserted into page tables.  What's
> leaking where?

Egads...  Do you mean that req->pages[] contents can be dropped by connection
abort right under fuse_copy_pages()?  Or is it that args[...].value can
end up freed under you?

<goes to look at the ->end() callbacks>

Both, apparently, plus the request initiator might have seen that
request has failed and buggered off, with args[...].value pointing to
what used to be initiator's stack frame.  Is that what you are talking about?


If so, why not mark request as "being handled by fuse_dev_do_{read,write}()"
for the duration, and leave the request_end() on such requests for
fuse_dev_do_{read,write}(), seeing that they will call request_end() for such
anyway?

Looks like your FR_LOCKED is not far from that already.  Why not stop
dropping/regaining FR_LOCKED in lock_request()/unlock_request() and simply
have your
                end_requests(fc, &to_end2);
in fuse_abort_conn() skip the actual calls of request_end()?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 20:56             ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 20:56 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Sun, Feb 05, 2017 at 01:51:49AM +0000, Al Viro wrote:

> IDGI.  Your request is marked aborted and should presumably fail, so
> that when request_wait_answer() wakes up and finds it screwed, fuse_readpage()
> would just return an error and filemap_fault() will return VM_FAULT_SIGBUS,
> with page left not uptodate and _not_ inserted into page tables.  What's
> leaking where?

Egads...  Do you mean that req->pages[] contents can be dropped by connection
abort right under fuse_copy_pages()?  Or is it that args[...].value can
end up freed under you?

<goes to look@the ->end() callbacks>

Both, apparently, plus the request initiator might have seen that
request has failed and buggered off, with args[...].value pointing to
what used to be initiator's stack frame.  Is that what you are talking about?


If so, why not mark request as "being handled by fuse_dev_do_{read,write}()"
for the duration, and leave the request_end() on such requests for
fuse_dev_do_{read,write}(), seeing that they will call request_end() for such
anyway?

Looks like your FR_LOCKED is not far from that already.  Why not stop
dropping/regaining FR_LOCKED in lock_request()/unlock_request() and simply
have your
                end_requests(fc, &to_end2);
in fuse_abort_conn() skip the actual calls of request_end()?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 21:01               ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 21:01 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 05, 2017 at 09:15:24PM +0100, Miklos Szeredi wrote:

> That case is fine.   But nothing guarantees that fuse_abort_conn()
> won't be called (in the non-deadlock case) when data is being copied
> to the request args.  Ending the request at such a point could easily
> lead to use after free,

So why not leave ending it to your fuse_dev_do_write()/fuse_dev_do_read()?
See the reply I'd just sent (your mail arrived while I'd been writing that
one - saw it only after I'd sent mine).

Basically, what if we keep FR_LOCKED through *all* fuse_dev_do_{read,write}(),
rather than dropping and regaining it many times and have fuse_abort_conn()
skip request_end() on FR_LOCKED ones?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 21:01               ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 21:01 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 05, 2017 at 09:15:24PM +0100, Miklos Szeredi wrote:

> That case is fine.   But nothing guarantees that fuse_abort_conn()
> won't be called (in the non-deadlock case) when data is being copied
> to the request args.  Ending the request at such a point could easily
> lead to use after free,

So why not leave ending it to your fuse_dev_do_write()/fuse_dev_do_read()?
See the reply I'd just sent (your mail arrived while I'd been writing that
one - saw it only after I'd sent mine).

Basically, what if we keep FR_LOCKED through *all* fuse_dev_do_{read,write}(),
rather than dropping and regaining it many times and have fuse_abort_conn()
skip request_end() on FR_LOCKED ones?
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 21:01               ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 21:01 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 05, 2017 at 09:15:24PM +0100, Miklos Szeredi wrote:

> That case is fine.   But nothing guarantees that fuse_abort_conn()
> won't be called (in the non-deadlock case) when data is being copied
> to the request args.  Ending the request at such a point could easily
> lead to use after free,

So why not leave ending it to your fuse_dev_do_write()/fuse_dev_do_read()?
See the reply I'd just sent (your mail arrived while I'd been writing that
one - saw it only after I'd sent mine).

Basically, what if we keep FR_LOCKED through *all* fuse_dev_do_{read,write}(),
rather than dropping and regaining it many times and have fuse_abort_conn()
skip request_end() on FR_LOCKED ones?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 21:19                 ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-05 21:19 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 5, 2017 at 10:01 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Sun, Feb 05, 2017 at 09:15:24PM +0100, Miklos Szeredi wrote:
>
>> That case is fine.   But nothing guarantees that fuse_abort_conn()
>> won't be called (in the non-deadlock case) when data is being copied
>> to the request args.  Ending the request at such a point could easily
>> lead to use after free,
>
> So why not leave ending it to your fuse_dev_do_write()/fuse_dev_do_read()?
> See the reply I'd just sent (your mail arrived while I'd been writing that
> one - saw it only after I'd sent mine).
>
> Basically, what if we keep FR_LOCKED through *all* fuse_dev_do_{read,write}(),
> rather than dropping and regaining it many times and have fuse_abort_conn()
> skip request_end() on FR_LOCKED ones?

Then we can't break out of that deadlock: we wait until
fuse_dev_do_write() is done until calling request_end() which
ultimately results in unlocking page.  But fuse_dev_do_write() won't
complete until the page is unlocked.

The only way out that I see is to have a refcount on all pages in
args.  Which means copying everything not already in refcountable page
(i.e. args on stack) to a page array.   It's definitely doable, but
needs time to sort out, and I'm definitely lacking that (overlayfs
currently trumps fuse).

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 21:19                 ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-05 21:19 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 5, 2017 at 10:01 PM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> On Sun, Feb 05, 2017 at 09:15:24PM +0100, Miklos Szeredi wrote:
>
>> That case is fine.   But nothing guarantees that fuse_abort_conn()
>> won't be called (in the non-deadlock case) when data is being copied
>> to the request args.  Ending the request at such a point could easily
>> lead to use after free,
>
> So why not leave ending it to your fuse_dev_do_write()/fuse_dev_do_read()?
> See the reply I'd just sent (your mail arrived while I'd been writing that
> one - saw it only after I'd sent mine).
>
> Basically, what if we keep FR_LOCKED through *all* fuse_dev_do_{read,write}(),
> rather than dropping and regaining it many times and have fuse_abort_conn()
> skip request_end() on FR_LOCKED ones?

Then we can't break out of that deadlock: we wait until
fuse_dev_do_write() is done until calling request_end() which
ultimately results in unlocking page.  But fuse_dev_do_write() won't
complete until the page is unlocked.

The only way out that I see is to have a refcount on all pages in
args.  Which means copying everything not already in refcountable page
(i.e. args on stack) to a page array.   It's definitely doable, but
needs time to sort out, and I'm definitely lacking that (overlayfs
currently trumps fuse).

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-02-05 21:19                 ` Miklos Szeredi
  (?)
  (?)
@ 2017-02-05 22:04                   ` Al Viro
  -1 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 22:04 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 05, 2017 at 10:19:20PM +0100, Miklos Szeredi wrote:

> Then we can't break out of that deadlock: we wait until
> fuse_dev_do_write() is done until calling request_end() which
> ultimately results in unlocking page.  But fuse_dev_do_write() won't
> complete until the page is unlocked.

Wait a sec.  What happens if

process A: fuse_lookup()
		struct fuse_entry_out outarg on stack
		...
		fuse_request_send() with req->out.args[0].value = &outarg
			sleep in request_wait_answer() on req->waitq
server: read the request, write reply
	fuse_dev_do_write()
		copy_out_args()
			fuse_copy_args()
				fuse_copy_one()
					FR_LOCKED is guaranteed to be set
					fuse_copy_do()
process C on another CPU: umount -f
	fuse_conn_abort()
		end_requests()
			request_end()
				set FR_FINISHED
				wake A up (via req->waitq)
process A: regain CPU
	bugger off from request_wait_answer(), through __fuse_request_send(),
	fuse_request_send(), fuse_simple_request(), fuse_lookup_name(),
	fuse_lookup() and out of fuse_lookup().

In the meanwhile, server in fuse_copy_do() does memcpy() to what used to
be outarg, corrupting the stack of process A.

Sure, you need to hit a fairly narrow window, especially if you are to
cause damage in A, but AFAICS it's not impossible.  Consider e.g. the
situation when you lose CPU on preempt on the way to memcpy(); in that
case server might come back when A has incremented its stack footprint
again.  Or A might end up taking a hardware interrupt and handling it
on the normal kernel stack, etc.

Looks like *any* scenario where fuse_conn_abort() manages to run during
that memcpy() has potential for that kind of trouble; any SMP box appears
to be vulnerable, along with preempt UP...

Am I missing something that prevents that kind of problem?

> The only way out that I see is to have a refcount on all pages in
> args.  Which means copying everything not already in refcountable page
> (i.e. args on stack) to a page array.   It's definitely doable, but
> needs time to sort out, and I'm definitely lacking that (overlayfs
> currently trumps fuse).

Hrm...  Then maybe I'll have to try and cook something along those lines;
AFAICS the current mainline is vulnerable...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 22:04                   ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 22:04 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 05, 2017 at 10:19:20PM +0100, Miklos Szeredi wrote:

> Then we can't break out of that deadlock: we wait until
> fuse_dev_do_write() is done until calling request_end() which
> ultimately results in unlocking page.  But fuse_dev_do_write() won't
> complete until the page is unlocked.

Wait a sec.  What happens if

process A: fuse_lookup()
		struct fuse_entry_out outarg on stack
		...
		fuse_request_send() with req->out.args[0].value = &outarg
			sleep in request_wait_answer() on req->waitq
server: read the request, write reply
	fuse_dev_do_write()
		copy_out_args()
			fuse_copy_args()
				fuse_copy_one()
					FR_LOCKED is guaranteed to be set
					fuse_copy_do()
process C on another CPU: umount -f
	fuse_conn_abort()
		end_requests()
			request_end()
				set FR_FINISHED
				wake A up (via�req->waitq)
process A: regain CPU
	bugger off from request_wait_answer(), through __fuse_request_send(),
	fuse_request_send(), fuse_simple_request(), fuse_lookup_name(),
	fuse_lookup() and out of fuse_lookup().

In the meanwhile, server in fuse_copy_do() does memcpy() to what used to
be outarg, corrupting the stack of process A.

Sure, you need to hit a fairly narrow window, especially if you are to
cause damage in A, but AFAICS it's not impossible.  Consider e.g. the
situation when you lose CPU on preempt on the way to memcpy(); in that
case server might come back when A has incremented its stack footprint
again.  Or A might end up taking a hardware interrupt and handling it
on the normal kernel stack, etc.

Looks like *any* scenario where fuse_conn_abort() manages to run during
that memcpy() has potential for that kind of trouble; any SMP box appears
to be vulnerable, along with preempt UP...

Am I missing something that prevents that kind of problem?

> The only way out that I see is to have a refcount on all pages in
> args.  Which means copying everything not already in refcountable page
> (i.e. args on stack) to a page array.   It's definitely doable, but
> needs time to sort out, and I'm definitely lacking that (overlayfs
> currently trumps fuse).

Hrm...  Then maybe I'll have to try and cook something along those lines;
AFAICS the current mainline is vulnerable...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 22:04                   ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 22:04 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 05, 2017 at 10:19:20PM +0100, Miklos Szeredi wrote:

> Then we can't break out of that deadlock: we wait until
> fuse_dev_do_write() is done until calling request_end() which
> ultimately results in unlocking page.  But fuse_dev_do_write() won't
> complete until the page is unlocked.

Wait a sec.  What happens if

process A: fuse_lookup()
		struct fuse_entry_out outarg on stack
		...
		fuse_request_send() with req->out.args[0].value = &outarg
			sleep in request_wait_answer() on req->waitq
server: read the request, write reply
	fuse_dev_do_write()
		copy_out_args()
			fuse_copy_args()
				fuse_copy_one()
					FR_LOCKED is guaranteed to be set
					fuse_copy_do()
process C on another CPU: umount -f
	fuse_conn_abort()
		end_requests()
			request_end()
				set FR_FINISHED
				wake A up (via req->waitq)
process A: regain CPU
	bugger off from request_wait_answer(), through __fuse_request_send(),
	fuse_request_send(), fuse_simple_request(), fuse_lookup_name(),
	fuse_lookup() and out of fuse_lookup().

In the meanwhile, server in fuse_copy_do() does memcpy() to what used to
be outarg, corrupting the stack of process A.

Sure, you need to hit a fairly narrow window, especially if you are to
cause damage in A, but AFAICS it's not impossible.  Consider e.g. the
situation when you lose CPU on preempt on the way to memcpy(); in that
case server might come back when A has incremented its stack footprint
again.  Or A might end up taking a hardware interrupt and handling it
on the normal kernel stack, etc.

Looks like *any* scenario where fuse_conn_abort() manages to run during
that memcpy() has potential for that kind of trouble; any SMP box appears
to be vulnerable, along with preempt UP...

Am I missing something that prevents that kind of problem?

> The only way out that I see is to have a refcount on all pages in
> args.  Which means copying everything not already in refcountable page
> (i.e. args on stack) to a page array.   It's definitely doable, but
> needs time to sort out, and I'm definitely lacking that (overlayfs
> currently trumps fuse).

Hrm...  Then maybe I'll have to try and cook something along those lines;
AFAICS the current mainline is vulnerable...
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-05 22:04                   ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-05 22:04 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 05, 2017 at 10:19:20PM +0100, Miklos Szeredi wrote:

> Then we can't break out of that deadlock: we wait until
> fuse_dev_do_write() is done until calling request_end() which
> ultimately results in unlocking page.  But fuse_dev_do_write() won't
> complete until the page is unlocked.

Wait a sec.  What happens if

process A: fuse_lookup()
		struct fuse_entry_out outarg on stack
		...
		fuse_request_send() with req->out.args[0].value = &outarg
			sleep in request_wait_answer() on req->waitq
server: read the request, write reply
	fuse_dev_do_write()
		copy_out_args()
			fuse_copy_args()
				fuse_copy_one()
					FR_LOCKED is guaranteed to be set
					fuse_copy_do()
process C on another CPU: umount -f
	fuse_conn_abort()
		end_requests()
			request_end()
				set FR_FINISHED
				wake A up (via?req->waitq)
process A: regain CPU
	bugger off from request_wait_answer(), through __fuse_request_send(),
	fuse_request_send(), fuse_simple_request(), fuse_lookup_name(),
	fuse_lookup() and out of fuse_lookup().

In the meanwhile, server in fuse_copy_do() does memcpy() to what used to
be outarg, corrupting the stack of process A.

Sure, you need to hit a fairly narrow window, especially if you are to
cause damage in A, but AFAICS it's not impossible.  Consider e.g. the
situation when you lose CPU on preempt on the way to memcpy(); in that
case server might come back when A has incremented its stack footprint
again.  Or A might end up taking a hardware interrupt and handling it
on the normal kernel stack, etc.

Looks like *any* scenario where fuse_conn_abort() manages to run during
that memcpy() has potential for that kind of trouble; any SMP box appears
to be vulnerable, along with preempt UP...

Am I missing something that prevents that kind of problem?

> The only way out that I see is to have a refcount on all pages in
> args.  Which means copying everything not already in refcountable page
> (i.e. args on stack) to a page array.   It's definitely doable, but
> needs time to sort out, and I'm definitely lacking that (overlayfs
> currently trumps fuse).

Hrm...  Then maybe I'll have to try and cook something along those lines;
AFAICS the current mainline is vulnerable...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-06  3:05                     ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-06  3:05 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 05, 2017 at 10:04:45PM +0000, Al Viro wrote:

> Sure, you need to hit a fairly narrow window, especially if you are to
> cause damage in A, but AFAICS it's not impossible.  Consider e.g. the
> situation when you lose CPU on preempt on the way to memcpy(); in that
> case server might come back when A has incremented its stack footprint
> again.  Or A might end up taking a hardware interrupt and handling it
> on the normal kernel stack, etc.
> 
> Looks like *any* scenario where fuse_conn_abort() manages to run during
> that memcpy() has potential for that kind of trouble; any SMP box appears
> to be vulnerable, along with preempt UP...
> 
> Am I missing something that prevents that kind of problem?

For that matter, it doesn't have to be on-stack - e.g. fuse_get_link()
has kmalloc'ed buffer for destination, kfree'd upon failure.  Have the
damn thing lose the timeslice in fuse_copy_do() and you might very well
end up spraying user-supplied data over whatever ends up picking your
kfreed buffer.  That one could be reasonably dealt with if we switched
to page_alloc() and stuffed it into the ->pages[] instead...

Some observations regarding the arguments:
	* stack footprint is atrocious.  Consider e.g. fuse_mknod() - you
get 16 bytes of fuse_mknod_in + 120 bytes of struct fuse_args + 128 bytes
of fuse_entry_out.  All on stack, and that's on top of whatever the
callchain already has eaten, which might include e.g. nfsd stuff or
ecryptfs, etc.  Or fuse_get_parent(), for that matter, with 128 bytes of
fuse_entry_out + 120 bytes of fuse_args, both on stack.  This one is
guaranteed to have a nasty call chain - fuse_get_parent() <- reconnect_one()
<- reconnect_path() <- exportfs_decode_fh() (itself with a 256-byte array of
char on stack) <- nfsd_set_fh_dentry() <- fh_verify() <- a bunch of call
chains in nfsd.
	* "out" args (i.e. reply) are probably best dealt with by having
coallocated with request itself - some already are and the sizes tend
to be fixed and not too large (->get_link() is an exception, and it's
probably better handled as mentioned above).
	* "in" args (request) are in some cases easily dealt with by
coallocating with request, but there's a large class of situations where
we are passing dentry->d_name.name and then there's fuse_symlink().
The last one is ugly - potentially up to a page worth of data, coming
straight from method caller; usually it's a part of getname() result,
but e.g. ecryptfs might have it kmalloc'ed, nfsd - picked from sunrpc
request payload, etc.

	AFAICS, your argument applies to the requests that have
some page(s) locked until the request completion (unlock_page() either
by ->end() callback or in the originator of request).  If so, I would
rather mark those as "call request_end() early"; they seem to have
the non-page parts of args hosted in req->misc, so for them it's not
a problem.

	So how about this:

* explicit FR_END_IMMEDIATELY on read/write-related requests
* no FR_LOCKED flipping in lock_request()/unlock_request()
* modifying the call of end_requests() in fuse_abort_conn() so that it
would skip request_end() for everything that isn't marked FR_END_IMMEDIATELY
* make fuse_copy_pages() grab page references around the actual
fuse_copy_page() - grab req->waitq.lock, check FR_ABORTED, grab a page
reference in case it's not, drop req->waitq.lock and bugger off if FR_ABORTED
was set.  Adjust fuse_try_move_page() accordingly.

Do you see any problems with that approach for minimal fix?  If all requests
in need of FR_END_IMMEDIATELY turn out to have non-page part of args already
embedded into req->misc, it looks like this ought to suffice.  I probably
could post something along those lines tomorrow, if you see any serious
problems with that - please yell...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-06  3:05                     ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-06  3:05 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Sun, Feb 05, 2017 at 10:04:45PM +0000, Al Viro wrote:

> Sure, you need to hit a fairly narrow window, especially if you are to
> cause damage in A, but AFAICS it's not impossible.  Consider e.g. the
> situation when you lose CPU on preempt on the way to memcpy(); in that
> case server might come back when A has incremented its stack footprint
> again.  Or A might end up taking a hardware interrupt and handling it
> on the normal kernel stack, etc.
> 
> Looks like *any* scenario where fuse_conn_abort() manages to run during
> that memcpy() has potential for that kind of trouble; any SMP box appears
> to be vulnerable, along with preempt UP...
> 
> Am I missing something that prevents that kind of problem?

For that matter, it doesn't have to be on-stack - e.g. fuse_get_link()
has kmalloc'ed buffer for destination, kfree'd upon failure.  Have the
damn thing lose the timeslice in fuse_copy_do() and you might very well
end up spraying user-supplied data over whatever ends up picking your
kfreed buffer.  That one could be reasonably dealt with if we switched
to page_alloc() and stuffed it into the ->pages[] instead...

Some observations regarding the arguments:
	* stack footprint is atrocious.  Consider e.g. fuse_mknod() - you
get 16 bytes of fuse_mknod_in + 120 bytes of struct fuse_args + 128 bytes
of fuse_entry_out.  All on stack, and that's on top of whatever the
callchain already has eaten, which might include e.g. nfsd stuff or
ecryptfs, etc.  Or fuse_get_parent(), for that matter, with 128 bytes of
fuse_entry_out + 120 bytes of fuse_args, both on stack.  This one is
guaranteed to have a nasty call chain - fuse_get_parent() <- reconnect_one()
<- reconnect_path() <- exportfs_decode_fh() (itself with a 256-byte array of
char on stack) <- nfsd_set_fh_dentry() <- fh_verify() <- a bunch of call
chains in nfsd.
	* "out" args (i.e. reply) are probably best dealt with by having
coallocated with request itself - some already are and the sizes tend
to be fixed and not too large (->get_link() is an exception, and it's
probably better handled as mentioned above).
	* "in" args (request) are in some cases easily dealt with by
coallocating with request, but there's a large class of situations where
we are passing dentry->d_name.name and then there's fuse_symlink().
The last one is ugly - potentially up to a page worth of data, coming
straight from method caller; usually it's a part of getname() result,
but e.g. ecryptfs might have it kmalloc'ed, nfsd - picked from sunrpc
request payload, etc.

	AFAICS, your argument applies to the requests that have
some page(s) locked until the request completion (unlock_page() either
by ->end() callback or in the originator of request).  If so, I would
rather mark those as "call request_end() early"; they seem to have
the non-page parts of args hosted in req->misc, so for them it's not
a problem.

	So how about this:

* explicit FR_END_IMMEDIATELY on read/write-related requests
* no FR_LOCKED flipping in lock_request()/unlock_request()
* modifying the call of end_requests() in fuse_abort_conn() so that it
would skip request_end() for everything that isn't marked FR_END_IMMEDIATELY
* make fuse_copy_pages() grab page references around the actual
fuse_copy_page() - grab req->waitq.lock, check FR_ABORTED, grab a page
reference in case it's not, drop req->waitq.lock and bugger off if FR_ABORTED
was set.  Adjust fuse_try_move_page() accordingly.

Do you see any problems with that approach for minimal fix?  If all requests
in need of FR_END_IMMEDIATELY turn out to have non-page part of args already
embedded into req->misc, it looks like this ought to suffice.  I probably
could post something along those lines tomorrow, if you see any serious
problems with that - please yell...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-06  3:05                     ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-06  3:05 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Sun, Feb 05, 2017 at 10:04:45PM +0000, Al Viro wrote:

> Sure, you need to hit a fairly narrow window, especially if you are to
> cause damage in A, but AFAICS it's not impossible.  Consider e.g. the
> situation when you lose CPU on preempt on the way to memcpy(); in that
> case server might come back when A has incremented its stack footprint
> again.  Or A might end up taking a hardware interrupt and handling it
> on the normal kernel stack, etc.
> 
> Looks like *any* scenario where fuse_conn_abort() manages to run during
> that memcpy() has potential for that kind of trouble; any SMP box appears
> to be vulnerable, along with preempt UP...
> 
> Am I missing something that prevents that kind of problem?

For that matter, it doesn't have to be on-stack - e.g. fuse_get_link()
has kmalloc'ed buffer for destination, kfree'd upon failure.  Have the
damn thing lose the timeslice in fuse_copy_do() and you might very well
end up spraying user-supplied data over whatever ends up picking your
kfreed buffer.  That one could be reasonably dealt with if we switched
to page_alloc() and stuffed it into the ->pages[] instead...

Some observations regarding the arguments:
	* stack footprint is atrocious.  Consider e.g. fuse_mknod() - you
get 16 bytes of fuse_mknod_in + 120 bytes of struct fuse_args + 128 bytes
of fuse_entry_out.  All on stack, and that's on top of whatever the
callchain already has eaten, which might include e.g. nfsd stuff or
ecryptfs, etc.  Or fuse_get_parent(), for that matter, with 128 bytes of
fuse_entry_out + 120 bytes of fuse_args, both on stack.  This one is
guaranteed to have a nasty call chain - fuse_get_parent() <- reconnect_one()
<- reconnect_path() <- exportfs_decode_fh() (itself with a 256-byte array of
char on stack) <- nfsd_set_fh_dentry() <- fh_verify() <- a bunch of call
chains in nfsd.
	* "out" args (i.e. reply) are probably best dealt with by having
coallocated with request itself - some already are and the sizes tend
to be fixed and not too large (->get_link() is an exception, and it's
probably better handled as mentioned above).
	* "in" args (request) are in some cases easily dealt with by
coallocating with request, but there's a large class of situations where
we are passing dentry->d_name.name and then there's fuse_symlink().
The last one is ugly - potentially up to a page worth of data, coming
straight from method caller; usually it's a part of getname() result,
but e.g. ecryptfs might have it kmalloc'ed, nfsd - picked from sunrpc
request payload, etc.

	AFAICS, your argument applies to the requests that have
some page(s) locked until the request completion (unlock_page() either
by ->end() callback or in the originator of request).  If so, I would
rather mark those as "call request_end() early"; they seem to have
the non-page parts of args hosted in req->misc, so for them it's not
a problem.

	So how about this:

* explicit FR_END_IMMEDIATELY on read/write-related requests
* no FR_LOCKED flipping in lock_request()/unlock_request()
* modifying the call of end_requests() in fuse_abort_conn() so that it
would skip request_end() for everything that isn't marked FR_END_IMMEDIATELY
* make fuse_copy_pages() grab page references around the actual
fuse_copy_page() - grab req->waitq.lock, check FR_ABORTED, grab a page
reference in case it's not, drop req->waitq.lock and bugger off if FR_ABORTED
was set.  Adjust fuse_try_move_page() accordingly.

Do you see any problems with that approach for minimal fix?  If all requests
in need of FR_END_IMMEDIATELY turn out to have non-page part of args already
embedded into req->misc, it looks like this ought to suffice.  I probably
could post something along those lines tomorrow, if you see any serious
problems with that - please yell...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-02-05 22:04                   ` Al Viro
                                     ` (3 preceding siblings ...)
  (?)
@ 2017-02-06  8:37                   ` Miklos Szeredi
  -1 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-06  8:37 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Sun, Feb 5, 2017 at 11:04 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Sun, Feb 05, 2017 at 10:19:20PM +0100, Miklos Szeredi wrote:
>
>> Then we can't break out of that deadlock: we wait until
>> fuse_dev_do_write() is done until calling request_end() which
>> ultimately results in unlocking page.  But fuse_dev_do_write() won't
>> complete until the page is unlocked.
>
> Wait a sec.  What happens if
>
> process A: fuse_lookup()
>                 struct fuse_entry_out outarg on stack
>                 ...
>                 fuse_request_send() with req->out.args[0].value = &outarg
>                         sleep in request_wait_answer() on req->waitq
> server: read the request, write reply
>         fuse_dev_do_write()
>                 copy_out_args()
>                         fuse_copy_args()
>                                 fuse_copy_one()
>                                         FR_LOCKED is guaranteed to be set
>                                         fuse_copy_do()
> process C on another CPU: umount -f
>         fuse_conn_abort()
>                 end_requests()
>                         request_end()
>                                 set FR_FINISHED
>                                 wake A up (via req->waitq)
> process A: regain CPU
>         bugger off from request_wait_answer(), through __fuse_request_send(),
>         fuse_request_send(), fuse_simple_request(), fuse_lookup_name(),
>         fuse_lookup() and out of fuse_lookup().
>
> In the meanwhile, server in fuse_copy_do() does memcpy() to what used to
> be outarg, corrupting the stack of process A.
>
> Sure, you need to hit a fairly narrow window, especially if you are to
> cause damage in A, but AFAICS it's not impossible.  Consider e.g. the
> situation when you lose CPU on preempt on the way to memcpy(); in that
> case server might come back when A has incremented its stack footprint
> again.  Or A might end up taking a hardware interrupt and handling it
> on the normal kernel stack, etc.
>
> Looks like *any* scenario where fuse_conn_abort() manages to run during
> that memcpy() has potential for that kind of trouble; any SMP box appears
> to be vulnerable, along with preempt UP...
>
> Am I missing something that prevents that kind of problem?

Yes: if FR_LOCKED is set, then we leave the request alone in
fuse_abort_conn().  Then, when the copy is finished and
request_unlock() is called, we return -ENOENT to fuse_dev_do_write(),
which in turn calls request_end() to wake up the original caller,
which gets -ECONNABORTED.

So basically FR_LOCKED is protecting the copy, which is guaranteed to
be atomic due to the get_user_pages magic that faults in all pages
beforehand.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-02-06  3:05                     ` Al Viro
  (?)
  (?)
@ 2017-02-06  9:08                     ` Miklos Szeredi
  2017-02-06  9:57                         ` Al Viro
  -1 siblings, 1 reply; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-06  9:08 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Mon, Feb 6, 2017 at 4:05 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:

> Some observations regarding the arguments:
>         * stack footprint is atrocious.  Consider e.g. fuse_mknod() - you
> get 16 bytes of fuse_mknod_in + 120 bytes of struct fuse_args + 128 bytes
> of fuse_entry_out.  All on stack, and that's on top of whatever the
> callchain already has eaten, which might include e.g. nfsd stuff or
> ecryptfs, etc.  Or fuse_get_parent(), for that matter, with 128 bytes of
> fuse_entry_out + 120 bytes of fuse_args, both on stack.  This one is
> guaranteed to have a nasty call chain - fuse_get_parent() <- reconnect_one()
> <- reconnect_path() <- exportfs_decode_fh() (itself with a 256-byte array of
> char on stack) <- nfsd_set_fh_dentry() <- fh_verify() <- a bunch of call
> chains in nfsd.

Indeed.

>         * "out" args (i.e. reply) are probably best dealt with by having
> coallocated with request itself - some already are and the sizes tend
> to be fixed and not too large (->get_link() is an exception, and it's
> probably better handled as mentioned above).
>         * "in" args (request) are in some cases easily dealt with by
> coallocating with request, but there's a large class of situations where
> we are passing dentry->d_name.name and then there's fuse_symlink().
> The last one is ugly - potentially up to a page worth of data, coming
> straight from method caller; usually it's a part of getname() result,
> but e.g. ecryptfs might have it kmalloc'ed, nfsd - picked from sunrpc
> request payload, etc.
>
>         AFAICS, your argument applies to the requests that have
> some page(s) locked until the request completion (unlock_page() either
> by ->end() callback or in the originator of request).  If so, I would
> rather mark those as "call request_end() early"; they seem to have
> the non-page parts of args hosted in req->misc, so for them it's not
> a problem.

Yes, I think only page lock can be used to deadlock inside
fuse_dev_read/write().  So requests that don't have locked pages
should be okay  with just waiting until copy_to/from_user() finishes
and only then proceeding with the abort.

Those that have locked pages must be able to be aborted during
copy_to/from_user() because the copy itself may try to acquire the
page lock.

So yes, if we want to switch to copy_to/from_user(), then we can just
fix the page refcounting for read and write requests and handle the
two cases differently.

>         So how about this:
>
> * explicit FR_END_IMMEDIATELY on read/write-related requests
> * no FR_LOCKED flipping in lock_request()/unlock_request()
> * modifying the call of end_requests() in fuse_abort_conn() so that it
> would skip request_end() for everything that isn't marked FR_END_IMMEDIATELY
> * make fuse_copy_pages() grab page references around the actual
> fuse_copy_page() - grab req->waitq.lock, check FR_ABORTED, grab a page
> reference in case it's not, drop req->waitq.lock and bugger off if FR_ABORTED
> was set.  Adjust fuse_try_move_page() accordingly.
>
> Do you see any problems with that approach for minimal fix?  If all requests
> in need of FR_END_IMMEDIATELY turn out to have non-page part of args already
> embedded into req->misc, it looks like this ought to suffice.  I probably
> could post something along those lines tomorrow, if you see any serious
> problems with that - please yell...

See previous mail, I don't think there's an issue with the current
code.  Other than being convoluted as hell.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-06  9:57                         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-06  9:57 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Mon, Feb 06, 2017 at 10:08:06AM +0100, Miklos Szeredi wrote:

> Yes, I think only page lock can be used to deadlock inside
> fuse_dev_read/write().  So requests that don't have locked pages
> should be okay  with just waiting until copy_to/from_user() finishes
> and only then proceeding with the abort.

Actually, looking at that some more, this might be not true.  Anything
that takes ->mmap_sem exclusive and *not* killable makes for another
source of deadlock.

Initial page fault takes ->mmap_sem shared.  OK, request sent to
server and server tries to read() it.  In the meanwhile, something
has closed userfaultfd for the same mm_struct.  We have userfaultfd_release()
block on attempt to take ->mmap_sem exclusive and from now on any attempt
to grab ->mmap_sem shared will deadlock.  And get_user_pages(), as well
as copy_to_user(), etc. can end up doing just that.  It doesn't have to
be an mmap of the same file, BTW - any page fault would do.

All you really need is to have server sharing address space with the
process that steps into original page fault, plus an evicted page
of any nature (anon mmap, whatever) being used as a destination of
read() in server.

down_read() inside down_read() is fine, unless there had been down_write()
in between.  And there are unkillable down_write() on ->mmap_sem -
userfaultfd_release() being one example of such.  Many of those can and
probably should become down_write_killable(), but this one can't - there
might be nothing to deliver the signal to, if the final close() happens
e.g. from exit(2).

Warning: the above might be completely bogus - I'm on way too large
uptime at the moment and most of the last day had been spent digging
through various convoluted code, so take the above with a cartload of
salt.  _If_ it's true, that kind of deadlock won't be possible to
break with killing anything or doing umount -f, though.

> 
> Those that have locked pages must be able to be aborted during
> copy_to/from_user() because the copy itself may try to acquire the
> page lock.
> 
> So yes, if we want to switch to copy_to/from_user(), then we can just
> fix the page refcounting for read and write requests and handle the
> two cases differently.
> 
> >         So how about this:
> >
> > * explicit FR_END_IMMEDIATELY on read/write-related requests
> > * no FR_LOCKED flipping in lock_request()/unlock_request()
> > * modifying the call of end_requests() in fuse_abort_conn() so that it
> > would skip request_end() for everything that isn't marked FR_END_IMMEDIATELY
> > * make fuse_copy_pages() grab page references around the actual
> > fuse_copy_page() - grab req->waitq.lock, check FR_ABORTED, grab a page
> > reference in case it's not, drop req->waitq.lock and bugger off if FR_ABORTED
> > was set.  Adjust fuse_try_move_page() accordingly.
> >
> > Do you see any problems with that approach for minimal fix?  If all requests
> > in need of FR_END_IMMEDIATELY turn out to have non-page part of args already
> > embedded into req->misc, it looks like this ought to suffice.  I probably
> > could post something along those lines tomorrow, if you see any serious
> > problems with that - please yell...
> 
> See previous mail, I don't think there's an issue with the current
> code.  Other than being convoluted as hell.

OK - I'm an idiot and I've managed to misread fuse_abort_conn() despite having
reread it many times last couple of days.  And yes, state transitions of
requests are convoluted as hell ;-/

Anyway, bedtime for me.  With any luck the scare above re ->mmap_sem *is*
bogus and I'll find "Al, you are an idiot - deadlock on ->mmap_sem can't
happen for <reasons>" from somebody in the mailbox when I get up...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-06  9:57                         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-06  9:57 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Mon, Feb 06, 2017 at 10:08:06AM +0100, Miklos Szeredi wrote:

> Yes, I think only page lock can be used to deadlock inside
> fuse_dev_read/write().  So requests that don't have locked pages
> should be okay  with just waiting until copy_to/from_user() finishes
> and only then proceeding with the abort.

Actually, looking at that some more, this might be not true.  Anything
that takes ->mmap_sem exclusive and *not* killable makes for another
source of deadlock.

Initial page fault takes ->mmap_sem shared.  OK, request sent to
server and server tries to read() it.  In the meanwhile, something
has closed userfaultfd for the same mm_struct.  We have userfaultfd_release()
block on attempt to take ->mmap_sem exclusive and from now on any attempt
to grab ->mmap_sem shared will deadlock.  And get_user_pages(), as well
as copy_to_user(), etc. can end up doing just that.  It doesn't have to
be an mmap of the same file, BTW - any page fault would do.

All you really need is to have server sharing address space with the
process that steps into original page fault, plus an evicted page
of any nature (anon mmap, whatever) being used as a destination of
read() in server.

down_read() inside down_read() is fine, unless there had been down_write()
in between.  And there are unkillable down_write() on ->mmap_sem -
userfaultfd_release() being one example of such.  Many of those can and
probably should become down_write_killable(), but this one can't - there
might be nothing to deliver the signal to, if the final close() happens
e.g. from exit(2).

Warning: the above might be completely bogus - I'm on way too large
uptime at the moment and most of the last day had been spent digging
through various convoluted code, so take the above with a cartload of
salt.  _If_ it's true, that kind of deadlock won't be possible to
break with killing anything or doing umount -f, though.

> 
> Those that have locked pages must be able to be aborted during
> copy_to/from_user() because the copy itself may try to acquire the
> page lock.
> 
> So yes, if we want to switch to copy_to/from_user(), then we can just
> fix the page refcounting for read and write requests and handle the
> two cases differently.
> 
> >         So how about this:
> >
> > * explicit FR_END_IMMEDIATELY on read/write-related requests
> > * no FR_LOCKED flipping in lock_request()/unlock_request()
> > * modifying the call of end_requests() in fuse_abort_conn() so that it
> > would skip request_end() for everything that isn't marked FR_END_IMMEDIATELY
> > * make fuse_copy_pages() grab page references around the actual
> > fuse_copy_page() - grab req->waitq.lock, check FR_ABORTED, grab a page
> > reference in case it's not, drop req->waitq.lock and bugger off if FR_ABORTED
> > was set.  Adjust fuse_try_move_page() accordingly.
> >
> > Do you see any problems with that approach for minimal fix?  If all requests
> > in need of FR_END_IMMEDIATELY turn out to have non-page part of args already
> > embedded into req->misc, it looks like this ought to suffice.  I probably
> > could post something along those lines tomorrow, if you see any serious
> > problems with that - please yell...
> 
> See previous mail, I don't think there's an issue with the current
> code.  Other than being convoluted as hell.

OK - I'm an idiot and I've managed to misread fuse_abort_conn() despite having
reread it many times last couple of days.  And yes, state transitions of
requests are convoluted as hell ;-/

Anyway, bedtime for me.  With any luck the scare above re ->mmap_sem *is*
bogus and I'll find "Al, you are an idiot - deadlock on ->mmap_sem can't
happen for <reasons>" from somebody in the mailbox when I get up...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-06  9:57                         ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-06  9:57 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Mon, Feb 06, 2017 at 10:08:06AM +0100, Miklos Szeredi wrote:

> Yes, I think only page lock can be used to deadlock inside
> fuse_dev_read/write().  So requests that don't have locked pages
> should be okay  with just waiting until copy_to/from_user() finishes
> and only then proceeding with the abort.

Actually, looking at that some more, this might be not true.  Anything
that takes ->mmap_sem exclusive and *not* killable makes for another
source of deadlock.

Initial page fault takes ->mmap_sem shared.  OK, request sent to
server and server tries to read() it.  In the meanwhile, something
has closed userfaultfd for the same mm_struct.  We have userfaultfd_release()
block on attempt to take ->mmap_sem exclusive and from now on any attempt
to grab ->mmap_sem shared will deadlock.  And get_user_pages(), as well
as copy_to_user(), etc. can end up doing just that.  It doesn't have to
be an mmap of the same file, BTW - any page fault would do.

All you really need is to have server sharing address space with the
process that steps into original page fault, plus an evicted page
of any nature (anon mmap, whatever) being used as a destination of
read() in server.

down_read() inside down_read() is fine, unless there had been down_write()
in between.  And there are unkillable down_write() on ->mmap_sem -
userfaultfd_release() being one example of such.  Many of those can and
probably should become down_write_killable(), but this one can't - there
might be nothing to deliver the signal to, if the final close() happens
e.g. from exit(2).

Warning: the above might be completely bogus - I'm on way too large
uptime at the moment and most of the last day had been spent digging
through various convoluted code, so take the above with a cartload of
salt.  _If_ it's true, that kind of deadlock won't be possible to
break with killing anything or doing umount -f, though.

> 
> Those that have locked pages must be able to be aborted during
> copy_to/from_user() because the copy itself may try to acquire the
> page lock.
> 
> So yes, if we want to switch to copy_to/from_user(), then we can just
> fix the page refcounting for read and write requests and handle the
> two cases differently.
> 
> >         So how about this:
> >
> > * explicit FR_END_IMMEDIATELY on read/write-related requests
> > * no FR_LOCKED flipping in lock_request()/unlock_request()
> > * modifying the call of end_requests() in fuse_abort_conn() so that it
> > would skip request_end() for everything that isn't marked FR_END_IMMEDIATELY
> > * make fuse_copy_pages() grab page references around the actual
> > fuse_copy_page() - grab req->waitq.lock, check FR_ABORTED, grab a page
> > reference in case it's not, drop req->waitq.lock and bugger off if FR_ABORTED
> > was set.  Adjust fuse_try_move_page() accordingly.
> >
> > Do you see any problems with that approach for minimal fix?  If all requests
> > in need of FR_END_IMMEDIATELY turn out to have non-page part of args already
> > embedded into req->misc, it looks like this ought to suffice.  I probably
> > could post something along those lines tomorrow, if you see any serious
> > problems with that - please yell...
> 
> See previous mail, I don't think there's an issue with the current
> code.  Other than being convoluted as hell.

OK - I'm an idiot and I've managed to misread fuse_abort_conn() despite having
reread it many times last couple of days.  And yes, state transitions of
requests are convoluted as hell ;-/

Anyway, bedtime for me.  With any luck the scare above re ->mmap_sem *is*
bogus and I'll find "Al, you are an idiot - deadlock on ->mmap_sem can't
happen for <reasons>" from somebody in the mailbox when I get up...

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-02-06  9:57                         ` Al Viro
  (?)
  (?)
@ 2017-02-06 14:18                         ` Miklos Szeredi
  2017-02-07  7:19                             ` Al Viro
  -1 siblings, 1 reply; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-06 14:18 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Mon, Feb 6, 2017 at 10:57 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Mon, Feb 06, 2017 at 10:08:06AM +0100, Miklos Szeredi wrote:
>
>> Yes, I think only page lock can be used to deadlock inside
>> fuse_dev_read/write().  So requests that don't have locked pages
>> should be okay  with just waiting until copy_to/from_user() finishes
>> and only then proceeding with the abort.
>
> Actually, looking at that some more, this might be not true.  Anything
> that takes ->mmap_sem exclusive and *not* killable makes for another
> source of deadlock.
>
> Initial page fault takes ->mmap_sem shared.  OK, request sent to
> server and server tries to read() it.  In the meanwhile, something
> has closed userfaultfd for the same mm_struct.  We have userfaultfd_release()
> block on attempt to take ->mmap_sem exclusive and from now on any attempt
> to grab ->mmap_sem shared will deadlock.  And get_user_pages(), as well
> as copy_to_user(), etc. can end up doing just that.  It doesn't have to
> be an mmap of the same file, BTW - any page fault would do.
>
> All you really need is to have server sharing address space with the
> process that steps into original page fault, plus an evicted page
> of any nature (anon mmap, whatever) being used as a destination of
> read() in server.
>
> down_read() inside down_read() is fine, unless there had been down_write()
> in between.  And there are unkillable down_write() on ->mmap_sem -
> userfaultfd_release() being one example of such.  Many of those can and
> probably should become down_write_killable(), but this one can't - there
> might be nothing to deliver the signal to, if the final close() happens
> e.g. from exit(2).
>
> Warning: the above might be completely bogus - I'm on way too large
> uptime at the moment and most of the last day had been spent digging
> through various convoluted code, so take the above with a cartload of
> salt.  _If_ it's true, that kind of deadlock won't be possible to
> break with killing anything or doing umount -f, though.

It's not bogus, the deadlock is there.

But I think it's breakable in the same way: if the deadlocked request
is aborted, the fault will release the page lock as well as mmap_sem,
and from there things will resolve themselves.

But you are definitely right about needing to clean up that mess in
fuse/dev.c and doing so by fixing up the arg refcounting for just the
read and write requests is going to be a lot simpler than having to do
that for all of them (which was my original plan).

So, I'll have a go at that sometime.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-07  7:19                             ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-07  7:19 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Mon, Feb 06, 2017 at 03:18:42PM +0100, Miklos Szeredi wrote:

> But I think it's breakable in the same way: if the deadlocked request
> is aborted, the fault will release the page lock as well as mmap_sem,
> and from there things will resolve themselves.

Right you are - original holder of ->mmap_sem is waiting for request and
abort will wake it up...

> But you are definitely right about needing to clean up that mess in
> fuse/dev.c and doing so by fixing up the arg refcounting for just the
> read and write requests is going to be a lot simpler than having to do
> that for all of them (which was my original plan).

Speaking of refcounting - what's going on with fuse_file one?  My reading
of that code is that you have 4 states of that thing:
	* new (just created, fallback request allocated, use fuse_file_free()
to kill).  Refcount is 0.
	* intermediate - in fact it's already opened, but still not
put into ->private_data.  Refcount is still 0.  Use fuse_sync_release() to kill.
	* live - normal refcounting (fuse_file_get()/fuse_file_put()).
	* shutdown - refcount has reached 0.  Can't happen until ->release()
(obviously - ->private_data holds a counting reference), some pieces of
fuse_sync_release() correspond to some stuff in fuse_release_common(), some -
to final fuse_file_put().

To make it even more convoluted, cuse is using fuse_sync_release() and
apparently relies upon no references being grabbed after fuse_do_open(),
so that thing can be called with refcount 0 *or* refcount 1.

Another thing: what guarantees that places in writepages-related paths
where we store a reference into req->ff won't hit a request with already
non-NULL ->ff?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-07  7:19                             ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-07  7:19 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Mon, Feb 06, 2017 at 03:18:42PM +0100, Miklos Szeredi wrote:

> But I think it's breakable in the same way: if the deadlocked request
> is aborted, the fault will release the page lock as well as mmap_sem,
> and from there things will resolve themselves.

Right you are - original holder of ->mmap_sem is waiting for request and
abort will wake it up...

> But you are definitely right about needing to clean up that mess in
> fuse/dev.c and doing so by fixing up the arg refcounting for just the
> read and write requests is going to be a lot simpler than having to do
> that for all of them (which was my original plan).

Speaking of refcounting - what's going on with fuse_file one?  My reading
of that code is that you have 4 states of that thing:
	* new (just created, fallback request allocated, use fuse_file_free()
to kill).  Refcount is 0.
	* intermediate - in fact it's already opened, but still not
put into ->private_data.  Refcount is still 0.  Use fuse_sync_release() to kill.
	* live - normal refcounting (fuse_file_get()/fuse_file_put()).
	* shutdown - refcount has reached 0.  Can't happen until ->release()
(obviously - ->private_data holds a counting reference), some pieces of
fuse_sync_release() correspond to some stuff in fuse_release_common(), some -
to final fuse_file_put().

To make it even more convoluted, cuse is using fuse_sync_release() and
apparently relies upon no references being grabbed after fuse_do_open(),
so that thing can be called with refcount 0 *or* refcount 1.

Another thing: what guarantees that places in writepages-related paths
where we store a reference into req->ff won't hit a request with already
non-NULL ->ff?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-07  7:19                             ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-07  7:19 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Linux NFS list, Jan Kara, Jeff Layton,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chris Wilson,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds,
	Kirill A. Shutemov, lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Mon, Feb 06, 2017 at 03:18:42PM +0100, Miklos Szeredi wrote:

> But I think it's breakable in the same way: if the deadlocked request
> is aborted, the fault will release the page lock as well as mmap_sem,
> and from there things will resolve themselves.

Right you are - original holder of ->mmap_sem is waiting for request and
abort will wake it up...

> But you are definitely right about needing to clean up that mess in
> fuse/dev.c and doing so by fixing up the arg refcounting for just the
> read and write requests is going to be a lot simpler than having to do
> that for all of them (which was my original plan).

Speaking of refcounting - what's going on with fuse_file one?  My reading
of that code is that you have 4 states of that thing:
	* new (just created, fallback request allocated, use fuse_file_free()
to kill).  Refcount is 0.
	* intermediate - in fact it's already opened, but still not
put into ->private_data.  Refcount is still 0.  Use fuse_sync_release() to kill.
	* live - normal refcounting (fuse_file_get()/fuse_file_put()).
	* shutdown - refcount has reached 0.  Can't happen until ->release()
(obviously - ->private_data holds a counting reference), some pieces of
fuse_sync_release() correspond to some stuff in fuse_release_common(), some -
to final fuse_file_put().

To make it even more convoluted, cuse is using fuse_sync_release() and
apparently relies upon no references being grabbed after fuse_do_open(),
so that thing can be called with refcount 0 *or* refcount 1.

Another thing: what guarantees that places in writepages-related paths
where we store a reference into req->ff won't hit a request with already
non-NULL ->ff?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-07 11:35                               ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-07 11:35 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Tue, Feb 07, 2017 at 07:19:09AM +0000, Al Viro wrote:

> Speaking of refcounting - what's going on with fuse_file one?  My reading
> of that code is that you have 4 states of that thing:
> 	* new (just created, fallback request allocated, use fuse_file_free()
> to kill).  Refcount is 0.
> 	* intermediate - in fact it's already opened, but still not
> put into ->private_data.  Refcount is still 0.  Use fuse_sync_release() to kill.
> 	* live - normal refcounting (fuse_file_get()/fuse_file_put()).
> 	* shutdown - refcount has reached 0.  Can't happen until ->release()
> (obviously - ->private_data holds a counting reference), some pieces of
> fuse_sync_release() correspond to some stuff in fuse_release_common(), some -
> to final fuse_file_put().
> 
> To make it even more convoluted, cuse is using fuse_sync_release() and
> apparently relies upon no references being grabbed after fuse_do_open(),
> so that thing can be called with refcount 0 *or* refcount 1.

Indeed, ugly.  Following should clean that up somewhat.  The states remain what
they are but refcounting is at least consistent now.  This also fixes a missing
FR_FORCE in the sync case.
 
> Another thing: what guarantees that places in writepages-related paths
> where we store a reference into req->ff won't hit a request with already
> non-NULL ->ff?

Well, it is set before being sent (queued onto queued_writes or queued on the
fuse device), but not when queued as secondary request onto an already in-flight
one.  It looks okay to me.

Thanks,
Miklos

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 811fd8929a18..e816166ce42f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -473,7 +473,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	if (err) {
 		fuse_sync_release(ff, flags);
 	} else {
-		file->private_data = fuse_file_get(ff);
+		file->private_data = ff;
 		fuse_finish_open(inode, file);
 	}
 	return err;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2401c5dabb2a..94cc4ab01f9a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -58,7 +58,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 	}
 
 	INIT_LIST_HEAD(&ff->write_entry);
-	atomic_set(&ff->count, 0);
+	atomic_set(&ff->count, 1);
 	RB_CLEAR_NODE(&ff->polled_node);
 	init_waitqueue_head(&ff->poll_wait);
 
@@ -75,7 +75,7 @@ void fuse_file_free(struct fuse_file *ff)
 	kfree(ff);
 }
 
-struct fuse_file *fuse_file_get(struct fuse_file *ff)
+static struct fuse_file *fuse_file_get(struct fuse_file *ff)
 {
 	atomic_inc(&ff->count);
 	return ff;
@@ -100,6 +100,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
 			iput(req->misc.release.inode);
 			fuse_put_request(ff->fc, req);
 		} else if (sync) {
+			__set_bit(FR_FORCE, &req->flags);
 			__clear_bit(FR_BACKGROUND, &req->flags);
 			fuse_request_send(ff->fc, req);
 			iput(req->misc.release.inode);
@@ -146,7 +147,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		ff->open_flags &= ~FOPEN_DIRECT_IO;
 
 	ff->nodeid = nodeid;
-	file->private_data = fuse_file_get(ff);
+	file->private_data = ff;
 
 	return 0;
 }
@@ -297,13 +298,9 @@ static int fuse_release(struct inode *inode, struct file *file)
 
 void fuse_sync_release(struct fuse_file *ff, int flags)
 {
-	WARN_ON(atomic_read(&ff->count) > 1);
+	WARN_ON(atomic_read(&ff->count) != 1);
 	fuse_prepare_release(ff, flags, FUSE_RELEASE);
-	__set_bit(FR_FORCE, &ff->reserved_req->flags);
-	__clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
-	fuse_request_send(ff->fc, ff->reserved_req);
-	fuse_put_request(ff->fc, ff->reserved_req);
-	kfree(ff);
+	fuse_file_put(ff, true);
 }
 EXPORT_SYMBOL_GPL(fuse_sync_release);
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 91307940c8ac..83f797271aef 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -732,7 +732,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
 
 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
-struct fuse_file *fuse_file_get(struct fuse_file *ff);
 void fuse_file_free(struct fuse_file *ff);
 void fuse_finish_open(struct inode *inode, struct file *file);
 

^ permalink raw reply related	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-07 11:35                               ` Miklos Szeredi
  0 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-07 11:35 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Tue, Feb 07, 2017 at 07:19:09AM +0000, Al Viro wrote:

> Speaking of refcounting - what's going on with fuse_file one?  My reading
> of that code is that you have 4 states of that thing:
> 	* new (just created, fallback request allocated, use fuse_file_free()
> to kill).  Refcount is 0.
> 	* intermediate - in fact it's already opened, but still not
> put into ->private_data.  Refcount is still 0.  Use fuse_sync_release() to kill.
> 	* live - normal refcounting (fuse_file_get()/fuse_file_put()).
> 	* shutdown - refcount has reached 0.  Can't happen until ->release()
> (obviously - ->private_data holds a counting reference), some pieces of
> fuse_sync_release() correspond to some stuff in fuse_release_common(), some -
> to final fuse_file_put().
> 
> To make it even more convoluted, cuse is using fuse_sync_release() and
> apparently relies upon no references being grabbed after fuse_do_open(),
> so that thing can be called with refcount 0 *or* refcount 1.

Indeed, ugly.  Following should clean that up somewhat.  The states remain what
they are but refcounting is at least consistent now.  This also fixes a missing
FR_FORCE in the sync case.
 
> Another thing: what guarantees that places in writepages-related paths
> where we store a reference into req->ff won't hit a request with already
> non-NULL ->ff?

Well, it is set before being sent (queued onto queued_writes or queued on the
fuse device), but not when queued as secondary request onto an already in-flight
one.  It looks okay to me.

Thanks,
Miklos

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 811fd8929a18..e816166ce42f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -473,7 +473,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	if (err) {
 		fuse_sync_release(ff, flags);
 	} else {
-		file->private_data = fuse_file_get(ff);
+		file->private_data = ff;
 		fuse_finish_open(inode, file);
 	}
 	return err;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2401c5dabb2a..94cc4ab01f9a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -58,7 +58,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 	}
 
 	INIT_LIST_HEAD(&ff->write_entry);
-	atomic_set(&ff->count, 0);
+	atomic_set(&ff->count, 1);
 	RB_CLEAR_NODE(&ff->polled_node);
 	init_waitqueue_head(&ff->poll_wait);
 
@@ -75,7 +75,7 @@ void fuse_file_free(struct fuse_file *ff)
 	kfree(ff);
 }
 
-struct fuse_file *fuse_file_get(struct fuse_file *ff)
+static struct fuse_file *fuse_file_get(struct fuse_file *ff)
 {
 	atomic_inc(&ff->count);
 	return ff;
@@ -100,6 +100,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
 			iput(req->misc.release.inode);
 			fuse_put_request(ff->fc, req);
 		} else if (sync) {
+			__set_bit(FR_FORCE, &req->flags);
 			__clear_bit(FR_BACKGROUND, &req->flags);
 			fuse_request_send(ff->fc, req);
 			iput(req->misc.release.inode);
@@ -146,7 +147,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		ff->open_flags &= ~FOPEN_DIRECT_IO;
 
 	ff->nodeid = nodeid;
-	file->private_data = fuse_file_get(ff);
+	file->private_data = ff;
 
 	return 0;
 }
@@ -297,13 +298,9 @@ static int fuse_release(struct inode *inode, struct file *file)
 
 void fuse_sync_release(struct fuse_file *ff, int flags)
 {
-	WARN_ON(atomic_read(&ff->count) > 1);
+	WARN_ON(atomic_read(&ff->count) != 1);
 	fuse_prepare_release(ff, flags, FUSE_RELEASE);
-	__set_bit(FR_FORCE, &ff->reserved_req->flags);
-	__clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
-	fuse_request_send(ff->fc, ff->reserved_req);
-	fuse_put_request(ff->fc, ff->reserved_req);
-	kfree(ff);
+	fuse_file_put(ff, true);
 }
 EXPORT_SYMBOL_GPL(fuse_sync_release);
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 91307940c8ac..83f797271aef 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -732,7 +732,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
 
 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
-struct fuse_file *fuse_file_get(struct fuse_file *ff);
 void fuse_file_free(struct fuse_file *ff);
 void fuse_finish_open(struct inode *inode, struct file *file);
 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-08  5:54                                 ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-08  5:54 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Tue, Feb 07, 2017 at 12:35:54PM +0100, Miklos Szeredi wrote:
> > Another thing: what guarantees that places in writepages-related paths
> > where we store a reference into req->ff won't hit a request with already
> > non-NULL ->ff?
> 
> Well, it is set before being sent (queued onto queued_writes or queued on the
> fuse device), but not when queued as secondary request onto an already in-flight
> one.  It looks okay to me.

>  void fuse_sync_release(struct fuse_file *ff, int flags)
>  {
> -	WARN_ON(atomic_read(&ff->count) > 1);
> +	WARN_ON(atomic_read(&ff->count) != 1);
>  	fuse_prepare_release(ff, flags, FUSE_RELEASE);
> -	__set_bit(FR_FORCE, &ff->reserved_req->flags);
> -	__clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
> -	fuse_request_send(ff->fc, ff->reserved_req);
> -	fuse_put_request(ff->fc, ff->reserved_req);
> -	kfree(ff);
> +	fuse_file_put(ff, true);

Umm...  At the very least, that deserves a comment re "iput(NULL) is a no-op
and since the refcount is 1 and everything's synchronous, we are fine with
not doing igrab/iput here".  There's enough mysteries in that code as it is...

	Speaking of mysteries - how can ->private_data ever be NULL in
fuse_release_common()?  AFAICS, it's only called from ->release() instances
and those are only called after ->open() or ->atomic_open() on that struct file
has returned 0.  On the ->open() side, it means fuse_do_open() having returned
0; on ->atomic_open() one - fuse_create_open() having done the same.  Neither
is possible with ->private_data remaining NULL, and I don't see any places
that would modify it afterwards...

	Another thing: am I right assuming that ff->nodeid will be the same
for all ff over given inode (== get_node_id(inode))?  What about ff->fh?
Is that a per-open thing, or will it be identical for all opens of the same
inode?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-08  5:54                                 ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-08  5:54 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Tue, Feb 07, 2017 at 12:35:54PM +0100, Miklos Szeredi wrote:
> > Another thing: what guarantees that places in writepages-related paths
> > where we store a reference into req->ff won't hit a request with already
> > non-NULL ->ff?
> 
> Well, it is set before being sent (queued onto queued_writes or queued on the
> fuse device), but not when queued as secondary request onto an already in-flight
> one.  It looks okay to me.

>  void fuse_sync_release(struct fuse_file *ff, int flags)
>  {
> -	WARN_ON(atomic_read(&ff->count) > 1);
> +	WARN_ON(atomic_read(&ff->count) != 1);
>  	fuse_prepare_release(ff, flags, FUSE_RELEASE);
> -	__set_bit(FR_FORCE, &ff->reserved_req->flags);
> -	__clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
> -	fuse_request_send(ff->fc, ff->reserved_req);
> -	fuse_put_request(ff->fc, ff->reserved_req);
> -	kfree(ff);
> +	fuse_file_put(ff, true);

Umm...  At the very least, that deserves a comment re "iput(NULL) is a no-op
and since the refcount is 1 and everything's synchronous, we are fine with
not doing igrab/iput here".  There's enough mysteries in that code as it is...

	Speaking of mysteries - how can ->private_data ever be NULL in
fuse_release_common()?  AFAICS, it's only called from ->release() instances
and those are only called after ->open() or ->atomic_open() on that struct file
has returned 0.  On the ->open() side, it means fuse_do_open() having returned
0; on ->atomic_open() one - fuse_create_open() having done the same.  Neither
is possible with ->private_data remaining NULL, and I don't see any places
that would modify it afterwards...

	Another thing: am I right assuming that ff->nodeid will be the same
for all ff over given inode (== get_node_id(inode))?  What about ff->fh?
Is that a per-open thing, or will it be identical for all opens of the same
inode?

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-08  5:54                                 ` Al Viro
  0 siblings, 0 replies; 101+ messages in thread
From: Al Viro @ 2017-02-08  5:54 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux NFS list,
	ceph-devel-u79uwXL29TY76Z2rM5mHXA,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA,
	v9fs-developer-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Linus Torvalds,
	Jan Kara, Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Tue, Feb 07, 2017 at 12:35:54PM +0100, Miklos Szeredi wrote:
> > Another thing: what guarantees that places in writepages-related paths
> > where we store a reference into req->ff won't hit a request with already
> > non-NULL ->ff?
> 
> Well, it is set before being sent (queued onto queued_writes or queued on the
> fuse device), but not when queued as secondary request onto an already in-flight
> one.  It looks okay to me.

>  void fuse_sync_release(struct fuse_file *ff, int flags)
>  {
> -	WARN_ON(atomic_read(&ff->count) > 1);
> +	WARN_ON(atomic_read(&ff->count) != 1);
>  	fuse_prepare_release(ff, flags, FUSE_RELEASE);
> -	__set_bit(FR_FORCE, &ff->reserved_req->flags);
> -	__clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
> -	fuse_request_send(ff->fc, ff->reserved_req);
> -	fuse_put_request(ff->fc, ff->reserved_req);
> -	kfree(ff);
> +	fuse_file_put(ff, true);

Umm...  At the very least, that deserves a comment re "iput(NULL) is a no-op
and since the refcount is 1 and everything's synchronous, we are fine with
not doing igrab/iput here".  There's enough mysteries in that code as it is...

	Speaking of mysteries - how can ->private_data ever be NULL in
fuse_release_common()?  AFAICS, it's only called from ->release() instances
and those are only called after ->open() or ->atomic_open() on that struct file
has returned 0.  On the ->open() side, it means fuse_do_open() having returned
0; on ->atomic_open() one - fuse_create_open() having done the same.  Neither
is possible with ->private_data remaining NULL, and I don't see any places
that would modify it afterwards...

	Another thing: am I right assuming that ff->nodeid will be the same
for all ff over given inode (== get_node_id(inode))?  What about ff->fh?
Is that a per-open thing, or will it be identical for all opens of the same
inode?

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-02-08  5:54                                 ` Al Viro
  (?)
  (?)
@ 2017-02-08  9:53                                 ` Miklos Szeredi
  -1 siblings, 0 replies; 101+ messages in thread
From: Miklos Szeredi @ 2017-02-08  9:53 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, Linux NFS list, ceph-devel,
	lustre-devel, v9fs-developer, Linus Torvalds, Jan Kara,
	Chris Wilson, Kirill A. Shutemov, Jeff Layton

On Wed, Feb 8, 2017 at 6:54 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Tue, Feb 07, 2017 at 12:35:54PM +0100, Miklos Szeredi wrote:
>> > Another thing: what guarantees that places in writepages-related paths
>> > where we store a reference into req->ff won't hit a request with already
>> > non-NULL ->ff?
>>
>> Well, it is set before being sent (queued onto queued_writes or queued on the
>> fuse device), but not when queued as secondary request onto an already in-flight
>> one.  It looks okay to me.
>
>>  void fuse_sync_release(struct fuse_file *ff, int flags)
>>  {
>> -     WARN_ON(atomic_read(&ff->count) > 1);
>> +     WARN_ON(atomic_read(&ff->count) != 1);
>>       fuse_prepare_release(ff, flags, FUSE_RELEASE);
>> -     __set_bit(FR_FORCE, &ff->reserved_req->flags);
>> -     __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
>> -     fuse_request_send(ff->fc, ff->reserved_req);
>> -     fuse_put_request(ff->fc, ff->reserved_req);
>> -     kfree(ff);
>> +     fuse_file_put(ff, true);
>
> Umm...  At the very least, that deserves a comment re "iput(NULL) is a no-op
> and since the refcount is 1 and everything's synchronous, we are fine with
> not doing igrab/iput here".  There's enough mysteries in that code as it is...

Added comment.

>         Speaking of mysteries - how can ->private_data ever be NULL in
> fuse_release_common()?  AFAICS, it's only called from ->release() instances
> and those are only called after ->open() or ->atomic_open() on that struct file
> has returned 0.  On the ->open() side, it means fuse_do_open() having returned
> 0; on ->atomic_open() one - fuse_create_open() having done the same.  Neither
> is possible with ->private_data remaining NULL, and I don't see any places
> that would modify it afterwards...

Goes back to v2.6.15 (commit fd72faac95d7 "[PATCH] FUSE: atomic
create+open").  Didn't make sense back then, and it doesn't now.
Fixed.

>         Another thing: am I right assuming that ff->nodeid will be the same
> for all ff over given inode (== get_node_id(inode))?

Yes.  Except for cuse, where it's zero.

>  What about ff->fh?
> Is that a per-open thing, or will it be identical for all opens of the same
> inode?

A per-open thing (opaque identifier used by userspace fs to identify
the open file)

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-02-03 19:28                   ` Linus Torvalds
  (?)
  (?)
@ 2017-02-13  9:56                   ` Steve Capper
  2017-02-13 21:40                       ` Linus Torvalds
  -1 siblings, 1 reply; 101+ messages in thread
From: Steve Capper @ 2017-02-13  9:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Al Viro, Catalin Marinas, Hugh Dickins, Peter Zijlstra,
	Ingo Molnar, Jeff Layton, Christoph Hellwig, linux-fsdevel,
	Linux Kernel Mailing List, Linux NFS Mailing List, ceph-devel,
	lustre-devel, V9FS Developers, Jan Kara, Chris Wilson,
	Kirill A. Shutemov

On Fri, Feb 03, 2017 at 11:28:48AM -0800, Linus Torvalds wrote:
> On Fri, Feb 3, 2017 at 11:08 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > On x86 it does.  I don't see anything equivalent in mm/gup.c one, and the
> > only kinda-sorta similar thing (access_ok() in __get_user_pages_fast()
> > there) is vulnerable to e.g. access via kernel_write().
> 
> Yeah, access_ok() is bogus. It needs to just check against TASK_SIZE
> or whatever.
> 
> > doesn't look promising - access_ok() is never sufficient.  Something like
> > _PAGE_USER tests in x86 one solves that problem, but if anything similar
> > works for HAVE_GENERIC_RCU_GUP I don't see it.  Thus the question re
> > what am I missing here...
> 
> Ok, I definitely agree that it looks like __get_user_pages_fast() just
> needs to get rid of the access_ok() and replace it with a proper check
> for the user address space range.
> 
> Looks like arm[64] and powerpc.are the current users. Adding in some
> people involved with the original submission a few years ago.

Hi,

[ Apologies for my late reply, I was on vacation then catchup... ]

> 
> I do note that the x86 __get_user_pages_fast() thing looks dodgy too.
> 
> In particular, we do it right in the *real* get_user_pages_fast(), see
> commit 7f8189068726 ("x86: don't use 'access_ok()' as a range check in
> get_user_pages_fast()"). But then the same bug was re-introduced when
> the "irq safe" version was merged. As well as in the GENERIC_RCU_GUP
> version.
> 
> Gaah. Apparently PeterZ copied the old buggy version before the fix
> when he added __get_user_pages_fast() in commit 465a454f254e ("x86,
> mm: Add __get_user_pages_fast()").
> 
> I guess it could be considered a merge error (both happened during the
> 2.6.31 merge window).
> 

Okay so looking at what we have for access_ok(.) on arm64, my
understanding is that we perform a 65-bit add/compare (in assembler) to
see whether or not the range is below the current_thread_info->addr_limit.
So I think this is a roundabout way of checking for no-wrap around and <= TASK_SIZE.

Looking at powerpc, I see it's a little different...

So if it sounds reasonable to folk I was going to send a patch to
replace the call to access_ok(.) with a wraparound + TASK_SIZE check
written explicitly in C? (and remove some of the comments talking about
access_ok(.)).

Cheers,
-- 
Steve

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-13 21:40                       ` Linus Torvalds
  0 siblings, 0 replies; 101+ messages in thread
From: Linus Torvalds @ 2017-02-13 21:40 UTC (permalink / raw)
  To: Steve Capper
  Cc: Al Viro, Catalin Marinas, Hugh Dickins, Peter Zijlstra,
	Ingo Molnar, Jeff Layton, Christoph Hellwig, linux-fsdevel,
	Linux Kernel Mailing List, Linux NFS Mailing List, ceph-devel,
	lustre-devel, V9FS Developers, Jan Kara, Chris Wilson,
	Kirill A. Shutemov

On Mon, Feb 13, 2017 at 1:56 AM, Steve Capper <steve.capper@linaro.org> wrote:
>
> Okay so looking at what we have for access_ok(.) on arm64, my
> understanding is that we perform a 65-bit add/compare (in assembler) to
> see whether or not the range is below the current_thread_info->addr_limit.
> So I think this is a roundabout way of checking for no-wrap around and <= TASK_SIZE.

No, that's the problem. It's *not* testing against TASK_SIZE.

Because add_limit is not always TASK_SIZE. When you do
set_fs(KERNEL_DS), you set addr_limit to infinity.

And yes, the kernel does read and write calls too. Seldom, but it
happens. And walking the page tables with kernel addresses is not
supposed to work (sometimes it happens to work by mistake). So if
somebody finds a path that gets from that kind of situation into the
get_user_pages() interface, bad things happen.

                 Linus

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-13 21:40                       ` Linus Torvalds
  0 siblings, 0 replies; 101+ messages in thread
From: Linus Torvalds @ 2017-02-13 21:40 UTC (permalink / raw)
  To: Steve Capper
  Cc: Linux NFS Mailing List, Jan Kara, Jeff Layton, Peter Zijlstra,
	Catalin Marinas, Hugh Dickins, Linux Kernel Mailing List,
	Chris Wilson, Christoph Hellwig, Al Viro, linux-fsdevel,
	V9FS Developers, ceph-devel, Ingo Molnar, Kirill A. Shutemov,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Mon, Feb 13, 2017 at 1:56 AM, Steve Capper <steve.capper-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org> wrote:
>
> Okay so looking at what we have for access_ok(.) on arm64, my
> understanding is that we perform a 65-bit add/compare (in assembler) to
> see whether or not the range is below the current_thread_info->addr_limit.
> So I think this is a roundabout way of checking for no-wrap around and <= TASK_SIZE.

No, that's the problem. It's *not* testing against TASK_SIZE.

Because add_limit is not always TASK_SIZE. When you do
set_fs(KERNEL_DS), you set addr_limit to infinity.

And yes, the kernel does read and write calls too. Seldom, but it
happens. And walking the page tables with kernel addresses is not
supposed to work (sometimes it happens to work by mistake). So if
somebody finds a path that gets from that kind of situation into the
get_user_pages() interface, bad things happen.

                 Linus

^ permalink raw reply	[flat|nested] 101+ messages in thread

* [lustre-devel] [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
@ 2017-02-13 21:40                       ` Linus Torvalds
  0 siblings, 0 replies; 101+ messages in thread
From: Linus Torvalds @ 2017-02-13 21:40 UTC (permalink / raw)
  To: Steve Capper
  Cc: Linux NFS Mailing List, Jan Kara, Jeff Layton, Peter Zijlstra,
	Catalin Marinas, Hugh Dickins, Linux Kernel Mailing List,
	Chris Wilson, Christoph Hellwig, Al Viro, linux-fsdevel,
	V9FS Developers, ceph-devel, Ingo Molnar, Kirill A. Shutemov,
	lustre-devel-aLEFhgZF4x6X6Mz3xDxJMA

On Mon, Feb 13, 2017 at 1:56 AM, Steve Capper <steve.capper@linaro.org> wrote:
>
> Okay so looking at what we have for access_ok(.) on arm64, my
> understanding is that we perform a 65-bit add/compare (in assembler) to
> see whether or not the range is below the current_thread_info->addr_limit.
> So I think this is a roundabout way of checking for no-wrap around and <= TASK_SIZE.

No, that's the problem. It's *not* testing against TASK_SIZE.

Because add_limit is not always TASK_SIZE. When you do
set_fs(KERNEL_DS), you set addr_limit to infinity.

And yes, the kernel does read and write calls too. Seldom, but it
happens. And walking the page tables with kernel addresses is not
supposed to work (sometimes it happens to work by mistake). So if
somebody finds a path that gets from that kind of situation into the
get_user_pages() interface, bad things happen.

                 Linus

^ permalink raw reply	[flat|nested] 101+ messages in thread

* Re: [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call
  2017-02-02  9:51     ` Al Viro
                       ` (4 preceding siblings ...)
  (?)
@ 2017-02-16 13:10     ` Jeff Layton
  -1 siblings, 0 replies; 101+ messages in thread
From: Jeff Layton @ 2017-02-16 13:10 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel, linux-kernel, linux-nfs, ceph-devel, lustre-devel,
	v9fs-developer, Linus Torvalds, Jan Kara, Chris Wilson,
	Kirill A. Shutemov

On Thu, 2017-02-02 at 09:51 +0000, Al Viro wrote:
> On Wed, Jan 25, 2017 at 08:32:03AM -0500, Jeff Layton wrote:
> > Small respin of the patch that I sent yesterday for the same thing.
> > 
> > This moves the maxsize handling into iov_iter_pvec_size, so that we don't
> > end up iterating past the max size we'll use anyway when trying to
> > determine the pagevec length.
> > 
> > Also, a respun patch to make ceph use iov_iter_get_pages_alloc instead of
> > trying to do it via its own routine.
> > 
> > Al, if these look ok, do you want to pick these up or shall I ask
> > Ilya to merge them via the ceph tree?
> 
> I'd rather have that kind of work go through the vfs tree; said that,
> I really wonder if this is the right approach.  Most of the users of
> iov_iter_get_pages()/iov_iter_get_pages_alloc() look like they want
> something like
> 	iov_iter_for_each_page(iter, size, f, data)
> with int (*f)(struct page *page, size_t from, size_t size, void *data)
> passed as callback.  Not everything fits that model, but there's a whole
> lot of things that do.
> 

While I do like the above proposal better than what I originally had,
I'm guessing it won't be ready in time for v4.11.

Would it be reasonable to take the patch I proposed for v4.11 as an
interim fix? It does fix a rather easy-to-trigger softlockup in the ceph
code that xfstests can reliably hit.

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 101+ messages in thread

end of thread, other threads:[~2017-02-16 13:10 UTC | newest]

Thread overview: 101+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-24 21:23 [PATCH] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call Jeff Layton
2017-01-25 13:32 ` [PATCH v3 0/2] " Jeff Layton
2017-01-25 13:32   ` [PATCH v3 1/2] " Jeff Layton
2017-01-26 12:35     ` Jeff Layton
2017-01-27 13:24       ` [PATCH v4 0/2] " Jeff Layton
2017-01-27 13:24         ` [PATCH v4 1/2] " Jeff Layton
2017-01-27 13:24         ` [PATCH v4 2/2] ceph: switch DIO code to use iov_iter_get_pages_alloc Jeff Layton
2017-01-30 15:40           ` Jeff Layton
2017-01-30 15:40             ` Jeff Layton
2017-01-25 13:32   ` [PATCH v3 " Jeff Layton
2017-02-02  9:51   ` [PATCH v3 0/2] iov_iter: allow iov_iter_get_pages_alloc to allocate more pages per call Al Viro
2017-02-02  9:51     ` [lustre-devel] " Al Viro
2017-02-02  9:51     ` Al Viro
2017-02-02 10:56     ` Christoph Hellwig
2017-02-02 10:56       ` [lustre-devel] " Christoph Hellwig
2017-02-02 10:56       ` Christoph Hellwig
2017-02-02 11:16       ` Al Viro
2017-02-02 11:16         ` [lustre-devel] " Al Viro
2017-02-02 11:16         ` Al Viro
2017-02-02 13:00         ` Jeff Layton
2017-02-02 13:00           ` Jeff Layton
2017-02-03  7:29           ` Al Viro
2017-02-03  7:29             ` [lustre-devel] " Al Viro
2017-02-03  7:29             ` Al Viro
2017-02-03 18:29             ` Linus Torvalds
2017-02-03 18:29               ` [lustre-devel] " Linus Torvalds
2017-02-03 18:29               ` Linus Torvalds
2017-02-03 19:08               ` Al Viro
2017-02-03 19:08                 ` [lustre-devel] " Al Viro
2017-02-03 19:08                 ` Al Viro
2017-02-03 19:28                 ` Linus Torvalds
2017-02-03 19:28                   ` [lustre-devel] " Linus Torvalds
2017-02-03 19:28                   ` Linus Torvalds
2017-02-13  9:56                   ` Steve Capper
2017-02-13 21:40                     ` Linus Torvalds
2017-02-13 21:40                       ` [lustre-devel] " Linus Torvalds
2017-02-13 21:40                       ` Linus Torvalds
2017-02-03  7:49           ` Christoph Hellwig
2017-02-03  7:49             ` [lustre-devel] " Christoph Hellwig
2017-02-03  7:49             ` Christoph Hellwig
2017-02-03  8:54             ` Al Viro
2017-02-03  8:54               ` [lustre-devel] " Al Viro
2017-02-03  8:54               ` Al Viro
2017-02-03 11:09               ` Christoph Hellwig
2017-02-03 11:09                 ` [lustre-devel] " Christoph Hellwig
2017-02-03 11:09                 ` Christoph Hellwig
2017-02-02 14:48     ` Jan Kara
2017-02-02 14:48       ` [lustre-devel] " Jan Kara
2017-02-02 14:48       ` Jan Kara
2017-02-02 18:28       ` Al Viro
2017-02-02 18:28         ` [lustre-devel] " Al Viro
2017-02-02 18:28         ` Al Viro
2017-02-03 14:47         ` Jan Kara
2017-02-03 14:47           ` [lustre-devel] " Jan Kara
2017-02-03 14:47           ` Jan Kara
2017-02-04  3:08     ` Al Viro
2017-02-04  3:08       ` [lustre-devel] " Al Viro
2017-02-04  3:08       ` Al Viro
2017-02-04 19:26       ` Al Viro
2017-02-04 19:26         ` [lustre-devel] " Al Viro
2017-02-04 19:26         ` Al Viro
2017-02-04 22:12         ` Miklos Szeredi
2017-02-04 22:12           ` Miklos Szeredi
2017-02-04 22:11       ` Miklos Szeredi
2017-02-04 22:11         ` Miklos Szeredi
2017-02-05  1:51         ` Al Viro
2017-02-05  1:51           ` [lustre-devel] " Al Viro
2017-02-05  1:51           ` Al Viro
2017-02-05 20:15           ` Miklos Szeredi
2017-02-05 20:15             ` Miklos Szeredi
2017-02-05 21:01             ` Al Viro
2017-02-05 21:01               ` [lustre-devel] " Al Viro
2017-02-05 21:01               ` Al Viro
2017-02-05 21:19               ` Miklos Szeredi
2017-02-05 21:19                 ` Miklos Szeredi
2017-02-05 22:04                 ` Al Viro
2017-02-05 22:04                   ` [lustre-devel] " Al Viro
2017-02-05 22:04                   ` Al Viro
2017-02-05 22:04                   ` Al Viro
2017-02-06  3:05                   ` Al Viro
2017-02-06  3:05                     ` [lustre-devel] " Al Viro
2017-02-06  3:05                     ` Al Viro
2017-02-06  9:08                     ` Miklos Szeredi
2017-02-06  9:57                       ` Al Viro
2017-02-06  9:57                         ` [lustre-devel] " Al Viro
2017-02-06  9:57                         ` Al Viro
2017-02-06 14:18                         ` Miklos Szeredi
2017-02-07  7:19                           ` Al Viro
2017-02-07  7:19                             ` [lustre-devel] " Al Viro
2017-02-07  7:19                             ` Al Viro
2017-02-07 11:35                             ` Miklos Szeredi
2017-02-07 11:35                               ` Miklos Szeredi
2017-02-08  5:54                               ` Al Viro
2017-02-08  5:54                                 ` [lustre-devel] " Al Viro
2017-02-08  5:54                                 ` Al Viro
2017-02-08  9:53                                 ` Miklos Szeredi
2017-02-06  8:37                   ` Miklos Szeredi
2017-02-05 20:56           ` Al Viro
2017-02-05 20:56             ` [lustre-devel] " Al Viro
2017-02-05 20:56             ` Al Viro
2017-02-16 13:10     ` Jeff Layton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.