All of lore.kernel.org
 help / color / mirror / Atom feed
From: Christoph Hellwig <hch@lst.de>
To: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: linux-nfs@vger.kernel.org
Subject: [PATCH 17/17] pnfs/blocklayout: allocate separate pages for the layoutcommit payload
Date: Thu,  7 Aug 2014 09:23:49 +0200	[thread overview]
Message-ID: <1407396229-4785-18-git-send-email-hch@lst.de> (raw)
In-Reply-To: <1407396229-4785-1-git-send-email-hch@lst.de>

Instead of overflowing the XDR send buffer with our extent list allocate
pages and pre-encode the layoutupdate payload into them.  We optimistically
allocate a single page use alloc_page and only switch to vmalloc when we
have more extents outstanding.  Currently there is only a single testcase
(xfstests generic/113) which can reproduce large enough extent lists for
this to occur.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/blocklayout/blocklayout.c |  9 ++--
 fs/nfs/blocklayout/blocklayout.h |  6 ++-
 fs/nfs/blocklayout/extent_tree.c | 89 +++++++++++++++++++++++++++++++---------
 3 files changed, 78 insertions(+), 26 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index d5a2b87..fae8144 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -498,12 +498,11 @@ bl_return_range(struct pnfs_layout_hdr *lo,
 	err = ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
 }
 
-static void
-bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
-		       const struct nfs4_layoutcommit_args *arg)
+static int
+bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
 {
 	dprintk("%s enter\n", __func__);
-	ext_tree_encode_commit(BLK_LO2EXT(lo), xdr);
+	return ext_tree_prepare_commit(arg);
 }
 
 static void
@@ -808,7 +807,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.alloc_lseg			= bl_alloc_lseg,
 	.free_lseg			= bl_free_lseg,
 	.return_range			= bl_return_range,
-	.encode_layoutcommit		= bl_encode_layoutcommit,
+	.prepare_layoutcommit		= bl_prepare_layoutcommit,
 	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
 	.set_layoutdriver		= bl_set_layoutdriver,
 	.clear_layoutdriver		= bl_clear_layoutdriver,
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index b4f66d8..caae202 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -80,6 +80,9 @@ struct pnfs_block_extent {
 	unsigned int	be_tag;
 };
 
+/* on the wire size of the extent */
+#define BL_EXTENT_SIZE	(7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
+
 struct pnfs_block_layout {
 	struct pnfs_layout_hdr	bl_layout;
 	struct rb_root		bl_ext_rw;
@@ -138,8 +141,7 @@ int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 		sector_t len);
 bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
 		struct pnfs_block_extent *ret, bool rw);
-int ext_tree_encode_commit(struct pnfs_block_layout *bl,
-		struct xdr_stream *xdr);
+int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
 void ext_tree_mark_committed(struct pnfs_block_layout *bl, int status);
 
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index c7dacfa..3945221 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -465,32 +465,23 @@ out:
 	return err;
 }
 
-int
-ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr)
+static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+		size_t buffer_size, size_t *count)
 {
 	struct pnfs_block_extent *be;
-	unsigned int count = 0;
-	__be32 *p, *xdr_start;
 	int ret = 0;
 
-	dprintk("%s enter\n", __func__);
-
-	xdr_start = xdr_reserve_space(xdr, 8);
-	if (!xdr_start)
-		return -ENOSPC;
-
 	spin_lock(&bl->bl_ext_lock);
 	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
 		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
 		    be->be_tag != EXTENT_WRITTEN)
 			continue;
 
-		p = xdr_reserve_space(xdr, 7 * sizeof(__be32) +
-					NFS4_DEVICEID4_SIZE);
-		if (!p) {
-			printk("%s: out of space for extent list\n", __func__);
+		(*count)++;
+		if (*count * BL_EXTENT_SIZE > buffer_size) {
+			/* keep counting.. */
 			ret = -ENOSPC;
-			break;
+			continue;
 		}
 
 		p = xdr_encode_opaque_fixed(p, be->be_devid.data,
@@ -501,15 +492,75 @@ ext_tree_encode_commit(struct pnfs_block_layout *bl, struct xdr_stream *xdr)
 		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
 
 		be->be_tag = EXTENT_COMMITTING;
-		count++;
 	}
 	spin_unlock(&bl->bl_ext_lock);
 
-	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
-	xdr_start[1] = cpu_to_be32(count);
+	return ret;
+}
+
+int
+ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+	size_t count = 0, buffer_size = PAGE_SIZE;
+	__be32 *start_p;
+	int ret;
+
+	dprintk("%s enter\n", __func__);
+
+	arg->layoutupdate_page = alloc_page(GFP_NOFS);
+	if (!arg->layoutupdate_page)
+		return -ENOMEM;
+	start_p = page_address(arg->layoutupdate_page);
+	arg->layoutupdate_pages = &arg->layoutupdate_page;
+
+retry:
+	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+	if (unlikely(ret)) {
+		if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
+			int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
+
+			for (i = 0; i < nr_pages; i++)
+				put_page(arg->layoutupdate_pages[i]);
+			kfree(arg->layoutupdate_pages);
+		} else {
+			put_page(arg->layoutupdate_page);
+		}
+
+		buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+		count = 0;
+
+		arg->layoutupdate_pages =
+			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
+				sizeof(struct page *), GFP_NOFS);
+		if (!arg->layoutupdate_pages)
+			return -ENOMEM;
+
+		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
+		if (!start_p) {
+			kfree(arg->layoutupdate_pages);
+			return -ENOMEM;
+		}
+
+		goto retry;
+	}
+
+	*start_p = cpu_to_be32(count);
+	arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+
+	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
+		__be32 *p = start_p;
+		int i = 0;
+
+		for (p = start_p;
+		     p < start_p + arg->layoutupdate_len;
+		     p += PAGE_SIZE) {
+			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+		}
+	}
 
 	dprintk("%s found %i ranges\n", __func__, count);
-	return ret;
+	return 0;
 }
 
 void
-- 
1.9.1


  parent reply	other threads:[~2014-08-07  7:21 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-08-07  7:23 pnfs block layout driver fixes Christoph Hellwig
2014-08-07  7:23 ` [PATCH 01/17] nfs: cap request size to fit a kmalloced page array Christoph Hellwig
2014-08-07  7:23 ` [PATCH 02/17] pnfs: do not pass uninitialized lsegs to ->free_lseg Christoph Hellwig
2014-08-07  7:23 ` [PATCH 03/17] pnfs: force a layout commit when encountering busy segments during recall Christoph Hellwig
2014-08-07  7:23 ` [PATCH 04/17] pnfs: don't check sequence on new stateids in layoutget Christoph Hellwig
2014-08-07  7:23 ` [PATCH 05/17] pnfs: add flag to force read-modify-write in ->write_begin Christoph Hellwig
2014-08-07  7:23 ` [PATCH 06/17] pnfs: add return_range method Christoph Hellwig
2014-08-07  7:23 ` [PATCH 07/17] pnfs: allow splicing pre-encoded pages into the layoutcommit args Christoph Hellwig
2014-08-07  7:23 ` [PATCH 08/17] pnfs/blocklayout: reject pnfs blocksize larger than page size Christoph Hellwig
2014-08-07 10:43   ` Peng Tao
2014-08-07 11:25     ` Christoph Hellwig
2014-08-07 11:51       ` Peng Tao
2014-08-07 12:10         ` Christoph Hellwig
2014-08-07 13:43           ` Peng Tao
2014-08-07 16:20             ` Christoph Hellwig
2014-08-08 10:28               ` Peng Tao
2014-08-07 12:56         ` faibish, sorin
2014-08-07 13:45           ` Peng Tao
2014-08-07 16:10           ` Christoph Hellwig
2014-08-07 16:18             ` faibish, sorin
2014-08-07 13:13       ` Steve Dickson
2014-08-07 13:17         ` Steve Dickson
2014-08-07  7:23 ` [PATCH 09/17] pnfs/blocklayout: improve GETDEVICEINFO error reporting Christoph Hellwig
2014-08-07  7:23 ` [PATCH 10/17] pnfs/blocklayout: plug block queues Christoph Hellwig
2014-08-07  7:23 ` [PATCH 11/17] pnfs/blocklayout: correctly decrement extent length Christoph Hellwig
2014-08-07  7:23 ` [PATCH 12/17] pnfs/blocklayout: remove read-modify-write handling in bl_write_pagelist Christoph Hellwig
2014-08-07  7:23 ` [PATCH 13/17] pnfs/blocklayout: don't set pages uptodate Christoph Hellwig
2014-08-07  7:23 ` [PATCH 14/17] pnfs/blocklayout: rewrite extent tracking Christoph Hellwig
2014-08-07  7:23 ` [PATCH 15/17] pnfs/blocklayout: implement the return_range method Christoph Hellwig
2014-08-07  7:23 ` [PATCH 16/17] pnfs/blocklayout: return layouts on setattr Christoph Hellwig
2014-08-07  7:23 ` Christoph Hellwig [this message]
2014-08-07 11:05   ` [PATCH 17/17] pnfs/blocklayout: allocate separate pages for the layoutcommit payload Peng Tao
2014-08-07 11:27     ` Christoph Hellwig
2014-08-07 11:57       ` Peng Tao
2014-08-20 18:43 ` pnfs block layout driver fixes Jeff Layton
2014-08-20 22:14   ` Christoph Hellwig
2014-09-07 15:39 ` Christoph Hellwig
2014-09-07 16:44   ` Trond Myklebust

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1407396229-4785-18-git-send-email-hch@lst.de \
    --to=hch@lst.de \
    --cc=linux-nfs@vger.kernel.org \
    --cc=trond.myklebust@primarydata.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.