* [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
@ 2011-08-07  2:53 Peng Tao
  2011-08-07  2:53 ` [PATCH 2/5] pNFS: recoalesce when ld read " Peng Tao
                   ` (4 more replies)
  0 siblings, 5 replies; 21+ messages in thread
From: Peng Tao @ 2011-08-07  2:53 UTC (permalink / raw)
  To: Benny Halevy; +Cc: linux-nfs, Peng Tao

For pnfs pagelist write failure, we need to pg_recoalesce and resend
IO to mds.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/internal.h |    4 ++++
 fs/nfs/pnfs.c     |   35 ++++++++++++++++++++++++++++++++---
 fs/nfs/write.c    |   21 ++++++++++++++-------
 3 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ab12913..62f183d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -305,6 +305,10 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
 /* write.c */
 extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
 		struct list_head *head);
+extern int do_nfs_writepage(struct page *page, struct writeback_control *wbc,
+		struct nfs_pageio_descriptor *pgio);
+extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+		struct inode *inode, int ioflags);
 extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_writedata_release(struct nfs_write_data *wdata);
 extern void nfs_commit_free(struct nfs_write_data *p);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e550e88..08aba45 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1172,6 +1172,13 @@ int
 pnfs_ld_write_done(struct nfs_write_data *data)
 {
 	int status;
+	struct nfs_pageio_descriptor pgio;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.range_start = data->mds_offset,
+		.nr_to_write = data->npages,
+		.range_end = LLONG_MAX,
+	};
 
 	if (!data->pnfs_error) {
 		pnfs_set_layoutcommit(data);
@@ -1180,11 +1187,33 @@ pnfs_ld_write_done(struct nfs_write_data *data)
 		return 0;
 	}
 
+	put_lseg(data->lseg);
+	data->lseg = NULL;
 	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
 		data->pnfs_error);
-	status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
-				    data->mds_ops, NFS_FILE_SYNC);
-	return status ? : -EAGAIN;
+	nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
+	pgio.pg_recoalesce = 1;
+	while (!list_empty(&data->pages)) {
+		struct nfs_page *req = nfs_list_entry(data->pages.next);
+		struct page *page = req->wb_page;
+
+		nfs_list_remove_request(req);
+		nfs_clear_page_tag_locked(req);
+
+		end_page_writeback(page);
+
+		lock_page(page);
+		status = do_nfs_writepage(page, &wbc, &pgio);
+		if (status) {
+			/* FIXME: is this enough?? */
+			set_page_dirty(page);
+		}
+		unlock_page(page);
+	}
+	nfs_pageio_complete(&pgio);
+	nfs_writedata_release(data);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b39b37f..0ccdf98 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -285,14 +285,9 @@ out:
 	return ret;
 }
 
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+int do_nfs_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
 {
-	struct inode *inode = page->mapping->host;
 	int ret;
-
-	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
-	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
-
 	nfs_pageio_cond_complete(pgio, page->index);
 	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
 	if (ret == -EAGAIN) {
@@ -301,6 +296,17 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(do_nfs_writepage);
+
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+{
+	struct inode *inode = page->mapping->host;
+
+	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
+	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
+
+	return do_nfs_writepage(page, wbc, pgio);
+}
 
 /*
  * Write an mmapped page to the server.
@@ -1051,12 +1057,13 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
 	.pg_doio = nfs_generic_pg_writepages,
 };
 
-static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
 				  struct inode *inode, int ioflags)
 {
 	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
 				NFS_SERVER(inode)->wsize, ioflags);
 }
+EXPORT_SYMBOL_GPL(nfs_pageio_init_write_mds);
 
 void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
 {
-- 
1.7.1.262.g5ef3d



* [PATCH 2/5] pNFS: recoalesce when ld read pagelist fails
  2011-08-07  2:53 [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Peng Tao
@ 2011-08-07  2:53 ` Peng Tao
  2011-08-10 17:55   ` Boaz Harrosh
  2011-08-07  2:53 ` [PATCH 3/5] pnfs: introduce pnfs private workqueue Peng Tao
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 21+ messages in thread
From: Peng Tao @ 2011-08-07  2:53 UTC (permalink / raw)
  To: Benny Halevy; +Cc: linux-nfs, Peng Tao

For pnfs pagelist read failure, we need to pg_recoalesce and resend
IO to mds.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/internal.h |    2 ++
 fs/nfs/pnfs.c     |   18 ++++++++++++++----
 fs/nfs/read.c     |    3 ++-
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 62f183d..78b662e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -307,6 +307,8 @@ extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
 		struct list_head *head);
 extern int do_nfs_writepage(struct page *page, struct writeback_control *wbc,
 		struct nfs_pageio_descriptor *pgio);
+extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
+		struct inode *inode);
 extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
 		struct inode *inode, int ioflags);
 extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 08aba45..66fc854 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1300,7 +1300,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 int
 pnfs_ld_read_done(struct nfs_read_data *data)
 {
-	int status;
+	struct nfs_pageio_descriptor pgio;
 
 	if (!data->pnfs_error) {
 		__nfs4_read_done_cb(data);
@@ -1309,11 +1309,21 @@ pnfs_ld_read_done(struct nfs_read_data *data)
 		return 0;
 	}
 
+	put_lseg(data->lseg);
+	data->lseg = NULL;
 	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
 		data->pnfs_error);
-	status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
-				   data->mds_ops);
-	return status ? : -EAGAIN;
+	nfs_pageio_init_read_mds(&pgio, data->inode);
+	pgio.pg_recoalesce = 1;
+	while (!list_empty(&data->pages)) {
+		struct nfs_page *req = nfs_list_entry(data->pages.next);
+
+		nfs_pageio_add_request(&pgio, req);
+	}
+	nfs_pageio_complete(&pgio);
+	nfs_readdata_release(data);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2171c04..2484131 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -112,12 +112,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
 	}
 }
 
-static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
+void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
 		struct inode *inode)
 {
 	nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
 			NFS_SERVER(inode)->rsize, 0);
 }
+EXPORT_SYMBOL_GPL(nfs_pageio_init_read_mds);
 
 void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 {
-- 
1.7.1.262.g5ef3d



* [PATCH 3/5] pnfs: introduce pnfs private workqueue
  2011-08-07  2:53 [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Peng Tao
  2011-08-07  2:53 ` [PATCH 2/5] pNFS: recoalesce when ld read " Peng Tao
@ 2011-08-07  2:53 ` Peng Tao
  2011-08-10 18:12   ` Boaz Harrosh
  2011-08-07  2:53 ` [PATCH 4/5] SUNRPC/NFS: make rpc pipe upcall generic Peng Tao
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 21+ messages in thread
From: Peng Tao @ 2011-08-07  2:53 UTC (permalink / raw)
  To: Benny Halevy; +Cc: linux-nfs, Peng Tao

For layoutdriver io done functions, default workqueue is not a good place
as the code is executed in IO path. So add a pnfs private workqueue to handle
them.

Also change block and object layout code to make use of this private workqueue.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/blocklayout.c |   17 ++++++++++----
 fs/nfs/objlayout/objio_osd.c     |    8 ++++++
 fs/nfs/objlayout/objlayout.c     |    4 +-
 fs/nfs/pnfs.c                    |   46 +++++++++++++++++++++++++++++++++++++-
 fs/nfs/pnfs.h                    |    4 +++
 5 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9561c8f..3aef9f0 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -228,7 +228,7 @@ bl_end_par_io_read(void *data)
 	struct nfs_read_data *rdata = data;
 
 	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
-	schedule_work(&rdata->task.u.tk_work);
+	queue_work(pnfsiod_workqueue, &rdata->task.u.tk_work);
 }
 
 /* We don't want normal .rpc_call_done callback used, so we replace it
@@ -418,7 +418,7 @@ static void bl_end_par_io_write(void *data)
 	wdata->task.tk_status = 0;
 	wdata->verf.committed = NFS_FILE_SYNC;
 	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
-	schedule_work(&wdata->task.u.tk_work);
+	queue_work(pnfsiod_workqueue, &wdata->task.u.tk_work);
 }
 
 /* FIXME STUB - mark intersection of layout and page as bad, so is not
@@ -977,29 +977,35 @@ static int __init nfs4blocklayout_init(void)
 	if (ret)
 		goto out;
 
+	ret = pnfsiod_start();
+	if (ret)
+		goto out_remove;
+
 	init_waitqueue_head(&bl_wq);
 
 	mnt = rpc_get_mount();
 	if (IS_ERR(mnt)) {
 		ret = PTR_ERR(mnt);
-		goto out_remove;
+		goto out_stop;
 	}
 
 	ret = vfs_path_lookup(mnt->mnt_root,
 			      mnt,
 			      NFS_PIPE_DIRNAME, 0, &path);
 	if (ret)
-		goto out_remove;
+		goto out_stop;
 
 	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
 				    &bl_upcall_ops, 0);
 	if (IS_ERR(bl_device_pipe)) {
 		ret = PTR_ERR(bl_device_pipe);
-		goto out_remove;
+		goto out_stop;
 	}
 out:
 	return ret;
 
+out_stop:
+	pnfsiod_stop();
 out_remove:
 	pnfs_unregister_layoutdriver(&blocklayout_type);
 	return ret;
@@ -1011,6 +1017,7 @@ static void __exit nfs4blocklayout_exit(void)
 	       __func__);
 
 	pnfs_unregister_layoutdriver(&blocklayout_type);
+	pnfsiod_stop();
 	rpc_unlink(bl_device_pipe);
 }
 
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12..f28013f 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -1042,7 +1042,14 @@ static int __init
 objlayout_init(void)
 {
 	int ret = pnfs_register_layoutdriver(&objlayout_type);
+	if (ret)
+		goto out;
 
+	ret = pnfsiod_start();
+	if (ret)
+		pnfs_unregister_layoutdriver(&objlayout_type);
+
+out:
 	if (ret)
 		printk(KERN_INFO
 			"%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
@@ -1057,6 +1064,7 @@ static void __exit
 objlayout_exit(void)
 {
 	pnfs_unregister_layoutdriver(&objlayout_type);
+	pnfsiod_stop();
 	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
 	       __func__);
 }
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e..1e7fa05 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -305,7 +305,7 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
 		pnfs_ld_read_done(rdata);
 	else {
 		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
-		schedule_work(&rdata->task.u.tk_work);
+		queue_work(pnfsiod_workqueue, &rdata->task.u.tk_work);
 	}
 }
 
@@ -396,7 +396,7 @@ objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
 		pnfs_ld_write_done(wdata);
 	else {
 		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
-		schedule_work(&wdata->task.u.tk_work);
+		queue_work(pnfsiod_workqueue, &wdata->task.u.tk_work);
 	}
 }
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 66fc854..e183a4f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -38,7 +38,7 @@
 /* Locking:
  *
  * pnfs_spinlock:
- *      protects pnfs_modules_tbl.
+ *      protects pnfs_modules_tbl, pnfsiod_workqueue and pnfsiod_users.
  */
 static DEFINE_SPINLOCK(pnfs_spinlock);
 
@@ -47,6 +47,9 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
  */
 static LIST_HEAD(pnfs_modules_tbl);
 
+struct workqueue_struct *pnfsiod_workqueue;
+static int pnfsiod_users = 0;
+
 /* Return the registered pnfs layout driver module matching given id */
 static struct pnfs_layoutdriver_type *
 find_pnfs_driver_locked(u32 id)
@@ -1517,3 +1520,44 @@ out:
 	dprintk("<-- %s status %d\n", __func__, status);
 	return status;
 }
+
+/*
+ * start up the pnfsiod workqueue
+ */
+int pnfsiod_start(void)
+{
+	struct workqueue_struct *wq;
+	dprintk("RPC:       creating workqueue pnfsiod\n");
+	wq = alloc_workqueue("pnfsiod", WQ_MEM_RECLAIM, 0);
+	if (wq == NULL)
+		return -ENOMEM;
+	spin_lock(&pnfs_spinlock);
+	pnfsiod_users++;
+	if (pnfsiod_workqueue == NULL) {
+		pnfsiod_workqueue = wq;
+	} else {
+		destroy_workqueue(wq);
+	}
+	spin_unlock(&pnfs_spinlock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnfsiod_start);
+
+/*
+ * Destroy the pnfsiod workqueue
+ */
+void pnfsiod_stop(void)
+{
+	struct workqueue_struct *wq = NULL;
+
+	spin_lock(&pnfs_spinlock);
+	pnfsiod_users--;
+	if (pnfsiod_users == 0) {
+		wq = pnfsiod_workqueue;
+		pnfsiod_workqueue = NULL;
+	}
+	spin_unlock(&pnfs_spinlock);
+	if (wq)
+		destroy_workqueue(wq);
+}
+EXPORT_SYMBOL_GPL(pnfsiod_stop);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 01cbfd5..af7530b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -155,6 +155,10 @@ struct pnfs_devicelist {
 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 
+extern struct workqueue_struct *pnfsiod_workqueue;
+extern int pnfsiod_start(void);
+extern void pnfsiod_stop(void);
+
 /* nfs4proc.c */
 extern int nfs4_proc_getdevicelist(struct nfs_server *server,
 				   const struct nfs_fh *fh,
-- 
1.7.1.262.g5ef3d



* [PATCH 4/5] SUNRPC/NFS: make rpc pipe upcall generic
  2011-08-07  2:53 [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Peng Tao
  2011-08-07  2:53 ` [PATCH 2/5] pNFS: recoalesce when ld read " Peng Tao
  2011-08-07  2:53 ` [PATCH 3/5] pnfs: introduce pnfs private workqueue Peng Tao
@ 2011-08-07  2:53 ` Peng Tao
  2011-08-10 15:59   ` Jim Rees
  2011-08-07  2:53 ` [PATCH 5/5] pnfs: make _set_lo_fail generic Peng Tao
  2011-08-10 17:52 ` [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Boaz Harrosh
  4 siblings, 1 reply; 21+ messages in thread
From: Peng Tao @ 2011-08-07  2:53 UTC (permalink / raw)
  To: Benny Halevy; +Cc: linux-nfs, Peng Tao

The same function is used by idmap, gss and blocklayout code. Make it generic.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/blocklayout.c    |    2 +-
 fs/nfs/blocklayout/blocklayout.h    |    2 --
 fs/nfs/blocklayout/blocklayoutdev.c |   22 ----------------------
 fs/nfs/idmap.c                      |   25 +------------------------
 include/linux/sunrpc/rpc_pipe_fs.h  |    2 ++
 net/sunrpc/auth_gss/auth_gss.c      |   24 ++----------------------
 net/sunrpc/rpc_pipe.c               |   20 ++++++++++++++++++++
 7 files changed, 26 insertions(+), 71 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 3aef9f0..5c14ee2 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -960,7 +960,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 };
 
 static const struct rpc_pipe_ops bl_upcall_ops = {
-	.upcall		= bl_pipe_upcall,
+	.upcall		= rpc_pipe_generic_upcall,
 	.downcall	= bl_pipe_downcall,
 	.destroy_msg	= bl_pipe_destroy_msg,
 };
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f27d827..8900396 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq;
 #define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
 
 /* blocklayoutdev.c */
-ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-		       char __user *, size_t);
 ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
 void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
 struct block_device *nfs4_blkdev_get(dev_t dev);
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a83b393..46a2ce6 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev)
 	return blkdev_put(bdev, FMODE_READ);
 }
 
-/*
- * Shouldn't there be a rpc_generic_upcall() to do this for us?
- */
-ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-		       char __user *dst, size_t buflen)
-{
-	char *data = (char *)msg->data + msg->copied;
-	size_t mlen = min(msg->len - msg->copied, buflen);
-	unsigned long left;
-
-	left = copy_to_user(dst, data, mlen);
-	if (left == mlen) {
-		msg->errno = -EFAULT;
-		return -EFAULT;
-	}
-
-	mlen -= left;
-	msg->copied += mlen;
-	msg->errno = 0;
-	return mlen;
-}
-
 static struct bl_dev_msg bl_mount_reply;
 
 ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index f20801a..47d1c6f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -336,8 +336,6 @@ struct idmap {
 	struct idmap_hashtable	idmap_group_hash;
 };
 
-static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-				 char __user *, size_t);
 static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
 				   size_t);
 static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
@@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 static unsigned int fnvhash32(const void *, size_t);
 
 static const struct rpc_pipe_ops idmap_upcall_ops = {
-	.upcall		= idmap_pipe_upcall,
+	.upcall		= rpc_pipe_generic_upcall,
 	.downcall	= idmap_pipe_downcall,
 	.destroy_msg	= idmap_pipe_destroy_msg,
 };
@@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
 	return ret;
 }
 
-/* RPC pipefs upcall/downcall routines */
-static ssize_t
-idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-		  char __user *dst, size_t buflen)
-{
-	char *data = (char *)msg->data + msg->copied;
-	size_t mlen = min(msg->len, buflen);
-	unsigned long left;
-
-	left = copy_to_user(dst, data, mlen);
-	if (left == mlen) {
-		msg->errno = -EFAULT;
-		return -EFAULT;
-	}
-
-	mlen -= left;
-	msg->copied += mlen;
-	msg->errno = 0;
-	return mlen;
-}
-
 static ssize_t
 idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index cf14db9..e4ea430 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -44,6 +44,8 @@ RPC_I(struct inode *inode)
 	return container_of(inode, struct rpc_inode, vfs_inode);
 }
 
+extern ssize_t rpc_pipe_generic_upcall(struct file *, struct rpc_pipe_msg *,
+				       char __user *, size_t);
 extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *);
 
 struct rpc_clnt;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 364eb45..e9b7693 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -603,26 +603,6 @@ out:
 	return err;
 }
 
-static ssize_t
-gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-		char __user *dst, size_t buflen)
-{
-	char *data = (char *)msg->data + msg->copied;
-	size_t mlen = min(msg->len, buflen);
-	unsigned long left;
-
-	left = copy_to_user(dst, data, mlen);
-	if (left == mlen) {
-		msg->errno = -EFAULT;
-		return -EFAULT;
-	}
-
-	mlen -= left;
-	msg->copied += mlen;
-	msg->errno = 0;
-	return mlen;
-}
-
 #define MSG_BUF_MAXSIZE 1024
 
 static ssize_t
@@ -1590,7 +1570,7 @@ static const struct rpc_credops gss_nullops = {
 };
 
 static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
-	.upcall		= gss_pipe_upcall,
+	.upcall		= rpc_pipe_generic_upcall,
 	.downcall	= gss_pipe_downcall,
 	.destroy_msg	= gss_pipe_destroy_msg,
 	.open_pipe	= gss_pipe_open_v0,
@@ -1598,7 +1578,7 @@ static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
 };
 
 static const struct rpc_pipe_ops gss_upcall_ops_v1 = {
-	.upcall		= gss_pipe_upcall,
+	.upcall		= rpc_pipe_generic_upcall,
 	.downcall	= gss_pipe_downcall,
 	.destroy_msg	= gss_pipe_destroy_msg,
 	.open_pipe	= gss_pipe_open_v1,
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index b181e34..67dbc18 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -77,6 +77,26 @@ rpc_timeout_upcall_queue(struct work_struct *work)
 	rpc_purge_list(rpci, &free_list, destroy_msg, -ETIMEDOUT);
 }
 
+ssize_t rpc_pipe_generic_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+				char __user *dst, size_t buflen)
+{
+	char *data = (char *)msg->data + msg->copied;
+	size_t mlen = min(msg->len - msg->copied, buflen);
+	unsigned long left;
+
+	left = copy_to_user(dst, data, mlen);
+	if (left == mlen) {
+		msg->errno = -EFAULT;
+		return -EFAULT;
+	}
+
+	mlen -= left;
+	msg->copied += mlen;
+	msg->errno = 0;
+	return mlen;
+}
+EXPORT_SYMBOL_GPL(rpc_pipe_generic_upcall);
+
 /**
  * rpc_queue_upcall - queue an upcall message to userspace
  * @inode: inode of upcall pipe on which to queue given message
-- 
1.7.1.262.g5ef3d



* [PATCH 5/5] pnfs: make _set_lo_fail generic
  2011-08-07  2:53 [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Peng Tao
                   ` (2 preceding siblings ...)
  2011-08-07  2:53 ` [PATCH 4/5] SUNRPC/NFS: make rpc pipe upcall generic Peng Tao
@ 2011-08-07  2:53 ` Peng Tao
  2011-08-10 17:52 ` [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Boaz Harrosh
  4 siblings, 0 replies; 21+ messages in thread
From: Peng Tao @ 2011-08-07  2:53 UTC (permalink / raw)
  To: Benny Halevy; +Cc: linux-nfs, Peng Tao

File layout and block layout both use it to set the layout io failure bit. So make it
generic.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/blocklayout/blocklayout.c |   17 +++--------------
 fs/nfs/nfs4filelayout.c          |   19 +++----------------
 fs/nfs/pnfs.c                    |   12 ++++++++++++
 fs/nfs/pnfs.h                    |    1 +
 4 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5c14ee2..56bbd78 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -176,17 +176,6 @@ retry:
 	return bio;
 }
 
-static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
-{
-	if (lseg->pls_range.iomode == IOMODE_RW) {
-		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-	} else {
-		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
-	}
-}
-
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
@@ -206,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err)
 	if (!uptodate) {
 		if (!rdata->pnfs_error)
 			rdata->pnfs_error = -EIO;
-		bl_set_lo_fail(rdata->lseg);
+		pnfs_set_lo_fail(rdata->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -370,7 +359,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	if (!uptodate) {
 		if (!wdata->pnfs_error)
 			wdata->pnfs_error = -EIO;
-		bl_set_lo_fail(wdata->lseg);
+		pnfs_set_lo_fail(wdata->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -386,7 +375,7 @@ static void bl_end_io_write(struct bio *bio, int err)
 	if (!uptodate) {
 		if (!wdata->pnfs_error)
 			wdata->pnfs_error = -EIO;
-		bl_set_lo_fail(wdata->lseg);
+		pnfs_set_lo_fail(wdata->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index e8915d4..4c78c62 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -77,19 +77,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 	BUG();
 }
 
-/* For data server errors we don't recover from */
-static void
-filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
-{
-	if (lseg->pls_range.iomode == IOMODE_RW) {
-		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-	} else {
-		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
-	}
-}
-
 static int filelayout_async_handle_error(struct rpc_task *task,
 					 struct nfs4_state *state,
 					 struct nfs_client *clp,
@@ -145,7 +132,7 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
-			filelayout_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(data->lseg);
 			nfs4_reset_read(task, data);
 			clp = NFS_SERVER(data->inode)->nfs_client;
 		}
@@ -221,7 +208,7 @@ static int filelayout_write_done_cb(struct rpc_task *task,
 		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
-			filelayout_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(data->lseg);
 			nfs4_reset_write(task, data);
 			clp = NFS_SERVER(data->inode)->nfs_client;
 		} else
@@ -256,7 +243,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 			__func__, data->ds_clp, data->ds_clp->cl_session);
 		if (reset) {
 			prepare_to_resend_writes(data);
-			filelayout_set_lo_fail(data->lseg);
+			pnfs_set_lo_fail(data->lseg);
 		} else
 			nfs_restart_rpc(task, data->ds_clp);
 		return -EAGAIN;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e183a4f..4450848 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1423,6 +1423,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 	}
 }
 
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+	} else {
+		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
+
 void
 pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index af7530b..709be0a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -182,6 +182,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
 void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
 bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
-- 
1.7.1.262.g5ef3d



* Re: [PATCH 4/5] SUNRPC/NFS: make rpc pipe upcall generic
  2011-08-07  2:53 ` [PATCH 4/5] SUNRPC/NFS: make rpc pipe upcall generic Peng Tao
@ 2011-08-10 15:59   ` Jim Rees
  0 siblings, 0 replies; 21+ messages in thread
From: Jim Rees @ 2011-08-10 15:59 UTC (permalink / raw)
  To: Peng Tao; +Cc: Benny Halevy, linux-nfs, Peng Tao

Peng Tao wrote:

  The same function is used by idmap, gss and blocklayout code. Make it
  generic.

Thank you.  That one bothered me (see comment) but I didn't have time to
take care of it during the merge.


* Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-07  2:53 [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Peng Tao
                   ` (3 preceding siblings ...)
  2011-08-07  2:53 ` [PATCH 5/5] pnfs: make _set_lo_fail generic Peng Tao
@ 2011-08-10 17:52 ` Boaz Harrosh
  2011-08-11  0:03   ` Peng Tao
  4 siblings, 1 reply; 21+ messages in thread
From: Boaz Harrosh @ 2011-08-10 17:52 UTC (permalink / raw)
  To: Peng Tao, Trond Myklebust; +Cc: Benny Halevy, linux-nfs, Peng Tao

On 08/06/2011 07:53 PM, Peng Tao wrote:
> For pnfs pagelist write failure, we need to pg_recoalesce and resend
> IO to mds.
> 

I have not given this subject any thought or investigation, so I don't
know what we should do, but my gut feeling is that I have seen all this
code elsewhere and that we could get bigger re-use of existing code.

What if we dig into:
	data->mds_ops->rpc_call_done(&data->task, data);
	data->mds_ops->rpc_release(data);

And do all the page tear-down and unlocks there, but if there is an
error, do not mark the pages clean. That is, keep them dirty. Then
mark the layout as failed and let the normal code choose an MDS
write-out. (Just a wild thought.)
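
In rough, untested pseudo-code (the helper name and placement are made
up), the tear-down part could look like:

	/* Hypothetical sketch only: tear down the failed pnfs write but
	 * leave the pages dirty, so that normal writeback resends them
	 * through the MDS. */
	static void pnfs_write_keep_dirty(struct nfs_write_data *data)
	{
		while (!list_empty(&data->pages)) {
			struct nfs_page *req = nfs_list_entry(data->pages.next);
			struct page *page = req->wb_page;

			nfs_list_remove_request(req);
			nfs_clear_page_tag_locked(req);
			end_page_writeback(page);
			set_page_dirty(page);	/* keep it dirty, do NOT mark clean */
		}
		pnfs_set_lo_fail(data->lseg);	/* "mark the layout as error" */
	}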

Trond, please look in here, can't it be made simpler?


> Signed-off-by: Peng Tao <peng_tao@emc.com>
> ---
>  fs/nfs/internal.h |    4 ++++
>  fs/nfs/pnfs.c     |   35 ++++++++++++++++++++++++++++++++---
>  fs/nfs/write.c    |   21 ++++++++++++++-------
>  3 files changed, 50 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index ab12913..62f183d 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -305,6 +305,10 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
>  /* write.c */
>  extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>  		struct list_head *head);
> +extern int do_nfs_writepage(struct page *page, struct writeback_control *wbc,
> +		struct nfs_pageio_descriptor *pgio);
> +extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
> +		struct inode *inode, int ioflags);
>  extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
>  extern void nfs_writedata_release(struct nfs_write_data *wdata);
>  extern void nfs_commit_free(struct nfs_write_data *p);
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index e550e88..08aba45 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1172,6 +1172,13 @@ int
>  pnfs_ld_write_done(struct nfs_write_data *data)
>  {
>  	int status;
> +	struct nfs_pageio_descriptor pgio;
> +	struct writeback_control wbc = {
> +		.sync_mode = WB_SYNC_ALL,
> +		.range_start = data->mds_offset,
> +		.nr_to_write = data->npages,
> +		.range_end = LLONG_MAX,
> +	};
>  
>  	if (!data->pnfs_error) {
>  		pnfs_set_layoutcommit(data);
> @@ -1180,11 +1187,33 @@ pnfs_ld_write_done(struct nfs_write_data *data)
>  		return 0;
>  	}
>  
> +	put_lseg(data->lseg);
> +	data->lseg = NULL;
>  	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
>  		data->pnfs_error);
> -	status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
> -				    data->mds_ops, NFS_FILE_SYNC);
> -	return status ? : -EAGAIN;
> +	nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
> +	pgio.pg_recoalesce = 1;
> +	while (!list_empty(&data->pages)) {
> +		struct nfs_page *req = nfs_list_entry(data->pages.next);
> +		struct page *page = req->wb_page;
> +
> +		nfs_list_remove_request(req);
> +		nfs_clear_page_tag_locked(req);
> +
> +		end_page_writeback(page);
> +
> +		lock_page(page);
> +		status = do_nfs_writepage(page, &wbc, &pgio);
> +		if (status) {
> +			/* FIXME: is this enough?? */
> +			set_page_dirty(page);
> +		}
> +		unlock_page(page);
> +	}
> +	nfs_pageio_complete(&pgio);
> +	nfs_writedata_release(data);
> +
> +	return 0;
>  }
>  EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
>  
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index b39b37f..0ccdf98 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -285,14 +285,9 @@ out:
>  	return ret;
>  }
>  
> -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
> +int do_nfs_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
>  {
> -	struct inode *inode = page->mapping->host;
>  	int ret;
> -
> -	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
> -	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
> -
>  	nfs_pageio_cond_complete(pgio, page->index);
>  	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
>  	if (ret == -EAGAIN) {
> @@ -301,6 +296,17 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
>  	}
>  	return ret;
>  }
> +EXPORT_SYMBOL_GPL(do_nfs_writepage);
> +
> +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)

This is a terrible name, please invent something more appropriate. You can't
have an nfs_do_writepage call a do_nfs_writepage, it's the same name. Please
use a different name that describes the point of this function.
(nfs_writepage_stats ???)

> +{
> +	struct inode *inode = page->mapping->host;
> +
> +	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
> +	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
> +
> +	return do_nfs_writepage(page, wbc, pgio);
> +}
>  
>  /*
>   * Write an mmapped page to the server.
> @@ -1051,12 +1057,13 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
>  	.pg_doio = nfs_generic_pg_writepages,
>  };
>  
> -static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
> +void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>  				  struct inode *inode, int ioflags)
>  {
>  	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
>  				NFS_SERVER(inode)->wsize, ioflags);
>  }
> +EXPORT_SYMBOL_GPL(nfs_pageio_init_write_mds);

Why is this EXPORTed? If you are going to use it from an LD driver later,
in a patch that we have not yet seen, please only make it an export in the
patch that has a user for it. (You don't need EXPORT_X if it is used
by a different file inside nfs.ko; just remove the static.)

>  
>  void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
>  {

Thanks for looking into this issue. Actually, looking back, we always had
a problem here. I was never able to pass my error injection tests.

Boaz


* Re: [PATCH 2/5] pNFS: recoalesce when ld read pagelist fails
  2011-08-07  2:53 ` [PATCH 2/5] pNFS: recoalesce when ld read " Peng Tao
@ 2011-08-10 17:55   ` Boaz Harrosh
  2011-08-11  0:01     ` Peng Tao
  0 siblings, 1 reply; 21+ messages in thread
From: Boaz Harrosh @ 2011-08-10 17:55 UTC (permalink / raw)
  To: Peng Tao; +Cc: Benny Halevy, linux-nfs, Peng Tao

On 08/06/2011 07:53 PM, Peng Tao wrote:
> For pnfs pagelist read failure, we need to pg_recoalesce and resend
> IO to mds.
> 
> Signed-off-by: Peng Tao <peng_tao@emc.com>
> ---
>  fs/nfs/internal.h |    2 ++
>  fs/nfs/pnfs.c     |   18 ++++++++++++++----
>  fs/nfs/read.c     |    3 ++-
>  3 files changed, 18 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index 62f183d..78b662e 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -307,6 +307,8 @@ extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>  		struct list_head *head);
>  extern int do_nfs_writepage(struct page *page, struct writeback_control *wbc,
>  		struct nfs_pageio_descriptor *pgio);
> +extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
> +		struct inode *inode);
>  extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>  		struct inode *inode, int ioflags);
>  extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 08aba45..66fc854 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1300,7 +1300,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
>  int
>  pnfs_ld_read_done(struct nfs_read_data *data)
>  {
> -	int status;
> +	struct nfs_pageio_descriptor pgio;
>  
>  	if (!data->pnfs_error) {
>  		__nfs4_read_done_cb(data);
> @@ -1309,11 +1309,21 @@ pnfs_ld_read_done(struct nfs_read_data *data)
>  		return 0;
>  	}
>  
> +	put_lseg(data->lseg);
> +	data->lseg = NULL;
>  	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
>  		data->pnfs_error);
> -	status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
> -				   data->mds_ops);
> -	return status ? : -EAGAIN;
> +	nfs_pageio_init_read_mds(&pgio, data->inode);
> +	pgio.pg_recoalesce = 1;
> +	while (!list_empty(&data->pages)) {
> +		struct nfs_page *req = nfs_list_entry(data->pages.next);
> +
> +		nfs_pageio_add_request(&pgio, req);
> +	}
> +	nfs_pageio_complete(&pgio);
> +	nfs_readdata_release(data);
> +
> +	return 0;
>  }
>  EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
>  
> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> index 2171c04..2484131 100644
> --- a/fs/nfs/read.c
> +++ b/fs/nfs/read.c
> @@ -112,12 +112,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
>  	}
>  }
>  
> -static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
> +void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
>  		struct inode *inode)
>  {
>  	nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
>  			NFS_SERVER(inode)->rsize, 0);
>  }
> +EXPORT_SYMBOL_GPL(nfs_pageio_init_read_mds);
>  

Here too. Who is the user of this export?

Boaz

>  void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
>  {



* Re: [PATCH 3/5] pnfs: introduce pnfs private workqueue
  2011-08-07  2:53 ` [PATCH 3/5] pnfs: introduce pnfs private workqueue Peng Tao
@ 2011-08-10 18:12   ` Boaz Harrosh
  2011-08-10 18:21     ` Boaz Harrosh
  0 siblings, 1 reply; 21+ messages in thread
From: Boaz Harrosh @ 2011-08-10 18:12 UTC (permalink / raw)
  To: Peng Tao; +Cc: Benny Halevy, linux-nfs, Peng Tao

On 08/06/2011 07:53 PM, Peng Tao wrote:
> For layoutdriver io done functions, default workqueue is not a good place
> as the code is executed in IO path. So add a pnfs private workqueue to handle
> them.
> 
> Also change block and object layout code to make use of this private workqueue.
> 
> Signed-off-by: Peng Tao <peng_tao@emc.com>
> ---
>  fs/nfs/blocklayout/blocklayout.c |   17 ++++++++++----
>  fs/nfs/objlayout/objio_osd.c     |    8 ++++++
>  fs/nfs/objlayout/objlayout.c     |    4 +-
>  fs/nfs/pnfs.c                    |   46 +++++++++++++++++++++++++++++++++++++-
>  fs/nfs/pnfs.h                    |    4 +++
>  5 files changed, 71 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 9561c8f..3aef9f0 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -228,7 +228,7 @@ bl_end_par_io_read(void *data)
>  	struct nfs_read_data *rdata = data;
>  
>  	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
> -	schedule_work(&rdata->task.u.tk_work);
> +	queue_work(pnfsiod_workqueue, &rdata->task.u.tk_work);

If pnfsiod_workqueue is a global pnfs resource that users should
only use with global pnfsiod_start/stop() then I would like
a wrapper around queue_work, pnfsiod_queue_work(tk_work) that
keeps the pnfsiod_workqueue private to that subsystem. (And
the proper checks could be made, and races avoided)
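
Something like this, perhaps (completely untested, just to show the
shape I mean; the name and return convention are invented):

	/* Hypothetical wrapper; pnfsiod_workqueue can then stay
	 * static inside pnfs.c. */
	int pnfsiod_queue_work(struct work_struct *work)
	{
		if (WARN_ON(pnfsiod_workqueue == NULL))
			return -ENODEV;	/* pnfsiod_start() was never called */
		queue_work(pnfsiod_workqueue, work);
		return 0;
	}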

>  }
>  
>  /* We don't want normal .rpc_call_done callback used, so we replace it
> @@ -418,7 +418,7 @@ static void bl_end_par_io_write(void *data)
>  	wdata->task.tk_status = 0;
>  	wdata->verf.committed = NFS_FILE_SYNC;
>  	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
> -	schedule_work(&wdata->task.u.tk_work);
> +	queue_work(pnfsiod_workqueue, &wdata->task.u.tk_work);
>  }
>  
>  /* FIXME STUB - mark intersection of layout and page as bad, so is not
> @@ -977,29 +977,35 @@ static int __init nfs4blocklayout_init(void)
>  	if (ret)
>  		goto out;
>  
> +	ret = pnfsiod_start();
> +	if (ret)
> +		goto out_remove;
> +
>  	init_waitqueue_head(&bl_wq);
>  
>  	mnt = rpc_get_mount();
>  	if (IS_ERR(mnt)) {
>  		ret = PTR_ERR(mnt);
> -		goto out_remove;
> +		goto out_stop;
>  	}
>  
>  	ret = vfs_path_lookup(mnt->mnt_root,
>  			      mnt,
>  			      NFS_PIPE_DIRNAME, 0, &path);
>  	if (ret)
> -		goto out_remove;
> +		goto out_stop;
>  
>  	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
>  				    &bl_upcall_ops, 0);
>  	if (IS_ERR(bl_device_pipe)) {
>  		ret = PTR_ERR(bl_device_pipe);
> -		goto out_remove;
> +		goto out_stop;
>  	}
>  out:
>  	return ret;
>  
> +out_stop:
> +	pnfsiod_stop();
>  out_remove:
>  	pnfs_unregister_layoutdriver(&blocklayout_type);
>  	return ret;
> @@ -1011,6 +1017,7 @@ static void __exit nfs4blocklayout_exit(void)
>  	       __func__);
>  
>  	pnfs_unregister_layoutdriver(&blocklayout_type);
> +	pnfsiod_stop();
>  	rpc_unlink(bl_device_pipe);
>  }
>  
> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> index d0cda12..f28013f 100644
> --- a/fs/nfs/objlayout/objio_osd.c
> +++ b/fs/nfs/objlayout/objio_osd.c
> @@ -1042,7 +1042,14 @@ static int __init
>  objlayout_init(void)
>  {
>  	int ret = pnfs_register_layoutdriver(&objlayout_type);
> +	if (ret)
> +		goto out;
>  
> +	ret = pnfsiod_start();
> +	if (ret)
> +		pnfs_unregister_layoutdriver(&objlayout_type);
> +
> +out:
>  	if (ret)
>  		printk(KERN_INFO
>  			"%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
> @@ -1057,6 +1064,7 @@ static void __exit
>  objlayout_exit(void)
>  {
>  	pnfs_unregister_layoutdriver(&objlayout_type);
> +	pnfsiod_stop();
>  	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
>  	       __func__);
>  }
> diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
> index 1d06f8e..1e7fa05 100644
> --- a/fs/nfs/objlayout/objlayout.c
> +++ b/fs/nfs/objlayout/objlayout.c
> @@ -305,7 +305,7 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
>  		pnfs_ld_read_done(rdata);
>  	else {
>  		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
> -		schedule_work(&rdata->task.u.tk_work);
> +		queue_work(pnfsiod_workqueue, &rdata->task.u.tk_work);
>  	}
>  }
>  
> @@ -396,7 +396,7 @@ objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
>  		pnfs_ld_write_done(wdata);
>  	else {
>  		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
> -		schedule_work(&wdata->task.u.tk_work);
> +		queue_work(pnfsiod_workqueue, &wdata->task.u.tk_work);
>  	}
>  }
>  
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 66fc854..e183a4f 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -38,7 +38,7 @@
>  /* Locking:
>   *
>   * pnfs_spinlock:
> - *      protects pnfs_modules_tbl.
> + *      protects pnfs_modules_tbl, pnfsiod_workqueue and pnfsiod_users.
>   */
>  static DEFINE_SPINLOCK(pnfs_spinlock);
>  
> @@ -47,6 +47,9 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
>   */
>  static LIST_HEAD(pnfs_modules_tbl);
>  
> +struct workqueue_struct *pnfsiod_workqueue;
> +static int pnfsiod_users = 0;
> +
>  /* Return the registered pnfs layout driver module matching given id */
>  static struct pnfs_layoutdriver_type *
>  find_pnfs_driver_locked(u32 id)
> @@ -1517,3 +1520,44 @@ out:
>  	dprintk("<-- %s status %d\n", __func__, status);
>  	return status;
>  }
> +
> +/*
> + * start up the pnfsiod workqueue
> + */
> +int pnfsiod_start(void)
> +{
> +	struct workqueue_struct *wq;
> +	dprintk("RPC:       creating workqueue pnfsiod\n");
> +	wq = alloc_workqueue("pnfsiod", WQ_MEM_RECLAIM, 0);
> +	if (wq == NULL)
> +		return -ENOMEM;
> +	spin_lock(&pnfs_spinlock);
> +	pnfsiod_users++;
> +	if (pnfsiod_workqueue == NULL) {
> +		pnfsiod_workqueue = wq;
> +	} else {
> +		destroy_workqueue(wq);
> +	}
> +	spin_unlock(&pnfs_spinlock);
> +	return 0;
> +}

What? Why not a simple:
+	int ret = 0;

+	spin_lock(&pnfs_spinlock);
+	if (pnfsiod_workqueue == NULL) {
+		pnfsiod_workqueue = alloc_workqueue("pnfsiod", WQ_MEM_RECLAIM, 0);
+		if (unlikely(!pnfsiod_workqueue)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+	pnfsiod_users++;
+ out:
+	spin_unlock(&pnfs_spinlock);
+	return ret;

> +EXPORT_SYMBOL_GPL(pnfsiod_start);
> +
> +/*
> + * Destroy the pnfsiod workqueue
> + */
> +void pnfsiod_stop(void)
> +{
> +	struct workqueue_struct *wq = NULL;
> +
> +	spin_lock(&pnfs_spinlock);
> +	pnfsiod_users--;
> +	if (pnfsiod_users == 0) {
> +		wq = pnfsiod_workqueue;
> +		pnfsiod_workqueue = NULL;
> +	}
> +	spin_unlock(&pnfs_spinlock);
> +	if (wq)
> +		destroy_workqueue(wq);

Does destroy_workqueue wait for all pending work?

> +}
> +EXPORT_SYMBOL_GPL(pnfsiod_stop);
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 01cbfd5..af7530b 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -155,6 +155,10 @@ struct pnfs_devicelist {
>  extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
>  extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
>  
> +extern struct workqueue_struct *pnfsiod_workqueue;

As I said:
+extern int pnfsiod_queue_work(...);

> +extern int pnfsiod_start(void);
> +extern void pnfsiod_stop(void);
> +
>  /* nfs4proc.c */
>  extern int nfs4_proc_getdevicelist(struct nfs_server *server,
>  				   const struct nfs_fh *fh,

Thanks for doing this, looks good otherwise.
Boaz


* Re: [PATCH 3/5] pnfs: introduce pnfs private workqueue
  2011-08-10 18:12   ` Boaz Harrosh
@ 2011-08-10 18:21     ` Boaz Harrosh
  2011-08-11  0:00       ` Peng Tao
  0 siblings, 1 reply; 21+ messages in thread
From: Boaz Harrosh @ 2011-08-10 18:21 UTC (permalink / raw)
  To: Peng Tao; +Cc: Benny Halevy, linux-nfs, Peng Tao

On 08/10/2011 11:12 AM, Boaz Harrosh wrote:
> On 08/06/2011 07:53 PM, Peng Tao wrote:
>> For layoutdriver io done functions, default workqueue is not a good place
>> as the code is executed in IO path. So add a pnfs private workqueue to handle
>> them.
>>
>> Also change block and object layout code to make use of this private workqueue.
>>
>> Signed-off-by: Peng Tao <peng_tao@emc.com>
>> ---
>>  fs/nfs/blocklayout/blocklayout.c |   17 ++++++++++----
>>  fs/nfs/objlayout/objio_osd.c     |    8 ++++++
>>  fs/nfs/objlayout/objlayout.c     |    4 +-
>>  fs/nfs/pnfs.c                    |   46 +++++++++++++++++++++++++++++++++++++-
>>  fs/nfs/pnfs.h                    |    4 +++
>>  5 files changed, 71 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
>> index 9561c8f..3aef9f0 100644
>> --- a/fs/nfs/blocklayout/blocklayout.c
>> +++ b/fs/nfs/blocklayout/blocklayout.c
>> @@ -228,7 +228,7 @@ bl_end_par_io_read(void *data)
>>  	struct nfs_read_data *rdata = data;
>>  
>>  	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
>> -	schedule_work(&rdata->task.u.tk_work);
>> +	queue_work(pnfsiod_workqueue, &rdata->task.u.tk_work);
> 
> If pnfsiod_workqueue is a global pnfs resource that users should
> only use with global pnfsiod_start/stop() then I would like
> a wrapper around queue_work, pnfsiod_queue_work(tk_work) that
> keeps the pnfsiod_workqueue private to that subsystem. (And
> the proper checks could be made, and races avoided)
> 
>>  }
>>  
>>  /* We don't want normal .rpc_call_done callback used, so we replace it
>> @@ -418,7 +418,7 @@ static void bl_end_par_io_write(void *data)
>>  	wdata->task.tk_status = 0;
>>  	wdata->verf.committed = NFS_FILE_SYNC;
>>  	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
>> -	schedule_work(&wdata->task.u.tk_work);
>> +	queue_work(pnfsiod_workqueue, &wdata->task.u.tk_work);
>>  }
>>  
>>  /* FIXME STUB - mark intersection of layout and page as bad, so is not
>> @@ -977,29 +977,35 @@ static int __init nfs4blocklayout_init(void)
>>  	if (ret)
>>  		goto out;
>>  
>> +	ret = pnfsiod_start();
>> +	if (ret)
>> +		goto out_remove;
>> +
>>  	init_waitqueue_head(&bl_wq);
>>  
>>  	mnt = rpc_get_mount();
>>  	if (IS_ERR(mnt)) {
>>  		ret = PTR_ERR(mnt);
>> -		goto out_remove;
>> +		goto out_stop;
>>  	}
>>  
>>  	ret = vfs_path_lookup(mnt->mnt_root,
>>  			      mnt,
>>  			      NFS_PIPE_DIRNAME, 0, &path);
>>  	if (ret)
>> -		goto out_remove;
>> +		goto out_stop;
>>  
>>  	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
>>  				    &bl_upcall_ops, 0);
>>  	if (IS_ERR(bl_device_pipe)) {
>>  		ret = PTR_ERR(bl_device_pipe);
>> -		goto out_remove;
>> +		goto out_stop;
>>  	}
>>  out:
>>  	return ret;
>>  
>> +out_stop:
>> +	pnfsiod_stop();
>>  out_remove:
>>  	pnfs_unregister_layoutdriver(&blocklayout_type);
>>  	return ret;
>> @@ -1011,6 +1017,7 @@ static void __exit nfs4blocklayout_exit(void)
>>  	       __func__);
>>  
>>  	pnfs_unregister_layoutdriver(&blocklayout_type);
>> +	pnfsiod_stop();
>>  	rpc_unlink(bl_device_pipe);
>>  }
>>  
>> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
>> index d0cda12..f28013f 100644
>> --- a/fs/nfs/objlayout/objio_osd.c
>> +++ b/fs/nfs/objlayout/objio_osd.c
>> @@ -1042,7 +1042,14 @@ static int __init
>>  objlayout_init(void)
>>  {
>>  	int ret = pnfs_register_layoutdriver(&objlayout_type);
>> +	if (ret)
>> +		goto out;
>>  
>> +	ret = pnfsiod_start();
>> +	if (ret)
>> +		pnfs_unregister_layoutdriver(&objlayout_type);
>> +
>> +out:
>>  	if (ret)
>>  		printk(KERN_INFO
>>  			"%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
>> @@ -1057,6 +1064,7 @@ static void __exit
>>  objlayout_exit(void)
>>  {
>>  	pnfs_unregister_layoutdriver(&objlayout_type);
>> +	pnfsiod_stop();
>>  	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
>>  	       __func__);
>>  }
>> diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
>> index 1d06f8e..1e7fa05 100644
>> --- a/fs/nfs/objlayout/objlayout.c
>> +++ b/fs/nfs/objlayout/objlayout.c
>> @@ -305,7 +305,7 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
>>  		pnfs_ld_read_done(rdata);
>>  	else {
>>  		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
>> -		schedule_work(&rdata->task.u.tk_work);
>> +		queue_work(pnfsiod_workqueue, &rdata->task.u.tk_work);
>>  	}
>>  }
>>  
>> @@ -396,7 +396,7 @@ objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
>>  		pnfs_ld_write_done(wdata);
>>  	else {
>>  		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
>> -		schedule_work(&wdata->task.u.tk_work);
>> +		queue_work(pnfsiod_workqueue, &wdata->task.u.tk_work);
>>  	}
>>  }
>>  
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 66fc854..e183a4f 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -38,7 +38,7 @@
>>  /* Locking:
>>   *
>>   * pnfs_spinlock:
>> - *      protects pnfs_modules_tbl.
>> + *      protects pnfs_modules_tbl, pnfsiod_workqueue and pnfsiod_users.
>>   */
>>  static DEFINE_SPINLOCK(pnfs_spinlock);
>>  
>> @@ -47,6 +47,9 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
>>   */
>>  static LIST_HEAD(pnfs_modules_tbl);
>>  
>> +struct workqueue_struct *pnfsiod_workqueue;
>> +static int pnfsiod_users = 0;
>> +
>>  /* Return the registered pnfs layout driver module matching given id */
>>  static struct pnfs_layoutdriver_type *
>>  find_pnfs_driver_locked(u32 id)
>> @@ -1517,3 +1520,44 @@ out:
>>  	dprintk("<-- %s status %d\n", __func__, status);
>>  	return status;
>>  }
>> +
>> +/*
>> + * start up the pnfsiod workqueue
>> + */
>> +int pnfsiod_start(void)
>> +{
>> +	struct workqueue_struct *wq;
>> +	dprintk("RPC:       creating workqueue pnfsiod\n");
>> +	wq = alloc_workqueue("pnfsiod", WQ_MEM_RECLAIM, 0);
>> +	if (wq == NULL)
>> +		return -ENOMEM;
>> +	spin_lock(&pnfs_spinlock);
>> +	pnfsiod_users++;
>> +	if (pnfsiod_workqueue == NULL) {
>> +		pnfsiod_workqueue = wq;
>> +	} else {
>> +		destroy_workqueue(wq);
>> +	}
>> +	spin_unlock(&pnfs_spinlock);
>> +	return 0;
>> +}
> 
> What? Why not a simple:

Rrr right. Then you can't use a spin_lock. Sorry for
the noise, it's fine. You are re-using an existing
spin_lock. The natural thing would be to use a mutex
in such a cold path.
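
With a mutex it would be something like this (again untested, just a
sketch of what I mean):

	static DEFINE_MUTEX(pnfsiod_mutex);

	int pnfsiod_start(void)
	{
		int ret = 0;

		mutex_lock(&pnfsiod_mutex);
		if (pnfsiod_workqueue == NULL) {
			/* alloc_workqueue() may sleep, which is fine
			 * under a mutex in this cold path */
			pnfsiod_workqueue = alloc_workqueue("pnfsiod",
						WQ_MEM_RECLAIM, 0);
			if (unlikely(!pnfsiod_workqueue))
				ret = -ENOMEM;
		}
		if (!ret)
			pnfsiod_users++;
		mutex_unlock(&pnfsiod_mutex);
		return ret;
	}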

Boaz

> +	int ret = 0;
> 
> +	spin_lock(&pnfs_spinlock);
> +	if (pnfsiod_workqueue == NULL) {
> +		pnfsiod_workqueue = alloc_workqueue("pnfsiod", WQ_MEM_RECLAIM, 0);
> +		if (unlikely(!pnfsiod_workqueue)) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +	}
> +	pnfsiod_users++;
> + out:
> +	spin_unlock(&pnfs_spinlock);
> +	return ret;
> 
>> +EXPORT_SYMBOL_GPL(pnfsiod_start);
>> +
>> +/*
>> + * Destroy the pnfsiod workqueue
>> + */
>> +void pnfsiod_stop(void)
>> +{
>> +	struct workqueue_struct *wq = NULL;
>> +
>> +	spin_lock(&pnfs_spinlock);
>> +	pnfsiod_users--;
>> +	if (pnfsiod_users == 0) {
>> +		wq = pnfsiod_workqueue;
>> +		pnfsiod_workqueue = NULL;
>> +	}
>> +	spin_unlock(&pnfs_spinlock);
>> +	if (wq)
>> +		destroy_workqueue(wq);
> 
> Does destroy_workqueue wait for all pending work?
> 
>> +}
>> +EXPORT_SYMBOL_GPL(pnfsiod_stop);
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 01cbfd5..af7530b 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -155,6 +155,10 @@ struct pnfs_devicelist {
>>  extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
>>  extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
>>  
>> +extern struct workqueue_struct *pnfsiod_workqueue;
> 
> As I said:
> +extern int pnfsiod_queue_work(...);
> 
>> +extern int pnfsiod_start(void);
>> +extern void pnfsiod_stop(void);
>> +
>>  /* nfs4proc.c */
>>  extern int nfs4_proc_getdevicelist(struct nfs_server *server,
>>  				   const struct nfs_fh *fh,
> 
> Thanks for doing this, looks good otherwise.
> Boaz
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



* Re: [PATCH 3/5] pnfs: introduce pnfs private workqueue
  2011-08-10 18:21     ` Boaz Harrosh
@ 2011-08-11  0:00       ` Peng Tao
  0 siblings, 0 replies; 21+ messages in thread
From: Peng Tao @ 2011-08-11  0:00 UTC (permalink / raw)
  To: Boaz Harrosh; +Cc: Benny Halevy, linux-nfs, Peng Tao

Hi, Boaz,

On Thu, Aug 11, 2011 at 2:21 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 08/10/2011 11:12 AM, Boaz Harrosh wrote:
>> On 08/06/2011 07:53 PM, Peng Tao wrote:
>>> For layoutdriver io done functions, default workqueue is not a good place
>>> as the code is executed in IO path. So add a pnfs private workqueue to handle
>>> them.
>>>
>>> Also change block and object layout code to make use of this private workqueue.
>>>
>>> Signed-off-by: Peng Tao <peng_tao@emc.com>
>>> ---
>>>  fs/nfs/blocklayout/blocklayout.c |   17 ++++++++++----
>>>  fs/nfs/objlayout/objio_osd.c     |    8 ++++++
>>>  fs/nfs/objlayout/objlayout.c     |    4 +-
>>>  fs/nfs/pnfs.c                    |   46 +++++++++++++++++++++++++++++++++++++-
>>>  fs/nfs/pnfs.h                    |    4 +++
>>>  5 files changed, 71 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
>>> index 9561c8f..3aef9f0 100644
>>> --- a/fs/nfs/blocklayout/blocklayout.c
>>> +++ b/fs/nfs/blocklayout/blocklayout.c
>>> @@ -228,7 +228,7 @@ bl_end_par_io_read(void *data)
>>>      struct nfs_read_data *rdata = data;
>>>
>>>      INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
>>> -    schedule_work(&rdata->task.u.tk_work);
>>> +    queue_work(pnfsiod_workqueue, &rdata->task.u.tk_work);
>>
>> If pnfsiod_workqueue is a global pnfs resource that users should
>> only use with global pnfsiod_start/stop() then I would like
>> a wrapper around queue_work, pnfsiod_queue_work(tk_work) that
>> keeps the pnfsiod_workqueue private to that subsystem. (And
>> the proper checks could be made, and races avoided)
Thanks. I will wrap it so that callers don't need to access
pnfsiod_workqueue directly. And what kind of checks do you suggest? It
seems to me that there is no race in queuing work on
pnfsiod_workqueue. The only race is between create/destroy, because
both block and object need it but we only want one pnfs private
workqueue.

>>
>>>  }
>>>
>>>  /* We don't want normal .rpc_call_done callback used, so we replace it
>>> @@ -418,7 +418,7 @@ static void bl_end_par_io_write(void *data)
>>>      wdata->task.tk_status = 0;
>>>      wdata->verf.committed = NFS_FILE_SYNC;
>>>      INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
>>> -    schedule_work(&wdata->task.u.tk_work);
>>> +    queue_work(pnfsiod_workqueue, &wdata->task.u.tk_work);
>>>  }
>>>
>>>  /* FIXME STUB - mark intersection of layout and page as bad, so is not
>>> @@ -977,29 +977,35 @@ static int __init nfs4blocklayout_init(void)
>>>      if (ret)
>>>              goto out;
>>>
>>> +    ret = pnfsiod_start();
>>> +    if (ret)
>>> +            goto out_remove;
>>> +
>>>      init_waitqueue_head(&bl_wq);
>>>
>>>      mnt = rpc_get_mount();
>>>      if (IS_ERR(mnt)) {
>>>              ret = PTR_ERR(mnt);
>>> -            goto out_remove;
>>> +            goto out_stop;
>>>      }
>>>
>>>      ret = vfs_path_lookup(mnt->mnt_root,
>>>                            mnt,
>>>                            NFS_PIPE_DIRNAME, 0, &path);
>>>      if (ret)
>>> -            goto out_remove;
>>> +            goto out_stop;
>>>
>>>      bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
>>>                                  &bl_upcall_ops, 0);
>>>      if (IS_ERR(bl_device_pipe)) {
>>>              ret = PTR_ERR(bl_device_pipe);
>>> -            goto out_remove;
>>> +            goto out_stop;
>>>      }
>>>  out:
>>>      return ret;
>>>
>>> +out_stop:
>>> +    pnfsiod_stop();
>>>  out_remove:
>>>      pnfs_unregister_layoutdriver(&blocklayout_type);
>>>      return ret;
>>> @@ -1011,6 +1017,7 @@ static void __exit nfs4blocklayout_exit(void)
>>>             __func__);
>>>
>>>      pnfs_unregister_layoutdriver(&blocklayout_type);
>>> +    pnfsiod_stop();
>>>      rpc_unlink(bl_device_pipe);
>>>  }
>>>
>>> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
>>> index d0cda12..f28013f 100644
>>> --- a/fs/nfs/objlayout/objio_osd.c
>>> +++ b/fs/nfs/objlayout/objio_osd.c
>>> @@ -1042,7 +1042,14 @@ static int __init
>>>  objlayout_init(void)
>>>  {
>>>      int ret = pnfs_register_layoutdriver(&objlayout_type);
>>> +    if (ret)
>>> +            goto out;
>>>
>>> +    ret = pnfsiod_start();
>>> +    if (ret)
>>> +            pnfs_unregister_layoutdriver(&objlayout_type);
>>> +
>>> +out:
>>>      if (ret)
>>>              printk(KERN_INFO
>>>                      "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
>>> @@ -1057,6 +1064,7 @@ static void __exit
>>>  objlayout_exit(void)
>>>  {
>>>      pnfs_unregister_layoutdriver(&objlayout_type);
>>> +    pnfsiod_stop();
>>>      printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
>>>             __func__);
>>>  }
>>> diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
>>> index 1d06f8e..1e7fa05 100644
>>> --- a/fs/nfs/objlayout/objlayout.c
>>> +++ b/fs/nfs/objlayout/objlayout.c
>>> @@ -305,7 +305,7 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
>>>              pnfs_ld_read_done(rdata);
>>>      else {
>>>              INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
>>> -            schedule_work(&rdata->task.u.tk_work);
>>> +    queue_work(pnfsiod_workqueue, &rdata->task.u.tk_work);
>>>      }
>>>  }
>>>
>>> @@ -396,7 +396,7 @@ objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
>>>              pnfs_ld_write_done(wdata);
>>>      else {
>>>              INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
>>> -            schedule_work(&wdata->task.u.tk_work);
>>> +    queue_work(pnfsiod_workqueue, &wdata->task.u.tk_work);
>>>      }
>>>  }
>>>
>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>> index 66fc854..e183a4f 100644
>>> --- a/fs/nfs/pnfs.c
>>> +++ b/fs/nfs/pnfs.c
>>> @@ -38,7 +38,7 @@
>>>  /* Locking:
>>>   *
>>>   * pnfs_spinlock:
>>> - *      protects pnfs_modules_tbl.
>>> + *      protects pnfs_modules_tbl, pnfsiod_workqueue and pnfsiod_users.
>>>   */
>>>  static DEFINE_SPINLOCK(pnfs_spinlock);
>>>
>>> @@ -47,6 +47,9 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
>>>   */
>>>  static LIST_HEAD(pnfs_modules_tbl);
>>>
>>> +struct workqueue_struct *pnfsiod_workqueue;
>>> +static int pnfsiod_users = 0;
>>> +
>>>  /* Return the registered pnfs layout driver module matching given id */
>>>  static struct pnfs_layoutdriver_type *
>>>  find_pnfs_driver_locked(u32 id)
>>> @@ -1517,3 +1520,44 @@ out:
>>>      dprintk("<-- %s status %d\n", __func__, status);
>>>      return status;
>>>  }
>>> +
>>> +/*
>>> + * start up the pnfsiod workqueue
>>> + */
>>> +int pnfsiod_start(void)
>>> +{
>>> +    struct workqueue_struct *wq;
>>> +    dprintk("RPC:       creating workqueue pnfsiod\n");
>>> +    wq = alloc_workqueue("pnfsiod", WQ_MEM_RECLAIM, 0);
>>> +    if (wq == NULL)
>>> +            return -ENOMEM;
>>> +    spin_lock(&pnfs_spinlock);
>>> +    pnfsiod_users++;
>>> +    if (pnfsiod_workqueue == NULL) {
>>> +            pnfsiod_workqueue = wq;
>>> +    } else {
>>> +            destroy_workqueue(wq);
>>> +    }
>>> +    spin_unlock(&pnfs_spinlock);
>>> +    return 0;
>>> +}
>>
>> What? why not a simple:
>
> Rrr right. Then you can't use a spin_lock. Sorry for
> the noise it's fine. You are re-using an existing
> spin_lock. The natural is to use a mutex in such a
> cold path
My understanding is that the critical section doesn't need to sleep
and is short enough. Also, it is only executed at module load/unload.
So I think reusing an existing spinlock makes life much easier
than creating a new mutex just for it.
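
For reference, the mutex variant would look roughly like this (just a
sketch of the alternative, not what the patch does):

	static DEFINE_MUTEX(pnfsiod_mutex);

	int pnfsiod_start(void)
	{
		int ret = 0;

		mutex_lock(&pnfsiod_mutex);
		if (pnfsiod_workqueue == NULL) {
			pnfsiod_workqueue =
				alloc_workqueue("pnfsiod", WQ_MEM_RECLAIM, 0);
			if (pnfsiod_workqueue == NULL)
				ret = -ENOMEM;
		}
		if (!ret)
			pnfsiod_users++;
		mutex_unlock(&pnfsiod_mutex);
		return ret;
	}

A mutex would let alloc_workqueue() be called under the lock, and so
avoid the allocate-then-maybe-destroy dance, but I don't think this
cold path justifies adding a new lock.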

Thanks,
Tao

>
> Boaz
>
>> +     int ret = 0;
>>
>> +     spin_lock(&pnfs_spinlock);
>> +     if (pnfsiod_workqueue == NULL) {
>> +             pnfsiod_workqueue = alloc_workqueue("pnfsiod", WQ_MEM_RECLAIM, 0);
>> +             if (unlikely(!pnfsiod_workqueue)) {
>> +                     ret = -ENOMEM;
>> +                     goto out;
>> +             }
>> +     }
>> +     pnfsiod_users++;
>> + out:
>> +     spin_unlock(&pnfs_spinlock);
>> +     return ret;
>>
>>> +EXPORT_SYMBOL_GPL(pnfsiod_start);
>>> +
>>> +/*
>>> + * Destroy the pnfsiod workqueue
>>> + */
>>> +void pnfsiod_stop(void)
>>> +{
>>> +    struct workqueue_struct *wq = NULL;
>>> +
>>> +    spin_lock(&pnfs_spinlock);
>>> +    pnfsiod_users--;
>>> +    if (pnfsiod_users == 0) {
>>> +            wq = pnfsiod_workqueue;
>>> +            pnfsiod_workqueue = NULL;
>>> +    }
>>> +    spin_unlock(&pnfs_spinlock);
>>> +    if (wq)
>>> +            destroy_workqueue(wq);
>>
>> Does destroy_workqueue wait for all pending work?
>>
>>> +}
>>> +EXPORT_SYMBOL_GPL(pnfsiod_stop);
>>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>>> index 01cbfd5..af7530b 100644
>>> --- a/fs/nfs/pnfs.h
>>> +++ b/fs/nfs/pnfs.h
>>> @@ -155,6 +155,10 @@ struct pnfs_devicelist {
>>>  extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
>>>  extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
>>>
>>> +extern struct workqueue_struct *pnfsiod_workqueue;
>>
>> As I said:
>> +extern int pnfsiod_queue_work(...);
>>
>>> +extern int pnfsiod_start(void);
>>> +extern void pnfsiod_stop(void);
>>> +
>>>  /* nfs4proc.c */
>>>  extern int nfs4_proc_getdevicelist(struct nfs_server *server,
>>>                                 const struct nfs_fh *fh,
>>
>> Thanks for doing this, looks good otherwise
>> Boaz
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/5] pNFS: recoalesce when ld read pagelist fails
  2011-08-10 17:55   ` Boaz Harrosh
@ 2011-08-11  0:01     ` Peng Tao
  0 siblings, 0 replies; 21+ messages in thread
From: Peng Tao @ 2011-08-11  0:01 UTC (permalink / raw)
  To: Boaz Harrosh; +Cc: Benny Halevy, linux-nfs, Peng Tao

On Thu, Aug 11, 2011 at 1:55 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 08/06/2011 07:53 PM, Peng Tao wrote:
>> For pnfs pagelist read failure, we need to pg_recoalesce and resend
>> IO to mds.
>>
>> Signed-off-by: Peng Tao <peng_tao@emc.com>
>> ---
>>  fs/nfs/internal.h |    2 ++
>>  fs/nfs/pnfs.c     |   18 ++++++++++++++----
>>  fs/nfs/read.c     |    3 ++-
>>  3 files changed, 18 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>> index 62f183d..78b662e 100644
>> --- a/fs/nfs/internal.h
>> +++ b/fs/nfs/internal.h
>> @@ -307,6 +307,8 @@ extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>>  		struct list_head *head);
>>  extern int do_nfs_writepage(struct page *page, struct writeback_control *wbc,
>>  		struct nfs_pageio_descriptor *pgio);
>> +extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
>> +		struct inode *inode);
>>  extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>>  		struct inode *inode, int ioflags);
>>  extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 08aba45..66fc854 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -1300,7 +1300,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
>>  int
>>  pnfs_ld_read_done(struct nfs_read_data *data)
>>  {
>> -	int status;
>> +	struct nfs_pageio_descriptor pgio;
>>
>>  	if (!data->pnfs_error) {
>>  		__nfs4_read_done_cb(data);
>> @@ -1309,11 +1309,21 @@ pnfs_ld_read_done(struct nfs_read_data *data)
>>  		return 0;
>>  	}
>>
>> +	put_lseg(data->lseg);
>> +	data->lseg = NULL;
>>  	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
>>  		data->pnfs_error);
>> -	status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
>> -				    data->mds_ops);
>> -	return status ? : -EAGAIN;
>> +	nfs_pageio_init_read_mds(&pgio, data->inode);
>> +	pgio.pg_recoalesce = 1;
>> +	while (!list_empty(&data->pages)) {
>> +		struct nfs_page *req = nfs_list_entry(data->pages.next);
>> +
>> +		nfs_pageio_add_request(&pgio, req);
>> +	}
>> +	nfs_pageio_complete(&pgio);
>> +	nfs_readdata_release(data);
>> +
>> +	return 0;
>>  }
>>  EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
>>
>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>> index 2171c04..2484131 100644
>> --- a/fs/nfs/read.c
>> +++ b/fs/nfs/read.c
>> @@ -112,12 +112,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
>>  	}
>>  }
>>
>> -static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
>> +void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
>>  		struct inode *inode)
>>  {
>>  	nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
>>  			NFS_SERVER(inode)->rsize, 0);
>>  }
>> +EXPORT_SYMBOL_GPL(nfs_pageio_init_read_mds);
>>
>
> Here too. Who is the user of this export?
Will remove it. Thanks.

>
> Boaz
>
>>  void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
>>  {
>
>



-- 
Thanks,
-Bergwolf

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-10 17:52 ` [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Boaz Harrosh
@ 2011-08-11  0:03   ` Peng Tao
  2011-08-11 18:53     ` Boaz Harrosh
  0 siblings, 1 reply; 21+ messages in thread
From: Peng Tao @ 2011-08-11  0:03 UTC (permalink / raw)
  To: Boaz Harrosh; +Cc: Trond Myklebust, Benny Halevy, linux-nfs, Peng Tao

On Thu, Aug 11, 2011 at 1:52 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 08/06/2011 07:53 PM, Peng Tao wrote:
>> For pnfs pagelist write failure, we need to pg_recoalesce and resend
>> IO to mds.
>>
>
> I have not given this subject any thought or investigation, so I don't
> know what we should do, but the gut feeling is that I have seen all this
> code elsewhere and we could be reusing more of the existing code.
>
> What if we dig into:
>        data->mds_ops->rpc_call_done(&data->task, data);
>        data->mds_ops->rpc_release(data);
>
> And do all the pages tear-down and unlocks, but if there is an error,
> do not set them as clean. That is, keep them dirty. Then mark the layout
> as in error and let the normal code choose an MDS write-out. (Just a wild
> thought.)
This may work only for write failures. But for read, we will have to
recoalesce and send to MDS. So I prefer to let read and write have
a similar retry code path like this.

>
> Trond please look in here, can't it be made simpler?
>
>
>> Signed-off-by: Peng Tao <peng_tao@emc.com>
>> ---
>>  fs/nfs/internal.h |    4 ++++
>>  fs/nfs/pnfs.c     |   35 ++++++++++++++++++++++++++++++++---
>>  fs/nfs/write.c    |   21 ++++++++++++++-------
>>  3 files changed, 50 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>> index ab12913..62f183d 100644
>> --- a/fs/nfs/internal.h
>> +++ b/fs/nfs/internal.h
>> @@ -305,6 +305,10 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
>>  /* write.c */
>>  extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>>               struct list_head *head);
>> +extern int do_nfs_writepage(struct page *page, struct writeback_control *wbc,
>> +             struct nfs_pageio_descriptor *pgio);
>> +extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>> +             struct inode *inode, int ioflags);
>>  extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
>>  extern void nfs_writedata_release(struct nfs_write_data *wdata);
>>  extern void nfs_commit_free(struct nfs_write_data *p);
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index e550e88..08aba45 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -1172,6 +1172,13 @@ int
>>  pnfs_ld_write_done(struct nfs_write_data *data)
>>  {
>>       int status;
>> +     struct nfs_pageio_descriptor pgio;
>> +     struct writeback_control wbc = {
>> +             .sync_mode = WB_SYNC_ALL,
>> +             .range_start = data->mds_offset,
>> +             .nr_to_write = data->npages,
>> +             .range_end = LLONG_MAX,
>> +     };
>>
>>       if (!data->pnfs_error) {
>>               pnfs_set_layoutcommit(data);
>> @@ -1180,11 +1187,33 @@ pnfs_ld_write_done(struct nfs_write_data *data)
>>               return 0;
>>       }
>>
>> +     put_lseg(data->lseg);
>> +     data->lseg = NULL;
>>       dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
>>               data->pnfs_error);
>> -     status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
>> -                                 data->mds_ops, NFS_FILE_SYNC);
>> -     return status ? : -EAGAIN;
>> +     nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
>> +     pgio.pg_recoalesce = 1;
>> +     while (!list_empty(&data->pages)) {
>> +             struct nfs_page *req = nfs_list_entry(data->pages.next);
>> +             struct page *page = req->wb_page;
>> +
>> +             nfs_list_remove_request(req);
>> +             nfs_clear_page_tag_locked(req);
>> +
>> +             end_page_writeback(page);
>> +
>> +             lock_page(page);
>> +             status = do_nfs_writepage(page, &wbc, &pgio);
>> +             if (status) {
>> +                     /* FIXME: is this enough?? */
>> +                     set_page_dirty(page);
>> +             }
>> +             unlock_page(page);
>> +     }
>> +     nfs_pageio_complete(&pgio);
>> +     nfs_writedata_release(data);
>> +
>> +     return 0;
>>  }
>>  EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
>>
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index b39b37f..0ccdf98 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -285,14 +285,9 @@ out:
>>       return ret;
>>  }
>>
>> -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
>> +int do_nfs_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
>>  {
>> -     struct inode *inode = page->mapping->host;
>>       int ret;
>> -
>> -     nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
>> -     nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
>> -
>>       nfs_pageio_cond_complete(pgio, page->index);
>>       ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
>>       if (ret == -EAGAIN) {
>> @@ -301,6 +296,17 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
>>       }
>>       return ret;
>>  }
>> +EXPORT_SYMBOL_GPL(do_nfs_writepage);
>> +
>> +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
>
> This is a terrible name; please invent something more appropriate. You can't have an
> nfs_do_writepage call a do_nfs_writepage; it's the same name. Please use a different name
> that describes the point of this function. (nfs_writepage_stats???)
Agreed. Will change it.

>
>> +{
>> +     struct inode *inode = page->mapping->host;
>> +
>> +     nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
>> +     nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
>> +
>> +     return do_nfs_writepage(page, wbc, pgio);
>> +}
>>
>>  /*
>>   * Write an mmapped page to the server.
>> @@ -1051,12 +1057,13 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
>>       .pg_doio = nfs_generic_pg_writepages,
>>  };
>>
>> -static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>> +void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>>                                 struct inode *inode, int ioflags)
>>  {
>>       nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
>>                               NFS_SERVER(inode)->wsize, ioflags);
>>  }
>> +EXPORT_SYMBOL_GPL(nfs_pageio_init_write_mds);
>
> Why is this an EXPORT? If you are going to use it from an LD driver later
> in a patch that we have not yet seen, please only make it an export in the
> patch that has a user for it. (You don't need EXPORT_SYMBOL if it is used
> by a different file inside nfs.ko; just remove the static.)
Will change it. Thanks.

>
>>
>>  void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
>>  {
>
> Thanks for looking into this issue. Actually, looking back, we always had
> a problem here. I was never able to pass my error injection tests.
Thanks for reviewing. I will wait for Trond's comments before sending
the next version.

-- 
Thanks,
-Bergwolf

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-11  0:03   ` Peng Tao
@ 2011-08-11 18:53     ` Boaz Harrosh
  2011-08-11 23:53       ` Peng Tao
  0 siblings, 1 reply; 21+ messages in thread
From: Boaz Harrosh @ 2011-08-11 18:53 UTC (permalink / raw)
  To: Peng Tao; +Cc: Trond Myklebust, Benny Halevy, linux-nfs, Peng Tao

On 08/10/2011 05:03 PM, Peng Tao wrote:
> On Thu, Aug 11, 2011 at 1:52 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
>> On 08/06/2011 07:53 PM, Peng Tao wrote:
>>> For pnfs pagelist write failure, we need to pg_recoalesce and resend
>>> IO to mds.
>>>
>>
>> I have not given this subject any thought or investigation, so I don't
>> know what we should do, but the gut feeling is that I have seen all this
>> code elsewhere and we could be reusing more of the existing code.
>>
>> What if we dig into:
>>        data->mds_ops->rpc_call_done(&data->task, data);
>>        data->mds_ops->rpc_release(data);
>>
>> And do all the pages tear-down and unlocks, but if there is an error,
>> do not set them as clean. That is, keep them dirty. Then mark the layout
>> as in error and let the normal code choose an MDS write-out. (Just a wild
>> thought.)
> This may work only for write failures. But for read, we will have to
> recoalesce and send to MDS. So I prefer to let read and write have
> a similar retry code path like this.
> 

I disagree. Look, even now the read path is very different than the write
path. (See your two patches: the write patch is 3 times bigger than the read patch.)

You should see if what I say is possible for write. And then maybe
something will come up also for read. They do not necessarily need to be
the same. (I think)

>>
>> Trond please look in here, can't it be made simpler?

Thanks
Boaz

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-11 18:53     ` Boaz Harrosh
@ 2011-08-11 23:53       ` Peng Tao
  2011-08-12  0:10         ` Boaz Harrosh
  0 siblings, 1 reply; 21+ messages in thread
From: Peng Tao @ 2011-08-11 23:53 UTC (permalink / raw)
  To: Boaz Harrosh; +Cc: Trond Myklebust, Benny Halevy, linux-nfs, Peng Tao

On Fri, Aug 12, 2011 at 2:53 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 08/10/2011 05:03 PM, Peng Tao wrote:
>> On Thu, Aug 11, 2011 at 1:52 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
>>> On 08/06/2011 07:53 PM, Peng Tao wrote:
>>>> For pnfs pagelist write failure, we need to pg_recoalesce and resend
>>>> IO to mds.
>>>>
>>>
>>> I have not given this subject any thought or investigation, so I don't
>>> know what we should do, but the gut feeling is that I have seen all this
>>> code elsewhere and we could be reusing more of the existing code.
>>>
>>> What if we dig into:
>>>        data->mds_ops->rpc_call_done(&data->task, data);
>>>        data->mds_ops->rpc_release(data);
>>>
>>> And do all the pages tear-down and unlocks, but if there is an error,
>>> do not set them as clean. That is, keep them dirty. Then mark the layout
>>> as in error and let the normal code choose an MDS write-out. (Just a wild
>>> thought.)
>> This may work only for write failures. But for read, we will have to
>> recoalesce and send to MDS. So I prefer to let read and write have
>> a similar retry code path like this.
>>
>
> I disagree. Look, even now the read path is very different than the write
> path. (See your two patches: the write patch is 3 times bigger than the read patch.)
I mean their logic is the same: if pnfs_error is set, recoalesce the
pages and re-send to MDS :)

>
> You should see if what I say is possible for write. And then maybe
> something will come up also for read. They do not necessarily need to be
> the same. (I think)
I agree that it is possible for write. We can re-dirty the pages and
rely on the next flush to write them out to MDS. Trond mentioned this
before. However, the method won't work for read failures. I don't see
how we can queue failed read pages and let someone else re-send them
later.
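
To be concrete, the re-dirty approach for writes would amount to
something like the following in the error path (a rough sketch built
only from calls that already appear in my patch, not tested code):

	while (!list_empty(&data->pages)) {
		struct nfs_page *req = nfs_list_entry(data->pages.next);
		struct page *page = req->wb_page;

		nfs_list_remove_request(req);
		set_page_dirty(page);	/* keep dirty; retried by next flush */
		nfs_clear_page_tag_locked(req);
		end_page_writeback(page);
	}

One thing to verify is who performs that next flush if the failed
write was the last one.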

-- 
Thanks,
Tao

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-11 23:53       ` Peng Tao
@ 2011-08-12  0:10         ` Boaz Harrosh
  2011-08-12  2:07           ` tao.peng
  2011-08-16  7:20           ` tao.peng
  0 siblings, 2 replies; 21+ messages in thread
From: Boaz Harrosh @ 2011-08-12  0:10 UTC (permalink / raw)
  To: Peng Tao; +Cc: Trond Myklebust, Benny Halevy, linux-nfs, Peng Tao

On 08/11/2011 04:53 PM, Peng Tao wrote:
>>
>> You should see if what I say is possible for write. And then maybe
>> something will come up also for read. They do not necessarily need to be
>> the same. (I think)
> I agree that it is possible for write. We can re-dirty the pages and
> rely on the next flush to write them out to MDS. Trond mentioned this
> before. However, the method won't work for read failures. I don't see
> how we can queue failed read pages and let someone else re-send them
> later.
> 

Let's leave the read patch as is for now. Let's try to do it only for
writes.

It is OK to have write done one way and read done another, I think.

Maybe later we can find a better solution for reads as well.

Thanks
Boaz

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-12  0:10         ` Boaz Harrosh
@ 2011-08-12  2:07           ` tao.peng
  2011-08-16  7:20           ` tao.peng
  1 sibling, 0 replies; 21+ messages in thread
From: tao.peng @ 2011-08-12  2:07 UTC (permalink / raw)
  To: bharrosh, bergwolf; +Cc: Trond.Myklebust, benny, linux-nfs

> -----Original Message-----
> From: Boaz Harrosh [mailto:bharrosh@panasas.com]
> Sent: Friday, August 12, 2011 8:10 AM
> To: Peng Tao
> Cc: Trond Myklebust; Benny Halevy; linux-nfs@vger.kernel.org; Peng, Tao
> Subject: Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
>
> On 08/11/2011 04:53 PM, Peng Tao wrote:
> >>
> >> You should see if what I say is possible for write. And then maybe
> >> something will come up also for read. They do not necessarily need to be
> >> the same. (I think)
> > I agree that it is possible for write. We can re-dirty the pages and
> > rely on the next flush to write them out to MDS. Trond mentioned this
> > before. However, the method won't work for read failures. I don't see
> > how we can queue failed read pages and let someone else re-send them
> > later.
> >
>
> Let's leave the read patch as is for now. Let's try to do it only for
> writes.
>
> It is OK to have write done one way and read done another, I think.
>
> Maybe later we can find a better solution for reads as well.

OK. I will try to rewrite the patch as you suggested.

Thanks,
Tao

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-12  0:10         ` Boaz Harrosh
  2011-08-12  2:07           ` tao.peng
@ 2011-08-16  7:20           ` tao.peng
  2011-08-16 20:19             ` Boaz Harrosh
  1 sibling, 1 reply; 21+ messages in thread
From: tao.peng @ 2011-08-16  7:20 UTC (permalink / raw)
  To: bharrosh, bergwolf; +Cc: Trond.Myklebust, benny, linux-nfs

Hi, Boaz,

> -----Original Message-----
> From: Boaz Harrosh [mailto:bharrosh@panasas.com]
> Sent: Friday, August 12, 2011 8:10 AM
> To: Peng Tao
> Cc: Trond Myklebust; Benny Halevy; linux-nfs@vger.kernel.org; Peng, Tao
> Subject: Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
>
> On 08/11/2011 04:53 PM, Peng Tao wrote:
> >>
> >> You should see if what I say is possible for write. And then maybe
> >> something will come up also for read. They do not necessarily need to be
> >> the same. (I think)
> > I agree that it is possible for write. We can re-dirty the pages and
> > rely on the next flush to write them out to MDS. Trond mentioned this
> > before. However, the method won't work for read failures. I don't see
> > how we can queue failed read pages and let someone else re-send them
> > later.
> >
>
> Let's leave the read patch as is for now. Let's try to do it only for
> writes.
>
> It is OK to have write done one way and read done another, I think.
>
> Maybe later we can find a better solution for reads as well.

I tried to rewrite the write patch to handle failures inside
mds_ops->rpc_release. However, I hit a problem w.r.t. "redirty and
rely on next flush": if the failed write is the *last flush*, we end
up relying on no one and the dirty pages are simply dropped. Do you
have any suggestions on how to handle it?

Thanks,
Tao

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-16  7:20           ` tao.peng
@ 2011-08-16 20:19             ` Boaz Harrosh
  2011-08-17  9:44               ` Peng Tao
  0 siblings, 1 reply; 21+ messages in thread
From: Boaz Harrosh @ 2011-08-16 20:19 UTC (permalink / raw)
  To: tao.peng; +Cc: bergwolf, Trond.Myklebust, benny, linux-nfs

On 08/16/2011 12:20 AM, tao.peng@emc.com wrote:
> 
> I tried to rewrite the write patch to handle failures inside
> mds_ops->rpc_release. However, I hit a problem w.r.t. "redirty and
> rely on next flush": if the failed write is the *last flush*, we end
> up relying on no one and the dirty pages are simply dropped. Do you
> have any suggestions on how to handle it?
> 
> Thanks, Tao
> 

Hi Tao.

OK, I see what you mean. That would be a problem.

I had a totally different idea. You know how today we just do:
	nfs_initiate_write()

Which is bad because it can actually deadlock, since
we are taking up the only thread that needs to service
that call. Well, I thought, with your thread-for-pnfs patch,
can it not work now? I think it is worth a try.

See if you can advance your thread-for-blocks-objects
patch to current code and inject some errors. I think
it will work this time. What do you think?

Thanks
Boaz

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-16 20:19             ` Boaz Harrosh
@ 2011-08-17  9:44               ` Peng Tao
  2011-08-22 23:28                 ` Boaz Harrosh
  0 siblings, 1 reply; 21+ messages in thread
From: Peng Tao @ 2011-08-17  9:44 UTC (permalink / raw)
  To: Boaz Harrosh; +Cc: tao.peng, Trond.Myklebust, benny, linux-nfs

Hi, Boaz,

On Wed, Aug 17, 2011 at 4:19 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 08/16/2011 12:20 AM, tao.peng@emc.com wrote:
>>
>> I tried to rewrite the write patch to handle failures inside
>> mds_ops->rpc_release. However, I hit a problem w.r.t. "redirty and
>> rely on next flush": if the failed write is the *last flush*, we end
>> up relying on no one and the dirty pages are simply dropped. Do you
>> have any suggestions on how to handle it?
>>
>> Thanks, Tao
>>
>
> Tao Hi.
>
> OK, I see what you mean. That would be a problem
>
> I had a totally different idea. You know how today we just do:
>        nfs_initiate_write()
>
> Which is bad because it can actually deadlock, since
> we are taking up the only thread that needs to service
> that call. Well, I thought, with your thread-for-pnfs patch,
> can it not work now? I think it is worth a try.
The problem w/ directly calling nfs_initiate_write() is that we may
have a pagelist longer than the server's rsize/wsize. So if the client
sends all the pages in a single READ/WRITE rpc, the MDS will reject the
READ/WRITE operation. Therefore we need to recoalesce them before
re-sending to MDS.
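
In other words, the retry has to feed the pages back through a pageio
descriptor so they are re-split on rsize/wsize boundaries, which is
what the posted read patch already does; schematically:

	struct nfs_pageio_descriptor pgio;

	/* recoalesce on MDS rsize boundaries before resending */
	nfs_pageio_init_read_mds(&pgio, data->inode);
	pgio.pg_recoalesce = 1;
	while (!list_empty(&data->pages))
		/* nfs_pageio_add_request() moves req off data->pages */
		nfs_pageio_add_request(&pgio,
				       nfs_list_entry(data->pages.next));
	nfs_pageio_complete(&pgio);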

>
> See if you can advance your thread-for-blocks-objects
> patch to current code and inject some errors. I think
> it will work this time. What do you think?
The thread-for-block-objects patch solves the default workqueue
deadlock problem, but it can't solve the too-large IO size problem
for the MDS. So I think we need all three patches to handle LD IO
failures.

Thanks,
Tao

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails
  2011-08-17  9:44               ` Peng Tao
@ 2011-08-22 23:28                 ` Boaz Harrosh
  0 siblings, 0 replies; 21+ messages in thread
From: Boaz Harrosh @ 2011-08-22 23:28 UTC (permalink / raw)
  To: Peng Tao; +Cc: tao.peng, Trond.Myklebust, benny, linux-nfs

On 08/17/2011 02:44 AM, Peng Tao wrote:
<snip>

> The problem w/ directly calling nfs_initiate_write() is that we may
> have a pagelist longer than the server's rsize/wsize. So if the client
> sends all the pages in a single READ/WRITE rpc, the MDS will reject the
> READ/WRITE operation. Therefore we need to recoalesce them before
> re-sending to MDS.
> 

Rrrr, you are absolutely right. I needed a smack on the head to
get back on track.

<snip>

Thanks
Boaz

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2011-08-22 23:29 UTC | newest]

Thread overview: 21+ messages
2011-08-07  2:53 [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Peng Tao
2011-08-07  2:53 ` [PATCH 2/5] pNFS: recoalesce when ld read " Peng Tao
2011-08-10 17:55   ` Boaz Harrosh
2011-08-11  0:01     ` Peng Tao
2011-08-07  2:53 ` [PATCH 3/5] pnfs: introduce pnfs private workqueue Peng Tao
2011-08-10 18:12   ` Boaz Harrosh
2011-08-10 18:21     ` Boaz Harrosh
2011-08-11  0:00       ` Peng Tao
2011-08-07  2:53 ` [PATCH 4/5] SUNRPC/NFS: make rpc pipe upcall generic Peng Tao
2011-08-10 15:59   ` Jim Rees
2011-08-07  2:53 ` [PATCH 5/5] pnfs: make _set_lo_fail generic Peng Tao
2011-08-10 17:52 ` [PATCH 1/5] pNFS: recoalesce when ld write pagelist fails Boaz Harrosh
2011-08-11  0:03   ` Peng Tao
2011-08-11 18:53     ` Boaz Harrosh
2011-08-11 23:53       ` Peng Tao
2011-08-12  0:10         ` Boaz Harrosh
2011-08-12  2:07           ` tao.peng
2011-08-16  7:20           ` tao.peng
2011-08-16 20:19             ` Boaz Harrosh
2011-08-17  9:44               ` Peng Tao
2011-08-22 23:28                 ` Boaz Harrosh
