All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] ceph: addr.c cleanups
@ 2020-09-16 17:38 Jeff Layton
  2020-09-16 17:38 ` [PATCH 1/5] ceph: break out writeback of incompatible snap context to separate function Jeff Layton
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Jeff Layton @ 2020-09-16 17:38 UTC (permalink / raw)
  To: ceph-devel; +Cc: ukernel, idryomov

This set cleans up and reorganizes the readpage, writepage and
write_begin code in ceph, with the resulting code (hopefully) being a
bit more clear. There should be no behavioral changes here.

This should also help prepare ceph for some coming changes to the fscache
infrastructure and fscrypt I/O path integration.

There's also a modest reduction in LoC.

Jeff Layton (5):
  ceph: break out writeback of incompatible snap context to separate
    function
  ceph: don't call ceph_update_writeable_page from page_mkwrite
  ceph: fold ceph_sync_readpages into ceph_readpage
  ceph: fold ceph_sync_writepages into writepage_nounlock
  ceph: fold ceph_update_writeable_page into ceph_write_begin

 fs/ceph/addr.c | 369 ++++++++++++++++++++++---------------------------
 1 file changed, 169 insertions(+), 200 deletions(-)

-- 
2.26.2


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/5] ceph: break out writeback of incompatible snap context to separate function
  2020-09-16 17:38 [PATCH 0/5] ceph: addr.c cleanups Jeff Layton
@ 2020-09-16 17:38 ` Jeff Layton
  2020-09-16 17:38 ` [PATCH 2/5] ceph: don't call ceph_update_writeable_page from page_mkwrite Jeff Layton
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Jeff Layton @ 2020-09-16 17:38 UTC (permalink / raw)
  To: ceph-devel; +Cc: ukernel, idryomov

When dirtying a page, we have to flush incompatible contexts. Move that
into a separate function.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/addr.c | 96 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 60 insertions(+), 36 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7b1f3dad576f..c8e98fee8164 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1298,40 +1298,34 @@ static int context_is_writeable_or_written(struct inode *inode,
 	return ret;
 }
 
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
+/**
+ * ceph_find_incompatible - find an incompatible context and return it
+ * @inode: inode associated with page
+ * @page: page being dirtied
  *
- * called with page locked.
- * return success with page locked,
- * or any failure (incl -EAGAIN) with page unlocked.
+ * Returns NULL on success, negative error code on error, and a snapc ref that should be
+ * waited on otherwise.
  */
-static int ceph_update_writeable_page(struct file *file,
-			    loff_t pos, unsigned len,
-			    struct page *page)
+static struct ceph_snap_context *
+ceph_find_incompatible(struct inode *inode, struct page *page)
 {
-	struct inode *inode = file_inode(file);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	loff_t page_off = pos & PAGE_MASK;
-	int pos_in_page = pos & ~PAGE_MASK;
-	int end_in_page = pos_in_page + len;
-	loff_t i_size;
-	int r;
-	struct ceph_snap_context *snapc, *oldest;
 
 	if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
 		dout(" page %p forced umount\n", page);
-		unlock_page(page);
-		return -EIO;
+		return ERR_PTR(-EIO);
 	}
 
-retry_locked:
-	/* writepages currently holds page lock, but if we change that later, */
-	wait_on_page_writeback(page);
+	for (;;) {
+		struct ceph_snap_context *snapc, *oldest;
+
+		wait_on_page_writeback(page);
+
+		snapc = page_snap_context(page);
+		if (!snapc || snapc == ci->i_head_snapc)
+			break;
 
-	snapc = page_snap_context(page);
-	if (snapc && snapc != ci->i_head_snapc) {
 		/*
 		 * this page is already dirty in another (older) snap
 		 * context!  is it writeable now?
@@ -1346,26 +1340,56 @@ static int ceph_update_writeable_page(struct file *file,
 			 * be writeable or written
 			 */
 			snapc = ceph_get_snap_context(snapc);
-			unlock_page(page);
-			ceph_queue_writeback(inode);
-			r = wait_event_killable(ci->i_cap_wq,
-			       context_is_writeable_or_written(inode, snapc));
-			ceph_put_snap_context(snapc);
-			if (r == -ERESTARTSYS)
-				return r;
-			return -EAGAIN;
+			return snapc;
 		}
 		ceph_put_snap_context(oldest);
 
 		/* yay, writeable, do it now (without dropping page lock) */
 		dout(" page %p snapc %p not current, but oldest\n",
 		     page, snapc);
-		if (!clear_page_dirty_for_io(page))
-			goto retry_locked;
-		r = writepage_nounlock(page, NULL);
-		if (r < 0)
+		if (clear_page_dirty_for_io(page)) {
+			int r = writepage_nounlock(page, NULL);
+			if (r < 0)
+				return ERR_PTR(r);
+		}
+	}
+	return NULL;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ *
+ * called with page locked.
+ * return success with page locked,
+ * or any failure (incl -EAGAIN) with page unlocked.
+ */
+static int ceph_update_writeable_page(struct file *file,
+			    loff_t pos, unsigned len,
+			    struct page *page)
+{
+	struct inode *inode = file_inode(file);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_snap_context *snapc;
+	loff_t page_off = pos & PAGE_MASK;
+	int pos_in_page = pos & ~PAGE_MASK;
+	int end_in_page = pos_in_page + len;
+	loff_t i_size;
+	int r;
+
+retry_locked:
+	snapc = ceph_find_incompatible(inode, page);
+	if (snapc) {
+		if (IS_ERR(snapc)) {
+			r = PTR_ERR(snapc);
 			goto fail_unlock;
-		goto retry_locked;
+		}
+		unlock_page(page);
+		ceph_queue_writeback(inode);
+		r = wait_event_killable(ci->i_cap_wq,
+					context_is_writeable_or_written(inode, snapc));
+		ceph_put_snap_context(snapc);
+		return -EAGAIN;
 	}
 
 	if (PageUptodate(page)) {
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/5] ceph: don't call ceph_update_writeable_page from page_mkwrite
  2020-09-16 17:38 [PATCH 0/5] ceph: addr.c cleanups Jeff Layton
  2020-09-16 17:38 ` [PATCH 1/5] ceph: break out writeback of incompatible snap context to separate function Jeff Layton
@ 2020-09-16 17:38 ` Jeff Layton
  2020-09-16 17:38 ` [PATCH 3/5] ceph: fold ceph_sync_readpages into ceph_readpage Jeff Layton
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Jeff Layton @ 2020-09-16 17:38 UTC (permalink / raw)
  To: ceph-devel; +Cc: ukernel, idryomov

page_mkwrite should only be called with Uptodate pages, so we should
only need to flush incompatible snap contexts.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/addr.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c8e98fee8164..02e286c30d44 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1691,6 +1691,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
 	inode_inc_iversion_raw(inode);
 
 	do {
+		struct ceph_snap_context *snapc;
+
 		lock_page(page);
 
 		if (page_mkwrite_check_truncate(page, inode) < 0) {
@@ -1699,13 +1701,26 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
 			break;
 		}
 
-		err = ceph_update_writeable_page(vma->vm_file, off, len, page);
-		if (err >= 0) {
+		snapc = ceph_find_incompatible(inode, page);
+		if (!snapc) {
 			/* success.  we'll keep the page locked. */
 			set_page_dirty(page);
 			ret = VM_FAULT_LOCKED;
+			break;
 		}
-	} while (err == -EAGAIN);
+
+		unlock_page(page);
+
+		if (IS_ERR(snapc)) {
+			ret = VM_FAULT_SIGBUS;
+			break;
+		}
+
+		ceph_queue_writeback(inode);
+		err = wait_event_killable(ci->i_cap_wq,
+				context_is_writeable_or_written(inode, snapc));
+		ceph_put_snap_context(snapc);
+	} while (err == 0);
 
 	if (ret == VM_FAULT_LOCKED ||
 	    ci->i_inline_version != CEPH_INLINE_NONE) {
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 3/5] ceph: fold ceph_sync_readpages into ceph_readpage
  2020-09-16 17:38 [PATCH 0/5] ceph: addr.c cleanups Jeff Layton
  2020-09-16 17:38 ` [PATCH 1/5] ceph: break out writeback of incompatible snap context to separate function Jeff Layton
  2020-09-16 17:38 ` [PATCH 2/5] ceph: don't call ceph_update_writeable_page from page_mkwrite Jeff Layton
@ 2020-09-16 17:38 ` Jeff Layton
  2020-09-16 17:38 ` [PATCH 4/5] ceph: fold ceph_sync_writepages into writepage_nounlock Jeff Layton
  2020-09-16 17:38 ` [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin Jeff Layton
  4 siblings, 0 replies; 16+ messages in thread
From: Jeff Layton @ 2020-09-16 17:38 UTC (permalink / raw)
  To: ceph-devel; +Cc: ukernel, idryomov

It's the only caller and this will make reorg easier.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/addr.c | 79 ++++++++++++++++----------------------------------
 1 file changed, 25 insertions(+), 54 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 02e286c30d44..19de3e9ccafe 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -182,58 +182,15 @@ static int ceph_releasepage(struct page *page, gfp_t g)
 	return !PagePrivate(page);
 }
 
-/*
- * Read some contiguous pages.  If we cross a stripe boundary, shorten
- * *plen.  Return number of bytes read, or error.
- */
-static int ceph_sync_readpages(struct ceph_fs_client *fsc,
-			       struct ceph_vino vino,
-			       struct ceph_file_layout *layout,
-			       u64 off, u64 *plen,
-			       u32 truncate_seq, u64 truncate_size,
-			       struct page **pages, int num_pages,
-			       int page_align)
-{
-	struct ceph_osd_client *osdc = &fsc->client->osdc;
-	struct ceph_osd_request *req;
-	int rc = 0;
-
-	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
-	     vino.snap, off, *plen);
-	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
-				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
-				    NULL, truncate_seq, truncate_size,
-				    false);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
-	/* it may be a short read due to an object boundary */
-	osd_req_op_extent_osd_data_pages(req, 0,
-				pages, *plen, page_align, false, false);
-
-	dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
-	     off, *plen, *plen, page_align);
-
-	rc = ceph_osdc_start_request(osdc, req, false);
-	if (!rc)
-		rc = ceph_osdc_wait_request(osdc, req);
-
-	ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
-				 req->r_end_latency, rc);
-
-	ceph_osdc_put_request(req);
-	dout("readpages result %d\n", rc);
-	return rc;
-}
-
-/*
- * read a single page, without unlocking it.
- */
+/* read a single page, without unlocking it. */
 static int ceph_do_readpage(struct file *filp, struct page *page)
 {
 	struct inode *inode = file_inode(filp);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	struct ceph_osd_request *req;
+	struct ceph_vino vino = ceph_vino(inode);
 	int err = 0;
 	u64 off = page_offset(page);
 	u64 len = PAGE_SIZE;
@@ -260,12 +217,27 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
 	if (err == 0)
 		return -EINPROGRESS;
 
-	dout("readpage inode %p file %p page %p index %lu\n",
-	     inode, filp, page, page->index);
-	err = ceph_sync_readpages(fsc, ceph_vino(inode),
-				  &ci->i_layout, off, &len,
-				  ci->i_truncate_seq, ci->i_truncate_size,
-				  &page, 1, 0);
+	dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
+	     vino.ino, vino.snap, filp, off, len, page, page->index);
+	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
+				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
+				    ci->i_truncate_seq, ci->i_truncate_size,
+				    false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+
+	err = ceph_osdc_start_request(osdc, req, false);
+	if (!err)
+		err = ceph_osdc_wait_request(osdc, req);
+
+	ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
+				 req->r_end_latency, err);
+
+	ceph_osdc_put_request(req);
+	dout("readpage result %d\n", err);
+
 	if (err == -ENOENT)
 		err = 0;
 	if (err < 0) {
@@ -283,7 +255,6 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
 
 	SetPageUptodate(page);
 	ceph_readpage_to_fscache(inode, page);
-
 out:
 	return err < 0 ? err : 0;
 }
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 4/5] ceph: fold ceph_sync_writepages into writepage_nounlock
  2020-09-16 17:38 [PATCH 0/5] ceph: addr.c cleanups Jeff Layton
                   ` (2 preceding siblings ...)
  2020-09-16 17:38 ` [PATCH 3/5] ceph: fold ceph_sync_readpages into ceph_readpage Jeff Layton
@ 2020-09-16 17:38 ` Jeff Layton
  2020-09-16 17:38 ` [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin Jeff Layton
  4 siblings, 0 replies; 16+ messages in thread
From: Jeff Layton @ 2020-09-16 17:38 UTC (permalink / raw)
  To: ceph-devel; +Cc: ukernel, idryomov

It's the only caller, and this will make it easier to refactor.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/addr.c | 93 +++++++++++++++++++-------------------------------
 1 file changed, 35 insertions(+), 58 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 19de3e9ccafe..a6c54cfc3fea 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -590,50 +590,6 @@ static u64 get_writepages_data_length(struct inode *inode,
 	return end > start ? end - start : 0;
 }
 
-/*
- * do a synchronous write on N pages
- */
-static int ceph_sync_writepages(struct ceph_fs_client *fsc,
-				struct ceph_vino vino,
-				struct ceph_file_layout *layout,
-				struct ceph_snap_context *snapc,
-				u64 off, u64 len,
-				u32 truncate_seq, u64 truncate_size,
-				struct timespec64 *mtime,
-				struct page **pages, int num_pages)
-{
-	struct ceph_osd_client *osdc = &fsc->client->osdc;
-	struct ceph_osd_request *req;
-	int rc = 0;
-	int page_align = off & ~PAGE_MASK;
-
-	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
-				    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
-				    snapc, truncate_seq, truncate_size,
-				    true);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
-	/* it may be a short write due to an object boundary */
-	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
-				false, false);
-	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
-
-	req->r_mtime = *mtime;
-	rc = ceph_osdc_start_request(osdc, req, true);
-	if (!rc)
-		rc = ceph_osdc_wait_request(osdc, req);
-
-	ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
-				  req->r_end_latency, rc);
-
-	ceph_osdc_put_request(req);
-	if (rc == 0)
-		rc = len;
-	dout("writepages result %d\n", rc);
-	return rc;
-}
-
 /*
  * Write a single page, but leave the page locked.
  *
@@ -642,20 +598,19 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc,
  */
 static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 {
-	struct inode *inode;
-	struct ceph_inode_info *ci;
-	struct ceph_fs_client *fsc;
+	struct inode *inode = page->mapping->host;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_snap_context *snapc, *oldest;
 	loff_t page_off = page_offset(page);
-	int err, len = PAGE_SIZE;
+	int err;
+	loff_t len = PAGE_SIZE;
 	struct ceph_writeback_ctl ceph_wbc;
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	struct ceph_osd_request *req;
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
-	inode = page->mapping->host;
-	ci = ceph_inode(inode);
-	fsc = ceph_inode_to_client(inode);
-
 	/* verify this is a writeable snap context */
 	snapc = page_snap_context(page);
 	if (!snapc) {
@@ -684,7 +639,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	if (ceph_wbc.i_size < page_off + len)
 		len = ceph_wbc.i_size - page_off;
 
-	dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
+	dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
 	     inode, page, page->index, page_off, len, snapc, snapc->seq);
 
 	if (atomic_long_inc_return(&fsc->writeback_count) >
@@ -692,11 +647,33 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 
 	set_page_writeback(page);
-	err = ceph_sync_writepages(fsc, ceph_vino(inode),
-				   &ci->i_layout, snapc, page_off, len,
-				   ceph_wbc.truncate_seq,
-				   ceph_wbc.truncate_size,
-				   &inode->i_mtime, &page, 1);
+	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
+				    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
+				    ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
+				    true);
+	if (IS_ERR(req)) {
+		redirty_page_for_writepage(wbc, page);
+		end_page_writeback(page);
+		return PTR_ERR(req);
+	}
+
+	/* it may be a short write due to an object boundary */
+	WARN_ON_ONCE(len > PAGE_SIZE);
+	osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+	dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
+
+	req->r_mtime = inode->i_mtime;
+	err = ceph_osdc_start_request(osdc, req, true);
+	if (!err)
+		err = ceph_osdc_wait_request(osdc, req);
+
+	ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+				  req->r_end_latency, err);
+
+	ceph_osdc_put_request(req);
+	if (err == 0)
+		err = len;
+
 	if (err < 0) {
 		struct writeback_control tmp_wbc;
 		if (!wbc)
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2020-09-16 17:38 [PATCH 0/5] ceph: addr.c cleanups Jeff Layton
                   ` (3 preceding siblings ...)
  2020-09-16 17:38 ` [PATCH 4/5] ceph: fold ceph_sync_writepages into writepage_nounlock Jeff Layton
@ 2020-09-16 17:38 ` Jeff Layton
  2020-09-16 19:16   ` Jeff Layton
  4 siblings, 1 reply; 16+ messages in thread
From: Jeff Layton @ 2020-09-16 17:38 UTC (permalink / raw)
  To: ceph-devel; +Cc: ukernel, idryomov

...and reorganize the loop for better clarity.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/addr.c | 148 ++++++++++++++++++++++---------------------------
 1 file changed, 65 insertions(+), 83 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a6c54cfc3fea..89c4d8a9a5eb 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1251,6 +1251,8 @@ static int context_is_writeable_or_written(struct inode *inode,
  * @inode: inode associated with page
  * @page: page being dirtied
  *
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
  * Returns NULL on success, negative error code on error, and a snapc ref that should be
  * waited on otherwise.
  */
@@ -1307,104 +1309,84 @@ ceph_find_incompatible(struct inode *inode, struct page *page)
 /*
  * We are only allowed to write into/dirty the page if the page is
  * clean, or already dirty within the same snap context.
- *
- * called with page locked.
- * return success with page locked,
- * or any failure (incl -EAGAIN) with page unlocked.
  */
-static int ceph_update_writeable_page(struct file *file,
-			    loff_t pos, unsigned len,
-			    struct page *page)
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
 {
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_snap_context *snapc;
-	loff_t page_off = pos & PAGE_MASK;
+	struct page *page = NULL;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	int pos_in_page = pos & ~PAGE_MASK;
-	int end_in_page = pos_in_page + len;
-	loff_t i_size;
-	int r;
+	int r = 0;
 
-retry_locked:
-	snapc = ceph_find_incompatible(inode, page);
-	if (snapc) {
-		if (IS_ERR(snapc)) {
-			r = PTR_ERR(snapc);
-			goto fail_unlock;
+	dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
+
+	for (;;) {
+		page = grab_cache_page_write_begin(mapping, index, 0);
+		if (!page) {
+			r = -ENOMEM;
+			break;
 		}
-		unlock_page(page);
-		ceph_queue_writeback(inode);
-		r = wait_event_killable(ci->i_cap_wq,
-					context_is_writeable_or_written(inode, snapc));
-		ceph_put_snap_context(snapc);
-		return -EAGAIN;
-	}
 
-	if (PageUptodate(page)) {
-		dout(" page %p already uptodate\n", page);
-		return 0;
-	}
+		snapc = ceph_find_incompatible(inode, page);
+		if (snapc) {
+			if (IS_ERR(snapc)) {
+				r = PTR_ERR(snapc);
+				break;
+			}
+			unlock_page(page);
+			ceph_queue_writeback(inode);
+			r = wait_event_killable(ci->i_cap_wq,
+						context_is_writeable_or_written(inode, snapc));
+			ceph_put_snap_context(snapc);
+			put_page(page);
+			page = NULL;
+			if (r != 0)
+				break;
+			continue;
+		}
 
-	/* full page? */
-	if (pos_in_page == 0 && len == PAGE_SIZE)
-		return 0;
+		if (PageUptodate(page)) {
+			dout(" page %p already uptodate\n", page);
+			break;
+		}
 
-	/* past end of file? */
-	i_size = i_size_read(inode);
-
-	if (page_off >= i_size ||
-	    (pos_in_page == 0 && (pos+len) >= i_size &&
-	     end_in_page - pos_in_page != PAGE_SIZE)) {
-		dout(" zeroing %p 0 - %d and %d - %d\n",
-		     page, pos_in_page, end_in_page, (int)PAGE_SIZE);
-		zero_user_segments(page,
-				   0, pos_in_page,
-				   end_in_page, PAGE_SIZE);
-		return 0;
-	}
+		/*
+		 * In some cases we don't need to read at all:
+		 * - full page write
+		 * - write that lies completely beyond EOF
+		 * - write that covers the page from start to EOF or beyond it
+		 */
+		if ((pos_in_page == 0 && len == PAGE_SIZE) ||
+		    (pos >= i_size_read(inode)) ||
+		    (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
+			zero_user_segments(page, 0, pos_in_page,
+					   pos_in_page + len, PAGE_SIZE);
+			break;
+		}
 
-	/* we need to read it. */
-	r = ceph_do_readpage(file, page);
-	if (r < 0) {
-		if (r == -EINPROGRESS)
-			return -EAGAIN;
-		goto fail_unlock;
+		/*
+		 * We need to read it. If we get back -EINPROGRESS, then the page was
+		 * handed off to fscache and it will be unlocked when the read completes.
+		 * Refind the page in that case so we can reacquire the page lock. Otherwise
+		 * we got a hard error or the read was completed synchronously.
+		 */
+		r = ceph_do_readpage(file, page);
+		if (r != -EINPROGRESS)
+			break;
 	}
-	goto retry_locked;
-fail_unlock:
-	unlock_page(page);
-	return r;
-}
 
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
- */
-static int ceph_write_begin(struct file *file, struct address_space *mapping,
-			    loff_t pos, unsigned len, unsigned flags,
-			    struct page **pagep, void **fsdata)
-{
-	struct inode *inode = file_inode(file);
-	struct page *page;
-	pgoff_t index = pos >> PAGE_SHIFT;
-	int r;
-
-	do {
-		/* get a page */
-		page = grab_cache_page_write_begin(mapping, index, 0);
-		if (!page)
-			return -ENOMEM;
-
-		dout("write_begin file %p inode %p page %p %d~%d\n", file,
-		     inode, page, (int)pos, (int)len);
-
-		r = ceph_update_writeable_page(file, pos, len, page);
-		if (r < 0)
+	if (r < 0) {
+		if (page) {
+			unlock_page(page);
 			put_page(page);
-		else
-			*pagep = page;
-	} while (r == -EAGAIN);
-
+		}
+	} else {
+		*pagep = page;
+	}
 	return r;
 }
 
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2020-09-16 17:38 ` [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin Jeff Layton
@ 2020-09-16 19:16   ` Jeff Layton
  2021-06-11 14:14     ` Andrew W Elble
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Layton @ 2020-09-16 19:16 UTC (permalink / raw)
  To: ceph-devel; +Cc: ukernel, idryomov

On Wed, 2020-09-16 at 13:38 -0400, Jeff Layton wrote:
> ...and reorganize the loop for better clarity.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/addr.c | 148 ++++++++++++++++++++++---------------------------
>  1 file changed, 65 insertions(+), 83 deletions(-)
> 
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index a6c54cfc3fea..89c4d8a9a5eb 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1251,6 +1251,8 @@ static int context_is_writeable_or_written(struct inode *inode,
>   * @inode: inode associated with page
>   * @page: page being dirtied
>   *
> + * We are only allowed to write into/dirty the page if the page is
> + * clean, or already dirty within the same snap context.
>   * Returns NULL on success, negative error code on error, and a snapc ref that should be
>   * waited on otherwise.
>   */
> @@ -1307,104 +1309,84 @@ ceph_find_incompatible(struct inode *inode, struct page *page)
>  /*
>   * We are only allowed to write into/dirty the page if the page is
>   * clean, or already dirty within the same snap context.
> - *
> - * called with page locked.
> - * return success with page locked,
> - * or any failure (incl -EAGAIN) with page unlocked.
>   */
> -static int ceph_update_writeable_page(struct file *file,
> -			    loff_t pos, unsigned len,
> -			    struct page *page)
> +static int ceph_write_begin(struct file *file, struct address_space *mapping,
> +			    loff_t pos, unsigned len, unsigned flags,
> +			    struct page **pagep, void **fsdata)
>  {
>  	struct inode *inode = file_inode(file);
>  	struct ceph_inode_info *ci = ceph_inode(inode);
>  	struct ceph_snap_context *snapc;
> -	loff_t page_off = pos & PAGE_MASK;
> +	struct page *page = NULL;
> +	pgoff_t index = pos >> PAGE_SHIFT;
>  	int pos_in_page = pos & ~PAGE_MASK;
> -	int end_in_page = pos_in_page + len;
> -	loff_t i_size;
> -	int r;
> +	int r = 0;
>  
> -retry_locked:
> -	snapc = ceph_find_incompatible(inode, page);
> -	if (snapc) {
> -		if (IS_ERR(snapc)) {
> -			r = PTR_ERR(snapc);
> -			goto fail_unlock;
> +	dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
> +
> +	for (;;) {
> +		page = grab_cache_page_write_begin(mapping, index, 0);
> +		if (!page) {
> +			r = -ENOMEM;
> +			break;
>  		}
> -		unlock_page(page);
> -		ceph_queue_writeback(inode);
> -		r = wait_event_killable(ci->i_cap_wq,
> -					context_is_writeable_or_written(inode, snapc));
> -		ceph_put_snap_context(snapc);
> -		return -EAGAIN;
> -	}
>  
> -	if (PageUptodate(page)) {
> -		dout(" page %p already uptodate\n", page);
> -		return 0;
> -	}
> +		snapc = ceph_find_incompatible(inode, page);
> +		if (snapc) {
> +			if (IS_ERR(snapc)) {
> +				r = PTR_ERR(snapc);
> +				break;
> +			}
> +			unlock_page(page);
> +			ceph_queue_writeback(inode);
> +			r = wait_event_killable(ci->i_cap_wq,
> +						context_is_writeable_or_written(inode, snapc));
> +			ceph_put_snap_context(snapc);
> +			put_page(page);
> +			page = NULL;

Now that I look a bit more, there is probably no need to hold the page
reference across this wait. I'll change that to put it after unlocking
and re-test.

> +			if (r != 0)
> +				break;
> +			continue;
> +		}
>  
> -	/* full page? */
> -	if (pos_in_page == 0 && len == PAGE_SIZE)
> -		return 0;
> +		if (PageUptodate(page)) {
> +			dout(" page %p already uptodate\n", page);
> +			break;
> +		}
>  
> -	/* past end of file? */
> -	i_size = i_size_read(inode);
> -
> -	if (page_off >= i_size ||
> -	    (pos_in_page == 0 && (pos+len) >= i_size &&
> -	     end_in_page - pos_in_page != PAGE_SIZE)) {
> -		dout(" zeroing %p 0 - %d and %d - %d\n",
> -		     page, pos_in_page, end_in_page, (int)PAGE_SIZE);
> -		zero_user_segments(page,
> -				   0, pos_in_page,
> -				   end_in_page, PAGE_SIZE);
> -		return 0;
> -	}
> +		/*
> +		 * In some cases we don't need to read at all:
> +		 * - full page write
> +		 * - write that lies completely beyond EOF
> +		 * - write that covers the the page from start to EOF or beyond it
> +		 */
> +		if ((pos_in_page == 0 && len == PAGE_SIZE) ||
> +		    (pos >= i_size_read(inode)) ||
> +		    (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
> +			zero_user_segments(page, 0, pos_in_page,
> +					   pos_in_page + len, PAGE_SIZE);
> +			break;
> +		}
>  
> -	/* we need to read it. */
> -	r = ceph_do_readpage(file, page);
> -	if (r < 0) {
> -		if (r == -EINPROGRESS)
> -			return -EAGAIN;
> -		goto fail_unlock;
> +		/*
> +		 * We need to read it. If we get back -EINPROGRESS, then the page was
> +		 * handed off to fscache and it will be unlocked when the read completes.
> +		 * Refind the page in that case so we can reacquire the page lock. Otherwise
> +		 * we got a hard error or the read was completed synchronously.
> +		 */
> +		r = ceph_do_readpage(file, page);
> +		if (r != -EINPROGRESS)
> +			break;
>  	}
> -	goto retry_locked;
> -fail_unlock:
> -	unlock_page(page);
> -	return r;
> -}
>  
> -/*
> - * We are only allowed to write into/dirty the page if the page is
> - * clean, or already dirty within the same snap context.
> - */
> -static int ceph_write_begin(struct file *file, struct address_space *mapping,
> -			    loff_t pos, unsigned len, unsigned flags,
> -			    struct page **pagep, void **fsdata)
> -{
> -	struct inode *inode = file_inode(file);
> -	struct page *page;
> -	pgoff_t index = pos >> PAGE_SHIFT;
> -	int r;
> -
> -	do {
> -		/* get a page */
> -		page = grab_cache_page_write_begin(mapping, index, 0);
> -		if (!page)
> -			return -ENOMEM;
> -
> -		dout("write_begin file %p inode %p page %p %d~%d\n", file,
> -		     inode, page, (int)pos, (int)len);
> -
> -		r = ceph_update_writeable_page(file, pos, len, page);
> -		if (r < 0)
> +	if (r < 0) {
> +		if (page) {
> +			unlock_page(page);
>  			put_page(page);
> -		else
> -			*pagep = page;
> -	} while (r == -EAGAIN);
> -
> +		}
> +	} else {
> +		*pagep = page;
> +	}
>  	return r;
>  }
>  

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2020-09-16 19:16   ` Jeff Layton
@ 2021-06-11 14:14     ` Andrew W Elble
  2021-06-11 14:52       ` Jeff Layton
  2021-06-11 15:11       ` David Howells
  0 siblings, 2 replies; 16+ messages in thread
From: Andrew W Elble @ 2021-06-11 14:14 UTC (permalink / raw)
  To: Jeff Layton; +Cc: ceph-devel, pfmeec


We're seeing file corruption while running 5.10, bisected to 1cc1699070bd:

>> +static int ceph_write_begin(struct file *file, struct address_space *mapping,
>> +			    loff_t pos, unsigned len, unsigned flags,
>> +			    struct page **pagep, void **fsdata)

<snip>

>> +		/*
>> +		 * In some cases we don't need to read at all:
>> +		 * - full page write
>> +		 * - write that lies completely beyond EOF
>> +		 * - write that covers the page from start to EOF or beyond it
>> +		 */
>> +		if ((pos_in_page == 0 && len == PAGE_SIZE) ||
>> +		    (pos >= i_size_read(inode)) ||

Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ?

Seems like fs/netfs/read_helper.c currently has the same issue?

>> +		    (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
>> +			zero_user_segments(page, 0, pos_in_page,
>> +					   pos_in_page + len, PAGE_SIZE);
>> +			break;
>> +		}

-- 
Andrew W. Elble
Infrastructure Engineer
Information and Technology Services
Rochester Institute of Technology
tel: (585)-475-2411 | aweits@rit.edu
PGP: BFAD 8461 4CCF DC95 DA2C B0EB 965B 082E 863E C912

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2021-06-11 14:14     ` Andrew W Elble
@ 2021-06-11 14:52       ` Jeff Layton
  2021-06-11 15:11       ` David Howells
  1 sibling, 0 replies; 16+ messages in thread
From: Jeff Layton @ 2021-06-11 14:52 UTC (permalink / raw)
  To: Andrew W Elble; +Cc: ceph-devel, pfmeec, David Howells, linux-cachefs

On Fri, 2021-06-11 at 10:14 -0400, Andrew W Elble wrote:
> We're seeing file corruption while running 5.10, bisected to 1cc1699070bd:
> 
> > > +static int ceph_write_begin(struct file *file, struct address_space *mapping,
> > > +			    loff_t pos, unsigned len, unsigned flags,
> > > +			    struct page **pagep, void **fsdata)
> 
> <snip>
> 
> > > +		/*
> > > +		 * In some cases we don't need to read at all:
> > > +		 * - full page write
> > > +		 * - write that lies completely beyond EOF
> > > +		 * - write that covers the page from start to EOF or beyond it
> > > +		 */
> > > +		if ((pos_in_page == 0 && len == PAGE_SIZE) ||
> > > +		    (pos >= i_size_read(inode)) ||
> 
> Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ?
> 
> Seems like fs/netfs/read_helper.c currently has the same issue?
> 

Yeah...I think you may be right. Have you tried a patch that does that
and does it fix the issue?

Also, do you happen to have a testcase that we could stick in xfstests
for this? If not, we can probably write one, but if you already have one
that'd be great.



> > > +		    (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
> > > +			zero_user_segments(page, 0, pos_in_page,
> > > +					   pos_in_page + len, PAGE_SIZE);
> > > +			break;
> > > +		}
> 

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2021-06-11 14:14     ` Andrew W Elble
  2021-06-11 14:52       ` Jeff Layton
@ 2021-06-11 15:11       ` David Howells
  2021-06-11 15:20         ` Matthew Wilcox
  2021-06-11 15:35         ` David Howells
  1 sibling, 2 replies; 16+ messages in thread
From: David Howells @ 2021-06-11 15:11 UTC (permalink / raw)
  To: Jeff Layton, willy
  Cc: dhowells, Andrew W Elble, ceph-devel, pfmeec, linux-cachefs

Jeff Layton <jlayton@kernel.org> wrote:

> On Fri, 2021-06-11 at 10:14 -0400, Andrew W Elble wrote:
> > We're seeing file corruption while running 5.10, bisected to 1cc1699070bd:
> > 
> > > > +static int ceph_write_begin(struct file *file, struct address_space *mapping,
> > > > +			    loff_t pos, unsigned len, unsigned flags,
> > > > +			    struct page **pagep, void **fsdata)
> > 
> > <snip>
> > 
> > > > +		/*
> > > > +		 * In some cases we don't need to read at all:
> > > > +		 * - full page write
> > > > +		 * - write that lies completely beyond EOF
> > > > +		 * - write that covers the page from start to EOF or beyond it
> > > > +		 */
> > > > +		if ((pos_in_page == 0 && len == PAGE_SIZE) ||
> > > > +		    (pos >= i_size_read(inode)) ||
> > 
> > Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ?
> > 
> > Seems like fs/netfs/read_helper.c currently has the same issue?

That's not quite right either.  page may be larger than PAGE_SIZE if
grab_cache_page_write_begin() returns a THP (if that's possible).

Maybe:

	(pos & thp_size(page) - 1) >= i_size_read(inode)

Really, we want something like thp_pos().  Maybe Willy has something like that
up his sleeve.

In fact, in netfs_write_begin(), index and pos_in_page should be calculated
after grab_cache_page_write_begin() has been called, just in case the new page
extends before the page containing the requested position.

David


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2021-06-11 15:11       ` David Howells
@ 2021-06-11 15:20         ` Matthew Wilcox
  2021-06-11 15:38           ` Jeff Layton
  2021-06-11 15:35         ` David Howells
  1 sibling, 1 reply; 16+ messages in thread
From: Matthew Wilcox @ 2021-06-11 15:20 UTC (permalink / raw)
  To: David Howells
  Cc: Jeff Layton, Andrew W Elble, ceph-devel, pfmeec, linux-cachefs

On Fri, Jun 11, 2021 at 04:11:49PM +0100, David Howells wrote:
> Jeff Layton <jlayton@kernel.org> wrote:
> 
> > On Fri, 2021-06-11 at 10:14 -0400, Andrew W Elble wrote:
> > > We're seeing file corruption while running 5.10, bisected to 1cc1699070bd:
> > > 
> > > > > +static int ceph_write_begin(struct file *file, struct address_space *mapping,
> > > > > +			    loff_t pos, unsigned len, unsigned flags,
> > > > > +			    struct page **pagep, void **fsdata)
> > > 
> > > <snip>
> > > 
> > > > > +		/*
> > > > > +		 * In some cases we don't need to read at all:
> > > > > +		 * - full page write
> > > > > +		 * - write that lies completely beyond EOF
> > > > > +		 * - write that covers the page from start to EOF or beyond it
> > > > > +		 */
> > > > > +		if ((pos_in_page == 0 && len == PAGE_SIZE) ||
> > > > > +		    (pos >= i_size_read(inode)) ||
> > > 
> > > Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ?
> > > 
> > > Seems like fs/netfs/read_helper.c currently has the same issue?

How does (pos & PAGE_MASK) >= i_size_read() make sense?  That could only
be true if the file is less than a page in size, whereas it should
always be true if the write starts outside the current i_size.

> That's not quite right either.  page may be larger than PAGE_SIZE if
> grab_cache_page_write_begin() returns a THP (if that's possible).
> 
> Maybe:
> 
> 	(pos & thp_size(page) - 1) >= i_size_read(inode)
> 
> Really, we want something like thp_pos().  Maybe Willy has something like that
> up his sleeve.
> 
> In fact, in netfs_write_begin(), index and pos_in_page should be calculated
> after grab_cache_page_write_begin() has been called, just in case the new page
> extends before the page containing the requested position.

Yes.  I do that kind of thing in iomap.  What you're doing there looks
a bit like offset_in_folio(), but I don't understand the problem with
just checking pos against i_size directly.

https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/folio
contains a number of commits that start 'iomap:' which may be of interest.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2021-06-11 15:11       ` David Howells
  2021-06-11 15:20         ` Matthew Wilcox
@ 2021-06-11 15:35         ` David Howells
  2021-06-11 15:59           ` Andrew W Elble
                             ` (2 more replies)
  1 sibling, 3 replies; 16+ messages in thread
From: David Howells @ 2021-06-11 15:35 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: dhowells, Jeff Layton, Andrew W Elble, ceph-devel, pfmeec, linux-cachefs

Matthew Wilcox <willy@infradead.org> wrote:

> Yes.  I do that kind of thing in iomap.  What you're doing there looks
> a bit like offset_in_folio(), but I don't understand the problem with
> just checking pos against i_size directly.

pos can be in the middle of a page.  If i_size is at, say, the same point in
the middle of that page and the page isn't currently in the cache, then we'll
just clear the entire page and not read the bottom part of it (ie. the bit
prior to the EOF).

It's odd, though, that xfstests doesn't catch this.

David


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2021-06-11 15:20         ` Matthew Wilcox
@ 2021-06-11 15:38           ` Jeff Layton
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Layton @ 2021-06-11 15:38 UTC (permalink / raw)
  To: Matthew Wilcox, David Howells
  Cc: Andrew W Elble, ceph-devel, pfmeec, linux-cachefs

On Fri, 2021-06-11 at 16:20 +0100, Matthew Wilcox wrote:
> On Fri, Jun 11, 2021 at 04:11:49PM +0100, David Howells wrote:
> > Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > > On Fri, 2021-06-11 at 10:14 -0400, Andrew W Elble wrote:
> > > > We're seeing file corruption while running 5.10, bisected to 1cc1699070bd:
> > > > 
> > > > > > +static int ceph_write_begin(struct file *file, struct address_space *mapping,
> > > > > > +			    loff_t pos, unsigned len, unsigned flags,
> > > > > > +			    struct page **pagep, void **fsdata)
> > > > 
> > > > <snip>
> > > > 
> > > > > > +		/*
> > > > > > +		 * In some cases we don't need to read at all:
> > > > > > +		 * - full page write
> > > > > > +		 * - write that lies completely beyond EOF
> > > > > > +		 * - write that covers the page from start to EOF or beyond it
> > > > > > +		 */
> > > > > > +		if ((pos_in_page == 0 && len == PAGE_SIZE) ||
> > > > > > +		    (pos >= i_size_read(inode)) ||
> > > > 
> > > > Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ?
> > > > 
> > > > Seems like fs/netfs/read_helper.c currently has the same issue?
> 
> How does (pos & PAGE_MASK) >= i_size_read() make sense?  That could only
> be true if the file is less than a page in size, whereas it should
> always be true if the write starts outside the current i_size.
> 

Yeah, I guess what we really need is to round the i_size up to the start
of the next page and then compare whether pos is beyond that.

> > That's not quite right either.  page may be larger than PAGE_SIZE if
> > grab_cache_page_write_begin() returns a THP (if that's possible).
> > 
> > Maybe:
> > 
> > 	(pos & thp_size(page) - 1) >= i_size_read(inode)
> > 
> > Really, we want something like thp_pos().  Maybe Willy has something like that
> > up his sleeve.
> > 
> > In fact, in netfs_write_begin(), index and pos_in_page should be calculated
> > after grab_cache_page_write_begin() has been called, just in case the new page
> > extends before the page containing the requested position.
> 
> Yes.  I do that kind of thing in iomap.  What you're doing there looks
> a bit like offset_in_folio(), but I don't understand the problem with
> just checking pos against i_size directly.
> 

Suppose the i_size is 3 and you do a 1 byte write at offset 5. You're
beyond the EOF, so the condition would return true, but you still need
to read in the start of the page in that case.

I think we probably need a testcase that does this in xfstests:

open file
write 3 bytes at start
close
unmount or drop pagecache in some way
then write 1 byte at offset 5
see whether the resulting contents match the expected ones

>
https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/folio
> contains a number of commits that start 'iomap:' which may be of interest.

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2021-06-11 15:35         ` David Howells
@ 2021-06-11 15:59           ` Andrew W Elble
  2021-06-11 17:56           ` Matthew Wilcox
  2021-06-11 21:47           ` David Howells
  2 siblings, 0 replies; 16+ messages in thread
From: Andrew W Elble @ 2021-06-11 15:59 UTC (permalink / raw)
  To: David Howells
  Cc: Matthew Wilcox, Jeff Layton, ceph-devel, pfmeec, linux-cachefs

David Howells <dhowells@redhat.com> writes:

> Matthew Wilcox <willy@infradead.org> wrote:
>
>> Yes.  I do that kind of thing in iomap.  What you're doing there looks
>> a bit like offset_in_folio(), but I don't understand the problem with
>> just checking pos against i_size directly.
>
> pos can be in the middle of a page.  If i_size is at, say, the same point in
> the middle of that page and the page isn't currently in the cache, then we'll
> just clear the entire page and not read the bottom part of it (ie. the bit
> prior to the EOF).
>
> It's odd, though, that xfstests doesn't catch this.

Some more details/information:

1.) The test is a really simple piece of code that essentially writes
    an incrementing number to a file on 'node a', while a 'tail -f'
    is run and exited multiple times on 'node b'. We haven't had much
    luck in this causing the problem when done on a single node
2.) ((pos & PAGE_MASK) >= i_size_read(inode)) ||' merely reflects the
    logic that was in place prior to 1cc1699070bd. Patching
    fs/ceph/addr.c with this does seem to eliminate the corruption.
    (We'll test with the patch Jeff posted here shortly)
3.) After finding all of this, I went upstream to look at fs/ceph/addr.c
    and discovered the move to leveraging fs/netfs/read_helper.c - we've
    only just compiled against git master and replicated the corruption
    there.


-- 
Andrew W. Elble
Infrastructure Engineer
Information and Technology Services
Rochester Institute of Technology
tel: (585)-475-2411 | aweits@rit.edu
PGP: BFAD 8461 4CCF DC95 DA2C B0EB 965B 082E 863E C912

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2021-06-11 15:35         ` David Howells
  2021-06-11 15:59           ` Andrew W Elble
@ 2021-06-11 17:56           ` Matthew Wilcox
  2021-06-11 21:47           ` David Howells
  2 siblings, 0 replies; 16+ messages in thread
From: Matthew Wilcox @ 2021-06-11 17:56 UTC (permalink / raw)
  To: David Howells
  Cc: Jeff Layton, Andrew W Elble, ceph-devel, pfmeec, linux-cachefs

On Fri, Jun 11, 2021 at 04:35:25PM +0100, David Howells wrote:
> Matthew Wilcox <willy@infradead.org> wrote:
> 
> > Yes.  I do that kind of thing in iomap.  What you're doing there looks
> > a bit like offset_in_folio(), but I don't understand the problem with
> > just checking pos against i_size directly.
> 
> pos can be in the middle of a page.  If i_size is at, say, the same point in
> the middle of that page and the page isn't currently in the cache, then we'll
> just clear the entire page and not read the bottom part of it (ie. the bit
> prior to the EOF).

The comment says that's expected, though.  I think the comment is wrong;
consider the case where the page size is 4kB, file is 5kB long and
we do a 1kB write at offset=6kB.  The existing CEPH code (prior to
netfs) will zero out 4-6KB and 7-8kB, which is wrong.

Anyway, looking at netfs_write_begin(), it's wrong too, in a bunch of
ways.  You don't need to zero out the part of the page you're going to
copy into.  And the condition is overly complicated which makes it
hard to know what's going on.  Setting aside the is_cache_enabled part,
I think you want:

	if (offset == 0 && len >= thp_size(page))
		goto have_page_no_wait;
	if (page_offset(page) >= size) {
		zero_user_segments(page, 0, offset,
				   offset + len, thp_size(page));
		goto have_page_no_wait;
	}
	... read the interesting chunks of page ...


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin
  2021-06-11 15:35         ` David Howells
  2021-06-11 15:59           ` Andrew W Elble
  2021-06-11 17:56           ` Matthew Wilcox
@ 2021-06-11 21:47           ` David Howells
  2 siblings, 0 replies; 16+ messages in thread
From: David Howells @ 2021-06-11 21:47 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: dhowells, Jeff Layton, Andrew W Elble, ceph-devel, pfmeec, linux-cachefs

Matthew Wilcox <willy@infradead.org> wrote:

> Anyway, looking at netfs_write_begin(), it's wrong too, in a bunch of
> ways.  You don't need to zero out the part of the page you're going to
> copy into.

Zeroing it out isn't 'wrong', per se, just inefficient.  Fixing that needs the
filesystem to deal with it if the copy fails.

> And the condition is overly complicated which makes it
> hard to know what's going on.  Setting aside the is_cache_enabled part,
> I think you want:
> 
> 	if (offset == 0 && len >= thp_size(page))
> 		goto have_page_no_wait;
> 	if (page_offset(page) >= size) {
> 		zero_user_segments(page, 0, offset,
> 				   offset + len, thp_size(page));

There's a third case too: where the write starts at the beginning of the page
and goes to/straddles the EOF - but doesn't continue to the end of the page.

You also didn't set PG_uptodate - presumably deliberately because there's a
hole potentially containing random rubbish in the middle.

> 		goto have_page_no_wait;
> 	}
> 	... read the interesting chunks of page ...

David


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2021-06-11 21:47 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-16 17:38 [PATCH 0/5] ceph: addr.c cleanups Jeff Layton
2020-09-16 17:38 ` [PATCH 1/5] ceph: break out writeback of incompatible snap context to separate function Jeff Layton
2020-09-16 17:38 ` [PATCH 2/5] ceph: don't call ceph_update_writeable_page from page_mkwrite Jeff Layton
2020-09-16 17:38 ` [PATCH 3/5] ceph: fold ceph_sync_readpages into ceph_readpage Jeff Layton
2020-09-16 17:38 ` [PATCH 4/5] ceph: fold ceph_sync_writepages into writepage_nounlock Jeff Layton
2020-09-16 17:38 ` [PATCH 5/5] ceph: fold ceph_update_writeable_page into ceph_write_begin Jeff Layton
2020-09-16 19:16   ` Jeff Layton
2021-06-11 14:14     ` Andrew W Elble
2021-06-11 14:52       ` Jeff Layton
2021-06-11 15:11       ` David Howells
2021-06-11 15:20         ` Matthew Wilcox
2021-06-11 15:38           ` Jeff Layton
2021-06-11 15:35         ` David Howells
2021-06-11 15:59           ` Andrew W Elble
2021-06-11 17:56           ` Matthew Wilcox
2021-06-11 21:47           ` David Howells

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.