All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jeff Layton <jlayton@kernel.org>
To: ceph-devel@vger.kernel.org, linux-fscrypt@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org, idryomov@gmail.com
Subject: [RFC PATCH v10 43/48] ceph: add read/modify/write to ceph_sync_write
Date: Tue, 11 Jan 2022 14:16:03 -0500	[thread overview]
Message-ID: <20220111191608.88762-44-jlayton@kernel.org> (raw)
In-Reply-To: <20220111191608.88762-1-jlayton@kernel.org>

When doing a synchronous write on an encrypted inode, we have no
guarantee that the caller is writing crypto block-aligned data. When
the write is not block-aligned, we must do a read/modify/write cycle.

First, expand the range to cover complete blocks. If we had to change
the original pos or length, issue a read to fill the first and/or last
pages, and fetch the version of the object from the result.

We then copy data into the pages as usual, encrypt the result and issue
a write prefixed by an assertion that the version hasn't changed. If it has
changed then we restart the whole thing again.

If there is no object at that position in the file (-ENOENT), we prefix
the write with an exclusive create of the object instead.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/file.c | 260 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 228 insertions(+), 32 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a6305ad5519b..41766b2012e9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1468,18 +1468,16 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_vino vino;
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
 	struct ceph_osd_request *req;
 	struct page **pages;
 	u64 len;
 	int num_pages;
 	int written = 0;
-	int flags;
 	int ret;
 	bool check_caps = false;
 	struct timespec64 mtime = current_time(inode);
 	size_t count = iov_iter_count(from);
-	size_t off;
 
 	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
 		return -EROFS;
@@ -1499,70 +1497,267 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 	if (ret < 0)
 		dout("invalidate_inode_pages2_range returned %d\n", ret);
 
-	flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
-
 	while ((len = iov_iter_count(from)) > 0) {
 		size_t left;
 		int n;
+		u64 write_pos = pos;
+		u64 write_len = len;
+		u64 objnum, objoff;
+		u32 xlen;
+		u64 assert_ver;
+		bool rmw;
+		bool first, last;
+		struct iov_iter saved_iter = *from;
+		size_t off;
+
+		fscrypt_adjust_off_and_len(inode, &write_pos, &write_len);
+
+		/* clamp the length to the end of first object */
+		ceph_calc_file_object_mapping(&ci->i_layout, write_pos,
+						write_len, &objnum, &objoff,
+						&xlen);
+		write_len = xlen;
+
+		/* adjust len downward if it goes beyond current object */
+		if (pos + len > write_pos + write_len)
+			len = write_pos + write_len - pos;
 
-		vino = ceph_vino(inode);
-		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
-					    vino, pos, &len, 0, 1,
-					    CEPH_OSD_OP_WRITE, flags, snapc,
-					    ci->i_truncate_seq,
-					    ci->i_truncate_size,
-					    false);
-		if (IS_ERR(req)) {
-			ret = PTR_ERR(req);
-			break;
-		}
+		/*
+		 * If we had to adjust the length or position to align with a
+		 * crypto block, then we must do a read/modify/write cycle. We
+		 * use a version assertion to redrive the thing if something
+		 * changes in between.
+		 */
+		first = pos != write_pos;
+		last = (pos + len) != (write_pos + write_len);
+		rmw = first || last;
 
-		/* FIXME: express in FSCRYPT_BLOCK_SIZE units */
-		num_pages = calc_pages_for(pos, len);
+		/*
+		 * The data is emplaced into the page as it would be if it were in
+		 * an array of pagecache pages.
+		 */
+		num_pages = calc_pages_for(write_pos, write_len);
 		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
-			goto out;
+			break;
+		}
+
+		/* Do we need to preload the pages? */
+		if (rmw) {
+			u64 first_pos = write_pos;
+			u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE;
+			u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE;
+
+			/* We should only need to do this for encrypted inodes */
+			WARN_ON_ONCE(!IS_ENCRYPTED(inode));
+
+			/* No need to do two reads if first and last blocks are same */
+			if (first && last_pos == first_pos)
+				last = false;
+
+			/*
+			 * Allocate a read request for one or two extents, depending
+			 * on how the request was aligned.
+			 */
+			req = ceph_osdc_new_request(osdc, &ci->i_layout,
+					ci->i_vino, first ? first_pos : last_pos,
+					&read_len, 0, (first && last) ? 2 : 1,
+					CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+					NULL, ci->i_truncate_seq,
+					ci->i_truncate_size, false);
+			if (IS_ERR(req)) {
+				ceph_release_page_vector(pages, num_pages);
+				ret = PTR_ERR(req);
+				break;
+			}
+
+			/* Something is misaligned! */
+			if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) {
+				ret = -EIO;
+				break;
+			}
+
+			/* Add extent for first block? */
+			if (first)
+				osd_req_op_extent_osd_data_pages(req, 0, pages,
+							 CEPH_FSCRYPT_BLOCK_SIZE,
+							 offset_in_page(first_pos),
+							 false, false);
+
+			/* Add extent for last block */
+			if (last) {
+				/* Init the other extent if first extent has been used */
+				if (first) {
+					osd_req_op_extent_init(req, 1, CEPH_OSD_OP_READ,
+							last_pos, CEPH_FSCRYPT_BLOCK_SIZE,
+							ci->i_truncate_size,
+							ci->i_truncate_seq);
+				}
+
+				osd_req_op_extent_osd_data_pages(req, first ? 1 : 0,
+							&pages[num_pages - 1],
+							CEPH_FSCRYPT_BLOCK_SIZE,
+							offset_in_page(last_pos),
+							false, false);
+			}
+
+			ret = ceph_osdc_start_request(osdc, req, false);
+			if (!ret)
+				ret = ceph_osdc_wait_request(osdc, req);
+
+			/* FIXME: length field is wrong if there are 2 extents */
+			ceph_update_read_metrics(&fsc->mdsc->metric,
+						 req->r_start_latency,
+						 req->r_end_latency,
+						 read_len, ret);
+
+			/* Ok if object is not already present */
+			if (ret == -ENOENT) {
+				/*
+				 * If there is no object, then we can't assert
+				 * on its version. Set it to 0, and we'll use an
+				 * exclusive create instead.
+				 */
+				ceph_osdc_put_request(req);
+				assert_ver = 0;
+				ret = 0;
+
+				/*
+				 * zero out the soon-to-be uncopied parts of the
+				 * first and last pages.
+				 */
+				if (first)
+					zero_user_segment(pages[0], 0,
+							  offset_in_page(first_pos));
+				if (last)
+					zero_user_segment(pages[num_pages - 1],
+							  offset_in_page(last_pos),
+							  PAGE_SIZE);
+			} else {
+				/* Grab assert version. It must be non-zero. */
+				assert_ver = req->r_version;
+				WARN_ON_ONCE(ret > 0 && assert_ver == 0);
+
+				ceph_osdc_put_request(req);
+				if (ret < 0) {
+					ceph_release_page_vector(pages, num_pages);
+					break;
+				}
+
+				if (first) {
+					ret = ceph_fscrypt_decrypt_block_inplace(inode,
+							pages[0],
+							CEPH_FSCRYPT_BLOCK_SIZE,
+							offset_in_page(first_pos),
+							first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
+					if (ret < 0)
+						break;
+				}
+				if (last) {
+					ret = ceph_fscrypt_decrypt_block_inplace(inode,
+							pages[num_pages - 1],
+							CEPH_FSCRYPT_BLOCK_SIZE,
+							offset_in_page(last_pos),
+							last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
+					if (ret < 0)
+						break;
+				}
+			}
 		}
 
 		left = len;
-		off = pos & ~CEPH_FSCRYPT_BLOCK_MASK;
+		off = offset_in_page(pos);
 		for (n = 0; n < num_pages; n++) {
-			size_t plen = min_t(size_t, left, CEPH_FSCRYPT_BLOCK_SIZE - off);
+			size_t plen = min_t(size_t, left, PAGE_SIZE - off);
+
+			/* copy the data */
 			ret = copy_page_from_iter(pages[n], off, plen, from);
-			off = 0;
 			if (ret != plen) {
 				ret = -EFAULT;
 				break;
 			}
+			off = 0;
 			left -= ret;
 		}
-
 		if (ret < 0) {
+			dout("sync_write write failed with %d\n", ret);
 			ceph_release_page_vector(pages, num_pages);
-			goto out;
+			break;
 		}
 
-		req->r_inode = inode;
+		if (IS_ENCRYPTED(inode)) {
+			ret = ceph_fscrypt_encrypt_pages(inode, pages,
+							 write_pos, write_len,
+							 GFP_KERNEL);
+			if (ret < 0) {
+				dout("encryption failed with %d\n", ret);
+				break;
+			}
+		}
 
-		osd_req_op_extent_osd_data_pages(req, 0, pages, len,
-						 pos & ~CEPH_FSCRYPT_BLOCK_MASK,
-						 false, true);
+		req = ceph_osdc_new_request(osdc, &ci->i_layout,
+					    ci->i_vino, write_pos, &write_len,
+					    rmw ? 1 : 0, rmw ? 2 : 1,
+					    CEPH_OSD_OP_WRITE,
+					    CEPH_OSD_FLAG_WRITE,
+					    snapc, ci->i_truncate_seq,
+					    ci->i_truncate_size, false);
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			ceph_release_page_vector(pages, num_pages);
+			break;
+		}
 
+		dout("sync_write write op %lld~%llu\n", write_pos, write_len);
+		osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len,
+						 offset_in_page(write_pos), false,
+						 true);
+		req->r_inode = inode;
 		req->r_mtime = mtime;
-		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+
+		/* Set up the assertion */
+		if (rmw) {
+			/*
+			 * Set up the assertion. If we don't have a version number,
+			 * then the object doesn't exist yet. Use an exclusive create
+			 * instead of a version assertion in that case.
+			 */
+			if (assert_ver) {
+				osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0);
+				req->r_ops[0].assert_ver.ver = assert_ver;
+			} else {
+				osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE,
+						CEPH_OSD_OP_FLAG_EXCL);
+			}
+		}
+
+		ret = ceph_osdc_start_request(osdc, req, false);
 		if (!ret)
-			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+			ret = ceph_osdc_wait_request(osdc, req);
 
 		ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
 					  req->r_end_latency, len, ret);
-out:
 		ceph_osdc_put_request(req);
 		if (ret != 0) {
+			dout("sync_write osd write returned %d\n", ret);
+			/* Version changed! Must re-do the rmw cycle */
+			if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) ||
+			     (!assert_ver && ret == -EEXIST)) {
+				/* We should only ever see this on a rmw */
+				WARN_ON_ONCE(!rmw);
+
+				/* The version should never go backward */
+				WARN_ON_ONCE(ret == -EOVERFLOW);
+
+				*from = saved_iter;
+
+				/* FIXME: limit number of times we loop? */
+				continue;
+			}
 			ceph_set_error_write(ci);
 			break;
 		}
-
 		ceph_clear_error_write(ci);
 		pos += len;
 		written += len;
@@ -1580,6 +1775,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		ret = written;
 		iocb->ki_pos = pos;
 	}
+	dout("sync_write returning %d\n", ret);
 	return ret;
 }
 
-- 
2.34.1


  parent reply	other threads:[~2022-01-11 19:17 UTC|newest]

Thread overview: 84+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-11 19:15 [RFC PATCH v10 00/48] ceph+fscrypt: full support Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 01/48] vfs: export new_inode_pseudo Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 02/48] fscrypt: export fscrypt_base64url_encode and fscrypt_base64url_decode Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 03/48] fscrypt: export fscrypt_fname_encrypt and fscrypt_fname_encrypted_size Jeff Layton
2022-01-27  1:58   ` Eric Biggers
2022-01-11 19:15 ` [RFC PATCH v10 04/48] fscrypt: add fscrypt_context_for_new_inode Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 05/48] ceph: preallocate inode for ops that may create one Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 06/48] ceph: crypto context handling for ceph Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 07/48] ceph: parse new fscrypt_auth and fscrypt_file fields in inode traces Jeff Layton
2022-02-17  8:25   ` Xiubo Li
2022-02-17 11:39     ` Jeff Layton
2022-02-18  1:09       ` Xiubo Li
2022-01-11 19:15 ` [RFC PATCH v10 08/48] ceph: add fscrypt_* handling to caps.c Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 09/48] ceph: add ability to set fscrypt_auth via setattr Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 10/48] ceph: implement -o test_dummy_encryption mount option Jeff Layton
2022-02-11 13:50   ` Luís Henriques
2022-02-11 14:52     ` Jeff Layton
2022-02-14  9:29       ` Luís Henriques
2022-01-11 19:15 ` [RFC PATCH v10 11/48] ceph: decode alternate_name in lease info Jeff Layton
2022-03-01 10:57   ` Xiubo Li
2022-03-01 11:18     ` Xiubo Li
2022-03-01 13:10     ` Jeff Layton
2022-03-01 13:51       ` Xiubo Li
2022-03-01 13:57         ` Jeff Layton
2022-03-01 14:07           ` Xiubo Li
2022-03-01 14:14             ` Jeff Layton
2022-03-01 14:30               ` Xiubo Li
2022-01-11 19:15 ` [RFC PATCH v10 12/48] ceph: add fscrypt ioctls Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 13/48] ceph: make ceph_msdc_build_path use ref-walk Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 14/48] ceph: add encrypted fname handling to ceph_mdsc_build_path Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 15/48] ceph: send altname in MClientRequest Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 16/48] ceph: encode encrypted name in dentry release Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 17/48] ceph: properly set DCACHE_NOKEY_NAME flag in lookup Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 18/48] ceph: make d_revalidate call fscrypt revalidator for encrypted dentries Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 19/48] ceph: add helpers for converting names for userland presentation Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 20/48] ceph: add fscrypt support to ceph_fill_trace Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 21/48] ceph: add support to readdir for encrypted filenames Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 22/48] ceph: create symlinks with encrypted and base64-encoded targets Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 23/48] ceph: make ceph_get_name decrypt filenames Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 24/48] ceph: add a new ceph.fscrypt.auth vxattr Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 25/48] ceph: add some fscrypt guardrails Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 26/48] ceph: don't allow changing layout on encrypted files/directories Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 27/48] libceph: add CEPH_OSD_OP_ASSERT_VER support Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 28/48] ceph: size handling for encrypted inodes in cap updates Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 29/48] ceph: fscrypt_file field handling in MClientRequest messages Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 30/48] ceph: get file size from fscrypt_file when present in inode traces Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 31/48] ceph: handle fscrypt fields in cap messages from MDS Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 32/48] ceph: add __ceph_get_caps helper support Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 33/48] ceph: add __ceph_sync_read " Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 34/48] ceph: add object version support for sync read Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 35/48] ceph: add infrastructure for file encryption and decryption Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 36/48] ceph: add truncate size handling support for fscrypt Jeff Layton
2022-01-12  8:41   ` Xiubo Li
2022-01-11 19:15 ` [RFC PATCH v10 37/48] libceph: allow ceph_osdc_new_request to accept a multi-op read Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 38/48] ceph: disable fallocate for encrypted inodes Jeff Layton
2022-01-11 19:15 ` [RFC PATCH v10 39/48] ceph: disable copy offload on " Jeff Layton
2022-01-11 19:16 ` [RFC PATCH v10 40/48] ceph: don't use special DIO path for " Jeff Layton
2022-01-11 19:16 ` [RFC PATCH v10 41/48] ceph: set encryption context on open Jeff Layton
2022-01-11 19:16 ` [RFC PATCH v10 42/48] ceph: align data in pages in ceph_sync_write Jeff Layton
2022-01-11 19:16 ` Jeff Layton [this message]
2022-01-19  3:21   ` [RFC PATCH v10 43/48] ceph: add read/modify/write to ceph_sync_write Xiubo Li
2022-01-19  5:08     ` Xiubo Li
2022-01-19 11:06       ` Jeff Layton
2022-01-11 19:16 ` [RFC PATCH v10 44/48] ceph: plumb in decryption during sync reads Jeff Layton
2022-01-19  5:18   ` Xiubo Li
2022-01-19 18:49     ` Jeff Layton
2022-01-11 19:16 ` [RFC PATCH v10 45/48] ceph: set i_blkbits to crypto block size for encrypted inodes Jeff Layton
2022-01-11 19:16 ` [RFC PATCH v10 46/48] ceph: add fscrypt decryption support to ceph_netfs_issue_op Jeff Layton
2022-01-11 19:16 ` [RFC PATCH v10 47/48] ceph: add encryption support to writepage Jeff Layton
2022-01-11 19:16 ` [RFC PATCH v10 48/48] ceph: fscrypt support for writepages Jeff Layton
2022-01-11 19:26 ` [RFC PATCH v10 00/48] ceph+fscrypt: full support Jeff Layton
2022-01-27  2:14 ` Eric Biggers
2022-01-27 11:08   ` Jeff Layton
2022-01-28 20:39     ` Eric Biggers
2022-01-28 20:47       ` Jeff Layton
2022-02-14  9:37 ` Xiubo Li
2022-02-14 11:33   ` Jeff Layton
2022-02-14 12:08     ` Xiubo Li
2022-02-15  0:44       ` Xiubo Li
2022-02-14 17:57 ` Luís Henriques
2022-02-14 18:39   ` Jeff Layton
2022-02-14 21:00     ` Luís Henriques
2022-02-14 21:10       ` Jeff Layton
2022-02-16 16:13     ` Luís Henriques

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220111191608.88762-44-jlayton@kernel.org \
    --to=jlayton@kernel.org \
    --cc=ceph-devel@vger.kernel.org \
    --cc=idryomov@gmail.com \
    --cc=linux-fscrypt@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.