linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jeff Layton <jlayton@kernel.org>
To: ceph-devel@vger.kernel.org
Cc: xiubli@redhat.com, idryomov@gmail.com, lhenriques@suse.de,
	linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH v12 49/54] ceph: add read/modify/write to ceph_sync_write
Date: Thu, 31 Mar 2022 11:31:25 -0400	[thread overview]
Message-ID: <20220331153130.41287-50-jlayton@kernel.org> (raw)
In-Reply-To: <20220331153130.41287-1-jlayton@kernel.org>

When doing a synchronous write on an encrypted inode, we have no
guarantee that the caller is writing crypto block-aligned data. When
the write is not block-aligned, we must do a read/modify/write cycle.

First, expand the range to cover complete blocks. If we had to change
the original pos or length, issue a read to fill the first and/or last
pages, and fetch the version of the object from the result.

We then copy data into the pages as usual, encrypt the result and issue
a write prefixed by an assertion that the version hasn't changed. If it has
changed then we restart the whole thing again.

If there is no object at that position in the file (-ENOENT), we prefix
the write with an exclusive create of the object instead.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/file.c | 319 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 290 insertions(+), 29 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ec6324d23aa6..aaa7a9d0c439 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1542,18 +1542,16 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_vino vino;
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
 	struct ceph_osd_request *req;
 	struct page **pages;
 	u64 len;
 	int num_pages;
 	int written = 0;
-	int flags;
 	int ret;
 	bool check_caps = false;
 	struct timespec64 mtime = current_time(inode);
 	size_t count = iov_iter_count(from);
-	size_t off;
 
 	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
 		return -EROFS;
@@ -1573,29 +1571,236 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 	if (ret < 0)
 		dout("invalidate_inode_pages2_range returned %d\n", ret);
 
-	flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
-
 	while ((len = iov_iter_count(from)) > 0) {
 		size_t left;
 		int n;
+		u64 write_pos = pos;
+		u64 write_len = len;
+		u64 objnum, objoff;
+		u32 xlen;
+		u64 assert_ver;
+		bool rmw;
+		bool first, last;
+		struct iov_iter saved_iter = *from;
+		size_t off;
+
+		ceph_fscrypt_adjust_off_and_len(inode, &write_pos, &write_len);
+
+		/* clamp the length to the end of first object */
+		ceph_calc_file_object_mapping(&ci->i_layout, write_pos,
+						write_len, &objnum, &objoff,
+						&xlen);
+		write_len = xlen;
+
+		/* adjust len downward if it goes beyond current object */
+		if (pos + len > write_pos + write_len)
+			len = write_pos + write_len - pos;
 
-		vino = ceph_vino(inode);
-		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
-					    vino, pos, &len, 0, 1,
-					    CEPH_OSD_OP_WRITE, flags, snapc,
-					    ci->i_truncate_seq,
-					    ci->i_truncate_size,
-					    false);
-		if (IS_ERR(req)) {
-			ret = PTR_ERR(req);
-			break;
-		}
+		/*
+		 * If we had to adjust the length or position to align with a
+		 * crypto block, then we must do a read/modify/write cycle. We
+		 * use a version assertion to redrive the thing if something
+		 * changes in between.
+		 */
+		first = pos != write_pos;
+		last = (pos + len) != (write_pos + write_len);
+		rmw = first || last;
 
-		num_pages = calc_pages_for(pos, len);
+		dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
+		     ci->i_vino.ino, pos, len, write_pos, write_len, rmw ? "" : "no ");
+
+		/*
+		 * The data is emplaced into the page as it would be if it were in
+		 * an array of pagecache pages.
+		 */
+		num_pages = calc_pages_for(write_pos, write_len);
 		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
 		if (IS_ERR(pages)) {
 			ret = PTR_ERR(pages);
-			goto out;
+			break;
+		}
+
+		/* Do we need to preload the pages? */
+		if (rmw) {
+			u64 first_pos = write_pos;
+			u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE;
+			u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE;
+			struct ceph_osd_req_op *op;
+
+			/* We should only need to do this for encrypted inodes */
+			WARN_ON_ONCE(!IS_ENCRYPTED(inode));
+
+			/* No need to do two reads if first and last blocks are same */
+			if (first && last_pos == first_pos)
+				last = false;
+
+			/*
+			 * Allocate a read request for one or two extents, depending
+			 * on how the request was aligned.
+			 */
+			req = ceph_osdc_new_request(osdc, &ci->i_layout,
+					ci->i_vino, first ? first_pos : last_pos,
+					&read_len, 0, (first && last) ? 2 : 1,
+					CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ,
+					NULL, ci->i_truncate_seq,
+					ci->i_truncate_size, false);
+			if (IS_ERR(req)) {
+				ceph_release_page_vector(pages, num_pages);
+				ret = PTR_ERR(req);
+				break;
+			}
+
+			/* Something is misaligned! */
+			if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) {
+				ceph_osdc_put_request(req);
+				ceph_release_page_vector(pages, num_pages);
+				ret = -EIO;
+				break;
+			}
+
+			/* Add extent for first block? */
+			op = &req->r_ops[0];
+
+			if (first) {
+				osd_req_op_extent_osd_data_pages(req, 0, pages,
+							 CEPH_FSCRYPT_BLOCK_SIZE,
+							 offset_in_page(first_pos),
+							 false, false);
+				/* We only expect a single extent here */
+				ret = ceph_alloc_sparse_ext_map(op, 1);
+				if (ret) {
+					ceph_osdc_put_request(req);
+					ceph_release_page_vector(pages, num_pages);
+					break;
+				}
+			}
+
+			/* Add extent for last block */
+			if (last) {
+				/* Init the other extent if first extent has been used */
+				if (first) {
+					op = &req->r_ops[1];
+					osd_req_op_extent_init(req, 1, CEPH_OSD_OP_SPARSE_READ,
+							last_pos, CEPH_FSCRYPT_BLOCK_SIZE,
+							ci->i_truncate_size,
+							ci->i_truncate_seq);
+				}
+
+				ret = ceph_alloc_sparse_ext_map(op, 1);
+				if (ret) {
+					ceph_osdc_put_request(req);
+					ceph_release_page_vector(pages, num_pages);
+					break;
+				}
+
+				osd_req_op_extent_osd_data_pages(req, first ? 1 : 0,
+							&pages[num_pages - 1],
+							CEPH_FSCRYPT_BLOCK_SIZE,
+							offset_in_page(last_pos),
+							false, false);
+			}
+
+			ret = ceph_osdc_start_request(osdc, req, false);
+			if (!ret)
+				ret = ceph_osdc_wait_request(osdc, req);
+
+			/* FIXME: length field is wrong if there are 2 extents */
+			ceph_update_read_metrics(&fsc->mdsc->metric,
+						 req->r_start_latency,
+						 req->r_end_latency,
+						 read_len, ret);
+
+			/* Ok if object is not already present */
+			if (ret == -ENOENT) {
+				/*
+				 * If there is no object, then we can't assert
+				 * on its version. Set it to 0, and we'll use an
+				 * exclusive create instead.
+				 */
+				ceph_osdc_put_request(req);
+				assert_ver = 0;
+				ret = 0;
+
+				/*
+				 * zero out the soon-to-be uncopied parts of the
+				 * first and last pages.
+				 */
+				if (first)
+					zero_user_segment(pages[0], 0,
+							  offset_in_page(first_pos));
+				if (last)
+					zero_user_segment(pages[num_pages - 1],
+							  offset_in_page(last_pos),
+							  PAGE_SIZE);
+			} else {
+				if (ret < 0) {
+					ceph_osdc_put_request(req);
+					ceph_release_page_vector(pages, num_pages);
+					break;
+				}
+
+				op = &req->r_ops[0];
+				if (op->extent.sparse_ext_cnt == 0) {
+					if (first)
+						zero_user_segment(pages[0], 0,
+								  offset_in_page(first_pos));
+					else
+						zero_user_segment(pages[num_pages - 1],
+								  offset_in_page(last_pos),
+								  PAGE_SIZE);
+				} else if (op->extent.sparse_ext_cnt != 1 ||
+					   ceph_sparse_ext_map_end(op) !=
+						CEPH_FSCRYPT_BLOCK_SIZE) {
+					ret = -EIO;
+					ceph_osdc_put_request(req);
+					ceph_release_page_vector(pages, num_pages);
+					break;
+				}
+
+				if (first && last) {
+					op = &req->r_ops[1];
+					if (op->extent.sparse_ext_cnt == 0) {
+						zero_user_segment(pages[num_pages - 1],
+								  offset_in_page(last_pos),
+								  PAGE_SIZE);
+					} else if (op->extent.sparse_ext_cnt != 1 ||
+						   ceph_sparse_ext_map_end(op) !=
+							CEPH_FSCRYPT_BLOCK_SIZE) {
+						ret = -EIO;
+						ceph_osdc_put_request(req);
+						ceph_release_page_vector(pages, num_pages);
+						break;
+					}
+				}
+
+				/* Grab assert version. It must be non-zero. */
+				assert_ver = req->r_version;
+				WARN_ON_ONCE(ret > 0 && assert_ver == 0);
+
+				ceph_osdc_put_request(req);
+				if (first) {
+					ret = ceph_fscrypt_decrypt_block_inplace(inode,
+							pages[0],
+							CEPH_FSCRYPT_BLOCK_SIZE,
+							offset_in_page(first_pos),
+							first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
+					if (ret < 0) {
+						ceph_release_page_vector(pages, num_pages);
+						break;
+					}
+				}
+				if (last) {
+					ret = ceph_fscrypt_decrypt_block_inplace(inode,
+							pages[num_pages - 1],
+							CEPH_FSCRYPT_BLOCK_SIZE,
+							offset_in_page(last_pos),
+							last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
+					if (ret < 0) {
+						ceph_release_page_vector(pages, num_pages);
+						break;
+					}
+				}
+			}
 		}
 
 		left = len;
@@ -1603,43 +1808,98 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		for (n = 0; n < num_pages; n++) {
 			size_t plen = min_t(size_t, left, PAGE_SIZE - off);
 
+			/* copy the data */
 			ret = copy_page_from_iter(pages[n], off, plen, from);
-			off = 0;
 			if (ret != plen) {
 				ret = -EFAULT;
 				break;
 			}
+			off = 0;
 			left -= ret;
 		}
-
 		if (ret < 0) {
+			dout("sync_write write failed with %d\n", ret);
 			ceph_release_page_vector(pages, num_pages);
-			goto out;
+			break;
 		}
 
-		req->r_inode = inode;
+		if (IS_ENCRYPTED(inode)) {
+			ret = ceph_fscrypt_encrypt_pages(inode, pages,
+							 write_pos, write_len,
+							 GFP_KERNEL);
+			if (ret < 0) {
+				dout("encryption failed with %d\n", ret);
+				ceph_release_page_vector(pages, num_pages);
+				break;
+			}
+		}
 
-		osd_req_op_extent_osd_data_pages(req, 0, pages, len,
-						 offset_in_page(pos),
-						 false, true);
+		req = ceph_osdc_new_request(osdc, &ci->i_layout,
+					    ci->i_vino, write_pos, &write_len,
+					    rmw ? 1 : 0, rmw ? 2 : 1,
+					    CEPH_OSD_OP_WRITE,
+					    CEPH_OSD_FLAG_WRITE,
+					    snapc, ci->i_truncate_seq,
+					    ci->i_truncate_size, false);
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			ceph_release_page_vector(pages, num_pages);
+			break;
+		}
 
+		dout("sync_write write op %lld~%llu\n", write_pos, write_len);
+		osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len,
+						 offset_in_page(write_pos), false,
+						 true);
+		req->r_inode = inode;
 		req->r_mtime = mtime;
-		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+
+		/* Set up the assertion */
+		if (rmw) {
+			/*
+			 * Set up the assertion. If we don't have a version number,
+			 * then the object doesn't exist yet. Use an exclusive create
+			 * instead of a version assertion in that case.
+			 */
+			if (assert_ver) {
+				osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0);
+				req->r_ops[0].assert_ver.ver = assert_ver;
+			} else {
+				osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE,
+						CEPH_OSD_OP_FLAG_EXCL);
+			}
+		}
+
+		ret = ceph_osdc_start_request(osdc, req, false);
 		if (!ret)
-			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+			ret = ceph_osdc_wait_request(osdc, req);
 
 		ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
 					  req->r_end_latency, len, ret);
-out:
 		ceph_osdc_put_request(req);
 		if (ret != 0) {
+			dout("sync_write osd write returned %d\n", ret);
+			/* Version changed! Must re-do the rmw cycle */
+			if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) ||
+			     (!assert_ver && ret == -EEXIST)) {
+				/* We should only ever see this on a rmw */
+				WARN_ON_ONCE(!rmw);
+
+				/* The version should never go backward */
+				WARN_ON_ONCE(ret == -EOVERFLOW);
+
+				*from = saved_iter;
+
+				/* FIXME: limit number of times we loop? */
+				continue;
+			}
 			ceph_set_error_write(ci);
 			break;
 		}
-
 		ceph_clear_error_write(ci);
 		pos += len;
 		written += len;
+		dout("sync_write written %d\n", written);
 		if (pos > i_size_read(inode)) {
 			check_caps = ceph_inode_set_size(inode, pos);
 			if (check_caps)
@@ -1654,6 +1914,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		ret = written;
 		iocb->ki_pos = pos;
 	}
+	dout("sync_write returning %d\n", ret);
 	return ret;
 }
 
-- 
2.35.1


  parent reply	other threads:[~2022-03-31 15:35 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-31 15:30 [PATCH v12 00/54] ceph+fscrypt: fully-working prototype Jeff Layton
2022-03-31 15:30 ` [PATCH v12 01/54] vfs: export new_inode_pseudo Jeff Layton
2022-03-31 19:50   ` Al Viro
2022-03-31 22:23     ` Jeff Layton
2022-03-31 15:30 ` [PATCH v12 02/54] fscrypt: export fscrypt_base64url_encode and fscrypt_base64url_decode Jeff Layton
2022-03-31 15:30 ` [PATCH v12 03/54] fscrypt: export fscrypt_fname_encrypt and fscrypt_fname_encrypted_size Jeff Layton
2022-03-31 15:30 ` [PATCH v12 04/54] fscrypt: add fscrypt_context_for_new_inode Jeff Layton
2022-03-31 15:30 ` [PATCH v12 05/54] ceph: preallocate inode for ops that may create one Jeff Layton
2022-03-31 15:30 ` [PATCH v12 06/54] ceph: crypto context handling for ceph Jeff Layton
2022-03-31 15:30 ` [PATCH v12 07/54] ceph: support legacy v1 encryption policy keysetup Jeff Layton
2022-03-31 20:16   ` Eric Biggers
2022-04-01 10:22     ` Luís Henriques
2022-03-31 15:30 ` [PATCH v12 08/54] ceph: add a has_stable_inodes operation for ceph Jeff Layton
2022-03-31 20:03   ` Eric Biggers
2022-04-01 10:37     ` Jeff Layton
2022-04-01 18:16       ` Eric Biggers
2022-04-01 18:51         ` Jeff Layton
2022-03-31 15:30 ` [PATCH v12 09/54] ceph: ensure that we accept a new context from MDS for new inodes Jeff Layton
2022-03-31 15:30 ` [PATCH v12 10/54] ceph: add support for fscrypt_auth/fscrypt_file to cap messages Jeff Layton
2022-03-31 15:30 ` [PATCH v12 11/54] ceph: add ability to set fscrypt_auth via setattr Jeff Layton
2022-03-31 15:30 ` [PATCH v12 12/54] ceph: implement -o test_dummy_encryption mount option Jeff Layton
2022-03-31 15:30 ` [PATCH v12 13/54] ceph: decode alternate_name in lease info Jeff Layton
2022-03-31 15:30 ` [PATCH v12 14/54] ceph: add fscrypt ioctls Jeff Layton
2022-03-31 15:30 ` [PATCH v12 15/54] ceph: make the ioctl cmd more readable in debug log Jeff Layton
2022-03-31 15:30 ` [PATCH v12 16/54] ceph: make ceph_msdc_build_path use ref-walk Jeff Layton
2022-03-31 15:30 ` [PATCH v12 17/54] ceph: add encrypted fname handling to ceph_mdsc_build_path Jeff Layton
2022-03-31 15:30 ` [PATCH v12 18/54] ceph: send altname in MClientRequest Jeff Layton
2022-03-31 15:30 ` [PATCH v12 19/54] ceph: encode encrypted name in dentry release Jeff Layton
2022-03-31 15:30 ` [PATCH v12 20/54] ceph: properly set DCACHE_NOKEY_NAME flag in lookup Jeff Layton
2022-03-31 15:30 ` [PATCH v12 21/54] ceph: set DCACHE_NOKEY_NAME in atomic open Jeff Layton
2022-03-31 15:30 ` [PATCH v12 22/54] ceph: make d_revalidate call fscrypt revalidator for encrypted dentries Jeff Layton
2022-03-31 15:30 ` [PATCH v12 23/54] ceph: add helpers for converting names for userland presentation Jeff Layton
2022-03-31 15:31 ` [PATCH v12 24/54] ceph: fix base64 encoded name's length check in ceph_fname_to_usr() Jeff Layton
2022-03-31 15:31 ` [PATCH v12 25/54] ceph: add fscrypt support to ceph_fill_trace Jeff Layton
2022-03-31 15:31 ` [PATCH v12 26/54] ceph: pass the request to parse_reply_info_readdir() Jeff Layton
2022-03-31 15:31 ` [PATCH v12 27/54] ceph: add ceph_encode_encrypted_dname() helper Jeff Layton
2022-03-31 15:31 ` [PATCH v12 28/54] ceph: add support to readdir for encrypted filenames Jeff Layton
2022-03-31 15:31 ` [PATCH v12 29/54] ceph: create symlinks with encrypted and base64-encoded targets Jeff Layton
2022-03-31 15:31 ` [PATCH v12 30/54] ceph: make ceph_get_name decrypt filenames Jeff Layton
2022-03-31 15:31 ` [PATCH v12 31/54] ceph: add a new ceph.fscrypt.auth vxattr Jeff Layton
2022-03-31 15:31 ` [PATCH v12 32/54] ceph: add some fscrypt guardrails Jeff Layton
2022-03-31 15:31 ` [PATCH v12 33/54] ceph: don't allow changing layout on encrypted files/directories Jeff Layton
2022-03-31 15:31 ` [PATCH v12 34/54] libceph: add CEPH_OSD_OP_ASSERT_VER support Jeff Layton
2022-03-31 15:31 ` [PATCH v12 35/54] ceph: size handling for encrypted inodes in cap updates Jeff Layton
2022-03-31 15:31 ` [PATCH v12 36/54] ceph: fscrypt_file field handling in MClientRequest messages Jeff Layton
2022-03-31 15:31 ` [PATCH v12 37/54] ceph: get file size from fscrypt_file when present in inode traces Jeff Layton
2022-03-31 15:31 ` [PATCH v12 38/54] ceph: handle fscrypt fields in cap messages from MDS Jeff Layton
2022-03-31 15:31 ` [PATCH v12 39/54] ceph: add __ceph_get_caps helper support Jeff Layton
2022-03-31 15:31 ` [PATCH v12 40/54] ceph: add __ceph_sync_read " Jeff Layton
2022-03-31 15:31 ` [PATCH v12 41/54] ceph: add object version support for sync read Jeff Layton
2022-03-31 15:31 ` [PATCH v12 42/54] ceph: add infrastructure for file encryption and decryption Jeff Layton
2022-03-31 15:31 ` [PATCH v12 43/54] ceph: add truncate size handling support for fscrypt Jeff Layton
2022-03-31 15:31 ` [PATCH v12 44/54] libceph: allow ceph_osdc_new_request to accept a multi-op read Jeff Layton
2022-03-31 15:31 ` [PATCH v12 45/54] ceph: disable fallocate for encrypted inodes Jeff Layton
2022-03-31 15:31 ` [PATCH v12 46/54] ceph: disable copy offload on " Jeff Layton
2022-03-31 15:31 ` [PATCH v12 47/54] ceph: don't use special DIO path for " Jeff Layton
2022-03-31 15:31 ` [PATCH v12 48/54] ceph: align data in pages in ceph_sync_write Jeff Layton
2022-03-31 15:31 ` Jeff Layton [this message]
2022-03-31 15:31 ` [PATCH v12 50/54] ceph: plumb in decryption during sync reads Jeff Layton
2022-03-31 15:31 ` [PATCH v12 51/54] ceph: add fscrypt decryption support to ceph_netfs_issue_op Jeff Layton
2022-03-31 15:31 ` [PATCH v12 52/54] ceph: set i_blkbits to crypto block size for encrypted inodes Jeff Layton
2022-03-31 15:31 ` [PATCH v12 53/54] ceph: add encryption support to writepage Jeff Layton
2022-03-31 15:31 ` [PATCH v12 54/54] ceph: fscrypt support for writepages Jeff Layton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220331153130.41287-50-jlayton@kernel.org \
    --to=jlayton@kernel.org \
    --cc=ceph-devel@vger.kernel.org \
    --cc=idryomov@gmail.com \
    --cc=lhenriques@suse.de \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=xiubli@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).