All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jeff Layton <jlayton@kernel.org>
To: ceph-devel@vger.kernel.org
Cc: zyan@redhat.com, sage@redhat.com, idryomov@gmail.com,
	pdonnell@redhat.com, xiubli@redhat.com
Subject: [RFC PATCH v2 10/10] ceph: attempt to do async create when possible
Date: Wed, 15 Jan 2020 15:59:12 -0500	[thread overview]
Message-ID: <20200115205912.38688-11-jlayton@kernel.org> (raw)
In-Reply-To: <20200115205912.38688-1-jlayton@kernel.org>

With the Octopus release, the MDS will hand out directory create caps.

If we have Fxc caps on the directory, and complete directory information
or a known negative dentry, then we can return without waiting on the
reply, allowing the open() call to return very quickly to userland.

We use the normal ceph_fill_inode() routine to fill in the inode, so we
have to gin up some reply inode information with what we'd expect the
newly-created inode to have. The client assumes that it has a full set
of caps on the new inode, and that the MDS will revoke them when there
is conflicting access.

This functionality is gated on the enable_async_dirops module option,
along with async unlinks, and on the server supporting the necessary
CephFS feature bit.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/file.c               | 196 +++++++++++++++++++++++++++++++++--
 include/linux/ceph/ceph_fs.h |   3 +
 2 files changed, 190 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b44ccbc85fe4..2742417fa5ec 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -448,6 +448,169 @@ cache_file_layout(struct inode *dst, struct inode *src)
 	spin_unlock(&cdst->i_ceph_lock);
 }
 
+/*
+ * Try to set up an async create. We need caps, a file layout, and inode number,
+ * and either a lease on the dentry or complete dir info. If any of those
+ * criteria are not satisfied, then return false and the caller can go
+ * synchronous.
+ */
+static bool try_prep_async_create(struct inode *dir, struct dentry *dentry,
+				  struct ceph_file_layout *lo,
+				  unsigned long *pino)
+{
+	struct ceph_inode_info *ci = ceph_inode(dir);
+	bool ret = false;
+	unsigned long ino;
+
+	spin_lock(&ci->i_ceph_lock);
+	/* No auth cap means no chance for Dc caps */
+	if (!ci->i_auth_cap)
+		goto no_async;
+
+	/* Any delegated inos? */
+	if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
+		goto no_async;
+
+	if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
+		goto no_async;
+
+	/* Use LOOKUP_RCU since we're under i_ceph_lock */
+	if (!__ceph_dir_is_complete(ci) &&
+	    !dentry_lease_is_valid(dentry, LOOKUP_RCU))
+		goto no_async;
+
+	if (!(__ceph_caps_issued(ci, NULL) &
+	      (CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE)))
+		goto no_async;
+
+	ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
+	if (!ino)
+		goto no_async;
+
+	*pino = ino;
+	ceph_take_cap_refs(ci, CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE, false);
+	memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
+	rcu_assign_pointer(lo->pool_ns,
+			   ceph_try_get_string(ci->i_cached_layout.pool_ns));
+	ret = true;
+no_async:
+	spin_unlock(&ci->i_ceph_lock);
+	return ret;
+}
+
+static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
+                                 struct ceph_mds_request *req)
+{
+	mapping_set_error(req->r_parent->i_mapping, req->r_err);
+
+	if (req->r_target_inode) {
+		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+		u64 ino = ceph_vino(req->r_target_inode).ino;
+
+		if (req->r_deleg_ino != ino)
+			pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%lx target=0x%llx\n",
+				__func__, req->r_err, req->r_deleg_ino, ino);
+		mapping_set_error(req->r_target_inode->i_mapping, req->r_err);
+
+		spin_lock(&ci->i_ceph_lock);
+		if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+			ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+			wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+		}
+		spin_unlock(&ci->i_ceph_lock);
+	} else {
+		pr_warn("%s: no req->r_target_inode for 0x%lx\n", __func__,
+			req->r_deleg_ino);
+	}
+	ceph_put_cap_refs(ceph_inode(req->r_parent),
+			  CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE);
+}
+
+static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
+				    struct file *file, umode_t mode,
+				    struct ceph_mds_request *req,
+				    struct ceph_acl_sec_ctx *as_ctx,
+				    struct ceph_file_layout *lo)
+{
+	int ret;
+	char xattr_buf[4];
+	struct ceph_mds_reply_inode in = { };
+	struct ceph_mds_reply_info_in iinfo = { .in = &in };
+	struct ceph_inode_info *ci = ceph_inode(dir);
+	struct inode *inode;
+	struct timespec64 now;
+	struct ceph_vino vino = { .ino = req->r_deleg_ino,
+				  .snap = CEPH_NOSNAP };
+
+	ktime_get_real_ts64(&now);
+
+	inode = ceph_get_inode(dentry->d_sb, vino);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	iinfo.inline_version = CEPH_INLINE_NONE;
+	iinfo.change_attr = 1;
+	ceph_encode_timespec64(&iinfo.btime, &now);
+
+	iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+	iinfo.xattr_data = xattr_buf;
+	memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+
+	in.ino = cpu_to_le64(vino.ino);
+	in.snapid = cpu_to_le64(CEPH_NOSNAP);
+	in.version = cpu_to_le64(1);	// ???
+	in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
+	in.cap.cap_id = cpu_to_le64(1);
+	in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
+	in.cap.flags = CEPH_CAP_FLAG_AUTH;
+	in.ctime = in.mtime = in.atime = iinfo.btime;
+	in.mode = cpu_to_le32((u32)mode);
+	in.truncate_seq = cpu_to_le32(1);
+	in.truncate_size = cpu_to_le64(-1ULL);
+	in.xattr_version = cpu_to_le64(1);
+	in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
+	in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ?
+				dir->i_gid : current_fsgid()));
+	in.nlink = cpu_to_le32(1);
+	in.max_size = cpu_to_le64(lo->stripe_unit);
+
+	ceph_file_layout_to_legacy(lo, &in.layout);
+
+	ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
+			      req->r_fmode, NULL);
+	if (ret) {
+		dout("%s failed to fill inode: %d\n", __func__, ret);
+		if (inode->i_state & I_NEW)
+			discard_new_inode(inode);
+	} else {
+		struct dentry *dn;
+
+		dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
+			vino.ino, dir->i_ino, dentry->d_name.name);
+		ceph_dir_clear_ordered(dir);
+		ceph_init_inode_acls(inode, as_ctx);
+		if (inode->i_state & I_NEW) {
+			/*
+			 * If it's not I_NEW, then someone created this before
+			 * we got here. Assume the server is aware of it at
+			 * that point and don't worry about setting
+			 * CEPH_I_ASYNC_CREATE.
+			 */
+			ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
+			unlock_new_inode(inode);
+		}
+		if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
+			if (!d_unhashed(dentry))
+				d_drop(dentry);
+			dn = d_splice_alias(inode, dentry);
+			WARN_ON_ONCE(dn && dn != dentry);
+		}
+		file->f_mode |= FMODE_CREATED;
+		ret = finish_open(file, dentry, ceph_open);
+	}
+	return ret;
+}
+
 /*
  * Do a lookup + open with a single request.  If we get a non-existent
  * file or symlink, return 1 so the VFS can retry.
@@ -460,6 +623,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	struct ceph_mds_request *req;
 	struct dentry *dn;
 	struct ceph_acl_sec_ctx as_ctx = {};
+	bool try_async = enable_async_dirops;
 	int mask;
 	int err;
 
@@ -492,28 +656,41 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	}
 	req->r_dentry = dget(dentry);
 	req->r_num_caps = 2;
+	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+	if (ceph_security_xattr_wanted(dir))
+		mask |= CEPH_CAP_XATTR_SHARED;
+	req->r_args.open.mask = cpu_to_le32(mask);
+	req->r_parent = dir;
+
 	if (flags & O_CREAT) {
+		struct ceph_file_layout lo;
+
 		req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
 		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
 		if (as_ctx.pagelist) {
 			req->r_pagelist = as_ctx.pagelist;
 			as_ctx.pagelist = NULL;
 		}
+		if (try_async && try_prep_async_create(dir, dentry, &lo,
+						       &req->r_deleg_ino)) {
+			set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+			req->r_callback = ceph_async_create_cb;
+			err = ceph_mdsc_submit_request(mdsc, dir, req);
+			if (!err)
+				err = ceph_finish_async_create(dir, dentry,
+							file, mode, req,
+							&as_ctx, &lo);
+			goto out_req;
+		}
 	}
 
-       mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
-       if (ceph_security_xattr_wanted(dir))
-               mask |= CEPH_CAP_XATTR_SHARED;
-       req->r_args.open.mask = cpu_to_le32(mask);
-
-	req->r_parent = dir;
 	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
 	err = ceph_mdsc_do_request(mdsc,
 				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
 				   req);
 	err = ceph_handle_snapdir(req, dentry, err);
 	if (err)
-		goto out_req;
+		goto out_fmode;
 
 	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
@@ -527,7 +704,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		dn = NULL;
 	}
 	if (err)
-		goto out_req;
+		goto out_fmode;
 	if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
 		/* make vfs retry on splice, ENOENT, or symlink */
 		dout("atomic_open finish_no_open on dn %p\n", dn);
@@ -543,9 +720,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		}
 		err = finish_open(file, dentry, ceph_open);
 	}
-out_req:
+out_fmode:
 	if (!req->r_err && req->r_target_inode)
 		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
+out_req:
 	ceph_mdsc_put_request(req);
 out_ctx:
 	ceph_release_acl_sec_ctx(&as_ctx);
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 91d09cf37649..e035c5194005 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -659,6 +659,9 @@ int ceph_flags_to_mode(int flags);
 #define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
 			   CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
 			   CEPH_CAP_PIN)
+#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \
+			   CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \
+			   CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)
 
 #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
 			CEPH_LOCK_IXATTR)
-- 
2.24.1

  parent reply	other threads:[~2020-01-15 20:59 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-15 20:59 [RFC PATCH v2 00/10] ceph: asynchronous file create support Jeff Layton
2020-01-15 20:59 ` [RFC PATCH v2 01/10] libceph: export ceph_file_layout_is_valid Jeff Layton
2020-01-15 20:59 ` [RFC PATCH v2 02/10] ceph: make ceph_fill_inode non-static Jeff Layton
2020-01-15 20:59 ` [RFC PATCH v2 03/10] ceph: make dentry_lease_is_valid non-static Jeff Layton
2020-01-15 20:59 ` [RFC PATCH v2 04/10] ceph: make __take_cap_refs a public function Jeff Layton
2020-01-15 20:59 ` [RFC PATCH v2 05/10] ceph: decode interval_sets for delegated inos Jeff Layton
2020-01-16 14:32   ` Yan, Zheng
2020-01-16 15:37     ` Jeff Layton
2020-01-15 20:59 ` [RFC PATCH v2 06/10] ceph: add flag to designate that a request is asynchronous Jeff Layton
2020-01-15 20:59 ` [RFC PATCH v2 07/10] ceph: add infrastructure for waiting for async create to complete Jeff Layton
2020-01-17 15:00   ` Ilya Dryomov
2020-01-17 16:49     ` Jeff Layton
2020-01-15 20:59 ` [RFC PATCH v2 08/10] ceph: add new MDS req field to hold delegated inode number Jeff Layton
2020-01-17 14:47   ` Ilya Dryomov
2020-01-17 16:53     ` Jeff Layton
2020-01-17 17:42       ` Ilya Dryomov
2020-01-17 18:31         ` Jeff Layton
2020-01-20  9:41           ` Ilya Dryomov
2020-01-15 20:59 ` [RFC PATCH v2 09/10] ceph: cache layout in parent dir on first sync create Jeff Layton
2020-01-15 20:59 ` Jeff Layton [this message]
2020-01-16 15:09   ` [RFC PATCH v2 10/10] ceph: attempt to do async create when possible Yan, Zheng
2020-01-16 16:21     ` Jeff Layton
2020-01-17 13:28   ` Yan, Zheng
2020-01-17 17:40     ` Jeff Layton
2020-01-18  2:42       ` Yan, Zheng
2020-01-15 21:30 ` [RFC PATCH v2 00/10] ceph: asynchronous file create support Jeff Layton
2020-01-16  8:57 ` Xiubo Li
2020-01-16 13:10 ` Yan, Zheng
2020-01-16 14:15   ` Jeff Layton
2020-01-20 13:20 ` Yan, Zheng
2020-01-21 10:56   ` Jeff Layton
2020-01-21 13:20     ` Yan, Zheng
2020-01-21 14:37       ` Jeff Layton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200115205912.38688-11-jlayton@kernel.org \
    --to=jlayton@kernel.org \
    --cc=ceph-devel@vger.kernel.org \
    --cc=idryomov@gmail.com \
    --cc=pdonnell@redhat.com \
    --cc=sage@redhat.com \
    --cc=xiubli@redhat.com \
    --cc=zyan@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.