All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient
@ 2020-02-20 12:26 Yan, Zheng
  2020-02-20 12:26 ` [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps Yan, Zheng
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Yan, Zheng @ 2020-02-20 12:26 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, Yan, Zheng

Not only after mds closes session and caps get dropped. This is
preparation patch for not requesting caps for idle open files.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/caps.c       | 36 +++++++++++++++---------------------
 fs/ceph/mds_client.c |  5 -----
 fs/ceph/super.h      |  1 -
 3 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d05717397c2a..293920d013ff 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2659,6 +2659,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 		}
 	} else {
 		int session_readonly = false;
+		int mds_wanted;
 		if (ci->i_auth_cap &&
 		    (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
 			struct ceph_mds_session *s = ci->i_auth_cap->session;
@@ -2667,32 +2668,27 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 			spin_unlock(&s->s_cap_lock);
 		}
 		if (session_readonly) {
-			dout("get_cap_refs %p needed %s but mds%d readonly\n",
+			dout("get_cap_refs %p need %s but mds%d readonly\n",
 			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
 			ret = -EROFS;
 			goto out_unlock;
 		}
 
-		if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
-			int mds_wanted;
-			if (READ_ONCE(mdsc->fsc->mount_state) ==
-			    CEPH_MOUNT_SHUTDOWN) {
-				dout("get_cap_refs %p forced umount\n", inode);
-				ret = -EIO;
-				goto out_unlock;
-			}
-			mds_wanted = __ceph_caps_mds_wanted(ci, false);
-			if (need & ~(mds_wanted & need)) {
-				dout("get_cap_refs %p caps were dropped"
-				     " (session killed?)\n", inode);
-				ret = -ESTALE;
-				goto out_unlock;
-			}
-			if (!(file_wanted & ~mds_wanted))
-				ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
+		if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+			dout("get_cap_refs %p forced umount\n", inode);
+			ret = -EIO;
+			goto out_unlock;
+		}
+		mds_wanted = __ceph_caps_mds_wanted(ci, false);
+		if (need & ~mds_wanted) {
+			dout("get_cap_refs %p need %s > mds_wanted %s\n",
+			     inode, ceph_cap_string(need),
+			     ceph_cap_string(mds_wanted));
+			ret = -ESTALE;
+			goto out_unlock;
 		}
 
-		dout("get_cap_refs %p have %s needed %s\n", inode,
+		dout("get_cap_refs %p have %s need %s\n", inode,
 		     ceph_cap_string(have), ceph_cap_string(need));
 	}
 out_unlock:
@@ -3646,8 +3642,6 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 		goto out_unlock;
 
 	if (target < 0) {
-		if (cap->mds_wanted | cap->issued)
-			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 		__ceph_remove_cap(cap, false);
 		goto out_unlock;
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fab9d6461a65..98d746b3bb53 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1411,8 +1411,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	dout("removing cap %p, ci is %p, inode is %p\n",
 	     cap, ci, &ci->vfs_inode);
 	spin_lock(&ci->i_ceph_lock);
-	if (cap->mds_wanted | cap->issued)
-		ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 	__ceph_remove_cap(cap, false);
 	if (!ci->i_auth_cap) {
 		struct ceph_cap_flush *cf;
@@ -1578,9 +1576,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 			/* mds did not re-issue stale cap */
 			spin_lock(&ci->i_ceph_lock);
 			cap->issued = cap->implemented = CEPH_CAP_PIN;
-			/* make sure mds knows what we want */
-			if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
-				ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 			spin_unlock(&ci->i_ceph_lock);
 		}
 	} else if (ev == FORCE_RO) {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 37dc1ac8f6c3..d370f89df358 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -517,7 +517,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
 #define CEPH_I_POOL_WR		(1 << 5)  /* can write to pool */
 #define CEPH_I_SEC_INITED	(1 << 6)  /* security initialized */
-#define CEPH_I_CAP_DROPPED	(1 << 7)  /* caps were forcibly dropped */
 #define CEPH_I_KICK_FLUSH	(1 << 8)  /* kick flushing caps */
 #define CEPH_I_FLUSH_SNAPS	(1 << 9)  /* need flush snapss */
 #define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */
-- 
2.21.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps
  2020-02-20 12:26 [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
@ 2020-02-20 12:26 ` Yan, Zheng
  2020-02-20 14:14   ` Jeff Layton
  2020-02-20 19:06   ` Jeff Layton
  2020-02-20 12:26 ` [PATCH 3/4] ceph: simplify calling of ceph_get_fmode() Yan, Zheng
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 14+ messages in thread
From: Yan, Zheng @ 2020-02-20 12:26 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, Yan, Zheng

When getting caps for read/write, update corresponding file's last
read/write. If a file hasn't been read/write for 'caps_wanted_delay_max'
seconds, ignore the file when calculating wanted caps.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/caps.c               | 149 ++++++++++++++++++++++++-----------
 fs/ceph/file.c               |  23 +++---
 fs/ceph/inode.c              |  15 +++-
 fs/ceph/ioctl.c              |   4 +-
 fs/ceph/super.h              |  16 +++-
 include/linux/ceph/ceph_fs.h |   1 +
 6 files changed, 145 insertions(+), 63 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 293920d013ff..ccdc47bd7cf0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -971,18 +971,44 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
 	return used;
 }
 
+#define FMODE_WAIT_BIAS 1000
+
 /*
  * wanted, by virtue of open file modes
  */
 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 {
+	struct ceph_mount_options *opt =
+		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+	unsigned long used_cutoff =
+		round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
+	unsigned long idle_cutoff =
+		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
 	int i, bits = 0;
+
 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
-		if (ci->i_nr_by_mode[i])
+		if (ci->i_file_by_mode[i].nr >= FMODE_WAIT_BIAS) {
+			/* there are cap waiters or lots of open files */
 			bits |= 1 << i;
+		} else if (ci->i_file_by_mode[i].nr > 0) {
+			if (i ==  CEPH_FILE_MODE_PIN ||
+			    time_after(ci->i_file_by_mode[i].last_used,
+				       used_cutoff))
+				bits |= 1 << i;
+		} else if ((ci->i_file_by_mode[i].last_used & 1)) {
+			if (time_after(ci->i_file_by_mode[i].last_used,
+				       idle_cutoff)) {
+				bits |= 1 << i;
+			} else {
+				ci->i_file_by_mode[i].last_used &= ~1UL;
+			}
+		}
 	}
 	if (bits == 0)
 		return 0;
+	if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
+		return 0;
+
 	return ceph_caps_for_mode(bits >> 1);
 }
 
@@ -1021,14 +1047,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
 	return mds_wanted;
 }
 
-/*
- * called under i_ceph_lock
- */
-static int __ceph_is_single_caps(struct ceph_inode_info *ci)
-{
-	return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
-}
-
 int ceph_is_any_caps(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1856,10 +1874,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	if (ci->i_ceph_flags & CEPH_I_FLUSH)
 		flags |= CHECK_CAPS_FLUSH;
 
-	if (!(flags & CHECK_CAPS_AUTHONLY) ||
-	    (ci->i_auth_cap && __ceph_is_single_caps(ci)))
-		__cap_delay_cancel(mdsc, ci);
-
 	goto retry_locked;
 retry:
 	spin_lock(&ci->i_ceph_lock);
@@ -2081,9 +2095,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
 
-	/* Reschedule delayed caps release if we delayed anything */
-	if (delayed)
-		__cap_delay_requeue(mdsc, ci, false);
+	if (list_empty(&ci->i_cap_delay_list)) {
+	    if (delayed) {
+		    /* Reschedule delayed caps release if we delayed anything */
+		    __cap_delay_requeue(mdsc, ci, false);
+	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
+			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+		    /* periodically re-calculate caps wanted by open files */
+		    __cap_delay_requeue(mdsc, ci, true);
+	    }
+	}
 
 	spin_unlock(&ci->i_ceph_lock);
 
@@ -2549,8 +2570,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
  * FIXME: how does a 0 return differ from -EAGAIN?
  */
 enum {
-	NON_BLOCKING	= 1,
-	CHECK_FILELOCK	= 2,
+	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
+	NON_BLOCKING	= (1 << 8),
+	CHECK_FILELOCK	= (1 << 9),
 };
 
 static int try_get_cap_refs(struct inode *inode, int need, int want,
@@ -2560,7 +2582,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	int ret = 0;
 	int have, implemented;
-	int file_wanted;
 	bool snap_rwsem_locked = false;
 
 	dout("get_cap_refs %p need %s want %s\n", inode,
@@ -2576,15 +2597,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 		goto out_unlock;
 	}
 
-	/* make sure file is actually open */
-	file_wanted = __ceph_caps_file_wanted(ci);
-	if ((file_wanted & need) != need) {
-		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
-		     ceph_cap_string(need), ceph_cap_string(file_wanted));
-		ret = -EBADF;
-		goto out_unlock;
-	}
-
 	/* finish pending truncate */
 	while (ci->i_truncate_pending) {
 		spin_unlock(&ci->i_ceph_lock);
@@ -2692,6 +2704,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 		     ceph_cap_string(have), ceph_cap_string(need));
 	}
 out_unlock:
+
+	__ceph_touch_fmode(ci, mdsc, flags);
+
 	spin_unlock(&ci->i_ceph_lock);
 	if (snap_rwsem_locked)
 		up_read(&mdsc->snap_rwsem);
@@ -2729,10 +2744,22 @@ static void check_max_size(struct inode *inode, loff_t endoff)
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 }
 
+static inline int get_used_file_mode(int need, int want)
+{
+	int fmode = 0;
+	if (need & CEPH_CAP_FILE_RD)
+		fmode |= CEPH_FILE_MODE_RD;
+	if (need & CEPH_CAP_FILE_WR)
+		fmode |= CEPH_FILE_MODE_WR;
+	if (want & CEPH_CAP_FILE_LAZYIO)
+		fmode |= CEPH_FILE_MODE_LAZY;
+	return fmode;
+}
+
 int ceph_try_get_caps(struct inode *inode, int need, int want,
 		      bool nonblock, int *got)
 {
-	int ret;
+	int ret, flags;
 
 	BUG_ON(need & ~CEPH_CAP_FILE_RD);
 	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
@@ -2740,8 +2767,11 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
 	if (ret < 0)
 		return ret;
 
-	ret = try_get_cap_refs(inode, need, want, 0,
-			       (nonblock ? NON_BLOCKING : 0), got);
+	flags = get_used_file_mode(need, want);
+	if (nonblock)
+		flags |= NON_BLOCKING;
+
+	ret = try_get_cap_refs(inode, need, want, 0, flags, got);
 	return ret == -EAGAIN ? 0 : ret;
 }
 
@@ -2767,11 +2797,15 @@ int ceph_get_caps(struct file *filp, int need, int want,
 	    fi->filp_gen != READ_ONCE(fsc->filp_gen))
 		return -EBADF;
 
+	flags = get_used_file_mode(need, want);
+
 	while (true) {
 		if (endoff > 0)
 			check_max_size(inode, endoff);
 
-		flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
+		flags &= CEPH_FILE_MODE_MASK;
+		if (atomic_read(&fi->num_locks))
+			flags |= CHECK_FILELOCK;
 		_got = 0;
 		ret = try_get_cap_refs(inode, need, want, endoff,
 				       flags, &_got);
@@ -2791,6 +2825,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
 			list_add(&cw.list, &mdsc->cap_wait_list);
 			spin_unlock(&mdsc->caps_list_lock);
 
+			/* make sure used fmode not timeout */
+			ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
 			add_wait_queue(&ci->i_cap_wq, &wait);
 
 			flags |= NON_BLOCKING;
@@ -2804,6 +2840,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
 			}
 
 			remove_wait_queue(&ci->i_cap_wq, &wait);
+			ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
 
 			spin_lock(&mdsc->caps_list_lock);
 			list_del(&cw.list);
@@ -2823,7 +2860,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
 		if (ret < 0) {
 			if (ret == -ESTALE) {
 				/* session was killed, try renew caps */
-				ret = ceph_renew_caps(inode);
+				ret = ceph_renew_caps(inode, flags);
 				if (ret == 0)
 					continue;
 			}
@@ -4121,13 +4158,41 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 	dout("flush_dirty_caps done\n");
 }
 
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
+			struct ceph_mds_client *mdsc, int fmode)
+{
+	int i;
+	int bits = (fmode << 1);
+	unsigned long now = jiffies | 1;
+	for (i = 1; i < CEPH_FILE_MODE_BITS; i++) {
+		if (bits & (1 << i))
+			ci->i_file_by_mode[i].last_used = now;
+	}
+
+	/* queue periodic check */
+	if (bits && list_empty(&ci->i_cap_delay_list))
+		__cap_delay_requeue(mdsc, ci, true);
+}
+
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
+{
+	int i;
+	int bits = (fmode << 1) | 1;
+	spin_lock(&ci->i_ceph_lock);
+	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+		if (bits & (1 << i))
+			ci->i_file_by_mode[i].nr += count;
+	}
+	spin_unlock(&ci->i_ceph_lock);
+}
+
 void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
 {
 	int i;
 	int bits = (fmode << 1) | 1;
 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
 		if (bits & (1 << i))
-			ci->i_nr_by_mode[i]++;
+			ci->i_file_by_mode[i].nr++;
 	}
 }
 
@@ -4136,26 +4201,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
  * we may need to release capabilities to the MDS (or schedule
  * their delayed release).
  */
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
 {
-	int i, last = 0;
+	int i;
 	int bits = (fmode << 1) | 1;
 	spin_lock(&ci->i_ceph_lock);
 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
 		if (bits & (1 << i)) {
-			BUG_ON(ci->i_nr_by_mode[i] == 0);
-			if (--ci->i_nr_by_mode[i] == 0)
-				last++;
+			BUG_ON(ci->i_file_by_mode[i].nr < count);
+			ci->i_file_by_mode[i].nr -= count;
 		}
 	}
-	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
-	     &ci->vfs_inode, fmode,
-	     ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
-	     ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
 	spin_unlock(&ci->i_ceph_lock);
-
-	if (last && ci->i_vino.snap == CEPH_NOSNAP)
-		ceph_check_caps(ci, 0, NULL);
 }
 
 /*
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7e0190b1f821..f28f420bad23 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
 		struct ceph_dir_file_info *dfi =
 			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
 		if (!dfi) {
-			ceph_put_fmode(ci, fmode); /* clean up */
+			ceph_put_fmode(ci, fmode, 1); /* clean up */
 			return -ENOMEM;
 		}
 
@@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
 	} else {
 		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
 		if (!fi) {
-			ceph_put_fmode(ci, fmode); /* clean up */
+			ceph_put_fmode(ci, fmode, 1); /* clean up */
 			return -ENOMEM;
 		}
 
@@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 	case S_IFLNK:
 		dout("init_file %p %p 0%o (symlink)\n", inode, file,
 		     inode->i_mode);
-		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
 		break;
 
 	default:
@@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 		 * we need to drop the open ref now, since we don't
 		 * have .release set to ceph_release.
 		 */
-		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
 		BUG_ON(inode->i_fop->release == ceph_release);
 
 		/* call the proper open fop */
@@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 /*
  * try renew caps after session gets killed.
  */
-int ceph_renew_caps(struct inode *inode)
+int ceph_renew_caps(struct inode *inode, int fmode)
 {
-	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_request *req;
 	int err, flags, wanted;
 
 	spin_lock(&ci->i_ceph_lock);
+	__ceph_touch_fmode(ci, mdsc, fmode);
 	wanted = __ceph_caps_file_wanted(ci);
 	if (__ceph_is_any_real_caps(ci) &&
 	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
@@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file)
 	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
 		   (ci->i_snap_caps & wanted) == wanted) {
 		__ceph_get_fmode(ci, fmode);
+		__ceph_touch_fmode(ci, mdsc, fmode);
 		spin_unlock(&ci->i_ceph_lock);
 		return ceph_init_file(inode, file, fmode);
 	}
@@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	}
 out_req:
 	if (!req->r_err && req->r_target_inode)
-		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
+		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
 	ceph_mdsc_put_request(req);
 out_ctx:
 	ceph_release_acl_sec_ctx(&as_ctx);
@@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file)
 		dout("release inode %p dir file %p\n", inode, file);
 		WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
 
-		ceph_put_fmode(ci, dfi->file_info.fmode);
+		ceph_put_fmode(ci, dfi->file_info.fmode, 1);
 
 		if (dfi->last_readdir)
 			ceph_mdsc_put_request(dfi->last_readdir);
@@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file)
 		dout("release inode %p regular file %p\n", inode, file);
 		WARN_ON(!list_empty(&fi->rw_contexts));
 
-		ceph_put_fmode(ci, fi->fmode);
+		ceph_put_fmode(ci, fi->fmode, 1);
+
 		kmem_cache_free(ceph_file_cachep, fi);
 	}
 
@@ -1560,7 +1563,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
 		if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
-			ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
+			ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 	}
 
 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 094b8fc37787..b279bd8e168e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -478,8 +478,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_head_snapc = NULL;
 	ci->i_snap_caps = 0;
 
-	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
-		ci->i_nr_by_mode[i] = 0;
+	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+		ci->i_file_by_mode[i].nr = 0;
+		ci->i_file_by_mode[i].last_used = 0;
+	}
 
 	mutex_init(&ci->i_truncate_mutex);
 	ci->i_truncate_seq = 0;
@@ -637,7 +639,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
 			if ((issued & (CEPH_CAP_FILE_CACHE|
 				       CEPH_CAP_FILE_BUFFER)) ||
 			    mapping_mapped(inode->i_mapping) ||
-			    __ceph_caps_file_wanted(ci)) {
+			    __ceph_is_file_opened(ci)) {
 				ci->i_truncate_pending++;
 				queue_trunc = 1;
 			}
@@ -1010,6 +1012,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			fill_inline = true;
 	}
 
+	if (cap_fmode >= 0) {
+		if (!info_caps)
+			pr_warn("mds issued no caps on %llx.%llx\n",
+				ceph_vinop(inode));
+		__ceph_touch_fmode(ci, mdsc, cap_fmode);
+	}
+
 	spin_unlock(&ci->i_ceph_lock);
 
 	if (fill_inline)
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index c90f03beb15d..da0ee54ae5bc 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
 	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 
 	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
 		spin_lock(&ci->i_ceph_lock);
 		fi->fmode |= CEPH_FILE_MODE_LAZY;
-		ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
+		ci->i_file_by_mode[ffs(CEPH_FILE_MODE_LAZY)].nr++;
+		__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_LAZY);
 		spin_unlock(&ci->i_ceph_lock);
 		dout("ioctl_layzio: file %p marked lazy\n", file);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index d370f89df358..029823643b8b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -361,7 +361,10 @@ struct ceph_inode_info {
 						    dirty|flushing caps */
 	unsigned i_snap_caps;           /* cap bits for snapped files */
 
-	int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
+	struct {
+		int nr;
+		unsigned long last_used;
+	} i_file_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
 
 	struct mutex i_truncate_mutex;
 	u32 i_truncate_seq;        /* last truncate to smaller size */
@@ -673,6 +676,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
 extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
 extern int __ceph_caps_used(struct ceph_inode_info *ci);
 
+static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
+{
+	return ci->i_file_by_mode[0].nr;
+}
 extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
 extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
 
@@ -1074,7 +1081,10 @@ extern int ceph_try_get_caps(struct inode *inode,
 
 /* for counting open files by mode */
 extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
+			       struct ceph_mds_client *mdsc, int fmode);
 
 /* addr.c */
 extern const struct address_space_operations ceph_aops;
@@ -1086,7 +1096,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 
-extern int ceph_renew_caps(struct inode *inode);
+extern int ceph_renew_caps(struct inode *inode, int fmode);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 			    struct file *file, unsigned flags, umode_t mode);
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index cb21c5cf12c3..8017130a08a1 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -564,6 +564,7 @@ struct ceph_filelock {
 #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
 #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
 #define CEPH_FILE_MODE_BITS       4
+#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
 
 int ceph_flags_to_mode(int flags);
 
-- 
2.21.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/4] ceph: simplify calling of ceph_get_fmode()
  2020-02-20 12:26 [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
  2020-02-20 12:26 ` [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps Yan, Zheng
@ 2020-02-20 12:26 ` Yan, Zheng
  2020-02-20 19:14   ` Jeff Layton
  2020-02-20 12:26 ` [PATCH 4/4] ceph: remove delay check logic from ceph_check_caps() Yan, Zheng
  2020-02-20 13:42 ` [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient Jeff Layton
  3 siblings, 1 reply; 14+ messages in thread
From: Yan, Zheng @ 2020-02-20 12:26 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, Yan, Zheng

Call ceph_get_fmode() when initializing file. Because fill_inode()
already calls ceph_touch_fmode() for open file request. It affects
__ceph_caps_file_wanted() for 'caps_wanted_delay_min' seconds, enough
for ceph_get_fmode() to get called by ceph_init_file_info().

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/caps.c  | 26 +++-----------------------
 fs/ceph/file.c  | 21 +++++----------------
 fs/ceph/inode.c |  8 +-------
 fs/ceph/super.h |  3 +--
 4 files changed, 10 insertions(+), 48 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ccdc47bd7cf0..2f4ff7e9508e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -606,7 +606,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
  */
 void ceph_add_cap(struct inode *inode,
 		  struct ceph_mds_session *session, u64 cap_id,
-		  int fmode, unsigned issued, unsigned wanted,
+		  unsigned issued, unsigned wanted,
 		  unsigned seq, unsigned mseq, u64 realmino, int flags,
 		  struct ceph_cap **new_cap)
 {
@@ -622,13 +622,6 @@ void ceph_add_cap(struct inode *inode,
 	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
 	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
 
-	/*
-	 * If we are opening the file, include file mode wanted bits
-	 * in wanted.
-	 */
-	if (fmode >= 0)
-		wanted |= ceph_caps_for_mode(fmode);
-
 	spin_lock(&session->s_gen_ttl_lock);
 	gen = session->s_cap_gen;
 	spin_unlock(&session->s_gen_ttl_lock);
@@ -753,9 +746,6 @@ void ceph_add_cap(struct inode *inode,
 	cap->issue_seq = seq;
 	cap->mseq = mseq;
 	cap->cap_gen = gen;
-
-	if (fmode >= 0)
-		__ceph_get_fmode(ci, fmode);
 }
 
 /*
@@ -3726,7 +3716,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 		/* add placeholder for the export tagert */
 		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
 		tcap = new_cap;
-		ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+		ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
 			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
 
 		if (!list_empty(&ci->i_cap_flush_list) &&
@@ -3831,7 +3821,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 	__ceph_caps_issued(ci, &issued);
 	issued |= __ceph_caps_dirty(ci);
 
-	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+	ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
 		     realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
 
 	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
@@ -4186,16 +4176,6 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
 	spin_unlock(&ci->i_ceph_lock);
 }
 
-void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
-{
-	int i;
-	int bits = (fmode << 1) | 1;
-	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
-		if (bits & (1 << i))
-			ci->i_file_by_mode[i].nr++;
-	}
-}
-
 /*
  * Drop open file reference.  If we were the last open file,
  * we may need to release capabilities to the MDS (or schedule
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f28f420bad23..60a2dfa02ba2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
 	if (isdir) {
 		struct ceph_dir_file_info *dfi =
 			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
-		if (!dfi) {
-			ceph_put_fmode(ci, fmode, 1); /* clean up */
+		if (!dfi)
 			return -ENOMEM;
-		}
 
 		file->private_data = dfi;
 		fi = &dfi->file_info;
@@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
 		dfi->readdir_cache_idx = -1;
 	} else {
 		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
-		if (!fi) {
-			ceph_put_fmode(ci, fmode, 1); /* clean up */
+		if (!fi)
 			return -ENOMEM;
-		}
 
 		file->private_data = fi;
 	}
 
+	ceph_get_fmode(ci, fmode, 1);
 	fi->fmode = fmode;
+
 	spin_lock_init(&fi->rw_contexts_lock);
 	INIT_LIST_HEAD(&fi->rw_contexts);
 	fi->meta_err = errseq_sample(&ci->i_meta_err);
@@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 	case S_IFLNK:
 		dout("init_file %p %p 0%o (symlink)\n", inode, file,
 		     inode->i_mode);
-		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
 		break;
 
 	default:
@@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 		 * we need to drop the open ref now, since we don't
 		 * have .release set to ceph_release.
 		 */
-		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
 		BUG_ON(inode->i_fop->release == ceph_release);
 
 		/* call the proper open fop */
@@ -327,7 +323,6 @@ int ceph_renew_caps(struct inode *inode, int fmode)
 	req->r_inode = inode;
 	ihold(inode);
 	req->r_num_caps = 1;
-	req->r_fmode = -1;
 
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
 	ceph_mdsc_put_request(req);
@@ -373,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
 
 	/* trivially open snapdir */
 	if (ceph_snap(inode) == CEPH_SNAPDIR) {
-		spin_lock(&ci->i_ceph_lock);
-		__ceph_get_fmode(ci, fmode);
-		spin_unlock(&ci->i_ceph_lock);
 		return ceph_init_file(inode, file, fmode);
 	}
 
@@ -393,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
 		dout("open %p fmode %d want %s issued %s using existing\n",
 		     inode, fmode, ceph_cap_string(wanted),
 		     ceph_cap_string(issued));
-		__ceph_get_fmode(ci, fmode);
+		__ceph_touch_fmode(ci, mdsc, fmode);
 		spin_unlock(&ci->i_ceph_lock);
 
 		/* adjust wanted? */
@@ -405,7 +397,6 @@ int ceph_open(struct inode *inode, struct file *file)
 		return ceph_init_file(inode, file, fmode);
 	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
 		   (ci->i_snap_caps & wanted) == wanted) {
-		__ceph_get_fmode(ci, fmode);
 		__ceph_touch_fmode(ci, mdsc, fmode);
 		spin_unlock(&ci->i_ceph_lock);
 		return ceph_init_file(inode, file, fmode);
@@ -526,8 +517,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		err = finish_open(file, dentry, ceph_open);
 	}
 out_req:
-	if (!req->r_err && req->r_target_inode)
-		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
 	ceph_mdsc_put_request(req);
 out_ctx:
 	ceph_release_acl_sec_ctx(&as_ctx);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b279bd8e168e..bb73b0c8c4d9 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -969,7 +969,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 		if (ceph_snap(inode) == CEPH_NOSNAP) {
 			ceph_add_cap(inode, session,
 				     le64_to_cpu(info->cap.cap_id),
-				     cap_fmode, info_caps,
+				     info_caps,
 				     le32_to_cpu(info->cap.wanted),
 				     le32_to_cpu(info->cap.seq),
 				     le32_to_cpu(info->cap.mseq),
@@ -994,13 +994,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			dout(" %p got snap_caps %s\n", inode,
 			     ceph_cap_string(info_caps));
 			ci->i_snap_caps |= info_caps;
-			if (cap_fmode >= 0)
-				__ceph_get_fmode(ci, cap_fmode);
 		}
-	} else if (cap_fmode >= 0) {
-		pr_warn("mds issued no caps on %llx.%llx\n",
-			   ceph_vinop(inode));
-		__ceph_get_fmode(ci, cap_fmode);
 	}
 
 	if (iinfo->inline_version > 0 &&
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 029823643b8b..1ea76466efcb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1038,7 +1038,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
 				     struct ceph_cap_reservation *ctx);
 extern void ceph_add_cap(struct inode *inode,
 			 struct ceph_mds_session *session, u64 cap_id,
-			 int fmode, unsigned issued, unsigned wanted,
+			 unsigned issued, unsigned wanted,
 			 unsigned cap, unsigned seq, u64 realmino, int flags,
 			 struct ceph_cap **new_cap);
 extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
@@ -1080,7 +1080,6 @@ extern int ceph_try_get_caps(struct inode *inode,
 			     int need, int want, bool nonblock, int *got);
 
 /* for counting open files by mode */
-extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
 extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
 extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
 extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
-- 
2.21.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 4/4] ceph: remove delay check logic from ceph_check_caps()
  2020-02-20 12:26 [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
  2020-02-20 12:26 ` [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps Yan, Zheng
  2020-02-20 12:26 ` [PATCH 3/4] ceph: simplify calling of ceph_get_fmode() Yan, Zheng
@ 2020-02-20 12:26 ` Yan, Zheng
  2020-02-20 19:57   ` Jeff Layton
  2020-02-20 13:42 ` [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient Jeff Layton
  3 siblings, 1 reply; 14+ messages in thread
From: Yan, Zheng @ 2020-02-20 12:26 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, Yan, Zheng

__ceph_caps_file_wanted() already checks 'caps_wanted_delay_min' and
'caps_wanted_delay_max'. There is no need to duplicte the logic in
ceph_check_caps() and __send_cap()

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/caps.c  | 146 ++++++++++++------------------------------------
 fs/ceph/file.c  |   5 +-
 fs/ceph/inode.c |   1 -
 fs/ceph/super.h |   8 +--
 4 files changed, 41 insertions(+), 119 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2f4ff7e9508e..39bf41d0fbdb 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
 	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
-
-	ci->i_hold_caps_min = round_jiffies(jiffies +
-					    opt->caps_wanted_delay_min * HZ);
 	ci->i_hold_caps_max = round_jiffies(jiffies +
 					    opt->caps_wanted_delay_max * HZ);
-	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
-	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+	dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+	     ci->i_hold_caps_max - jiffies);
 }
 
 /*
@@ -508,8 +505,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
  *    -> we take mdsc->cap_delay_lock
  */
 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
-				struct ceph_inode_info *ci,
-				bool set_timeout)
+				struct ceph_inode_info *ci)
 {
 	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
 	     ci->i_ceph_flags, ci->i_hold_caps_max);
@@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
 				goto no_change;
 			list_del_init(&ci->i_cap_delay_list);
 		}
-		if (set_timeout)
-			__cap_set_timeouts(mdsc, ci);
+		__cap_set_timeouts(mdsc, ci);
 		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
 no_change:
 		spin_unlock(&mdsc->cap_delay_lock);
@@ -719,7 +714,7 @@ void ceph_add_cap(struct inode *inode,
 		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
 		     ceph_cap_string(issued), ceph_cap_string(wanted),
 		     ceph_cap_string(actual_wanted));
-		__cap_delay_requeue(mdsc, ci, true);
+		__cap_delay_requeue(mdsc, ci);
 	}
 
 	if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -1299,7 +1294,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	struct cap_msg_args arg;
 	int held, revoking;
 	int wake = 0;
-	int delayed = 0;
 	int ret;
 
 	held = cap->issued | cap->implemented;
@@ -1312,28 +1306,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	     ceph_cap_string(revoking));
 	BUG_ON((retain & CEPH_CAP_PIN) == 0);
 
-	arg.session = cap->session;
-
-	/* don't release wanted unless we've waited a bit. */
-	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
-	    time_before(jiffies, ci->i_hold_caps_min)) {
-		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
-		     ceph_cap_string(cap->issued),
-		     ceph_cap_string(cap->issued & retain),
-		     ceph_cap_string(cap->mds_wanted),
-		     ceph_cap_string(want));
-		want |= cap->mds_wanted;
-		retain |= cap->issued;
-		delayed = 1;
-	}
-	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
-	if (want & ~cap->mds_wanted) {
-		/* user space may open/close single file frequently.
-		 * This avoids droping mds_wanted immediately after
-		 * requesting new mds_wanted.
-		 */
-		__cap_set_timeouts(mdsc, ci);
-	}
+	ci->i_ceph_flags &= ~CEPH_I_FLUSH;
 
 	cap->issued &= retain;  /* drop bits we don't want */
 	if (cap->implemented & ~cap->issued) {
@@ -1348,6 +1321,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	cap->implemented &= cap->issued | used;
 	cap->mds_wanted = want;
 
+	arg.session = cap->session;
 	arg.ino = ceph_vino(inode).ino;
 	arg.cid = cap->cap_id;
 	arg.follows = flushing ? ci->i_head_snapc->seq : 0;
@@ -1408,14 +1382,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 
 	ret = send_cap_msg(&arg);
 	if (ret < 0) {
-		dout("error sending cap msg, must requeue %p\n", inode);
-		delayed = 1;
+		pr_err("error sending cap msg, ino (%llx.%llx) "
+		       "flushing %s tid %llu, requeue\n",
+		       ceph_vinop(inode), ceph_cap_string(flushing),
+		       flush_tid);
+		spin_lock(&ci->i_ceph_lock);
+		__cap_delay_requeue(mdsc, ci);
+		spin_unlock(&ci->i_ceph_lock);
 	}
 
 	if (wake)
 		wake_up_all(&ci->i_cap_wq);
 
-	return delayed;
+	return ret;
 }
 
 static inline int __send_flush_snap(struct inode *inode,
@@ -1679,7 +1658,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
 	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
 	    (mask & CEPH_CAP_FILE_BUFFER))
 		dirty |= I_DIRTY_DATASYNC;
-	__cap_delay_requeue(mdsc, ci, true);
+	__cap_delay_requeue(mdsc, ci);
 	return dirty;
 }
 
@@ -1830,8 +1809,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
  * versus held caps.  Release, flush, ack revoked caps to mds as
  * appropriate.
  *
- *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- *    cap release further.
  *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
  *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
  *    further delay.
@@ -1850,17 +1827,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
 			   to avoid an infinite loop on retry */
 	struct rb_node *p;
-	int delayed = 0, sent = 0;
-	bool no_delay = flags & CHECK_CAPS_NODELAY;
 	bool queue_invalidate = false;
 	bool tried_invalidate = false;
 
-	/* if we are unmounting, flush any unused caps immediately. */
-	if (mdsc->stopping)
-		no_delay = true;
-
 	spin_lock(&ci->i_ceph_lock);
-
 	if (ci->i_ceph_flags & CEPH_I_FLUSH)
 		flags |= CHECK_CAPS_FLUSH;
 
@@ -1906,14 +1876,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	}
 
 	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
-	     " issued %s revoking %s retain %s %s%s%s\n", inode,
+	     " issued %s revoking %s retain %s %s%s\n", inode,
 	     ceph_cap_string(file_wanted),
 	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
 	     ceph_cap_string(ci->i_flushing_caps),
 	     ceph_cap_string(issued), ceph_cap_string(revoking),
 	     ceph_cap_string(retain),
 	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
-	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
 	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
 
 	/*
@@ -1921,7 +1890,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	 * have cached pages, but don't want them, then try to invalidate.
 	 * If we fail, it's because pages are locked.... try again later.
 	 */
-	if ((!no_delay || mdsc->stopping) &&
+	if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
 	    S_ISREG(inode->i_mode) &&
 	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
 	    inode->i_data.nrpages &&		/* have cached pages */
@@ -2001,21 +1970,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		if ((cap->issued & ~retain) == 0)
 			continue;     /* nope, all good */
 
-		if (no_delay)
-			goto ack;
-
-		/* delay? */
-		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
-		    time_before(jiffies, ci->i_hold_caps_max)) {
-			dout(" delaying issued %s -> %s, wanted %s -> %s\n",
-			     ceph_cap_string(cap->issued),
-			     ceph_cap_string(cap->issued & retain),
-			     ceph_cap_string(cap->mds_wanted),
-			     ceph_cap_string(want));
-			delayed++;
-			continue;
-		}
-
 ack:
 		if (session && session != cap->session) {
 			dout("oops, wrong session %p mutex\n", session);
@@ -2076,24 +2030,18 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		}
 
 		mds = cap->mds;  /* remember mds, so we don't repeat */
-		sent++;
 
 		/* __send_cap drops i_ceph_lock */
-		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
-				cap_used, want, retain, flushing,
-				flush_tid, oldest_flush_tid);
+		__send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
+			   retain, flushing, flush_tid, oldest_flush_tid);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
 
-	if (list_empty(&ci->i_cap_delay_list)) {
-	    if (delayed) {
-		    /* Reschedule delayed caps release if we delayed anything */
-		    __cap_delay_requeue(mdsc, ci, false);
-	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
-			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
-		    /* periodically re-calculate caps wanted by open files */
-		    __cap_delay_requeue(mdsc, ci, true);
-	    }
+	/* periodically re-calculate caps wanted by open files */
+	if (list_empty(&ci->i_cap_delay_list) &&
+	    (file_wanted & ~CEPH_CAP_PIN) &&
+	    !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+		__cap_delay_requeue(mdsc, ci);
 	}
 
 	spin_unlock(&ci->i_ceph_lock);
@@ -2123,7 +2071,6 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
 retry_locked:
 	if (ci->i_dirty_caps && ci->i_auth_cap) {
 		struct ceph_cap *cap = ci->i_auth_cap;
-		int delayed;
 
 		if (session != cap->session) {
 			spin_unlock(&ci->i_ceph_lock);
@@ -2152,18 +2099,10 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
 						 &oldest_flush_tid);
 
 		/* __send_cap drops i_ceph_lock */
-		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-				     CEPH_CLIENT_CAPS_SYNC,
-				     __ceph_caps_used(ci),
-				     __ceph_caps_wanted(ci),
-				     (cap->issued | cap->implemented),
-				     flushing, flush_tid, oldest_flush_tid);
-
-		if (delayed) {
-			spin_lock(&ci->i_ceph_lock);
-			__cap_delay_requeue(mdsc, ci, true);
-			spin_unlock(&ci->i_ceph_lock);
-		}
+		__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
+			   __ceph_caps_used(ci), __ceph_caps_wanted(ci),
+			   (cap->issued | cap->implemented),
+			   flushing, flush_tid, oldest_flush_tid);
 	} else {
 		if (!list_empty(&ci->i_cap_flush_list)) {
 			struct ceph_cap_flush *cf =
@@ -2363,22 +2302,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
 		if (cf->caps) {
 			dout("kick_flushing_caps %p cap %p tid %llu %s\n",
 			     inode, cap, cf->tid, ceph_cap_string(cf->caps));
-			ci->i_ceph_flags |= CEPH_I_NODELAY;
-
-			ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+			__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
 					 (cf->tid < last_snap_flush ?
 					  CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
 					  __ceph_caps_used(ci),
 					  __ceph_caps_wanted(ci),
 					  (cap->issued | cap->implemented),
 					  cf->caps, cf->tid, oldest_flush_tid);
-			if (ret) {
-				pr_err("kick_flushing_caps: error sending "
-					"cap flush, ino (%llx.%llx) "
-					"tid %llu flushing %s\n",
-					ceph_vinop(inode), cf->tid,
-					ceph_cap_string(cf->caps));
-			}
 		} else {
 			struct ceph_cap_snap *capsnap =
 					container_of(cf, struct ceph_cap_snap,
@@ -2999,7 +2929,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
 	     last ? " last" : "", put ? " put" : "");
 
-	if (last && !flushsnaps)
+	if (last)
 		ceph_check_caps(ci, 0, NULL);
 	else if (flushsnaps)
 		ceph_flush_snaps(ci, NULL);
@@ -3417,10 +3347,10 @@ static void handle_cap_grant(struct inode *inode,
 		wake_up_all(&ci->i_cap_wq);
 
 	if (check_caps == 1)
-		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
 				session);
 	else if (check_caps == 2)
-		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+		ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
 	else
 		mutex_unlock(&session->s_mutex);
 }
@@ -4095,7 +4025,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
-	int flags = CHECK_CAPS_NODELAY;
 
 	dout("check_delayed_caps\n");
 	while (1) {
@@ -4115,7 +4044,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 
 		if (inode) {
 			dout("check_delayed_caps on %p\n", inode);
-			ceph_check_caps(ci, flags, NULL);
+			ceph_check_caps(ci, 0, NULL);
 			/* avoid calling iput_final() in tick thread */
 			ceph_async_iput(inode);
 		}
@@ -4140,7 +4069,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 		ihold(inode);
 		dout("flush_dirty_caps %p\n", inode);
 		spin_unlock(&mdsc->cap_dirty_lock);
-		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+		ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
 		iput(inode);
 		spin_lock(&mdsc->cap_dirty_lock);
 	}
@@ -4161,7 +4090,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
 
 	/* queue periodic check */
 	if (bits && list_empty(&ci->i_cap_delay_list))
-		__cap_delay_requeue(mdsc, ci, true);
+		__cap_delay_requeue(mdsc, ci);
 }
 
 void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
@@ -4210,7 +4139,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
 	if (inode->i_nlink == 1) {
 		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
 
-		ci->i_ceph_flags |= CEPH_I_NODELAY;
 		if (__ceph_caps_dirty(ci)) {
 			struct ceph_mds_client *mdsc =
 				ceph_inode_to_client(inode)->mdsc;
@@ -4266,8 +4194,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 		if (force || (cap->issued & drop)) {
 			if (cap->issued & drop) {
 				int wanted = __ceph_caps_wanted(ci);
-				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
-					wanted |= cap->mds_wanted;
 				dout("encode_inode_release %p cap %p "
 				     "%s -> %s, wanted %s -> %s\n", inode, cap,
 				     ceph_cap_string(cap->issued),
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 60a2dfa02ba2..ed70bb448568 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2129,12 +2129,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
 
 	if (endoff > size) {
 		int caps_flags = 0;
-
 		/* Let the MDS know about dst file size change */
-		if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
-			caps_flags |= CHECK_CAPS_NODELAY;
 		if (ceph_inode_set_size(dst_inode, endoff))
 			caps_flags |= CHECK_CAPS_AUTHONLY;
+		if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
+			caps_flags |= CHECK_CAPS_AUTHONLY;
 		if (caps_flags)
 			ceph_check_caps(dst_ci, caps_flags, NULL);
 	}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index bb73b0c8c4d9..5c8d3b01bc5d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -471,7 +471,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_prealloc_cap_flush = NULL;
 	INIT_LIST_HEAD(&ci->i_cap_flush_list);
 	init_waitqueue_head(&ci->i_cap_wq);
-	ci->i_hold_caps_min = 0;
 	ci->i_hold_caps_max = 0;
 	INIT_LIST_HEAD(&ci->i_cap_delay_list);
 	INIT_LIST_HEAD(&ci->i_cap_snaps);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1ea76466efcb..9ddaaeefc6f0 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -170,9 +170,9 @@ struct ceph_cap {
 	struct list_head caps_item;
 };
 
-#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
-#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
+#define CHECK_CAPS_AUTHONLY   1  /* only check auth cap */
+#define CHECK_CAPS_FLUSH      2  /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL    4  /* don't invalidate pagecache */
 
 struct ceph_cap_flush {
 	u64 tid;
@@ -352,7 +352,6 @@ struct ceph_inode_info {
 	struct ceph_cap_flush *i_prealloc_cap_flush;
 	struct list_head i_cap_flush_list;
 	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
-	unsigned long i_hold_caps_min; /* jiffies */
 	unsigned long i_hold_caps_max; /* jiffies */
 	struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
 	struct ceph_cap_reservation i_cap_migration_resv;
@@ -514,7 +513,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
  * Ceph inode.
  */
 #define CEPH_I_DIR_ORDERED	(1 << 0)  /* dentries in dir are ordered */
-#define CEPH_I_NODELAY		(1 << 1)  /* do not delay cap release */
 #define CEPH_I_FLUSH		(1 << 2)  /* do not delay flush of dirty metadata */
 #define CEPH_I_POOL_PERM	(1 << 3)  /* pool rd/wr bits are valid */
 #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
-- 
2.21.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient
  2020-02-20 12:26 [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
                   ` (2 preceding siblings ...)
  2020-02-20 12:26 ` [PATCH 4/4] ceph: remove delay check logic from ceph_check_caps() Yan, Zheng
@ 2020-02-20 13:42 ` Jeff Layton
  3 siblings, 0 replies; 14+ messages in thread
From: Jeff Layton @ 2020-02-20 13:42 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> Not only after mds closes session and caps get dropped. This is
> preparation patch for not requesting caps for idle open files.
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>

In this kind of series, it's be nice to add a cover letter that explains
the problem you're trying to solve and how you intend to solve it. I'm
lurking on the bug that I know this involves, but not everyone on the
public mailing list will be familiar with it.

> ---
>  fs/ceph/caps.c       | 36 +++++++++++++++---------------------
>  fs/ceph/mds_client.c |  5 -----
>  fs/ceph/super.h      |  1 -
>  3 files changed, 15 insertions(+), 27 deletions(-)
> 

I do like the diffstat, and that fact that it removes a chunk of code
that I guess is no longer needed.

> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index d05717397c2a..293920d013ff 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -2659,6 +2659,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		}
>  	} else {
>  		int session_readonly = false;
> +		int mds_wanted;
>  		if (ci->i_auth_cap &&
>  		    (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
>  			struct ceph_mds_session *s = ci->i_auth_cap->session;
> @@ -2667,32 +2668,27 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  			spin_unlock(&s->s_cap_lock);
>  		}
>  		if (session_readonly) {
> -			dout("get_cap_refs %p needed %s but mds%d readonly\n",
> +			dout("get_cap_refs %p need %s but mds%d readonly\n",
>  			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
>  			ret = -EROFS;
>  			goto out_unlock;
>  		}
>  
> -		if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
> -			int mds_wanted;
> -			if (READ_ONCE(mdsc->fsc->mount_state) ==
> -			    CEPH_MOUNT_SHUTDOWN) {
> -				dout("get_cap_refs %p forced umount\n", inode);
> -				ret = -EIO;
> -				goto out_unlock;
> -			}
> -			mds_wanted = __ceph_caps_mds_wanted(ci, false);
> -			if (need & ~(mds_wanted & need)) {
> -				dout("get_cap_refs %p caps were dropped"
> -				     " (session killed?)\n", inode);
> -				ret = -ESTALE;
> -				goto out_unlock;
> -			}
> -			if (!(file_wanted & ~mds_wanted))
> -				ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
> +		if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
> +			dout("get_cap_refs %p forced umount\n", inode);
> +			ret = -EIO;
> +			goto out_unlock;
> +		}
> +		mds_wanted = __ceph_caps_mds_wanted(ci, false);
> +		if (need & ~mds_wanted) {
> +			dout("get_cap_refs %p need %s > mds_wanted %s\n",
> +			     inode, ceph_cap_string(need),
> +			     ceph_cap_string(mds_wanted));
> +			ret = -ESTALE;
> +			goto out_unlock;

Not directly related to your patch since it did this before, but why
does this cause an -ESTALE return here? ceph seems to have repurposed
the meaning of ESTALE (which usually means "I can't find this inode").

Oh, ok -- it looks like it's being used as an error when the session has
been reconnected.

I think we ought to rework ceph to use a different, more distinct error
code for this purpose that isn't used by exportfs code. I worry that
this could leak into higher layers, when it shouldn't.

include/linux/errno.h has an existing pile of errors to choose from.
Maybe EBADCOOKIE instead? That's only used by the NFS client so it
should be safe for this.

I can look at spinning up a patch for that if you don't want to do it
here.


>  		}
>  
> -		dout("get_cap_refs %p have %s needed %s\n", inode,
> +		dout("get_cap_refs %p have %s need %s\n", inode,
>  		     ceph_cap_string(have), ceph_cap_string(need));
>  	}
>  out_unlock:
> @@ -3646,8 +3642,6 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
>  		goto out_unlock;
>  
>  	if (target < 0) {
> -		if (cap->mds_wanted | cap->issued)
> -			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
>  		__ceph_remove_cap(cap, false);
>  		goto out_unlock;
>  	}
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index fab9d6461a65..98d746b3bb53 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1411,8 +1411,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  	dout("removing cap %p, ci is %p, inode is %p\n",
>  	     cap, ci, &ci->vfs_inode);
>  	spin_lock(&ci->i_ceph_lock);
> -	if (cap->mds_wanted | cap->issued)
> -		ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
>  	__ceph_remove_cap(cap, false);
>  	if (!ci->i_auth_cap) {
>  		struct ceph_cap_flush *cf;
> @@ -1578,9 +1576,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
>  			/* mds did not re-issue stale cap */
>  			spin_lock(&ci->i_ceph_lock);
>  			cap->issued = cap->implemented = CEPH_CAP_PIN;
> -			/* make sure mds knows what we want */
> -			if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
> -				ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
>  			spin_unlock(&ci->i_ceph_lock);
>  		}
>  	} else if (ev == FORCE_RO) {
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 37dc1ac8f6c3..d370f89df358 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -517,7 +517,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
>  #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
>  #define CEPH_I_POOL_WR		(1 << 5)  /* can write to pool */
>  #define CEPH_I_SEC_INITED	(1 << 6)  /* security initialized */
> -#define CEPH_I_CAP_DROPPED	(1 << 7)  /* caps were forcibly dropped */

This does a lot more than is explained in the changelog. Dropping a flag
wholesale should warrant explaning why. What did this mean before and
why is it no longer relevant?

>  #define CEPH_I_KICK_FLUSH	(1 << 8)  /* kick flushing caps */
>  #define CEPH_I_FLUSH_SNAPS	(1 << 9)  /* need flush snapss */
>  #define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */

Let's collapse down the number range since you're removing the flag.
There should be no ABI concerns here.
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps
  2020-02-20 12:26 ` [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps Yan, Zheng
@ 2020-02-20 14:14   ` Jeff Layton
  2020-02-20 14:53     ` Yan, Zheng
  2020-02-20 19:06   ` Jeff Layton
  1 sibling, 1 reply; 14+ messages in thread
From: Jeff Layton @ 2020-02-20 14:14 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> When getting caps for read/write, update corresponding file's last
> read/write. If a file hasn't been read/write for 'caps_wanted_delay_max'
> seconds, ignore the file when calculating wanted caps.
> 

Please explain in the changelog how the new info is to be stored, given
that it is quite complex.

> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/caps.c               | 149 ++++++++++++++++++++++++-----------
>  fs/ceph/file.c               |  23 +++---
>  fs/ceph/inode.c              |  15 +++-
>  fs/ceph/ioctl.c              |   4 +-
>  fs/ceph/super.h              |  16 +++-
>  include/linux/ceph/ceph_fs.h |   1 +
>  6 files changed, 145 insertions(+), 63 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 293920d013ff..ccdc47bd7cf0 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -971,18 +971,44 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
>  	return used;
>  }
>  
> +#define FMODE_WAIT_BIAS 1000
> +
>  /*
>   * wanted, by virtue of open file modes
>   */
>  int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
>  {
> +	struct ceph_mount_options *opt =
> +		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> +	unsigned long used_cutoff =
> +		round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
> +	unsigned long idle_cutoff =
> +		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
>  	int i, bits = 0;
> +
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> -		if (ci->i_nr_by_mode[i])
> +		if (ci->i_file_by_mode[i].nr >= FMODE_WAIT_BIAS) {
> +			/* there are cap waiters or lots of open files */
>  			bits |= 1 << i;
> +		} else if (ci->i_file_by_mode[i].nr > 0) {
> +			if (i ==  CEPH_FILE_MODE_PIN ||
> +			    time_after(ci->i_file_by_mode[i].last_used,
> +				       used_cutoff))
> +				bits |= 1 << i;
> +		} else if ((ci->i_file_by_mode[i].last_used & 1)) {
> +			if (time_after(ci->i_file_by_mode[i].last_used,
> +				       idle_cutoff)) {
> +				bits |= 1 << i;
> +			} else {
> +				ci->i_file_by_mode[i].last_used &= ~1UL;
> +			}
> +		}
>  	}
>  	if (bits == 0)
>  		return 0;
> +	if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
> +		return 0;
> +
>  	return ceph_caps_for_mode(bits >> 1);
>  }
>  
> @@ -1021,14 +1047,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
>  	return mds_wanted;
>  }
>  
> -/*
> - * called under i_ceph_lock
> - */
> -static int __ceph_is_single_caps(struct ceph_inode_info *ci)
> -{
> -	return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
> -}
> -
>  int ceph_is_any_caps(struct inode *inode)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> @@ -1856,10 +1874,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	if (ci->i_ceph_flags & CEPH_I_FLUSH)
>  		flags |= CHECK_CAPS_FLUSH;
>  
> -	if (!(flags & CHECK_CAPS_AUTHONLY) ||
> -	    (ci->i_auth_cap && __ceph_is_single_caps(ci)))
> -		__cap_delay_cancel(mdsc, ci);
> -
>  	goto retry_locked;
>  retry:
>  	spin_lock(&ci->i_ceph_lock);
> @@ -2081,9 +2095,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		goto retry; /* retake i_ceph_lock and restart our cap scan. */
>  	}
>  
> -	/* Reschedule delayed caps release if we delayed anything */
> -	if (delayed)
> -		__cap_delay_requeue(mdsc, ci, false);
> +	if (list_empty(&ci->i_cap_delay_list)) {
> +	    if (delayed) {
> +		    /* Reschedule delayed caps release if we delayed anything */
> +		    __cap_delay_requeue(mdsc, ci, false);
> +	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> +			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> +		    /* periodically re-calculate caps wanted by open files */
> +		    __cap_delay_requeue(mdsc, ci, true);
> +	    }
> +	}
>  
>  	spin_unlock(&ci->i_ceph_lock);
>  
> @@ -2549,8 +2570,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
>   * FIXME: how does a 0 return differ from -EAGAIN?
>   */
>  enum {
> -	NON_BLOCKING	= 1,
> -	CHECK_FILELOCK	= 2,
> +	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
> +	NON_BLOCKING	= (1 << 8),
> +	CHECK_FILELOCK	= (1 << 9),
>  };
>  
>  static int try_get_cap_refs(struct inode *inode, int need, int want,
> @@ -2560,7 +2582,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	int ret = 0;
>  	int have, implemented;
> -	int file_wanted;
>  	bool snap_rwsem_locked = false;
>  
>  	dout("get_cap_refs %p need %s want %s\n", inode,
> @@ -2576,15 +2597,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		goto out_unlock;
>  	}
>  
> -	/* make sure file is actually open */
> -	file_wanted = __ceph_caps_file_wanted(ci);
> -	if ((file_wanted & need) != need) {
> -		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
> -		     ceph_cap_string(need), ceph_cap_string(file_wanted));
> -		ret = -EBADF;
> -		goto out_unlock;
> -	}
> -
>  	/* finish pending truncate */
>  	while (ci->i_truncate_pending) {
>  		spin_unlock(&ci->i_ceph_lock);
> @@ -2692,6 +2704,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		     ceph_cap_string(have), ceph_cap_string(need));
>  	}
>  out_unlock:
> +
> +	__ceph_touch_fmode(ci, mdsc, flags);
> +
>  	spin_unlock(&ci->i_ceph_lock);
>  	if (snap_rwsem_locked)
>  		up_read(&mdsc->snap_rwsem);
> @@ -2729,10 +2744,22 @@ static void check_max_size(struct inode *inode, loff_t endoff)
>  		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
>  }
>  
> +static inline int get_used_file_mode(int need, int want)
> +{
> +	int fmode = 0;
> +	if (need & CEPH_CAP_FILE_RD)
> +		fmode |= CEPH_FILE_MODE_RD;
> +	if (need & CEPH_CAP_FILE_WR)
> +		fmode |= CEPH_FILE_MODE_WR;
> +	if (want & CEPH_CAP_FILE_LAZYIO)
> +		fmode |= CEPH_FILE_MODE_LAZY;
> +	return fmode;
> +}
> +
>  int ceph_try_get_caps(struct inode *inode, int need, int want,
>  		      bool nonblock, int *got)
>  {
> -	int ret;
> +	int ret, flags;
>  
>  	BUG_ON(need & ~CEPH_CAP_FILE_RD);
>  	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
> @@ -2740,8 +2767,11 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
>  	if (ret < 0)
>  		return ret;
>  
> -	ret = try_get_cap_refs(inode, need, want, 0,
> -			       (nonblock ? NON_BLOCKING : 0), got);
> +	flags = get_used_file_mode(need, want);
> +	if (nonblock)
> +		flags |= NON_BLOCKING;
> +
> +	ret = try_get_cap_refs(inode, need, want, 0, flags, got);
>  	return ret == -EAGAIN ? 0 : ret;
>  }
>  
> @@ -2767,11 +2797,15 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  	    fi->filp_gen != READ_ONCE(fsc->filp_gen))
>  		return -EBADF;
>  
> +	flags = get_used_file_mode(need, want);
> +
>  	while (true) {
>  		if (endoff > 0)
>  			check_max_size(inode, endoff);
>  
> -		flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
> +		flags &= CEPH_FILE_MODE_MASK;
> +		if (atomic_read(&fi->num_locks))
> +			flags |= CHECK_FILELOCK;
>  		_got = 0;
>  		ret = try_get_cap_refs(inode, need, want, endoff,
>  				       flags, &_got);
> @@ -2791,6 +2825,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			list_add(&cw.list, &mdsc->cap_wait_list);
>  			spin_unlock(&mdsc->caps_list_lock);
>  
> +			/* make sure used fmode not timeout */
> +			ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
>  			add_wait_queue(&ci->i_cap_wq, &wait);
>  
>  			flags |= NON_BLOCKING;
> @@ -2804,6 +2840,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			}
>  
>  			remove_wait_queue(&ci->i_cap_wq, &wait);
> +			ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
>  
>  			spin_lock(&mdsc->caps_list_lock);
>  			list_del(&cw.list);
> @@ -2823,7 +2860,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  		if (ret < 0) {
>  			if (ret == -ESTALE) {
>  				/* session was killed, try renew caps */
> -				ret = ceph_renew_caps(inode);
> +				ret = ceph_renew_caps(inode, flags);
>  				if (ret == 0)
>  					continue;
>  			}
> @@ -4121,13 +4158,41 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
>  	dout("flush_dirty_caps done\n");
>  }
>  
> +void __ceph_touch_fmode(struct ceph_inode_info *ci,
> +			struct ceph_mds_client *mdsc, int fmode)
> +{
> +	int i;
> +	int bits = (fmode << 1);
> +	unsigned long now = jiffies | 1;
> +	for (i = 1; i < CEPH_FILE_MODE_BITS; i++) {
> +		if (bits & (1 << i))
> +			ci->i_file_by_mode[i].last_used = now;
> +	}
> +
> +	/* queue periodic check */
> +	if (bits && list_empty(&ci->i_cap_delay_list))
> +		__cap_delay_requeue(mdsc, ci, true);
> +}
> +
> +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> +{
> +	int i;
> +	int bits = (fmode << 1) | 1;
> +	spin_lock(&ci->i_ceph_lock);
> +	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> +		if (bits & (1 << i))
> +			ci->i_file_by_mode[i].nr += count;
> +	}
> +	spin_unlock(&ci->i_ceph_lock);
> +}
> +
>  void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
>  {
>  	int i;
>  	int bits = (fmode << 1) | 1;
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
>  		if (bits & (1 << i))
> -			ci->i_nr_by_mode[i]++;
> +			ci->i_file_by_mode[i].nr++;
>  	}
>  }
>  
> @@ -4136,26 +4201,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
>   * we may need to release capabilities to the MDS (or schedule
>   * their delayed release).
>   */
> -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
> +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
>  {
> -	int i, last = 0;
> +	int i;
>  	int bits = (fmode << 1) | 1;
>  	spin_lock(&ci->i_ceph_lock);
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
>  		if (bits & (1 << i)) {
> -			BUG_ON(ci->i_nr_by_mode[i] == 0);
> -			if (--ci->i_nr_by_mode[i] == 0)
> -				last++;
> +			BUG_ON(ci->i_file_by_mode[i].nr < count);
> +			ci->i_file_by_mode[i].nr -= count;
>  		}
>  	}
> -	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
> -	     &ci->vfs_inode, fmode,
> -	     ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
> -	     ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
>  	spin_unlock(&ci->i_ceph_lock);
> -
> -	if (last && ci->i_vino.snap == CEPH_NOSNAP)
> -		ceph_check_caps(ci, 0, NULL);
>  }
>  
>  /*
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 7e0190b1f821..f28f420bad23 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  		struct ceph_dir_file_info *dfi =
>  			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
>  		if (!dfi) {
> -			ceph_put_fmode(ci, fmode); /* clean up */
> +			ceph_put_fmode(ci, fmode, 1); /* clean up */
>  			return -ENOMEM;
>  		}
>  
> @@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  	} else {
>  		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
>  		if (!fi) {
> -			ceph_put_fmode(ci, fmode); /* clean up */
> +			ceph_put_fmode(ci, fmode, 1); /* clean up */
>  			return -ENOMEM;
>  		}
>  
> @@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  	case S_IFLNK:
>  		dout("init_file %p %p 0%o (symlink)\n", inode, file,
>  		     inode->i_mode);
> -		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> +		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		break;
>  
>  	default:
> @@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  		 * we need to drop the open ref now, since we don't
>  		 * have .release set to ceph_release.
>  		 */
> -		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> +		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		BUG_ON(inode->i_fop->release == ceph_release);
>  
>  		/* call the proper open fop */
> @@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  /*
>   * try renew caps after session gets killed.
>   */
> -int ceph_renew_caps(struct inode *inode)
> +int ceph_renew_caps(struct inode *inode, int fmode)
>  {
> -	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	struct ceph_inode_info *ci = ceph_inode(inode);
>  	struct ceph_mds_request *req;
>  	int err, flags, wanted;
>  
>  	spin_lock(&ci->i_ceph_lock);
> +	__ceph_touch_fmode(ci, mdsc, fmode);
>  	wanted = __ceph_caps_file_wanted(ci);
>  	if (__ceph_is_any_real_caps(ci) &&
>  	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
> @@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file)
>  	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
>  		   (ci->i_snap_caps & wanted) == wanted) {
>  		__ceph_get_fmode(ci, fmode);
> +		__ceph_touch_fmode(ci, mdsc, fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  		return ceph_init_file(inode, file, fmode);
>  	}
> @@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  	}
>  out_req:
>  	if (!req->r_err && req->r_target_inode)
> -		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
> +		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
>  	ceph_mdsc_put_request(req);
>  out_ctx:
>  	ceph_release_acl_sec_ctx(&as_ctx);
> @@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file)
>  		dout("release inode %p dir file %p\n", inode, file);
>  		WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
>  
> -		ceph_put_fmode(ci, dfi->file_info.fmode);
> +		ceph_put_fmode(ci, dfi->file_info.fmode, 1);
>  
>  		if (dfi->last_readdir)
>  			ceph_mdsc_put_request(dfi->last_readdir);
> @@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file)
>  		dout("release inode %p regular file %p\n", inode, file);
>  		WARN_ON(!list_empty(&fi->rw_contexts));
>  
> -		ceph_put_fmode(ci, fi->fmode);
> +		ceph_put_fmode(ci, fi->fmode, 1);
> +
>  		kmem_cache_free(ceph_file_cachep, fi);
>  	}
>  
> @@ -1560,7 +1563,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		if (dirty)
>  			__mark_inode_dirty(inode, dirty);
>  		if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
> -			ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
> +			ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
>  	}
>  
>  	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 094b8fc37787..b279bd8e168e 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -478,8 +478,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	ci->i_head_snapc = NULL;
>  	ci->i_snap_caps = 0;
>  
> -	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
> -		ci->i_nr_by_mode[i] = 0;
> +	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> +		ci->i_file_by_mode[i].nr = 0;
> +		ci->i_file_by_mode[i].last_used = 0;
> +	}
>  
>  	mutex_init(&ci->i_truncate_mutex);
>  	ci->i_truncate_seq = 0;
> @@ -637,7 +639,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
>  			if ((issued & (CEPH_CAP_FILE_CACHE|
>  				       CEPH_CAP_FILE_BUFFER)) ||
>  			    mapping_mapped(inode->i_mapping) ||
> -			    __ceph_caps_file_wanted(ci)) {
> +			    __ceph_is_file_opened(ci)) {
>  				ci->i_truncate_pending++;
>  				queue_trunc = 1;
>  			}
> @@ -1010,6 +1012,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>  			fill_inline = true;
>  	}
>  
> +	if (cap_fmode >= 0) {
> +		if (!info_caps)
> +			pr_warn("mds issued no caps on %llx.%llx\n",
> +				ceph_vinop(inode));
> +		__ceph_touch_fmode(ci, mdsc, cap_fmode);
> +	}
> +
>  	spin_unlock(&ci->i_ceph_lock);
>  
>  	if (fill_inline)
> diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> index c90f03beb15d..da0ee54ae5bc 100644
> --- a/fs/ceph/ioctl.c
> +++ b/fs/ceph/ioctl.c
> @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
>  	struct ceph_file_info *fi = file->private_data;
>  	struct inode *inode = file_inode(file);
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  
>  	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
>  		spin_lock(&ci->i_ceph_lock);
>  		fi->fmode |= CEPH_FILE_MODE_LAZY;
> -		ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
> +		ci->i_file_by_mode[ffs(CEPH_FILE_MODE_LAZY)].nr++;
> +		__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_LAZY);
>  		spin_unlock(&ci->i_ceph_lock);
>  		dout("ioctl_layzio: file %p marked lazy\n", file);
>  
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index d370f89df358..029823643b8b 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -361,7 +361,10 @@ struct ceph_inode_info {
>  						    dirty|flushing caps */
>  	unsigned i_snap_caps;           /* cap bits for snapped files */
>  
> -	int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
> +	struct {
> +		int nr;
> +		unsigned long last_used;
> +	} i_file_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
>  

Ok, so we're growing ceph_inode_info by 40 bytes here (on 64-bit arch).

That's quite a bit, actually, but it turns out that there are 32 bytes
worth of holes in ceph_inode_info now. It'd be good to reorganize the
struct for better packing before you do this, so that this at least
won't make memory consumption too much worse.

There other ways we could approach this too that would be more space
efficient. We don't really need to keep a timestamp for each mode bit.
All we're really interested in is what modes were used in the last time
interval.

We could keep an active and inactive set of CEPH_FILE_MODE bits (which
are just a single byte each), and a timestamp representing the switch
between the two.

As we use the file, we'd set bits in the active mask if the timestamp is
less than half the time interval old. If it's more than half the
interval, copy the active mask to the inactive one and zero out the
active mask first.

When you go to check what modes have been used you can do the switch
again first if the timestamp is too old. To see what bits were actually
used, you just logically or the active and inactive sets together.

That would take a lot less space per inode.

>  	struct mutex i_truncate_mutex;
>  	u32 i_truncate_seq;        /* last truncate to smaller size */
> @@ -673,6 +676,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
>  extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
>  extern int __ceph_caps_used(struct ceph_inode_info *ci);
>  
> +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
> +{
> +	return ci->i_file_by_mode[0].nr;
> +}
>  extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
>  extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
>  
> @@ -1074,7 +1081,10 @@ extern int ceph_try_get_caps(struct inode *inode,
>  
>  /* for counting open files by mode */
>  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
> +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
> +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
> +extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
> +			       struct ceph_mds_client *mdsc, int fmode);
>  
>  /* addr.c */
>  extern const struct address_space_operations ceph_aops;
> @@ -1086,7 +1096,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
>  /* file.c */
>  extern const struct file_operations ceph_file_fops;
>  
> -extern int ceph_renew_caps(struct inode *inode);
> +extern int ceph_renew_caps(struct inode *inode, int fmode);
>  extern int ceph_open(struct inode *inode, struct file *file);
>  extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  			    struct file *file, unsigned flags, umode_t mode);
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index cb21c5cf12c3..8017130a08a1 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -564,6 +564,7 @@ struct ceph_filelock {
>  #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
>  #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
>  #define CEPH_FILE_MODE_BITS       4
> +#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
>  
>  int ceph_flags_to_mode(int flags);
>  

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps
  2020-02-20 14:14   ` Jeff Layton
@ 2020-02-20 14:53     ` Yan, Zheng
  2020-02-20 16:20       ` Jeff Layton
  0 siblings, 1 reply; 14+ messages in thread
From: Yan, Zheng @ 2020-02-20 14:53 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Yan, Zheng, ceph-devel

On Thu, Feb 20, 2020 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> > When getting caps for read/write, update corresponding file's last
> > read/write. If a file hasn't been read/write for 'caps_wanted_delay_max'
> > seconds, ignore the file when calculating wanted caps.
> >
>
> Please explain in the changelog how the new info is to be stored, given
> that it is quite complex.
>
> > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > ---
> >  fs/ceph/caps.c               | 149 ++++++++++++++++++++++++-----------
> >  fs/ceph/file.c               |  23 +++---
> >  fs/ceph/inode.c              |  15 +++-
> >  fs/ceph/ioctl.c              |   4 +-
> >  fs/ceph/super.h              |  16 +++-
> >  include/linux/ceph/ceph_fs.h |   1 +
> >  6 files changed, 145 insertions(+), 63 deletions(-)
> >
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index 293920d013ff..ccdc47bd7cf0 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -971,18 +971,44 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
> >       return used;
> >  }
> >
> > +#define FMODE_WAIT_BIAS 1000
> > +
> >  /*
> >   * wanted, by virtue of open file modes
> >   */
> >  int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
> >  {
> > +     struct ceph_mount_options *opt =
> > +             ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> > +     unsigned long used_cutoff =
> > +             round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
> > +     unsigned long idle_cutoff =
> > +             round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
> >       int i, bits = 0;
> > +
> >       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > -             if (ci->i_nr_by_mode[i])
> > +             if (ci->i_file_by_mode[i].nr >= FMODE_WAIT_BIAS) {
> > +                     /* there are cap waiters or lots of open files */
> >                       bits |= 1 << i;
> > +             } else if (ci->i_file_by_mode[i].nr > 0) {
> > +                     if (i ==  CEPH_FILE_MODE_PIN ||
> > +                         time_after(ci->i_file_by_mode[i].last_used,
> > +                                    used_cutoff))
> > +                             bits |= 1 << i;
> > +             } else if ((ci->i_file_by_mode[i].last_used & 1)) {
> > +                     if (time_after(ci->i_file_by_mode[i].last_used,
> > +                                    idle_cutoff)) {
> > +                             bits |= 1 << i;
> > +                     } else {
> > +                             ci->i_file_by_mode[i].last_used &= ~1UL;
> > +                     }
> > +             }
> >       }
> >       if (bits == 0)
> >               return 0;
> > +     if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
> > +             return 0;
> > +
> >       return ceph_caps_for_mode(bits >> 1);
> >  }
> >
> > @@ -1021,14 +1047,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
> >       return mds_wanted;
> >  }
> >
> > -/*
> > - * called under i_ceph_lock
> > - */
> > -static int __ceph_is_single_caps(struct ceph_inode_info *ci)
> > -{
> > -     return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
> > -}
> > -
> >  int ceph_is_any_caps(struct inode *inode)
> >  {
> >       struct ceph_inode_info *ci = ceph_inode(inode);
> > @@ -1856,10 +1874,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >       if (ci->i_ceph_flags & CEPH_I_FLUSH)
> >               flags |= CHECK_CAPS_FLUSH;
> >
> > -     if (!(flags & CHECK_CAPS_AUTHONLY) ||
> > -         (ci->i_auth_cap && __ceph_is_single_caps(ci)))
> > -             __cap_delay_cancel(mdsc, ci);
> > -
> >       goto retry_locked;
> >  retry:
> >       spin_lock(&ci->i_ceph_lock);
> > @@ -2081,9 +2095,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >               goto retry; /* retake i_ceph_lock and restart our cap scan. */
> >       }
> >
> > -     /* Reschedule delayed caps release if we delayed anything */
> > -     if (delayed)
> > -             __cap_delay_requeue(mdsc, ci, false);
> > +     if (list_empty(&ci->i_cap_delay_list)) {
> > +         if (delayed) {
> > +                 /* Reschedule delayed caps release if we delayed anything */
> > +                 __cap_delay_requeue(mdsc, ci, false);
> > +         } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> > +                     !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> > +                 /* periodically re-calculate caps wanted by open files */
> > +                 __cap_delay_requeue(mdsc, ci, true);
> > +         }
> > +     }
> >
> >       spin_unlock(&ci->i_ceph_lock);
> >
> > @@ -2549,8 +2570,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
> >   * FIXME: how does a 0 return differ from -EAGAIN?
> >   */
> >  enum {
> > -     NON_BLOCKING    = 1,
> > -     CHECK_FILELOCK  = 2,
> > +     /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
> > +     NON_BLOCKING    = (1 << 8),
> > +     CHECK_FILELOCK  = (1 << 9),
> >  };
> >
> >  static int try_get_cap_refs(struct inode *inode, int need, int want,
> > @@ -2560,7 +2582,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> >       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> >       int ret = 0;
> >       int have, implemented;
> > -     int file_wanted;
> >       bool snap_rwsem_locked = false;
> >
> >       dout("get_cap_refs %p need %s want %s\n", inode,
> > @@ -2576,15 +2597,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> >               goto out_unlock;
> >       }
> >
> > -     /* make sure file is actually open */
> > -     file_wanted = __ceph_caps_file_wanted(ci);
> > -     if ((file_wanted & need) != need) {
> > -             dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
> > -                  ceph_cap_string(need), ceph_cap_string(file_wanted));
> > -             ret = -EBADF;
> > -             goto out_unlock;
> > -     }
> > -
> >       /* finish pending truncate */
> >       while (ci->i_truncate_pending) {
> >               spin_unlock(&ci->i_ceph_lock);
> > @@ -2692,6 +2704,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> >                    ceph_cap_string(have), ceph_cap_string(need));
> >       }
> >  out_unlock:
> > +
> > +     __ceph_touch_fmode(ci, mdsc, flags);
> > +
> >       spin_unlock(&ci->i_ceph_lock);
> >       if (snap_rwsem_locked)
> >               up_read(&mdsc->snap_rwsem);
> > @@ -2729,10 +2744,22 @@ static void check_max_size(struct inode *inode, loff_t endoff)
> >               ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
> >  }
> >
> > +static inline int get_used_file_mode(int need, int want)
> > +{
> > +     int fmode = 0;
> > +     if (need & CEPH_CAP_FILE_RD)
> > +             fmode |= CEPH_FILE_MODE_RD;
> > +     if (need & CEPH_CAP_FILE_WR)
> > +             fmode |= CEPH_FILE_MODE_WR;
> > +     if (want & CEPH_CAP_FILE_LAZYIO)
> > +             fmode |= CEPH_FILE_MODE_LAZY;
> > +     return fmode;
> > +}
> > +
> >  int ceph_try_get_caps(struct inode *inode, int need, int want,
> >                     bool nonblock, int *got)
> >  {
> > -     int ret;
> > +     int ret, flags;
> >
> >       BUG_ON(need & ~CEPH_CAP_FILE_RD);
> >       BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
> > @@ -2740,8 +2767,11 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
> >       if (ret < 0)
> >               return ret;
> >
> > -     ret = try_get_cap_refs(inode, need, want, 0,
> > -                            (nonblock ? NON_BLOCKING : 0), got);
> > +     flags = get_used_file_mode(need, want);
> > +     if (nonblock)
> > +             flags |= NON_BLOCKING;
> > +
> > +     ret = try_get_cap_refs(inode, need, want, 0, flags, got);
> >       return ret == -EAGAIN ? 0 : ret;
> >  }
> >
> > @@ -2767,11 +2797,15 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >           fi->filp_gen != READ_ONCE(fsc->filp_gen))
> >               return -EBADF;
> >
> > +     flags = get_used_file_mode(need, want);
> > +
> >       while (true) {
> >               if (endoff > 0)
> >                       check_max_size(inode, endoff);
> >
> > -             flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
> > +             flags &= CEPH_FILE_MODE_MASK;
> > +             if (atomic_read(&fi->num_locks))
> > +                     flags |= CHECK_FILELOCK;
> >               _got = 0;
> >               ret = try_get_cap_refs(inode, need, want, endoff,
> >                                      flags, &_got);
> > @@ -2791,6 +2825,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >                       list_add(&cw.list, &mdsc->cap_wait_list);
> >                       spin_unlock(&mdsc->caps_list_lock);
> >
> > +                     /* make sure used fmode not timeout */
> > +                     ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
> >                       add_wait_queue(&ci->i_cap_wq, &wait);
> >
> >                       flags |= NON_BLOCKING;
> > @@ -2804,6 +2840,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >                       }
> >
> >                       remove_wait_queue(&ci->i_cap_wq, &wait);
> > +                     ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
> >
> >                       spin_lock(&mdsc->caps_list_lock);
> >                       list_del(&cw.list);
> > @@ -2823,7 +2860,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >               if (ret < 0) {
> >                       if (ret == -ESTALE) {
> >                               /* session was killed, try renew caps */
> > -                             ret = ceph_renew_caps(inode);
> > +                             ret = ceph_renew_caps(inode, flags);
> >                               if (ret == 0)
> >                                       continue;
> >                       }
> > @@ -4121,13 +4158,41 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
> >       dout("flush_dirty_caps done\n");
> >  }
> >
> > +void __ceph_touch_fmode(struct ceph_inode_info *ci,
> > +                     struct ceph_mds_client *mdsc, int fmode)
> > +{
> > +     int i;
> > +     int bits = (fmode << 1);
> > +     unsigned long now = jiffies | 1;
> > +     for (i = 1; i < CEPH_FILE_MODE_BITS; i++) {
> > +             if (bits & (1 << i))
> > +                     ci->i_file_by_mode[i].last_used = now;
> > +     }
> > +
> > +     /* queue periodic check */
> > +     if (bits && list_empty(&ci->i_cap_delay_list))
> > +             __cap_delay_requeue(mdsc, ci, true);
> > +}
> > +
> > +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> > +{
> > +     int i;
> > +     int bits = (fmode << 1) | 1;
> > +     spin_lock(&ci->i_ceph_lock);
> > +     for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > +             if (bits & (1 << i))
> > +                     ci->i_file_by_mode[i].nr += count;
> > +     }
> > +     spin_unlock(&ci->i_ceph_lock);
> > +}
> > +
> >  void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> >  {
> >       int i;
> >       int bits = (fmode << 1) | 1;
> >       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> >               if (bits & (1 << i))
> > -                     ci->i_nr_by_mode[i]++;
> > +                     ci->i_file_by_mode[i].nr++;
> >       }
> >  }
> >
> > @@ -4136,26 +4201,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> >   * we may need to release capabilities to the MDS (or schedule
> >   * their delayed release).
> >   */
> > -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
> > +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
> >  {
> > -     int i, last = 0;
> > +     int i;
> >       int bits = (fmode << 1) | 1;
> >       spin_lock(&ci->i_ceph_lock);
> >       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> >               if (bits & (1 << i)) {
> > -                     BUG_ON(ci->i_nr_by_mode[i] == 0);
> > -                     if (--ci->i_nr_by_mode[i] == 0)
> > -                             last++;
> > +                     BUG_ON(ci->i_file_by_mode[i].nr < count);
> > +                     ci->i_file_by_mode[i].nr -= count;
> >               }
> >       }
> > -     dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
> > -          &ci->vfs_inode, fmode,
> > -          ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
> > -          ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
> >       spin_unlock(&ci->i_ceph_lock);
> > -
> > -     if (last && ci->i_vino.snap == CEPH_NOSNAP)
> > -             ceph_check_caps(ci, 0, NULL);
> >  }
> >
> >  /*
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 7e0190b1f821..f28f420bad23 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >               struct ceph_dir_file_info *dfi =
> >                       kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
> >               if (!dfi) {
> > -                     ceph_put_fmode(ci, fmode); /* clean up */
> > +                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> >                       return -ENOMEM;
> >               }
> >
> > @@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >       } else {
> >               fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> >               if (!fi) {
> > -                     ceph_put_fmode(ci, fmode); /* clean up */
> > +                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> >                       return -ENOMEM;
> >               }
> >
> > @@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> >       case S_IFLNK:
> >               dout("init_file %p %p 0%o (symlink)\n", inode, file,
> >                    inode->i_mode);
> > -             ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > +             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> >               break;
> >
> >       default:
> > @@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> >                * we need to drop the open ref now, since we don't
> >                * have .release set to ceph_release.
> >                */
> > -             ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > +             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> >               BUG_ON(inode->i_fop->release == ceph_release);
> >
> >               /* call the proper open fop */
> > @@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> >  /*
> >   * try renew caps after session gets killed.
> >   */
> > -int ceph_renew_caps(struct inode *inode)
> > +int ceph_renew_caps(struct inode *inode, int fmode)
> >  {
> > -     struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> > +     struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> >       struct ceph_inode_info *ci = ceph_inode(inode);
> >       struct ceph_mds_request *req;
> >       int err, flags, wanted;
> >
> >       spin_lock(&ci->i_ceph_lock);
> > +     __ceph_touch_fmode(ci, mdsc, fmode);
> >       wanted = __ceph_caps_file_wanted(ci);
> >       if (__ceph_is_any_real_caps(ci) &&
> >           (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
> > @@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file)
> >       } else if (ceph_snap(inode) != CEPH_NOSNAP &&
> >                  (ci->i_snap_caps & wanted) == wanted) {
> >               __ceph_get_fmode(ci, fmode);
> > +             __ceph_touch_fmode(ci, mdsc, fmode);
> >               spin_unlock(&ci->i_ceph_lock);
> >               return ceph_init_file(inode, file, fmode);
> >       }
> > @@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> >       }
> >  out_req:
> >       if (!req->r_err && req->r_target_inode)
> > -             ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
> > +             ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
> >       ceph_mdsc_put_request(req);
> >  out_ctx:
> >       ceph_release_acl_sec_ctx(&as_ctx);
> > @@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file)
> >               dout("release inode %p dir file %p\n", inode, file);
> >               WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
> >
> > -             ceph_put_fmode(ci, dfi->file_info.fmode);
> > +             ceph_put_fmode(ci, dfi->file_info.fmode, 1);
> >
> >               if (dfi->last_readdir)
> >                       ceph_mdsc_put_request(dfi->last_readdir);
> > @@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file)
> >               dout("release inode %p regular file %p\n", inode, file);
> >               WARN_ON(!list_empty(&fi->rw_contexts));
> >
> > -             ceph_put_fmode(ci, fi->fmode);
> > +             ceph_put_fmode(ci, fi->fmode, 1);
> > +
> >               kmem_cache_free(ceph_file_cachep, fi);
> >       }
> >
> > @@ -1560,7 +1563,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >               if (dirty)
> >                       __mark_inode_dirty(inode, dirty);
> >               if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
> > -                     ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
> > +                     ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
> >       }
> >
> >       dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index 094b8fc37787..b279bd8e168e 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -478,8 +478,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> >       ci->i_head_snapc = NULL;
> >       ci->i_snap_caps = 0;
> >
> > -     for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
> > -             ci->i_nr_by_mode[i] = 0;
> > +     for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > +             ci->i_file_by_mode[i].nr = 0;
> > +             ci->i_file_by_mode[i].last_used = 0;
> > +     }
> >
> >       mutex_init(&ci->i_truncate_mutex);
> >       ci->i_truncate_seq = 0;
> > @@ -637,7 +639,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
> >                       if ((issued & (CEPH_CAP_FILE_CACHE|
> >                                      CEPH_CAP_FILE_BUFFER)) ||
> >                           mapping_mapped(inode->i_mapping) ||
> > -                         __ceph_caps_file_wanted(ci)) {
> > +                         __ceph_is_file_opened(ci)) {
> >                               ci->i_truncate_pending++;
> >                               queue_trunc = 1;
> >                       }
> > @@ -1010,6 +1012,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
> >                       fill_inline = true;
> >       }
> >
> > +     if (cap_fmode >= 0) {
> > +             if (!info_caps)
> > +                     pr_warn("mds issued no caps on %llx.%llx\n",
> > +                             ceph_vinop(inode));
> > +             __ceph_touch_fmode(ci, mdsc, cap_fmode);
> > +     }
> > +
> >       spin_unlock(&ci->i_ceph_lock);
> >
> >       if (fill_inline)
> > diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> > index c90f03beb15d..da0ee54ae5bc 100644
> > --- a/fs/ceph/ioctl.c
> > +++ b/fs/ceph/ioctl.c
> > @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
> >       struct ceph_file_info *fi = file->private_data;
> >       struct inode *inode = file_inode(file);
> >       struct ceph_inode_info *ci = ceph_inode(inode);
> > +     struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> >
> >       if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
> >               spin_lock(&ci->i_ceph_lock);
> >               fi->fmode |= CEPH_FILE_MODE_LAZY;
> > -             ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
> > +             ci->i_file_by_mode[ffs(CEPH_FILE_MODE_LAZY)].nr++;
> > +             __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_LAZY);
> >               spin_unlock(&ci->i_ceph_lock);
> >               dout("ioctl_layzio: file %p marked lazy\n", file);
> >
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index d370f89df358..029823643b8b 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -361,7 +361,10 @@ struct ceph_inode_info {
> >                                                   dirty|flushing caps */
> >       unsigned i_snap_caps;           /* cap bits for snapped files */
> >
> > -     int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
> > +     struct {
> > +             int nr;
> > +             unsigned long last_used;
> > +     } i_file_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
> >
>
> Ok, so we're growing ceph_inode_info by 40 bytes here (on 64-bit arch).
>
> That's quite a bit, actually, but it turns out that there are 32 bytes
> worth of holes in ceph_inode_info now. It'd be good to reorganize the
> struct for better packing before you do this, so that this at least
> won't make memory consumption too much worse.
>
> There other ways we could approach this too that would be more space
> efficient. We don't really need to keep a timestamp for each mode bit.
> All we're really interested in is what modes were used in the last time
> interval.
>
> We could keep an active and inactive set of CEPH_FILE_MODE bits (which
> are just a single byte each), and a timestamp representing the switch
> between the two.
>
> As we use the file, we'd set bits in the active mask if the timestamp is
> less than half the time interval old. If it's more than half the
> interval, copy the active mask to the inactive one and zero out the
> active mask first.
>
> When you go to check what modes have been used you can do the switch
> again first if the timestamp is too old. To see what bits were actually
> used, you just logically or the active and inactive sets together.
>
> That would take a lot less space per inode.

The problem is there is no reliable tick for interval. besides, we can
not have patch 4 if use active bit. patch 4 simplifies code a lot. I
really like to keep it.  we can just track last use of read/write,
which uses 16 bytes.  this patch removes i_hold_caps_min. So only need
8 more bytes.

>
> >       struct mutex i_truncate_mutex;
> >       u32 i_truncate_seq;        /* last truncate to smaller size */
> > @@ -673,6 +676,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
> >  extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
> >  extern int __ceph_caps_used(struct ceph_inode_info *ci);
> >
> > +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
> > +{
> > +     return ci->i_file_by_mode[0].nr;
> > +}
> >  extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
> >  extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
> >
> > @@ -1074,7 +1081,10 @@ extern int ceph_try_get_caps(struct inode *inode,
> >
> >  /* for counting open files by mode */
> >  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> > -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
> > +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
> > +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
> > +extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
> > +                            struct ceph_mds_client *mdsc, int fmode);
> >
> >  /* addr.c */
> >  extern const struct address_space_operations ceph_aops;
> > @@ -1086,7 +1096,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
> >  /* file.c */
> >  extern const struct file_operations ceph_file_fops;
> >
> > -extern int ceph_renew_caps(struct inode *inode);
> > +extern int ceph_renew_caps(struct inode *inode, int fmode);
> >  extern int ceph_open(struct inode *inode, struct file *file);
> >  extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> >                           struct file *file, unsigned flags, umode_t mode);
> > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > index cb21c5cf12c3..8017130a08a1 100644
> > --- a/include/linux/ceph/ceph_fs.h
> > +++ b/include/linux/ceph/ceph_fs.h
> > @@ -564,6 +564,7 @@ struct ceph_filelock {
> >  #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
> >  #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
> >  #define CEPH_FILE_MODE_BITS       4
> > +#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
> >
> >  int ceph_flags_to_mode(int flags);
> >
>
> --
> Jeff Layton <jlayton@kernel.org>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps
  2020-02-20 14:53     ` Yan, Zheng
@ 2020-02-20 16:20       ` Jeff Layton
  0 siblings, 0 replies; 14+ messages in thread
From: Jeff Layton @ 2020-02-20 16:20 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Yan, Zheng, ceph-devel

On Thu, 2020-02-20 at 22:53 +0800, Yan, Zheng wrote:
> On Thu, Feb 20, 2020 at 10:18 PM Jeff Layton <jlayton@kernel.org> wrote:
> > On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> > > When getting caps for read/write, update corresponding file's last
> > > read/write. If a file hasn't been read/write for 'caps_wanted_delay_max'
> > > seconds, ignore the file when calculating wanted caps.
> > > 
> > 
> > Please explain in the changelog how the new info is to be stored, given
> > that it is quite complex.
> > 
> > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > > ---
> > >  fs/ceph/caps.c               | 149 ++++++++++++++++++++++++-----------
> > >  fs/ceph/file.c               |  23 +++---
> > >  fs/ceph/inode.c              |  15 +++-
> > >  fs/ceph/ioctl.c              |   4 +-
> > >  fs/ceph/super.h              |  16 +++-
> > >  include/linux/ceph/ceph_fs.h |   1 +
> > >  6 files changed, 145 insertions(+), 63 deletions(-)
> > > 
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index 293920d013ff..ccdc47bd7cf0 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -971,18 +971,44 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
> > >       return used;
> > >  }
> > > 
> > > +#define FMODE_WAIT_BIAS 1000
> > > +
> > >  /*
> > >   * wanted, by virtue of open file modes
> > >   */
> > >  int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
> > >  {
> > > +     struct ceph_mount_options *opt =
> > > +             ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> > > +     unsigned long used_cutoff =
> > > +             round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
> > > +     unsigned long idle_cutoff =
> > > +             round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
> > >       int i, bits = 0;
> > > +
> > >       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > > -             if (ci->i_nr_by_mode[i])
> > > +             if (ci->i_file_by_mode[i].nr >= FMODE_WAIT_BIAS) {
> > > +                     /* there are cap waiters or lots of open files */
> > >                       bits |= 1 << i;
> > > +             } else if (ci->i_file_by_mode[i].nr > 0) {
> > > +                     if (i ==  CEPH_FILE_MODE_PIN ||
> > > +                         time_after(ci->i_file_by_mode[i].last_used,
> > > +                                    used_cutoff))
> > > +                             bits |= 1 << i;
> > > +             } else if ((ci->i_file_by_mode[i].last_used & 1)) {
> > > +                     if (time_after(ci->i_file_by_mode[i].last_used,
> > > +                                    idle_cutoff)) {
> > > +                             bits |= 1 << i;
> > > +                     } else {
> > > +                             ci->i_file_by_mode[i].last_used &= ~1UL;
> > > +                     }
> > > +             }
> > >       }
> > >       if (bits == 0)
> > >               return 0;
> > > +     if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
> > > +             return 0;
> > > +
> > >       return ceph_caps_for_mode(bits >> 1);
> > >  }
> > > 
> > > @@ -1021,14 +1047,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
> > >       return mds_wanted;
> > >  }
> > > 
> > > -/*
> > > - * called under i_ceph_lock
> > > - */
> > > -static int __ceph_is_single_caps(struct ceph_inode_info *ci)
> > > -{
> > > -     return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
> > > -}
> > > -
> > >  int ceph_is_any_caps(struct inode *inode)
> > >  {
> > >       struct ceph_inode_info *ci = ceph_inode(inode);
> > > @@ -1856,10 +1874,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> > >       if (ci->i_ceph_flags & CEPH_I_FLUSH)
> > >               flags |= CHECK_CAPS_FLUSH;
> > > 
> > > -     if (!(flags & CHECK_CAPS_AUTHONLY) ||
> > > -         (ci->i_auth_cap && __ceph_is_single_caps(ci)))
> > > -             __cap_delay_cancel(mdsc, ci);
> > > -
> > >       goto retry_locked;
> > >  retry:
> > >       spin_lock(&ci->i_ceph_lock);
> > > @@ -2081,9 +2095,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> > >               goto retry; /* retake i_ceph_lock and restart our cap scan. */
> > >       }
> > > 
> > > -     /* Reschedule delayed caps release if we delayed anything */
> > > -     if (delayed)
> > > -             __cap_delay_requeue(mdsc, ci, false);
> > > +     if (list_empty(&ci->i_cap_delay_list)) {
> > > +         if (delayed) {
> > > +                 /* Reschedule delayed caps release if we delayed anything */
> > > +                 __cap_delay_requeue(mdsc, ci, false);
> > > +         } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> > > +                     !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> > > +                 /* periodically re-calculate caps wanted by open files */
> > > +                 __cap_delay_requeue(mdsc, ci, true);
> > > +         }
> > > +     }
> > > 
> > >       spin_unlock(&ci->i_ceph_lock);
> > > 
> > > @@ -2549,8 +2570,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
> > >   * FIXME: how does a 0 return differ from -EAGAIN?
> > >   */
> > >  enum {
> > > -     NON_BLOCKING    = 1,
> > > -     CHECK_FILELOCK  = 2,
> > > +     /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
> > > +     NON_BLOCKING    = (1 << 8),
> > > +     CHECK_FILELOCK  = (1 << 9),
> > >  };
> > > 
> > >  static int try_get_cap_refs(struct inode *inode, int need, int want,
> > > @@ -2560,7 +2582,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> > >       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> > >       int ret = 0;
> > >       int have, implemented;
> > > -     int file_wanted;
> > >       bool snap_rwsem_locked = false;
> > > 
> > >       dout("get_cap_refs %p need %s want %s\n", inode,
> > > @@ -2576,15 +2597,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> > >               goto out_unlock;
> > >       }
> > > 
> > > -     /* make sure file is actually open */
> > > -     file_wanted = __ceph_caps_file_wanted(ci);
> > > -     if ((file_wanted & need) != need) {
> > > -             dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
> > > -                  ceph_cap_string(need), ceph_cap_string(file_wanted));
> > > -             ret = -EBADF;
> > > -             goto out_unlock;
> > > -     }
> > > -
> > >       /* finish pending truncate */
> > >       while (ci->i_truncate_pending) {
> > >               spin_unlock(&ci->i_ceph_lock);
> > > @@ -2692,6 +2704,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> > >                    ceph_cap_string(have), ceph_cap_string(need));
> > >       }
> > >  out_unlock:
> > > +
> > > +     __ceph_touch_fmode(ci, mdsc, flags);
> > > +
> > >       spin_unlock(&ci->i_ceph_lock);
> > >       if (snap_rwsem_locked)
> > >               up_read(&mdsc->snap_rwsem);
> > > @@ -2729,10 +2744,22 @@ static void check_max_size(struct inode *inode, loff_t endoff)
> > >               ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
> > >  }
> > > 
> > > +static inline int get_used_file_mode(int need, int want)
> > > +{
> > > +     int fmode = 0;
> > > +     if (need & CEPH_CAP_FILE_RD)
> > > +             fmode |= CEPH_FILE_MODE_RD;
> > > +     if (need & CEPH_CAP_FILE_WR)
> > > +             fmode |= CEPH_FILE_MODE_WR;
> > > +     if (want & CEPH_CAP_FILE_LAZYIO)
> > > +             fmode |= CEPH_FILE_MODE_LAZY;
> > > +     return fmode;
> > > +}
> > > +
> > >  int ceph_try_get_caps(struct inode *inode, int need, int want,
> > >                     bool nonblock, int *got)
> > >  {
> > > -     int ret;
> > > +     int ret, flags;
> > > 
> > >       BUG_ON(need & ~CEPH_CAP_FILE_RD);
> > >       BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
> > > @@ -2740,8 +2767,11 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
> > >       if (ret < 0)
> > >               return ret;
> > > 
> > > -     ret = try_get_cap_refs(inode, need, want, 0,
> > > -                            (nonblock ? NON_BLOCKING : 0), got);
> > > +     flags = get_used_file_mode(need, want);
> > > +     if (nonblock)
> > > +             flags |= NON_BLOCKING;
> > > +
> > > +     ret = try_get_cap_refs(inode, need, want, 0, flags, got);
> > >       return ret == -EAGAIN ? 0 : ret;
> > >  }
> > > 
> > > @@ -2767,11 +2797,15 @@ int ceph_get_caps(struct file *filp, int need, int want,
> > >           fi->filp_gen != READ_ONCE(fsc->filp_gen))
> > >               return -EBADF;
> > > 
> > > +     flags = get_used_file_mode(need, want);
> > > +
> > >       while (true) {
> > >               if (endoff > 0)
> > >                       check_max_size(inode, endoff);
> > > 
> > > -             flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
> > > +             flags &= CEPH_FILE_MODE_MASK;
> > > +             if (atomic_read(&fi->num_locks))
> > > +                     flags |= CHECK_FILELOCK;
> > >               _got = 0;
> > >               ret = try_get_cap_refs(inode, need, want, endoff,
> > >                                      flags, &_got);
> > > @@ -2791,6 +2825,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
> > >                       list_add(&cw.list, &mdsc->cap_wait_list);
> > >                       spin_unlock(&mdsc->caps_list_lock);
> > > 
> > > +                     /* make sure used fmode not timeout */
> > > +                     ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
> > >                       add_wait_queue(&ci->i_cap_wq, &wait);
> > > 
> > >                       flags |= NON_BLOCKING;
> > > @@ -2804,6 +2840,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> > >                       }
> > > 
> > >                       remove_wait_queue(&ci->i_cap_wq, &wait);
> > > +                     ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
> > > 
> > >                       spin_lock(&mdsc->caps_list_lock);
> > >                       list_del(&cw.list);
> > > @@ -2823,7 +2860,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> > >               if (ret < 0) {
> > >                       if (ret == -ESTALE) {
> > >                               /* session was killed, try renew caps */
> > > -                             ret = ceph_renew_caps(inode);
> > > +                             ret = ceph_renew_caps(inode, flags);
> > >                               if (ret == 0)
> > >                                       continue;
> > >                       }
> > > @@ -4121,13 +4158,41 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
> > >       dout("flush_dirty_caps done\n");
> > >  }
> > > 
> > > +void __ceph_touch_fmode(struct ceph_inode_info *ci,
> > > +                     struct ceph_mds_client *mdsc, int fmode)
> > > +{
> > > +     int i;
> > > +     int bits = (fmode << 1);
> > > +     unsigned long now = jiffies | 1;
> > > +     for (i = 1; i < CEPH_FILE_MODE_BITS; i++) {
> > > +             if (bits & (1 << i))
> > > +                     ci->i_file_by_mode[i].last_used = now;
> > > +     }
> > > +
> > > +     /* queue periodic check */
> > > +     if (bits && list_empty(&ci->i_cap_delay_list))
> > > +             __cap_delay_requeue(mdsc, ci, true);
> > > +}
> > > +
> > > +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> > > +{
> > > +     int i;
> > > +     int bits = (fmode << 1) | 1;
> > > +     spin_lock(&ci->i_ceph_lock);
> > > +     for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > > +             if (bits & (1 << i))
> > > +                     ci->i_file_by_mode[i].nr += count;
> > > +     }
> > > +     spin_unlock(&ci->i_ceph_lock);
> > > +}
> > > +
> > >  void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> > >  {
> > >       int i;
> > >       int bits = (fmode << 1) | 1;
> > >       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > >               if (bits & (1 << i))
> > > -                     ci->i_nr_by_mode[i]++;
> > > +                     ci->i_file_by_mode[i].nr++;
> > >       }
> > >  }
> > > 
> > > @@ -4136,26 +4201,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> > >   * we may need to release capabilities to the MDS (or schedule
> > >   * their delayed release).
> > >   */
> > > -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
> > > +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
> > >  {
> > > -     int i, last = 0;
> > > +     int i;
> > >       int bits = (fmode << 1) | 1;
> > >       spin_lock(&ci->i_ceph_lock);
> > >       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > >               if (bits & (1 << i)) {
> > > -                     BUG_ON(ci->i_nr_by_mode[i] == 0);
> > > -                     if (--ci->i_nr_by_mode[i] == 0)
> > > -                             last++;
> > > +                     BUG_ON(ci->i_file_by_mode[i].nr < count);
> > > +                     ci->i_file_by_mode[i].nr -= count;
> > >               }
> > >       }
> > > -     dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
> > > -          &ci->vfs_inode, fmode,
> > > -          ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
> > > -          ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
> > >       spin_unlock(&ci->i_ceph_lock);
> > > -
> > > -     if (last && ci->i_vino.snap == CEPH_NOSNAP)
> > > -             ceph_check_caps(ci, 0, NULL);
> > >  }
> > > 
> > >  /*
> > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > index 7e0190b1f821..f28f420bad23 100644
> > > --- a/fs/ceph/file.c
> > > +++ b/fs/ceph/file.c
> > > @@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> > >               struct ceph_dir_file_info *dfi =
> > >                       kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
> > >               if (!dfi) {
> > > -                     ceph_put_fmode(ci, fmode); /* clean up */
> > > +                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> > >                       return -ENOMEM;
> > >               }
> > > 
> > > @@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> > >       } else {
> > >               fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> > >               if (!fi) {
> > > -                     ceph_put_fmode(ci, fmode); /* clean up */
> > > +                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> > >                       return -ENOMEM;
> > >               }
> > > 
> > > @@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> > >       case S_IFLNK:
> > >               dout("init_file %p %p 0%o (symlink)\n", inode, file,
> > >                    inode->i_mode);
> > > -             ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > > +             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> > >               break;
> > > 
> > >       default:
> > > @@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> > >                * we need to drop the open ref now, since we don't
> > >                * have .release set to ceph_release.
> > >                */
> > > -             ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > > +             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> > >               BUG_ON(inode->i_fop->release == ceph_release);
> > > 
> > >               /* call the proper open fop */
> > > @@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> > >  /*
> > >   * try renew caps after session gets killed.
> > >   */
> > > -int ceph_renew_caps(struct inode *inode)
> > > +int ceph_renew_caps(struct inode *inode, int fmode)
> > >  {
> > > -     struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> > > +     struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> > >       struct ceph_inode_info *ci = ceph_inode(inode);
> > >       struct ceph_mds_request *req;
> > >       int err, flags, wanted;
> > > 
> > >       spin_lock(&ci->i_ceph_lock);
> > > +     __ceph_touch_fmode(ci, mdsc, fmode);
> > >       wanted = __ceph_caps_file_wanted(ci);
> > >       if (__ceph_is_any_real_caps(ci) &&
> > >           (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
> > > @@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file)
> > >       } else if (ceph_snap(inode) != CEPH_NOSNAP &&
> > >                  (ci->i_snap_caps & wanted) == wanted) {
> > >               __ceph_get_fmode(ci, fmode);
> > > +             __ceph_touch_fmode(ci, mdsc, fmode);
> > >               spin_unlock(&ci->i_ceph_lock);
> > >               return ceph_init_file(inode, file, fmode);
> > >       }
> > > @@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > >       }
> > >  out_req:
> > >       if (!req->r_err && req->r_target_inode)
> > > -             ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
> > > +             ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
> > >       ceph_mdsc_put_request(req);
> > >  out_ctx:
> > >       ceph_release_acl_sec_ctx(&as_ctx);
> > > @@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file)
> > >               dout("release inode %p dir file %p\n", inode, file);
> > >               WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
> > > 
> > > -             ceph_put_fmode(ci, dfi->file_info.fmode);
> > > +             ceph_put_fmode(ci, dfi->file_info.fmode, 1);
> > > 
> > >               if (dfi->last_readdir)
> > >                       ceph_mdsc_put_request(dfi->last_readdir);
> > > @@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file)
> > >               dout("release inode %p regular file %p\n", inode, file);
> > >               WARN_ON(!list_empty(&fi->rw_contexts));
> > > 
> > > -             ceph_put_fmode(ci, fi->fmode);
> > > +             ceph_put_fmode(ci, fi->fmode, 1);
> > > +
> > >               kmem_cache_free(ceph_file_cachep, fi);
> > >       }
> > > 
> > > @@ -1560,7 +1563,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > >               if (dirty)
> > >                       __mark_inode_dirty(inode, dirty);
> > >               if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
> > > -                     ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
> > > +                     ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
> > >       }
> > > 
> > >       dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
> > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > index 094b8fc37787..b279bd8e168e 100644
> > > --- a/fs/ceph/inode.c
> > > +++ b/fs/ceph/inode.c
> > > @@ -478,8 +478,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > >       ci->i_head_snapc = NULL;
> > >       ci->i_snap_caps = 0;
> > > 
> > > -     for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
> > > -             ci->i_nr_by_mode[i] = 0;
> > > +     for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > > +             ci->i_file_by_mode[i].nr = 0;
> > > +             ci->i_file_by_mode[i].last_used = 0;
> > > +     }
> > > 
> > >       mutex_init(&ci->i_truncate_mutex);
> > >       ci->i_truncate_seq = 0;
> > > @@ -637,7 +639,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
> > >                       if ((issued & (CEPH_CAP_FILE_CACHE|
> > >                                      CEPH_CAP_FILE_BUFFER)) ||
> > >                           mapping_mapped(inode->i_mapping) ||
> > > -                         __ceph_caps_file_wanted(ci)) {
> > > +                         __ceph_is_file_opened(ci)) {
> > >                               ci->i_truncate_pending++;
> > >                               queue_trunc = 1;
> > >                       }
> > > @@ -1010,6 +1012,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
> > >                       fill_inline = true;
> > >       }
> > > 
> > > +     if (cap_fmode >= 0) {
> > > +             if (!info_caps)
> > > +                     pr_warn("mds issued no caps on %llx.%llx\n",
> > > +                             ceph_vinop(inode));
> > > +             __ceph_touch_fmode(ci, mdsc, cap_fmode);
> > > +     }
> > > +
> > >       spin_unlock(&ci->i_ceph_lock);
> > > 
> > >       if (fill_inline)
> > > diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> > > index c90f03beb15d..da0ee54ae5bc 100644
> > > --- a/fs/ceph/ioctl.c
> > > +++ b/fs/ceph/ioctl.c
> > > @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
> > >       struct ceph_file_info *fi = file->private_data;
> > >       struct inode *inode = file_inode(file);
> > >       struct ceph_inode_info *ci = ceph_inode(inode);
> > > +     struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> > > 
> > >       if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
> > >               spin_lock(&ci->i_ceph_lock);
> > >               fi->fmode |= CEPH_FILE_MODE_LAZY;
> > > -             ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
> > > +             ci->i_file_by_mode[ffs(CEPH_FILE_MODE_LAZY)].nr++;
> > > +             __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_LAZY);
> > >               spin_unlock(&ci->i_ceph_lock);
> > >               dout("ioctl_layzio: file %p marked lazy\n", file);
> > > 
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index d370f89df358..029823643b8b 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -361,7 +361,10 @@ struct ceph_inode_info {
> > >                                                   dirty|flushing caps */
> > >       unsigned i_snap_caps;           /* cap bits for snapped files */
> > > 
> > > -     int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
> > > +     struct {
> > > +             int nr;
> > > +             unsigned long last_used;
> > > +     } i_file_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
> > > 
> > 
> > Ok, so we're growing ceph_inode_info by 40 bytes here (on 64-bit arch).
> > 
> > That's quite a bit, actually, but it turns out that there are 32 bytes
> > worth of holes in ceph_inode_info now. It'd be good to reorganize the
> > struct for better packing before you do this, so that this at least
> > won't make memory consumption too much worse.
> > 
> > There other ways we could approach this too that would be more space
> > efficient. We don't really need to keep a timestamp for each mode bit.
> > All we're really interested in is what modes were used in the last time
> > interval.
> > 
> > We could keep an active and inactive set of CEPH_FILE_MODE bits (which
> > are just a single byte each), and a timestamp representing the switch
> > between the two.
> > 
> > As we use the file, we'd set bits in the active mask if the timestamp is
> > less than half the time interval old. If it's more than half the
> > interval, copy the active mask to the inactive one and zero out the
> > active mask first.
> > 
> > When you go to check what modes have been used you can do the switch
> > again first if the timestamp is too old. To see what bits were actually
> > used, you just logically or the activ You could couple this with
> > something that requeues e and inactive sets together.
> > 
> > That would take a lot less space per inode.
> 
> The problem is there is no reliable tick for interval.

I handwaved over that part, but you could ensure that it was revisited
in a timely fashion the same way as here. Just call __cap_delay_requeue
at each active/inactive switch event.

You might not end up with a hard timeout for each individual bit, but
I'm not sure that's really required to fix this problem. There may also
be benefits in keeping the timeouts "looser" -- more opportunities to
batch up cap releases when there are spiky, mixed read/write workloads.

> besides, we can
> not have patch 4 if use active bit. patch 4 simplifies code a lot. I
> really like to keep it.  we can just track last use of read/write,
> which uses 16 bytes.  this patch removes i_hold_caps_min. So only need
> 8 more bytes.

Ok, I do like the last two patches in the series a lot.

Don't you also need to timestamp LAZY too if you do it this way? If you
have a mix of lazy and nonlazy clients and the lazy one is holding the
cap w/o using it, that seems like it could be the same problem all over
again.

> > >       struct mutex i_truncate_mutex;
> > >       u32 i_truncate_seq;        /* last truncate to smaller size */
> > > @@ -673,6 +676,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
> > >  extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
> > >  extern int __ceph_caps_used(struct ceph_inode_info *ci);
> > > 
> > > +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
> > > +{
> > > +     return ci->i_file_by_mode[0].nr;
> > > +}
> > >  extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
> > >  extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
> > > 
> > > @@ -1074,7 +1081,10 @@ extern int ceph_try_get_caps(struct inode *inode,
> > > 
> > >  /* for counting open files by mode */
> > >  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> > > -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
> > > +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
> > > +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
> > > +extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
> > > +                            struct ceph_mds_client *mdsc, int fmode);
> > > 
> > >  /* addr.c */
> > >  extern const struct address_space_operations ceph_aops;
> > > @@ -1086,7 +1096,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
> > >  /* file.c */
> > >  extern const struct file_operations ceph_file_fops;
> > > 
> > > -extern int ceph_renew_caps(struct inode *inode);
> > > +extern int ceph_renew_caps(struct inode *inode, int fmode);
> > >  extern int ceph_open(struct inode *inode, struct file *file);
> > >  extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > >                           struct file *file, unsigned flags, umode_t mode);
> > > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > > index cb21c5cf12c3..8017130a08a1 100644
> > > --- a/include/linux/ceph/ceph_fs.h
> > > +++ b/include/linux/ceph/ceph_fs.h
> > > @@ -564,6 +564,7 @@ struct ceph_filelock {
> > >  #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
> > >  #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
> > >  #define CEPH_FILE_MODE_BITS       4
> > > +#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
> > > 
> > >  int ceph_flags_to_mode(int flags);
> > > 
> > 
> > --
> > Jeff Layton <jlayton@kernel.org>
> > 

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps
  2020-02-20 12:26 ` [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps Yan, Zheng
  2020-02-20 14:14   ` Jeff Layton
@ 2020-02-20 19:06   ` Jeff Layton
  2020-02-21  3:22     ` Yan, Zheng
  1 sibling, 1 reply; 14+ messages in thread
From: Jeff Layton @ 2020-02-20 19:06 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> When getting caps for read/write, update corresponding file's last
> read/write. If a file hasn't been read/write for 'caps_wanted_delay_max'
> seconds, ignore the file when calculating wanted caps.
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/caps.c               | 149 ++++++++++++++++++++++++-----------
>  fs/ceph/file.c               |  23 +++---
>  fs/ceph/inode.c              |  15 +++-
>  fs/ceph/ioctl.c              |   4 +-
>  fs/ceph/super.h              |  16 +++-
>  include/linux/ceph/ceph_fs.h |   1 +
>  6 files changed, 145 insertions(+), 63 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 293920d013ff..ccdc47bd7cf0 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -971,18 +971,44 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
>  	return used;
>  }
>  
> +#define FMODE_WAIT_BIAS 1000
> +
>  /*
>   * wanted, by virtue of open file modes
>   */
>  int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
>  {
> +	struct ceph_mount_options *opt =
> +		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> +	unsigned long used_cutoff =
> +		round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
> +	unsigned long idle_cutoff =
> +		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
>  	int i, bits = 0;
> +
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> -		if (ci->i_nr_by_mode[i])
> +		if (ci->i_file_by_mode[i].nr >= FMODE_WAIT_BIAS) {
> +			/* there are cap waiters or lots of open files */
>  			bits |= 1 << i;
> +		} else if (ci->i_file_by_mode[i].nr > 0) {
> +			if (i ==  CEPH_FILE_MODE_PIN ||
> +			    time_after(ci->i_file_by_mode[i].last_used,
> +				       used_cutoff))
> +				bits |= 1 << i;

Ok, so in the case where we have active references give it up if
last_used is after the max delay.

> +		} else if ((ci->i_file_by_mode[i].last_used & 1)) {
> +			if (time_after(ci->i_file_by_mode[i].last_used,
> +				       idle_cutoff)) {
> +				bits |= 1 << i;
> +			} else {
> +				ci->i_file_by_mode[i].last_used &= ~1UL;
> +			}
> +		}

What's the need for the trickery with the bottom bit here? It seems
simpler (and about as fast) to just do a time_after check every time.

>  	}
>  	if (bits == 0)
>  		return 0;
> +	if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
> +		return 0;
> +
>  	return ceph_caps_for_mode(bits >> 1);
>  }
>  
> @@ -1021,14 +1047,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
>  	return mds_wanted;
>  }
>  
> -/*
> - * called under i_ceph_lock
> - */
> -static int __ceph_is_single_caps(struct ceph_inode_info *ci)
> -{
> -	return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
> -}
> -
>  int ceph_is_any_caps(struct inode *inode)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> @@ -1856,10 +1874,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	if (ci->i_ceph_flags & CEPH_I_FLUSH)
>  		flags |= CHECK_CAPS_FLUSH;
>  
> -	if (!(flags & CHECK_CAPS_AUTHONLY) ||
> -	    (ci->i_auth_cap && __ceph_is_single_caps(ci)))
> -		__cap_delay_cancel(mdsc, ci);
> -
>  	goto retry_locked;
>  retry:
>  	spin_lock(&ci->i_ceph_lock);
> @@ -2081,9 +2095,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		goto retry; /* retake i_ceph_lock and restart our cap scan. */
>  	}
>  
> -	/* Reschedule delayed caps release if we delayed anything */
> -	if (delayed)
> -		__cap_delay_requeue(mdsc, ci, false);
> +	if (list_empty(&ci->i_cap_delay_list)) {
> +	    if (delayed) {
> +		    /* Reschedule delayed caps release if we delayed anything */
> +		    __cap_delay_requeue(mdsc, ci, false);
> +	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> +			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> +		    /* periodically re-calculate caps wanted by open files */
> +		    __cap_delay_requeue(mdsc, ci, true);
> +	    }
> +	}
>  
>  	spin_unlock(&ci->i_ceph_lock);
>  
> @@ -2549,8 +2570,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
>   * FIXME: how does a 0 return differ from -EAGAIN?
>   */
>  enum {
> -	NON_BLOCKING	= 1,
> -	CHECK_FILELOCK	= 2,
> +	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
> +	NON_BLOCKING	= (1 << 8),
> +	CHECK_FILELOCK	= (1 << 9),
>  };
>  
>  static int try_get_cap_refs(struct inode *inode, int need, int want,
> @@ -2560,7 +2582,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	int ret = 0;
>  	int have, implemented;
> -	int file_wanted;
>  	bool snap_rwsem_locked = false;
>  
>  	dout("get_cap_refs %p need %s want %s\n", inode,
> @@ -2576,15 +2597,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		goto out_unlock;
>  	}
>  
> -	/* make sure file is actually open */
> -	file_wanted = __ceph_caps_file_wanted(ci);
> -	if ((file_wanted & need) != need) {
> -		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
> -		     ceph_cap_string(need), ceph_cap_string(file_wanted));
> -		ret = -EBADF;
> -		goto out_unlock;
> -	}
> -
>  	/* finish pending truncate */
>  	while (ci->i_truncate_pending) {
>  		spin_unlock(&ci->i_ceph_lock);
> @@ -2692,6 +2704,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		     ceph_cap_string(have), ceph_cap_string(need));
>  	}
>  out_unlock:
> +
> +	__ceph_touch_fmode(ci, mdsc, flags);
> +
>  	spin_unlock(&ci->i_ceph_lock);
>  	if (snap_rwsem_locked)
>  		up_read(&mdsc->snap_rwsem);
> @@ -2729,10 +2744,22 @@ static void check_max_size(struct inode *inode, loff_t endoff)
>  		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
>  }
>  
> +static inline int get_used_file_mode(int need, int want)
> +{
> +	int fmode = 0;
> +	if (need & CEPH_CAP_FILE_RD)
> +		fmode |= CEPH_FILE_MODE_RD;
> +	if (need & CEPH_CAP_FILE_WR)
> +		fmode |= CEPH_FILE_MODE_WR;
> +	if (want & CEPH_CAP_FILE_LAZYIO)
> +		fmode |= CEPH_FILE_MODE_LAZY;
> +	return fmode;
> +}
> +
>  int ceph_try_get_caps(struct inode *inode, int need, int want,
>  		      bool nonblock, int *got)
>  {
> -	int ret;
> +	int ret, flags;
>  
>  	BUG_ON(need & ~CEPH_CAP_FILE_RD);
>  	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
> @@ -2740,8 +2767,11 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
>  	if (ret < 0)
>  		return ret;
>  
> -	ret = try_get_cap_refs(inode, need, want, 0,
> -			       (nonblock ? NON_BLOCKING : 0), got);
> +	flags = get_used_file_mode(need, want);
> +	if (nonblock)
> +		flags |= NON_BLOCKING;
> +
> +	ret = try_get_cap_refs(inode, need, want, 0, flags, got);
>  	return ret == -EAGAIN ? 0 : ret;
>  }
>  
> @@ -2767,11 +2797,15 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  	    fi->filp_gen != READ_ONCE(fsc->filp_gen))
>  		return -EBADF;
>  
> +	flags = get_used_file_mode(need, want);
> +
>  	while (true) {
>  		if (endoff > 0)
>  			check_max_size(inode, endoff);
>  
> -		flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
> +		flags &= CEPH_FILE_MODE_MASK;
> +		if (atomic_read(&fi->num_locks))
> +			flags |= CHECK_FILELOCK;
>  		_got = 0;
>  		ret = try_get_cap_refs(inode, need, want, endoff,
>  				       flags, &_got);
> @@ -2791,6 +2825,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			list_add(&cw.list, &mdsc->cap_wait_list);
>  			spin_unlock(&mdsc->caps_list_lock);
>  
> +			/* make sure used fmode not timeout */
> +			ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
>  			add_wait_queue(&ci->i_cap_wq, &wait);
>  
>  			flags |= NON_BLOCKING;
> @@ -2804,6 +2840,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			}
>  
>  			remove_wait_queue(&ci->i_cap_wq, &wait);
> +			ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
>  
>  			spin_lock(&mdsc->caps_list_lock);
>  			list_del(&cw.list);
> @@ -2823,7 +2860,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  		if (ret < 0) {
>  			if (ret == -ESTALE) {
>  				/* session was killed, try renew caps */
> -				ret = ceph_renew_caps(inode);
> +				ret = ceph_renew_caps(inode, flags);
>  				if (ret == 0)
>  					continue;
>  			}
> @@ -4121,13 +4158,41 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
>  	dout("flush_dirty_caps done\n");
>  }
>  
> +void __ceph_touch_fmode(struct ceph_inode_info *ci,
> +			struct ceph_mds_client *mdsc, int fmode)
> +{
> +	int i;
> +	int bits = (fmode << 1);
> +	unsigned long now = jiffies | 1;
> +	for (i = 1; i < CEPH_FILE_MODE_BITS; i++) {
> +		if (bits & (1 << i))
> +			ci->i_file_by_mode[i].last_used = now;
> +	}
> +
> +	/* queue periodic check */
> +	if (bits && list_empty(&ci->i_cap_delay_list))
> +		__cap_delay_requeue(mdsc, ci, true);
> +}
> +
> +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> +{
> +	int i;
> +	int bits = (fmode << 1) | 1;
> +	spin_lock(&ci->i_ceph_lock);
> +	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> +		if (bits & (1 << i))
> +			ci->i_file_by_mode[i].nr += count;
> +	}
> +	spin_unlock(&ci->i_ceph_lock);
> +}
> +
>  void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
>  {
>  	int i;
>  	int bits = (fmode << 1) | 1;
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
>  		if (bits & (1 << i))
> -			ci->i_nr_by_mode[i]++;
> +			ci->i_file_by_mode[i].nr++;
>  	}
>  }
>  
> @@ -4136,26 +4201,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
>   * we may need to release capabilities to the MDS (or schedule
>   * their delayed release).
>   */
> -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
> +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
>  {
> -	int i, last = 0;
> +	int i;
>  	int bits = (fmode << 1) | 1;
>  	spin_lock(&ci->i_ceph_lock);
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
>  		if (bits & (1 << i)) {
> -			BUG_ON(ci->i_nr_by_mode[i] == 0);
> -			if (--ci->i_nr_by_mode[i] == 0)
> -				last++;
> +			BUG_ON(ci->i_file_by_mode[i].nr < count);
> +			ci->i_file_by_mode[i].nr -= count;
>  		}
>  	}
> -	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
> -	     &ci->vfs_inode, fmode,
> -	     ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
> -	     ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
>  	spin_unlock(&ci->i_ceph_lock);
> -
> -	if (last && ci->i_vino.snap == CEPH_NOSNAP)
> -		ceph_check_caps(ci, 0, NULL);
>  }
>  
>  /*
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 7e0190b1f821..f28f420bad23 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  		struct ceph_dir_file_info *dfi =
>  			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
>  		if (!dfi) {
> -			ceph_put_fmode(ci, fmode); /* clean up */
> +			ceph_put_fmode(ci, fmode, 1); /* clean up */
>  			return -ENOMEM;
>  		}
>  
> @@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  	} else {
>  		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
>  		if (!fi) {
> -			ceph_put_fmode(ci, fmode); /* clean up */
> +			ceph_put_fmode(ci, fmode, 1); /* clean up */
>  			return -ENOMEM;
>  		}
>  
> @@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  	case S_IFLNK:
>  		dout("init_file %p %p 0%o (symlink)\n", inode, file,
>  		     inode->i_mode);
> -		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> +		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		break;
>  
>  	default:
> @@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  		 * we need to drop the open ref now, since we don't
>  		 * have .release set to ceph_release.
>  		 */
> -		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> +		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		BUG_ON(inode->i_fop->release == ceph_release);
>  
>  		/* call the proper open fop */
> @@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  /*
>   * try renew caps after session gets killed.
>   */
> -int ceph_renew_caps(struct inode *inode)
> +int ceph_renew_caps(struct inode *inode, int fmode)
>  {
> -	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	struct ceph_inode_info *ci = ceph_inode(inode);
>  	struct ceph_mds_request *req;
>  	int err, flags, wanted;
>  
>  	spin_lock(&ci->i_ceph_lock);
> +	__ceph_touch_fmode(ci, mdsc, fmode);
>  	wanted = __ceph_caps_file_wanted(ci);
>  	if (__ceph_is_any_real_caps(ci) &&
>  	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
> @@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file)
>  	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
>  		   (ci->i_snap_caps & wanted) == wanted) {
>  		__ceph_get_fmode(ci, fmode);
> +		__ceph_touch_fmode(ci, mdsc, fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  		return ceph_init_file(inode, file, fmode);
>  	}
> @@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  	}
>  out_req:
>  	if (!req->r_err && req->r_target_inode)
> -		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
> +		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
>  	ceph_mdsc_put_request(req);
>  out_ctx:
>  	ceph_release_acl_sec_ctx(&as_ctx);
> @@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file)
>  		dout("release inode %p dir file %p\n", inode, file);
>  		WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
>  
> -		ceph_put_fmode(ci, dfi->file_info.fmode);
> +		ceph_put_fmode(ci, dfi->file_info.fmode, 1);
>  
>  		if (dfi->last_readdir)
>  			ceph_mdsc_put_request(dfi->last_readdir);
> @@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file)
>  		dout("release inode %p regular file %p\n", inode, file);
>  		WARN_ON(!list_empty(&fi->rw_contexts));
>  
> -		ceph_put_fmode(ci, fi->fmode);
> +		ceph_put_fmode(ci, fi->fmode, 1);
> +
>  		kmem_cache_free(ceph_file_cachep, fi);
>  	}
>  
> @@ -1560,7 +1563,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		if (dirty)
>  			__mark_inode_dirty(inode, dirty);
>  		if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
> -			ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
> +			ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);

NODELAY to AUTHONLY? Why? I guess if we're approaching quota we do want
to be talking to the authority, but I'm not sure that change belongs in
this patch.

>  	}
>  
>  	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 094b8fc37787..b279bd8e168e 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -478,8 +478,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	ci->i_head_snapc = NULL;
>  	ci->i_snap_caps = 0;
>  
> -	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
> -		ci->i_nr_by_mode[i] = 0;
> +	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> +		ci->i_file_by_mode[i].nr = 0;
> +		ci->i_file_by_mode[i].last_used = 0;
> +	}
>  
>  	mutex_init(&ci->i_truncate_mutex);
>  	ci->i_truncate_seq = 0;
> @@ -637,7 +639,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
>  			if ((issued & (CEPH_CAP_FILE_CACHE|
>  				       CEPH_CAP_FILE_BUFFER)) ||
>  			    mapping_mapped(inode->i_mapping) ||
> -			    __ceph_caps_file_wanted(ci)) {
> +			    __ceph_is_file_opened(ci)) {
>  				ci->i_truncate_pending++;
>  				queue_trunc = 1;
>  			}
> @@ -1010,6 +1012,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>  			fill_inline = true;
>  	}
>  
> +	if (cap_fmode >= 0) {
> +		if (!info_caps)
> +			pr_warn("mds issued no caps on %llx.%llx\n",
> +				ceph_vinop(inode));
> +		__ceph_touch_fmode(ci, mdsc, cap_fmode);
> +	}
> +
>  	spin_unlock(&ci->i_ceph_lock);
>  
>  	if (fill_inline)
> diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> index c90f03beb15d..da0ee54ae5bc 100644
> --- a/fs/ceph/ioctl.c
> +++ b/fs/ceph/ioctl.c
> @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
>  	struct ceph_file_info *fi = file->private_data;
>  	struct inode *inode = file_inode(file);
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  
>  	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
>  		spin_lock(&ci->i_ceph_lock);
>  		fi->fmode |= CEPH_FILE_MODE_LAZY;
> -		ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
> +		ci->i_file_by_mode[ffs(CEPH_FILE_MODE_LAZY)].nr++;
> +		__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_LAZY);
>  		spin_unlock(&ci->i_ceph_lock);
>  		dout("ioctl_layzio: file %p marked lazy\n", file);
>  
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index d370f89df358..029823643b8b 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -361,7 +361,10 @@ struct ceph_inode_info {
>  						    dirty|flushing caps */
>  	unsigned i_snap_caps;           /* cap bits for snapped files */
>  
> -	int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
> +	struct {
> +		int nr;
> +		unsigned long last_used;
> +	} i_file_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
>  
>  	struct mutex i_truncate_mutex;
>  	u32 i_truncate_seq;        /* last truncate to smaller size */
> @@ -673,6 +676,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
>  extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
>  extern int __ceph_caps_used(struct ceph_inode_info *ci);
>  
> +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
> +{
> +	return ci->i_file_by_mode[0].nr;
> +}
>  extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
>  extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
>  
> @@ -1074,7 +1081,10 @@ extern int ceph_try_get_caps(struct inode *inode,
>  
>  /* for counting open files by mode */
>  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
> +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
> +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
> +extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
> +			       struct ceph_mds_client *mdsc, int fmode);
>  
>  /* addr.c */
>  extern const struct address_space_operations ceph_aops;
> @@ -1086,7 +1096,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
>  /* file.c */
>  extern const struct file_operations ceph_file_fops;
>  
> -extern int ceph_renew_caps(struct inode *inode);
> +extern int ceph_renew_caps(struct inode *inode, int fmode);
>  extern int ceph_open(struct inode *inode, struct file *file);
>  extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  			    struct file *file, unsigned flags, umode_t mode);
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index cb21c5cf12c3..8017130a08a1 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -564,6 +564,7 @@ struct ceph_filelock {
>  #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
>  #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
>  #define CEPH_FILE_MODE_BITS       4
> +#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
>  
>  int ceph_flags_to_mode(int flags);
>  

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/4] ceph: simplify calling of ceph_get_fmode()
  2020-02-20 12:26 ` [PATCH 3/4] ceph: simplify calling of ceph_get_fmode() Yan, Zheng
@ 2020-02-20 19:14   ` Jeff Layton
  2020-02-21  2:34     ` Yan, Zheng
  0 siblings, 1 reply; 14+ messages in thread
From: Jeff Layton @ 2020-02-20 19:14 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> Call ceph_get_fmode() when initializing file. Because fill_inode()
> already calls ceph_touch_fmode() for open file request. It affects

You mean __ceph_touch_fmode()

> __ceph_caps_file_wanted() for 'caps_wanted_delay_min' seconds, enough
> for ceph_get_fmode() to get called by ceph_init_file_info().
> 

I don't understand this changelog. Are you saying there is a potential
race of some sort, but that you don't think it's something we can hit in
practice?

> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/caps.c  | 26 +++-----------------------
>  fs/ceph/file.c  | 21 +++++----------------
>  fs/ceph/inode.c |  8 +-------
>  fs/ceph/super.h |  3 +--
>  4 files changed, 10 insertions(+), 48 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index ccdc47bd7cf0..2f4ff7e9508e 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -606,7 +606,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
>   */
>  void ceph_add_cap(struct inode *inode,
>  		  struct ceph_mds_session *session, u64 cap_id,
> -		  int fmode, unsigned issued, unsigned wanted,
> +		  unsigned issued, unsigned wanted,
>  		  unsigned seq, unsigned mseq, u64 realmino, int flags,
>  		  struct ceph_cap **new_cap)
>  {
> @@ -622,13 +622,6 @@ void ceph_add_cap(struct inode *inode,
>  	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
>  	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
>  
> -	/*
> -	 * If we are opening the file, include file mode wanted bits
> -	 * in wanted.
> -	 */
> -	if (fmode >= 0)
> -		wanted |= ceph_caps_for_mode(fmode);
> -
>  	spin_lock(&session->s_gen_ttl_lock);
>  	gen = session->s_cap_gen;
>  	spin_unlock(&session->s_gen_ttl_lock);
> @@ -753,9 +746,6 @@ void ceph_add_cap(struct inode *inode,
>  	cap->issue_seq = seq;
>  	cap->mseq = mseq;
>  	cap->cap_gen = gen;
> -
> -	if (fmode >= 0)
> -		__ceph_get_fmode(ci, fmode);
>  }
>  
>  /*
> @@ -3726,7 +3716,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
>  		/* add placeholder for the export tagert */
>  		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
>  		tcap = new_cap;
> -		ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
> +		ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
>  			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
>  
>  		if (!list_empty(&ci->i_cap_flush_list) &&
> @@ -3831,7 +3821,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
>  	__ceph_caps_issued(ci, &issued);
>  	issued |= __ceph_caps_dirty(ci);
>  
> -	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
> +	ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
>  		     realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
>  
>  	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
> @@ -4186,16 +4176,6 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
>  	spin_unlock(&ci->i_ceph_lock);
>  }
>  
> -void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> -{
> -	int i;
> -	int bits = (fmode << 1) | 1;
> -	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> -		if (bits & (1 << i))
> -			ci->i_file_by_mode[i].nr++;
> -	}
> -}
> -
>  /*
>   * Drop open file reference.  If we were the last open file,
>   * we may need to release capabilities to the MDS (or schedule
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index f28f420bad23..60a2dfa02ba2 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  	if (isdir) {
>  		struct ceph_dir_file_info *dfi =
>  			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
> -		if (!dfi) {
> -			ceph_put_fmode(ci, fmode, 1); /* clean up */
> +		if (!dfi)
>  			return -ENOMEM;
> -		}
>  
>  		file->private_data = dfi;
>  		fi = &dfi->file_info;
> @@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  		dfi->readdir_cache_idx = -1;
>  	} else {
>  		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> -		if (!fi) {
> -			ceph_put_fmode(ci, fmode, 1); /* clean up */
> +		if (!fi)
>  			return -ENOMEM;
> -		}
>  
>  		file->private_data = fi;
>  	}
>  
> +	ceph_get_fmode(ci, fmode, 1);
>  	fi->fmode = fmode;
> +
>  	spin_lock_init(&fi->rw_contexts_lock);
>  	INIT_LIST_HEAD(&fi->rw_contexts);
>  	fi->meta_err = errseq_sample(&ci->i_meta_err);
> @@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  	case S_IFLNK:
>  		dout("init_file %p %p 0%o (symlink)\n", inode, file,
>  		     inode->i_mode);
> -		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		break;
>  
>  	default:
> @@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  		 * we need to drop the open ref now, since we don't
>  		 * have .release set to ceph_release.
>  		 */
> -		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		BUG_ON(inode->i_fop->release == ceph_release);
>  
>  		/* call the proper open fop */
> @@ -327,7 +323,6 @@ int ceph_renew_caps(struct inode *inode, int fmode)
>  	req->r_inode = inode;
>  	ihold(inode);
>  	req->r_num_caps = 1;
> -	req->r_fmode = -1;
>  
>  	err = ceph_mdsc_do_request(mdsc, NULL, req);
>  	ceph_mdsc_put_request(req);
> @@ -373,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
>  
>  	/* trivially open snapdir */
>  	if (ceph_snap(inode) == CEPH_SNAPDIR) {
> -		spin_lock(&ci->i_ceph_lock);
> -		__ceph_get_fmode(ci, fmode);
> -		spin_unlock(&ci->i_ceph_lock);
>  		return ceph_init_file(inode, file, fmode);
>  	}
>  
> @@ -393,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
>  		dout("open %p fmode %d want %s issued %s using existing\n",
>  		     inode, fmode, ceph_cap_string(wanted),
>  		     ceph_cap_string(issued));
> -		__ceph_get_fmode(ci, fmode);
> +		__ceph_touch_fmode(ci, mdsc, fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  
>  		/* adjust wanted? */
> @@ -405,7 +397,6 @@ int ceph_open(struct inode *inode, struct file *file)
>  		return ceph_init_file(inode, file, fmode);
>  	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
>  		   (ci->i_snap_caps & wanted) == wanted) {
> -		__ceph_get_fmode(ci, fmode);
>  		__ceph_touch_fmode(ci, mdsc, fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  		return ceph_init_file(inode, file, fmode);
> @@ -526,8 +517,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  		err = finish_open(file, dentry, ceph_open);
>  	}
>  out_req:
> -	if (!req->r_err && req->r_target_inode)
> -		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
>  	ceph_mdsc_put_request(req);
>  out_ctx:
>  	ceph_release_acl_sec_ctx(&as_ctx);
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index b279bd8e168e..bb73b0c8c4d9 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -969,7 +969,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>  		if (ceph_snap(inode) == CEPH_NOSNAP) {
>  			ceph_add_cap(inode, session,
>  				     le64_to_cpu(info->cap.cap_id),
> -				     cap_fmode, info_caps,
> +				     info_caps,
>  				     le32_to_cpu(info->cap.wanted),
>  				     le32_to_cpu(info->cap.seq),
>  				     le32_to_cpu(info->cap.mseq),
> @@ -994,13 +994,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>  			dout(" %p got snap_caps %s\n", inode,
>  			     ceph_cap_string(info_caps));
>  			ci->i_snap_caps |= info_caps;
> -			if (cap_fmode >= 0)
> -				__ceph_get_fmode(ci, cap_fmode);
>  		}
> -	} else if (cap_fmode >= 0) {
> -		pr_warn("mds issued no caps on %llx.%llx\n",
> -			   ceph_vinop(inode));
> -		__ceph_get_fmode(ci, cap_fmode);
>  	}
>  
>  	if (iinfo->inline_version > 0 &&
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 029823643b8b..1ea76466efcb 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1038,7 +1038,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
>  				     struct ceph_cap_reservation *ctx);
>  extern void ceph_add_cap(struct inode *inode,
>  			 struct ceph_mds_session *session, u64 cap_id,
> -			 int fmode, unsigned issued, unsigned wanted,
> +			 unsigned issued, unsigned wanted,
>  			 unsigned cap, unsigned seq, u64 realmino, int flags,
>  			 struct ceph_cap **new_cap);
>  extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
> @@ -1080,7 +1080,6 @@ extern int ceph_try_get_caps(struct inode *inode,
>  			     int need, int want, bool nonblock, int *got);
>  
>  /* for counting open files by mode */
> -extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
>  extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
>  extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
>  extern void __ceph_touch_fmode(struct ceph_inode_info *ci,

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/4] ceph: remove delay check logic from ceph_check_caps()
  2020-02-20 12:26 ` [PATCH 4/4] ceph: remove delay check logic from ceph_check_caps() Yan, Zheng
@ 2020-02-20 19:57   ` Jeff Layton
  0 siblings, 0 replies; 14+ messages in thread
From: Jeff Layton @ 2020-02-20 19:57 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> __ceph_caps_file_wanted() already checks 'caps_wanted_delay_min' and
> 'caps_wanted_delay_max'. There is no need to duplicte the logic in
> ceph_check_caps() and __send_cap()
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/caps.c  | 146 ++++++++++++------------------------------------
>  fs/ceph/file.c  |   5 +-
>  fs/ceph/inode.c |   1 -
>  fs/ceph/super.h |   8 +--
>  4 files changed, 41 insertions(+), 119 deletions(-)
> 

Love that diffstat!

I like this patch overall, but it seems to rely on some of the earlier
ones, so I'll wait for your responses or a v2 set.

I think we'll probably want to merge this before the async dirops
series. This makes some significant changes to the underlying cap
handling and I think it'll be cleanest to adapt the async dirops series
on top of it and then merge it after.

> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 2f4ff7e9508e..39bf41d0fbdb 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
>  			       struct ceph_inode_info *ci)
>  {
>  	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
> -
> -	ci->i_hold_caps_min = round_jiffies(jiffies +
> -					    opt->caps_wanted_delay_min * HZ);
>  	ci->i_hold_caps_max = round_jiffies(jiffies +
>  					    opt->caps_wanted_delay_max * HZ);
> -	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
> -	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
> +	dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
> +	     ci->i_hold_caps_max - jiffies);
>  }
>  
>  /*
> @@ -508,8 +505,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
>   *    -> we take mdsc->cap_delay_lock
>   */
>  static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> -				struct ceph_inode_info *ci,
> -				bool set_timeout)
> +				struct ceph_inode_info *ci)
>  {
>  	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
>  	     ci->i_ceph_flags, ci->i_hold_caps_max);
> @@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>  				goto no_change;
>  			list_del_init(&ci->i_cap_delay_list);
>  		}
> -		if (set_timeout)
> -			__cap_set_timeouts(mdsc, ci);
> +		__cap_set_timeouts(mdsc, ci);
>  		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
>  no_change:
>  		spin_unlock(&mdsc->cap_delay_lock);
> @@ -719,7 +714,7 @@ void ceph_add_cap(struct inode *inode,
>  		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
>  		     ceph_cap_string(issued), ceph_cap_string(wanted),
>  		     ceph_cap_string(actual_wanted));
> -		__cap_delay_requeue(mdsc, ci, true);
> +		__cap_delay_requeue(mdsc, ci);
>  	}
>  
>  	if (flags & CEPH_CAP_FLAG_AUTH) {
> @@ -1299,7 +1294,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>  	struct cap_msg_args arg;
>  	int held, revoking;
>  	int wake = 0;
> -	int delayed = 0;
>  	int ret;
>  
>  	held = cap->issued | cap->implemented;
> @@ -1312,28 +1306,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>  	     ceph_cap_string(revoking));
>  	BUG_ON((retain & CEPH_CAP_PIN) == 0);
>  
> -	arg.session = cap->session;
> -
> -	/* don't release wanted unless we've waited a bit. */
> -	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
> -	    time_before(jiffies, ci->i_hold_caps_min)) {
> -		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
> -		     ceph_cap_string(cap->issued),
> -		     ceph_cap_string(cap->issued & retain),
> -		     ceph_cap_string(cap->mds_wanted),
> -		     ceph_cap_string(want));
> -		want |= cap->mds_wanted;
> -		retain |= cap->issued;
> -		delayed = 1;
> -	}
> -	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
> -	if (want & ~cap->mds_wanted) {
> -		/* user space may open/close single file frequently.
> -		 * This avoids droping mds_wanted immediately after
> -		 * requesting new mds_wanted.
> -		 */
> -		__cap_set_timeouts(mdsc, ci);
> -	}
> +	ci->i_ceph_flags &= ~CEPH_I_FLUSH;
>  
>  	cap->issued &= retain;  /* drop bits we don't want */
>  	if (cap->implemented & ~cap->issued) {
> @@ -1348,6 +1321,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>  	cap->implemented &= cap->issued | used;
>  	cap->mds_wanted = want;
>  
> +	arg.session = cap->session;
>  	arg.ino = ceph_vino(inode).ino;
>  	arg.cid = cap->cap_id;
>  	arg.follows = flushing ? ci->i_head_snapc->seq : 0;
> @@ -1408,14 +1382,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>  
>  	ret = send_cap_msg(&arg);
>  	if (ret < 0) {
> -		dout("error sending cap msg, must requeue %p\n", inode);
> -		delayed = 1;
> +		pr_err("error sending cap msg, ino (%llx.%llx) "
> +		       "flushing %s tid %llu, requeue\n",
> +		       ceph_vinop(inode), ceph_cap_string(flushing),
> +		       flush_tid);
> +		spin_lock(&ci->i_ceph_lock);
> +		__cap_delay_requeue(mdsc, ci);
> +		spin_unlock(&ci->i_ceph_lock);
>  	}
>  
>  	if (wake)
>  		wake_up_all(&ci->i_cap_wq);
>  
> -	return delayed;
> +	return ret;
>  }
>  
>  static inline int __send_flush_snap(struct inode *inode,
> @@ -1679,7 +1658,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
>  	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
>  	    (mask & CEPH_CAP_FILE_BUFFER))
>  		dirty |= I_DIRTY_DATASYNC;
> -	__cap_delay_requeue(mdsc, ci, true);
> +	__cap_delay_requeue(mdsc, ci);
>  	return dirty;
>  }
>  
> @@ -1830,8 +1809,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
>   * versus held caps.  Release, flush, ack revoked caps to mds as
>   * appropriate.
>   *
> - *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
> - *    cap release further.
>   *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
>   *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
>   *    further delay.
> @@ -1850,17 +1827,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	int mds = -1;   /* keep track of how far we've gone through i_caps list
>  			   to avoid an infinite loop on retry */
>  	struct rb_node *p;
> -	int delayed = 0, sent = 0;
> -	bool no_delay = flags & CHECK_CAPS_NODELAY;
>  	bool queue_invalidate = false;
>  	bool tried_invalidate = false;
>  
> -	/* if we are unmounting, flush any unused caps immediately. */
> -	if (mdsc->stopping)
> -		no_delay = true;
> -
>  	spin_lock(&ci->i_ceph_lock);
> -
>  	if (ci->i_ceph_flags & CEPH_I_FLUSH)
>  		flags |= CHECK_CAPS_FLUSH;
>  
> @@ -1906,14 +1876,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	}
>  
>  	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
> -	     " issued %s revoking %s retain %s %s%s%s\n", inode,
> +	     " issued %s revoking %s retain %s %s%s\n", inode,
>  	     ceph_cap_string(file_wanted),
>  	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
>  	     ceph_cap_string(ci->i_flushing_caps),
>  	     ceph_cap_string(issued), ceph_cap_string(revoking),
>  	     ceph_cap_string(retain),
>  	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
> -	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
>  	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
>  
>  	/*
> @@ -1921,7 +1890,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	 * have cached pages, but don't want them, then try to invalidate.
>  	 * If we fail, it's because pages are locked.... try again later.
>  	 */
> -	if ((!no_delay || mdsc->stopping) &&
> +	if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
>  	    S_ISREG(inode->i_mode) &&
>  	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
>  	    inode->i_data.nrpages &&		/* have cached pages */
> @@ -2001,21 +1970,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		if ((cap->issued & ~retain) == 0)
>  			continue;     /* nope, all good */
>  
> -		if (no_delay)
> -			goto ack;
> -
> -		/* delay? */
> -		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
> -		    time_before(jiffies, ci->i_hold_caps_max)) {
> -			dout(" delaying issued %s -> %s, wanted %s -> %s\n",
> -			     ceph_cap_string(cap->issued),
> -			     ceph_cap_string(cap->issued & retain),
> -			     ceph_cap_string(cap->mds_wanted),
> -			     ceph_cap_string(want));
> -			delayed++;
> -			continue;
> -		}
> -
>  ack:
>  		if (session && session != cap->session) {
>  			dout("oops, wrong session %p mutex\n", session);
> @@ -2076,24 +2030,18 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		}
>  
>  		mds = cap->mds;  /* remember mds, so we don't repeat */
> -		sent++;
>  
>  		/* __send_cap drops i_ceph_lock */
> -		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
> -				cap_used, want, retain, flushing,
> -				flush_tid, oldest_flush_tid);
> +		__send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
> +			   retain, flushing, flush_tid, oldest_flush_tid);
>  		goto retry; /* retake i_ceph_lock and restart our cap scan. */
>  	}
>  
> -	if (list_empty(&ci->i_cap_delay_list)) {
> -	    if (delayed) {
> -		    /* Reschedule delayed caps release if we delayed anything */
> -		    __cap_delay_requeue(mdsc, ci, false);
> -	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> -			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> -		    /* periodically re-calculate caps wanted by open files */
> -		    __cap_delay_requeue(mdsc, ci, true);
> -	    }
> +	/* periodically re-calculate caps wanted by open files */
> +	if (list_empty(&ci->i_cap_delay_list) &&
> +	    (file_wanted & ~CEPH_CAP_PIN) &&
> +	    !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> +		__cap_delay_requeue(mdsc, ci);
>  	}
>  
>  	spin_unlock(&ci->i_ceph_lock);
> @@ -2123,7 +2071,6 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
>  retry_locked:
>  	if (ci->i_dirty_caps && ci->i_auth_cap) {
>  		struct ceph_cap *cap = ci->i_auth_cap;
> -		int delayed;
>  
>  		if (session != cap->session) {
>  			spin_unlock(&ci->i_ceph_lock);
> @@ -2152,18 +2099,10 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
>  						 &oldest_flush_tid);
>  
>  		/* __send_cap drops i_ceph_lock */
> -		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
> -				     CEPH_CLIENT_CAPS_SYNC,
> -				     __ceph_caps_used(ci),
> -				     __ceph_caps_wanted(ci),
> -				     (cap->issued | cap->implemented),
> -				     flushing, flush_tid, oldest_flush_tid);
> -
> -		if (delayed) {
> -			spin_lock(&ci->i_ceph_lock);
> -			__cap_delay_requeue(mdsc, ci, true);
> -			spin_unlock(&ci->i_ceph_lock);
> -		}
> +		__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
> +			   __ceph_caps_used(ci), __ceph_caps_wanted(ci),
> +			   (cap->issued | cap->implemented),
> +			   flushing, flush_tid, oldest_flush_tid);
>  	} else {
>  		if (!list_empty(&ci->i_cap_flush_list)) {
>  			struct ceph_cap_flush *cf =
> @@ -2363,22 +2302,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
>  		if (cf->caps) {
>  			dout("kick_flushing_caps %p cap %p tid %llu %s\n",
>  			     inode, cap, cf->tid, ceph_cap_string(cf->caps));
> -			ci->i_ceph_flags |= CEPH_I_NODELAY;
> -
> -			ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
> +			__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
>  					 (cf->tid < last_snap_flush ?
>  					  CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
>  					  __ceph_caps_used(ci),
>  					  __ceph_caps_wanted(ci),
>  					  (cap->issued | cap->implemented),
>  					  cf->caps, cf->tid, oldest_flush_tid);
> -			if (ret) {
> -				pr_err("kick_flushing_caps: error sending "
> -					"cap flush, ino (%llx.%llx) "
> -					"tid %llu flushing %s\n",
> -					ceph_vinop(inode), cf->tid,
> -					ceph_cap_string(cf->caps));
> -			}
>  		} else {
>  			struct ceph_cap_snap *capsnap =
>  					container_of(cf, struct ceph_cap_snap,
> @@ -2999,7 +2929,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
>  	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
>  	     last ? " last" : "", put ? " put" : "");
>  
> -	if (last && !flushsnaps)
> +	if (last)
>  		ceph_check_caps(ci, 0, NULL);
>  	else if (flushsnaps)
>  		ceph_flush_snaps(ci, NULL);
> @@ -3417,10 +3347,10 @@ static void handle_cap_grant(struct inode *inode,
>  		wake_up_all(&ci->i_cap_wq);
>  
>  	if (check_caps == 1)
> -		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
> +		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
>  				session);
>  	else if (check_caps == 2)
> -		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
> +		ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
>  	else
>  		mutex_unlock(&session->s_mutex);
>  }
> @@ -4095,7 +4025,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
>  {
>  	struct inode *inode;
>  	struct ceph_inode_info *ci;
> -	int flags = CHECK_CAPS_NODELAY;
>  
>  	dout("check_delayed_caps\n");
>  	while (1) {
> @@ -4115,7 +4044,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
>  
>  		if (inode) {
>  			dout("check_delayed_caps on %p\n", inode);
> -			ceph_check_caps(ci, flags, NULL);
> +			ceph_check_caps(ci, 0, NULL);
>  			/* avoid calling iput_final() in tick thread */
>  			ceph_async_iput(inode);
>  		}
> @@ -4140,7 +4069,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
>  		ihold(inode);
>  		dout("flush_dirty_caps %p\n", inode);
>  		spin_unlock(&mdsc->cap_dirty_lock);
> -		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
> +		ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
>  		iput(inode);
>  		spin_lock(&mdsc->cap_dirty_lock);
>  	}
> @@ -4161,7 +4090,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
>  
>  	/* queue periodic check */
>  	if (bits && list_empty(&ci->i_cap_delay_list))
> -		__cap_delay_requeue(mdsc, ci, true);
> +		__cap_delay_requeue(mdsc, ci);
>  }
>  
>  void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> @@ -4210,7 +4139,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
>  	if (inode->i_nlink == 1) {
>  		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
>  
> -		ci->i_ceph_flags |= CEPH_I_NODELAY;
>  		if (__ceph_caps_dirty(ci)) {
>  			struct ceph_mds_client *mdsc =
>  				ceph_inode_to_client(inode)->mdsc;
> @@ -4266,8 +4194,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
>  		if (force || (cap->issued & drop)) {
>  			if (cap->issued & drop) {
>  				int wanted = __ceph_caps_wanted(ci);
> -				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
> -					wanted |= cap->mds_wanted;
>  				dout("encode_inode_release %p cap %p "
>  				     "%s -> %s, wanted %s -> %s\n", inode, cap,
>  				     ceph_cap_string(cap->issued),
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 60a2dfa02ba2..ed70bb448568 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -2129,12 +2129,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  
>  	if (endoff > size) {
>  		int caps_flags = 0;
> -
>  		/* Let the MDS know about dst file size change */
> -		if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
> -			caps_flags |= CHECK_CAPS_NODELAY;
>  		if (ceph_inode_set_size(dst_inode, endoff))
>  			caps_flags |= CHECK_CAPS_AUTHONLY;
> +		if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
> +			caps_flags |= CHECK_CAPS_AUTHONLY;
>  		if (caps_flags)
>  			ceph_check_caps(dst_ci, caps_flags, NULL);
>  	}
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index bb73b0c8c4d9..5c8d3b01bc5d 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -471,7 +471,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	ci->i_prealloc_cap_flush = NULL;
>  	INIT_LIST_HEAD(&ci->i_cap_flush_list);
>  	init_waitqueue_head(&ci->i_cap_wq);
> -	ci->i_hold_caps_min = 0;
>  	ci->i_hold_caps_max = 0;
>  	INIT_LIST_HEAD(&ci->i_cap_delay_list);
>  	INIT_LIST_HEAD(&ci->i_cap_snaps);
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 1ea76466efcb..9ddaaeefc6f0 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -170,9 +170,9 @@ struct ceph_cap {
>  	struct list_head caps_item;
>  };
>  
> -#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
> -#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
> -#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
> +#define CHECK_CAPS_AUTHONLY   1  /* only check auth cap */
> +#define CHECK_CAPS_FLUSH      2  /* flush any dirty caps */
> +#define CHECK_CAPS_NOINVAL    4  /* don't invalidate pagecache */
>  

Some mention of the change to NOINVAL in the changelog would be nice.

>  struct ceph_cap_flush {
>  	u64 tid;
> @@ -352,7 +352,6 @@ struct ceph_inode_info {
>  	struct ceph_cap_flush *i_prealloc_cap_flush;
>  	struct list_head i_cap_flush_list;
>  	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
> -	unsigned long i_hold_caps_min; /* jiffies */

This is also nice.

>  	unsigned long i_hold_caps_max; /* jiffies */
>  	struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
>  	struct ceph_cap_reservation i_cap_migration_resv;
> @@ -514,7 +513,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
>   * Ceph inode.
>   */
>  #define CEPH_I_DIR_ORDERED	(1 << 0)  /* dentries in dir are ordered */
> -#define CEPH_I_NODELAY		(1 << 1)  /* do not delay cap release */
>  #define CEPH_I_FLUSH		(1 << 2)  /* do not delay flush of dirty metadata */
>  #define CEPH_I_POOL_PERM	(1 << 3)  /* pool rd/wr bits are valid */
>  #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/4] ceph: simplify calling of ceph_get_fmode()
  2020-02-20 19:14   ` Jeff Layton
@ 2020-02-21  2:34     ` Yan, Zheng
  2020-02-21 12:06       ` Jeff Layton
  0 siblings, 1 reply; 14+ messages in thread
From: Yan, Zheng @ 2020-02-21  2:34 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Yan, Zheng, ceph-devel

On Fri, Feb 21, 2020 at 3:15 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> > Call ceph_get_fmode() when initializing file. Because fill_inode()
> > already calls ceph_touch_fmode() for open file request. It affects
>
> You mean __ceph_touch_fmode()
>
> > __ceph_caps_file_wanted() for 'caps_wanted_delay_min' seconds, enough
> > for ceph_get_fmode() to get called by ceph_init_file_info().
> >
>
> I don't understand this changelog. Are you saying there is a potential
> race of some sort, but that you don't think it's something we can hit in
> practice?
>

threads run handle_reply() and open request initiator are different.
there is a small window between waking up request initiator and
request initiator calls ceph_get_fmode(). someone else may call
ceph_check_caps() in the window and wrongly release wanted caps. This
is the reason old code calls _ceph_get_fmode() in fill_inode().


> > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > ---
> >  fs/ceph/caps.c  | 26 +++-----------------------
> >  fs/ceph/file.c  | 21 +++++----------------
> >  fs/ceph/inode.c |  8 +-------
> >  fs/ceph/super.h |  3 +--
> >  4 files changed, 10 insertions(+), 48 deletions(-)
> >
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index ccdc47bd7cf0..2f4ff7e9508e 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -606,7 +606,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
> >   */
> >  void ceph_add_cap(struct inode *inode,
> >                 struct ceph_mds_session *session, u64 cap_id,
> > -               int fmode, unsigned issued, unsigned wanted,
> > +               unsigned issued, unsigned wanted,
> >                 unsigned seq, unsigned mseq, u64 realmino, int flags,
> >                 struct ceph_cap **new_cap)
> >  {
> > @@ -622,13 +622,6 @@ void ceph_add_cap(struct inode *inode,
> >       dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
> >            session->s_mds, cap_id, ceph_cap_string(issued), seq);
> >
> > -     /*
> > -      * If we are opening the file, include file mode wanted bits
> > -      * in wanted.
> > -      */
> > -     if (fmode >= 0)
> > -             wanted |= ceph_caps_for_mode(fmode);
> > -
> >       spin_lock(&session->s_gen_ttl_lock);
> >       gen = session->s_cap_gen;
> >       spin_unlock(&session->s_gen_ttl_lock);
> > @@ -753,9 +746,6 @@ void ceph_add_cap(struct inode *inode,
> >       cap->issue_seq = seq;
> >       cap->mseq = mseq;
> >       cap->cap_gen = gen;
> > -
> > -     if (fmode >= 0)
> > -             __ceph_get_fmode(ci, fmode);
> >  }
> >
> >  /*
> > @@ -3726,7 +3716,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
> >               /* add placeholder for the export tagert */
> >               int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
> >               tcap = new_cap;
> > -             ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
> > +             ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
> >                            t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
> >
> >               if (!list_empty(&ci->i_cap_flush_list) &&
> > @@ -3831,7 +3821,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
> >       __ceph_caps_issued(ci, &issued);
> >       issued |= __ceph_caps_dirty(ci);
> >
> > -     ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
> > +     ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
> >                    realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
> >
> >       ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
> > @@ -4186,16 +4176,6 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> >       spin_unlock(&ci->i_ceph_lock);
> >  }
> >
> > -void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> > -{
> > -     int i;
> > -     int bits = (fmode << 1) | 1;
> > -     for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > -             if (bits & (1 << i))
> > -                     ci->i_file_by_mode[i].nr++;
> > -     }
> > -}
> > -
> >  /*
> >   * Drop open file reference.  If we were the last open file,
> >   * we may need to release capabilities to the MDS (or schedule
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index f28f420bad23..60a2dfa02ba2 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >       if (isdir) {
> >               struct ceph_dir_file_info *dfi =
> >                       kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
> > -             if (!dfi) {
> > -                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> > +             if (!dfi)
> >                       return -ENOMEM;
> > -             }
> >
> >               file->private_data = dfi;
> >               fi = &dfi->file_info;
> > @@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >               dfi->readdir_cache_idx = -1;
> >       } else {
> >               fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> > -             if (!fi) {
> > -                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> > +             if (!fi)
> >                       return -ENOMEM;
> > -             }
> >
> >               file->private_data = fi;
> >       }
> >
> > +     ceph_get_fmode(ci, fmode, 1);
> >       fi->fmode = fmode;
> > +
> >       spin_lock_init(&fi->rw_contexts_lock);
> >       INIT_LIST_HEAD(&fi->rw_contexts);
> >       fi->meta_err = errseq_sample(&ci->i_meta_err);
> > @@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> >       case S_IFLNK:
> >               dout("init_file %p %p 0%o (symlink)\n", inode, file,
> >                    inode->i_mode);
> > -             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> >               break;
> >
> >       default:
> > @@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> >                * we need to drop the open ref now, since we don't
> >                * have .release set to ceph_release.
> >                */
> > -             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> >               BUG_ON(inode->i_fop->release == ceph_release);
> >
> >               /* call the proper open fop */
> > @@ -327,7 +323,6 @@ int ceph_renew_caps(struct inode *inode, int fmode)
> >       req->r_inode = inode;
> >       ihold(inode);
> >       req->r_num_caps = 1;
> > -     req->r_fmode = -1;
> >
> >       err = ceph_mdsc_do_request(mdsc, NULL, req);
> >       ceph_mdsc_put_request(req);
> > @@ -373,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
> >
> >       /* trivially open snapdir */
> >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> > -             spin_lock(&ci->i_ceph_lock);
> > -             __ceph_get_fmode(ci, fmode);
> > -             spin_unlock(&ci->i_ceph_lock);
> >               return ceph_init_file(inode, file, fmode);
> >       }
> >
> > @@ -393,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
> >               dout("open %p fmode %d want %s issued %s using existing\n",
> >                    inode, fmode, ceph_cap_string(wanted),
> >                    ceph_cap_string(issued));
> > -             __ceph_get_fmode(ci, fmode);
> > +             __ceph_touch_fmode(ci, mdsc, fmode);
> >               spin_unlock(&ci->i_ceph_lock);
> >
> >               /* adjust wanted? */
> > @@ -405,7 +397,6 @@ int ceph_open(struct inode *inode, struct file *file)
> >               return ceph_init_file(inode, file, fmode);
> >       } else if (ceph_snap(inode) != CEPH_NOSNAP &&
> >                  (ci->i_snap_caps & wanted) == wanted) {
> > -             __ceph_get_fmode(ci, fmode);
> >               __ceph_touch_fmode(ci, mdsc, fmode);
> >               spin_unlock(&ci->i_ceph_lock);
> >               return ceph_init_file(inode, file, fmode);
> > @@ -526,8 +517,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> >               err = finish_open(file, dentry, ceph_open);
> >       }
> >  out_req:
> > -     if (!req->r_err && req->r_target_inode)
> > -             ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
> >       ceph_mdsc_put_request(req);
> >  out_ctx:
> >       ceph_release_acl_sec_ctx(&as_ctx);
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index b279bd8e168e..bb73b0c8c4d9 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -969,7 +969,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
> >               if (ceph_snap(inode) == CEPH_NOSNAP) {
> >                       ceph_add_cap(inode, session,
> >                                    le64_to_cpu(info->cap.cap_id),
> > -                                  cap_fmode, info_caps,
> > +                                  info_caps,
> >                                    le32_to_cpu(info->cap.wanted),
> >                                    le32_to_cpu(info->cap.seq),
> >                                    le32_to_cpu(info->cap.mseq),
> > @@ -994,13 +994,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
> >                       dout(" %p got snap_caps %s\n", inode,
> >                            ceph_cap_string(info_caps));
> >                       ci->i_snap_caps |= info_caps;
> > -                     if (cap_fmode >= 0)
> > -                             __ceph_get_fmode(ci, cap_fmode);
> >               }
> > -     } else if (cap_fmode >= 0) {
> > -             pr_warn("mds issued no caps on %llx.%llx\n",
> > -                        ceph_vinop(inode));
> > -             __ceph_get_fmode(ci, cap_fmode);
> >       }
> >
> >       if (iinfo->inline_version > 0 &&
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 029823643b8b..1ea76466efcb 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -1038,7 +1038,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
> >                                    struct ceph_cap_reservation *ctx);
> >  extern void ceph_add_cap(struct inode *inode,
> >                        struct ceph_mds_session *session, u64 cap_id,
> > -                      int fmode, unsigned issued, unsigned wanted,
> > +                      unsigned issued, unsigned wanted,
> >                        unsigned cap, unsigned seq, u64 realmino, int flags,
> >                        struct ceph_cap **new_cap);
> >  extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
> > @@ -1080,7 +1080,6 @@ extern int ceph_try_get_caps(struct inode *inode,
> >                            int need, int want, bool nonblock, int *got);
> >
> >  /* for counting open files by mode */
> > -extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> >  extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
> >  extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
> >  extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
>
> --
> Jeff Layton <jlayton@kernel.org>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps
  2020-02-20 19:06   ` Jeff Layton
@ 2020-02-21  3:22     ` Yan, Zheng
  0 siblings, 0 replies; 14+ messages in thread
From: Yan, Zheng @ 2020-02-21  3:22 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Yan, Zheng, ceph-devel

On Fri, Feb 21, 2020 at 3:08 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> > When getting caps for read/write, update corresponding file's last
> > read/write. If a file hasn't been read/write for 'caps_wanted_delay_max'
> > seconds, ignore the file when calculating wanted caps.
> >
> > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > ---
> >  fs/ceph/caps.c               | 149 ++++++++++++++++++++++++-----------
> >  fs/ceph/file.c               |  23 +++---
> >  fs/ceph/inode.c              |  15 +++-
> >  fs/ceph/ioctl.c              |   4 +-
> >  fs/ceph/super.h              |  16 +++-
> >  include/linux/ceph/ceph_fs.h |   1 +
> >  6 files changed, 145 insertions(+), 63 deletions(-)
> >
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index 293920d013ff..ccdc47bd7cf0 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -971,18 +971,44 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
> >       return used;
> >  }
> >
> > +#define FMODE_WAIT_BIAS 1000
> > +
> >  /*
> >   * wanted, by virtue of open file modes
> >   */
> >  int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
> >  {
> > +     struct ceph_mount_options *opt =
> > +             ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> > +     unsigned long used_cutoff =
> > +             round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
> > +     unsigned long idle_cutoff =
> > +             round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
> >       int i, bits = 0;
> > +
> >       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > -             if (ci->i_nr_by_mode[i])
> > +             if (ci->i_file_by_mode[i].nr >= FMODE_WAIT_BIAS) {
> > +                     /* there are cap waiters or lots of open files */
> >                       bits |= 1 << i;
> > +             } else if (ci->i_file_by_mode[i].nr > 0) {
> > +                     if (i ==  CEPH_FILE_MODE_PIN ||
> > +                         time_after(ci->i_file_by_mode[i].last_used,
> > +                                    used_cutoff))
> > +                             bits |= 1 << i;
>
> Ok, so in the case where we have active references give it up if
> last_used is after the max delay.
>
> > +             } else if ((ci->i_file_by_mode[i].last_used & 1)) {
> > +                     if (time_after(ci->i_file_by_mode[i].last_used,
> > +                                    idle_cutoff)) {
> > +                             bits |= 1 << i;
> > +                     } else {
> > +                             ci->i_file_by_mode[i].last_used &= ~1UL;
> > +                     }
> > +             }
>
> What's the need for the trickery with the bottom bit here? It seems
> simpler (and about as fast) to just do a time_after check every time.
>

because last_used is initialized to 0. 0 is an valid jiffies value.
Without the bottom bit, the check may return false positive. I think
return false positive is not big issue, I will remove this in next
version.


> >       }
> >       if (bits == 0)
> >               return 0;
> > +     if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
> > +             return 0;
> > +
> >       return ceph_caps_for_mode(bits >> 1);
> >  }
> >
> > @@ -1021,14 +1047,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
> >       return mds_wanted;
> >  }
> >
> > -/*
> > - * called under i_ceph_lock
> > - */
> > -static int __ceph_is_single_caps(struct ceph_inode_info *ci)
> > -{
> > -     return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
> > -}
> > -
> >  int ceph_is_any_caps(struct inode *inode)
> >  {
> >       struct ceph_inode_info *ci = ceph_inode(inode);
> > @@ -1856,10 +1874,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >       if (ci->i_ceph_flags & CEPH_I_FLUSH)
> >               flags |= CHECK_CAPS_FLUSH;
> >
> > -     if (!(flags & CHECK_CAPS_AUTHONLY) ||
> > -         (ci->i_auth_cap && __ceph_is_single_caps(ci)))
> > -             __cap_delay_cancel(mdsc, ci);
> > -
> >       goto retry_locked;
> >  retry:
> >       spin_lock(&ci->i_ceph_lock);
> > @@ -2081,9 +2095,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >               goto retry; /* retake i_ceph_lock and restart our cap scan. */
> >       }
> >
> > -     /* Reschedule delayed caps release if we delayed anything */
> > -     if (delayed)
> > -             __cap_delay_requeue(mdsc, ci, false);
> > +     if (list_empty(&ci->i_cap_delay_list)) {
> > +         if (delayed) {
> > +                 /* Reschedule delayed caps release if we delayed anything */
> > +                 __cap_delay_requeue(mdsc, ci, false);
> > +         } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> > +                     !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> > +                 /* periodically re-calculate caps wanted by open files */
> > +                 __cap_delay_requeue(mdsc, ci, true);
> > +         }
> > +     }
> >
> >       spin_unlock(&ci->i_ceph_lock);
> >
> > @@ -2549,8 +2570,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
> >   * FIXME: how does a 0 return differ from -EAGAIN?
> >   */
> >  enum {
> > -     NON_BLOCKING    = 1,
> > -     CHECK_FILELOCK  = 2,
> > +     /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
> > +     NON_BLOCKING    = (1 << 8),
> > +     CHECK_FILELOCK  = (1 << 9),
> >  };
> >
> >  static int try_get_cap_refs(struct inode *inode, int need, int want,
> > @@ -2560,7 +2582,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> >       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> >       int ret = 0;
> >       int have, implemented;
> > -     int file_wanted;
> >       bool snap_rwsem_locked = false;
> >
> >       dout("get_cap_refs %p need %s want %s\n", inode,
> > @@ -2576,15 +2597,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> >               goto out_unlock;
> >       }
> >
> > -     /* make sure file is actually open */
> > -     file_wanted = __ceph_caps_file_wanted(ci);
> > -     if ((file_wanted & need) != need) {
> > -             dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
> > -                  ceph_cap_string(need), ceph_cap_string(file_wanted));
> > -             ret = -EBADF;
> > -             goto out_unlock;
> > -     }
> > -
> >       /* finish pending truncate */
> >       while (ci->i_truncate_pending) {
> >               spin_unlock(&ci->i_ceph_lock);
> > @@ -2692,6 +2704,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> >                    ceph_cap_string(have), ceph_cap_string(need));
> >       }
> >  out_unlock:
> > +
> > +     __ceph_touch_fmode(ci, mdsc, flags);
> > +
> >       spin_unlock(&ci->i_ceph_lock);
> >       if (snap_rwsem_locked)
> >               up_read(&mdsc->snap_rwsem);
> > @@ -2729,10 +2744,22 @@ static void check_max_size(struct inode *inode, loff_t endoff)
> >               ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
> >  }
> >
> > +static inline int get_used_file_mode(int need, int want)
> > +{
> > +     int fmode = 0;
> > +     if (need & CEPH_CAP_FILE_RD)
> > +             fmode |= CEPH_FILE_MODE_RD;
> > +     if (need & CEPH_CAP_FILE_WR)
> > +             fmode |= CEPH_FILE_MODE_WR;
> > +     if (want & CEPH_CAP_FILE_LAZYIO)
> > +             fmode |= CEPH_FILE_MODE_LAZY;
> > +     return fmode;
> > +}
> > +
> >  int ceph_try_get_caps(struct inode *inode, int need, int want,
> >                     bool nonblock, int *got)
> >  {
> > -     int ret;
> > +     int ret, flags;
> >
> >       BUG_ON(need & ~CEPH_CAP_FILE_RD);
> >       BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
> > @@ -2740,8 +2767,11 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
> >       if (ret < 0)
> >               return ret;
> >
> > -     ret = try_get_cap_refs(inode, need, want, 0,
> > -                            (nonblock ? NON_BLOCKING : 0), got);
> > +     flags = get_used_file_mode(need, want);
> > +     if (nonblock)
> > +             flags |= NON_BLOCKING;
> > +
> > +     ret = try_get_cap_refs(inode, need, want, 0, flags, got);
> >       return ret == -EAGAIN ? 0 : ret;
> >  }
> >
> > @@ -2767,11 +2797,15 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >           fi->filp_gen != READ_ONCE(fsc->filp_gen))
> >               return -EBADF;
> >
> > +     flags = get_used_file_mode(need, want);
> > +
> >       while (true) {
> >               if (endoff > 0)
> >                       check_max_size(inode, endoff);
> >
> > -             flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
> > +             flags &= CEPH_FILE_MODE_MASK;
> > +             if (atomic_read(&fi->num_locks))
> > +                     flags |= CHECK_FILELOCK;
> >               _got = 0;
> >               ret = try_get_cap_refs(inode, need, want, endoff,
> >                                      flags, &_got);
> > @@ -2791,6 +2825,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >                       list_add(&cw.list, &mdsc->cap_wait_list);
> >                       spin_unlock(&mdsc->caps_list_lock);
> >
> > +                     /* make sure used fmode not timeout */
> > +                     ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
> >                       add_wait_queue(&ci->i_cap_wq, &wait);
> >
> >                       flags |= NON_BLOCKING;
> > @@ -2804,6 +2840,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >                       }
> >
> >                       remove_wait_queue(&ci->i_cap_wq, &wait);
> > +                     ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
> >
> >                       spin_lock(&mdsc->caps_list_lock);
> >                       list_del(&cw.list);
> > @@ -2823,7 +2860,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >               if (ret < 0) {
> >                       if (ret == -ESTALE) {
> >                               /* session was killed, try renew caps */
> > -                             ret = ceph_renew_caps(inode);
> > +                             ret = ceph_renew_caps(inode, flags);
> >                               if (ret == 0)
> >                                       continue;
> >                       }
> > @@ -4121,13 +4158,41 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
> >       dout("flush_dirty_caps done\n");
> >  }
> >
> > +void __ceph_touch_fmode(struct ceph_inode_info *ci,
> > +                     struct ceph_mds_client *mdsc, int fmode)
> > +{
> > +     int i;
> > +     int bits = (fmode << 1);
> > +     unsigned long now = jiffies | 1;
> > +     for (i = 1; i < CEPH_FILE_MODE_BITS; i++) {
> > +             if (bits & (1 << i))
> > +                     ci->i_file_by_mode[i].last_used = now;
> > +     }
> > +
> > +     /* queue periodic check */
> > +     if (bits && list_empty(&ci->i_cap_delay_list))
> > +             __cap_delay_requeue(mdsc, ci, true);
> > +}
> > +
> > +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> > +{
> > +     int i;
> > +     int bits = (fmode << 1) | 1;
> > +     spin_lock(&ci->i_ceph_lock);
> > +     for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > +             if (bits & (1 << i))
> > +                     ci->i_file_by_mode[i].nr += count;
> > +     }
> > +     spin_unlock(&ci->i_ceph_lock);
> > +}
> > +
> >  void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> >  {
> >       int i;
> >       int bits = (fmode << 1) | 1;
> >       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> >               if (bits & (1 << i))
> > -                     ci->i_nr_by_mode[i]++;
> > +                     ci->i_file_by_mode[i].nr++;
> >       }
> >  }
> >
> > @@ -4136,26 +4201,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> >   * we may need to release capabilities to the MDS (or schedule
> >   * their delayed release).
> >   */
> > -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
> > +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
> >  {
> > -     int i, last = 0;
> > +     int i;
> >       int bits = (fmode << 1) | 1;
> >       spin_lock(&ci->i_ceph_lock);
> >       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> >               if (bits & (1 << i)) {
> > -                     BUG_ON(ci->i_nr_by_mode[i] == 0);
> > -                     if (--ci->i_nr_by_mode[i] == 0)
> > -                             last++;
> > +                     BUG_ON(ci->i_file_by_mode[i].nr < count);
> > +                     ci->i_file_by_mode[i].nr -= count;
> >               }
> >       }
> > -     dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
> > -          &ci->vfs_inode, fmode,
> > -          ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
> > -          ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
> >       spin_unlock(&ci->i_ceph_lock);
> > -
> > -     if (last && ci->i_vino.snap == CEPH_NOSNAP)
> > -             ceph_check_caps(ci, 0, NULL);
> >  }
> >
> >  /*
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 7e0190b1f821..f28f420bad23 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >               struct ceph_dir_file_info *dfi =
> >                       kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
> >               if (!dfi) {
> > -                     ceph_put_fmode(ci, fmode); /* clean up */
> > +                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> >                       return -ENOMEM;
> >               }
> >
> > @@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >       } else {
> >               fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> >               if (!fi) {
> > -                     ceph_put_fmode(ci, fmode); /* clean up */
> > +                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> >                       return -ENOMEM;
> >               }
> >
> > @@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> >       case S_IFLNK:
> >               dout("init_file %p %p 0%o (symlink)\n", inode, file,
> >                    inode->i_mode);
> > -             ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > +             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> >               break;
> >
> >       default:
> > @@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> >                * we need to drop the open ref now, since we don't
> >                * have .release set to ceph_release.
> >                */
> > -             ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > +             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> >               BUG_ON(inode->i_fop->release == ceph_release);
> >
> >               /* call the proper open fop */
> > @@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> >  /*
> >   * try renew caps after session gets killed.
> >   */
> > -int ceph_renew_caps(struct inode *inode)
> > +int ceph_renew_caps(struct inode *inode, int fmode)
> >  {
> > -     struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> > +     struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> >       struct ceph_inode_info *ci = ceph_inode(inode);
> >       struct ceph_mds_request *req;
> >       int err, flags, wanted;
> >
> >       spin_lock(&ci->i_ceph_lock);
> > +     __ceph_touch_fmode(ci, mdsc, fmode);
> >       wanted = __ceph_caps_file_wanted(ci);
> >       if (__ceph_is_any_real_caps(ci) &&
> >           (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
> > @@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file)
> >       } else if (ceph_snap(inode) != CEPH_NOSNAP &&
> >                  (ci->i_snap_caps & wanted) == wanted) {
> >               __ceph_get_fmode(ci, fmode);
> > +             __ceph_touch_fmode(ci, mdsc, fmode);
> >               spin_unlock(&ci->i_ceph_lock);
> >               return ceph_init_file(inode, file, fmode);
> >       }
> > @@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> >       }
> >  out_req:
> >       if (!req->r_err && req->r_target_inode)
> > -             ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
> > +             ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
> >       ceph_mdsc_put_request(req);
> >  out_ctx:
> >       ceph_release_acl_sec_ctx(&as_ctx);
> > @@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file)
> >               dout("release inode %p dir file %p\n", inode, file);
> >               WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
> >
> > -             ceph_put_fmode(ci, dfi->file_info.fmode);
> > +             ceph_put_fmode(ci, dfi->file_info.fmode, 1);
> >
> >               if (dfi->last_readdir)
> >                       ceph_mdsc_put_request(dfi->last_readdir);
> > @@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file)
> >               dout("release inode %p regular file %p\n", inode, file);
> >               WARN_ON(!list_empty(&fi->rw_contexts));
> >
> > -             ceph_put_fmode(ci, fi->fmode);
> > +             ceph_put_fmode(ci, fi->fmode, 1);
> > +
> >               kmem_cache_free(ceph_file_cachep, fi);
> >       }
> >
> > @@ -1560,7 +1563,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >               if (dirty)
> >                       __mark_inode_dirty(inode, dirty);
> >               if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
> > -                     ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
> > +                     ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
>
> NODELAY to AUTHONLY? Why? I guess if we're approaching quota we do want
> to be talking to the authority, but I'm not sure that change belongs in
> this patch.
>
> >       }
> >
> >       dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index 094b8fc37787..b279bd8e168e 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -478,8 +478,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> >       ci->i_head_snapc = NULL;
> >       ci->i_snap_caps = 0;
> >
> > -     for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
> > -             ci->i_nr_by_mode[i] = 0;
> > +     for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > +             ci->i_file_by_mode[i].nr = 0;
> > +             ci->i_file_by_mode[i].last_used = 0;
> > +     }
> >
> >       mutex_init(&ci->i_truncate_mutex);
> >       ci->i_truncate_seq = 0;
> > @@ -637,7 +639,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
> >                       if ((issued & (CEPH_CAP_FILE_CACHE|
> >                                      CEPH_CAP_FILE_BUFFER)) ||
> >                           mapping_mapped(inode->i_mapping) ||
> > -                         __ceph_caps_file_wanted(ci)) {
> > +                         __ceph_is_file_opened(ci)) {
> >                               ci->i_truncate_pending++;
> >                               queue_trunc = 1;
> >                       }
> > @@ -1010,6 +1012,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
> >                       fill_inline = true;
> >       }
> >
> > +     if (cap_fmode >= 0) {
> > +             if (!info_caps)
> > +                     pr_warn("mds issued no caps on %llx.%llx\n",
> > +                             ceph_vinop(inode));
> > +             __ceph_touch_fmode(ci, mdsc, cap_fmode);
> > +     }
> > +
> >       spin_unlock(&ci->i_ceph_lock);
> >
> >       if (fill_inline)
> > diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> > index c90f03beb15d..da0ee54ae5bc 100644
> > --- a/fs/ceph/ioctl.c
> > +++ b/fs/ceph/ioctl.c
> > @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
> >       struct ceph_file_info *fi = file->private_data;
> >       struct inode *inode = file_inode(file);
> >       struct ceph_inode_info *ci = ceph_inode(inode);
> > +     struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> >
> >       if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
> >               spin_lock(&ci->i_ceph_lock);
> >               fi->fmode |= CEPH_FILE_MODE_LAZY;
> > -             ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
> > +             ci->i_file_by_mode[ffs(CEPH_FILE_MODE_LAZY)].nr++;
> > +             __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_LAZY);
> >               spin_unlock(&ci->i_ceph_lock);
> >               dout("ioctl_layzio: file %p marked lazy\n", file);
> >
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index d370f89df358..029823643b8b 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -361,7 +361,10 @@ struct ceph_inode_info {
> >                                                   dirty|flushing caps */
> >       unsigned i_snap_caps;           /* cap bits for snapped files */
> >
> > -     int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
> > +     struct {
> > +             int nr;
> > +             unsigned long last_used;
> > +     } i_file_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
> >
> >       struct mutex i_truncate_mutex;
> >       u32 i_truncate_seq;        /* last truncate to smaller size */
> > @@ -673,6 +676,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
> >  extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
> >  extern int __ceph_caps_used(struct ceph_inode_info *ci);
> >
> > +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
> > +{
> > +     return ci->i_file_by_mode[0].nr;
> > +}
> >  extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
> >  extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
> >
> > @@ -1074,7 +1081,10 @@ extern int ceph_try_get_caps(struct inode *inode,
> >
> >  /* for counting open files by mode */
> >  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> > -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
> > +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
> > +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
> > +extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
> > +                            struct ceph_mds_client *mdsc, int fmode);
> >
> >  /* addr.c */
> >  extern const struct address_space_operations ceph_aops;
> > @@ -1086,7 +1096,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
> >  /* file.c */
> >  extern const struct file_operations ceph_file_fops;
> >
> > -extern int ceph_renew_caps(struct inode *inode);
> > +extern int ceph_renew_caps(struct inode *inode, int fmode);
> >  extern int ceph_open(struct inode *inode, struct file *file);
> >  extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> >                           struct file *file, unsigned flags, umode_t mode);
> > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > index cb21c5cf12c3..8017130a08a1 100644
> > --- a/include/linux/ceph/ceph_fs.h
> > +++ b/include/linux/ceph/ceph_fs.h
> > @@ -564,6 +564,7 @@ struct ceph_filelock {
> >  #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
> >  #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
> >  #define CEPH_FILE_MODE_BITS       4
> > +#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
> >
> >  int ceph_flags_to_mode(int flags);
> >
>
> --
> Jeff Layton <jlayton@kernel.org>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/4] ceph: simplify calling of ceph_get_fmode()
  2020-02-21  2:34     ` Yan, Zheng
@ 2020-02-21 12:06       ` Jeff Layton
  0 siblings, 0 replies; 14+ messages in thread
From: Jeff Layton @ 2020-02-21 12:06 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Yan, Zheng, ceph-devel

On Fri, 2020-02-21 at 10:34 +0800, Yan, Zheng wrote:
> On Fri, Feb 21, 2020 at 3:15 AM Jeff Layton <jlayton@kernel.org> wrote:
> > On Thu, 2020-02-20 at 20:26 +0800, Yan, Zheng wrote:
> > > Call ceph_get_fmode() when initializing file. Because fill_inode()
> > > already calls ceph_touch_fmode() for open file request. It affects
> > 
> > You mean __ceph_touch_fmode()
> > 
> > > __ceph_caps_file_wanted() for 'caps_wanted_delay_min' seconds, enough
> > > for ceph_get_fmode() to get called by ceph_init_file_info().
> > > 
> > 
> > I don't understand this changelog. Are you saying there is a potential
> > race of some sort, but that you don't think it's something we can hit in
> > practice?
> > 
> 
> threads run handle_reply() and open request initiator are different.
> there is a small window between waking up request initiator and
> request initiator calls ceph_get_fmode(). someone else may call
> ceph_check_caps() in the window and wrongly release wanted caps. This
> is the reason old code calls _ceph_get_fmode() in fill_inode().
> 
> 

Ok. So how are you addressing this race in this series? Is it just that
we now hold on to caps for a minimal amount of time, and we just assume
that ceph_get_fmode will be called before they can expire?

> > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > > ---
> > >  fs/ceph/caps.c  | 26 +++-----------------------
> > >  fs/ceph/file.c  | 21 +++++----------------
> > >  fs/ceph/inode.c |  8 +-------
> > >  fs/ceph/super.h |  3 +--
> > >  4 files changed, 10 insertions(+), 48 deletions(-)
> > > 
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index ccdc47bd7cf0..2f4ff7e9508e 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -606,7 +606,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
> > >   */
> > >  void ceph_add_cap(struct inode *inode,
> > >                 struct ceph_mds_session *session, u64 cap_id,
> > > -               int fmode, unsigned issued, unsigned wanted,
> > > +               unsigned issued, unsigned wanted,
> > >                 unsigned seq, unsigned mseq, u64 realmino, int flags,
> > >                 struct ceph_cap **new_cap)
> > >  {
> > > @@ -622,13 +622,6 @@ void ceph_add_cap(struct inode *inode,
> > >       dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
> > >            session->s_mds, cap_id, ceph_cap_string(issued), seq);
> > > 
> > > -     /*
> > > -      * If we are opening the file, include file mode wanted bits
> > > -      * in wanted.
> > > -      */
> > > -     if (fmode >= 0)
> > > -             wanted |= ceph_caps_for_mode(fmode);
> > > -
> > >       spin_lock(&session->s_gen_ttl_lock);
> > >       gen = session->s_cap_gen;
> > >       spin_unlock(&session->s_gen_ttl_lock);
> > > @@ -753,9 +746,6 @@ void ceph_add_cap(struct inode *inode,
> > >       cap->issue_seq = seq;
> > >       cap->mseq = mseq;
> > >       cap->cap_gen = gen;
> > > -
> > > -     if (fmode >= 0)
> > > -             __ceph_get_fmode(ci, fmode);
> > >  }
> > > 
> > >  /*
> > > @@ -3726,7 +3716,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
> > >               /* add placeholder for the export tagert */
> > >               int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
> > >               tcap = new_cap;
> > > -             ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
> > > +             ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
> > >                            t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
> > > 
> > >               if (!list_empty(&ci->i_cap_flush_list) &&
> > > @@ -3831,7 +3821,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
> > >       __ceph_caps_issued(ci, &issued);
> > >       issued |= __ceph_caps_dirty(ci);
> > > 
> > > -     ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
> > > +     ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
> > >                    realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
> > > 
> > >       ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
> > > @@ -4186,16 +4176,6 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> > >       spin_unlock(&ci->i_ceph_lock);
> > >  }
> > > 
> > > -void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> > > -{
> > > -     int i;
> > > -     int bits = (fmode << 1) | 1;
> > > -     for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > > -             if (bits & (1 << i))
> > > -                     ci->i_file_by_mode[i].nr++;
> > > -     }
> > > -}
> > > -
> > >  /*
> > >   * Drop open file reference.  If we were the last open file,
> > >   * we may need to release capabilities to the MDS (or schedule
> > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > index f28f420bad23..60a2dfa02ba2 100644
> > > --- a/fs/ceph/file.c
> > > +++ b/fs/ceph/file.c
> > > @@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> > >       if (isdir) {
> > >               struct ceph_dir_file_info *dfi =
> > >                       kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
> > > -             if (!dfi) {
> > > -                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> > > +             if (!dfi)
> > >                       return -ENOMEM;
> > > -             }
> > > 
> > >               file->private_data = dfi;
> > >               fi = &dfi->file_info;
> > > @@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> > >               dfi->readdir_cache_idx = -1;
> > >       } else {
> > >               fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> > > -             if (!fi) {
> > > -                     ceph_put_fmode(ci, fmode, 1); /* clean up */
> > > +             if (!fi)
> > >                       return -ENOMEM;
> > > -             }
> > > 
> > >               file->private_data = fi;
> > >       }
> > > 
> > > +     ceph_get_fmode(ci, fmode, 1);
> > >       fi->fmode = fmode;
> > > +
> > >       spin_lock_init(&fi->rw_contexts_lock);
> > >       INIT_LIST_HEAD(&fi->rw_contexts);
> > >       fi->meta_err = errseq_sample(&ci->i_meta_err);
> > > @@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> > >       case S_IFLNK:
> > >               dout("init_file %p %p 0%o (symlink)\n", inode, file,
> > >                    inode->i_mode);
> > > -             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> > >               break;
> > > 
> > >       default:
> > > @@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> > >                * we need to drop the open ref now, since we don't
> > >                * have .release set to ceph_release.
> > >                */
> > > -             ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
> > >               BUG_ON(inode->i_fop->release == ceph_release);
> > > 
> > >               /* call the proper open fop */
> > > @@ -327,7 +323,6 @@ int ceph_renew_caps(struct inode *inode, int fmode)
> > >       req->r_inode = inode;
> > >       ihold(inode);
> > >       req->r_num_caps = 1;
> > > -     req->r_fmode = -1;
> > > 
> > >       err = ceph_mdsc_do_request(mdsc, NULL, req);
> > >       ceph_mdsc_put_request(req);
> > > @@ -373,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
> > > 
> > >       /* trivially open snapdir */
> > >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> > > -             spin_lock(&ci->i_ceph_lock);
> > > -             __ceph_get_fmode(ci, fmode);
> > > -             spin_unlock(&ci->i_ceph_lock);
> > >               return ceph_init_file(inode, file, fmode);
> > >       }
> > > 
> > > @@ -393,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
> > >               dout("open %p fmode %d want %s issued %s using existing\n",
> > >                    inode, fmode, ceph_cap_string(wanted),
> > >                    ceph_cap_string(issued));
> > > -             __ceph_get_fmode(ci, fmode);
> > > +             __ceph_touch_fmode(ci, mdsc, fmode);
> > >               spin_unlock(&ci->i_ceph_lock);
> > > 
> > >               /* adjust wanted? */
> > > @@ -405,7 +397,6 @@ int ceph_open(struct inode *inode, struct file *file)
> > >               return ceph_init_file(inode, file, fmode);
> > >       } else if (ceph_snap(inode) != CEPH_NOSNAP &&
> > >                  (ci->i_snap_caps & wanted) == wanted) {
> > > -             __ceph_get_fmode(ci, fmode);
> > >               __ceph_touch_fmode(ci, mdsc, fmode);
> > >               spin_unlock(&ci->i_ceph_lock);
> > >               return ceph_init_file(inode, file, fmode);
> > > @@ -526,8 +517,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > >               err = finish_open(file, dentry, ceph_open);
> > >       }
> > >  out_req:
> > > -     if (!req->r_err && req->r_target_inode)
> > > -             ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
> > >       ceph_mdsc_put_request(req);
> > >  out_ctx:
> > >       ceph_release_acl_sec_ctx(&as_ctx);
> > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > index b279bd8e168e..bb73b0c8c4d9 100644
> > > --- a/fs/ceph/inode.c
> > > +++ b/fs/ceph/inode.c
> > > @@ -969,7 +969,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
> > >               if (ceph_snap(inode) == CEPH_NOSNAP) {
> > >                       ceph_add_cap(inode, session,
> > >                                    le64_to_cpu(info->cap.cap_id),
> > > -                                  cap_fmode, info_caps,
> > > +                                  info_caps,
> > >                                    le32_to_cpu(info->cap.wanted),
> > >                                    le32_to_cpu(info->cap.seq),
> > >                                    le32_to_cpu(info->cap.mseq),
> > > @@ -994,13 +994,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
> > >                       dout(" %p got snap_caps %s\n", inode,
> > >                            ceph_cap_string(info_caps));
> > >                       ci->i_snap_caps |= info_caps;
> > > -                     if (cap_fmode >= 0)
> > > -                             __ceph_get_fmode(ci, cap_fmode);
> > >               }
> > > -     } else if (cap_fmode >= 0) {
> > > -             pr_warn("mds issued no caps on %llx.%llx\n",
> > > -                        ceph_vinop(inode));
> > > -             __ceph_get_fmode(ci, cap_fmode);
> > >       }
> > > 
> > >       if (iinfo->inline_version > 0 &&
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index 029823643b8b..1ea76466efcb 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -1038,7 +1038,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
> > >                                    struct ceph_cap_reservation *ctx);
> > >  extern void ceph_add_cap(struct inode *inode,
> > >                        struct ceph_mds_session *session, u64 cap_id,
> > > -                      int fmode, unsigned issued, unsigned wanted,
> > > +                      unsigned issued, unsigned wanted,
> > >                        unsigned cap, unsigned seq, u64 realmino, int flags,
> > >                        struct ceph_cap **new_cap);
> > >  extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
> > > @@ -1080,7 +1080,6 @@ extern int ceph_try_get_caps(struct inode *inode,
> > >                            int need, int want, bool nonblock, int *got);
> > > 
> > >  /* for counting open files by mode */
> > > -extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> > >  extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
> > >  extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
> > >  extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
> > 
> > --
> > Jeff Layton <jlayton@kernel.org>
> > 

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-02-21 12:06 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-20 12:26 [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
2020-02-20 12:26 ` [PATCH 2/4] ceph: consider file's last read/write when calculating wanted caps Yan, Zheng
2020-02-20 14:14   ` Jeff Layton
2020-02-20 14:53     ` Yan, Zheng
2020-02-20 16:20       ` Jeff Layton
2020-02-20 19:06   ` Jeff Layton
2020-02-21  3:22     ` Yan, Zheng
2020-02-20 12:26 ` [PATCH 3/4] ceph: simplify calling of ceph_get_fmode() Yan, Zheng
2020-02-20 19:14   ` Jeff Layton
2020-02-21  2:34     ` Yan, Zheng
2020-02-21 12:06       ` Jeff Layton
2020-02-20 12:26 ` [PATCH 4/4] ceph: remove delay check logic from ceph_check_caps() Yan, Zheng
2020-02-20 19:57   ` Jeff Layton
2020-02-20 13:42 ` [PATCH 1/4] ceph: always renew caps if mds_wanted is insufficient Jeff Layton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.