All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/4] ceph: don't request caps for idle open files
@ 2020-02-21 13:16 Yan, Zheng
  2020-02-21 13:16 ` [PATCH v2 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
                   ` (5 more replies)
  0 siblings, 6 replies; 20+ messages in thread
From: Yan, Zheng @ 2020-02-21 13:16 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, Yan, Zheng

This series make cephfs client not request caps for open files that
idle for a long time. For the case that one active client and multiple
standby clients open the same file, this increase the possibility that
mds issues exclusive caps to the active client.

Yan, Zheng (4):
  ceph: always renew caps if mds_wanted is insufficient
  ceph: consider inode's last read/write when calculating wanted caps
  ceph: simplify calling of ceph_get_fmode()
  ceph: remove delay check logic from ceph_check_caps()

 fs/ceph/caps.c               | 324 +++++++++++++++--------------------
 fs/ceph/file.c               |  39 ++---
 fs/ceph/inode.c              |  19 +-
 fs/ceph/ioctl.c              |   2 +
 fs/ceph/mds_client.c         |   5 -
 fs/ceph/super.h              |  35 ++--
 include/linux/ceph/ceph_fs.h |   1 +
 7 files changed, 188 insertions(+), 237 deletions(-)

-- 
2.21.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v2 1/4] ceph: always renew caps if mds_wanted is insufficient
  2020-02-21 13:16 [PATCH v2 0/4] ceph: don't request caps for idle open files Yan, Zheng
@ 2020-02-21 13:16 ` Yan, Zheng
  2020-02-21 17:17   ` Jeff Layton
  2020-02-21 13:16 ` [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps Yan, Zheng
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: Yan, Zheng @ 2020-02-21 13:16 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, Yan, Zheng

original code only renews caps for inodes with CEPH_I_CAP_DROPPED flags.
The flag indicates that mds closed session and caps were dropped. This
patch is preparation for not requesting caps for idle open files.

CEPH_I_CAP_DROPPED is no longer tested by anyone, so this patch also
remove it.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/caps.c       | 36 +++++++++++++++---------------------
 fs/ceph/mds_client.c |  5 -----
 fs/ceph/super.h      | 11 +++++------
 3 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d05717397c2a..293920d013ff 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2659,6 +2659,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 		}
 	} else {
 		int session_readonly = false;
+		int mds_wanted;
 		if (ci->i_auth_cap &&
 		    (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
 			struct ceph_mds_session *s = ci->i_auth_cap->session;
@@ -2667,32 +2668,27 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 			spin_unlock(&s->s_cap_lock);
 		}
 		if (session_readonly) {
-			dout("get_cap_refs %p needed %s but mds%d readonly\n",
+			dout("get_cap_refs %p need %s but mds%d readonly\n",
 			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
 			ret = -EROFS;
 			goto out_unlock;
 		}
 
-		if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
-			int mds_wanted;
-			if (READ_ONCE(mdsc->fsc->mount_state) ==
-			    CEPH_MOUNT_SHUTDOWN) {
-				dout("get_cap_refs %p forced umount\n", inode);
-				ret = -EIO;
-				goto out_unlock;
-			}
-			mds_wanted = __ceph_caps_mds_wanted(ci, false);
-			if (need & ~(mds_wanted & need)) {
-				dout("get_cap_refs %p caps were dropped"
-				     " (session killed?)\n", inode);
-				ret = -ESTALE;
-				goto out_unlock;
-			}
-			if (!(file_wanted & ~mds_wanted))
-				ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
+		if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+			dout("get_cap_refs %p forced umount\n", inode);
+			ret = -EIO;
+			goto out_unlock;
+		}
+		mds_wanted = __ceph_caps_mds_wanted(ci, false);
+		if (need & ~mds_wanted) {
+			dout("get_cap_refs %p need %s > mds_wanted %s\n",
+			     inode, ceph_cap_string(need),
+			     ceph_cap_string(mds_wanted));
+			ret = -ESTALE;
+			goto out_unlock;
 		}
 
-		dout("get_cap_refs %p have %s needed %s\n", inode,
+		dout("get_cap_refs %p have %s need %s\n", inode,
 		     ceph_cap_string(have), ceph_cap_string(need));
 	}
 out_unlock:
@@ -3646,8 +3642,6 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 		goto out_unlock;
 
 	if (target < 0) {
-		if (cap->mds_wanted | cap->issued)
-			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 		__ceph_remove_cap(cap, false);
 		goto out_unlock;
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fab9d6461a65..98d746b3bb53 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1411,8 +1411,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	dout("removing cap %p, ci is %p, inode is %p\n",
 	     cap, ci, &ci->vfs_inode);
 	spin_lock(&ci->i_ceph_lock);
-	if (cap->mds_wanted | cap->issued)
-		ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 	__ceph_remove_cap(cap, false);
 	if (!ci->i_auth_cap) {
 		struct ceph_cap_flush *cf;
@@ -1578,9 +1576,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 			/* mds did not re-issue stale cap */
 			spin_lock(&ci->i_ceph_lock);
 			cap->issued = cap->implemented = CEPH_CAP_PIN;
-			/* make sure mds knows what we want */
-			if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
-				ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 			spin_unlock(&ci->i_ceph_lock);
 		}
 	} else if (ev == FORCE_RO) {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 37dc1ac8f6c3..48e84d7f48a0 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -517,12 +517,11 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
 #define CEPH_I_POOL_WR		(1 << 5)  /* can write to pool */
 #define CEPH_I_SEC_INITED	(1 << 6)  /* security initialized */
-#define CEPH_I_CAP_DROPPED	(1 << 7)  /* caps were forcibly dropped */
-#define CEPH_I_KICK_FLUSH	(1 << 8)  /* kick flushing caps */
-#define CEPH_I_FLUSH_SNAPS	(1 << 9)  /* need flush snapss */
-#define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */
-#define CEPH_I_ERROR_FILELOCK	(1 << 11) /* have seen file lock errors */
-#define CEPH_I_ODIRECT		(1 << 12) /* inode in direct I/O mode */
+#define CEPH_I_KICK_FLUSH	(1 << 7)  /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS	(1 << 8)  /* need flush snapss */
+#define CEPH_I_ERROR_WRITE	(1 << 9)  /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK	(1 << 10) /* have seen file lock errors */
+#define CEPH_I_ODIRECT		(1 << 11) /* inode in direct I/O mode */
 
 /*
  * Masks of ceph inode work.
-- 
2.21.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps
  2020-02-21 13:16 [PATCH v2 0/4] ceph: don't request caps for idle open files Yan, Zheng
  2020-02-21 13:16 ` [PATCH v2 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
@ 2020-02-21 13:16 ` Yan, Zheng
  2020-02-21 13:52   ` Jeff Layton
  2020-02-21 14:27   ` Jeff Layton
  2020-02-21 13:16 ` [PATCH v2 3/4] ceph: simplify calling of ceph_get_fmode() Yan, Zheng
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 20+ messages in thread
From: Yan, Zheng @ 2020-02-21 13:16 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, Yan, Zheng

Add i_last_rd and i_last_wr to ceph_inode_info. These two fields are
used to track inode's last read/write, they are updated when getting
caps for read/write.

If there is no read/write on an inode for 'caps_wanted_delay_max'
seconds, __ceph_caps_file_wanted() does not request caps for read/write
even there are open files.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/caps.c               | 152 ++++++++++++++++++++++++-----------
 fs/ceph/file.c               |  21 ++---
 fs/ceph/inode.c              |  10 ++-
 fs/ceph/ioctl.c              |   2 +
 fs/ceph/super.h              |  13 ++-
 include/linux/ceph/ceph_fs.h |   1 +
 6 files changed, 139 insertions(+), 60 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 293920d013ff..2a9df235286d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -971,18 +971,49 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
 	return used;
 }
 
+#define FMODE_WAIT_BIAS 1000
+
 /*
  * wanted, by virtue of open file modes
  */
 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 {
-	int i, bits = 0;
-	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
-		if (ci->i_nr_by_mode[i])
-			bits |= 1 << i;
+	struct ceph_mount_options *opt =
+		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+	unsigned long used_cutoff =
+		round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
+	unsigned long idle_cutoff =
+		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
+	int bits = 0;
+
+	if (ci->i_nr_by_mode[0] > 0)
+		bits |= CEPH_FILE_MODE_PIN;
+
+	if (ci->i_nr_by_mode[1] > 0) {
+		if (ci->i_nr_by_mode[1] >= FMODE_WAIT_BIAS ||
+		    time_after(ci->i_last_rd, used_cutoff))
+			bits |= CEPH_FILE_MODE_RD;
+	} else if (time_after(ci->i_last_rd, idle_cutoff)) {
+		bits |= CEPH_FILE_MODE_RD;
+	}
+
+	if (ci->i_nr_by_mode[2] > 0) {
+		if (ci->i_nr_by_mode[2] >= FMODE_WAIT_BIAS ||
+		    time_after(ci->i_last_wr, used_cutoff))
+			bits |= CEPH_FILE_MODE_WR;
+	} else if (time_after(ci->i_last_wr, idle_cutoff)) {
+		bits |= CEPH_FILE_MODE_WR;
 	}
+
+	/* check lazyio only when read/write is wanted */
+	if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[3] > 0)
+		bits |= CEPH_FILE_MODE_LAZY;
+
 	if (bits == 0)
 		return 0;
+	if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
+		return 0;
+
 	return ceph_caps_for_mode(bits >> 1);
 }
 
@@ -1021,14 +1052,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
 	return mds_wanted;
 }
 
-/*
- * called under i_ceph_lock
- */
-static int __ceph_is_single_caps(struct ceph_inode_info *ci)
-{
-	return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
-}
-
 int ceph_is_any_caps(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1856,10 +1879,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	if (ci->i_ceph_flags & CEPH_I_FLUSH)
 		flags |= CHECK_CAPS_FLUSH;
 
-	if (!(flags & CHECK_CAPS_AUTHONLY) ||
-	    (ci->i_auth_cap && __ceph_is_single_caps(ci)))
-		__cap_delay_cancel(mdsc, ci);
-
 	goto retry_locked;
 retry:
 	spin_lock(&ci->i_ceph_lock);
@@ -2081,9 +2100,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
 
-	/* Reschedule delayed caps release if we delayed anything */
-	if (delayed)
-		__cap_delay_requeue(mdsc, ci, false);
+	if (list_empty(&ci->i_cap_delay_list)) {
+	    if (delayed) {
+		    /* Reschedule delayed caps release if we delayed anything */
+		    __cap_delay_requeue(mdsc, ci, false);
+	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
+			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+		    /* periodically re-calculate caps wanted by open files */
+		    __cap_delay_requeue(mdsc, ci, true);
+	    }
+	}
 
 	spin_unlock(&ci->i_ceph_lock);
 
@@ -2549,8 +2575,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
  * FIXME: how does a 0 return differ from -EAGAIN?
  */
 enum {
-	NON_BLOCKING	= 1,
-	CHECK_FILELOCK	= 2,
+	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
+	NON_BLOCKING	= (1 << 8),
+	CHECK_FILELOCK	= (1 << 9),
 };
 
 static int try_get_cap_refs(struct inode *inode, int need, int want,
@@ -2560,7 +2587,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	int ret = 0;
 	int have, implemented;
-	int file_wanted;
 	bool snap_rwsem_locked = false;
 
 	dout("get_cap_refs %p need %s want %s\n", inode,
@@ -2576,15 +2602,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 		goto out_unlock;
 	}
 
-	/* make sure file is actually open */
-	file_wanted = __ceph_caps_file_wanted(ci);
-	if ((file_wanted & need) != need) {
-		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
-		     ceph_cap_string(need), ceph_cap_string(file_wanted));
-		ret = -EBADF;
-		goto out_unlock;
-	}
-
 	/* finish pending truncate */
 	while (ci->i_truncate_pending) {
 		spin_unlock(&ci->i_ceph_lock);
@@ -2692,6 +2709,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 		     ceph_cap_string(have), ceph_cap_string(need));
 	}
 out_unlock:
+
+	__ceph_touch_fmode(ci, mdsc, flags);
+
 	spin_unlock(&ci->i_ceph_lock);
 	if (snap_rwsem_locked)
 		up_read(&mdsc->snap_rwsem);
@@ -2729,10 +2749,20 @@ static void check_max_size(struct inode *inode, loff_t endoff)
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 }
 
+static inline int get_used_fmode(int caps)
+{
+	int fmode = 0;
+	if (caps & CEPH_CAP_FILE_RD)
+		fmode |= CEPH_FILE_MODE_RD;
+	if (caps & CEPH_CAP_FILE_WR)
+		fmode |= CEPH_FILE_MODE_WR;
+	return fmode;
+}
+
 int ceph_try_get_caps(struct inode *inode, int need, int want,
 		      bool nonblock, int *got)
 {
-	int ret;
+	int ret, flags;
 
 	BUG_ON(need & ~CEPH_CAP_FILE_RD);
 	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
@@ -2740,8 +2770,10 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
 	if (ret < 0)
 		return ret;
 
-	ret = try_get_cap_refs(inode, need, want, 0,
-			       (nonblock ? NON_BLOCKING : 0), got);
+	flags = get_used_fmode(need | want);
+	if (nonblock)
+		flags |= NON_BLOCKING;
+	ret = try_get_cap_refs(inode, need, want, 0, flags, got);
 	return ret == -EAGAIN ? 0 : ret;
 }
 
@@ -2767,11 +2799,15 @@ int ceph_get_caps(struct file *filp, int need, int want,
 	    fi->filp_gen != READ_ONCE(fsc->filp_gen))
 		return -EBADF;
 
+	flags = get_used_fmode(need | want);
+
 	while (true) {
 		if (endoff > 0)
 			check_max_size(inode, endoff);
 
-		flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
+		flags &= CEPH_FILE_MODE_MASK;
+		if (atomic_read(&fi->num_locks))
+			flags |= CHECK_FILELOCK;
 		_got = 0;
 		ret = try_get_cap_refs(inode, need, want, endoff,
 				       flags, &_got);
@@ -2791,6 +2827,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
 			list_add(&cw.list, &mdsc->cap_wait_list);
 			spin_unlock(&mdsc->caps_list_lock);
 
+			/* make sure used fmode not timeout */
+			ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
 			add_wait_queue(&ci->i_cap_wq, &wait);
 
 			flags |= NON_BLOCKING;
@@ -2804,6 +2842,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
 			}
 
 			remove_wait_queue(&ci->i_cap_wq, &wait);
+			ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
 
 			spin_lock(&mdsc->caps_list_lock);
 			list_del(&cw.list);
@@ -2823,7 +2862,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
 		if (ret < 0) {
 			if (ret == -ESTALE) {
 				/* session was killed, try renew caps */
-				ret = ceph_renew_caps(inode);
+				ret = ceph_renew_caps(inode, flags);
 				if (ret == 0)
 					continue;
 			}
@@ -4121,6 +4160,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 	dout("flush_dirty_caps done\n");
 }
 
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
+			struct ceph_mds_client *mdsc, int fmode)
+{
+	unsigned long now = jiffies;
+	if (fmode & CEPH_FILE_MODE_RD)
+		ci->i_last_rd = now;
+	if (fmode & CEPH_FILE_MODE_WR)
+		ci->i_last_wr = now;
+	/* queue periodic check */
+	if (fmode && list_empty(&ci->i_cap_delay_list))
+		__cap_delay_requeue(mdsc, ci, true);
+}
+
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
+{
+	int i;
+	int bits = (fmode << 1) | 1;
+	spin_lock(&ci->i_ceph_lock);
+	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+		if (bits & (1 << i))
+			ci->i_nr_by_mode[i] += count;
+	}
+	spin_unlock(&ci->i_ceph_lock);
+}
+
 void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
 {
 	int i;
@@ -4136,26 +4200,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
  * we may need to release capabilities to the MDS (or schedule
  * their delayed release).
  */
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
 {
-	int i, last = 0;
+	int i;
 	int bits = (fmode << 1) | 1;
 	spin_lock(&ci->i_ceph_lock);
 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
 		if (bits & (1 << i)) {
-			BUG_ON(ci->i_nr_by_mode[i] == 0);
-			if (--ci->i_nr_by_mode[i] == 0)
-				last++;
+			BUG_ON(ci->i_nr_by_mode[i] < count);
+			ci->i_nr_by_mode[i] -= count;
 		}
 	}
-	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
-	     &ci->vfs_inode, fmode,
-	     ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
-	     ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
 	spin_unlock(&ci->i_ceph_lock);
-
-	if (last && ci->i_vino.snap == CEPH_NOSNAP)
-		ceph_check_caps(ci, 0, NULL);
 }
 
 /*
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7e0190b1f821..f6ca9be9fbbd 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
 		struct ceph_dir_file_info *dfi =
 			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
 		if (!dfi) {
-			ceph_put_fmode(ci, fmode); /* clean up */
+			ceph_put_fmode(ci, fmode, 1); /* clean up */
 			return -ENOMEM;
 		}
 
@@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
 	} else {
 		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
 		if (!fi) {
-			ceph_put_fmode(ci, fmode); /* clean up */
+			ceph_put_fmode(ci, fmode, 1); /* clean up */
 			return -ENOMEM;
 		}
 
@@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 	case S_IFLNK:
 		dout("init_file %p %p 0%o (symlink)\n", inode, file,
 		     inode->i_mode);
-		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
 		break;
 
 	default:
@@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 		 * we need to drop the open ref now, since we don't
 		 * have .release set to ceph_release.
 		 */
-		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
 		BUG_ON(inode->i_fop->release == ceph_release);
 
 		/* call the proper open fop */
@@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 /*
  * try renew caps after session gets killed.
  */
-int ceph_renew_caps(struct inode *inode)
+int ceph_renew_caps(struct inode *inode, int fmode)
 {
-	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_request *req;
 	int err, flags, wanted;
 
 	spin_lock(&ci->i_ceph_lock);
+	__ceph_touch_fmode(ci, mdsc, fmode);
 	wanted = __ceph_caps_file_wanted(ci);
 	if (__ceph_is_any_real_caps(ci) &&
 	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
@@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file)
 	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
 		   (ci->i_snap_caps & wanted) == wanted) {
 		__ceph_get_fmode(ci, fmode);
+		__ceph_touch_fmode(ci, mdsc, fmode);
 		spin_unlock(&ci->i_ceph_lock);
 		return ceph_init_file(inode, file, fmode);
 	}
@@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	}
 out_req:
 	if (!req->r_err && req->r_target_inode)
-		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
+		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
 	ceph_mdsc_put_request(req);
 out_ctx:
 	ceph_release_acl_sec_ctx(&as_ctx);
@@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file)
 		dout("release inode %p dir file %p\n", inode, file);
 		WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
 
-		ceph_put_fmode(ci, dfi->file_info.fmode);
+		ceph_put_fmode(ci, dfi->file_info.fmode, 1);
 
 		if (dfi->last_readdir)
 			ceph_mdsc_put_request(dfi->last_readdir);
@@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file)
 		dout("release inode %p regular file %p\n", inode, file);
 		WARN_ON(!list_empty(&fi->rw_contexts));
 
-		ceph_put_fmode(ci, fi->fmode);
+		ceph_put_fmode(ci, fi->fmode, 1);
+
 		kmem_cache_free(ceph_file_cachep, fi);
 	}
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 094b8fc37787..95e7440cf6f7 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -478,6 +478,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_head_snapc = NULL;
 	ci->i_snap_caps = 0;
 
+	ci->i_last_rd = ci->i_last_wr = jiffies;
 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
 		ci->i_nr_by_mode[i] = 0;
 
@@ -637,7 +638,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
 			if ((issued & (CEPH_CAP_FILE_CACHE|
 				       CEPH_CAP_FILE_BUFFER)) ||
 			    mapping_mapped(inode->i_mapping) ||
-			    __ceph_caps_file_wanted(ci)) {
+			    __ceph_is_file_opened(ci)) {
 				ci->i_truncate_pending++;
 				queue_trunc = 1;
 			}
@@ -1010,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			fill_inline = true;
 	}
 
+	if (cap_fmode >= 0) {
+		if (!info_caps)
+			pr_warn("mds issued no caps on %llx.%llx\n",
+				ceph_vinop(inode));
+		__ceph_touch_fmode(ci, mdsc, cap_fmode);
+	}
+
 	spin_unlock(&ci->i_ceph_lock);
 
 	if (fill_inline)
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index c90f03beb15d..6e061bf62ad4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
 	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 
 	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
 		spin_lock(&ci->i_ceph_lock);
 		fi->fmode |= CEPH_FILE_MODE_LAZY;
 		ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
+		__ceph_touch_fmode(ci, mdsc, fi->fmode);
 		spin_unlock(&ci->i_ceph_lock);
 		dout("ioctl_layzio: file %p marked lazy\n", file);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 48e84d7f48a0..8ce210cc62c9 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -361,6 +361,8 @@ struct ceph_inode_info {
 						    dirty|flushing caps */
 	unsigned i_snap_caps;           /* cap bits for snapped files */
 
+	unsigned long i_last_rd;
+	unsigned long i_last_wr;
 	int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
 
 	struct mutex i_truncate_mutex;
@@ -673,6 +675,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
 extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
 extern int __ceph_caps_used(struct ceph_inode_info *ci);
 
+static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
+{
+	return ci->i_nr_by_mode[0];
+}
 extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
 extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
 
@@ -1074,7 +1080,10 @@ extern int ceph_try_get_caps(struct inode *inode,
 
 /* for counting open files by mode */
 extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
+			       struct ceph_mds_client *mdsc, int fmode);
 
 /* addr.c */
 extern const struct address_space_operations ceph_aops;
@@ -1086,7 +1095,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 
-extern int ceph_renew_caps(struct inode *inode);
+extern int ceph_renew_caps(struct inode *inode, int fmode);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 			    struct file *file, unsigned flags, umode_t mode);
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index cb21c5cf12c3..8017130a08a1 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -564,6 +564,7 @@ struct ceph_filelock {
 #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
 #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
 #define CEPH_FILE_MODE_BITS       4
+#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
 
 int ceph_flags_to_mode(int flags);
 
-- 
2.21.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 3/4] ceph: simplify calling of ceph_get_fmode()
  2020-02-21 13:16 [PATCH v2 0/4] ceph: don't request caps for idle open files Yan, Zheng
  2020-02-21 13:16 ` [PATCH v2 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
  2020-02-21 13:16 ` [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps Yan, Zheng
@ 2020-02-21 13:16 ` Yan, Zheng
  2020-02-21 13:55   ` Jeff Layton
  2020-02-21 13:16 ` [PATCH v2 4/4] ceph: remove delay check logic from ceph_check_caps() Yan, Zheng
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: Yan, Zheng @ 2020-02-21 13:16 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, Yan, Zheng

Originally, calling ceph_get_fmode() for open files is by thread that
handles request reply. The reason is that there is a small window
between updating caps and request initiator gets woken up. we need to
prevent ceph_check_caps() from releasing wanted caps in the window.

Previous patch make fill_inode() call __ceph_touch_fmode() for open file
request. This prevents ceph_check_caps() from releasing wanted caps for
'caps_wanted_delay_min' seconds, enough for request initiator to get
woken up and call ceph_get_fmode(). So we can call ceph_get_fmode() in
ceph_open() now.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/caps.c  | 26 +++-----------------------
 fs/ceph/file.c  | 21 +++++----------------
 fs/ceph/inode.c |  8 +-------
 fs/ceph/super.h |  3 +--
 4 files changed, 10 insertions(+), 48 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a9df235286d..2959e4c36a15 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -606,7 +606,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
  */
 void ceph_add_cap(struct inode *inode,
 		  struct ceph_mds_session *session, u64 cap_id,
-		  int fmode, unsigned issued, unsigned wanted,
+		  unsigned issued, unsigned wanted,
 		  unsigned seq, unsigned mseq, u64 realmino, int flags,
 		  struct ceph_cap **new_cap)
 {
@@ -622,13 +622,6 @@ void ceph_add_cap(struct inode *inode,
 	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
 	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
 
-	/*
-	 * If we are opening the file, include file mode wanted bits
-	 * in wanted.
-	 */
-	if (fmode >= 0)
-		wanted |= ceph_caps_for_mode(fmode);
-
 	spin_lock(&session->s_gen_ttl_lock);
 	gen = session->s_cap_gen;
 	spin_unlock(&session->s_gen_ttl_lock);
@@ -753,9 +746,6 @@ void ceph_add_cap(struct inode *inode,
 	cap->issue_seq = seq;
 	cap->mseq = mseq;
 	cap->cap_gen = gen;
-
-	if (fmode >= 0)
-		__ceph_get_fmode(ci, fmode);
 }
 
 /*
@@ -3728,7 +3718,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 		/* add placeholder for the export tagert */
 		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
 		tcap = new_cap;
-		ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+		ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
 			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
 
 		if (!list_empty(&ci->i_cap_flush_list) &&
@@ -3833,7 +3823,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 	__ceph_caps_issued(ci, &issued);
 	issued |= __ceph_caps_dirty(ci);
 
-	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+	ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
 		     realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
 
 	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
@@ -4185,16 +4175,6 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
 	spin_unlock(&ci->i_ceph_lock);
 }
 
-void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
-{
-	int i;
-	int bits = (fmode << 1) | 1;
-	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
-		if (bits & (1 << i))
-			ci->i_nr_by_mode[i]++;
-	}
-}
-
 /*
  * Drop open file reference.  If we were the last open file,
  * we may need to release capabilities to the MDS (or schedule
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f6ca9be9fbbd..84058d3c5685 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
 	if (isdir) {
 		struct ceph_dir_file_info *dfi =
 			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
-		if (!dfi) {
-			ceph_put_fmode(ci, fmode, 1); /* clean up */
+		if (!dfi)
 			return -ENOMEM;
-		}
 
 		file->private_data = dfi;
 		fi = &dfi->file_info;
@@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
 		dfi->readdir_cache_idx = -1;
 	} else {
 		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
-		if (!fi) {
-			ceph_put_fmode(ci, fmode, 1); /* clean up */
+		if (!fi)
 			return -ENOMEM;
-		}
 
 		file->private_data = fi;
 	}
 
+	ceph_get_fmode(ci, fmode, 1);
 	fi->fmode = fmode;
+
 	spin_lock_init(&fi->rw_contexts_lock);
 	INIT_LIST_HEAD(&fi->rw_contexts);
 	fi->meta_err = errseq_sample(&ci->i_meta_err);
@@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 	case S_IFLNK:
 		dout("init_file %p %p 0%o (symlink)\n", inode, file,
 		     inode->i_mode);
-		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
 		break;
 
 	default:
@@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 		 * we need to drop the open ref now, since we don't
 		 * have .release set to ceph_release.
 		 */
-		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
 		BUG_ON(inode->i_fop->release == ceph_release);
 
 		/* call the proper open fop */
@@ -327,7 +323,6 @@ int ceph_renew_caps(struct inode *inode, int fmode)
 	req->r_inode = inode;
 	ihold(inode);
 	req->r_num_caps = 1;
-	req->r_fmode = -1;
 
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
 	ceph_mdsc_put_request(req);
@@ -373,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
 
 	/* trivially open snapdir */
 	if (ceph_snap(inode) == CEPH_SNAPDIR) {
-		spin_lock(&ci->i_ceph_lock);
-		__ceph_get_fmode(ci, fmode);
-		spin_unlock(&ci->i_ceph_lock);
 		return ceph_init_file(inode, file, fmode);
 	}
 
@@ -393,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
 		dout("open %p fmode %d want %s issued %s using existing\n",
 		     inode, fmode, ceph_cap_string(wanted),
 		     ceph_cap_string(issued));
-		__ceph_get_fmode(ci, fmode);
+		__ceph_touch_fmode(ci, mdsc, fmode);
 		spin_unlock(&ci->i_ceph_lock);
 
 		/* adjust wanted? */
@@ -405,7 +397,6 @@ int ceph_open(struct inode *inode, struct file *file)
 		return ceph_init_file(inode, file, fmode);
 	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
 		   (ci->i_snap_caps & wanted) == wanted) {
-		__ceph_get_fmode(ci, fmode);
 		__ceph_touch_fmode(ci, mdsc, fmode);
 		spin_unlock(&ci->i_ceph_lock);
 		return ceph_init_file(inode, file, fmode);
@@ -526,8 +517,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		err = finish_open(file, dentry, ceph_open);
 	}
 out_req:
-	if (!req->r_err && req->r_target_inode)
-		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
 	ceph_mdsc_put_request(req);
 out_ctx:
 	ceph_release_acl_sec_ctx(&as_ctx);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 95e7440cf6f7..0b0f503c84c3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -968,7 +968,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 		if (ceph_snap(inode) == CEPH_NOSNAP) {
 			ceph_add_cap(inode, session,
 				     le64_to_cpu(info->cap.cap_id),
-				     cap_fmode, info_caps,
+				     info_caps,
 				     le32_to_cpu(info->cap.wanted),
 				     le32_to_cpu(info->cap.seq),
 				     le32_to_cpu(info->cap.mseq),
@@ -993,13 +993,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			dout(" %p got snap_caps %s\n", inode,
 			     ceph_cap_string(info_caps));
 			ci->i_snap_caps |= info_caps;
-			if (cap_fmode >= 0)
-				__ceph_get_fmode(ci, cap_fmode);
 		}
-	} else if (cap_fmode >= 0) {
-		pr_warn("mds issued no caps on %llx.%llx\n",
-			   ceph_vinop(inode));
-		__ceph_get_fmode(ci, cap_fmode);
 	}
 
 	if (iinfo->inline_version > 0 &&
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 8ce210cc62c9..d89478db8b24 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1037,7 +1037,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
 				     struct ceph_cap_reservation *ctx);
 extern void ceph_add_cap(struct inode *inode,
 			 struct ceph_mds_session *session, u64 cap_id,
-			 int fmode, unsigned issued, unsigned wanted,
+			 unsigned issued, unsigned wanted,
 			 unsigned cap, unsigned seq, u64 realmino, int flags,
 			 struct ceph_cap **new_cap);
 extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
@@ -1079,7 +1079,6 @@ extern int ceph_try_get_caps(struct inode *inode,
 			     int need, int want, bool nonblock, int *got);
 
 /* for counting open files by mode */
-extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
 extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
 extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
 extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
-- 
2.21.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 4/4] ceph: remove delay check logic from ceph_check_caps()
  2020-02-21 13:16 [PATCH v2 0/4] ceph: don't request caps for idle open files Yan, Zheng
                   ` (2 preceding siblings ...)
  2020-02-21 13:16 ` [PATCH v2 3/4] ceph: simplify calling of ceph_get_fmode() Yan, Zheng
@ 2020-02-21 13:16 ` Yan, Zheng
  2020-02-27 11:43   ` Jeff Layton
  2020-02-21 14:08 ` [PATCH v2 0/4] ceph: don't request caps for idle open files Jeff Layton
  2020-02-21 14:57 ` Jeff Layton
  5 siblings, 1 reply; 20+ messages in thread
From: Yan, Zheng @ 2020-02-21 13:16 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, Yan, Zheng

__ceph_caps_file_wanted() already checks 'caps_wanted_delay_min' and
'caps_wanted_delay_max'. There is no need to duplicte the logic in
ceph_check_caps() and __send_cap()

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/caps.c  | 146 ++++++++++++------------------------------------
 fs/ceph/file.c  |   7 +--
 fs/ceph/inode.c |   1 -
 fs/ceph/super.h |   8 +--
 4 files changed, 42 insertions(+), 120 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2959e4c36a15..ad365cf870f6 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
 	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
-
-	ci->i_hold_caps_min = round_jiffies(jiffies +
-					    opt->caps_wanted_delay_min * HZ);
 	ci->i_hold_caps_max = round_jiffies(jiffies +
 					    opt->caps_wanted_delay_max * HZ);
-	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
-	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+	dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+	     ci->i_hold_caps_max - jiffies);
 }
 
 /*
@@ -508,8 +505,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
  *    -> we take mdsc->cap_delay_lock
  */
 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
-				struct ceph_inode_info *ci,
-				bool set_timeout)
+				struct ceph_inode_info *ci)
 {
 	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
 	     ci->i_ceph_flags, ci->i_hold_caps_max);
@@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
 				goto no_change;
 			list_del_init(&ci->i_cap_delay_list);
 		}
-		if (set_timeout)
-			__cap_set_timeouts(mdsc, ci);
+		__cap_set_timeouts(mdsc, ci);
 		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
 no_change:
 		spin_unlock(&mdsc->cap_delay_lock);
@@ -719,7 +714,7 @@ void ceph_add_cap(struct inode *inode,
 		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
 		     ceph_cap_string(issued), ceph_cap_string(wanted),
 		     ceph_cap_string(actual_wanted));
-		__cap_delay_requeue(mdsc, ci, true);
+		__cap_delay_requeue(mdsc, ci);
 	}
 
 	if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -1304,7 +1299,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	struct cap_msg_args arg;
 	int held, revoking;
 	int wake = 0;
-	int delayed = 0;
 	int ret;
 
 	held = cap->issued | cap->implemented;
@@ -1317,28 +1311,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	     ceph_cap_string(revoking));
 	BUG_ON((retain & CEPH_CAP_PIN) == 0);
 
-	arg.session = cap->session;
-
-	/* don't release wanted unless we've waited a bit. */
-	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
-	    time_before(jiffies, ci->i_hold_caps_min)) {
-		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
-		     ceph_cap_string(cap->issued),
-		     ceph_cap_string(cap->issued & retain),
-		     ceph_cap_string(cap->mds_wanted),
-		     ceph_cap_string(want));
-		want |= cap->mds_wanted;
-		retain |= cap->issued;
-		delayed = 1;
-	}
-	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
-	if (want & ~cap->mds_wanted) {
-		/* user space may open/close single file frequently.
-		 * This avoids droping mds_wanted immediately after
-		 * requesting new mds_wanted.
-		 */
-		__cap_set_timeouts(mdsc, ci);
-	}
+	ci->i_ceph_flags &= ~CEPH_I_FLUSH;
 
 	cap->issued &= retain;  /* drop bits we don't want */
 	if (cap->implemented & ~cap->issued) {
@@ -1353,6 +1326,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	cap->implemented &= cap->issued | used;
 	cap->mds_wanted = want;
 
+	arg.session = cap->session;
 	arg.ino = ceph_vino(inode).ino;
 	arg.cid = cap->cap_id;
 	arg.follows = flushing ? ci->i_head_snapc->seq : 0;
@@ -1413,14 +1387,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 
 	ret = send_cap_msg(&arg);
 	if (ret < 0) {
-		dout("error sending cap msg, must requeue %p\n", inode);
-		delayed = 1;
+		pr_err("error sending cap msg, ino (%llx.%llx) "
+		       "flushing %s tid %llu, requeue\n",
+		       ceph_vinop(inode), ceph_cap_string(flushing),
+		       flush_tid);
+		spin_lock(&ci->i_ceph_lock);
+		__cap_delay_requeue(mdsc, ci);
+		spin_unlock(&ci->i_ceph_lock);
 	}
 
 	if (wake)
 		wake_up_all(&ci->i_cap_wq);
 
-	return delayed;
+	return ret;
 }
 
 static inline int __send_flush_snap(struct inode *inode,
@@ -1684,7 +1663,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
 	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
 	    (mask & CEPH_CAP_FILE_BUFFER))
 		dirty |= I_DIRTY_DATASYNC;
-	__cap_delay_requeue(mdsc, ci, true);
+	__cap_delay_requeue(mdsc, ci);
 	return dirty;
 }
 
@@ -1835,8 +1814,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
  * versus held caps.  Release, flush, ack revoked caps to mds as
  * appropriate.
  *
- *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- *    cap release further.
  *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
  *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
  *    further delay.
@@ -1855,17 +1832,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
 			   to avoid an infinite loop on retry */
 	struct rb_node *p;
-	int delayed = 0, sent = 0;
-	bool no_delay = flags & CHECK_CAPS_NODELAY;
 	bool queue_invalidate = false;
 	bool tried_invalidate = false;
 
-	/* if we are unmounting, flush any unused caps immediately. */
-	if (mdsc->stopping)
-		no_delay = true;
-
 	spin_lock(&ci->i_ceph_lock);
-
 	if (ci->i_ceph_flags & CEPH_I_FLUSH)
 		flags |= CHECK_CAPS_FLUSH;
 
@@ -1911,14 +1881,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	}
 
 	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
-	     " issued %s revoking %s retain %s %s%s%s\n", inode,
+	     " issued %s revoking %s retain %s %s%s\n", inode,
 	     ceph_cap_string(file_wanted),
 	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
 	     ceph_cap_string(ci->i_flushing_caps),
 	     ceph_cap_string(issued), ceph_cap_string(revoking),
 	     ceph_cap_string(retain),
 	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
-	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
 	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
 
 	/*
@@ -1926,7 +1895,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	 * have cached pages, but don't want them, then try to invalidate.
 	 * If we fail, it's because pages are locked.... try again later.
 	 */
-	if ((!no_delay || mdsc->stopping) &&
+	if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
 	    S_ISREG(inode->i_mode) &&
 	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
 	    inode->i_data.nrpages &&		/* have cached pages */
@@ -2006,21 +1975,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		if ((cap->issued & ~retain) == 0)
 			continue;     /* nope, all good */
 
-		if (no_delay)
-			goto ack;
-
-		/* delay? */
-		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
-		    time_before(jiffies, ci->i_hold_caps_max)) {
-			dout(" delaying issued %s -> %s, wanted %s -> %s\n",
-			     ceph_cap_string(cap->issued),
-			     ceph_cap_string(cap->issued & retain),
-			     ceph_cap_string(cap->mds_wanted),
-			     ceph_cap_string(want));
-			delayed++;
-			continue;
-		}
-
 ack:
 		if (session && session != cap->session) {
 			dout("oops, wrong session %p mutex\n", session);
@@ -2081,24 +2035,18 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		}
 
 		mds = cap->mds;  /* remember mds, so we don't repeat */
-		sent++;
 
 		/* __send_cap drops i_ceph_lock */
-		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
-				cap_used, want, retain, flushing,
-				flush_tid, oldest_flush_tid);
+		__send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
+			   retain, flushing, flush_tid, oldest_flush_tid);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
 
-	if (list_empty(&ci->i_cap_delay_list)) {
-	    if (delayed) {
-		    /* Reschedule delayed caps release if we delayed anything */
-		    __cap_delay_requeue(mdsc, ci, false);
-	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
-			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
-		    /* periodically re-calculate caps wanted by open files */
-		    __cap_delay_requeue(mdsc, ci, true);
-	    }
+	/* periodically re-calculate caps wanted by open files */
+	if (list_empty(&ci->i_cap_delay_list) &&
+	    (file_wanted & ~CEPH_CAP_PIN) &&
+	    !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+		__cap_delay_requeue(mdsc, ci);
 	}
 
 	spin_unlock(&ci->i_ceph_lock);
@@ -2128,7 +2076,6 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
 retry_locked:
 	if (ci->i_dirty_caps && ci->i_auth_cap) {
 		struct ceph_cap *cap = ci->i_auth_cap;
-		int delayed;
 
 		if (session != cap->session) {
 			spin_unlock(&ci->i_ceph_lock);
@@ -2157,18 +2104,10 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
 						 &oldest_flush_tid);
 
 		/* __send_cap drops i_ceph_lock */
-		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-				     CEPH_CLIENT_CAPS_SYNC,
-				     __ceph_caps_used(ci),
-				     __ceph_caps_wanted(ci),
-				     (cap->issued | cap->implemented),
-				     flushing, flush_tid, oldest_flush_tid);
-
-		if (delayed) {
-			spin_lock(&ci->i_ceph_lock);
-			__cap_delay_requeue(mdsc, ci, true);
-			spin_unlock(&ci->i_ceph_lock);
-		}
+		__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
+			   __ceph_caps_used(ci), __ceph_caps_wanted(ci),
+			   (cap->issued | cap->implemented),
+			   flushing, flush_tid, oldest_flush_tid);
 	} else {
 		if (!list_empty(&ci->i_cap_flush_list)) {
 			struct ceph_cap_flush *cf =
@@ -2368,22 +2307,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
 		if (cf->caps) {
 			dout("kick_flushing_caps %p cap %p tid %llu %s\n",
 			     inode, cap, cf->tid, ceph_cap_string(cf->caps));
-			ci->i_ceph_flags |= CEPH_I_NODELAY;
-
-			ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+			__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
 					 (cf->tid < last_snap_flush ?
 					  CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
 					  __ceph_caps_used(ci),
 					  __ceph_caps_wanted(ci),
 					  (cap->issued | cap->implemented),
 					  cf->caps, cf->tid, oldest_flush_tid);
-			if (ret) {
-				pr_err("kick_flushing_caps: error sending "
-					"cap flush, ino (%llx.%llx) "
-					"tid %llu flushing %s\n",
-					ceph_vinop(inode), cf->tid,
-					ceph_cap_string(cf->caps));
-			}
 		} else {
 			struct ceph_cap_snap *capsnap =
 					container_of(cf, struct ceph_cap_snap,
@@ -3001,7 +2931,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
 	     last ? " last" : "", put ? " put" : "");
 
-	if (last && !flushsnaps)
+	if (last)
 		ceph_check_caps(ci, 0, NULL);
 	else if (flushsnaps)
 		ceph_flush_snaps(ci, NULL);
@@ -3419,10 +3349,10 @@ static void handle_cap_grant(struct inode *inode,
 		wake_up_all(&ci->i_cap_wq);
 
 	if (check_caps == 1)
-		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
 				session);
 	else if (check_caps == 2)
-		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+		ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
 	else
 		mutex_unlock(&session->s_mutex);
 }
@@ -4097,7 +4027,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
-	int flags = CHECK_CAPS_NODELAY;
 
 	dout("check_delayed_caps\n");
 	while (1) {
@@ -4117,7 +4046,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 
 		if (inode) {
 			dout("check_delayed_caps on %p\n", inode);
-			ceph_check_caps(ci, flags, NULL);
+			ceph_check_caps(ci, 0, NULL);
 			/* avoid calling iput_final() in tick thread */
 			ceph_async_iput(inode);
 		}
@@ -4142,7 +4071,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 		ihold(inode);
 		dout("flush_dirty_caps %p\n", inode);
 		spin_unlock(&mdsc->cap_dirty_lock);
-		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+		ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
 		iput(inode);
 		spin_lock(&mdsc->cap_dirty_lock);
 	}
@@ -4160,7 +4089,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
 		ci->i_last_wr = now;
 	/* queue periodic check */
 	if (fmode && list_empty(&ci->i_cap_delay_list))
-		__cap_delay_requeue(mdsc, ci, true);
+		__cap_delay_requeue(mdsc, ci);
 }
 
 void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
@@ -4209,7 +4138,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
 	if (inode->i_nlink == 1) {
 		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
 
-		ci->i_ceph_flags |= CEPH_I_NODELAY;
 		if (__ceph_caps_dirty(ci)) {
 			struct ceph_mds_client *mdsc =
 				ceph_inode_to_client(inode)->mdsc;
@@ -4265,8 +4193,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 		if (force || (cap->issued & drop)) {
 			if (cap->issued & drop) {
 				int wanted = __ceph_caps_wanted(ci);
-				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
-					wanted |= cap->mds_wanted;
 				dout("encode_inode_release %p cap %p "
 				     "%s -> %s, wanted %s -> %s\n", inode, cap,
 				     ceph_cap_string(cap->issued),
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 84058d3c5685..1d76bdf1a1b9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1552,7 +1552,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
 		if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
-			ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
+			ceph_check_caps(ci, 0, NULL);
 	}
 
 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
@@ -2129,12 +2129,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
 
 	if (endoff > size) {
 		int caps_flags = 0;
-
 		/* Let the MDS know about dst file size change */
-		if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
-			caps_flags |= CHECK_CAPS_NODELAY;
 		if (ceph_inode_set_size(dst_inode, endoff))
 			caps_flags |= CHECK_CAPS_AUTHONLY;
+		if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
+			caps_flags |= CHECK_CAPS_AUTHONLY;
 		if (caps_flags)
 			ceph_check_caps(dst_ci, caps_flags, NULL);
 	}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0b0f503c84c3..5a8fa8a2d3cf 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -471,7 +471,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_prealloc_cap_flush = NULL;
 	INIT_LIST_HEAD(&ci->i_cap_flush_list);
 	init_waitqueue_head(&ci->i_cap_wq);
-	ci->i_hold_caps_min = 0;
 	ci->i_hold_caps_max = 0;
 	INIT_LIST_HEAD(&ci->i_cap_delay_list);
 	INIT_LIST_HEAD(&ci->i_cap_snaps);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index d89478db8b24..e586cff3dfd5 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -170,9 +170,9 @@ struct ceph_cap {
 	struct list_head caps_item;
 };
 
-#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
-#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
+#define CHECK_CAPS_AUTHONLY   1  /* only check auth cap */
+#define CHECK_CAPS_FLUSH      2  /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL    4  /* don't invalidate pagecache */
 
 struct ceph_cap_flush {
 	u64 tid;
@@ -352,7 +352,6 @@ struct ceph_inode_info {
 	struct ceph_cap_flush *i_prealloc_cap_flush;
 	struct list_head i_cap_flush_list;
 	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
-	unsigned long i_hold_caps_min; /* jiffies */
 	unsigned long i_hold_caps_max; /* jiffies */
 	struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
 	struct ceph_cap_reservation i_cap_migration_resv;
@@ -513,7 +512,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
  * Ceph inode.
  */
 #define CEPH_I_DIR_ORDERED	(1 << 0)  /* dentries in dir are ordered */
-#define CEPH_I_NODELAY		(1 << 1)  /* do not delay cap release */
 #define CEPH_I_FLUSH		(1 << 2)  /* do not delay flush of dirty metadata */
 #define CEPH_I_POOL_PERM	(1 << 3)  /* pool rd/wr bits are valid */
 #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
-- 
2.21.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps
  2020-02-21 13:16 ` [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps Yan, Zheng
@ 2020-02-21 13:52   ` Jeff Layton
  2020-02-21 14:27   ` Jeff Layton
  1 sibling, 0 replies; 20+ messages in thread
From: Jeff Layton @ 2020-02-21 13:52 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> Add i_last_rd and i_last_wr to ceph_inode_info. These two fields are
> used to track inode's last read/write, they are updated when getting
> caps for read/write.
> 
> If there is no read/write on an inode for 'caps_wanted_delay_max'
> seconds, __ceph_caps_file_wanted() does not request caps for read/write
> even there are open files.
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/caps.c               | 152 ++++++++++++++++++++++++-----------
>  fs/ceph/file.c               |  21 ++---
>  fs/ceph/inode.c              |  10 ++-
>  fs/ceph/ioctl.c              |   2 +
>  fs/ceph/super.h              |  13 ++-
>  include/linux/ceph/ceph_fs.h |   1 +
>  6 files changed, 139 insertions(+), 60 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 293920d013ff..2a9df235286d 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -971,18 +971,49 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
>  	return used;
>  }
>  
> +#define FMODE_WAIT_BIAS 1000
> +
>  /*
>   * wanted, by virtue of open file modes
>   */
>  int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
>  {
> -	int i, bits = 0;
> -	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> -		if (ci->i_nr_by_mode[i])
> -			bits |= 1 << i;
> +	struct ceph_mount_options *opt =
> +		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> +	unsigned long used_cutoff =
> +		round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
> +	unsigned long idle_cutoff =
> +		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
> +	int bits = 0;
> +
> +	if (ci->i_nr_by_mode[0] > 0)


Let's not use magic numbers as array indexes here:

    if (ci->i_nr_by_mode[CEPH_FILE_MODE_PIN] > 0)


> +		bits |= CEPH_FILE_MODE_PIN;
> +
> +	if (ci->i_nr_by_mode[1] > 0) {
> +		if (ci->i_nr_by_mode[1] >= FMODE_WAIT_BIAS ||
> +		    time_after(ci->i_last_rd, used_cutoff))
> +			bits |= CEPH_FILE_MODE_RD;
> +	} else if (time_after(ci->i_last_rd, idle_cutoff)) {
> +		bits |= CEPH_FILE_MODE_RD;
> +	}
> +
> +	if (ci->i_nr_by_mode[2] > 0) {
> +		if (ci->i_nr_by_mode[2] >= FMODE_WAIT_BIAS ||
> +		    time_after(ci->i_last_wr, used_cutoff))
> +			bits |= CEPH_FILE_MODE_WR;
> +	} else if (time_after(ci->i_last_wr, idle_cutoff)) {
> +		bits |= CEPH_FILE_MODE_WR;
>  	}
> +
> +	/* check lazyio only when read/write is wanted */
> +	if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[3] > 0)
> +		bits |= CEPH_FILE_MODE_LAZY;
> +
>  	if (bits == 0)
>  		return 0;
> +	if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
> +		return 0;
> +
>  	return ceph_caps_for_mode(bits >> 1);
> 

For the record, this function is really hard to follow just because
sometimes CEPH_FILE_MODE constants are interpreted as discrete values
(like an enum), and other times as set of flags.

It all works out in the end, but I spent several minutes yesterday
convincing myself that the shift right above this was correct. It might
be nice to make this whole function less "clever", if you see a way to
do it.

>  }
>  
> @@ -1021,14 +1052,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
>  	return mds_wanted;
>  }
>  
> -/*
> - * called under i_ceph_lock
> - */
> -static int __ceph_is_single_caps(struct ceph_inode_info *ci)
> -{
> -	return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
> -}
> -
>  int ceph_is_any_caps(struct inode *inode)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> @@ -1856,10 +1879,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	if (ci->i_ceph_flags & CEPH_I_FLUSH)
>  		flags |= CHECK_CAPS_FLUSH;
>  
> -	if (!(flags & CHECK_CAPS_AUTHONLY) ||
> -	    (ci->i_auth_cap && __ceph_is_single_caps(ci)))
> -		__cap_delay_cancel(mdsc, ci);
> -
>  	goto retry_locked;
>  retry:
>  	spin_lock(&ci->i_ceph_lock);
> @@ -2081,9 +2100,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		goto retry; /* retake i_ceph_lock and restart our cap scan. */
>  	}
>  
> -	/* Reschedule delayed caps release if we delayed anything */
> -	if (delayed)
> -		__cap_delay_requeue(mdsc, ci, false);
> +	if (list_empty(&ci->i_cap_delay_list)) {
> +	    if (delayed) {
> +		    /* Reschedule delayed caps release if we delayed anything */
> +		    __cap_delay_requeue(mdsc, ci, false);
> +	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> +			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> +		    /* periodically re-calculate caps wanted by open files */
> +		    __cap_delay_requeue(mdsc, ci, true);
> +	    }
> +	}
>  
>  	spin_unlock(&ci->i_ceph_lock);
>  
> @@ -2549,8 +2575,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
>   * FIXME: how does a 0 return differ from -EAGAIN?
>   */
>  enum {
> -	NON_BLOCKING	= 1,
> -	CHECK_FILELOCK	= 2,
> +	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
> +	NON_BLOCKING	= (1 << 8),
> +	CHECK_FILELOCK	= (1 << 9),
>  };
>  
>  static int try_get_cap_refs(struct inode *inode, int need, int want,
> @@ -2560,7 +2587,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	int ret = 0;
>  	int have, implemented;
> -	int file_wanted;
>  	bool snap_rwsem_locked = false;
>  
>  	dout("get_cap_refs %p need %s want %s\n", inode,
> @@ -2576,15 +2602,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		goto out_unlock;
>  	}
>  
> -	/* make sure file is actually open */
> -	file_wanted = __ceph_caps_file_wanted(ci);
> -	if ((file_wanted & need) != need) {
> -		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
> -		     ceph_cap_string(need), ceph_cap_string(file_wanted));
> -		ret = -EBADF;
> -		goto out_unlock;
> -	}
> -
>  	/* finish pending truncate */
>  	while (ci->i_truncate_pending) {
>  		spin_unlock(&ci->i_ceph_lock);
> @@ -2692,6 +2709,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		     ceph_cap_string(have), ceph_cap_string(need));
>  	}
>  out_unlock:
> +

nit: no need for blank like above

> +	__ceph_touch_fmode(ci, mdsc, flags);
> +
>  	spin_unlock(&ci->i_ceph_lock);
>  	if (snap_rwsem_locked)
>  		up_read(&mdsc->snap_rwsem);
> @@ -2729,10 +2749,20 @@ static void check_max_size(struct inode *inode, loff_t endoff)
>  		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
>  }
>  
> +static inline int get_used_fmode(int caps)
> +{
> +	int fmode = 0;
> +	if (caps & CEPH_CAP_FILE_RD)
> +		fmode |= CEPH_FILE_MODE_RD;
> +	if (caps & CEPH_CAP_FILE_WR)
> +		fmode |= CEPH_FILE_MODE_WR;
> +	return fmode;
> +}
> +
>  int ceph_try_get_caps(struct inode *inode, int need, int want,
>  		      bool nonblock, int *got)
>  {
> -	int ret;
> +	int ret, flags;
>  
>  	BUG_ON(need & ~CEPH_CAP_FILE_RD);
>  	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
> @@ -2740,8 +2770,10 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
>  	if (ret < 0)
>  		return ret;
>  
> -	ret = try_get_cap_refs(inode, need, want, 0,
> -			       (nonblock ? NON_BLOCKING : 0), got);
> +	flags = get_used_fmode(need | want);
> +	if (nonblock)
> +		flags |= NON_BLOCKING;
> +	ret = try_get_cap_refs(inode, need, want, 0, flags, got);
>  	return ret == -EAGAIN ? 0 : ret;
>  }
>  
> @@ -2767,11 +2799,15 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  	    fi->filp_gen != READ_ONCE(fsc->filp_gen))
>  		return -EBADF;
>  
> +	flags = get_used_fmode(need | want);
> +
>  	while (true) {
>  		if (endoff > 0)
>  			check_max_size(inode, endoff);
>  
> -		flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
> +		flags &= CEPH_FILE_MODE_MASK;
> +		if (atomic_read(&fi->num_locks))
> +			flags |= CHECK_FILELOCK;
>  		_got = 0;
>  		ret = try_get_cap_refs(inode, need, want, endoff,
>  				       flags, &_got);
> @@ -2791,6 +2827,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			list_add(&cw.list, &mdsc->cap_wait_list);
>  			spin_unlock(&mdsc->caps_list_lock);
>  
> +			/* make sure used fmode not timeout */
> +			ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
>  			add_wait_queue(&ci->i_cap_wq, &wait);
>  
>  			flags |= NON_BLOCKING;
> @@ -2804,6 +2842,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			}
>  
>  			remove_wait_queue(&ci->i_cap_wq, &wait);
> +			ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
>  
>  			spin_lock(&mdsc->caps_list_lock);
>  			list_del(&cw.list);
> @@ -2823,7 +2862,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  		if (ret < 0) {
>  			if (ret == -ESTALE) {
>  				/* session was killed, try renew caps */
> -				ret = ceph_renew_caps(inode);
> +				ret = ceph_renew_caps(inode, flags);
>  				if (ret == 0)
>  					continue;
>  			}
> @@ -4121,6 +4160,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
>  	dout("flush_dirty_caps done\n");
>  }
>  
> +void __ceph_touch_fmode(struct ceph_inode_info *ci,
> +			struct ceph_mds_client *mdsc, int fmode)
> +{
> +	unsigned long now = jiffies;
> +	if (fmode & CEPH_FILE_MODE_RD)
> +		ci->i_last_rd = now;
> +	if (fmode & CEPH_FILE_MODE_WR)
> +		ci->i_last_wr = now;
> +	/* queue periodic check */
> +	if (fmode && list_empty(&ci->i_cap_delay_list))
> +		__cap_delay_requeue(mdsc, ci, true);
> +}
> +
> +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> +{
> +	int i;
> +	int bits = (fmode << 1) | 1;
> +	spin_lock(&ci->i_ceph_lock);
> +	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> +		if (bits & (1 << i))
> +			ci->i_nr_by_mode[i] += count;
> +	}
> +	spin_unlock(&ci->i_ceph_lock);
> +}
> +
>  void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
>  {
>  	int i;
> @@ -4136,26 +4200,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
>   * we may need to release capabilities to the MDS (or schedule
>   * their delayed release).
>   */
> -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
> +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
>  {
> -	int i, last = 0;
> +	int i;
>  	int bits = (fmode << 1) | 1;
>  	spin_lock(&ci->i_ceph_lock);
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
>  		if (bits & (1 << i)) {
> -			BUG_ON(ci->i_nr_by_mode[i] == 0);
> -			if (--ci->i_nr_by_mode[i] == 0)
> -				last++;
> +			BUG_ON(ci->i_nr_by_mode[i] < count);
> +			ci->i_nr_by_mode[i] -= count;
>  		}
>  	}
> -	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
> -	     &ci->vfs_inode, fmode,
> -	     ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
> -	     ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
>  	spin_unlock(&ci->i_ceph_lock);
> -
> -	if (last && ci->i_vino.snap == CEPH_NOSNAP)
> -		ceph_check_caps(ci, 0, NULL);
>  }
>  
>  /*
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 7e0190b1f821..f6ca9be9fbbd 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  		struct ceph_dir_file_info *dfi =
>  			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
>  		if (!dfi) {
> -			ceph_put_fmode(ci, fmode); /* clean up */
> +			ceph_put_fmode(ci, fmode, 1); /* clean up */
>  			return -ENOMEM;
>  		}
>  
> @@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  	} else {
>  		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
>  		if (!fi) {
> -			ceph_put_fmode(ci, fmode); /* clean up */
> +			ceph_put_fmode(ci, fmode, 1); /* clean up */
>  			return -ENOMEM;
>  		}
>  
> @@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  	case S_IFLNK:
>  		dout("init_file %p %p 0%o (symlink)\n", inode, file,
>  		     inode->i_mode);
> -		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> +		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		break;
>  
>  	default:
> @@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  		 * we need to drop the open ref now, since we don't
>  		 * have .release set to ceph_release.
>  		 */
> -		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> +		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		BUG_ON(inode->i_fop->release == ceph_release);
>  
>  		/* call the proper open fop */
> @@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  /*
>   * try renew caps after session gets killed.
>   */
> -int ceph_renew_caps(struct inode *inode)
> +int ceph_renew_caps(struct inode *inode, int fmode)
>  {
> -	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	struct ceph_inode_info *ci = ceph_inode(inode);
>  	struct ceph_mds_request *req;
>  	int err, flags, wanted;
>  
>  	spin_lock(&ci->i_ceph_lock);
> +	__ceph_touch_fmode(ci, mdsc, fmode);
>  	wanted = __ceph_caps_file_wanted(ci);
>  	if (__ceph_is_any_real_caps(ci) &&
>  	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
> @@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file)
>  	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
>  		   (ci->i_snap_caps & wanted) == wanted) {
>  		__ceph_get_fmode(ci, fmode);
> +		__ceph_touch_fmode(ci, mdsc, fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  		return ceph_init_file(inode, file, fmode);
>  	}
> @@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  	}
>  out_req:
>  	if (!req->r_err && req->r_target_inode)
> -		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
> +		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
>  	ceph_mdsc_put_request(req);
>  out_ctx:
>  	ceph_release_acl_sec_ctx(&as_ctx);
> @@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file)
>  		dout("release inode %p dir file %p\n", inode, file);
>  		WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
>  
> -		ceph_put_fmode(ci, dfi->file_info.fmode);
> +		ceph_put_fmode(ci, dfi->file_info.fmode, 1);
>  
>  		if (dfi->last_readdir)
>  			ceph_mdsc_put_request(dfi->last_readdir);
> @@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file)
>  		dout("release inode %p regular file %p\n", inode, file);
>  		WARN_ON(!list_empty(&fi->rw_contexts));
>  
> -		ceph_put_fmode(ci, fi->fmode);
> +		ceph_put_fmode(ci, fi->fmode, 1);
> +
>  		kmem_cache_free(ceph_file_cachep, fi);
>  	}
>  
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 094b8fc37787..95e7440cf6f7 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -478,6 +478,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	ci->i_head_snapc = NULL;
>  	ci->i_snap_caps = 0;
>  
> +	ci->i_last_rd = ci->i_last_wr = jiffies;
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
>  		ci->i_nr_by_mode[i] = 0;
>  
> @@ -637,7 +638,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
>  			if ((issued & (CEPH_CAP_FILE_CACHE|
>  				       CEPH_CAP_FILE_BUFFER)) ||
>  			    mapping_mapped(inode->i_mapping) ||
> -			    __ceph_caps_file_wanted(ci)) {
> +			    __ceph_is_file_opened(ci)) {
>  				ci->i_truncate_pending++;
>  				queue_trunc = 1;
>  			}
> @@ -1010,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>  			fill_inline = true;
>  	}
>  
> +	if (cap_fmode >= 0) {
> +		if (!info_caps)
> +			pr_warn("mds issued no caps on %llx.%llx\n",
> +				ceph_vinop(inode));
> +		__ceph_touch_fmode(ci, mdsc, cap_fmode);
> +	}
> +
>  	spin_unlock(&ci->i_ceph_lock);
>  
>  	if (fill_inline)
> diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> index c90f03beb15d..6e061bf62ad4 100644
> --- a/fs/ceph/ioctl.c
> +++ b/fs/ceph/ioctl.c
> @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
>  	struct ceph_file_info *fi = file->private_data;
>  	struct inode *inode = file_inode(file);
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  
>  	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
>  		spin_lock(&ci->i_ceph_lock);
>  		fi->fmode |= CEPH_FILE_MODE_LAZY;
>  		ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
> +		__ceph_touch_fmode(ci, mdsc, fi->fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  		dout("ioctl_layzio: file %p marked lazy\n", file);
>  
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 48e84d7f48a0..8ce210cc62c9 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -361,6 +361,8 @@ struct ceph_inode_info {
>  						    dirty|flushing caps */
>  	unsigned i_snap_caps;           /* cap bits for snapped files */
>  
> +	unsigned long i_last_rd;
> +	unsigned long i_last_wr;
>  	int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
>  
>  	struct mutex i_truncate_mutex;
> @@ -673,6 +675,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
>  extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
>  extern int __ceph_caps_used(struct ceph_inode_info *ci);
>  
> +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
> +{
> +	return ci->i_nr_by_mode[0];
> +}
>  extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
>  extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
>  
> @@ -1074,7 +1080,10 @@ extern int ceph_try_get_caps(struct inode *inode,
>  
>  /* for counting open files by mode */
>  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
> +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
> +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
> +extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
> +			       struct ceph_mds_client *mdsc, int fmode);
>  
>  /* addr.c */
>  extern const struct address_space_operations ceph_aops;
> @@ -1086,7 +1095,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
>  /* file.c */
>  extern const struct file_operations ceph_file_fops;
>  
> -extern int ceph_renew_caps(struct inode *inode);
> +extern int ceph_renew_caps(struct inode *inode, int fmode);
>  extern int ceph_open(struct inode *inode, struct file *file);
>  extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  			    struct file *file, unsigned flags, umode_t mode);
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index cb21c5cf12c3..8017130a08a1 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -564,6 +564,7 @@ struct ceph_filelock {
>  #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
>  #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
>  #define CEPH_FILE_MODE_BITS       4
> +#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
>  
>  int ceph_flags_to_mode(int flags);
>  

Looks reasonable overall, aside from a few nits.
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 3/4] ceph: simplify calling of ceph_get_fmode()
  2020-02-21 13:16 ` [PATCH v2 3/4] ceph: simplify calling of ceph_get_fmode() Yan, Zheng
@ 2020-02-21 13:55   ` Jeff Layton
  0 siblings, 0 replies; 20+ messages in thread
From: Jeff Layton @ 2020-02-21 13:55 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> Originally, calling ceph_get_fmode() for open files is by thread that
> handles request reply. The reason is that there is a small window
> between updating caps and request initiator gets woken up. we need to
> prevent ceph_check_caps() from releasing wanted caps in the window.
> 
> Previous patch make fill_inode() call __ceph_touch_fmode() for open file
> request. This prevents ceph_check_caps() from releasing wanted caps for
> 'caps_wanted_delay_min' seconds, enough for request initiator to get
> woken up and call ceph_get_fmode(). So we can call ceph_get_fmode() in
> ceph_open() now.
> 

Thanks for the explanation.

So, to be clear, if the reply is delayed past those several seconds,
then we might still lose the caps before it comes in?

I think that's probably ok if so. If you're seeing delays like that then
a little extra ping-ponging of caps is probably the least of your
worries. 

Nice cleanup too!

> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/caps.c  | 26 +++-----------------------
>  fs/ceph/file.c  | 21 +++++----------------
>  fs/ceph/inode.c |  8 +-------
>  fs/ceph/super.h |  3 +--
>  4 files changed, 10 insertions(+), 48 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 2a9df235286d..2959e4c36a15 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -606,7 +606,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
>   */
>  void ceph_add_cap(struct inode *inode,
>  		  struct ceph_mds_session *session, u64 cap_id,
> -		  int fmode, unsigned issued, unsigned wanted,
> +		  unsigned issued, unsigned wanted,
>  		  unsigned seq, unsigned mseq, u64 realmino, int flags,
>  		  struct ceph_cap **new_cap)
>  {
> @@ -622,13 +622,6 @@ void ceph_add_cap(struct inode *inode,
>  	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
>  	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
>  
> -	/*
> -	 * If we are opening the file, include file mode wanted bits
> -	 * in wanted.
> -	 */
> -	if (fmode >= 0)
> -		wanted |= ceph_caps_for_mode(fmode);
> -
>  	spin_lock(&session->s_gen_ttl_lock);
>  	gen = session->s_cap_gen;
>  	spin_unlock(&session->s_gen_ttl_lock);
> @@ -753,9 +746,6 @@ void ceph_add_cap(struct inode *inode,
>  	cap->issue_seq = seq;
>  	cap->mseq = mseq;
>  	cap->cap_gen = gen;
> -
> -	if (fmode >= 0)
> -		__ceph_get_fmode(ci, fmode);
>  }
>  
>  /*
> @@ -3728,7 +3718,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
>  		/* add placeholder for the export tagert */
>  		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
>  		tcap = new_cap;
> -		ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
> +		ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
>  			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
>  
>  		if (!list_empty(&ci->i_cap_flush_list) &&
> @@ -3833,7 +3823,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
>  	__ceph_caps_issued(ci, &issued);
>  	issued |= __ceph_caps_dirty(ci);
>  
> -	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
> +	ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
>  		     realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
>  
>  	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
> @@ -4185,16 +4175,6 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
>  	spin_unlock(&ci->i_ceph_lock);
>  }
>  
> -void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
> -{
> -	int i;
> -	int bits = (fmode << 1) | 1;
> -	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> -		if (bits & (1 << i))
> -			ci->i_nr_by_mode[i]++;
> -	}
> -}
> -
>  /*
>   * Drop open file reference.  If we were the last open file,
>   * we may need to release capabilities to the MDS (or schedule
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index f6ca9be9fbbd..84058d3c5685 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  	if (isdir) {
>  		struct ceph_dir_file_info *dfi =
>  			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
> -		if (!dfi) {
> -			ceph_put_fmode(ci, fmode, 1); /* clean up */
> +		if (!dfi)
>  			return -ENOMEM;
> -		}
>  
>  		file->private_data = dfi;
>  		fi = &dfi->file_info;
> @@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  		dfi->readdir_cache_idx = -1;
>  	} else {
>  		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> -		if (!fi) {
> -			ceph_put_fmode(ci, fmode, 1); /* clean up */
> +		if (!fi)
>  			return -ENOMEM;
> -		}
>  
>  		file->private_data = fi;
>  	}
>  
> +	ceph_get_fmode(ci, fmode, 1);
>  	fi->fmode = fmode;
> +
>  	spin_lock_init(&fi->rw_contexts_lock);
>  	INIT_LIST_HEAD(&fi->rw_contexts);
>  	fi->meta_err = errseq_sample(&ci->i_meta_err);
> @@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  	case S_IFLNK:
>  		dout("init_file %p %p 0%o (symlink)\n", inode, file,
>  		     inode->i_mode);
> -		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		break;
>  
>  	default:
> @@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  		 * we need to drop the open ref now, since we don't
>  		 * have .release set to ceph_release.
>  		 */
> -		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		BUG_ON(inode->i_fop->release == ceph_release);
>  
>  		/* call the proper open fop */
> @@ -327,7 +323,6 @@ int ceph_renew_caps(struct inode *inode, int fmode)
>  	req->r_inode = inode;
>  	ihold(inode);
>  	req->r_num_caps = 1;
> -	req->r_fmode = -1;
>  
>  	err = ceph_mdsc_do_request(mdsc, NULL, req);
>  	ceph_mdsc_put_request(req);
> @@ -373,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
>  
>  	/* trivially open snapdir */
>  	if (ceph_snap(inode) == CEPH_SNAPDIR) {
> -		spin_lock(&ci->i_ceph_lock);
> -		__ceph_get_fmode(ci, fmode);
> -		spin_unlock(&ci->i_ceph_lock);
>  		return ceph_init_file(inode, file, fmode);
>  	}
>  
> @@ -393,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
>  		dout("open %p fmode %d want %s issued %s using existing\n",
>  		     inode, fmode, ceph_cap_string(wanted),
>  		     ceph_cap_string(issued));
> -		__ceph_get_fmode(ci, fmode);
> +		__ceph_touch_fmode(ci, mdsc, fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  
>  		/* adjust wanted? */
> @@ -405,7 +397,6 @@ int ceph_open(struct inode *inode, struct file *file)
>  		return ceph_init_file(inode, file, fmode);
>  	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
>  		   (ci->i_snap_caps & wanted) == wanted) {
> -		__ceph_get_fmode(ci, fmode);
>  		__ceph_touch_fmode(ci, mdsc, fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  		return ceph_init_file(inode, file, fmode);
> @@ -526,8 +517,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  		err = finish_open(file, dentry, ceph_open);
>  	}
>  out_req:
> -	if (!req->r_err && req->r_target_inode)
> -		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
>  	ceph_mdsc_put_request(req);
>  out_ctx:
>  	ceph_release_acl_sec_ctx(&as_ctx);
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 95e7440cf6f7..0b0f503c84c3 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -968,7 +968,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>  		if (ceph_snap(inode) == CEPH_NOSNAP) {
>  			ceph_add_cap(inode, session,
>  				     le64_to_cpu(info->cap.cap_id),
> -				     cap_fmode, info_caps,
> +				     info_caps,
>  				     le32_to_cpu(info->cap.wanted),
>  				     le32_to_cpu(info->cap.seq),
>  				     le32_to_cpu(info->cap.mseq),
> @@ -993,13 +993,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>  			dout(" %p got snap_caps %s\n", inode,
>  			     ceph_cap_string(info_caps));
>  			ci->i_snap_caps |= info_caps;
> -			if (cap_fmode >= 0)
> -				__ceph_get_fmode(ci, cap_fmode);
>  		}
> -	} else if (cap_fmode >= 0) {
> -		pr_warn("mds issued no caps on %llx.%llx\n",
> -			   ceph_vinop(inode));
> -		__ceph_get_fmode(ci, cap_fmode);
>  	}
>  
>  	if (iinfo->inline_version > 0 &&
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 8ce210cc62c9..d89478db8b24 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1037,7 +1037,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
>  				     struct ceph_cap_reservation *ctx);
>  extern void ceph_add_cap(struct inode *inode,
>  			 struct ceph_mds_session *session, u64 cap_id,
> -			 int fmode, unsigned issued, unsigned wanted,
> +			 unsigned issued, unsigned wanted,
>  			 unsigned cap, unsigned seq, u64 realmino, int flags,
>  			 struct ceph_cap **new_cap);
>  extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
> @@ -1079,7 +1079,6 @@ extern int ceph_try_get_caps(struct inode *inode,
>  			     int need, int want, bool nonblock, int *got);
>  
>  /* for counting open files by mode */
> -extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
>  extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
>  extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
>  extern void __ceph_touch_fmode(struct ceph_inode_info *ci,

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 0/4] ceph: don't request caps for idle open files
  2020-02-21 13:16 [PATCH v2 0/4] ceph: don't request caps for idle open files Yan, Zheng
                   ` (3 preceding siblings ...)
  2020-02-21 13:16 ` [PATCH v2 4/4] ceph: remove delay check logic from ceph_check_caps() Yan, Zheng
@ 2020-02-21 14:08 ` Jeff Layton
  2020-02-21 14:57 ` Jeff Layton
  5 siblings, 0 replies; 20+ messages in thread
From: Jeff Layton @ 2020-02-21 14:08 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel; +Cc: Xiubo Li, Ilya Dryomov

On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> This series make cephfs client not request caps for open files that
> idle for a long time. For the case that one active client and multiple
> standby clients open the same file, this increase the possibility that
> mds issues exclusive caps to the active client.
> 
> Yan, Zheng (4):
>   ceph: always renew caps if mds_wanted is insufficient
>   ceph: consider inode's last read/write when calculating wanted caps
>   ceph: simplify calling of ceph_get_fmode()
>   ceph: remove delay check logic from ceph_check_caps()
> 
>  fs/ceph/caps.c               | 324 +++++++++++++++--------------------
>  fs/ceph/file.c               |  39 ++---
>  fs/ceph/inode.c              |  19 +-
>  fs/ceph/ioctl.c              |   2 +
>  fs/ceph/mds_client.c         |   5 -
>  fs/ceph/super.h              |  35 ++--
>  include/linux/ceph/ceph_fs.h |   1 +
>  7 files changed, 188 insertions(+), 237 deletions(-)
> 

This looks really good -- nice work, Zheng!

I had a minor nit with patch #2, but I'm not too concerned about it and
it can be fixed up after merge if necessary.

The async dirops set has some conflicts, and Xiubo's stats work may
also. I'm going to go ahead and merge this into testing so we can base
further work on top of it.

Thanks!
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps
  2020-02-21 13:16 ` [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps Yan, Zheng
  2020-02-21 13:52   ` Jeff Layton
@ 2020-02-21 14:27   ` Jeff Layton
  2020-02-21 14:35     ` Jeff Layton
  1 sibling, 1 reply; 20+ messages in thread
From: Jeff Layton @ 2020-02-21 14:27 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> Add i_last_rd and i_last_wr to ceph_inode_info. These two fields are
> used to track inode's last read/write, they are updated when getting
> caps for read/write.
> 
> If there is no read/write on an inode for 'caps_wanted_delay_max'
> seconds, __ceph_caps_file_wanted() does not request caps for read/write
> even there are open files.
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/caps.c               | 152 ++++++++++++++++++++++++-----------
>  fs/ceph/file.c               |  21 ++---
>  fs/ceph/inode.c              |  10 ++-
>  fs/ceph/ioctl.c              |   2 +
>  fs/ceph/super.h              |  13 ++-
>  include/linux/ceph/ceph_fs.h |   1 +
>  6 files changed, 139 insertions(+), 60 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 293920d013ff..2a9df235286d 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -971,18 +971,49 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
>  	return used;
>  }
>  
> +#define FMODE_WAIT_BIAS 1000
> +
>  /*
>   * wanted, by virtue of open file modes
>   */
>  int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
>  {
> -	int i, bits = 0;
> -	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> -		if (ci->i_nr_by_mode[i])
> -			bits |= 1 << i;
> +	struct ceph_mount_options *opt =
> +		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> +	unsigned long used_cutoff =
> +		round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
> +	unsigned long idle_cutoff =
> +		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
> +	int bits = 0;
> +
> +	if (ci->i_nr_by_mode[0] > 0)
> +		bits |= CEPH_FILE_MODE_PIN;
> +
> +	if (ci->i_nr_by_mode[1] > 0) {
> +		if (ci->i_nr_by_mode[1] >= FMODE_WAIT_BIAS ||
> +		    time_after(ci->i_last_rd, used_cutoff))
> +			bits |= CEPH_FILE_MODE_RD;
> +	} else if (time_after(ci->i_last_rd, idle_cutoff)) {
> +		bits |= CEPH_FILE_MODE_RD;
> +	}
> +
> +	if (ci->i_nr_by_mode[2] > 0) {
> +		if (ci->i_nr_by_mode[2] >= FMODE_WAIT_BIAS ||
> +		    time_after(ci->i_last_wr, used_cutoff))
> +			bits |= CEPH_FILE_MODE_WR;
> +	} else if (time_after(ci->i_last_wr, idle_cutoff)) {
> +		bits |= CEPH_FILE_MODE_WR;
>  	}
> +
> +	/* check lazyio only when read/write is wanted */
> +	if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[3] > 0)

LAZY is 4. Shouldn't this be?

    if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[CEPH_FILE_MODE_LAZY] > 0)

> +		bits |= CEPH_FILE_MODE_LAZY;
> +
>  	if (bits == 0)
>  		return 0;
> +	if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
> +		return 0;
> +
>  	return ceph_caps_for_mode(bits >> 1);
>  }
>  
> @@ -1021,14 +1052,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
>  	return mds_wanted;
>  }
>  
> -/*
> - * called under i_ceph_lock
> - */
> -static int __ceph_is_single_caps(struct ceph_inode_info *ci)
> -{
> -	return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
> -}
> -
>  int ceph_is_any_caps(struct inode *inode)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> @@ -1856,10 +1879,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	if (ci->i_ceph_flags & CEPH_I_FLUSH)
>  		flags |= CHECK_CAPS_FLUSH;
>  
> -	if (!(flags & CHECK_CAPS_AUTHONLY) ||
> -	    (ci->i_auth_cap && __ceph_is_single_caps(ci)))
> -		__cap_delay_cancel(mdsc, ci);
> -
>  	goto retry_locked;
>  retry:
>  	spin_lock(&ci->i_ceph_lock);
> @@ -2081,9 +2100,16 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		goto retry; /* retake i_ceph_lock and restart our cap scan. */
>  	}
>  
> -	/* Reschedule delayed caps release if we delayed anything */
> -	if (delayed)
> -		__cap_delay_requeue(mdsc, ci, false);
> +	if (list_empty(&ci->i_cap_delay_list)) {
> +	    if (delayed) {
> +		    /* Reschedule delayed caps release if we delayed anything */
> +		    __cap_delay_requeue(mdsc, ci, false);
> +	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> +			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> +		    /* periodically re-calculate caps wanted by open files */
> +		    __cap_delay_requeue(mdsc, ci, true);
> +	    }
> +	}
>  
>  	spin_unlock(&ci->i_ceph_lock);
>  
> @@ -2549,8 +2575,9 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
>   * FIXME: how does a 0 return differ from -EAGAIN?
>   */
>  enum {
> -	NON_BLOCKING	= 1,
> -	CHECK_FILELOCK	= 2,
> +	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
> +	NON_BLOCKING	= (1 << 8),
> +	CHECK_FILELOCK	= (1 << 9),
>  };
>  
>  static int try_get_cap_refs(struct inode *inode, int need, int want,
> @@ -2560,7 +2587,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	int ret = 0;
>  	int have, implemented;
> -	int file_wanted;
>  	bool snap_rwsem_locked = false;
>  
>  	dout("get_cap_refs %p need %s want %s\n", inode,
> @@ -2576,15 +2602,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		goto out_unlock;
>  	}
>  
> -	/* make sure file is actually open */
> -	file_wanted = __ceph_caps_file_wanted(ci);
> -	if ((file_wanted & need) != need) {
> -		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
> -		     ceph_cap_string(need), ceph_cap_string(file_wanted));
> -		ret = -EBADF;
> -		goto out_unlock;
> -	}
> -
>  	/* finish pending truncate */
>  	while (ci->i_truncate_pending) {
>  		spin_unlock(&ci->i_ceph_lock);
> @@ -2692,6 +2709,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		     ceph_cap_string(have), ceph_cap_string(need));
>  	}
>  out_unlock:
> +
> +	__ceph_touch_fmode(ci, mdsc, flags);
> +
>  	spin_unlock(&ci->i_ceph_lock);
>  	if (snap_rwsem_locked)
>  		up_read(&mdsc->snap_rwsem);
> @@ -2729,10 +2749,20 @@ static void check_max_size(struct inode *inode, loff_t endoff)
>  		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
>  }
>  
> +static inline int get_used_fmode(int caps)
> +{
> +	int fmode = 0;
> +	if (caps & CEPH_CAP_FILE_RD)
> +		fmode |= CEPH_FILE_MODE_RD;
> +	if (caps & CEPH_CAP_FILE_WR)
> +		fmode |= CEPH_FILE_MODE_WR;
> +	return fmode;
> +}
> +
>  int ceph_try_get_caps(struct inode *inode, int need, int want,
>  		      bool nonblock, int *got)
>  {
> -	int ret;
> +	int ret, flags;
>  
>  	BUG_ON(need & ~CEPH_CAP_FILE_RD);
>  	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
> @@ -2740,8 +2770,10 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
>  	if (ret < 0)
>  		return ret;
>  
> -	ret = try_get_cap_refs(inode, need, want, 0,
> -			       (nonblock ? NON_BLOCKING : 0), got);
> +	flags = get_used_fmode(need | want);
> +	if (nonblock)
> +		flags |= NON_BLOCKING;
> +	ret = try_get_cap_refs(inode, need, want, 0, flags, got);
>  	return ret == -EAGAIN ? 0 : ret;
>  }
>  
> @@ -2767,11 +2799,15 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  	    fi->filp_gen != READ_ONCE(fsc->filp_gen))
>  		return -EBADF;
>  
> +	flags = get_used_fmode(need | want);
> +
>  	while (true) {
>  		if (endoff > 0)
>  			check_max_size(inode, endoff);
>  
> -		flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
> +		flags &= CEPH_FILE_MODE_MASK;
> +		if (atomic_read(&fi->num_locks))
> +			flags |= CHECK_FILELOCK;
>  		_got = 0;
>  		ret = try_get_cap_refs(inode, need, want, endoff,
>  				       flags, &_got);
> @@ -2791,6 +2827,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			list_add(&cw.list, &mdsc->cap_wait_list);
>  			spin_unlock(&mdsc->caps_list_lock);
>  
> +			/* make sure used fmode not timeout */
> +			ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
>  			add_wait_queue(&ci->i_cap_wq, &wait);
>  
>  			flags |= NON_BLOCKING;
> @@ -2804,6 +2842,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			}
>  
>  			remove_wait_queue(&ci->i_cap_wq, &wait);
> +			ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
>  
>  			spin_lock(&mdsc->caps_list_lock);
>  			list_del(&cw.list);
> @@ -2823,7 +2862,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  		if (ret < 0) {
>  			if (ret == -ESTALE) {
>  				/* session was killed, try renew caps */
> -				ret = ceph_renew_caps(inode);
> +				ret = ceph_renew_caps(inode, flags);
>  				if (ret == 0)
>  					continue;
>  			}
> @@ -4121,6 +4160,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
>  	dout("flush_dirty_caps done\n");
>  }
>  
> +void __ceph_touch_fmode(struct ceph_inode_info *ci,
> +			struct ceph_mds_client *mdsc, int fmode)
> +{
> +	unsigned long now = jiffies;
> +	if (fmode & CEPH_FILE_MODE_RD)
> +		ci->i_last_rd = now;
> +	if (fmode & CEPH_FILE_MODE_WR)
> +		ci->i_last_wr = now;
> +	/* queue periodic check */
> +	if (fmode && list_empty(&ci->i_cap_delay_list))
> +		__cap_delay_requeue(mdsc, ci, true);
> +}
> +
> +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> +{
> +	int i;
> +	int bits = (fmode << 1) | 1;
> +	spin_lock(&ci->i_ceph_lock);
> +	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> +		if (bits & (1 << i))
> +			ci->i_nr_by_mode[i] += count;
> +	}
> +	spin_unlock(&ci->i_ceph_lock);
> +}
> +
>  void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
>  {
>  	int i;
> @@ -4136,26 +4200,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
>   * we may need to release capabilities to the MDS (or schedule
>   * their delayed release).
>   */
> -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
> +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
>  {
> -	int i, last = 0;
> +	int i;
>  	int bits = (fmode << 1) | 1;
>  	spin_lock(&ci->i_ceph_lock);
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
>  		if (bits & (1 << i)) {
> -			BUG_ON(ci->i_nr_by_mode[i] == 0);
> -			if (--ci->i_nr_by_mode[i] == 0)
> -				last++;
> +			BUG_ON(ci->i_nr_by_mode[i] < count);
> +			ci->i_nr_by_mode[i] -= count;
>  		}
>  	}
> -	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
> -	     &ci->vfs_inode, fmode,
> -	     ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
> -	     ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
>  	spin_unlock(&ci->i_ceph_lock);
> -
> -	if (last && ci->i_vino.snap == CEPH_NOSNAP)
> -		ceph_check_caps(ci, 0, NULL);
>  }
>  
>  /*
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 7e0190b1f821..f6ca9be9fbbd 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -213,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  		struct ceph_dir_file_info *dfi =
>  			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
>  		if (!dfi) {
> -			ceph_put_fmode(ci, fmode); /* clean up */
> +			ceph_put_fmode(ci, fmode, 1); /* clean up */
>  			return -ENOMEM;
>  		}
>  
> @@ -224,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>  	} else {
>  		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
>  		if (!fi) {
> -			ceph_put_fmode(ci, fmode); /* clean up */
> +			ceph_put_fmode(ci, fmode, 1); /* clean up */
>  			return -ENOMEM;
>  		}
>  
> @@ -263,7 +263,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  	case S_IFLNK:
>  		dout("init_file %p %p 0%o (symlink)\n", inode, file,
>  		     inode->i_mode);
> -		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> +		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		break;
>  
>  	default:
> @@ -273,7 +273,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  		 * we need to drop the open ref now, since we don't
>  		 * have .release set to ceph_release.
>  		 */
> -		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> +		ceph_put_fmode(ceph_inode(inode), fmode, 1); /* clean up */
>  		BUG_ON(inode->i_fop->release == ceph_release);
>  
>  		/* call the proper open fop */
> @@ -285,14 +285,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  /*
>   * try renew caps after session gets killed.
>   */
> -int ceph_renew_caps(struct inode *inode)
> +int ceph_renew_caps(struct inode *inode, int fmode)
>  {
> -	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	struct ceph_inode_info *ci = ceph_inode(inode);
>  	struct ceph_mds_request *req;
>  	int err, flags, wanted;
>  
>  	spin_lock(&ci->i_ceph_lock);
> +	__ceph_touch_fmode(ci, mdsc, fmode);
>  	wanted = __ceph_caps_file_wanted(ci);
>  	if (__ceph_is_any_real_caps(ci) &&
>  	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
> @@ -405,6 +406,7 @@ int ceph_open(struct inode *inode, struct file *file)
>  	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
>  		   (ci->i_snap_caps & wanted) == wanted) {
>  		__ceph_get_fmode(ci, fmode);
> +		__ceph_touch_fmode(ci, mdsc, fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  		return ceph_init_file(inode, file, fmode);
>  	}
> @@ -525,7 +527,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  	}
>  out_req:
>  	if (!req->r_err && req->r_target_inode)
> -		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
> +		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode, 1);
>  	ceph_mdsc_put_request(req);
>  out_ctx:
>  	ceph_release_acl_sec_ctx(&as_ctx);
> @@ -542,7 +544,7 @@ int ceph_release(struct inode *inode, struct file *file)
>  		dout("release inode %p dir file %p\n", inode, file);
>  		WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
>  
> -		ceph_put_fmode(ci, dfi->file_info.fmode);
> +		ceph_put_fmode(ci, dfi->file_info.fmode, 1);
>  
>  		if (dfi->last_readdir)
>  			ceph_mdsc_put_request(dfi->last_readdir);
> @@ -554,7 +556,8 @@ int ceph_release(struct inode *inode, struct file *file)
>  		dout("release inode %p regular file %p\n", inode, file);
>  		WARN_ON(!list_empty(&fi->rw_contexts));
>  
> -		ceph_put_fmode(ci, fi->fmode);
> +		ceph_put_fmode(ci, fi->fmode, 1);
> +
>  		kmem_cache_free(ceph_file_cachep, fi);
>  	}
>  
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 094b8fc37787..95e7440cf6f7 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -478,6 +478,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	ci->i_head_snapc = NULL;
>  	ci->i_snap_caps = 0;
>  
> +	ci->i_last_rd = ci->i_last_wr = jiffies;
>  	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
>  		ci->i_nr_by_mode[i] = 0;
>  
> @@ -637,7 +638,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
>  			if ((issued & (CEPH_CAP_FILE_CACHE|
>  				       CEPH_CAP_FILE_BUFFER)) ||
>  			    mapping_mapped(inode->i_mapping) ||
> -			    __ceph_caps_file_wanted(ci)) {
> +			    __ceph_is_file_opened(ci)) {
>  				ci->i_truncate_pending++;
>  				queue_trunc = 1;
>  			}
> @@ -1010,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>  			fill_inline = true;
>  	}
>  
> +	if (cap_fmode >= 0) {
> +		if (!info_caps)
> +			pr_warn("mds issued no caps on %llx.%llx\n",
> +				ceph_vinop(inode));
> +		__ceph_touch_fmode(ci, mdsc, cap_fmode);
> +	}
> +
>  	spin_unlock(&ci->i_ceph_lock);
>  
>  	if (fill_inline)
> diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
> index c90f03beb15d..6e061bf62ad4 100644
> --- a/fs/ceph/ioctl.c
> +++ b/fs/ceph/ioctl.c
> @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
>  	struct ceph_file_info *fi = file->private_data;
>  	struct inode *inode = file_inode(file);
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  
>  	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
>  		spin_lock(&ci->i_ceph_lock);
>  		fi->fmode |= CEPH_FILE_MODE_LAZY;
>  		ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
> +		__ceph_touch_fmode(ci, mdsc, fi->fmode);
>  		spin_unlock(&ci->i_ceph_lock);
>  		dout("ioctl_layzio: file %p marked lazy\n", file);
>  
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 48e84d7f48a0..8ce210cc62c9 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -361,6 +361,8 @@ struct ceph_inode_info {
>  						    dirty|flushing caps */
>  	unsigned i_snap_caps;           /* cap bits for snapped files */
>  
> +	unsigned long i_last_rd;
> +	unsigned long i_last_wr;
>  	int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
>  
>  	struct mutex i_truncate_mutex;
> @@ -673,6 +675,10 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
>  extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
>  extern int __ceph_caps_used(struct ceph_inode_info *ci);
>  
> +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
> +{
> +	return ci->i_nr_by_mode[0];
> +}
>  extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
>  extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
>  
> @@ -1074,7 +1080,10 @@ extern int ceph_try_get_caps(struct inode *inode,
>  
>  /* for counting open files by mode */
>  extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
> -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
> +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
> +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
> +extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
> +			       struct ceph_mds_client *mdsc, int fmode);
>  
>  /* addr.c */
>  extern const struct address_space_operations ceph_aops;
> @@ -1086,7 +1095,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
>  /* file.c */
>  extern const struct file_operations ceph_file_fops;
>  
> -extern int ceph_renew_caps(struct inode *inode);
> +extern int ceph_renew_caps(struct inode *inode, int fmode);
>  extern int ceph_open(struct inode *inode, struct file *file);
>  extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  			    struct file *file, unsigned flags, umode_t mode);
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index cb21c5cf12c3..8017130a08a1 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -564,6 +564,7 @@ struct ceph_filelock {
>  #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
>  #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
>  #define CEPH_FILE_MODE_BITS       4
> +#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
>  
>  int ceph_flags_to_mode(int flags);
>  

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps
  2020-02-21 14:27   ` Jeff Layton
@ 2020-02-21 14:35     ` Jeff Layton
  2020-02-24  7:09       ` Yan, Zheng
  0 siblings, 1 reply; 20+ messages in thread
From: Jeff Layton @ 2020-02-21 14:35 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Fri, 2020-02-21 at 09:27 -0500, Jeff Layton wrote:
> On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> > Add i_last_rd and i_last_wr to ceph_inode_info. These two fields are
> > used to track inode's last read/write, they are updated when getting
> > caps for read/write.
> > 
> > If there is no read/write on an inode for 'caps_wanted_delay_max'
> > seconds, __ceph_caps_file_wanted() does not request caps for read/write
> > even there are open files.
> > 
> > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > ---
> >  fs/ceph/caps.c               | 152 ++++++++++++++++++++++++-----------
> >  fs/ceph/file.c               |  21 ++---
> >  fs/ceph/inode.c              |  10 ++-
> >  fs/ceph/ioctl.c              |   2 +
> >  fs/ceph/super.h              |  13 ++-
> >  include/linux/ceph/ceph_fs.h |   1 +
> >  6 files changed, 139 insertions(+), 60 deletions(-)
> > 
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index 293920d013ff..2a9df235286d 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -971,18 +971,49 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
> >  	return used;
> >  }
> >  
> > +#define FMODE_WAIT_BIAS 1000
> > +
> >  /*
> >   * wanted, by virtue of open file modes
> >   */
> >  int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
> >  {
> > -	int i, bits = 0;
> > -	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
> > -		if (ci->i_nr_by_mode[i])
> > -			bits |= 1 << i;
> > +	struct ceph_mount_options *opt =
> > +		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
> > +	unsigned long used_cutoff =
> > +		round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
> > +	unsigned long idle_cutoff =
> > +		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
> > +	int bits = 0;
> > +
> > +	if (ci->i_nr_by_mode[0] > 0)
> > +		bits |= CEPH_FILE_MODE_PIN;
> > +
> > +	if (ci->i_nr_by_mode[1] > 0) {
> > +		if (ci->i_nr_by_mode[1] >= FMODE_WAIT_BIAS ||
> > +		    time_after(ci->i_last_rd, used_cutoff))
> > +			bits |= CEPH_FILE_MODE_RD;
> > +	} else if (time_after(ci->i_last_rd, idle_cutoff)) {
> > +		bits |= CEPH_FILE_MODE_RD;
> > +	}
> > +
> > +	if (ci->i_nr_by_mode[2] > 0) {
> > +		if (ci->i_nr_by_mode[2] >= FMODE_WAIT_BIAS ||
> > +		    time_after(ci->i_last_wr, used_cutoff))
> > +			bits |= CEPH_FILE_MODE_WR;
> > +	} else if (time_after(ci->i_last_wr, idle_cutoff)) {
> > +		bits |= CEPH_FILE_MODE_WR;
> >  	}
> > +
> > +	/* check lazyio only when read/write is wanted */
> > +	if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[3] > 0)
> 
> LAZY is 4. Shouldn't this be?
> 
>     if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[CEPH_FILE_MODE_LAZY] > 0)
> 

Nope, that value was right, but I think we should phrase this in terms
of symbolic constants. Maybe we can squash this patch into your series?

-----------------------8<-----------------------

[PATCH] SQUASH: use symbolic constants in __ceph_caps_file_wanted()

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ad365cf870f6..1b450f2195fe 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -971,19 +971,19 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
 	int bits = 0;
 
-	if (ci->i_nr_by_mode[0] > 0)
+	if (ci->i_nr_by_mode[CEPH_FILE_MODE_PIN] > 0)
 		bits |= CEPH_FILE_MODE_PIN;
 
-	if (ci->i_nr_by_mode[1] > 0) {
-		if (ci->i_nr_by_mode[1] >= FMODE_WAIT_BIAS ||
+	if (ci->i_nr_by_mode[CEPH_FILE_MODE_RD] > 0) {
+		if (ci->i_nr_by_mode[CEPH_FILE_MODE_RD] >= FMODE_WAIT_BIAS ||
 		    time_after(ci->i_last_rd, used_cutoff))
 			bits |= CEPH_FILE_MODE_RD;
 	} else if (time_after(ci->i_last_rd, idle_cutoff)) {
 		bits |= CEPH_FILE_MODE_RD;
 	}
 
-	if (ci->i_nr_by_mode[2] > 0) {
-		if (ci->i_nr_by_mode[2] >= FMODE_WAIT_BIAS ||
+	if (ci->i_nr_by_mode[CEPH_FILE_MODE_WR] > 0) {
+		if (ci->i_nr_by_mode[CEPH_FILE_MODE_WR] >= FMODE_WAIT_BIAS ||
 		    time_after(ci->i_last_wr, used_cutoff))
 			bits |= CEPH_FILE_MODE_WR;
 	} else if (time_after(ci->i_last_wr, idle_cutoff)) {
@@ -991,12 +991,13 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 	}
 
 	/* check lazyio only when read/write is wanted */
-	if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[3] > 0)
+	if ((bits & CEPH_FILE_MODE_RDWR) &&
+	    ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)] > 0)
 		bits |= CEPH_FILE_MODE_LAZY;
 
 	if (bits == 0)
 		return 0;
-	if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
+	if (bits == (1 << CEPH_FILE_MODE_PIN) && !S_ISDIR(ci->vfs_inode.i_mode))
 		return 0;
 
 	return ceph_caps_for_mode(bits >> 1);
-- 
2.24.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 0/4] ceph: don't request caps for idle open files
  2020-02-21 13:16 [PATCH v2 0/4] ceph: don't request caps for idle open files Yan, Zheng
                   ` (4 preceding siblings ...)
  2020-02-21 14:08 ` [PATCH v2 0/4] ceph: don't request caps for idle open files Jeff Layton
@ 2020-02-21 14:57 ` Jeff Layton
  2020-02-27 12:07   ` Yan, Zheng
  5 siblings, 1 reply; 20+ messages in thread
From: Jeff Layton @ 2020-02-21 14:57 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> This series make cephfs client not request caps for open files that
> idle for a long time. For the case that one active client and multiple
> standby clients open the same file, this increase the possibility that
> mds issues exclusive caps to the active client.
> 
> Yan, Zheng (4):
>   ceph: always renew caps if mds_wanted is insufficient
>   ceph: consider inode's last read/write when calculating wanted caps
>   ceph: simplify calling of ceph_get_fmode()
>   ceph: remove delay check logic from ceph_check_caps()
> 
>  fs/ceph/caps.c               | 324 +++++++++++++++--------------------
>  fs/ceph/file.c               |  39 ++---
>  fs/ceph/inode.c              |  19 +-
>  fs/ceph/ioctl.c              |   2 +
>  fs/ceph/mds_client.c         |   5 -
>  fs/ceph/super.h              |  35 ++--
>  include/linux/ceph/ceph_fs.h |   1 +
>  7 files changed, 188 insertions(+), 237 deletions(-)
> 

Testing with this series and xfstests is throwing softlockups. Still
looking at the cause:

[  251.385573] watchdog: BUG: soft lockup - CPU#7 stuck for 23s! [fsstress:7113]
[  251.387105] Modules linked in: ceph(OE) libceph(E) ip6t_REJECT(E) nf_reject_ipv6(E) ip6t_rpfilter(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_conntrack(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) ip_set(E) nfnetlink(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) cachefiles(E) fscache(E) sunrpc(E) pcspkr(E) joydev(E) i2c_piix4(E) virtio_balloon(E) crct10dif_pclmul(E) crc32_pclmul(E) ghash_clmulni_intel(E) ip_tables(E) xfs(E) libcrc32c(E) rfkill(E) qxl(E) drm_ttm_helper(E) crc32c_intel(E) ttm(E) drm_kms_helper(E) cec(E) serio_raw(E) virtio_net(E) net_failover(E) virti
 o_console(E) virtio_blk(E) failover(E) drm(E) ata_generic(E) floppy(E) pata_acpi(E) qemu_fw_cfg(E)
[  251.402300] CPU: 7 PID: 7113 Comm: fsstress Kdump: loaded Tainted: G           OE     5.6.0-rc1+ #12
[  251.404351] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[  251.407716] RIP: 0010:ceph_get_caps+0x108/0x5e0 [ceph]
[  251.409154] Code: 80 cc 02 85 d2 44 89 f2 0f 45 d8 41 89 d8 e8 4f f8 ff ff 89 c1 83 f8 f5 74 ba 85 c0 0f 84 55 02 00 00 48 8b 74 24 50 f6 06 02 <74> 11 48 8b 44 24 48 8b 40 2c 39 46 1c 0f 85 97 04 00 00 85 c9 0f
[  251.413302] RSP: 0018:ffffb72c01567a10 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13
[  251.414827] RAX: 00000000ffffff8c RBX: 0000000000000002 RCX: 00000000ffffff8c
[  251.416303] RDX: ffff8a1fd45b4468 RSI: ffff8a2228fd2c08 RDI: ffff8a1fd45b42d0
[  251.417778] RBP: ffffb72c01567b00 R08: ffff8a2227fcb000 R09: ffffb72c01567a6c
[  251.419265] R10: 0000000000000001 R11: 0000000000000000 R12: 00000000160337c1
[  251.420811] R13: 0000000000001000 R14: 0000000000002000 R15: ffff8a1fd45b45f8
[  251.422355] FS:  00007fcdcc9ebf40(0000) GS:ffff8a222fbc0000(0000) knlGS:0000000000000000
[  251.423962] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  251.425294] CR2: 000000000162e000 CR3: 00000001f0b18000 CR4: 00000000003406e0
[  251.426785] Call Trace:
[  251.427739]  ? __vfs_getxattr+0x53/0x70
[  251.428835]  ? generic_update_time+0x9d/0xc0
[  251.429978]  ? file_update_time+0xdd/0x120
[  251.431106]  ceph_write_iter+0x27b/0xc30 [ceph]
[  251.432285]  ? ceph_read_iter+0x3b3/0xd60 [ceph]
[  251.433463]  ? _cond_resched+0x15/0x30
[  251.434533]  ? do_iter_readv_writev+0x158/0x1d0
[  251.435681]  do_iter_readv_writev+0x158/0x1d0
[  251.436825]  do_iter_write+0x7d/0x190
[  251.437867]  iter_file_splice_write+0x26a/0x3c0
[  251.439059]  direct_splice_actor+0x35/0x40
[  251.440144]  splice_direct_to_actor+0x102/0x250
[  251.441274]  ? generic_pipe_buf_nosteal+0x10/0x10
[  251.442444]  do_splice_direct+0x8b/0xd0
[  251.443491]  generic_copy_file_range+0x32/0x40
[  251.444596]  vfs_copy_file_range+0x2eb/0x310
[  251.445691]  ? __do_sys_newfstat+0x5a/0x70
[  251.446764]  __x64_sys_copy_file_range+0xd6/0x210
[  251.447933]  do_syscall_64+0x5b/0x1c0
[  251.448972]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  251.450188] RIP: 0033:0x7fcdccae91ed
[  251.451236] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 6b 5c 0c 00 f7 d8 64 89 01 48
[  251.455096] RSP: 002b:00007ffcbac62e28 EFLAGS: 00000246 ORIG_RAX: 0000000000000146
[  251.456664] RAX: ffffffffffffffda RBX: 0000000124e3181d RCX: 00007fcdccae91ed
[  251.458153] RDX: 0000000000000004 RSI: 00007ffcbac62e60 RDI: 0000000000000003
[  251.459626] RBP: 0000000000000003 R08: 000000000001c50b R09: 0000000000000000
[  251.461147] R10: 00007ffcbac62e68 R11: 0000000000000246 R12: 0000000000000088
[  251.462680] R13: 0000000000000004 R14: 000000000001c50b R15: 000000001602447d


-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 1/4] ceph: always renew caps if mds_wanted is insufficient
  2020-02-21 13:16 ` [PATCH v2 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
@ 2020-02-21 17:17   ` Jeff Layton
  2020-02-24  3:17     ` Yan, Zheng
  0 siblings, 1 reply; 20+ messages in thread
From: Jeff Layton @ 2020-02-21 17:17 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> original code only renews caps for inodes with CEPH_I_CAP_DROPPED flags.
> The flag indicates that mds closed session and caps were dropped. This
> patch is preparation for not requesting caps for idle open files.
> 
> CEPH_I_CAP_DROPPED is no longer tested by anyone, so this patch also
> remove it.
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/caps.c       | 36 +++++++++++++++---------------------
>  fs/ceph/mds_client.c |  5 -----
>  fs/ceph/super.h      | 11 +++++------
>  3 files changed, 20 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index d05717397c2a..293920d013ff 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -2659,6 +2659,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  		}
>  	} else {
>  		int session_readonly = false;
> +		int mds_wanted;
>  		if (ci->i_auth_cap &&
>  		    (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
>  			struct ceph_mds_session *s = ci->i_auth_cap->session;
> @@ -2667,32 +2668,27 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>  			spin_unlock(&s->s_cap_lock);
>  		}
>  		if (session_readonly) {
> -			dout("get_cap_refs %p needed %s but mds%d readonly\n",
> +			dout("get_cap_refs %p need %s but mds%d readonly\n",
>  			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
>  			ret = -EROFS;
>  			goto out_unlock;
>  		}
>  
> -		if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
> -			int mds_wanted;
> -			if (READ_ONCE(mdsc->fsc->mount_state) ==
> -			    CEPH_MOUNT_SHUTDOWN) {
> -				dout("get_cap_refs %p forced umount\n", inode);
> -				ret = -EIO;
> -				goto out_unlock;
> -			}
> -			mds_wanted = __ceph_caps_mds_wanted(ci, false);
> -			if (need & ~(mds_wanted & need)) {
> -				dout("get_cap_refs %p caps were dropped"
> -				     " (session killed?)\n", inode);
> -				ret = -ESTALE;
> -				goto out_unlock;
> -			}
> -			if (!(file_wanted & ~mds_wanted))
> -				ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
> +		if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
> +			dout("get_cap_refs %p forced umount\n", inode);
> +			ret = -EIO;
> +			goto out_unlock;
> +		}
> +		mds_wanted = __ceph_caps_mds_wanted(ci, false);
> +		if (need & ~mds_wanted) {
> +			dout("get_cap_refs %p need %s > mds_wanted %s\n",
> +			     inode, ceph_cap_string(need),
> +			     ceph_cap_string(mds_wanted));
> +			ret = -ESTALE;
> +			goto out_unlock;
>  		}
>  

I was able to reproduce softlockups with xfstests reliably for a little
while, but it doesn't always happen. I bisected it down to this patch
though. I suspect the problem is in the hunk above. It looks like it's
getting into a situation where this is continually returning ESTALE.

I cranked up debug logging in this function and I see this:

[  259.284839] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
[  259.284848] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
[  259.284855] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
[  259.284863] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
[  259.284868] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
[  259.284877] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
[  259.284885] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
[  259.284890] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
[  259.284899] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
[  259.284908] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
[  259.284918] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
[  259.284926] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
[  259.284945] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
[  259.284950] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
[  259.284961] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
[  259.284969] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
[  259.284975] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
[  259.284984] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
[  259.284994] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
[  259.285003] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc

...not sure I understand the logical flow in this function well enough
to suggest a fix yet though.

> -		dout("get_cap_refs %p have %s needed %s\n", inode,
> +		dout("get_cap_refs %p have %s need %s\n", inode,
>  		     ceph_cap_string(have), ceph_cap_string(need));
>  	}
>  out_unlock:
> @@ -3646,8 +3642,6 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
>  		goto out_unlock;
>  
>  	if (target < 0) {
> -		if (cap->mds_wanted | cap->issued)
> -			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
>  		__ceph_remove_cap(cap, false);
>  		goto out_unlock;
>  	}
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index fab9d6461a65..98d746b3bb53 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1411,8 +1411,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  	dout("removing cap %p, ci is %p, inode is %p\n",
>  	     cap, ci, &ci->vfs_inode);
>  	spin_lock(&ci->i_ceph_lock);
> -	if (cap->mds_wanted | cap->issued)
> -		ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
>  	__ceph_remove_cap(cap, false);
>  	if (!ci->i_auth_cap) {
>  		struct ceph_cap_flush *cf;
> @@ -1578,9 +1576,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
>  			/* mds did not re-issue stale cap */
>  			spin_lock(&ci->i_ceph_lock);
>  			cap->issued = cap->implemented = CEPH_CAP_PIN;
> -			/* make sure mds knows what we want */
> -			if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
> -				ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
>  			spin_unlock(&ci->i_ceph_lock);
>  		}
>  	} else if (ev == FORCE_RO) {
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 37dc1ac8f6c3..48e84d7f48a0 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -517,12 +517,11 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
>  #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
>  #define CEPH_I_POOL_WR		(1 << 5)  /* can write to pool */
>  #define CEPH_I_SEC_INITED	(1 << 6)  /* security initialized */
> -#define CEPH_I_CAP_DROPPED	(1 << 7)  /* caps were forcibly dropped */
> -#define CEPH_I_KICK_FLUSH	(1 << 8)  /* kick flushing caps */
> -#define CEPH_I_FLUSH_SNAPS	(1 << 9)  /* need flush snapss */
> -#define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */
> -#define CEPH_I_ERROR_FILELOCK	(1 << 11) /* have seen file lock errors */
> -#define CEPH_I_ODIRECT		(1 << 12) /* inode in direct I/O mode */
> +#define CEPH_I_KICK_FLUSH	(1 << 7)  /* kick flushing caps */
> +#define CEPH_I_FLUSH_SNAPS	(1 << 8)  /* need flush snapss */
> +#define CEPH_I_ERROR_WRITE	(1 << 9)  /* have seen write errors */
> +#define CEPH_I_ERROR_FILELOCK	(1 << 10) /* have seen file lock errors */
> +#define CEPH_I_ODIRECT		(1 << 11) /* inode in direct I/O mode */
>  
>  /*
>   * Masks of ceph inode work.

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 1/4] ceph: always renew caps if mds_wanted is insufficient
  2020-02-21 17:17   ` Jeff Layton
@ 2020-02-24  3:17     ` Yan, Zheng
  2020-02-24 15:17       ` Jeff Layton
  0 siblings, 1 reply; 20+ messages in thread
From: Yan, Zheng @ 2020-02-24  3:17 UTC (permalink / raw)
  To: Jeff Layton, ceph-devel

On 2/22/20 1:17 AM, Jeff Layton wrote:
> On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
>> original code only renews caps for inodes with CEPH_I_CAP_DROPPED flags.
>> The flag indicates that mds closed session and caps were dropped. This
>> patch is preparation for not requesting caps for idle open files.
>>
>> CEPH_I_CAP_DROPPED is no longer tested by anyone, so this patch also
>> remove it.
>>
>> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
>> ---
>>   fs/ceph/caps.c       | 36 +++++++++++++++---------------------
>>   fs/ceph/mds_client.c |  5 -----
>>   fs/ceph/super.h      | 11 +++++------
>>   3 files changed, 20 insertions(+), 32 deletions(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index d05717397c2a..293920d013ff 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -2659,6 +2659,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>>   		}
>>   	} else {
>>   		int session_readonly = false;
>> +		int mds_wanted;
>>   		if (ci->i_auth_cap &&
>>   		    (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
>>   			struct ceph_mds_session *s = ci->i_auth_cap->session;
>> @@ -2667,32 +2668,27 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
>>   			spin_unlock(&s->s_cap_lock);
>>   		}
>>   		if (session_readonly) {
>> -			dout("get_cap_refs %p needed %s but mds%d readonly\n",
>> +			dout("get_cap_refs %p need %s but mds%d readonly\n",
>>   			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
>>   			ret = -EROFS;
>>   			goto out_unlock;
>>   		}
>>   
>> -		if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
>> -			int mds_wanted;
>> -			if (READ_ONCE(mdsc->fsc->mount_state) ==
>> -			    CEPH_MOUNT_SHUTDOWN) {
>> -				dout("get_cap_refs %p forced umount\n", inode);
>> -				ret = -EIO;
>> -				goto out_unlock;
>> -			}
>> -			mds_wanted = __ceph_caps_mds_wanted(ci, false);
>> -			if (need & ~(mds_wanted & need)) {
>> -				dout("get_cap_refs %p caps were dropped"
>> -				     " (session killed?)\n", inode);
>> -				ret = -ESTALE;
>> -				goto out_unlock;
>> -			}
>> -			if (!(file_wanted & ~mds_wanted))
>> -				ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
>> +		if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
>> +			dout("get_cap_refs %p forced umount\n", inode);
>> +			ret = -EIO;
>> +			goto out_unlock;
>> +		}
>> +		mds_wanted = __ceph_caps_mds_wanted(ci, false);
>> +		if (need & ~mds_wanted) {
>> +			dout("get_cap_refs %p need %s > mds_wanted %s\n",
>> +			     inode, ceph_cap_string(need),
>> +			     ceph_cap_string(mds_wanted));
>> +			ret = -ESTALE;
>> +			goto out_unlock;
>>   		}
>>   
> 
> I was able to reproduce softlockups with xfstests reliably for a little
> while, but it doesn't always happen. I bisected it down to this patch
> though. I suspect the problem is in the hunk above. It looks like it's
> getting into a situation where this is continually returning ESTALE.
> 
> I cranked up debug logging in this function and I see this:
> 
> [  259.284839] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> [  259.284848] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> [  259.284855] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> [  259.284863] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> [  259.284868] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
> [  259.284877] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> [  259.284885] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> [  259.284890] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> [  259.284899] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> [  259.284908] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
> [  259.284918] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> [  259.284926] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> [  259.284945] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
> [  259.284950] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> [  259.284961] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> [  259.284969] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> [  259.284975] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> [  259.284984] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
> [  259.284994] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> [  259.285003] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> 

Looks like ceph_check_caps did update caps' mds_wanted. Did you test 
this patch with async dirops patches? please try reproducing this issue 
again with debug log of try_get_cap_refs(), ceph_check_caps() and 
ceph_renew_caps().

Thanks
Yan, Zheng


> ...not sure I understand the logical flow in this function well enough
> to suggest a fix yet though.
> 
>> -		dout("get_cap_refs %p have %s needed %s\n", inode,
>> +		dout("get_cap_refs %p have %s need %s\n", inode,
>>   		     ceph_cap_string(have), ceph_cap_string(need));
>>   	}
>>   out_unlock:
>> @@ -3646,8 +3642,6 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
>>   		goto out_unlock;
>>   
>>   	if (target < 0) {
>> -		if (cap->mds_wanted | cap->issued)
>> -			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
>>   		__ceph_remove_cap(cap, false);
>>   		goto out_unlock;
>>   	}
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index fab9d6461a65..98d746b3bb53 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -1411,8 +1411,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
>>   	dout("removing cap %p, ci is %p, inode is %p\n",
>>   	     cap, ci, &ci->vfs_inode);
>>   	spin_lock(&ci->i_ceph_lock);
>> -	if (cap->mds_wanted | cap->issued)
>> -		ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
>>   	__ceph_remove_cap(cap, false);
>>   	if (!ci->i_auth_cap) {
>>   		struct ceph_cap_flush *cf;
>> @@ -1578,9 +1576,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
>>   			/* mds did not re-issue stale cap */
>>   			spin_lock(&ci->i_ceph_lock);
>>   			cap->issued = cap->implemented = CEPH_CAP_PIN;
>> -			/* make sure mds knows what we want */
>> -			if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
>> -				ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
>>   			spin_unlock(&ci->i_ceph_lock);
>>   		}
>>   	} else if (ev == FORCE_RO) {
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 37dc1ac8f6c3..48e84d7f48a0 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -517,12 +517,11 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
>>   #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
>>   #define CEPH_I_POOL_WR		(1 << 5)  /* can write to pool */
>>   #define CEPH_I_SEC_INITED	(1 << 6)  /* security initialized */
>> -#define CEPH_I_CAP_DROPPED	(1 << 7)  /* caps were forcibly dropped */
>> -#define CEPH_I_KICK_FLUSH	(1 << 8)  /* kick flushing caps */
>> -#define CEPH_I_FLUSH_SNAPS	(1 << 9)  /* need flush snapss */
>> -#define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */
>> -#define CEPH_I_ERROR_FILELOCK	(1 << 11) /* have seen file lock errors */
>> -#define CEPH_I_ODIRECT		(1 << 12) /* inode in direct I/O mode */
>> +#define CEPH_I_KICK_FLUSH	(1 << 7)  /* kick flushing caps */
>> +#define CEPH_I_FLUSH_SNAPS	(1 << 8)  /* need flush snapss */
>> +#define CEPH_I_ERROR_WRITE	(1 << 9)  /* have seen write errors */
>> +#define CEPH_I_ERROR_FILELOCK	(1 << 10) /* have seen file lock errors */
>> +#define CEPH_I_ODIRECT		(1 << 11) /* inode in direct I/O mode */
>>   
>>   /*
>>    * Masks of ceph inode work.
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps
  2020-02-21 14:35     ` Jeff Layton
@ 2020-02-24  7:09       ` Yan, Zheng
  0 siblings, 0 replies; 20+ messages in thread
From: Yan, Zheng @ 2020-02-24  7:09 UTC (permalink / raw)
  To: Jeff Layton, ceph-devel

On 2/21/20 10:35 PM, Jeff Layton wrote:
> On Fri, 2020-02-21 at 09:27 -0500, Jeff Layton wrote:
>> On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
>>> Add i_last_rd and i_last_wr to ceph_inode_info. These two fields are
>>> used to track inode's last read/write, they are updated when getting
>>> caps for read/write.
>>>
>>> If there is no read/write on an inode for 'caps_wanted_delay_max'
>>> seconds, __ceph_caps_file_wanted() does not request caps for read/write
>>> even there are open files.
>>>
>>> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
>>> ---
>>>   fs/ceph/caps.c               | 152 ++++++++++++++++++++++++-----------
>>>   fs/ceph/file.c               |  21 ++---
>>>   fs/ceph/inode.c              |  10 ++-
>>>   fs/ceph/ioctl.c              |   2 +
>>>   fs/ceph/super.h              |  13 ++-
>>>   include/linux/ceph/ceph_fs.h |   1 +
>>>   6 files changed, 139 insertions(+), 60 deletions(-)
>>>
>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>> index 293920d013ff..2a9df235286d 100644
>>> --- a/fs/ceph/caps.c
>>> +++ b/fs/ceph/caps.c
>>> @@ -971,18 +971,49 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
>>>   	return used;
>>>   }
>>>   
>>> +#define FMODE_WAIT_BIAS 1000
>>> +
>>>   /*
>>>    * wanted, by virtue of open file modes
>>>    */
>>>   int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
>>>   {
>>> -	int i, bits = 0;
>>> -	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
>>> -		if (ci->i_nr_by_mode[i])
>>> -			bits |= 1 << i;
>>> +	struct ceph_mount_options *opt =
>>> +		ceph_inode_to_client(&ci->vfs_inode)->mount_options;
>>> +	unsigned long used_cutoff =
>>> +		round_jiffies(jiffies - opt->caps_wanted_delay_max * HZ);
>>> +	unsigned long idle_cutoff =
>>> +		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
>>> +	int bits = 0;
>>> +
>>> +	if (ci->i_nr_by_mode[0] > 0)
>>> +		bits |= CEPH_FILE_MODE_PIN;
>>> +
>>> +	if (ci->i_nr_by_mode[1] > 0) {
>>> +		if (ci->i_nr_by_mode[1] >= FMODE_WAIT_BIAS ||
>>> +		    time_after(ci->i_last_rd, used_cutoff))
>>> +			bits |= CEPH_FILE_MODE_RD;
>>> +	} else if (time_after(ci->i_last_rd, idle_cutoff)) {
>>> +		bits |= CEPH_FILE_MODE_RD;
>>> +	}
>>> +
>>> +	if (ci->i_nr_by_mode[2] > 0) {
>>> +		if (ci->i_nr_by_mode[2] >= FMODE_WAIT_BIAS ||
>>> +		    time_after(ci->i_last_wr, used_cutoff))
>>> +			bits |= CEPH_FILE_MODE_WR;
>>> +	} else if (time_after(ci->i_last_wr, idle_cutoff)) {
>>> +		bits |= CEPH_FILE_MODE_WR;
>>>   	}
>>> +
>>> +	/* check lazyio only when read/write is wanted */
>>> +	if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[3] > 0)
>>
>> LAZY is 4. Shouldn't this be?
>>
>>      if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[CEPH_FILE_MODE_LAZY] > 0)
>>
> 
> Nope, that value was right, but I think we should phrase this in terms
> of symbolic constants. Maybe we can squash this patch into your series?
> 
> -----------------------8<-----------------------
> 
> [PATCH] SQUASH: use symbolic constants in __ceph_caps_file_wanted()
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>   fs/ceph/caps.c | 15 ++++++++-------
>   1 file changed, 8 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index ad365cf870f6..1b450f2195fe 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -971,19 +971,19 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
>   		round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
>   	int bits = 0;
>   
> -	if (ci->i_nr_by_mode[0] > 0)
> +	if (ci->i_nr_by_mode[CEPH_FILE_MODE_PIN] > 0)
>   		bits |= CEPH_FILE_MODE_PIN;
>   
> -	if (ci->i_nr_by_mode[1] > 0) {
> -		if (ci->i_nr_by_mode[1] >= FMODE_WAIT_BIAS ||
> +	if (ci->i_nr_by_mode[CEPH_FILE_MODE_RD] > 0) {
> +		if (ci->i_nr_by_mode[CEPH_FILE_MODE_RD] >= FMODE_WAIT_BIAS ||
>   		    time_after(ci->i_last_rd, used_cutoff))
>   			bits |= CEPH_FILE_MODE_RD;
>   	} else if (time_after(ci->i_last_rd, idle_cutoff)) {
>   		bits |= CEPH_FILE_MODE_RD;
>   	}
>   
> -	if (ci->i_nr_by_mode[2] > 0) {
> -		if (ci->i_nr_by_mode[2] >= FMODE_WAIT_BIAS ||
> +	if (ci->i_nr_by_mode[CEPH_FILE_MODE_WR] > 0) {
> +		if (ci->i_nr_by_mode[CEPH_FILE_MODE_WR] >= FMODE_WAIT_BIAS ||
>   		    time_after(ci->i_last_wr, used_cutoff))
>   			bits |= CEPH_FILE_MODE_WR;
>   	} else if (time_after(ci->i_last_wr, idle_cutoff)) {
> @@ -991,12 +991,13 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
>   	}
>   
>   	/* check lazyio only when read/write is wanted */
> -	if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[3] > 0)
> +	if ((bits & CEPH_FILE_MODE_RDWR) &&
> +	    ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)] > 0)
>   		bits |= CEPH_FILE_MODE_LAZY;
>   
>   	if (bits == 0)
>   		return 0;
> -	if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
> +	if (bits == (1 << CEPH_FILE_MODE_PIN) && !S_ISDIR(ci->vfs_inode.i_mode))
>   		return 0;
>   
>   	return ceph_caps_for_mode(bits >> 1);
> 

how about something like below. when compile with -O2, gcc optimize out 
ffs() functions.

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a9df235286d..e1d38ef9478b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -985,33 +985,38 @@ int __ceph_caps_file_wanted(struct ceph_inode_info 
*ci)
         unsigned long idle_cutoff =
                 round_jiffies(jiffies - opt->caps_wanted_delay_min * HZ);
         int bits = 0;
+       const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
+       const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
+       const int WR_SHIFT= ffs(CEPH_FILE_MODE_WR);
+       const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);

-       if (ci->i_nr_by_mode[0] > 0)
-               bits |= CEPH_FILE_MODE_PIN;
+       if (ci->i_nr_by_mode[PIN_SHIFT] > 0)
+               bits |= 1 << PIN_SHIFT;

-       if (ci->i_nr_by_mode[1] > 0) {
-               if (ci->i_nr_by_mode[1] >= FMODE_WAIT_BIAS ||
+       if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
+               if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
                     time_after(ci->i_last_rd, used_cutoff))
-                       bits |= CEPH_FILE_MODE_RD;
+                       bits |= 1 << RD_SHIFT;
         } else if (time_after(ci->i_last_rd, idle_cutoff)) {
-               bits |= CEPH_FILE_MODE_RD;
+               bits |= 1 << RD_SHIFT;
         }

-       if (ci->i_nr_by_mode[2] > 0) {
-               if (ci->i_nr_by_mode[2] >= FMODE_WAIT_BIAS ||
+       if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
+               if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
                     time_after(ci->i_last_wr, used_cutoff))
-                       bits |= CEPH_FILE_MODE_WR;
+                       bits |= 1 << WR_SHIFT;
         } else if (time_after(ci->i_last_wr, idle_cutoff)) {
-               bits |= CEPH_FILE_MODE_WR;
+               bits |= 1 << WR_SHIFT;
         }

         /* check lazyio only when read/write is wanted */
-       if ((bits & CEPH_FILE_MODE_RDWR) && ci->i_nr_by_mode[3] > 0)
-               bits |= CEPH_FILE_MODE_LAZY;
+       if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
+           ci->i_nr_by_mode[LAZY_SHIFT] > 0)
+               bits |= 1 << LAZY_SHIFT;

         if (bits == 0)
                 return 0;
-       if (bits == 1 && !S_ISDIR(ci->vfs_inode.i_mode))
+       if (bits == (1 << PIN_SHIFT) && !S_ISDIR(ci->vfs_inode.i_mode))
                 return 0;

         return ceph_caps_for_mode(bits >> 1);

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 1/4] ceph: always renew caps if mds_wanted is insufficient
  2020-02-24  3:17     ` Yan, Zheng
@ 2020-02-24 15:17       ` Jeff Layton
  0 siblings, 0 replies; 20+ messages in thread
From: Jeff Layton @ 2020-02-24 15:17 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Mon, 2020-02-24 at 11:17 +0800, Yan, Zheng wrote:
> On 2/22/20 1:17 AM, Jeff Layton wrote:
> > On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> > > original code only renews caps for inodes with CEPH_I_CAP_DROPPED flags.
> > > The flag indicates that mds closed session and caps were dropped. This
> > > patch is preparation for not requesting caps for idle open files.
> > > 
> > > CEPH_I_CAP_DROPPED is no longer tested by anyone, so this patch also
> > > remove it.
> > > 
> > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > > ---
> > >   fs/ceph/caps.c       | 36 +++++++++++++++---------------------
> > >   fs/ceph/mds_client.c |  5 -----
> > >   fs/ceph/super.h      | 11 +++++------
> > >   3 files changed, 20 insertions(+), 32 deletions(-)
> > > 
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index d05717397c2a..293920d013ff 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -2659,6 +2659,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> > >   		}
> > >   	} else {
> > >   		int session_readonly = false;
> > > +		int mds_wanted;
> > >   		if (ci->i_auth_cap &&
> > >   		    (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
> > >   			struct ceph_mds_session *s = ci->i_auth_cap->session;
> > > @@ -2667,32 +2668,27 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> > >   			spin_unlock(&s->s_cap_lock);
> > >   		}
> > >   		if (session_readonly) {
> > > -			dout("get_cap_refs %p needed %s but mds%d readonly\n",
> > > +			dout("get_cap_refs %p need %s but mds%d readonly\n",
> > >   			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
> > >   			ret = -EROFS;
> > >   			goto out_unlock;
> > >   		}
> > >   
> > > -		if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
> > > -			int mds_wanted;
> > > -			if (READ_ONCE(mdsc->fsc->mount_state) ==
> > > -			    CEPH_MOUNT_SHUTDOWN) {
> > > -				dout("get_cap_refs %p forced umount\n", inode);
> > > -				ret = -EIO;
> > > -				goto out_unlock;
> > > -			}
> > > -			mds_wanted = __ceph_caps_mds_wanted(ci, false);
> > > -			if (need & ~(mds_wanted & need)) {
> > > -				dout("get_cap_refs %p caps were dropped"
> > > -				     " (session killed?)\n", inode);
> > > -				ret = -ESTALE;
> > > -				goto out_unlock;
> > > -			}
> > > -			if (!(file_wanted & ~mds_wanted))
> > > -				ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
> > > +		if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
> > > +			dout("get_cap_refs %p forced umount\n", inode);
> > > +			ret = -EIO;
> > > +			goto out_unlock;
> > > +		}
> > > +		mds_wanted = __ceph_caps_mds_wanted(ci, false);
> > > +		if (need & ~mds_wanted) {
> > > +			dout("get_cap_refs %p need %s > mds_wanted %s\n",
> > > +			     inode, ceph_cap_string(need),
> > > +			     ceph_cap_string(mds_wanted));
> > > +			ret = -ESTALE;
> > > +			goto out_unlock;
> > >   		}
> > >   
> > 
> > I was able to reproduce softlockups with xfstests reliably for a little
> > while, but it doesn't always happen. I bisected it down to this patch
> > though. I suspect the problem is in the hunk above. It looks like it's
> > getting into a situation where this is continually returning ESTALE.
> > 
> > I cranked up debug logging in this function and I see this:
> > 
> > [  259.284839] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> > [  259.284848] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> > [  259.284855] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> > [  259.284863] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > [  259.284868] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
> > [  259.284877] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > [  259.284885] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > [  259.284890] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > [  259.284899] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> > [  259.284908] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
> > [  259.284918] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > [  259.284926] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > [  259.284945] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
> > [  259.284950] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > [  259.284961] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > [  259.284969] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > [  259.284975] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> > [  259.284984] ceph:  get_cap_refs 000000003d7d65fa need Fr > mds_wanted -
> > [  259.284994] ceph:  get_cap_refs 000000003d7d65fa ret -116 got -
> > [  259.285003] ceph:  get_cap_refs 000000003d7d65fa need Fr want Fc
> > 
> 
> Looks like ceph_check_caps did update caps' mds_wanted. Did you test 
> this patch with async dirops patches? please try reproducing this issue 
> again with debug log of try_get_cap_refs(), ceph_check_caps() and 
> ceph_renew_caps().
> 
> 

I tried with and without async dirops and was able to reproduce it
either way. It seems like the issue is that the mds_wanted set is not
being updated properly when we end up in in get_cap_refs.

I'm traveling at the moment though and won't be able to reproduce this
until I'm back at work (Thursday).


> 
> > ...not sure I understand the logical flow in this function well enough
> > to suggest a fix yet though.
> > 
> > > -		dout("get_cap_refs %p have %s needed %s\n", inode,
> > > +		dout("get_cap_refs %p have %s need %s\n", inode,
> > >   		     ceph_cap_string(have), ceph_cap_string(need));
> > >   	}
> > >   out_unlock:
> > > @@ -3646,8 +3642,6 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
> > >   		goto out_unlock;
> > >   
> > >   	if (target < 0) {
> > > -		if (cap->mds_wanted | cap->issued)
> > > -			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
> > >   		__ceph_remove_cap(cap, false);
> > >   		goto out_unlock;
> > >   	}
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index fab9d6461a65..98d746b3bb53 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -1411,8 +1411,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> > >   	dout("removing cap %p, ci is %p, inode is %p\n",
> > >   	     cap, ci, &ci->vfs_inode);
> > >   	spin_lock(&ci->i_ceph_lock);
> > > -	if (cap->mds_wanted | cap->issued)
> > > -		ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
> > >   	__ceph_remove_cap(cap, false);
> > >   	if (!ci->i_auth_cap) {
> > >   		struct ceph_cap_flush *cf;
> > > @@ -1578,9 +1576,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
> > >   			/* mds did not re-issue stale cap */
> > >   			spin_lock(&ci->i_ceph_lock);
> > >   			cap->issued = cap->implemented = CEPH_CAP_PIN;
> > > -			/* make sure mds knows what we want */
> > > -			if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
> > > -				ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
> > >   			spin_unlock(&ci->i_ceph_lock);
> > >   		}
> > >   	} else if (ev == FORCE_RO) {
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index 37dc1ac8f6c3..48e84d7f48a0 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -517,12 +517,11 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> > >   #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
> > >   #define CEPH_I_POOL_WR		(1 << 5)  /* can write to pool */
> > >   #define CEPH_I_SEC_INITED	(1 << 6)  /* security initialized */
> > > -#define CEPH_I_CAP_DROPPED	(1 << 7)  /* caps were forcibly dropped */
> > > -#define CEPH_I_KICK_FLUSH	(1 << 8)  /* kick flushing caps */
> > > -#define CEPH_I_FLUSH_SNAPS	(1 << 9)  /* need flush snapss */
> > > -#define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */
> > > -#define CEPH_I_ERROR_FILELOCK	(1 << 11) /* have seen file lock errors */
> > > -#define CEPH_I_ODIRECT		(1 << 12) /* inode in direct I/O mode */
> > > +#define CEPH_I_KICK_FLUSH	(1 << 7)  /* kick flushing caps */
> > > +#define CEPH_I_FLUSH_SNAPS	(1 << 8)  /* need flush snapss */
> > > +#define CEPH_I_ERROR_WRITE	(1 << 9)  /* have seen write errors */
> > > +#define CEPH_I_ERROR_FILELOCK	(1 << 10) /* have seen file lock errors */
> > > +#define CEPH_I_ODIRECT		(1 << 11) /* inode in direct I/O mode */
> > >   
> > >   /*
> > >    * Masks of ceph inode work.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 4/4] ceph: remove delay check logic from ceph_check_caps()
  2020-02-21 13:16 ` [PATCH v2 4/4] ceph: remove delay check logic from ceph_check_caps() Yan, Zheng
@ 2020-02-27 11:43   ` Jeff Layton
  2020-02-27 12:08     ` Yan, Zheng
  0 siblings, 1 reply; 20+ messages in thread
From: Jeff Layton @ 2020-02-27 11:43 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel

On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> __ceph_caps_file_wanted() already checks 'caps_wanted_delay_min' and
> 'caps_wanted_delay_max'. There is no need to duplicte the logic in
> ceph_check_caps() and __send_cap()
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/caps.c  | 146 ++++++++++++------------------------------------
>  fs/ceph/file.c  |   7 +--
>  fs/ceph/inode.c |   1 -
>  fs/ceph/super.h |   8 +--
>  4 files changed, 42 insertions(+), 120 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 2959e4c36a15..ad365cf870f6 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
>  			       struct ceph_inode_info *ci)
>  {
>  	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
> -
> -	ci->i_hold_caps_min = round_jiffies(jiffies +
> -					    opt->caps_wanted_delay_min * HZ);
>  	ci->i_hold_caps_max = round_jiffies(jiffies +
>  					    opt->caps_wanted_delay_max * HZ);
> -	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
> -	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
> +	dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
> +	     ci->i_hold_caps_max - jiffies);
>  }
>  
>  /*
> @@ -508,8 +505,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
>   *    -> we take mdsc->cap_delay_lock
>   */
>  static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> -				struct ceph_inode_info *ci,
> -				bool set_timeout)
> +				struct ceph_inode_info *ci)
>  {
>  	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
>  	     ci->i_ceph_flags, ci->i_hold_caps_max);
> @@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>  				goto no_change;
>  			list_del_init(&ci->i_cap_delay_list);
>  		}
> -		if (set_timeout)
> -			__cap_set_timeouts(mdsc, ci);
> +		__cap_set_timeouts(mdsc, ci);
>  		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
>  no_change:
>  		spin_unlock(&mdsc->cap_delay_lock);
> @@ -719,7 +714,7 @@ void ceph_add_cap(struct inode *inode,
>  		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
>  		     ceph_cap_string(issued), ceph_cap_string(wanted),
>  		     ceph_cap_string(actual_wanted));
> -		__cap_delay_requeue(mdsc, ci, true);
> +		__cap_delay_requeue(mdsc, ci);
>  	}
>  
>  	if (flags & CEPH_CAP_FLAG_AUTH) {
> @@ -1304,7 +1299,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>  	struct cap_msg_args arg;
>  	int held, revoking;
>  	int wake = 0;
> -	int delayed = 0;
>  	int ret;
>  
>  	held = cap->issued | cap->implemented;
> @@ -1317,28 +1311,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>  	     ceph_cap_string(revoking));
>  	BUG_ON((retain & CEPH_CAP_PIN) == 0);
>  
> -	arg.session = cap->session;
> -
> -	/* don't release wanted unless we've waited a bit. */
> -	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
> -	    time_before(jiffies, ci->i_hold_caps_min)) {
> -		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
> -		     ceph_cap_string(cap->issued),
> -		     ceph_cap_string(cap->issued & retain),
> -		     ceph_cap_string(cap->mds_wanted),
> -		     ceph_cap_string(want));
> -		want |= cap->mds_wanted;
> -		retain |= cap->issued;
> -		delayed = 1;
> -	}
> -	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
> -	if (want & ~cap->mds_wanted) {
> -		/* user space may open/close single file frequently.
> -		 * This avoids droping mds_wanted immediately after
> -		 * requesting new mds_wanted.
> -		 */
> -		__cap_set_timeouts(mdsc, ci);
> -	}
> +	ci->i_ceph_flags &= ~CEPH_I_FLUSH;
>  
>  	cap->issued &= retain;  /* drop bits we don't want */
>  	if (cap->implemented & ~cap->issued) {
> @@ -1353,6 +1326,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>  	cap->implemented &= cap->issued | used;
>  	cap->mds_wanted = want;
>  
> +	arg.session = cap->session;
>  	arg.ino = ceph_vino(inode).ino;
>  	arg.cid = cap->cap_id;
>  	arg.follows = flushing ? ci->i_head_snapc->seq : 0;
> @@ -1413,14 +1387,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>  
>  	ret = send_cap_msg(&arg);
>  	if (ret < 0) {
> -		dout("error sending cap msg, must requeue %p\n", inode);
> -		delayed = 1;
> +		pr_err("error sending cap msg, ino (%llx.%llx) "
> +		       "flushing %s tid %llu, requeue\n",
> +		       ceph_vinop(inode), ceph_cap_string(flushing),
> +		       flush_tid);
> +		spin_lock(&ci->i_ceph_lock);
> +		__cap_delay_requeue(mdsc, ci);
> +		spin_unlock(&ci->i_ceph_lock);
>  	}
>  
>  	if (wake)
>  		wake_up_all(&ci->i_cap_wq);
>  
> -	return delayed;
> +	return ret;
>  }
>  
>  static inline int __send_flush_snap(struct inode *inode,
> @@ -1684,7 +1663,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
>  	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
>  	    (mask & CEPH_CAP_FILE_BUFFER))
>  		dirty |= I_DIRTY_DATASYNC;
> -	__cap_delay_requeue(mdsc, ci, true);
> +	__cap_delay_requeue(mdsc, ci);
>  	return dirty;
>  }
>  
> @@ -1835,8 +1814,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
>   * versus held caps.  Release, flush, ack revoked caps to mds as
>   * appropriate.
>   *
> - *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
> - *    cap release further.
>   *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
>   *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
>   *    further delay.
> @@ -1855,17 +1832,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	int mds = -1;   /* keep track of how far we've gone through i_caps list
>  			   to avoid an infinite loop on retry */
>  	struct rb_node *p;
> -	int delayed = 0, sent = 0;
> -	bool no_delay = flags & CHECK_CAPS_NODELAY;
>  	bool queue_invalidate = false;
>  	bool tried_invalidate = false;
>  
> -	/* if we are unmounting, flush any unused caps immediately. */
> -	if (mdsc->stopping)
> -		no_delay = true;
> -
>  	spin_lock(&ci->i_ceph_lock);
> -
>  	if (ci->i_ceph_flags & CEPH_I_FLUSH)
>  		flags |= CHECK_CAPS_FLUSH;
>  
> @@ -1911,14 +1881,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	}
>  
>  	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
> -	     " issued %s revoking %s retain %s %s%s%s\n", inode,
> +	     " issued %s revoking %s retain %s %s%s\n", inode,
>  	     ceph_cap_string(file_wanted),
>  	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
>  	     ceph_cap_string(ci->i_flushing_caps),
>  	     ceph_cap_string(issued), ceph_cap_string(revoking),
>  	     ceph_cap_string(retain),
>  	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
> -	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
>  	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
>  
>  	/*
> @@ -1926,7 +1895,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	 * have cached pages, but don't want them, then try to invalidate.
>  	 * If we fail, it's because pages are locked.... try again later.
>  	 */
> -	if ((!no_delay || mdsc->stopping) &&
> +	if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
>  	    S_ISREG(inode->i_mode) &&
>  	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
>  	    inode->i_data.nrpages &&		/* have cached pages */
> @@ -2006,21 +1975,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		if ((cap->issued & ~retain) == 0)
>  			continue;     /* nope, all good */
>  
> -		if (no_delay)
> -			goto ack;
> -
> -		/* delay? */
> -		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
> -		    time_before(jiffies, ci->i_hold_caps_max)) {
> -			dout(" delaying issued %s -> %s, wanted %s -> %s\n",
> -			     ceph_cap_string(cap->issued),
> -			     ceph_cap_string(cap->issued & retain),
> -			     ceph_cap_string(cap->mds_wanted),
> -			     ceph_cap_string(want));
> -			delayed++;
> -			continue;
> -		}
> -
>  ack:
>  		if (session && session != cap->session) {
>  			dout("oops, wrong session %p mutex\n", session);
> @@ -2081,24 +2035,18 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		}
>  
>  		mds = cap->mds;  /* remember mds, so we don't repeat */
> -		sent++;
>  
>  		/* __send_cap drops i_ceph_lock */
> -		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
> -				cap_used, want, retain, flushing,
> -				flush_tid, oldest_flush_tid);
> +		__send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
> +			   retain, flushing, flush_tid, oldest_flush_tid);
>  		goto retry; /* retake i_ceph_lock and restart our cap scan. */
>  	}
>  
> -	if (list_empty(&ci->i_cap_delay_list)) {
> -	    if (delayed) {
> -		    /* Reschedule delayed caps release if we delayed anything */
> -		    __cap_delay_requeue(mdsc, ci, false);
> -	    } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> -			!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> -		    /* periodically re-calculate caps wanted by open files */
> -		    __cap_delay_requeue(mdsc, ci, true);
> -	    }
> +	/* periodically re-calculate caps wanted by open files */
> +	if (list_empty(&ci->i_cap_delay_list) &&
> +	    (file_wanted & ~CEPH_CAP_PIN) &&
> +	    !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> +		__cap_delay_requeue(mdsc, ci);
>  	}
>  
>  	spin_unlock(&ci->i_ceph_lock);
> @@ -2128,7 +2076,6 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
>  retry_locked:
>  	if (ci->i_dirty_caps && ci->i_auth_cap) {
>  		struct ceph_cap *cap = ci->i_auth_cap;
> -		int delayed;
>  
>  		if (session != cap->session) {
>  			spin_unlock(&ci->i_ceph_lock);
> @@ -2157,18 +2104,10 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
>  						 &oldest_flush_tid);
>  
>  		/* __send_cap drops i_ceph_lock */
> -		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
> -				     CEPH_CLIENT_CAPS_SYNC,
> -				     __ceph_caps_used(ci),
> -				     __ceph_caps_wanted(ci),
> -				     (cap->issued | cap->implemented),
> -				     flushing, flush_tid, oldest_flush_tid);
> -
> -		if (delayed) {
> -			spin_lock(&ci->i_ceph_lock);
> -			__cap_delay_requeue(mdsc, ci, true);
> -			spin_unlock(&ci->i_ceph_lock);
> -		}
> +		__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
> +			   __ceph_caps_used(ci), __ceph_caps_wanted(ci),
> +			   (cap->issued | cap->implemented),
> +			   flushing, flush_tid, oldest_flush_tid);
>  	} else {
>  		if (!list_empty(&ci->i_cap_flush_list)) {
>  			struct ceph_cap_flush *cf =
> @@ -2368,22 +2307,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
>  		if (cf->caps) {
>  			dout("kick_flushing_caps %p cap %p tid %llu %s\n",
>  			     inode, cap, cf->tid, ceph_cap_string(cf->caps));
> -			ci->i_ceph_flags |= CEPH_I_NODELAY;
> -
> -			ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
> +			__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
>  					 (cf->tid < last_snap_flush ?
>  					  CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
>  					  __ceph_caps_used(ci),
>  					  __ceph_caps_wanted(ci),
>  					  (cap->issued | cap->implemented),
>  					  cf->caps, cf->tid, oldest_flush_tid);
> -			if (ret) {
> -				pr_err("kick_flushing_caps: error sending "
> -					"cap flush, ino (%llx.%llx) "
> -					"tid %llu flushing %s\n",
> -					ceph_vinop(inode), cf->tid,
> -					ceph_cap_string(cf->caps));
> -			}
>  		} else {
>  			struct ceph_cap_snap *capsnap =
>  					container_of(cf, struct ceph_cap_snap,
> @@ -3001,7 +2931,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
>  	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
>  	     last ? " last" : "", put ? " put" : "");
>  
> -	if (last && !flushsnaps)
> +	if (last)
>  		ceph_check_caps(ci, 0, NULL);
>  	else if (flushsnaps)
>  		ceph_flush_snaps(ci, NULL);
> @@ -3419,10 +3349,10 @@ static void handle_cap_grant(struct inode *inode,
>  		wake_up_all(&ci->i_cap_wq);
>  
>  	if (check_caps == 1)
> -		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
> +		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
>  				session);
>  	else if (check_caps == 2)
> -		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
> +		ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
>  	else
>  		mutex_unlock(&session->s_mutex);
>  }
> @@ -4097,7 +4027,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
>  {
>  	struct inode *inode;
>  	struct ceph_inode_info *ci;
> -	int flags = CHECK_CAPS_NODELAY;
>  
>  	dout("check_delayed_caps\n");
>  	while (1) {
> @@ -4117,7 +4046,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
>  
>  		if (inode) {
>  			dout("check_delayed_caps on %p\n", inode);
> -			ceph_check_caps(ci, flags, NULL);
> +			ceph_check_caps(ci, 0, NULL);
>  			/* avoid calling iput_final() in tick thread */
>  			ceph_async_iput(inode);
>  		}
> @@ -4142,7 +4071,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
>  		ihold(inode);
>  		dout("flush_dirty_caps %p\n", inode);
>  		spin_unlock(&mdsc->cap_dirty_lock);
> -		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
> +		ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
>  		iput(inode);
>  		spin_lock(&mdsc->cap_dirty_lock);
>  	}
> @@ -4160,7 +4089,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
>  		ci->i_last_wr = now;
>  	/* queue periodic check */
>  	if (fmode && list_empty(&ci->i_cap_delay_list))
> -		__cap_delay_requeue(mdsc, ci, true);
> +		__cap_delay_requeue(mdsc, ci);
>  }
>  
>  void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> @@ -4209,7 +4138,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
>  	if (inode->i_nlink == 1) {
>  		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
>  
> -		ci->i_ceph_flags |= CEPH_I_NODELAY;
>  		if (__ceph_caps_dirty(ci)) {
>  			struct ceph_mds_client *mdsc =
>  				ceph_inode_to_client(inode)->mdsc;
> @@ -4265,8 +4193,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
>  		if (force || (cap->issued & drop)) {
>  			if (cap->issued & drop) {
>  				int wanted = __ceph_caps_wanted(ci);
> -				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
> -					wanted |= cap->mds_wanted;
>  				dout("encode_inode_release %p cap %p "
>  				     "%s -> %s, wanted %s -> %s\n", inode, cap,
>  				     ceph_cap_string(cap->issued),
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 84058d3c5685..1d76bdf1a1b9 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1552,7 +1552,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		if (dirty)
>  			__mark_inode_dirty(inode, dirty);
>  		if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
> -			ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
> +			ceph_check_caps(ci, 0, NULL);
>  	}
>  
>  	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
> @@ -2129,12 +2129,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  
>  	if (endoff > size) {
>  		int caps_flags = 0;
> -
>  		/* Let the MDS know about dst file size change */
> -		if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
> -			caps_flags |= CHECK_CAPS_NODELAY;
>  		if (ceph_inode_set_size(dst_inode, endoff))
>  			caps_flags |= CHECK_CAPS_AUTHONLY;
> +		if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
> +			caps_flags |= CHECK_CAPS_AUTHONLY;

Does ceph_quota_is_max_bytes_approaching have any side effects?

It looks like that call can be pretty expensive (including a possible
separate round trip to the MDS). If we don't need to call it for other
reasons, then we should probably do:

                if (ceph_inode_set_size(dst_inode, dst_off) ||                  
                    ceph_quota_is_max_bytes_approaching(dst_inode, endoff))     
                        caps_flags |= CHECK_CAPS_AUTHONLY;                      


>  		if (caps_flags)
>  			ceph_check_caps(dst_ci, caps_flags, NULL);
>  	}
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 0b0f503c84c3..5a8fa8a2d3cf 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -471,7 +471,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	ci->i_prealloc_cap_flush = NULL;
>  	INIT_LIST_HEAD(&ci->i_cap_flush_list);
>  	init_waitqueue_head(&ci->i_cap_wq);
> -	ci->i_hold_caps_min = 0;
>  	ci->i_hold_caps_max = 0;
>  	INIT_LIST_HEAD(&ci->i_cap_delay_list);
>  	INIT_LIST_HEAD(&ci->i_cap_snaps);
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index d89478db8b24..e586cff3dfd5 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -170,9 +170,9 @@ struct ceph_cap {
>  	struct list_head caps_item;
>  };
>  
> -#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
> -#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
> -#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
> +#define CHECK_CAPS_AUTHONLY   1  /* only check auth cap */
> +#define CHECK_CAPS_FLUSH      2  /* flush any dirty caps */
> +#define CHECK_CAPS_NOINVAL    4  /* don't invalidate pagecache */
>  
>  struct ceph_cap_flush {
>  	u64 tid;
> @@ -352,7 +352,6 @@ struct ceph_inode_info {
>  	struct ceph_cap_flush *i_prealloc_cap_flush;
>  	struct list_head i_cap_flush_list;
>  	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
> -	unsigned long i_hold_caps_min; /* jiffies */
>  	unsigned long i_hold_caps_max; /* jiffies */
>  	struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
>  	struct ceph_cap_reservation i_cap_migration_resv;
> @@ -513,7 +512,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
>   * Ceph inode.
>   */
>  #define CEPH_I_DIR_ORDERED	(1 << 0)  /* dentries in dir are ordered */
> -#define CEPH_I_NODELAY		(1 << 1)  /* do not delay cap release */
>  #define CEPH_I_FLUSH		(1 << 2)  /* do not delay flush of dirty metadata */
>  #define CEPH_I_POOL_PERM	(1 << 3)  /* pool rd/wr bits are valid */
>  #define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 0/4] ceph: don't request caps for idle open files
  2020-02-21 14:57 ` Jeff Layton
@ 2020-02-27 12:07   ` Yan, Zheng
       [not found]     ` <68c33106a754b443f2dd938f2d8eed6da19136dc.camel@kernel.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Yan, Zheng @ 2020-02-27 12:07 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Yan, Zheng, ceph-devel

On Fri, Feb 21, 2020 at 10:59 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> > This series make cephfs client not request caps for open files that
> > idle for a long time. For the case that one active client and multiple
> > standby clients open the same file, this increase the possibility that
> > mds issues exclusive caps to the active client.
> >
> > Yan, Zheng (4):
> >   ceph: always renew caps if mds_wanted is insufficient
> >   ceph: consider inode's last read/write when calculating wanted caps
> >   ceph: simplify calling of ceph_get_fmode()
> >   ceph: remove delay check logic from ceph_check_caps()
> >
> >  fs/ceph/caps.c               | 324 +++++++++++++++--------------------
> >  fs/ceph/file.c               |  39 ++---
> >  fs/ceph/inode.c              |  19 +-
> >  fs/ceph/ioctl.c              |   2 +
> >  fs/ceph/mds_client.c         |   5 -
> >  fs/ceph/super.h              |  35 ++--
> >  include/linux/ceph/ceph_fs.h |   1 +
> >  7 files changed, 188 insertions(+), 237 deletions(-)
> >
>
> Testing with this series and xfstests is throwing softlockups. Still
> looking at the cause:
>
> [  251.385573] watchdog: BUG: soft lockup - CPU#7 stuck for 23s! [fsstress:7113]
> [  251.387105] Modules linked in: ceph(OE) libceph(E) ip6t_REJECT(E) nf_reject_ipv6(E) ip6t_rpfilter(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_conntrack(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) ip_set(E) nfnetlink(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) cachefiles(E) fscache(E) sunrpc(E) pcspkr(E) joydev(E) i2c_piix4(E) virtio_balloon(E) crct10dif_pclmul(E) crc32_pclmul(E) ghash_clmulni_intel(E) ip_tables(E) xfs(E) libcrc32c(E) rfkill(E) qxl(E) drm_ttm_helper(E) crc32c_intel(E) ttm(E) drm_kms_helper(E) cec(E) serio_raw(E) virtio_net(E) net_failover(E) virtio_console(E) virtio_blk(E) failover(E) drm(E) ata_generic(E) floppy(E) pata_acpi(E) qemu_fw_cfg(E)
> [  251.402300] CPU: 7 PID: 7113 Comm: fsstress Kdump: loaded Tainted: G           OE     5.6.0-rc1+ #12
> [  251.404351] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
> [  251.407716] RIP: 0010:ceph_get_caps+0x108/0x5e0 [ceph]
> [  251.409154] Code: 80 cc 02 85 d2 44 89 f2 0f 45 d8 41 89 d8 e8 4f f8 ff ff 89 c1 83 f8 f5 74 ba 85 c0 0f 84 55 02 00 00 48 8b 74 24 50 f6 06 02 <74> 11 48 8b 44 24 48 8b 40 2c 39 46 1c 0f 85 97 04 00 00 85 c9 0f
> [  251.413302] RSP: 0018:ffffb72c01567a10 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13
> [  251.414827] RAX: 00000000ffffff8c RBX: 0000000000000002 RCX: 00000000ffffff8c
> [  251.416303] RDX: ffff8a1fd45b4468 RSI: ffff8a2228fd2c08 RDI: ffff8a1fd45b42d0
> [  251.417778] RBP: ffffb72c01567b00 R08: ffff8a2227fcb000 R09: ffffb72c01567a6c
> [  251.419265] R10: 0000000000000001 R11: 0000000000000000 R12: 00000000160337c1
> [  251.420811] R13: 0000000000001000 R14: 0000000000002000 R15: ffff8a1fd45b45f8
> [  251.422355] FS:  00007fcdcc9ebf40(0000) GS:ffff8a222fbc0000(0000) knlGS:0000000000000000
> [  251.423962] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  251.425294] CR2: 000000000162e000 CR3: 00000001f0b18000 CR4: 00000000003406e0
> [  251.426785] Call Trace:
> [  251.427739]  ? __vfs_getxattr+0x53/0x70
> [  251.428835]  ? generic_update_time+0x9d/0xc0
> [  251.429978]  ? file_update_time+0xdd/0x120
> [  251.431106]  ceph_write_iter+0x27b/0xc30 [ceph]
> [  251.432285]  ? ceph_read_iter+0x3b3/0xd60 [ceph]
> [  251.433463]  ? _cond_resched+0x15/0x30
> [  251.434533]  ? do_iter_readv_writev+0x158/0x1d0
> [  251.435681]  do_iter_readv_writev+0x158/0x1d0
> [  251.436825]  do_iter_write+0x7d/0x190
> [  251.437867]  iter_file_splice_write+0x26a/0x3c0
> [  251.439059]  direct_splice_actor+0x35/0x40
> [  251.440144]  splice_direct_to_actor+0x102/0x250
> [  251.441274]  ? generic_pipe_buf_nosteal+0x10/0x10
> [  251.442444]  do_splice_direct+0x8b/0xd0
> [  251.443491]  generic_copy_file_range+0x32/0x40
> [  251.444596]  vfs_copy_file_range+0x2eb/0x310
> [  251.445691]  ? __do_sys_newfstat+0x5a/0x70
> [  251.446764]  __x64_sys_copy_file_range+0xd6/0x210
> [  251.447933]  do_syscall_64+0x5b/0x1c0
> [  251.448972]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [  251.450188] RIP: 0033:0x7fcdccae91ed
> [  251.451236] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 6b 5c 0c 00 f7 d8 64 89 01 48
> [  251.455096] RSP: 002b:00007ffcbac62e28 EFLAGS: 00000246 ORIG_RAX: 0000000000000146
> [  251.456664] RAX: ffffffffffffffda RBX: 0000000124e3181d RCX: 00007fcdccae91ed
> [  251.458153] RDX: 0000000000000004 RSI: 00007ffcbac62e60 RDI: 0000000000000003
> [  251.459626] RBP: 0000000000000003 R08: 000000000001c50b R09: 0000000000000000
> [  251.461147] R10: 00007ffcbac62e68 R11: 0000000000000246 R12: 0000000000000088
> [  251.462680] R13: 0000000000000004 R14: 000000000001c50b R15: 000000001602447d
>
>

I have tried fsstress for days. found a few other issue, but still
can't reproduce this. Please try branch
https://github.com/ceph/ceph-client/tree/wip-fmode-timeout

Regards
Yan, Zheng

> --
> Jeff Layton <jlayton@kernel.org>
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 4/4] ceph: remove delay check logic from ceph_check_caps()
  2020-02-27 11:43   ` Jeff Layton
@ 2020-02-27 12:08     ` Yan, Zheng
  0 siblings, 0 replies; 20+ messages in thread
From: Yan, Zheng @ 2020-02-27 12:08 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Yan, Zheng, ceph-devel

On Thu, Feb 27, 2020 at 7:45 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> > __ceph_caps_file_wanted() already checks 'caps_wanted_delay_min' and
> > 'caps_wanted_delay_max'. There is no need to duplicte the logic in
> > ceph_check_caps() and __send_cap()
> >
> > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > ---
> >  fs/ceph/caps.c  | 146 ++++++++++++------------------------------------
> >  fs/ceph/file.c  |   7 +--
> >  fs/ceph/inode.c |   1 -
> >  fs/ceph/super.h |   8 +--
> >  4 files changed, 42 insertions(+), 120 deletions(-)
> >
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index 2959e4c36a15..ad365cf870f6 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
> >                              struct ceph_inode_info *ci)
> >  {
> >       struct ceph_mount_options *opt = mdsc->fsc->mount_options;
> > -
> > -     ci->i_hold_caps_min = round_jiffies(jiffies +
> > -                                         opt->caps_wanted_delay_min * HZ);
> >       ci->i_hold_caps_max = round_jiffies(jiffies +
> >                                           opt->caps_wanted_delay_max * HZ);
> > -     dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
> > -          ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
> > +     dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
> > +          ci->i_hold_caps_max - jiffies);
> >  }
> >
> >  /*
> > @@ -508,8 +505,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
> >   *    -> we take mdsc->cap_delay_lock
> >   */
> >  static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > -                             struct ceph_inode_info *ci,
> > -                             bool set_timeout)
> > +                             struct ceph_inode_info *ci)
> >  {
> >       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
> >            ci->i_ceph_flags, ci->i_hold_caps_max);
> > @@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> >                               goto no_change;
> >                       list_del_init(&ci->i_cap_delay_list);
> >               }
> > -             if (set_timeout)
> > -                     __cap_set_timeouts(mdsc, ci);
> > +             __cap_set_timeouts(mdsc, ci);
> >               list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
> >  no_change:
> >               spin_unlock(&mdsc->cap_delay_lock);
> > @@ -719,7 +714,7 @@ void ceph_add_cap(struct inode *inode,
> >               dout(" issued %s, mds wanted %s, actual %s, queueing\n",
> >                    ceph_cap_string(issued), ceph_cap_string(wanted),
> >                    ceph_cap_string(actual_wanted));
> > -             __cap_delay_requeue(mdsc, ci, true);
> > +             __cap_delay_requeue(mdsc, ci);
> >       }
> >
> >       if (flags & CEPH_CAP_FLAG_AUTH) {
> > @@ -1304,7 +1299,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
> >       struct cap_msg_args arg;
> >       int held, revoking;
> >       int wake = 0;
> > -     int delayed = 0;
> >       int ret;
> >
> >       held = cap->issued | cap->implemented;
> > @@ -1317,28 +1311,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
> >            ceph_cap_string(revoking));
> >       BUG_ON((retain & CEPH_CAP_PIN) == 0);
> >
> > -     arg.session = cap->session;
> > -
> > -     /* don't release wanted unless we've waited a bit. */
> > -     if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
> > -         time_before(jiffies, ci->i_hold_caps_min)) {
> > -             dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
> > -                  ceph_cap_string(cap->issued),
> > -                  ceph_cap_string(cap->issued & retain),
> > -                  ceph_cap_string(cap->mds_wanted),
> > -                  ceph_cap_string(want));
> > -             want |= cap->mds_wanted;
> > -             retain |= cap->issued;
> > -             delayed = 1;
> > -     }
> > -     ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
> > -     if (want & ~cap->mds_wanted) {
> > -             /* user space may open/close single file frequently.
> > -              * This avoids droping mds_wanted immediately after
> > -              * requesting new mds_wanted.
> > -              */
> > -             __cap_set_timeouts(mdsc, ci);
> > -     }
> > +     ci->i_ceph_flags &= ~CEPH_I_FLUSH;
> >
> >       cap->issued &= retain;  /* drop bits we don't want */
> >       if (cap->implemented & ~cap->issued) {
> > @@ -1353,6 +1326,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
> >       cap->implemented &= cap->issued | used;
> >       cap->mds_wanted = want;
> >
> > +     arg.session = cap->session;
> >       arg.ino = ceph_vino(inode).ino;
> >       arg.cid = cap->cap_id;
> >       arg.follows = flushing ? ci->i_head_snapc->seq : 0;
> > @@ -1413,14 +1387,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
> >
> >       ret = send_cap_msg(&arg);
> >       if (ret < 0) {
> > -             dout("error sending cap msg, must requeue %p\n", inode);
> > -             delayed = 1;
> > +             pr_err("error sending cap msg, ino (%llx.%llx) "
> > +                    "flushing %s tid %llu, requeue\n",
> > +                    ceph_vinop(inode), ceph_cap_string(flushing),
> > +                    flush_tid);
> > +             spin_lock(&ci->i_ceph_lock);
> > +             __cap_delay_requeue(mdsc, ci);
> > +             spin_unlock(&ci->i_ceph_lock);
> >       }
> >
> >       if (wake)
> >               wake_up_all(&ci->i_cap_wq);
> >
> > -     return delayed;
> > +     return ret;
> >  }
> >
> >  static inline int __send_flush_snap(struct inode *inode,
> > @@ -1684,7 +1663,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
> >       if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
> >           (mask & CEPH_CAP_FILE_BUFFER))
> >               dirty |= I_DIRTY_DATASYNC;
> > -     __cap_delay_requeue(mdsc, ci, true);
> > +     __cap_delay_requeue(mdsc, ci);
> >       return dirty;
> >  }
> >
> > @@ -1835,8 +1814,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
> >   * versus held caps.  Release, flush, ack revoked caps to mds as
> >   * appropriate.
> >   *
> > - *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
> > - *    cap release further.
> >   *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
> >   *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
> >   *    further delay.
> > @@ -1855,17 +1832,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >       int mds = -1;   /* keep track of how far we've gone through i_caps list
> >                          to avoid an infinite loop on retry */
> >       struct rb_node *p;
> > -     int delayed = 0, sent = 0;
> > -     bool no_delay = flags & CHECK_CAPS_NODELAY;
> >       bool queue_invalidate = false;
> >       bool tried_invalidate = false;
> >
> > -     /* if we are unmounting, flush any unused caps immediately. */
> > -     if (mdsc->stopping)
> > -             no_delay = true;
> > -
> >       spin_lock(&ci->i_ceph_lock);
> > -
> >       if (ci->i_ceph_flags & CEPH_I_FLUSH)
> >               flags |= CHECK_CAPS_FLUSH;
> >
> > @@ -1911,14 +1881,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >       }
> >
> >       dout("check_caps %p file_want %s used %s dirty %s flushing %s"
> > -          " issued %s revoking %s retain %s %s%s%s\n", inode,
> > +          " issued %s revoking %s retain %s %s%s\n", inode,
> >            ceph_cap_string(file_wanted),
> >            ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
> >            ceph_cap_string(ci->i_flushing_caps),
> >            ceph_cap_string(issued), ceph_cap_string(revoking),
> >            ceph_cap_string(retain),
> >            (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
> > -          (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
> >            (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
> >
> >       /*
> > @@ -1926,7 +1895,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >        * have cached pages, but don't want them, then try to invalidate.
> >        * If we fail, it's because pages are locked.... try again later.
> >        */
> > -     if ((!no_delay || mdsc->stopping) &&
> > +     if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
> >           S_ISREG(inode->i_mode) &&
> >           !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
> >           inode->i_data.nrpages &&            /* have cached pages */
> > @@ -2006,21 +1975,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >               if ((cap->issued & ~retain) == 0)
> >                       continue;     /* nope, all good */
> >
> > -             if (no_delay)
> > -                     goto ack;
> > -
> > -             /* delay? */
> > -             if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
> > -                 time_before(jiffies, ci->i_hold_caps_max)) {
> > -                     dout(" delaying issued %s -> %s, wanted %s -> %s\n",
> > -                          ceph_cap_string(cap->issued),
> > -                          ceph_cap_string(cap->issued & retain),
> > -                          ceph_cap_string(cap->mds_wanted),
> > -                          ceph_cap_string(want));
> > -                     delayed++;
> > -                     continue;
> > -             }
> > -
> >  ack:
> >               if (session && session != cap->session) {
> >                       dout("oops, wrong session %p mutex\n", session);
> > @@ -2081,24 +2035,18 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >               }
> >
> >               mds = cap->mds;  /* remember mds, so we don't repeat */
> > -             sent++;
> >
> >               /* __send_cap drops i_ceph_lock */
> > -             delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
> > -                             cap_used, want, retain, flushing,
> > -                             flush_tid, oldest_flush_tid);
> > +             __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
> > +                        retain, flushing, flush_tid, oldest_flush_tid);
> >               goto retry; /* retake i_ceph_lock and restart our cap scan. */
> >       }
> >
> > -     if (list_empty(&ci->i_cap_delay_list)) {
> > -         if (delayed) {
> > -                 /* Reschedule delayed caps release if we delayed anything */
> > -                 __cap_delay_requeue(mdsc, ci, false);
> > -         } else if ((file_wanted & ~CEPH_CAP_PIN) &&
> > -                     !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> > -                 /* periodically re-calculate caps wanted by open files */
> > -                 __cap_delay_requeue(mdsc, ci, true);
> > -         }
> > +     /* periodically re-calculate caps wanted by open files */
> > +     if (list_empty(&ci->i_cap_delay_list) &&
> > +         (file_wanted & ~CEPH_CAP_PIN) &&
> > +         !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> > +             __cap_delay_requeue(mdsc, ci);
> >       }
> >
> >       spin_unlock(&ci->i_ceph_lock);
> > @@ -2128,7 +2076,6 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
> >  retry_locked:
> >       if (ci->i_dirty_caps && ci->i_auth_cap) {
> >               struct ceph_cap *cap = ci->i_auth_cap;
> > -             int delayed;
> >
> >               if (session != cap->session) {
> >                       spin_unlock(&ci->i_ceph_lock);
> > @@ -2157,18 +2104,10 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
> >                                                &oldest_flush_tid);
> >
> >               /* __send_cap drops i_ceph_lock */
> > -             delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
> > -                                  CEPH_CLIENT_CAPS_SYNC,
> > -                                  __ceph_caps_used(ci),
> > -                                  __ceph_caps_wanted(ci),
> > -                                  (cap->issued | cap->implemented),
> > -                                  flushing, flush_tid, oldest_flush_tid);
> > -
> > -             if (delayed) {
> > -                     spin_lock(&ci->i_ceph_lock);
> > -                     __cap_delay_requeue(mdsc, ci, true);
> > -                     spin_unlock(&ci->i_ceph_lock);
> > -             }
> > +             __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
> > +                        __ceph_caps_used(ci), __ceph_caps_wanted(ci),
> > +                        (cap->issued | cap->implemented),
> > +                        flushing, flush_tid, oldest_flush_tid);
> >       } else {
> >               if (!list_empty(&ci->i_cap_flush_list)) {
> >                       struct ceph_cap_flush *cf =
> > @@ -2368,22 +2307,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
> >               if (cf->caps) {
> >                       dout("kick_flushing_caps %p cap %p tid %llu %s\n",
> >                            inode, cap, cf->tid, ceph_cap_string(cf->caps));
> > -                     ci->i_ceph_flags |= CEPH_I_NODELAY;
> > -
> > -                     ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
> > +                     __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
> >                                        (cf->tid < last_snap_flush ?
> >                                         CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
> >                                         __ceph_caps_used(ci),
> >                                         __ceph_caps_wanted(ci),
> >                                         (cap->issued | cap->implemented),
> >                                         cf->caps, cf->tid, oldest_flush_tid);
> > -                     if (ret) {
> > -                             pr_err("kick_flushing_caps: error sending "
> > -                                     "cap flush, ino (%llx.%llx) "
> > -                                     "tid %llu flushing %s\n",
> > -                                     ceph_vinop(inode), cf->tid,
> > -                                     ceph_cap_string(cf->caps));
> > -                     }
> >               } else {
> >                       struct ceph_cap_snap *capsnap =
> >                                       container_of(cf, struct ceph_cap_snap,
> > @@ -3001,7 +2931,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
> >       dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
> >            last ? " last" : "", put ? " put" : "");
> >
> > -     if (last && !flushsnaps)
> > +     if (last)
> >               ceph_check_caps(ci, 0, NULL);
> >       else if (flushsnaps)
> >               ceph_flush_snaps(ci, NULL);
> > @@ -3419,10 +3349,10 @@ static void handle_cap_grant(struct inode *inode,
> >               wake_up_all(&ci->i_cap_wq);
> >
> >       if (check_caps == 1)
> > -             ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
> > +             ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
> >                               session);
> >       else if (check_caps == 2)
> > -             ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
> > +             ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
> >       else
> >               mutex_unlock(&session->s_mutex);
> >  }
> > @@ -4097,7 +4027,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
> >  {
> >       struct inode *inode;
> >       struct ceph_inode_info *ci;
> > -     int flags = CHECK_CAPS_NODELAY;
> >
> >       dout("check_delayed_caps\n");
> >       while (1) {
> > @@ -4117,7 +4046,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
> >
> >               if (inode) {
> >                       dout("check_delayed_caps on %p\n", inode);
> > -                     ceph_check_caps(ci, flags, NULL);
> > +                     ceph_check_caps(ci, 0, NULL);
> >                       /* avoid calling iput_final() in tick thread */
> >                       ceph_async_iput(inode);
> >               }
> > @@ -4142,7 +4071,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
> >               ihold(inode);
> >               dout("flush_dirty_caps %p\n", inode);
> >               spin_unlock(&mdsc->cap_dirty_lock);
> > -             ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
> > +             ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
> >               iput(inode);
> >               spin_lock(&mdsc->cap_dirty_lock);
> >       }
> > @@ -4160,7 +4089,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
> >               ci->i_last_wr = now;
> >       /* queue periodic check */
> >       if (fmode && list_empty(&ci->i_cap_delay_list))
> > -             __cap_delay_requeue(mdsc, ci, true);
> > +             __cap_delay_requeue(mdsc, ci);
> >  }
> >
> >  void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
> > @@ -4209,7 +4138,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
> >       if (inode->i_nlink == 1) {
> >               drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
> >
> > -             ci->i_ceph_flags |= CEPH_I_NODELAY;
> >               if (__ceph_caps_dirty(ci)) {
> >                       struct ceph_mds_client *mdsc =
> >                               ceph_inode_to_client(inode)->mdsc;
> > @@ -4265,8 +4193,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
> >               if (force || (cap->issued & drop)) {
> >                       if (cap->issued & drop) {
> >                               int wanted = __ceph_caps_wanted(ci);
> > -                             if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
> > -                                     wanted |= cap->mds_wanted;
> >                               dout("encode_inode_release %p cap %p "
> >                                    "%s -> %s, wanted %s -> %s\n", inode, cap,
> >                                    ceph_cap_string(cap->issued),
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 84058d3c5685..1d76bdf1a1b9 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -1552,7 +1552,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >               if (dirty)
> >                       __mark_inode_dirty(inode, dirty);
> >               if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
> > -                     ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
> > +                     ceph_check_caps(ci, 0, NULL);
> >       }
> >
> >       dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
> > @@ -2129,12 +2129,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
> >
> >       if (endoff > size) {
> >               int caps_flags = 0;
> > -
> >               /* Let the MDS know about dst file size change */
> > -             if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
> > -                     caps_flags |= CHECK_CAPS_NODELAY;
> >               if (ceph_inode_set_size(dst_inode, endoff))
> >                       caps_flags |= CHECK_CAPS_AUTHONLY;
> > +             if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
> > +                     caps_flags |= CHECK_CAPS_AUTHONLY;
>
> Does ceph_quota_is_max_bytes_approaching have any side effects?
>
> It looks like that call can be pretty expensive (including a possible
> separate round trip to the MDS). If we don't need to call it for other
> reasons, then we should probably do:
>
>                 if (ceph_inode_set_size(dst_inode, dst_off) ||
>                     ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
>                         caps_flags |= CHECK_CAPS_AUTHONLY;
>
>

No side affect. I added this change to
https://github.com/ceph/ceph-client/tree/wip-fmode-timeout


> >               if (caps_flags)
> >                       ceph_check_caps(dst_ci, caps_flags, NULL);
> >       }
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index 0b0f503c84c3..5a8fa8a2d3cf 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -471,7 +471,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> >       ci->i_prealloc_cap_flush = NULL;
> >       INIT_LIST_HEAD(&ci->i_cap_flush_list);
> >       init_waitqueue_head(&ci->i_cap_wq);
> > -     ci->i_hold_caps_min = 0;
> >       ci->i_hold_caps_max = 0;
> >       INIT_LIST_HEAD(&ci->i_cap_delay_list);
> >       INIT_LIST_HEAD(&ci->i_cap_snaps);
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index d89478db8b24..e586cff3dfd5 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -170,9 +170,9 @@ struct ceph_cap {
> >       struct list_head caps_item;
> >  };
> >
> > -#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
> > -#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
> > -#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
> > +#define CHECK_CAPS_AUTHONLY   1  /* only check auth cap */
> > +#define CHECK_CAPS_FLUSH      2  /* flush any dirty caps */
> > +#define CHECK_CAPS_NOINVAL    4  /* don't invalidate pagecache */
> >
> >  struct ceph_cap_flush {
> >       u64 tid;
> > @@ -352,7 +352,6 @@ struct ceph_inode_info {
> >       struct ceph_cap_flush *i_prealloc_cap_flush;
> >       struct list_head i_cap_flush_list;
> >       wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
> > -     unsigned long i_hold_caps_min; /* jiffies */
> >       unsigned long i_hold_caps_max; /* jiffies */
> >       struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
> >       struct ceph_cap_reservation i_cap_migration_resv;
> > @@ -513,7 +512,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> >   * Ceph inode.
> >   */
> >  #define CEPH_I_DIR_ORDERED   (1 << 0)  /* dentries in dir are ordered */
> > -#define CEPH_I_NODELAY               (1 << 1)  /* do not delay cap release */
> >  #define CEPH_I_FLUSH         (1 << 2)  /* do not delay flush of dirty metadata */
> >  #define CEPH_I_POOL_PERM     (1 << 3)  /* pool rd/wr bits are valid */
> >  #define CEPH_I_POOL_RD               (1 << 4)  /* can read from pool */
>
> --
> Jeff Layton <jlayton@kernel.org>
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 0/4] ceph: don't request caps for idle open files
       [not found]     ` <68c33106a754b443f2dd938f2d8eed6da19136dc.camel@kernel.org>
@ 2020-02-27 14:58       ` Jeff Layton
       [not found]         ` <CAAM7YA=dnNBXkijDUDV6NWo3A=uak1i-b=KpaZ4-CG=9kENVmA@mail.gmail.com>
  0 siblings, 1 reply; 20+ messages in thread
From: Jeff Layton @ 2020-02-27 14:58 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel

On Thu, 2020-02-27 at 08:36 -0500, Jeff Layton wrote:
> On Thu, 2020-02-27 at 20:07 +0800, Yan, Zheng wrote:
> > On Fri, Feb 21, 2020 at 10:59 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> > > > This series make cephfs client not request caps for open files that
> > > > idle for a long time. For the case that one active client and multiple
> > > > standby clients open the same file, this increase the possibility that
> > > > mds issues exclusive caps to the active client.
> > > > 
> > > > Yan, Zheng (4):
> > > >   ceph: always renew caps if mds_wanted is insufficient
> > > >   ceph: consider inode's last read/write when calculating wanted caps
> > > >   ceph: simplify calling of ceph_get_fmode()
> > > >   ceph: remove delay check logic from ceph_check_caps()
> > > > 
> > > >  fs/ceph/caps.c               | 324 +++++++++++++++--------------------
> > > >  fs/ceph/file.c               |  39 ++---
> > > >  fs/ceph/inode.c              |  19 +-
> > > >  fs/ceph/ioctl.c              |   2 +
> > > >  fs/ceph/mds_client.c         |   5 -
> > > >  fs/ceph/super.h              |  35 ++--
> > > >  include/linux/ceph/ceph_fs.h |   1 +
> > > >  7 files changed, 188 insertions(+), 237 deletions(-)
> > > > 
> > > 
> > > Testing with this series and xfstests is throwing softlockups. Still
> > > looking at the cause:
> > > 
> > > [  251.385573] watchdog: BUG: soft lockup - CPU#7 stuck for 23s! [fsstress:7113]
> > > [  251.387105] Modules linked in: ceph(OE) libceph(E) ip6t_REJECT(E) nf_reject_ipv6(E) ip6t_rpfilter(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_conntrack(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) ip_set(E) nfnetlink(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) cachefiles(E) fscache(E) sunrpc(E) pcspkr(E) joydev(E) i2c_piix4(E) virtio_balloon(E) crct10dif_pclmul(E) crc32_pclmul(E) ghash_clmulni_intel(E) ip_tables(E) xfs(E) libcrc32c(E) rfkill(E) qxl(E) drm_ttm_helper(E) crc32c_intel(E) ttm(E) drm_kms_helper(E) cec(E) serio_raw(E) virtio_net(E) net_failover(E)
  virtio_console(E) virtio_blk(E) failover(E) drm(E) ata_generic(E) floppy(E) pata_acpi(E) qemu_fw_cfg(E)
> > > [  251.402300] CPU: 7 PID: 7113 Comm: fsstress Kdump: loaded Tainted: G           OE     5.6.0-rc1+ #12
> > > [  251.404351] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
> > > [  251.407716] RIP: 0010:ceph_get_caps+0x108/0x5e0 [ceph]
> > > [  251.409154] Code: 80 cc 02 85 d2 44 89 f2 0f 45 d8 41 89 d8 e8 4f f8 ff ff 89 c1 83 f8 f5 74 ba 85 c0 0f 84 55 02 00 00 48 8b 74 24 50 f6 06 02 <74> 11 48 8b 44 24 48 8b 40 2c 39 46 1c 0f 85 97 04 00 00 85 c9 0f
> > > [  251.413302] RSP: 0018:ffffb72c01567a10 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13
> > > [  251.414827] RAX: 00000000ffffff8c RBX: 0000000000000002 RCX: 00000000ffffff8c
> > > [  251.416303] RDX: ffff8a1fd45b4468 RSI: ffff8a2228fd2c08 RDI: ffff8a1fd45b42d0
> > > [  251.417778] RBP: ffffb72c01567b00 R08: ffff8a2227fcb000 R09: ffffb72c01567a6c
> > > [  251.419265] R10: 0000000000000001 R11: 0000000000000000 R12: 00000000160337c1
> > > [  251.420811] R13: 0000000000001000 R14: 0000000000002000 R15: ffff8a1fd45b45f8
> > > [  251.422355] FS:  00007fcdcc9ebf40(0000) GS:ffff8a222fbc0000(0000) knlGS:0000000000000000
> > > [  251.423962] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > [  251.425294] CR2: 000000000162e000 CR3: 00000001f0b18000 CR4: 00000000003406e0
> > > [  251.426785] Call Trace:
> > > [  251.427739]  ? __vfs_getxattr+0x53/0x70
> > > [  251.428835]  ? generic_update_time+0x9d/0xc0
> > > [  251.429978]  ? file_update_time+0xdd/0x120
> > > [  251.431106]  ceph_write_iter+0x27b/0xc30 [ceph]
> > > [  251.432285]  ? ceph_read_iter+0x3b3/0xd60 [ceph]
> > > [  251.433463]  ? _cond_resched+0x15/0x30
> > > [  251.434533]  ? do_iter_readv_writev+0x158/0x1d0
> > > [  251.435681]  do_iter_readv_writev+0x158/0x1d0
> > > [  251.436825]  do_iter_write+0x7d/0x190
> > > [  251.437867]  iter_file_splice_write+0x26a/0x3c0
> > > [  251.439059]  direct_splice_actor+0x35/0x40
> > > [  251.440144]  splice_direct_to_actor+0x102/0x250
> > > [  251.441274]  ? generic_pipe_buf_nosteal+0x10/0x10
> > > [  251.442444]  do_splice_direct+0x8b/0xd0
> > > [  251.443491]  generic_copy_file_range+0x32/0x40
> > > [  251.444596]  vfs_copy_file_range+0x2eb/0x310
> > > [  251.445691]  ? __do_sys_newfstat+0x5a/0x70
> > > [  251.446764]  __x64_sys_copy_file_range+0xd6/0x210
> > > [  251.447933]  do_syscall_64+0x5b/0x1c0
> > > [  251.448972]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > > [  251.450188] RIP: 0033:0x7fcdccae91ed
> > > [  251.451236] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 6b 5c 0c 00 f7 d8 64 89 01 48
> > > [  251.455096] RSP: 002b:00007ffcbac62e28 EFLAGS: 00000246 ORIG_RAX: 0000000000000146
> > > [  251.456664] RAX: ffffffffffffffda RBX: 0000000124e3181d RCX: 00007fcdccae91ed
> > > [  251.458153] RDX: 0000000000000004 RSI: 00007ffcbac62e60 RDI: 0000000000000003
> > > [  251.459626] RBP: 0000000000000003 R08: 000000000001c50b R09: 0000000000000000
> > > [  251.461147] R10: 00007ffcbac62e68 R11: 0000000000000246 R12: 0000000000000088
> > > [  251.462680] R13: 0000000000000004 R14: 000000000001c50b R15: 000000001602447d
> > > 
> > > 
> > 
> > I have tried fsstress for days. found a few other issue, but still
> > can't reproduce this. Please try branch
> > https://github.com/ceph/ceph-client/tree/wip-fmode-timeout
> > 
> 
> Still getting softlockups:
> 
> [  560.235608] watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [fsstress:10889]
> [  560.237143] Modules linked in: rpcsec_gss_krb5(E) auth_rpcgss(E) nfsv4(E) dns_resolver(E) nfs(E) lockd(E) grace(E) ceph(E) libceph(E) ip6t_REJECT(E) nf_reject_ipv6(E) ip6t_rpfilter(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_conntrack(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) ip_set(E) nfnetlink(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) cachefiles(E) fscache(E) sunrpc(E) joydev(E) i2c_piix4(E) crct10dif_pclmul(E) pcspkr(E) crc32_pclmul(E) virtio_balloon(E) ghash_clmulni_intel(E) ip_tables(E) xfs(E) libcrc32c(E) rfkill(E) qxl(E) drm_ttm_helper(E) ttm(E) drm_
 kms_helper(E) crc32c_intel(E) cec(E) virtio_console(E) serio_raw(E) virtio_net(E) net_failover(E) ata_generic(E) virtio_blk(E) failover(E) drm(E) floppy(E) pata_acpi(E) qemu_fw_cfg(E)
> [  560.252015] CPU: 1 PID: 10889 Comm: fsstress Kdump: loaded Tainted: G            E     5.6.0-rc1+ #14
> [  560.253778] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
> [  560.256696] RIP: 0010:rb_first+0xf/0x20
> [  560.257857] Code: 4d 89 e7 4c 89 ea 4d 89 e5 49 89 c4 e9 80 fe ff ff 66 2e 0f 1f 84 00 00 00 00 00 48 8b 07 48 85 c0 74 10 49 89 c0 48 8b 40 10 <48> 85 c0 75 f4 4c 89 c0 c3 45 31 c0 eb f7 0f 1f 00 48 8b 07 48 85
> [  560.261524] RSP: 0018:ffffb59a015a7808 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
> [  560.263184] RAX: 0000000000000000 RBX: ffffb59a015a78cc RCX: 0000000000000000
> [  560.264740] RDX: 0000000000000000 RSI: ffffb59a015a78cc RDI: ffff9fbd70442298
> [  560.266286] RBP: 0000000000000d01 R08: ffff9fbd72629da0 R09: 0000000000000000
> [  560.267843] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9fbd70442160
> [  560.269424] R13: 0000000000000000 R14: ffff9fbd70442160 R15: 0000000000000d01
> [  560.271070] FS:  00007f37eff21f40(0000) GS:ffff9fbfafa40000(0000) knlGS:0000000000000000
> [  560.272833] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  560.274238] CR2: 00007f37ef7ffff0 CR3: 00000001f288c000 CR4: 00000000003406e0
> [  560.275842] Call Trace:
> [  560.276873]  __ceph_caps_issued+0x33/0xc0 [ceph]
> [  560.278206]  ceph_check_caps+0x105/0x9c0 [ceph]
> [  560.279450]  ? __cap_is_valid+0x1c/0xb0 [ceph]
> [  560.280730]  ? __ceph_caps_issued+0x6d/0xc0 [ceph]
> [  560.282132]  ? __ceph_caps_mds_wanted+0x46/0x90 [ceph]
> [  560.283672]  ceph_renew_caps+0x98/0x1b0 [ceph]
> [  560.285305]  ceph_get_caps+0x2d6/0x5e0 [ceph]
> [  560.286967]  ? __vfs_getxattr+0x53/0x70
> [  560.288162]  ? generic_update_time+0x9d/0xc0
> [  560.289353]  ? file_update_time+0xdd/0x120
> [  560.290552]  ceph_write_iter+0x27b/0xc30 [ceph]
> [  560.291764]  ? _cond_resched+0x15/0x30
> [  560.292884]  ? __inode_security_revalidate+0x62/0x70
> [  560.294129]  ? do_iter_readv_writev+0x158/0x1d0
> [  560.295319]  do_iter_readv_writev+0x158/0x1d0
> [  560.296476]  do_iter_write+0x7d/0x190
> [  560.297556]  iter_file_splice_write+0x26a/0x3c0
> [  560.298809]  direct_splice_actor+0x35/0x40
> [  560.300028]  splice_direct_to_actor+0x102/0x250
> [  560.301236]  ? generic_pipe_buf_nosteal+0x10/0x10
> [  560.302453]  do_splice_direct+0x8b/0xd0
> [  560.303610]  generic_copy_file_range+0x32/0x40
> [  560.304850]  ceph_copy_file_range+0xb2/0x6e0 [ceph]
> [  560.306085]  ? _cond_resched+0x15/0x30
> [  560.307148]  ? __inode_security_revalidate+0x62/0x70
> [  560.308357]  vfs_copy_file_range+0x2eb/0x310
> [  560.309479]  ? __do_sys_newfstat+0x5a/0x70
> [  560.310565]  __x64_sys_copy_file_range+0xd6/0x210
> [  560.311711]  do_syscall_64+0x5b/0x1c0
> [  560.312705]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [  560.313855] RIP: 0033:0x7f37f001f1ed
> [  560.314818] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 6b 5c 0c 00 f7 d8 64 89 01 48
> [  560.319060] RSP: 002b:00007fff4263e4a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000146
> [  560.320845] RAX: ffffffffffffffda RBX: 0000000100163b5b RCX: 00007f37f001f1ed
> [  560.322389] RDX: 0000000000000004 RSI: 00007fff4263e4e0 RDI: 0000000000000003
> [  560.323756] RBP: 0000000000000003 R08: 0000000000001065 R09: 0000000000000000
> [  560.325286] R10: 00007fff4263e4e8 R11: 0000000000000246 R12: 000000000000014e
> [  560.327186] R13: 0000000000000004 R14: 0000000000001065 R15: 00000000787d3c00
> 
> This is testing against a ceph build from this morning's master branch
> (53febd478dfc) + a few userland client patches (nothing that should
> affect the mds).
> 
> I'm attaching my .config and the dmesg after enabling dynamic debug in
> try_get_cap_refs, ceph_check_caps, and ceph_renew_caps.

My bad...I had the wrong kernel module installed when I was testing
this. I'm rerunning the test now with the correct one and it seems to be
behaving better.

I'm guessing one of your other fixes might have fixed this as well. I'll
let you know if it does lock up, but you can disregard my earlier mail
for now.

Thanks,
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 0/4] ceph: don't request caps for idle open files
       [not found]         ` <CAAM7YA=dnNBXkijDUDV6NWo3A=uak1i-b=KpaZ4-CG=9kENVmA@mail.gmail.com>
@ 2020-02-27 17:49           ` Jeff Layton
  0 siblings, 0 replies; 20+ messages in thread
From: Jeff Layton @ 2020-02-27 17:49 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel

On Thu, 2020-02-27 at 23:26 +0800, Yan, Zheng wrote:
> On Thu, Feb 27, 2020 at 10:58 PM Jeff Layton <jlayton@kernel.org> wrote:
> > On Thu, 2020-02-27 at 08:36 -0500, Jeff Layton wrote:
> > > On Thu, 2020-02-27 at 20:07 +0800, Yan, Zheng wrote:
> > > > On Fri, Feb 21, 2020 at 10:59 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > On Fri, 2020-02-21 at 21:16 +0800, Yan, Zheng wrote:
> > > > > > This series make cephfs client not request caps for open files that
> > > > > > idle for a long time. For the case that one active client and multiple
> > > > > > standby clients open the same file, this increase the possibility that
> > > > > > mds issues exclusive caps to the active client.
> > > > > > 
> > > > > > Yan, Zheng (4):
> > > > > >   ceph: always renew caps if mds_wanted is insufficient
> > > > > >   ceph: consider inode's last read/write when calculating wanted caps
> > > > > >   ceph: simplify calling of ceph_get_fmode()
> > > > > >   ceph: remove delay check logic from ceph_check_caps()
> > > > > > 
> > > > > >  fs/ceph/caps.c               | 324 +++++++++++++++--------------------
> > > > > >  fs/ceph/file.c               |  39 ++---
> > > > > >  fs/ceph/inode.c              |  19 +-
> > > > > >  fs/ceph/ioctl.c              |   2 +
> > > > > >  fs/ceph/mds_client.c         |   5 -
> > > > > >  fs/ceph/super.h              |  35 ++--
> > > > > >  include/linux/ceph/ceph_fs.h |   1 +
> > > > > >  7 files changed, 188 insertions(+), 237 deletions(-)
> > > > > > 
> > > > > 
> > > > > Testing with this series and xfstests is throwing softlockups. Still
> > > > > looking at the cause:
> > > > > 
> > > > > [  251.385573] watchdog: BUG: soft lockup - CPU#7 stuck for 23s! [fsstress:7113]
> > > > > [  251.387105] Modules linked in: ceph(OE) libceph(E) ip6t_REJECT(E) nf_reject_ipv6(E) ip6t_rpfilter(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_conntrack(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) ip_set(E) nfnetlink(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) cachefiles(E) fscache(E) sunrpc(E) pcspkr(E) joydev(E) i2c_piix4(E) virtio_balloon(E) crct10dif_pclmul(E) crc32_pclmul(E) ghash_clmulni_intel(E) ip_tables(E) xfs(E) libcrc32c(E) rfkill(E) qxl(E) drm_ttm_helper(E) crc32c_intel(E) ttm(E) drm_kms_helper(E) cec(E) serio_raw(E) virtio_net(E) net_failove
 r(E) virtio_console(E) virtio_blk(E) failover(E) drm(E) ata_generic(E) floppy(E) pata_acpi(E) qemu_fw_cfg(E)
> > > > > [  251.402300] CPU: 7 PID: 7113 Comm: fsstress Kdump: loaded Tainted: G           OE     5.6.0-rc1+ #12
> > > > > [  251.404351] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
> > > > > [  251.407716] RIP: 0010:ceph_get_caps+0x108/0x5e0 [ceph]
> > > > > [  251.409154] Code: 80 cc 02 85 d2 44 89 f2 0f 45 d8 41 89 d8 e8 4f f8 ff ff 89 c1 83 f8 f5 74 ba 85 c0 0f 84 55 02 00 00 48 8b 74 24 50 f6 06 02 <74> 11 48 8b 44 24 48 8b 40 2c 39 46 1c 0f 85 97 04 00 00 85 c9 0f
> > > > > [  251.413302] RSP: 0018:ffffb72c01567a10 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13
> > > > > [  251.414827] RAX: 00000000ffffff8c RBX: 0000000000000002 RCX: 00000000ffffff8c
> > > > > [  251.416303] RDX: ffff8a1fd45b4468 RSI: ffff8a2228fd2c08 RDI: ffff8a1fd45b42d0
> > > > > [  251.417778] RBP: ffffb72c01567b00 R08: ffff8a2227fcb000 R09: ffffb72c01567a6c
> > > > > [  251.419265] R10: 0000000000000001 R11: 0000000000000000 R12: 00000000160337c1
> > > > > [  251.420811] R13: 0000000000001000 R14: 0000000000002000 R15: ffff8a1fd45b45f8
> > > > > [  251.422355] FS:  00007fcdcc9ebf40(0000) GS:ffff8a222fbc0000(0000) knlGS:0000000000000000
> > > > > [  251.423962] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > > [  251.425294] CR2: 000000000162e000 CR3: 00000001f0b18000 CR4: 00000000003406e0
> > > > > [  251.426785] Call Trace:
> > > > > [  251.427739]  ? __vfs_getxattr+0x53/0x70
> > > > > [  251.428835]  ? generic_update_time+0x9d/0xc0
> > > > > [  251.429978]  ? file_update_time+0xdd/0x120
> > > > > [  251.431106]  ceph_write_iter+0x27b/0xc30 [ceph]
> > > > > [  251.432285]  ? ceph_read_iter+0x3b3/0xd60 [ceph]
> > > > > [  251.433463]  ? _cond_resched+0x15/0x30
> > > > > [  251.434533]  ? do_iter_readv_writev+0x158/0x1d0
> > > > > [  251.435681]  do_iter_readv_writev+0x158/0x1d0
> > > > > [  251.436825]  do_iter_write+0x7d/0x190
> > > > > [  251.437867]  iter_file_splice_write+0x26a/0x3c0
> > > > > [  251.439059]  direct_splice_actor+0x35/0x40
> > > > > [  251.440144]  splice_direct_to_actor+0x102/0x250
> > > > > [  251.441274]  ? generic_pipe_buf_nosteal+0x10/0x10
> > > > > [  251.442444]  do_splice_direct+0x8b/0xd0
> > > > > [  251.443491]  generic_copy_file_range+0x32/0x40
> > > > > [  251.444596]  vfs_copy_file_range+0x2eb/0x310
> > > > > [  251.445691]  ? __do_sys_newfstat+0x5a/0x70
> > > > > [  251.446764]  __x64_sys_copy_file_range+0xd6/0x210
> > > > > [  251.447933]  do_syscall_64+0x5b/0x1c0
> > > > > [  251.448972]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > > > > [  251.450188] RIP: 0033:0x7fcdccae91ed
> > > > > [  251.451236] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 6b 5c 0c 00 f7 d8 64 89 01 48
> > > > > [  251.455096] RSP: 002b:00007ffcbac62e28 EFLAGS: 00000246 ORIG_RAX: 0000000000000146
> > > > > [  251.456664] RAX: ffffffffffffffda RBX: 0000000124e3181d RCX: 00007fcdccae91ed
> > > > > [  251.458153] RDX: 0000000000000004 RSI: 00007ffcbac62e60 RDI: 0000000000000003
> > > > > [  251.459626] RBP: 0000000000000003 R08: 000000000001c50b R09: 0000000000000000
> > > > > [  251.461147] R10: 00007ffcbac62e68 R11: 0000000000000246 R12: 0000000000000088
> > > > > [  251.462680] R13: 0000000000000004 R14: 000000000001c50b R15: 000000001602447d
> > > > > 
> > > > > 
> > > > 
> > > > I have tried fsstress for days. found a few other issue, but still
> > > > can't reproduce this. Please try branch
> > > > https://github.com/ceph/ceph-client/tree/wip-fmode-timeout
> > > > 
> > > 
> > > Still getting softlockups:
> > > 
> > > [  560.235608] watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [fsstress:10889]
> > > [  560.237143] Modules linked in: rpcsec_gss_krb5(E) auth_rpcgss(E) nfsv4(E) dns_resolver(E) nfs(E) lockd(E) grace(E) ceph(E) libceph(E) ip6t_REJECT(E) nf_reject_ipv6(E) ip6t_rpfilter(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_conntrack(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) iptable_mangle(E) iptable_raw(E) iptable_security(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) ip_set(E) nfnetlink(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) cachefiles(E) fscache(E) sunrpc(E) joydev(E) i2c_piix4(E) crct10dif_pclmul(E) pcspkr(E) crc32_pclmul(E) virtio_balloon(E) ghash_clmulni_intel(E) ip_tables(E) xfs(E) libcrc32c(E) rfkill(E) qxl(E) drm_ttm_helper(E) ttm(E) 
 drm_kms_helper(E) crc32c_intel(E) cec(E) virtio_console(E) serio_raw(E) virtio_net(E) net_failover(E) ata_generic(E) virtio_blk(E) failover(E) drm(E) floppy(E) pata_acpi(E) qemu_fw_cfg(E)
> > > [  560.252015] CPU: 1 PID: 10889 Comm: fsstress Kdump: loaded Tainted: G            E     5.6.0-rc1+ #14
> > > [  560.253778] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
> > > [  560.256696] RIP: 0010:rb_first+0xf/0x20
> > > [  560.257857] Code: 4d 89 e7 4c 89 ea 4d 89 e5 49 89 c4 e9 80 fe ff ff 66 2e 0f 1f 84 00 00 00 00 00 48 8b 07 48 85 c0 74 10 49 89 c0 48 8b 40 10 <48> 85 c0 75 f4 4c 89 c0 c3 45 31 c0 eb f7 0f 1f 00 48 8b 07 48 85
> > > [  560.261524] RSP: 0018:ffffb59a015a7808 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
> > > [  560.263184] RAX: 0000000000000000 RBX: ffffb59a015a78cc RCX: 0000000000000000
> > > [  560.264740] RDX: 0000000000000000 RSI: ffffb59a015a78cc RDI: ffff9fbd70442298
> > > [  560.266286] RBP: 0000000000000d01 R08: ffff9fbd72629da0 R09: 0000000000000000
> > > [  560.267843] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9fbd70442160
> > > [  560.269424] R13: 0000000000000000 R14: ffff9fbd70442160 R15: 0000000000000d01
> > > [  560.271070] FS:  00007f37eff21f40(0000) GS:ffff9fbfafa40000(0000) knlGS:0000000000000000
> > > [  560.272833] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > [  560.274238] CR2: 00007f37ef7ffff0 CR3: 00000001f288c000 CR4: 00000000003406e0
> > > [  560.275842] Call Trace:
> > > [  560.276873]  __ceph_caps_issued+0x33/0xc0 [ceph]
> > > [  560.278206]  ceph_check_caps+0x105/0x9c0 [ceph]
> > > [  560.279450]  ? __cap_is_valid+0x1c/0xb0 [ceph]
> > > [  560.280730]  ? __ceph_caps_issued+0x6d/0xc0 [ceph]
> > > [  560.282132]  ? __ceph_caps_mds_wanted+0x46/0x90 [ceph]
> > > [  560.283672]  ceph_renew_caps+0x98/0x1b0 [ceph]
> > > [  560.285305]  ceph_get_caps+0x2d6/0x5e0 [ceph]
> > > [  560.286967]  ? __vfs_getxattr+0x53/0x70
> > > [  560.288162]  ? generic_update_time+0x9d/0xc0
> > > [  560.289353]  ? file_update_time+0xdd/0x120
> > > [  560.290552]  ceph_write_iter+0x27b/0xc30 [ceph]
> > > [  560.291764]  ? _cond_resched+0x15/0x30
> > > [  560.292884]  ? __inode_security_revalidate+0x62/0x70
> > > [  560.294129]  ? do_iter_readv_writev+0x158/0x1d0
> > > [  560.295319]  do_iter_readv_writev+0x158/0x1d0
> > > [  560.296476]  do_iter_write+0x7d/0x190
> > > [  560.297556]  iter_file_splice_write+0x26a/0x3c0
> > > [  560.298809]  direct_splice_actor+0x35/0x40
> > > [  560.300028]  splice_direct_to_actor+0x102/0x250
> > > [  560.301236]  ? generic_pipe_buf_nosteal+0x10/0x10
> > > [  560.302453]  do_splice_direct+0x8b/0xd0
> > > [  560.303610]  generic_copy_file_range+0x32/0x40
> > > [  560.304850]  ceph_copy_file_range+0xb2/0x6e0 [ceph]
> > > [  560.306085]  ? _cond_resched+0x15/0x30
> > > [  560.307148]  ? __inode_security_revalidate+0x62/0x70
> > > [  560.308357]  vfs_copy_file_range+0x2eb/0x310
> > > [  560.309479]  ? __do_sys_newfstat+0x5a/0x70
> > > [  560.310565]  __x64_sys_copy_file_range+0xd6/0x210
> > > [  560.311711]  do_syscall_64+0x5b/0x1c0
> > > [  560.312705]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > > [  560.313855] RIP: 0033:0x7f37f001f1ed
> > > [  560.314818] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 6b 5c 0c 00 f7 d8 64 89 01 48
> > > [  560.319060] RSP: 002b:00007fff4263e4a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000146
> > > [  560.320845] RAX: ffffffffffffffda RBX: 0000000100163b5b RCX: 00007f37f001f1ed
> > > [  560.322389] RDX: 0000000000000004 RSI: 00007fff4263e4e0 RDI: 0000000000000003
> > > [  560.323756] RBP: 0000000000000003 R08: 0000000000001065 R09: 0000000000000000
> > > [  560.325286] R10: 00007fff4263e4e8 R11: 0000000000000246 R12: 000000000000014e
> > > [  560.327186] R13: 0000000000000004 R14: 0000000000001065 R15: 00000000787d3c00
> > > 
> > > This is testing against a ceph build from this morning's master branch
> > > (53febd478dfc) + a few userland client patches (nothing that should
> > > affect the mds).
> > > 
> > > I'm attaching my .config and the dmesg after enabling dynamic debug in
> > > try_get_cap_refs, ceph_check_caps, and ceph_renew_caps.
> > 
> > My bad...I had the wrong kernel module installed when I was testing
> > this. I'm rerunning the test now with the correct one and it seems to be
> > behaving better.
> > 
> > I'm guessing one of your other fixes might have fixed this as well. I'll
> > let you know if it does lock up, but you can disregard my earlier mail
> > for now.
> > 
> 
> FYI: I update https://github.com/ceph/ceph-client/tree/wip-fmode-timeout.
> remove round_jiffies in __ceph_cap_file_wanted(). round_jiffies()
> return original value for past jiffies. So the change shouldn't affect
> anything.
> 

(re-cc'ing the list)

I tested your latest branch with a few cap-related cleanups of my own
that I'll be posting soon. I'm no longer seeing softlockups.

Sorry for the false alarm earlier! This is looking good now, so I'd like
to see this merged soon. I'll wait for you to do a v3 post once you
think it's ready.

Thanks,
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2020-02-27 17:49 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-21 13:16 [PATCH v2 0/4] ceph: don't request caps for idle open files Yan, Zheng
2020-02-21 13:16 ` [PATCH v2 1/4] ceph: always renew caps if mds_wanted is insufficient Yan, Zheng
2020-02-21 17:17   ` Jeff Layton
2020-02-24  3:17     ` Yan, Zheng
2020-02-24 15:17       ` Jeff Layton
2020-02-21 13:16 ` [PATCH v2 2/4] ceph: consider inode's last read/write when calculating wanted caps Yan, Zheng
2020-02-21 13:52   ` Jeff Layton
2020-02-21 14:27   ` Jeff Layton
2020-02-21 14:35     ` Jeff Layton
2020-02-24  7:09       ` Yan, Zheng
2020-02-21 13:16 ` [PATCH v2 3/4] ceph: simplify calling of ceph_get_fmode() Yan, Zheng
2020-02-21 13:55   ` Jeff Layton
2020-02-21 13:16 ` [PATCH v2 4/4] ceph: remove delay check logic from ceph_check_caps() Yan, Zheng
2020-02-27 11:43   ` Jeff Layton
2020-02-27 12:08     ` Yan, Zheng
2020-02-21 14:08 ` [PATCH v2 0/4] ceph: don't request caps for idle open files Jeff Layton
2020-02-21 14:57 ` Jeff Layton
2020-02-27 12:07   ` Yan, Zheng
     [not found]     ` <68c33106a754b443f2dd938f2d8eed6da19136dc.camel@kernel.org>
2020-02-27 14:58       ` Jeff Layton
     [not found]         ` <CAAM7YA=dnNBXkijDUDV6NWo3A=uak1i-b=KpaZ4-CG=9kENVmA@mail.gmail.com>
2020-02-27 17:49           ` Jeff Layton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.