ceph-devel.vger.kernel.org archive mirror
* [PATCH v2 0/4] flush the mdlog before waiting on unsafe reqs
@ 2021-07-05  1:22 xiubli
  2021-07-05  1:22 ` [PATCH v2 1/4] ceph: make ceph_create_session_msg a global symbol xiubli
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: xiubli @ 2021-07-05  1:22 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

For client requests that will get both unsafe and safe replies from
the MDS daemons, the MDS side won't flush the mdlog (journal log)
immediately, because it considers that unnecessary. That's true for
most cases, but not all; the fsync request is one exception. An fsync
will wait until all requests that have only received unsafe replies
are safely replied to.

Normally, when multiple threads or clients are running, the whole
mdlog in the MDS daemons gets flushed in time, since some request
will trigger the mdlog submit thread. So we usually don't see normal
operations getting stuck for long. But when only one client with a
single thread is running, the stall can be obvious, and in the worst
case it must wait up to 5 seconds for the mdlog to be flushed by the
MDS's periodic tick thread.
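
The stall can be observed with a userspace sketch along the following
lines (illustrative only, not a rigorous reproducer; the file names and
loop count are arbitrary): it times fsync(2) right after creating a
file on an otherwise idle, single-client CephFS mount, where each
create gets an unsafe reply that the fsync then has to wait on.

/* fsync_lat.c - hypothetical reproducer sketch, not part of this series.
 * Build: cc -O2 -o fsync_lat fsync_lat.c
 * Run with a directory on an otherwise idle CephFS mount as the only
 * argument and watch the per-call fsync latency. */
#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static double now(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(int argc, char **argv)
{
	char path[4096];
	int i;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <dir-on-cephfs>\n", argv[0]);
		return 1;
	}
	for (i = 0; i < 10; i++) {
		double t0;
		int fd;

		snprintf(path, sizeof(path), "%s/file%d", argv[1], i);
		/* the create is an MDS request that gets an unsafe reply first */
		fd = open(path, O_CREAT | O_WRONLY, 0644);
		if (fd < 0)
			return 1;
		t0 = now();
		/* fsync waits here until the unsafe request is safely replied */
		fsync(fd);
		printf("fsync %d took %.3f s\n", i, now() - t0);
		close(fd);
	}
	return 0;
}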

This series manually triggers an mdlog flush in all the relevant and
auth MDSes just before waiting for the unsafe requests to finish.


Changes in V2:
- send the mdlog flush request only to the MDSes relevant to the
unsafe requests and to the auth MDS, instead of to all MDSes.
- rename the subjects of the first two commits.
- fix the log messages.
- remove the feature bits fixing patch.
- fix some comments.
- remove the flush_mdlog() wrapper.
- update ceph_session_op_name() for the new _REQUEST_FLUSH_MDLOG op.



Xiubo Li (4):
  ceph: make ceph_create_session_msg a global symbol
  ceph: make iterate_sessions a global symbol
  ceph: flush mdlog before umounting
  ceph: flush the mdlog before waiting on unsafe reqs

 fs/ceph/caps.c               | 104 ++++++++++++++++++++++++++---------
 fs/ceph/mds_client.c         |  90 ++++++++++++++++++++++--------
 fs/ceph/mds_client.h         |   5 ++
 fs/ceph/strings.c            |   1 +
 include/linux/ceph/ceph_fs.h |   1 +
 5 files changed, 152 insertions(+), 49 deletions(-)

-- 
2.27.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v2 1/4] ceph: make ceph_create_session_msg a global symbol
  2021-07-05  1:22 [PATCH v2 0/4] flush the mdlog before waiting on unsafe reqs xiubli
@ 2021-07-05  1:22 ` xiubli
  2021-07-05  1:22 ` [PATCH v2 2/4] ceph: make iterate_sessions " xiubli
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 9+ messages in thread
From: xiubli @ 2021-07-05  1:22 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c | 16 +++++++++-------
 fs/ceph/mds_client.h |  1 +
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 2d7dcd295bb9..3c9c58a6e75f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1150,7 +1150,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 /*
  * session messages
  */
-static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
 {
 	struct ceph_msg *msg;
 	struct ceph_mds_session_head *h;
@@ -1158,7 +1158,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
 			   false);
 	if (!msg) {
-		pr_err("create_session_msg ENOMEM creating msg\n");
+		pr_err("ENOMEM creating session %s msg\n",
+		       ceph_session_op_name(op));
 		return NULL;
 	}
 	h = msg->front.iov_base;
@@ -1289,7 +1290,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
 			   GFP_NOFS, false);
 	if (!msg) {
-		pr_err("create_session_msg ENOMEM creating msg\n");
+		pr_err("ENOMEM creating session open msg\n");
 		return ERR_PTR(-ENOMEM);
 	}
 	p = msg->front.iov_base;
@@ -1801,8 +1802,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 
 	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
 		ceph_mds_state_name(state));
-	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
-				 ++session->s_renew_seq);
+	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+				      ++session->s_renew_seq);
 	if (!msg)
 		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
@@ -1816,7 +1817,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
 
 	dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
 	     session->s_mds, ceph_session_state_name(session->s_state), seq);
-	msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+	msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
 	if (!msg)
 		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
@@ -1868,7 +1869,8 @@ static int request_close_session(struct ceph_mds_session *session)
 	dout("request_close_session mds%d state %s seq %lld\n",
 	     session->s_mds, ceph_session_state_name(session->s_state),
 	     session->s_seq);
-	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
+				      session->s_seq);
 	if (!msg)
 		return -ENOMEM;
 	ceph_con_send(&session->s_con, msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bf99c5ba47fc..bf2683f0ba43 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -523,6 +523,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
+extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
 extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
 				    struct ceph_cap *cap);
 extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v2 2/4] ceph: make iterate_sessions a global symbol
  2021-07-05  1:22 [PATCH v2 0/4] flush the mdlog before waiting on unsafe reqs xiubli
  2021-07-05  1:22 ` [PATCH v2 1/4] ceph: make ceph_create_session_msg a global symbol xiubli
@ 2021-07-05  1:22 ` xiubli
  2021-07-05  1:22 ` [PATCH v2 3/4] ceph: flush mdlog before umounting xiubli
  2021-07-05  1:22 ` [PATCH v2 4/4] ceph: flush the mdlog before waiting on unsafe reqs xiubli
  3 siblings, 0 replies; 9+ messages in thread
From: xiubli @ 2021-07-05  1:22 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/caps.c       | 26 +----------------------
 fs/ceph/mds_client.c | 49 +++++++++++++++++++++++++++++---------------
 fs/ceph/mds_client.h |  3 +++
 3 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index e712826ea3f1..c6a3352a4d52 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4280,33 +4280,9 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
 	dout("flush_dirty_caps done\n");
 }
 
-static void iterate_sessions(struct ceph_mds_client *mdsc,
-			     void (*cb)(struct ceph_mds_session *))
-{
-	int mds;
-
-	mutex_lock(&mdsc->mutex);
-	for (mds = 0; mds < mdsc->max_sessions; ++mds) {
-		struct ceph_mds_session *s;
-
-		if (!mdsc->sessions[mds])
-			continue;
-
-		s = ceph_get_mds_session(mdsc->sessions[mds]);
-		if (!s)
-			continue;
-
-		mutex_unlock(&mdsc->mutex);
-		cb(s);
-		ceph_put_mds_session(s);
-		mutex_lock(&mdsc->mutex);
-	}
-	mutex_unlock(&mdsc->mutex);
-}
-
 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 {
-	iterate_sessions(mdsc, flush_dirty_session_caps);
+	ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
 }
 
 void __ceph_touch_fmode(struct ceph_inode_info *ci,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3c9c58a6e75f..5c3ec7eb8141 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -802,6 +802,33 @@ static void put_request_session(struct ceph_mds_request *req)
 	}
 }
 
+void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+			       void (*cb)(struct ceph_mds_session *),
+			       bool check_state)
+{
+	int mds;
+
+	mutex_lock(&mdsc->mutex);
+	for (mds = 0; mds < mdsc->max_sessions; ++mds) {
+		struct ceph_mds_session *s;
+
+		s = __ceph_lookup_mds_session(mdsc, mds);
+		if (!s)
+			continue;
+
+		if (check_state && !check_session_state(s)) {
+			ceph_put_mds_session(s);
+			continue;
+		}
+
+		mutex_unlock(&mdsc->mutex);
+		cb(s);
+		ceph_put_mds_session(s);
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+}
+
 void ceph_mdsc_release_request(struct kref *kref)
 {
 	struct ceph_mds_request *req = container_of(kref,
@@ -4415,24 +4442,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 }
 
 /*
- * lock unlock sessions, to wait ongoing session activities
+ * lock unlock the session, to wait ongoing session activities
  */
-static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
+static void lock_unlock_session(struct ceph_mds_session *s)
 {
-	int i;
-
-	mutex_lock(&mdsc->mutex);
-	for (i = 0; i < mdsc->max_sessions; i++) {
-		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
-		if (!s)
-			continue;
-		mutex_unlock(&mdsc->mutex);
-		mutex_lock(&s->s_mutex);
-		mutex_unlock(&s->s_mutex);
-		ceph_put_mds_session(s);
-		mutex_lock(&mdsc->mutex);
-	}
-	mutex_unlock(&mdsc->mutex);
+	mutex_lock(&s->s_mutex);
+	mutex_unlock(&s->s_mutex);
 }
 
 static void maybe_recover_session(struct ceph_mds_client *mdsc)
@@ -4684,7 +4699,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	dout("pre_umount\n");
 	mdsc->stopping = 1;
 
-	lock_unlock_sessions(mdsc);
+	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
 	ceph_flush_dirty_caps(mdsc);
 	wait_requests(mdsc);
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bf2683f0ba43..fca2cf427eaf 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -523,6 +523,9 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
+extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+				       void (*cb)(struct ceph_mds_session *),
+				       bool check_state);
 extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
 extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
 				    struct ceph_cap *cap);
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v2 3/4] ceph: flush mdlog before umounting
  2021-07-05  1:22 [PATCH v2 0/4] flush the mdlog before waiting on unsafe reqs xiubli
  2021-07-05  1:22 ` [PATCH v2 1/4] ceph: make ceph_create_session_msg a global symbol xiubli
  2021-07-05  1:22 ` [PATCH v2 2/4] ceph: make iterate_sessions " xiubli
@ 2021-07-05  1:22 ` xiubli
  2021-07-05  1:22 ` [PATCH v2 4/4] ceph: flush the mdlog before waiting on unsafe reqs xiubli
  3 siblings, 0 replies; 9+ messages in thread
From: xiubli @ 2021-07-05  1:22 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c         | 25 +++++++++++++++++++++++++
 fs/ceph/mds_client.h         |  1 +
 fs/ceph/strings.c            |  1 +
 include/linux/ceph/ceph_fs.h |  1 +
 4 files changed, 28 insertions(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5c3ec7eb8141..79aa4ce3a388 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4690,6 +4690,30 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 	dout("wait_requests done\n");
 }
 
+void send_flush_mdlog(struct ceph_mds_session *s)
+{
+	struct ceph_msg *msg;
+
+	/*
+	 * Pre-luminous MDS crashes when it sees an unknown session request
+	 */
+	if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
+		return;
+
+	mutex_lock(&s->s_mutex);
+	dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
+	     ceph_session_state_name(s->s_state), s->s_seq);
+	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
+				      s->s_seq);
+	if (!msg) {
+		pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
+		       s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+	} else {
+		ceph_con_send(&s->s_con, msg);
+	}
+	mutex_unlock(&s->s_mutex);
+}
+
 /*
  * called before mount is ro, and before dentries are torn down.
  * (hmm, does this still race with new lookups?)
@@ -4699,6 +4723,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	dout("pre_umount\n");
 	mdsc->stopping = 1;
 
+	ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
 	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
 	ceph_flush_dirty_caps(mdsc);
 	wait_requests(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index fca2cf427eaf..a7af09257382 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -523,6 +523,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
+extern void send_flush_mdlog(struct ceph_mds_session *s);
 extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
 				       void (*cb)(struct ceph_mds_session *),
 				       bool check_state);
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 4a79f3632260..573bb9556fb5 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -46,6 +46,7 @@ const char *ceph_session_op_name(int op)
 	case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
 	case CEPH_SESSION_FORCE_RO: return "force_ro";
 	case CEPH_SESSION_REJECT: return "reject";
+	case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog";
 	}
 	return "???";
 }
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 57e5bd63fb7a..ae60696fe40b 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -300,6 +300,7 @@ enum {
 	CEPH_SESSION_FLUSHMSG_ACK,
 	CEPH_SESSION_FORCE_RO,
 	CEPH_SESSION_REJECT,
+	CEPH_SESSION_REQUEST_FLUSH_MDLOG,
 };
 
 extern const char *ceph_session_op_name(int op);
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH v2 4/4] ceph: flush the mdlog before waiting on unsafe reqs
  2021-07-05  1:22 [PATCH v2 0/4] flush the mdlog before waiting on unsafe reqs xiubli
                   ` (2 preceding siblings ...)
  2021-07-05  1:22 ` [PATCH v2 3/4] ceph: flush mdlog before umounting xiubli
@ 2021-07-05  1:22 ` xiubli
  2021-07-06 11:42   ` Jeff Layton
  3 siblings, 1 reply; 9+ messages in thread
From: xiubli @ 2021-07-05  1:22 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

For client requests that will get both unsafe and safe replies from
the MDS daemons, the MDS side won't flush the mdlog (journal log)
immediately, because it considers that unnecessary. That's true for
most cases, but not all; the fsync request is one exception. An fsync
will wait until all requests that have only received unsafe replies
are safely replied to.

Normally, when multiple threads or clients are running, the whole
mdlog in the MDS daemons gets flushed in time, since some request
will trigger the mdlog submit thread. So we usually don't see normal
operations getting stuck for long. But when only one client with a
single thread is running, the stall can be obvious, and in the worst
case it must wait up to 5 seconds for the mdlog to be flushed by the
MDS's periodic tick thread.

This patch triggers a manual mdlog flush in the MDSes to which the
in-flight requests were sent, as well as in the auth MDS, just before
waiting for the unsafe requests to finish.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c6a3352a4d52..4b966c29d9b5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
  */
 static int unsafe_request_wait(struct inode *inode)
 {
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
 	int ret, err = 0;
@@ -2305,6 +2306,82 @@ static int unsafe_request_wait(struct inode *inode)
 	}
 	spin_unlock(&ci->i_unsafe_lock);
 
+	/*
+	 * Trigger to flush the journal logs in all the relevant MDSes
+	 * manually, or in the worst case we must wait at most 5 seconds
+	 * to wait the journal logs to be flushed by the MDSes periodically.
+	 */
+	if (req1 || req2) {
+		struct ceph_mds_session **sessions = NULL;
+		struct ceph_mds_session *s;
+		struct ceph_mds_request *req;
+		unsigned int max;
+		int i;
+
+		/*
+		 * The mdsc->max_sessions is unlikely to be changed
+		 * mostly, here we will retry it by reallocating the
+		 * sessions array memory to get rid of the mdsc->mutex
+		 * lock.
+		 */
+retry:
+		max = mdsc->max_sessions;
+		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
+		if (!sessions) {
+			err = -ENOMEM;
+			goto out;
+		}
+		spin_lock(&ci->i_unsafe_lock);
+		if (req1) {
+			list_for_each_entry(req, &ci->i_unsafe_dirops,
+					    r_unsafe_dir_item) {
+				s = req->r_session;
+				if (unlikely(s->s_mds > max)) {
+					spin_unlock(&ci->i_unsafe_lock);
+					goto retry;
+				}
+				if (!sessions[s->s_mds]) {
+					s = ceph_get_mds_session(s);
+					sessions[s->s_mds] = s;
+				}
+			}
+		}
+		if (req2) {
+			list_for_each_entry(req, &ci->i_unsafe_iops,
+					    r_unsafe_target_item) {
+				s = req->r_session;
+				if (unlikely(s->s_mds > max)) {
+					spin_unlock(&ci->i_unsafe_lock);
+					goto retry;
+				}
+				if (!sessions[s->s_mds]) {
+					s = ceph_get_mds_session(s);
+					sessions[s->s_mds] = s;
+				}
+			}
+		}
+		spin_unlock(&ci->i_unsafe_lock);
+
+		/* the auth MDS */
+		spin_lock(&ci->i_ceph_lock);
+		if (ci->i_auth_cap) {
+		      s = ci->i_auth_cap->session;
+		      if (!sessions[s->s_mds])
+			      sessions[s->s_mds] = ceph_get_mds_session(s);
+		}
+		spin_unlock(&ci->i_ceph_lock);
+
+		/* send flush mdlog request to MDSes */
+		for (i = 0; i < max; i++) {
+			s = sessions[i];
+			if (s) {
+				send_flush_mdlog(s);
+				ceph_put_mds_session(s);
+			}
+		}
+		kfree(sessions);
+	}
+
 	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
 	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
 	if (req1) {
@@ -2321,6 +2398,7 @@ static int unsafe_request_wait(struct inode *inode)
 			err = -EIO;
 		ceph_mdsc_put_request(req2);
 	}
+out:
 	return err;
 }
 
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 4/4] ceph: flush the mdlog before waiting on unsafe reqs
  2021-07-05  1:22 ` [PATCH v2 4/4] ceph: flush the mdlog before waiting on unsafe reqs xiubli
@ 2021-07-06 11:42   ` Jeff Layton
  2021-07-06 12:37     ` Xiubo Li
  0 siblings, 1 reply; 9+ messages in thread
From: Jeff Layton @ 2021-07-06 11:42 UTC (permalink / raw)
  To: xiubli, idryomov; +Cc: pdonnell, ceph-devel

On Mon, 2021-07-05 at 09:22 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> For client requests that will get both unsafe and safe replies from
> the MDS daemons, the MDS side won't flush the mdlog (journal log)
> immediately, because it considers that unnecessary. That's true for
> most cases, but not all; the fsync request is one exception. An fsync
> will wait until all requests that have only received unsafe replies
> are safely replied to.
> 
> Normally, when multiple threads or clients are running, the whole
> mdlog in the MDS daemons gets flushed in time, since some request
> will trigger the mdlog submit thread. So we usually don't see normal
> operations getting stuck for long. But when only one client with a
> single thread is running, the stall can be obvious, and in the worst
> case it must wait up to 5 seconds for the mdlog to be flushed by the
> MDS's periodic tick thread.
> 
> This patch triggers a manual mdlog flush in the MDSes to which the
> in-flight requests were sent, as well as in the auth MDS, just before
> waiting for the unsafe requests to finish.
> 
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 78 insertions(+)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index c6a3352a4d52..4b966c29d9b5 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
>   */
>  static int unsafe_request_wait(struct inode *inode)
>  {
> +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
>  	struct ceph_inode_info *ci = ceph_inode(inode);
>  	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
>  	int ret, err = 0;
> @@ -2305,6 +2306,82 @@ static int unsafe_request_wait(struct inode *inode)
>  	}
>  	spin_unlock(&ci->i_unsafe_lock);
>  
> +	/*
> +	 * Trigger to flush the journal logs in all the relevant MDSes
> +	 * manually, or in the worst case we must wait at most 5 seconds
> +	 * to wait the journal logs to be flushed by the MDSes periodically.
> +	 */
> +	if (req1 || req2) {
> +		struct ceph_mds_session **sessions = NULL;
> +		struct ceph_mds_session *s;
> +		struct ceph_mds_request *req;
> +		unsigned int max;
> +		int i;
> +
> +		/*
> +		 * The mdsc->max_sessions is unlikely to be changed
> +		 * mostly, here we will retry it by reallocating the
> +		 * sessions array memory to get rid of the mdsc->mutex
> +		 * lock.
> +		 */
> +retry:
> +		max = mdsc->max_sessions;
> +		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);

The kerneldoc over krealloc() says:

 * The contents of the object pointed to are preserved up to the
 * lesser of the new and old sizes (__GFP_ZERO flag is effectively
ignored).

This code however relies on krealloc zeroing out the new part of the
allocation. Do you know for certain that that works?
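
If we'd rather not depend on that, one option would be to zero just the
newly exposed tail by hand after a plain krealloc(). A rough sketch only
(old_max is a new local tracking the previously allocated size, and the
GFP flags are purely illustrative; none of this is in the posted patch):

	struct ceph_mds_session **tmp;
	unsigned int new_max, old_max = 0;
	...
	new_max = mdsc->max_sessions;
	tmp = krealloc(sessions, new_max * sizeof(*sessions), GFP_KERNEL);
	if (!tmp) {
		err = -ENOMEM;
		goto out;
	}
	/* zero only the tail the old allocation didn't cover;
	 * assumes max_sessions never shrinks */
	memset(tmp + old_max, 0, (new_max - old_max) * sizeof(*tmp));
	sessions = tmp;
	old_max = new_max;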

> +		if (!sessions) {
> +			err = -ENOMEM;
> +			goto out;
> +		}
> +		spin_lock(&ci->i_unsafe_lock);
> +		if (req1) {
> +			list_for_each_entry(req, &ci->i_unsafe_dirops,
> +					    r_unsafe_dir_item) {
> +				s = req->r_session;
> +				if (unlikely(s->s_mds > max)) {
> +					spin_unlock(&ci->i_unsafe_lock);
> +					goto retry;
> +				}
> +				if (!sessions[s->s_mds]) {
> +					s = ceph_get_mds_session(s);
> +					sessions[s->s_mds] = s;

nit: maybe just do:

    sessions[s->s_mds] = ceph_get_mds_session(s);


> +				}
> +			}
> +		}
> +		if (req2) {
> +			list_for_each_entry(req, &ci->i_unsafe_iops,
> +					    r_unsafe_target_item) {
> +				s = req->r_session;
> +				if (unlikely(s->s_mds > max)) {
> +					spin_unlock(&ci->i_unsafe_lock);
> +					goto retry;
> +				}
> +				if (!sessions[s->s_mds]) {
> +					s = ceph_get_mds_session(s);
> +					sessions[s->s_mds] = s;
> +				}
> +			}
> +		}
> +		spin_unlock(&ci->i_unsafe_lock);
> +
> +		/* the auth MDS */
> +		spin_lock(&ci->i_ceph_lock);
> +		if (ci->i_auth_cap) {
> +		      s = ci->i_auth_cap->session;
> +		      if (!sessions[s->s_mds])
> +			      sessions[s->s_mds] = ceph_get_mds_session(s);
> +		}
> +		spin_unlock(&ci->i_ceph_lock);
> +
> +		/* send flush mdlog request to MDSes */
> +		for (i = 0; i < max; i++) {
> +			s = sessions[i];
> +			if (s) {
> +				send_flush_mdlog(s);
> +				ceph_put_mds_session(s);
> +			}
> +		}
> +		kfree(sessions);
> +	}
> +
>  	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
>  	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
>  	if (req1) {
> @@ -2321,6 +2398,7 @@ static int unsafe_request_wait(struct inode *inode)
>  			err = -EIO;
>  		ceph_mdsc_put_request(req2);
>  	}
> +out:
>  	return err;
>  }
>  

Otherwise the whole set looks pretty reasonable.

Thanks,
-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 4/4] ceph: flush the mdlog before waiting on unsafe reqs
  2021-07-06 11:42   ` Jeff Layton
@ 2021-07-06 12:37     ` Xiubo Li
  2021-07-06 13:11       ` Jeff Layton
  0 siblings, 1 reply; 9+ messages in thread
From: Xiubo Li @ 2021-07-06 12:37 UTC (permalink / raw)
  To: Jeff Layton, idryomov; +Cc: pdonnell, ceph-devel


On 7/6/21 7:42 PM, Jeff Layton wrote:
> On Mon, 2021-07-05 at 09:22 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> For client requests that will get both unsafe and safe replies from
>> the MDS daemons, the MDS side won't flush the mdlog (journal log)
>> immediately, because it considers that unnecessary. That's true for
>> most cases, but not all; the fsync request is one exception. An fsync
>> will wait until all requests that have only received unsafe replies
>> are safely replied to.
>>
>> Normally, when multiple threads or clients are running, the whole
>> mdlog in the MDS daemons gets flushed in time, since some request
>> will trigger the mdlog submit thread. So we usually don't see normal
>> operations getting stuck for long. But when only one client with a
>> single thread is running, the stall can be obvious, and in the worst
>> case it must wait up to 5 seconds for the mdlog to be flushed by the
>> MDS's periodic tick thread.
>>
>> This patch triggers a manual mdlog flush in the MDSes to which the
>> in-flight requests were sent, as well as in the auth MDS, just before
>> waiting for the unsafe requests to finish.
>>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 78 insertions(+)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index c6a3352a4d52..4b966c29d9b5 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
>>    */
>>   static int unsafe_request_wait(struct inode *inode)
>>   {
>> +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
>>   	struct ceph_inode_info *ci = ceph_inode(inode);
>>   	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
>>   	int ret, err = 0;
>> @@ -2305,6 +2306,82 @@ static int unsafe_request_wait(struct inode *inode)
>>   	}
>>   	spin_unlock(&ci->i_unsafe_lock);
>>   
>> +	/*
>> +	 * Trigger to flush the journal logs in all the relevant MDSes
>> +	 * manually, or in the worst case we must wait at most 5 seconds
>> +	 * to wait the journal logs to be flushed by the MDSes periodically.
>> +	 */
>> +	if (req1 || req2) {
>> +		struct ceph_mds_session **sessions = NULL;
>> +		struct ceph_mds_session *s;
>> +		struct ceph_mds_request *req;
>> +		unsigned int max;
>> +		int i;
>> +
>> +		/*
>> +		 * The mdsc->max_sessions is unlikely to be changed
>> +		 * mostly, here we will retry it by reallocating the
>> +		 * sessions array memory to get rid of the mdsc->mutex
>> +		 * lock.
>> +		 */
>> +retry:
>> +		max = mdsc->max_sessions;
>> +		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
> The kerneldoc over krealloc() says:
>
>   * The contents of the object pointed to are preserved up to the
>   * lesser of the new and old sizes (__GFP_ZERO flag is effectively
> ignored).
>
> This code however relies on krealloc zeroing out the new part of the
> allocation. Do you know for certain that that works?

I read the krealloc() code; the "__GFP_ZERO flag will be ignored" note
only applies to the preserved contents. If the slab really needs to
allocate a new object, it zeroes the new object first and then copies
the old contents into it, so the new part stays zeroed.


>> +		if (!sessions) {
>> +			err = -ENOMEM;
>> +			goto out;
>> +		}
>> +		spin_lock(&ci->i_unsafe_lock);
>> +		if (req1) {
>> +			list_for_each_entry(req, &ci->i_unsafe_dirops,
>> +					    r_unsafe_dir_item) {
>> +				s = req->r_session;
>> +				if (unlikely(s->s_mds > max)) {
>> +					spin_unlock(&ci->i_unsafe_lock);
>> +					goto retry;
>> +				}
>> +				if (!sessions[s->s_mds]) {
>> +					s = ceph_get_mds_session(s);
>> +					sessions[s->s_mds] = s;
> nit: maybe just do:
>
>      sessions[s->s_mds] = ceph_get_mds_session(s);

Then it will exceed 80 chars for this line. Should we ignore it here ?

Thanks.

>
>
>> +				}
>> +			}
>> +		}
>> +		if (req2) {
>> +			list_for_each_entry(req, &ci->i_unsafe_iops,
>> +					    r_unsafe_target_item) {
>> +				s = req->r_session;
>> +				if (unlikely(s->s_mds > max)) {
>> +					spin_unlock(&ci->i_unsafe_lock);
>> +					goto retry;
>> +				}
>> +				if (!sessions[s->s_mds]) {
>> +					s = ceph_get_mds_session(s);
>> +					sessions[s->s_mds] = s;
>> +				}
>> +			}
>> +		}
>> +		spin_unlock(&ci->i_unsafe_lock);
>> +
>> +		/* the auth MDS */
>> +		spin_lock(&ci->i_ceph_lock);
>> +		if (ci->i_auth_cap) {
>> +		      s = ci->i_auth_cap->session;
>> +		      if (!sessions[s->s_mds])
>> +			      sessions[s->s_mds] = ceph_get_mds_session(s);
>> +		}
>> +		spin_unlock(&ci->i_ceph_lock);
>> +
>> +		/* send flush mdlog request to MDSes */
>> +		for (i = 0; i < max; i++) {
>> +			s = sessions[i];
>> +			if (s) {
>> +				send_flush_mdlog(s);
>> +				ceph_put_mds_session(s);
>> +			}
>> +		}
>> +		kfree(sessions);
>> +	}
>> +
>>   	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
>>   	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
>>   	if (req1) {
>> @@ -2321,6 +2398,7 @@ static int unsafe_request_wait(struct inode *inode)
>>   			err = -EIO;
>>   		ceph_mdsc_put_request(req2);
>>   	}
>> +out:
>>   	return err;
>>   }
>>   
> Otherwise the whole set looks pretty reasonable.
>
> Thanks,


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 4/4] ceph: flush the mdlog before waiting on unsafe reqs
  2021-07-06 12:37     ` Xiubo Li
@ 2021-07-06 13:11       ` Jeff Layton
  2021-07-06 13:17         ` Xiubo Li
  0 siblings, 1 reply; 9+ messages in thread
From: Jeff Layton @ 2021-07-06 13:11 UTC (permalink / raw)
  To: Xiubo Li, idryomov; +Cc: pdonnell, ceph-devel

On Tue, 2021-07-06 at 20:37 +0800, Xiubo Li wrote:
> On 7/6/21 7:42 PM, Jeff Layton wrote:
> > On Mon, 2021-07-05 at 09:22 +0800, xiubli@redhat.com wrote:
> > > From: Xiubo Li <xiubli@redhat.com>
> > > 
> > > For client requests that will get both unsafe and safe replies from
> > > the MDS daemons, the MDS side won't flush the mdlog (journal log)
> > > immediately, because it considers that unnecessary. That's true for
> > > most cases, but not all; the fsync request is one exception. An fsync
> > > will wait until all requests that have only received unsafe replies
> > > are safely replied to.
> > >
> > > Normally, when multiple threads or clients are running, the whole
> > > mdlog in the MDS daemons gets flushed in time, since some request
> > > will trigger the mdlog submit thread. So we usually don't see normal
> > > operations getting stuck for long. But when only one client with a
> > > single thread is running, the stall can be obvious, and in the worst
> > > case it must wait up to 5 seconds for the mdlog to be flushed by the
> > > MDS's periodic tick thread.
> > >
> > > This patch triggers a manual mdlog flush in the MDSes to which the
> > > in-flight requests were sent, as well as in the auth MDS, just before
> > > waiting for the unsafe requests to finish.
> > > 
> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > ---
> > >   fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
> > >   1 file changed, 78 insertions(+)
> > > 
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index c6a3352a4d52..4b966c29d9b5 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
> > >    */
> > >   static int unsafe_request_wait(struct inode *inode)
> > >   {
> > > +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> > >   	struct ceph_inode_info *ci = ceph_inode(inode);
> > >   	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
> > >   	int ret, err = 0;
> > > @@ -2305,6 +2306,82 @@ static int unsafe_request_wait(struct inode *inode)
> > >   	}
> > >   	spin_unlock(&ci->i_unsafe_lock);
> > >   
> > > +	/*
> > > +	 * Trigger to flush the journal logs in all the relevant MDSes
> > > +	 * manually, or in the worst case we must wait at most 5 seconds
> > > +	 * to wait the journal logs to be flushed by the MDSes periodically.
> > > +	 */
> > > +	if (req1 || req2) {
> > > +		struct ceph_mds_session **sessions = NULL;
> > > +		struct ceph_mds_session *s;
> > > +		struct ceph_mds_request *req;
> > > +		unsigned int max;
> > > +		int i;
> > > +
> > > +		/*
> > > +		 * The mdsc->max_sessions is unlikely to be changed
> > > +		 * mostly, here we will retry it by reallocating the
> > > +		 * sessions array memory to get rid of the mdsc->mutex
> > > +		 * lock.
> > > +		 */
> > > +retry:
> > > +		max = mdsc->max_sessions;
> > > +		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
> > The kerneldoc over krealloc() says:
> > 
> >   * The contents of the object pointed to are preserved up to the
> >   * lesser of the new and old sizes (__GFP_ZERO flag is effectively
> > ignored).
> > 
> > This code however relies on krealloc zeroing out the new part of the
> > allocation. Do you know for certain that that works?
> 
> I read the krealloc() code; the "__GFP_ZERO flag will be ignored" note
> only applies to the preserved contents. If the slab really needs to
> allocate a new object, it zeroes the new object first and then copies
> the old contents into it, so the new part stays zeroed.
> 
> 

Ok, and in the case where it's an initial kmalloc, that will be done
with __GFP_ZERO so any remaining space in the allocation will already be
zeroed. That works.

> > > +		if (!sessions) {
> > > +			err = -ENOMEM;
> > > +			goto out;
> > > +		}
> > > +		spin_lock(&ci->i_unsafe_lock);
> > > +		if (req1) {
> > > +			list_for_each_entry(req, &ci->i_unsafe_dirops,
> > > +					    r_unsafe_dir_item) {
> > > +				s = req->r_session;
> > > +				if (unlikely(s->s_mds > max)) {
> > > +					spin_unlock(&ci->i_unsafe_lock);
> > > +					goto retry;
> > > +				}
> > > +				if (!sessions[s->s_mds]) {
> > > +					s = ceph_get_mds_session(s);
> > > +					sessions[s->s_mds] = s;
> > nit: maybe just do:
> > 
> >      sessions[s->s_mds] = ceph_get_mds_session(s);
> 
> Then it will exceed 80 chars for this line. Should we ignore it here ?
> 

I probably would have but it's not worth respinning over all by itself.

It might also be possible to do all of this without taking the
i_unsafe_lock twice, but that too probably won't make much difference.

I'll give these a closer look and probably merge into testing branch
later today unless I see a problem.

Thanks!
Jeff

> > 
> > > +				}
> > > +			}
> > > +		}
> > > +		if (req2) {
> > > +			list_for_each_entry(req, &ci->i_unsafe_iops,
> > > +					    r_unsafe_target_item) {
> > > +				s = req->r_session;
> > > +				if (unlikely(s->s_mds > max)) {
> > > +					spin_unlock(&ci->i_unsafe_lock);
> > > +					goto retry;
> > > +				}
> > > +				if (!sessions[s->s_mds]) {
> > > +					s = ceph_get_mds_session(s);
> > > +					sessions[s->s_mds] = s;
> > > +				}
> > > +			}
> > > +		}
> > > +		spin_unlock(&ci->i_unsafe_lock);
> > > +
> > > +		/* the auth MDS */
> > > +		spin_lock(&ci->i_ceph_lock);
> > > +		if (ci->i_auth_cap) {
> > > +		      s = ci->i_auth_cap->session;
> > > +		      if (!sessions[s->s_mds])
> > > +			      sessions[s->s_mds] = ceph_get_mds_session(s);
> > > +		}
> > > +		spin_unlock(&ci->i_ceph_lock);
> > > +
> > > +		/* send flush mdlog request to MDSes */
> > > +		for (i = 0; i < max; i++) {
> > > +			s = sessions[i];
> > > +			if (s) {
> > > +				send_flush_mdlog(s);
> > > +				ceph_put_mds_session(s);
> > > +			}
> > > +		}
> > > +		kfree(sessions);
> > > +	}
> > > +
> > >   	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
> > >   	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
> > >   	if (req1) {
> > > @@ -2321,6 +2398,7 @@ static int unsafe_request_wait(struct inode *inode)
> > >   			err = -EIO;
> > >   		ceph_mdsc_put_request(req2);
> > >   	}
> > > +out:
> > >   	return err;
> > >   }
> > >   
> > Otherwise the whole set looks pretty reasonable.
> > 
> > Thanks,
> 

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2 4/4] ceph: flush the mdlog before waiting on unsafe reqs
  2021-07-06 13:11       ` Jeff Layton
@ 2021-07-06 13:17         ` Xiubo Li
  0 siblings, 0 replies; 9+ messages in thread
From: Xiubo Li @ 2021-07-06 13:17 UTC (permalink / raw)
  To: Jeff Layton, idryomov; +Cc: pdonnell, ceph-devel


On 7/6/21 9:11 PM, Jeff Layton wrote:
> On Tue, 2021-07-06 at 20:37 +0800, Xiubo Li wrote:
>> On 7/6/21 7:42 PM, Jeff Layton wrote:
>>> On Mon, 2021-07-05 at 09:22 +0800, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> For client requests that will get both unsafe and safe replies from
>>>> the MDS daemons, the MDS side won't flush the mdlog (journal log)
>>>> immediately, because it considers that unnecessary. That's true for
>>>> most cases, but not all; the fsync request is one exception. An fsync
>>>> will wait until all requests that have only received unsafe replies
>>>> are safely replied to.
>>>>
>>>> Normally, when multiple threads or clients are running, the whole
>>>> mdlog in the MDS daemons gets flushed in time, since some request
>>>> will trigger the mdlog submit thread. So we usually don't see normal
>>>> operations getting stuck for long. But when only one client with a
>>>> single thread is running, the stall can be obvious, and in the worst
>>>> case it must wait up to 5 seconds for the mdlog to be flushed by the
>>>> MDS's periodic tick thread.
>>>>
>>>> This patch triggers a manual mdlog flush in the MDSes to which the
>>>> in-flight requests were sent, as well as in the auth MDS, just before
>>>> waiting for the unsafe requests to finish.
>>>>
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 78 insertions(+)
>>>>
>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>>> index c6a3352a4d52..4b966c29d9b5 100644
>>>> --- a/fs/ceph/caps.c
>>>> +++ b/fs/ceph/caps.c
>>>> @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
>>>>     */
>>>>    static int unsafe_request_wait(struct inode *inode)
>>>>    {
>>>> +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
>>>>    	struct ceph_inode_info *ci = ceph_inode(inode);
>>>>    	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
>>>>    	int ret, err = 0;
>>>> @@ -2305,6 +2306,82 @@ static int unsafe_request_wait(struct inode *inode)
>>>>    	}
>>>>    	spin_unlock(&ci->i_unsafe_lock);
>>>>    
>>>> +	/*
>>>> +	 * Trigger to flush the journal logs in all the relevant MDSes
>>>> +	 * manually, or in the worst case we must wait at most 5 seconds
>>>> +	 * to wait the journal logs to be flushed by the MDSes periodically.
>>>> +	 */
>>>> +	if (req1 || req2) {
>>>> +		struct ceph_mds_session **sessions = NULL;
>>>> +		struct ceph_mds_session *s;
>>>> +		struct ceph_mds_request *req;
>>>> +		unsigned int max;
>>>> +		int i;
>>>> +
>>>> +		/*
>>>> +		 * The mdsc->max_sessions is unlikely to be changed
>>>> +		 * mostly, here we will retry it by reallocating the
>>>> +		 * sessions array memory to get rid of the mdsc->mutex
>>>> +		 * lock.
>>>> +		 */
>>>> +retry:
>>>> +		max = mdsc->max_sessions;
>>>> +		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
>>> The kerneldoc over krealloc() says:
>>>
>>>    * The contents of the object pointed to are preserved up to the
>>>    * lesser of the new and old sizes (__GFP_ZERO flag is effectively
>>> ignored).
>>>
>>> This code however relies on krealloc zeroing out the new part of the
>>> allocation. Do you know for certain that that works?
>> I read the krealloc() code; the "__GFP_ZERO flag will be ignored" note
>> only applies to the preserved contents. If the slab really needs to
>> allocate a new object, it zeroes the new object first and then copies
>> the old contents into it, so the new part stays zeroed.
>>
>>
> Ok, and in the case where it's an initial kmalloc, that will be done
> with __GFP_ZERO so any remaining space in the allocation will already be
> zeroed. That works.

Yeah, it is.


>
>>>> +		if (!sessions) {
>>>> +			err = -ENOMEM;
>>>> +			goto out;
>>>> +		}
>>>> +		spin_lock(&ci->i_unsafe_lock);
>>>> +		if (req1) {
>>>> +			list_for_each_entry(req, &ci->i_unsafe_dirops,
>>>> +					    r_unsafe_dir_item) {
>>>> +				s = req->r_session;
>>>> +				if (unlikely(s->s_mds > max)) {
>>>> +					spin_unlock(&ci->i_unsafe_lock);
>>>> +					goto retry;
>>>> +				}
>>>> +				if (!sessions[s->s_mds]) {
>>>> +					s = ceph_get_mds_session(s);
>>>> +					sessions[s->s_mds] = s;
>>> nit: maybe just do:
>>>
>>>       sessions[s->s_mds] = ceph_get_mds_session(s);
>> Then it will exceed 80 chars for this line. Should we ignore it here ?
>>
> I probably would have but it's not worth respinning over all by itself.
>
> It might also be possible to do all of this without taking the
> i_unsafe_lock twice, but that too probably won't make much difference.
>
> I'll give these a closer look and probably merge into testing branch
> later today unless I see a problem.

Sure, thanks Jeff.


>
> Thanks!
> Jeff
>
>>>> +				}
>>>> +			}
>>>> +		}
>>>> +		if (req2) {
>>>> +			list_for_each_entry(req, &ci->i_unsafe_iops,
>>>> +					    r_unsafe_target_item) {
>>>> +				s = req->r_session;
>>>> +				if (unlikely(s->s_mds > max)) {
>>>> +					spin_unlock(&ci->i_unsafe_lock);
>>>> +					goto retry;
>>>> +				}
>>>> +				if (!sessions[s->s_mds]) {
>>>> +					s = ceph_get_mds_session(s);
>>>> +					sessions[s->s_mds] = s;
>>>> +				}
>>>> +			}
>>>> +		}
>>>> +		spin_unlock(&ci->i_unsafe_lock);
>>>> +
>>>> +		/* the auth MDS */
>>>> +		spin_lock(&ci->i_ceph_lock);
>>>> +		if (ci->i_auth_cap) {
>>>> +		      s = ci->i_auth_cap->session;
>>>> +		      if (!sessions[s->s_mds])
>>>> +			      sessions[s->s_mds] = ceph_get_mds_session(s);
>>>> +		}
>>>> +		spin_unlock(&ci->i_ceph_lock);
>>>> +
>>>> +		/* send flush mdlog request to MDSes */
>>>> +		for (i = 0; i < max; i++) {
>>>> +			s = sessions[i];
>>>> +			if (s) {
>>>> +				send_flush_mdlog(s);
>>>> +				ceph_put_mds_session(s);
>>>> +			}
>>>> +		}
>>>> +		kfree(sessions);
>>>> +	}
>>>> +
>>>>    	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
>>>>    	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
>>>>    	if (req1) {
>>>> @@ -2321,6 +2398,7 @@ static int unsafe_request_wait(struct inode *inode)
>>>>    			err = -EIO;
>>>>    		ceph_mdsc_put_request(req2);
>>>>    	}
>>>> +out:
>>>>    	return err;
>>>>    }
>>>>    
>>> Otherwise the whole set looks pretty reasonable.
>>>
>>> Thanks,


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-07-06 13:17 UTC | newest]

Thread overview: 9+ messages
2021-07-05  1:22 [PATCH v2 0/4] flush the mdlog before waiting on unsafe reqs xiubli
2021-07-05  1:22 ` [PATCH v2 1/4] ceph: make ceph_create_session_msg a global symbol xiubli
2021-07-05  1:22 ` [PATCH v2 2/4] ceph: make iterate_sessions " xiubli
2021-07-05  1:22 ` [PATCH v2 3/4] ceph: flush mdlog before umounting xiubli
2021-07-05  1:22 ` [PATCH v2 4/4] ceph: flush the mdlog before waiting on unsafe reqs xiubli
2021-07-06 11:42   ` Jeff Layton
2021-07-06 12:37     ` Xiubo Li
2021-07-06 13:11       ` Jeff Layton
2021-07-06 13:17         ` Xiubo Li
