* [PATCH 1/5] ceph: export ceph_create_session_msg
2021-06-29 4:42 [PATCH 0/5] flush the mdlog before waiting on unsafe reqs xiubli
@ 2021-06-29 4:42 ` xiubli
2021-06-29 13:12 ` Jeff Layton
2021-06-29 4:42 ` [PATCH 2/5] ceph: export iterate_sessions xiubli
` (4 subsequent siblings)
5 siblings, 1 reply; 35+ messages in thread
From: xiubli @ 2021-06-29 4:42 UTC (permalink / raw)
To: jlayton; +Cc: idryomov, pdonnell, ceph-devel, Xiubo Li
From: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
fs/ceph/mds_client.c | 15 ++++++++-------
fs/ceph/mds_client.h | 1 +
2 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 2d7dcd295bb9..e49d3e230712 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1150,7 +1150,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
/*
* session messages
*/
-static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
@@ -1158,7 +1158,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
false);
if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
+ pr_err("ceph_create_session_msg ENOMEM creating msg\n");
return NULL;
}
h = msg->front.iov_base;
@@ -1289,7 +1289,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
+ pr_err("ceph_create_session_msg ENOMEM creating msg\n");
return ERR_PTR(-ENOMEM);
}
p = msg->front.iov_base;
@@ -1801,8 +1801,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
ceph_mds_state_name(state));
- msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
- ++session->s_renew_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ ++session->s_renew_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1816,7 +1816,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
session->s_mds, ceph_session_state_name(session->s_state), seq);
- msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1868,7 +1868,8 @@ static int request_close_session(struct ceph_mds_session *session)
dout("request_close_session mds%d state %s seq %lld\n",
session->s_mds, ceph_session_state_name(session->s_state),
session->s_seq);
- msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
+ session->s_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bf99c5ba47fc..bf2683f0ba43 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -523,6 +523,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
+extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
struct ceph_cap *cap);
extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
--
2.27.0
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH 1/5] ceph: export ceph_create_session_msg
2021-06-29 4:42 ` [PATCH 1/5] ceph: export ceph_create_session_msg xiubli
@ 2021-06-29 13:12 ` Jeff Layton
2021-06-29 13:27 ` Xiubo Li
0 siblings, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2021-06-29 13:12 UTC (permalink / raw)
To: xiubli; +Cc: idryomov, pdonnell, ceph-devel
On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
nit: the subject of this patch is not quite right. You aren't exporting
it here, just making it a global symbol (within ceph.ko).
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
> fs/ceph/mds_client.c | 15 ++++++++-------
> fs/ceph/mds_client.h | 1 +
> 2 files changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 2d7dcd295bb9..e49d3e230712 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1150,7 +1150,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> /*
> * session messages
> */
> -static struct ceph_msg *create_session_msg(u32 op, u64 seq)
> +struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
> {
> struct ceph_msg *msg;
> struct ceph_mds_session_head *h;
> @@ -1158,7 +1158,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
> msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
> false);
> if (!msg) {
> - pr_err("create_session_msg ENOMEM creating msg\n");
> + pr_err("ceph_create_session_msg ENOMEM creating msg\n");
instead of hardcoding the function names in these error messages, use
__func__ instead? That makes it easier to keep up with code changes.
pr_err("%s ENOMEM creating msg\n", __func__);
> return NULL;
> }
> h = msg->front.iov_base;
> @@ -1289,7 +1289,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
> msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
> GFP_NOFS, false);
> if (!msg) {
> - pr_err("create_session_msg ENOMEM creating msg\n");
> + pr_err("ceph_create_session_msg ENOMEM creating msg\n");
> return ERR_PTR(-ENOMEM);
> }
> p = msg->front.iov_base;
> @@ -1801,8 +1801,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
>
> dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
> ceph_mds_state_name(state));
> - msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
> - ++session->s_renew_seq);
> + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
> + ++session->s_renew_seq);
> if (!msg)
> return -ENOMEM;
> ceph_con_send(&session->s_con, msg);
> @@ -1816,7 +1816,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
>
> dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
> session->s_mds, ceph_session_state_name(session->s_state), seq);
> - msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
> + msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
> if (!msg)
> return -ENOMEM;
> ceph_con_send(&session->s_con, msg);
> @@ -1868,7 +1868,8 @@ static int request_close_session(struct ceph_mds_session *session)
> dout("request_close_session mds%d state %s seq %lld\n",
> session->s_mds, ceph_session_state_name(session->s_state),
> session->s_seq);
> - msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
> + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
> + session->s_seq);
> if (!msg)
> return -ENOMEM;
> ceph_con_send(&session->s_con, msg);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index bf99c5ba47fc..bf2683f0ba43 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -523,6 +523,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
> kref_put(&req->r_kref, ceph_mdsc_release_request);
> }
>
> +extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
> extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
> struct ceph_cap *cap);
> extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/5] ceph: export ceph_create_session_msg
2021-06-29 13:12 ` Jeff Layton
@ 2021-06-29 13:27 ` Xiubo Li
2021-06-30 12:17 ` Ilya Dryomov
0 siblings, 1 reply; 35+ messages in thread
From: Xiubo Li @ 2021-06-29 13:27 UTC (permalink / raw)
To: Jeff Layton; +Cc: idryomov, pdonnell, ceph-devel
On 6/29/21 9:12 PM, Jeff Layton wrote:
> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
> nit: the subject of this patch is not quite right. You aren't exporting
> it here, just making it a global symbol (within ceph.ko).
>
Yeah, will fix it.
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>> fs/ceph/mds_client.c | 15 ++++++++-------
>> fs/ceph/mds_client.h | 1 +
>> 2 files changed, 9 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 2d7dcd295bb9..e49d3e230712 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -1150,7 +1150,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>> /*
>> * session messages
>> */
>> -static struct ceph_msg *create_session_msg(u32 op, u64 seq)
>> +struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
>> {
>> struct ceph_msg *msg;
>> struct ceph_mds_session_head *h;
>> @@ -1158,7 +1158,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
>> msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
>> false);
>> if (!msg) {
>> - pr_err("create_session_msg ENOMEM creating msg\n");
>> + pr_err("ceph_create_session_msg ENOMEM creating msg\n");
> instead of hardcoding the function names in these error messages, use
> __func__ instead? That makes it easier to keep up with code changes.
>
> pr_err("%s ENOMEM creating msg\n", __func__);
Sure, will fix this too.
Thanks.
>> return NULL;
>> }
>> h = msg->front.iov_base;
>> @@ -1289,7 +1289,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
>> msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
>> GFP_NOFS, false);
>> if (!msg) {
>> - pr_err("create_session_msg ENOMEM creating msg\n");
>> + pr_err("ceph_create_session_msg ENOMEM creating msg\n");
>> return ERR_PTR(-ENOMEM);
>> }
>> p = msg->front.iov_base;
>> @@ -1801,8 +1801,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
>>
>> dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
>> ceph_mds_state_name(state));
>> - msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
>> - ++session->s_renew_seq);
>> + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
>> + ++session->s_renew_seq);
>> if (!msg)
>> return -ENOMEM;
>> ceph_con_send(&session->s_con, msg);
>> @@ -1816,7 +1816,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
>>
>> dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
>> session->s_mds, ceph_session_state_name(session->s_state), seq);
>> - msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
>> + msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
>> if (!msg)
>> return -ENOMEM;
>> ceph_con_send(&session->s_con, msg);
>> @@ -1868,7 +1868,8 @@ static int request_close_session(struct ceph_mds_session *session)
>> dout("request_close_session mds%d state %s seq %lld\n",
>> session->s_mds, ceph_session_state_name(session->s_state),
>> session->s_seq);
>> - msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
>> + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
>> + session->s_seq);
>> if (!msg)
>> return -ENOMEM;
>> ceph_con_send(&session->s_con, msg);
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index bf99c5ba47fc..bf2683f0ba43 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -523,6 +523,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
>> kref_put(&req->r_kref, ceph_mdsc_release_request);
>> }
>>
>> +extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
>> extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
>> struct ceph_cap *cap);
>> extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/5] ceph: export ceph_create_session_msg
2021-06-29 13:27 ` Xiubo Li
@ 2021-06-30 12:17 ` Ilya Dryomov
2021-07-01 1:50 ` Xiubo Li
0 siblings, 1 reply; 35+ messages in thread
From: Ilya Dryomov @ 2021-06-30 12:17 UTC (permalink / raw)
To: Xiubo Li; +Cc: Jeff Layton, Patrick Donnelly, Ceph Development
On Tue, Jun 29, 2021 at 3:27 PM Xiubo Li <xiubli@redhat.com> wrote:
>
>
> On 6/29/21 9:12 PM, Jeff Layton wrote:
> > On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> >> From: Xiubo Li <xiubli@redhat.com>
> >>
> > nit: the subject of this patch is not quite right. You aren't exporting
> > it here, just making it a global symbol (within ceph.ko).
> >
>
> Yeah, will fix it.
>
>
> >> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> >> ---
> >> fs/ceph/mds_client.c | 15 ++++++++-------
> >> fs/ceph/mds_client.h | 1 +
> >> 2 files changed, 9 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >> index 2d7dcd295bb9..e49d3e230712 100644
> >> --- a/fs/ceph/mds_client.c
> >> +++ b/fs/ceph/mds_client.c
> >> @@ -1150,7 +1150,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> >> /*
> >> * session messages
> >> */
> >> -static struct ceph_msg *create_session_msg(u32 op, u64 seq)
> >> +struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
> >> {
> >> struct ceph_msg *msg;
> >> struct ceph_mds_session_head *h;
> >> @@ -1158,7 +1158,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
> >> msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
> >> false);
> >> if (!msg) {
> >> - pr_err("create_session_msg ENOMEM creating msg\n");
> >> + pr_err("ceph_create_session_msg ENOMEM creating msg\n");
> > instead of hardcoding the function names in these error messages, use
> > __func__ instead? That makes it easier to keep up with code changes.
> >
> > pr_err("%s ENOMEM creating msg\n", __func__);
>
> Sure, will fix this too.
I think this can be just "ENOMEM creating session msg" without the
function name to avoid repetition.
Thanks,
Ilya
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 1/5] ceph: export ceph_create_session_msg
2021-06-30 12:17 ` Ilya Dryomov
@ 2021-07-01 1:50 ` Xiubo Li
0 siblings, 0 replies; 35+ messages in thread
From: Xiubo Li @ 2021-07-01 1:50 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Jeff Layton, Patrick Donnelly, Ceph Development
On 6/30/21 8:17 PM, Ilya Dryomov wrote:
> On Tue, Jun 29, 2021 at 3:27 PM Xiubo Li <xiubli@redhat.com> wrote:
>>
>> On 6/29/21 9:12 PM, Jeff Layton wrote:
>>> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>> nit: the subject of this patch is not quite right. You aren't exporting
>>> it here, just making it a global symbol (within ceph.ko).
>>>
>> Yeah, will fix it.
>>
>>
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>> fs/ceph/mds_client.c | 15 ++++++++-------
>>>> fs/ceph/mds_client.h | 1 +
>>>> 2 files changed, 9 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index 2d7dcd295bb9..e49d3e230712 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -1150,7 +1150,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>>>> /*
>>>> * session messages
>>>> */
>>>> -static struct ceph_msg *create_session_msg(u32 op, u64 seq)
>>>> +struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
>>>> {
>>>> struct ceph_msg *msg;
>>>> struct ceph_mds_session_head *h;
>>>> @@ -1158,7 +1158,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
>>>> msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
>>>> false);
>>>> if (!msg) {
>>>> - pr_err("create_session_msg ENOMEM creating msg\n");
>>>> + pr_err("ceph_create_session_msg ENOMEM creating msg\n");
>>> instead of hardcoding the function names in these error messages, use
>>> __func__ instead? That makes it easier to keep up with code changes.
>>>
>>> pr_err("%s ENOMEM creating msg\n", __func__);
>> Sure, will fix this too.
> I think this can be just "ENOMEM creating session msg" without the
> function name to avoid repetition.
Will fix it by using:
pr_err("ENOMEM creating session %s msg", ceph_session_op_name(op));
Thanks.
>
> Thanks,
>
> Ilya
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH 2/5] ceph: export iterate_sessions
2021-06-29 4:42 [PATCH 0/5] flush the mdlog before waiting on unsafe reqs xiubli
2021-06-29 4:42 ` [PATCH 1/5] ceph: export ceph_create_session_msg xiubli
@ 2021-06-29 4:42 ` xiubli
2021-06-29 15:39 ` Jeff Layton
2021-06-29 4:42 ` [PATCH 3/5] ceph: flush mdlog before umounting xiubli
` (3 subsequent siblings)
5 siblings, 1 reply; 35+ messages in thread
From: xiubli @ 2021-06-29 4:42 UTC (permalink / raw)
To: jlayton; +Cc: idryomov, pdonnell, ceph-devel, Xiubo Li
From: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
fs/ceph/caps.c | 26 +-----------------------
fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++---------------
fs/ceph/mds_client.h | 3 +++
3 files changed, 35 insertions(+), 41 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index e712826ea3f1..c6a3352a4d52 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4280,33 +4280,9 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
dout("flush_dirty_caps done\n");
}
-static void iterate_sessions(struct ceph_mds_client *mdsc,
- void (*cb)(struct ceph_mds_session *))
-{
- int mds;
-
- mutex_lock(&mdsc->mutex);
- for (mds = 0; mds < mdsc->max_sessions; ++mds) {
- struct ceph_mds_session *s;
-
- if (!mdsc->sessions[mds])
- continue;
-
- s = ceph_get_mds_session(mdsc->sessions[mds]);
- if (!s)
- continue;
-
- mutex_unlock(&mdsc->mutex);
- cb(s);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
-}
-
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
- iterate_sessions(mdsc, flush_dirty_session_caps);
+ ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
}
void __ceph_touch_fmode(struct ceph_inode_info *ci,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e49d3e230712..96bef289f58f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -802,6 +802,33 @@ static void put_request_session(struct ceph_mds_request *req)
}
}
+void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state)
+{
+ int mds;
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; ++mds) {
+ struct ceph_mds_session *s;
+
+ s = __ceph_lookup_mds_session(mdsc, mds);
+ if (!s)
+ continue;
+
+ if (check_state && !check_session_state(s)) {
+ ceph_put_mds_session(s);
+ continue;
+ }
+
+ mutex_unlock(&mdsc->mutex);
+ cb(s);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
void ceph_mdsc_release_request(struct kref *kref)
{
struct ceph_mds_request *req = container_of(kref,
@@ -4416,22 +4443,10 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
/*
* lock unlock sessions, to wait ongoing session activities
*/
-static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
+static void lock_unlock_session(struct ceph_mds_session *s)
{
- int i;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_unlock(&s->s_mutex);
}
static void maybe_recover_session(struct ceph_mds_client *mdsc)
@@ -4683,7 +4698,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
dout("pre_umount\n");
mdsc->stopping = 1;
- lock_unlock_sessions(mdsc);
+ ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
ceph_flush_dirty_caps(mdsc);
wait_requests(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bf2683f0ba43..fca2cf427eaf 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -523,6 +523,9 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
+extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state);
extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
struct ceph_cap *cap);
--
2.27.0
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH 2/5] ceph: export iterate_sessions
2021-06-29 4:42 ` [PATCH 2/5] ceph: export iterate_sessions xiubli
@ 2021-06-29 15:39 ` Jeff Layton
2021-06-30 0:55 ` Xiubo Li
0 siblings, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2021-06-29 15:39 UTC (permalink / raw)
To: xiubli; +Cc: idryomov, pdonnell, ceph-devel
On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
> fs/ceph/caps.c | 26 +-----------------------
> fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++---------------
> fs/ceph/mds_client.h | 3 +++
> 3 files changed, 35 insertions(+), 41 deletions(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index e712826ea3f1..c6a3352a4d52 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -4280,33 +4280,9 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
> dout("flush_dirty_caps done\n");
> }
>
> -static void iterate_sessions(struct ceph_mds_client *mdsc,
> - void (*cb)(struct ceph_mds_session *))
> -{
> - int mds;
> -
> - mutex_lock(&mdsc->mutex);
> - for (mds = 0; mds < mdsc->max_sessions; ++mds) {
> - struct ceph_mds_session *s;
> -
> - if (!mdsc->sessions[mds])
> - continue;
> -
> - s = ceph_get_mds_session(mdsc->sessions[mds]);
> - if (!s)
> - continue;
> -
> - mutex_unlock(&mdsc->mutex);
> - cb(s);
> - ceph_put_mds_session(s);
> - mutex_lock(&mdsc->mutex);
> - }
> - mutex_unlock(&mdsc->mutex);
> -}
> -
> void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
> {
> - iterate_sessions(mdsc, flush_dirty_session_caps);
> + ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
> }
>
> void __ceph_touch_fmode(struct ceph_inode_info *ci,
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index e49d3e230712..96bef289f58f 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -802,6 +802,33 @@ static void put_request_session(struct ceph_mds_request *req)
> }
> }
>
> +void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
> + void (*cb)(struct ceph_mds_session *),
> + bool check_state)
> +{
> + int mds;
> +
> + mutex_lock(&mdsc->mutex);
> + for (mds = 0; mds < mdsc->max_sessions; ++mds) {
> + struct ceph_mds_session *s;
> +
> + s = __ceph_lookup_mds_session(mdsc, mds);
> + if (!s)
> + continue;
> +
> + if (check_state && !check_session_state(s)) {
> + ceph_put_mds_session(s);
> + continue;
> + }
> +
> + mutex_unlock(&mdsc->mutex);
> + cb(s);
> + ceph_put_mds_session(s);
> + mutex_lock(&mdsc->mutex);
> + }
> + mutex_unlock(&mdsc->mutex);
> +}
> +
> void ceph_mdsc_release_request(struct kref *kref)
> {
> struct ceph_mds_request *req = container_of(kref,
> @@ -4416,22 +4443,10 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
> /*
> * lock unlock sessions, to wait ongoing session activities
> */
> -static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
> +static void lock_unlock_session(struct ceph_mds_session *s)
> {
> - int i;
> -
> - mutex_lock(&mdsc->mutex);
> - for (i = 0; i < mdsc->max_sessions; i++) {
> - struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
> - if (!s)
> - continue;
> - mutex_unlock(&mdsc->mutex);
> - mutex_lock(&s->s_mutex);
> - mutex_unlock(&s->s_mutex);
> - ceph_put_mds_session(s);
> - mutex_lock(&mdsc->mutex);
> - }
> - mutex_unlock(&mdsc->mutex);
> + mutex_lock(&s->s_mutex);
> + mutex_unlock(&s->s_mutex);
> }
>
Your patch doesn't materially change this, but it sure would be nice to
know what purpose this lock/unlock garbage serves. Barf.
> static void maybe_recover_session(struct ceph_mds_client *mdsc)
> @@ -4683,7 +4698,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
> dout("pre_umount\n");
> mdsc->stopping = 1;
>
> - lock_unlock_sessions(mdsc);
> + ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
> ceph_flush_dirty_caps(mdsc);
> wait_requests(mdsc);
>
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index bf2683f0ba43..fca2cf427eaf 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -523,6 +523,9 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
> kref_put(&req->r_kref, ceph_mdsc_release_request);
> }
>
> +extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
> + void (*cb)(struct ceph_mds_session *),
> + bool check_state);
> extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
> extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
> struct ceph_cap *cap);
Again, not really exporting this function, so maybe reword the subject
line?
Thanks,
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 2/5] ceph: export iterate_sessions
2021-06-29 15:39 ` Jeff Layton
@ 2021-06-30 0:55 ` Xiubo Li
0 siblings, 0 replies; 35+ messages in thread
From: Xiubo Li @ 2021-06-30 0:55 UTC (permalink / raw)
To: Jeff Layton; +Cc: idryomov, pdonnell, ceph-devel
On 6/29/21 11:39 PM, Jeff Layton wrote:
> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>> fs/ceph/caps.c | 26 +-----------------------
>> fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++++++---------------
>> fs/ceph/mds_client.h | 3 +++
>> 3 files changed, 35 insertions(+), 41 deletions(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index e712826ea3f1..c6a3352a4d52 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -4280,33 +4280,9 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
>> dout("flush_dirty_caps done\n");
>> }
>>
>> -static void iterate_sessions(struct ceph_mds_client *mdsc,
>> - void (*cb)(struct ceph_mds_session *))
>> -{
>> - int mds;
>> -
>> - mutex_lock(&mdsc->mutex);
>> - for (mds = 0; mds < mdsc->max_sessions; ++mds) {
>> - struct ceph_mds_session *s;
>> -
>> - if (!mdsc->sessions[mds])
>> - continue;
>> -
>> - s = ceph_get_mds_session(mdsc->sessions[mds]);
>> - if (!s)
>> - continue;
>> -
>> - mutex_unlock(&mdsc->mutex);
>> - cb(s);
>> - ceph_put_mds_session(s);
>> - mutex_lock(&mdsc->mutex);
>> - }
>> - mutex_unlock(&mdsc->mutex);
>> -}
>> -
>> void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
>> {
>> - iterate_sessions(mdsc, flush_dirty_session_caps);
>> + ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
>> }
>>
>> void __ceph_touch_fmode(struct ceph_inode_info *ci,
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index e49d3e230712..96bef289f58f 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -802,6 +802,33 @@ static void put_request_session(struct ceph_mds_request *req)
>> }
>> }
>>
>> +void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
>> + void (*cb)(struct ceph_mds_session *),
>> + bool check_state)
>> +{
>> + int mds;
>> +
>> + mutex_lock(&mdsc->mutex);
>> + for (mds = 0; mds < mdsc->max_sessions; ++mds) {
>> + struct ceph_mds_session *s;
>> +
>> + s = __ceph_lookup_mds_session(mdsc, mds);
>> + if (!s)
>> + continue;
>> +
>> + if (check_state && !check_session_state(s)) {
>> + ceph_put_mds_session(s);
>> + continue;
>> + }
>> +
>> + mutex_unlock(&mdsc->mutex);
>> + cb(s);
>> + ceph_put_mds_session(s);
>> + mutex_lock(&mdsc->mutex);
>> + }
>> + mutex_unlock(&mdsc->mutex);
>> +}
>> +
>> void ceph_mdsc_release_request(struct kref *kref)
>> {
>> struct ceph_mds_request *req = container_of(kref,
>> @@ -4416,22 +4443,10 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
>> /*
>> * lock unlock sessions, to wait ongoing session activities
>> */
>> -static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
>> +static void lock_unlock_session(struct ceph_mds_session *s)
>> {
>> - int i;
>> -
>> - mutex_lock(&mdsc->mutex);
>> - for (i = 0; i < mdsc->max_sessions; i++) {
>> - struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
>> - if (!s)
>> - continue;
>> - mutex_unlock(&mdsc->mutex);
>> - mutex_lock(&s->s_mutex);
>> - mutex_unlock(&s->s_mutex);
>> - ceph_put_mds_session(s);
>> - mutex_lock(&mdsc->mutex);
>> - }
>> - mutex_unlock(&mdsc->mutex);
>> + mutex_lock(&s->s_mutex);
>> + mutex_unlock(&s->s_mutex);
>> }
>>
> Your patch doesn't materially change this, but it sure would be nice to
> know what purpose this lock/unlock garbage serves. Barf.
Yeah, it just simplifies the code.
I will add some comments about it.
>
>> static void maybe_recover_session(struct ceph_mds_client *mdsc)
>> @@ -4683,7 +4698,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>> dout("pre_umount\n");
>> mdsc->stopping = 1;
>>
>> - lock_unlock_sessions(mdsc);
>> + ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
>> ceph_flush_dirty_caps(mdsc);
>> wait_requests(mdsc);
>>
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index bf2683f0ba43..fca2cf427eaf 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -523,6 +523,9 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
>> kref_put(&req->r_kref, ceph_mdsc_release_request);
>> }
>>
>> +extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
>> + void (*cb)(struct ceph_mds_session *),
>> + bool check_state);
>> extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
>> extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
>> struct ceph_cap *cap);
> Again, not really exporting this function, so maybe reword the subject
> line?
Sure, will fix it.
> Thanks,
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH 3/5] ceph: flush mdlog before umounting
2021-06-29 4:42 [PATCH 0/5] flush the mdlog before waiting on unsafe reqs xiubli
2021-06-29 4:42 ` [PATCH 1/5] ceph: export ceph_create_session_msg xiubli
2021-06-29 4:42 ` [PATCH 2/5] ceph: export iterate_sessions xiubli
@ 2021-06-29 4:42 ` xiubli
2021-06-29 15:34 ` Jeff Layton
2021-06-30 12:39 ` Ilya Dryomov
2021-06-29 4:42 ` [PATCH 4/5] ceph: flush the mdlog before waiting on unsafe reqs xiubli
` (2 subsequent siblings)
5 siblings, 2 replies; 35+ messages in thread
From: xiubli @ 2021-06-29 4:42 UTC (permalink / raw)
To: jlayton; +Cc: idryomov, pdonnell, ceph-devel, Xiubo Li
From: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
fs/ceph/mds_client.c | 29 +++++++++++++++++++++++++++++
fs/ceph/mds_client.h | 1 +
include/linux/ceph/ceph_fs.h | 1 +
3 files changed, 31 insertions(+)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 96bef289f58f..2db87a5c68d4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4689,6 +4689,34 @@ static void wait_requests(struct ceph_mds_client *mdsc)
dout("wait_requests done\n");
}
+static void send_flush_mdlog(struct ceph_mds_session *s)
+{
+ u64 seq = s->s_seq;
+ struct ceph_msg *msg;
+
+ /*
+ * For the MDS daemons lower than Luminous will crash when it
+ * saw this unknown session request.
+ */
+ if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
+ return;
+
+ dout("send_flush_mdlog to mds%d (%s)s seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, seq);
+ if (!msg) {
+ pr_err("failed to send_flush_mdlog to mds%d (%s)s seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), seq);
+ } else {
+ ceph_con_send(&s->s_con, msg);
+ }
+}
+
+void flush_mdlog(struct ceph_mds_client *mdsc)
+{
+ ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
+}
+
/*
* called before mount is ro, and before dentries are torn down.
* (hmm, does this still race with new lookups?)
@@ -4698,6 +4726,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
dout("pre_umount\n");
mdsc->stopping = 1;
+ flush_mdlog(mdsc);
ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
ceph_flush_dirty_caps(mdsc);
wait_requests(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index fca2cf427eaf..79d5b8ed62bf 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -537,6 +537,7 @@ extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
int (*cb)(struct inode *,
struct ceph_cap *, void *),
void *arg);
+extern void flush_mdlog(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
static inline void ceph_mdsc_free_path(char *path, int len)
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 57e5bd63fb7a..ae60696fe40b 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -300,6 +300,7 @@ enum {
CEPH_SESSION_FLUSHMSG_ACK,
CEPH_SESSION_FORCE_RO,
CEPH_SESSION_REJECT,
+ CEPH_SESSION_REQUEST_FLUSH_MDLOG,
};
extern const char *ceph_session_op_name(int op);
--
2.27.0
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH 3/5] ceph: flush mdlog before umounting
2021-06-29 4:42 ` [PATCH 3/5] ceph: flush mdlog before umounting xiubli
@ 2021-06-29 15:34 ` Jeff Layton
2021-06-30 0:36 ` Xiubo Li
2021-06-30 12:39 ` Ilya Dryomov
1 sibling, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2021-06-29 15:34 UTC (permalink / raw)
To: xiubli; +Cc: idryomov, pdonnell, ceph-devel
On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
> fs/ceph/mds_client.c | 29 +++++++++++++++++++++++++++++
> fs/ceph/mds_client.h | 1 +
> include/linux/ceph/ceph_fs.h | 1 +
> 3 files changed, 31 insertions(+)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 96bef289f58f..2db87a5c68d4 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4689,6 +4689,34 @@ static void wait_requests(struct ceph_mds_client *mdsc)
> dout("wait_requests done\n");
> }
>
> +static void send_flush_mdlog(struct ceph_mds_session *s)
> +{
> + u64 seq = s->s_seq;
> + struct ceph_msg *msg;
> +
The s_seq field is protected by the s_mutex (at least, AFAICT). I think
you probably need to take it before fetching the s_seq and release it
after calling ceph_con_send.
Long term, we probably need to rethink how the session sequence number
handling is done. The s_mutex is a terribly heavyweight mechanism for
this.
> + /*
> + * For the MDS daemons lower than Luminous will crash when it
> + * saw this unknown session request.
> + */
> + if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
> + return;
> +
> + dout("send_flush_mdlog to mds%d (%s)s seq %lld\n",
> + s->s_mds, ceph_session_state_name(s->s_state), seq);
> + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, seq);
> + if (!msg) {
> + pr_err("failed to send_flush_mdlog to mds%d (%s)s seq %lld\n",
> + s->s_mds, ceph_session_state_name(s->s_state), seq);
> + } else {
> + ceph_con_send(&s->s_con, msg);
> + }
> +}
> +
> +void flush_mdlog(struct ceph_mds_client *mdsc)
> +{
> + ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
> +}
> +
> /*
> * called before mount is ro, and before dentries are torn down.
> * (hmm, does this still race with new lookups?)
> @@ -4698,6 +4726,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
> dout("pre_umount\n");
> mdsc->stopping = 1;
>
> + flush_mdlog(mdsc);
> ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
> ceph_flush_dirty_caps(mdsc);
> wait_requests(mdsc);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index fca2cf427eaf..79d5b8ed62bf 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -537,6 +537,7 @@ extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
> int (*cb)(struct inode *,
> struct ceph_cap *, void *),
> void *arg);
> +extern void flush_mdlog(struct ceph_mds_client *mdsc);
> extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
>
> static inline void ceph_mdsc_free_path(char *path, int len)
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index 57e5bd63fb7a..ae60696fe40b 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -300,6 +300,7 @@ enum {
> CEPH_SESSION_FLUSHMSG_ACK,
> CEPH_SESSION_FORCE_RO,
> CEPH_SESSION_REJECT,
> + CEPH_SESSION_REQUEST_FLUSH_MDLOG,
> };
>
> extern const char *ceph_session_op_name(int op);
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 3/5] ceph: flush mdlog before umounting
2021-06-29 15:34 ` Jeff Layton
@ 2021-06-30 0:36 ` Xiubo Li
0 siblings, 0 replies; 35+ messages in thread
From: Xiubo Li @ 2021-06-30 0:36 UTC (permalink / raw)
To: Jeff Layton; +Cc: idryomov, pdonnell, ceph-devel
On 6/29/21 11:34 PM, Jeff Layton wrote:
> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>> fs/ceph/mds_client.c | 29 +++++++++++++++++++++++++++++
>> fs/ceph/mds_client.h | 1 +
>> include/linux/ceph/ceph_fs.h | 1 +
>> 3 files changed, 31 insertions(+)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 96bef289f58f..2db87a5c68d4 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -4689,6 +4689,34 @@ static void wait_requests(struct ceph_mds_client *mdsc)
>> dout("wait_requests done\n");
>> }
>>
>> +static void send_flush_mdlog(struct ceph_mds_session *s)
>> +{
>> + u64 seq = s->s_seq;
>> + struct ceph_msg *msg;
>> +
> The s_seq field is protected by the s_mutex (at least, AFAICT). I think
> you probably need to take it before fetching the s_seq and release it
> after calling ceph_con_send.
Will fix it.
>
> Long term, we probably need to rethink how the session sequence number
> handling is done. The s_mutex is a terribly heavyweight mechanism for
> this.
Yeah, makes sense.
>> + /*
>> + * For the MDS daemons lower than Luminous will crash when it
>> + * saw this unknown session request.
>> + */
>> + if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
>> + return;
>> +
>> + dout("send_flush_mdlog to mds%d (%s)s seq %lld\n",
>> + s->s_mds, ceph_session_state_name(s->s_state), seq);
>> + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, seq);
>> + if (!msg) {
>> + pr_err("failed to send_flush_mdlog to mds%d (%s)s seq %lld\n",
>> + s->s_mds, ceph_session_state_name(s->s_state), seq);
>> + } else {
>> + ceph_con_send(&s->s_con, msg);
>> + }
>> +}
>> +
>> +void flush_mdlog(struct ceph_mds_client *mdsc)
>> +{
>> + ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
>> +}
>> +
>> /*
>> * called before mount is ro, and before dentries are torn down.
>> * (hmm, does this still race with new lookups?)
>> @@ -4698,6 +4726,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>> dout("pre_umount\n");
>> mdsc->stopping = 1;
>>
>> + flush_mdlog(mdsc);
>> ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
>> ceph_flush_dirty_caps(mdsc);
>> wait_requests(mdsc);
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index fca2cf427eaf..79d5b8ed62bf 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -537,6 +537,7 @@ extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
>> int (*cb)(struct inode *,
>> struct ceph_cap *, void *),
>> void *arg);
>> +extern void flush_mdlog(struct ceph_mds_client *mdsc);
>> extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
>>
>> static inline void ceph_mdsc_free_path(char *path, int len)
>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>> index 57e5bd63fb7a..ae60696fe40b 100644
>> --- a/include/linux/ceph/ceph_fs.h
>> +++ b/include/linux/ceph/ceph_fs.h
>> @@ -300,6 +300,7 @@ enum {
>> CEPH_SESSION_FLUSHMSG_ACK,
>> CEPH_SESSION_FORCE_RO,
>> CEPH_SESSION_REJECT,
>> + CEPH_SESSION_REQUEST_FLUSH_MDLOG,
>> };
>>
>> extern const char *ceph_session_op_name(int op);
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 3/5] ceph: flush mdlog before umounting
2021-06-29 4:42 ` [PATCH 3/5] ceph: flush mdlog before umounting xiubli
2021-06-29 15:34 ` Jeff Layton
@ 2021-06-30 12:39 ` Ilya Dryomov
2021-07-01 1:18 ` Xiubo Li
1 sibling, 1 reply; 35+ messages in thread
From: Ilya Dryomov @ 2021-06-30 12:39 UTC (permalink / raw)
To: Xiubo Li; +Cc: Jeff Layton, Patrick Donnelly, Ceph Development
On Tue, Jun 29, 2021 at 6:42 AM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
> fs/ceph/mds_client.c | 29 +++++++++++++++++++++++++++++
> fs/ceph/mds_client.h | 1 +
> include/linux/ceph/ceph_fs.h | 1 +
> 3 files changed, 31 insertions(+)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 96bef289f58f..2db87a5c68d4 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4689,6 +4689,34 @@ static void wait_requests(struct ceph_mds_client *mdsc)
> dout("wait_requests done\n");
> }
>
> +static void send_flush_mdlog(struct ceph_mds_session *s)
> +{
> + u64 seq = s->s_seq;
> + struct ceph_msg *msg;
> +
> + /*
> + * For the MDS daemons lower than Luminous will crash when it
> + * saw this unknown session request.
"Pre-luminous MDS crashes when it sees an unknown session request"
> + */
> + if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
> + return;
> +
> + dout("send_flush_mdlog to mds%d (%s)s seq %lld\n",
Should (%s)s be just (%s)?
> + s->s_mds, ceph_session_state_name(s->s_state), seq);
> + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, seq);
> + if (!msg) {
> + pr_err("failed to send_flush_mdlog to mds%d (%s)s seq %lld\n",
Same here and let's avoid function names in error messages. Something
like "failed to request mdlog flush ...".
> + s->s_mds, ceph_session_state_name(s->s_state), seq);
> + } else {
> + ceph_con_send(&s->s_con, msg);
> + }
> +}
> +
> +void flush_mdlog(struct ceph_mds_client *mdsc)
> +{
> + ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
> +}
Is this wrapper really needed?
> +
> /*
> * called before mount is ro, and before dentries are torn down.
> * (hmm, does this still race with new lookups?)
> @@ -4698,6 +4726,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
> dout("pre_umount\n");
> mdsc->stopping = 1;
>
> + flush_mdlog(mdsc);
> ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
> ceph_flush_dirty_caps(mdsc);
> wait_requests(mdsc);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index fca2cf427eaf..79d5b8ed62bf 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -537,6 +537,7 @@ extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
> int (*cb)(struct inode *,
> struct ceph_cap *, void *),
> void *arg);
> +extern void flush_mdlog(struct ceph_mds_client *mdsc);
> extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
>
> static inline void ceph_mdsc_free_path(char *path, int len)
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index 57e5bd63fb7a..ae60696fe40b 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -300,6 +300,7 @@ enum {
> CEPH_SESSION_FLUSHMSG_ACK,
> CEPH_SESSION_FORCE_RO,
> CEPH_SESSION_REJECT,
> + CEPH_SESSION_REQUEST_FLUSH_MDLOG,
Need to update ceph_session_op_name().
Thanks,
Ilya
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 3/5] ceph: flush mdlog before umounting
2021-06-30 12:39 ` Ilya Dryomov
@ 2021-07-01 1:18 ` Xiubo Li
0 siblings, 0 replies; 35+ messages in thread
From: Xiubo Li @ 2021-07-01 1:18 UTC (permalink / raw)
To: Ilya Dryomov; +Cc: Jeff Layton, Patrick Donnelly, Ceph Development
On 6/30/21 8:39 PM, Ilya Dryomov wrote:
> On Tue, Jun 29, 2021 at 6:42 AM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>> fs/ceph/mds_client.c | 29 +++++++++++++++++++++++++++++
>> fs/ceph/mds_client.h | 1 +
>> include/linux/ceph/ceph_fs.h | 1 +
>> 3 files changed, 31 insertions(+)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 96bef289f58f..2db87a5c68d4 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -4689,6 +4689,34 @@ static void wait_requests(struct ceph_mds_client *mdsc)
>> dout("wait_requests done\n");
>> }
>>
>> +static void send_flush_mdlog(struct ceph_mds_session *s)
>> +{
>> + u64 seq = s->s_seq;
>> + struct ceph_msg *msg;
>> +
>> + /*
>> + * For the MDS daemons lower than Luminous will crash when it
>> + * saw this unknown session request.
> "Pre-luminous MDS crashes when it sees an unknown session request"
Will fix it.
>
>> + */
>> + if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
>> + return;
>> +
>> + dout("send_flush_mdlog to mds%d (%s)s seq %lld\n",
> Should (%s)s be just (%s)?
Will fix it.
>
>> + s->s_mds, ceph_session_state_name(s->s_state), seq);
>> + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, seq);
>> + if (!msg) {
>> + pr_err("failed to send_flush_mdlog to mds%d (%s)s seq %lld\n",
> Same here and let's avoid function names in error messages. Something
> like "failed to request mdlog flush ...".
Okay.
>
>> + s->s_mds, ceph_session_state_name(s->s_state), seq);
>> + } else {
>> + ceph_con_send(&s->s_con, msg);
>> + }
>> +}
>> +
>> +void flush_mdlog(struct ceph_mds_client *mdsc)
>> +{
>> + ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
>> +}
> Is this wrapper really needed?
Yeah, I can remove it.
>
>> +
>> /*
>> * called before mount is ro, and before dentries are torn down.
>> * (hmm, does this still race with new lookups?)
>> @@ -4698,6 +4726,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>> dout("pre_umount\n");
>> mdsc->stopping = 1;
>>
>> + flush_mdlog(mdsc);
>> ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
>> ceph_flush_dirty_caps(mdsc);
>> wait_requests(mdsc);
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index fca2cf427eaf..79d5b8ed62bf 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -537,6 +537,7 @@ extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
>> int (*cb)(struct inode *,
>> struct ceph_cap *, void *),
>> void *arg);
>> +extern void flush_mdlog(struct ceph_mds_client *mdsc);
>> extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
>>
>> static inline void ceph_mdsc_free_path(char *path, int len)
>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>> index 57e5bd63fb7a..ae60696fe40b 100644
>> --- a/include/linux/ceph/ceph_fs.h
>> +++ b/include/linux/ceph/ceph_fs.h
>> @@ -300,6 +300,7 @@ enum {
>> CEPH_SESSION_FLUSHMSG_ACK,
>> CEPH_SESSION_FORCE_RO,
>> CEPH_SESSION_REJECT,
>> + CEPH_SESSION_REQUEST_FLUSH_MDLOG,
> Need to update ceph_session_op_name().
Sure.
Thanks
BRs
> Thanks,
>
> Ilya
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH 4/5] ceph: flush the mdlog before waiting on unsafe reqs
2021-06-29 4:42 [PATCH 0/5] flush the mdlog before waiting on unsafe reqs xiubli
` (2 preceding siblings ...)
2021-06-29 4:42 ` [PATCH 3/5] ceph: flush mdlog before umounting xiubli
@ 2021-06-29 4:42 ` xiubli
2021-06-29 13:25 ` Jeff Layton
2021-06-29 4:42 ` [PATCH 5/5] ceph: fix ceph feature bits xiubli
2021-06-29 15:27 ` [PATCH 0/5] flush the mdlog before waiting on unsafe reqs Jeff Layton
5 siblings, 1 reply; 35+ messages in thread
From: xiubli @ 2021-06-29 4:42 UTC (permalink / raw)
To: jlayton; +Cc: idryomov, pdonnell, ceph-devel, Xiubo Li
From: Xiubo Li <xiubli@redhat.com>
For client requests that will get both unsafe and safe replies from
the MDS daemons, the MDS side will not flush the mdlog (journal log)
immediately, because it considers that unnecessary. That is true for
most cases, but not all — for example, the fsync request. The fsync
will wait until all the unsafe-replied requests have been safely
replied to.
Normally, if multiple threads or clients are running, the whole mdlog
in the MDS daemons will be flushed in time, because any request can
trigger the mdlog submit thread. So we usually will not see normal
operations stuck for a long time. But when only one client with a
single thread is running, the stall can be obvious, and in the worst
case it must wait up to 5 seconds for the mdlog to be flushed by the
MDS's periodic tick thread.
This patch manually triggers an mdlog flush in all the MDSes just
before waiting for the unsafe requests to finish.
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
fs/ceph/caps.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c6a3352a4d52..6e80e4649c7a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
*/
static int unsafe_request_wait(struct inode *inode)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
int ret, err = 0;
@@ -2305,6 +2306,14 @@ static int unsafe_request_wait(struct inode *inode)
}
spin_unlock(&ci->i_unsafe_lock);
+ /*
+ * Trigger to flush the journal logs in all the MDSes manually,
+ * or in the worst case we must wait at most 5 seconds to wait
+ * the journal logs to be flushed by the MDSes periodically.
+ */
+ if (req1 || req2)
+ flush_mdlog(mdsc);
+
dout("unsafe_request_wait %p wait on tid %llu %llu\n",
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) {
--
2.27.0
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH 4/5] ceph: flush the mdlog before waiting on unsafe reqs
2021-06-29 4:42 ` [PATCH 4/5] ceph: flush the mdlog before waiting on unsafe reqs xiubli
@ 2021-06-29 13:25 ` Jeff Layton
2021-06-30 1:26 ` Xiubo Li
0 siblings, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2021-06-29 13:25 UTC (permalink / raw)
To: xiubli; +Cc: idryomov, pdonnell, ceph-devel
On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
> For the client requests who will have unsafe and safe replies from
> MDS daemons, in the MDS side the MDS daemons won't flush the mdlog
> (journal log) immediatelly, because they think it's unnecessary.
> That's true for most cases but not all, likes the fsync request.
> The fsync will wait until all the unsafe replied requests to be
> safely replied.
>
> Normally if there have multiple threads or clients are running, the
> whole mdlog in MDS daemons could be flushed in time if any request
> will trigger the mdlog submit thread. So usually we won't experience
> the normal operations will stuck for a long time. But in case there
> has only one client with only thread is running, the stuck phenomenon
> maybe obvious and the worst case it must wait at most 5 seconds to
> wait the mdlog to be flushed by the MDS's tick thread periodically.
>
> This patch will trigger to flush the mdlog in all the MDSes manually
> just before waiting the unsafe requests to finish.
>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
> fs/ceph/caps.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index c6a3352a4d52..6e80e4649c7a 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
> */
> static int unsafe_request_wait(struct inode *inode)
> {
> + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> struct ceph_inode_info *ci = ceph_inode(inode);
> struct ceph_mds_request *req1 = NULL, *req2 = NULL;
> int ret, err = 0;
> @@ -2305,6 +2306,14 @@ static int unsafe_request_wait(struct inode *inode)
> }
> spin_unlock(&ci->i_unsafe_lock);
>
> + /*
> + * Trigger to flush the journal logs in all the MDSes manually,
> + * or in the worst case we must wait at most 5 seconds to wait
> + * the journal logs to be flushed by the MDSes periodically.
> + */
> + if (req1 || req2)
> + flush_mdlog(mdsc);
> +
So this is called on fsync(). Do we really need to flush all of the mds
logs on every fsync? That sounds like it might have some performance
impact. Would it be possible to just flush the mdslog on the MDS that's
authoritative for this inode?
> dout("unsafe_request_wait %p wait on tid %llu %llu\n",
> inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
> if (req1) {
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 4/5] ceph: flush the mdlog before waiting on unsafe reqs
2021-06-29 13:25 ` Jeff Layton
@ 2021-06-30 1:26 ` Xiubo Li
2021-06-30 12:13 ` Jeff Layton
0 siblings, 1 reply; 35+ messages in thread
From: Xiubo Li @ 2021-06-30 1:26 UTC (permalink / raw)
To: Jeff Layton; +Cc: idryomov, pdonnell, ceph-devel
On 6/29/21 9:25 PM, Jeff Layton wrote:
> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> For the client requests who will have unsafe and safe replies from
>> MDS daemons, in the MDS side the MDS daemons won't flush the mdlog
>> (journal log) immediatelly, because they think it's unnecessary.
>> That's true for most cases but not all, likes the fsync request.
>> The fsync will wait until all the unsafe replied requests to be
>> safely replied.
>>
>> Normally if there have multiple threads or clients are running, the
>> whole mdlog in MDS daemons could be flushed in time if any request
>> will trigger the mdlog submit thread. So usually we won't experience
>> the normal operations will stuck for a long time. But in case there
>> has only one client with only thread is running, the stuck phenomenon
>> maybe obvious and the worst case it must wait at most 5 seconds to
>> wait the mdlog to be flushed by the MDS's tick thread periodically.
>>
>> This patch will trigger to flush the mdlog in all the MDSes manually
>> just before waiting the unsafe requests to finish.
>>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>> fs/ceph/caps.c | 9 +++++++++
>> 1 file changed, 9 insertions(+)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index c6a3352a4d52..6e80e4649c7a 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
>> */
>> static int unsafe_request_wait(struct inode *inode)
>> {
>> + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
>> struct ceph_inode_info *ci = ceph_inode(inode);
>> struct ceph_mds_request *req1 = NULL, *req2 = NULL;
>> int ret, err = 0;
>> @@ -2305,6 +2306,14 @@ static int unsafe_request_wait(struct inode *inode)
>> }
>> spin_unlock(&ci->i_unsafe_lock);
>>
>> + /*
>> + * Trigger to flush the journal logs in all the MDSes manually,
>> + * or in the worst case we must wait at most 5 seconds to wait
>> + * the journal logs to be flushed by the MDSes periodically.
>> + */
>> + if (req1 || req2)
>> + flush_mdlog(mdsc);
>> +
> So this is called on fsync(). Do we really need to flush all of the mds
> logs on every fsync? That sounds like it might have some performance
> impact. Would it be possible to just flush the mdslog on the MDS that's
> authoritative for this inode?
I hit one case before: mds.0 was the auth MDS, but the client sent
the request to mds.2, and when mds.2 tried to gather the rdlocks it
got stuck waiting for mds.0 to flush the mdlog.
I think mds.0 could also get stuck just like this even though it is
the auth MDS.
Normally the mdlog submit thread is triggered by each MDS's tick,
which is every 5 seconds. But that is not always the case, because
any other client request could trigger the mdlog submit thread to
run at any time. Since fsync is not running all the time, IMO the
performance impact should be acceptable.
>
>> dout("unsafe_request_wait %p wait on tid %llu %llu\n",
>> inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
>> if (req1) {
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 4/5] ceph: flush the mdlog before waiting on unsafe reqs
2021-06-30 1:26 ` Xiubo Li
@ 2021-06-30 12:13 ` Jeff Layton
2021-07-01 1:16 ` Xiubo Li
0 siblings, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2021-06-30 12:13 UTC (permalink / raw)
To: Xiubo Li; +Cc: idryomov, pdonnell, ceph-devel
On Wed, 2021-06-30 at 09:26 +0800, Xiubo Li wrote:
> On 6/29/21 9:25 PM, Jeff Layton wrote:
> > On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> > > From: Xiubo Li <xiubli@redhat.com>
> > >
> > > For the client requests who will have unsafe and safe replies from
> > > MDS daemons, in the MDS side the MDS daemons won't flush the mdlog
> > > (journal log) immediatelly, because they think it's unnecessary.
> > > That's true for most cases but not all, likes the fsync request.
> > > The fsync will wait until all the unsafe replied requests to be
> > > safely replied.
> > >
> > > Normally if there have multiple threads or clients are running, the
> > > whole mdlog in MDS daemons could be flushed in time if any request
> > > will trigger the mdlog submit thread. So usually we won't experience
> > > the normal operations will stuck for a long time. But in case there
> > > has only one client with only thread is running, the stuck phenomenon
> > > maybe obvious and the worst case it must wait at most 5 seconds to
> > > wait the mdlog to be flushed by the MDS's tick thread periodically.
> > >
> > > This patch will trigger to flush the mdlog in all the MDSes manually
> > > just before waiting the unsafe requests to finish.
> > >
> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > ---
> > > fs/ceph/caps.c | 9 +++++++++
> > > 1 file changed, 9 insertions(+)
> > >
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index c6a3352a4d52..6e80e4649c7a 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
> > > */
> > > static int unsafe_request_wait(struct inode *inode)
> > > {
> > > + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> > > struct ceph_inode_info *ci = ceph_inode(inode);
> > > struct ceph_mds_request *req1 = NULL, *req2 = NULL;
> > > int ret, err = 0;
> > > @@ -2305,6 +2306,14 @@ static int unsafe_request_wait(struct inode *inode)
> > > }
> > > spin_unlock(&ci->i_unsafe_lock);
> > >
> > > + /*
> > > + * Trigger to flush the journal logs in all the MDSes manually,
> > > + * or in the worst case we must wait at most 5 seconds to wait
> > > + * the journal logs to be flushed by the MDSes periodically.
> > > + */
> > > + if (req1 || req2)
> > > + flush_mdlog(mdsc);
> > > +
> > So this is called on fsync(). Do we really need to flush all of the mds
> > logs on every fsync? That sounds like it might have some performance
> > impact. Would it be possible to just flush the mdslog on the MDS that's
> > authoritative for this inode?
>
> I hit one case before, the mds.0 is the auth mds, but the client just
> sent the request to mds.2, then when the mds.2 tried to gather the
> rdlocks then it was stuck for waiting for the mds.0 to flush the mdlog.
> I think it also will happen that if the mds.0 could also be stuck just
> like this even its the auth MDS.
>
It sounds like mds.0 should flush its own mdlog in this situation once
mds.2 started requesting locks that mds.0 was holding. Shouldn't it?
> Normally the mdlog submit thread will be triggered per MDS's tick,
> that's 5 seconds. But this is not always true mostly because any other
> client request could trigger the mdlog submit thread to run at any time.
> Since the fsync is not running all the time, so IMO the performance
> impact should be okay.
>
>
I'm not sure I'm convinced.
Consider a situation where we have a large(ish) ceph cluster with
several MDSs. One client is writing to a file that is on mds.0 and there
is little other activity there. Several other clients are doing heavy
I/O on other inodes (of which mds.1 is auth).
The first client then calls fsync, and now the other clients stall for a
bit while mds.1 unnecessarily flushes its mdlog. I think we need to take
care to only flush the mdlog for mds's that we care about here.
> >
> > > dout("unsafe_request_wait %p wait on tid %llu %llu\n",
> > > inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
> > > if (req1) {
>
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 4/5] ceph: flush the mdlog before waiting on unsafe reqs
2021-06-30 12:13 ` Jeff Layton
@ 2021-07-01 1:16 ` Xiubo Li
2021-07-01 3:27 ` Patrick Donnelly
0 siblings, 1 reply; 35+ messages in thread
From: Xiubo Li @ 2021-07-01 1:16 UTC (permalink / raw)
To: Jeff Layton; +Cc: idryomov, pdonnell, ceph-devel
On 6/30/21 8:13 PM, Jeff Layton wrote:
> On Wed, 2021-06-30 at 09:26 +0800, Xiubo Li wrote:
>> On 6/29/21 9:25 PM, Jeff Layton wrote:
>>> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> For the client requests who will have unsafe and safe replies from
>>>> MDS daemons, in the MDS side the MDS daemons won't flush the mdlog
>>>> (journal log) immediatelly, because they think it's unnecessary.
>>>> That's true for most cases but not all, likes the fsync request.
>>>> The fsync will wait until all the unsafe replied requests to be
>>>> safely replied.
>>>>
>>>> Normally if there have multiple threads or clients are running, the
>>>> whole mdlog in MDS daemons could be flushed in time if any request
>>>> will trigger the mdlog submit thread. So usually we won't experience
>>>> the normal operations will stuck for a long time. But in case there
>>>> has only one client with only thread is running, the stuck phenomenon
>>>> maybe obvious and the worst case it must wait at most 5 seconds to
>>>> wait the mdlog to be flushed by the MDS's tick thread periodically.
>>>>
>>>> This patch will trigger to flush the mdlog in all the MDSes manually
>>>> just before waiting the unsafe requests to finish.
>>>>
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>> fs/ceph/caps.c | 9 +++++++++
>>>> 1 file changed, 9 insertions(+)
>>>>
>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>>> index c6a3352a4d52..6e80e4649c7a 100644
>>>> --- a/fs/ceph/caps.c
>>>> +++ b/fs/ceph/caps.c
>>>> @@ -2286,6 +2286,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
>>>> */
>>>> static int unsafe_request_wait(struct inode *inode)
>>>> {
>>>> + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
>>>> struct ceph_inode_info *ci = ceph_inode(inode);
>>>> struct ceph_mds_request *req1 = NULL, *req2 = NULL;
>>>> int ret, err = 0;
>>>> @@ -2305,6 +2306,14 @@ static int unsafe_request_wait(struct inode *inode)
>>>> }
>>>> spin_unlock(&ci->i_unsafe_lock);
>>>>
>>>> + /*
>>>> + * Trigger to flush the journal logs in all the MDSes manually,
>>>> + * or in the worst case we must wait at most 5 seconds to wait
>>>> + * the journal logs to be flushed by the MDSes periodically.
>>>> + */
>>>> + if (req1 || req2)
>>>> + flush_mdlog(mdsc);
>>>> +
>>> So this is called on fsync(). Do we really need to flush all of the mds
>>> logs on every fsync? That sounds like it might have some performance
>>> impact. Would it be possible to just flush the mdslog on the MDS that's
>>> authoritative for this inode?
>> I hit one case before, the mds.0 is the auth mds, but the client just
>> sent the request to mds.2, then when the mds.2 tried to gather the
>> rdlocks then it was stuck for waiting for the mds.0 to flush the mdlog.
>> I think it also will happen that if the mds.0 could also be stuck just
>> like this even its the auth MDS.
>>
> It sounds like mds.0 should flush its own mdlog in this situation once
> mds.2 started requesting locks that mds.0 was holding. Shouldn't it?
Yeah, it is. I have fixed this case. I am not sure whether there are
other situations just like this, but they should be bugs in the MDS.
>
>> Normally the mdlog submit thread will be triggered per MDS's tick,
>> that's 5 seconds. But this is not always true mostly because any other
>> client request could trigger the mdlog submit thread to run at any time.
>> Since the fsync is not running all the time, so IMO the performance
>> impact should be okay.
>>
>>
> I'm not sure I'm convinced.
>
> Consider a situation where we have a large(ish) ceph cluster with
> several MDSs. One client is writing to a file that is on mds.0 and there
> is little other activity there. Several other clients are doing heavy
> I/O on other inodes (of which mds.1 is auth).
>
> The first client then calls fsync, and now the other clients stall for a
> bit while mds.1 unnecessarily flushes its mdlog. I think we need to take
> care to only flush the mdlog for mds's that we care about here.
Okay, apart from the case I mentioned above, I did not find any case
that would prevent us from doing this.
Let me test it more by flushing the mdlog only on the auth MDS.
>
>
>>>> dout("unsafe_request_wait %p wait on tid %llu %llu\n",
>>>> inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
>>>> if (req1) {
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 4/5] ceph: flush the mdlog before waiting on unsafe reqs
2021-07-01 1:16 ` Xiubo Li
@ 2021-07-01 3:27 ` Patrick Donnelly
[not found] ` <e917a3e1-2902-604b-5154-98086c95357f@redhat.com>
0 siblings, 1 reply; 35+ messages in thread
From: Patrick Donnelly @ 2021-07-01 3:27 UTC (permalink / raw)
To: Xiubo Li; +Cc: Jeff Layton, Ilya Dryomov, Ceph Development
Hi Xiubo,
On Wed, Jun 30, 2021 at 6:16 PM Xiubo Li <xiubli@redhat.com> wrote:
> >> Normally the mdlog submit thread will be triggered per MDS's tick,
> >> that's 5 seconds. But this is not always true mostly because any other
> >> client request could trigger the mdlog submit thread to run at any time.
> >> Since the fsync is not running all the time, so IMO the performance
> >> impact should be okay.
> >>
> >>
> > I'm not sure I'm convinced.
> >
> > Consider a situation where we have a large(ish) ceph cluster with
> > several MDSs. One client is writing to a file that is on mds.0 and there
> > is little other activity there. Several other clients are doing heavy
> > I/O on other inodes (of which mds.1 is auth).
> >
> > The first client then calls fsync, and now the other clients stall for a
> > bit while mds.1 unnecessarily flushes its mdlog. I think we need to take
> > care to only flush the mdlog for mds's that we care about here.
>
> Okay, except the above case I mentioned I didn't find any case that
> could prevent us doing this.
>
> Let me test more about it by just flushing the mdlog in auth MDS.
I think Jeff raises a good point. I looked at the history of the
flush_mdlog session command. It was used to implement syncfs which
necessarily requires all MDS to flush their journals (at least those
MDS communicating with the client).
During my testing of the original bug I found that running "stat ."
prior to fsync caused the hang to go away. This is because the stat
forced the MDS to flush its log in order to issue new caps to the
client. I think we need to understand that behavior better: the MDS is
revoking caps on the client to execute the rename RPC. It is probably
sufficient to change fsync to getattr appropriate (read/shared) caps
instead of flushing the MDS journal.
Your time on adding flush_mdlog is not wasted; I think a good followup
patch is to add syncfs support the same way ceph-fuse does.
--
Patrick Donnelly, Ph.D.
He / Him / His
Principal Software Engineer
Red Hat Sunnyvale, CA
GPG: 19F28A586F808C2402351B93C3301A3E258DD79D
^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH 5/5] ceph: fix ceph feature bits
2021-06-29 4:42 [PATCH 0/5] flush the mdlog before waiting on unsafe reqs xiubli
` (3 preceding siblings ...)
2021-06-29 4:42 ` [PATCH 4/5] ceph: flush the mdlog before waiting on unsafe reqs xiubli
@ 2021-06-29 4:42 ` xiubli
2021-06-29 15:38 ` Jeff Layton
2021-06-29 15:27 ` [PATCH 0/5] flush the mdlog before waiting on unsafe reqs Jeff Layton
5 siblings, 1 reply; 35+ messages in thread
From: xiubli @ 2021-06-29 4:42 UTC (permalink / raw)
To: jlayton; +Cc: idryomov, pdonnell, ceph-devel, Xiubo Li
From: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
fs/ceph/mds_client.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 79d5b8ed62bf..b18eded84ede 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -27,7 +27,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_RECLAIM_CLIENT,
CEPHFS_FEATURE_LAZY_CAP_WANTED,
CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_NAUTILUS,
CEPHFS_FEATURE_DELEG_INO,
+ CEPHFS_FEATURE_OCTOPUS,
CEPHFS_FEATURE_METRIC_COLLECT,
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
@@ -43,7 +45,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
+ CEPHFS_FEATURE_NAUTILUS, \
CEPHFS_FEATURE_DELEG_INO, \
+ CEPHFS_FEATURE_OCTOPUS, \
CEPHFS_FEATURE_METRIC_COLLECT, \
\
CEPHFS_FEATURE_MAX, \
--
2.27.0
^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH 5/5] ceph: fix ceph feature bits
2021-06-29 4:42 ` [PATCH 5/5] ceph: fix ceph feature bits xiubli
@ 2021-06-29 15:38 ` Jeff Layton
2021-06-30 0:52 ` Xiubo Li
0 siblings, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2021-06-29 15:38 UTC (permalink / raw)
To: xiubli; +Cc: idryomov, pdonnell, ceph-devel
On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
> fs/ceph/mds_client.h | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 79d5b8ed62bf..b18eded84ede 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -27,7 +27,9 @@ enum ceph_feature_type {
> CEPHFS_FEATURE_RECLAIM_CLIENT,
> CEPHFS_FEATURE_LAZY_CAP_WANTED,
> CEPHFS_FEATURE_MULTI_RECONNECT,
> + CEPHFS_FEATURE_NAUTILUS,
> CEPHFS_FEATURE_DELEG_INO,
> + CEPHFS_FEATURE_OCTOPUS,
> CEPHFS_FEATURE_METRIC_COLLECT,
>
> CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> @@ -43,7 +45,9 @@ enum ceph_feature_type {
> CEPHFS_FEATURE_REPLY_ENCODING, \
> CEPHFS_FEATURE_LAZY_CAP_WANTED, \
> CEPHFS_FEATURE_MULTI_RECONNECT, \
> + CEPHFS_FEATURE_NAUTILUS, \
> CEPHFS_FEATURE_DELEG_INO, \
> + CEPHFS_FEATURE_OCTOPUS, \
> CEPHFS_FEATURE_METRIC_COLLECT, \
> \
> CEPHFS_FEATURE_MAX, \
Do we need this? I thought we had decided to deprecate the whole concept
of release-based feature flags.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 5/5] ceph: fix ceph feature bits
2021-06-29 15:38 ` Jeff Layton
@ 2021-06-30 0:52 ` Xiubo Li
2021-06-30 12:05 ` Jeff Layton
0 siblings, 1 reply; 35+ messages in thread
From: Xiubo Li @ 2021-06-30 0:52 UTC (permalink / raw)
To: Jeff Layton; +Cc: idryomov, pdonnell, ceph-devel
On 6/29/21 11:38 PM, Jeff Layton wrote:
> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>> fs/ceph/mds_client.h | 4 ++++
>> 1 file changed, 4 insertions(+)
>>
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index 79d5b8ed62bf..b18eded84ede 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -27,7 +27,9 @@ enum ceph_feature_type {
>> CEPHFS_FEATURE_RECLAIM_CLIENT,
>> CEPHFS_FEATURE_LAZY_CAP_WANTED,
>> CEPHFS_FEATURE_MULTI_RECONNECT,
>> + CEPHFS_FEATURE_NAUTILUS,
>> CEPHFS_FEATURE_DELEG_INO,
>> + CEPHFS_FEATURE_OCTOPUS,
>> CEPHFS_FEATURE_METRIC_COLLECT,
>>
>> CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>> @@ -43,7 +45,9 @@ enum ceph_feature_type {
>> CEPHFS_FEATURE_REPLY_ENCODING, \
>> CEPHFS_FEATURE_LAZY_CAP_WANTED, \
>> CEPHFS_FEATURE_MULTI_RECONNECT, \
>> + CEPHFS_FEATURE_NAUTILUS, \
>> CEPHFS_FEATURE_DELEG_INO, \
>> + CEPHFS_FEATURE_OCTOPUS, \
>> CEPHFS_FEATURE_METRIC_COLLECT, \
>> \
>> CEPHFS_FEATURE_MAX, \
> Do we need this? I thought we had decided to deprecate the whole concept
> of release-based feature flags.
This was inconsistent with the MDS side, which means that if the MDS only
supports CEPHFS_FEATURE_DELEG_INO at most in a lower version of the ceph
cluster, then the kclients will try to send the metric messages to
MDSes, which could crash the MDS daemons.
For the ceph version feature flags, they are redundant since we can check
this from the con's; since Pacific the MDS code stopped updating them. I
assume we should deprecate them since Pacific?
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 5/5] ceph: fix ceph feature bits
2021-06-30 0:52 ` Xiubo Li
@ 2021-06-30 12:05 ` Jeff Layton
2021-06-30 12:52 ` Ilya Dryomov
0 siblings, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2021-06-30 12:05 UTC (permalink / raw)
To: Xiubo Li; +Cc: idryomov, pdonnell, ceph-devel
On Wed, 2021-06-30 at 08:52 +0800, Xiubo Li wrote:
> On 6/29/21 11:38 PM, Jeff Layton wrote:
> > On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> > > From: Xiubo Li <xiubli@redhat.com>
> > >
> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > ---
> > > fs/ceph/mds_client.h | 4 ++++
> > > 1 file changed, 4 insertions(+)
> > >
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index 79d5b8ed62bf..b18eded84ede 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -27,7 +27,9 @@ enum ceph_feature_type {
> > > CEPHFS_FEATURE_RECLAIM_CLIENT,
> > > CEPHFS_FEATURE_LAZY_CAP_WANTED,
> > > CEPHFS_FEATURE_MULTI_RECONNECT,
> > > + CEPHFS_FEATURE_NAUTILUS,
> > > CEPHFS_FEATURE_DELEG_INO,
> > > + CEPHFS_FEATURE_OCTOPUS,
> > > CEPHFS_FEATURE_METRIC_COLLECT,
> > >
> > > CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > > @@ -43,7 +45,9 @@ enum ceph_feature_type {
> > > CEPHFS_FEATURE_REPLY_ENCODING, \
> > > CEPHFS_FEATURE_LAZY_CAP_WANTED, \
> > > CEPHFS_FEATURE_MULTI_RECONNECT, \
> > > + CEPHFS_FEATURE_NAUTILUS, \
> > > CEPHFS_FEATURE_DELEG_INO, \
> > > + CEPHFS_FEATURE_OCTOPUS, \
> > > CEPHFS_FEATURE_METRIC_COLLECT, \
> > > \
> > > CEPHFS_FEATURE_MAX, \
> > Do we need this? I thought we had decided to deprecate the whole concept
> > of release-based feature flags.
>
> This was inconsistent with the MDS side, that means if the MDS only
> support CEPHFS_FEATURE_DELEG_INO at most in lower version of ceph
> cluster, then the kclients will try to send the metric messages to
> MDSes, which could crash the MDS daemons.
>
> For the ceph version feature flags they are redundant since we can check
> this from the con's, since pacific the MDS code stopped updating it. I
> assume we should deprecate it since Pacific ?
>
I believe so. Basically the version-based features aren't terribly
useful. Mostly we want to check feature flags for specific features
themselves. Since there are no other occurrences of
CEPHFS_FEATURE_NAUTILUS or CEPHFS_FEATURE_OCTOPUS symbols, it's probably
best not to define them at all.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 5/5] ceph: fix ceph feature bits
2021-06-30 12:05 ` Jeff Layton
@ 2021-06-30 12:52 ` Ilya Dryomov
2021-07-01 1:07 ` Xiubo Li
` (2 more replies)
0 siblings, 3 replies; 35+ messages in thread
From: Ilya Dryomov @ 2021-06-30 12:52 UTC (permalink / raw)
To: Jeff Layton; +Cc: Xiubo Li, Patrick Donnelly, Ceph Development
On Wed, Jun 30, 2021 at 2:05 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Wed, 2021-06-30 at 08:52 +0800, Xiubo Li wrote:
> > On 6/29/21 11:38 PM, Jeff Layton wrote:
> > > On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> > > > From: Xiubo Li <xiubli@redhat.com>
> > > >
> > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > > ---
> > > > fs/ceph/mds_client.h | 4 ++++
> > > > 1 file changed, 4 insertions(+)
> > > >
> > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > index 79d5b8ed62bf..b18eded84ede 100644
> > > > --- a/fs/ceph/mds_client.h
> > > > +++ b/fs/ceph/mds_client.h
> > > > @@ -27,7 +27,9 @@ enum ceph_feature_type {
> > > > CEPHFS_FEATURE_RECLAIM_CLIENT,
> > > > CEPHFS_FEATURE_LAZY_CAP_WANTED,
> > > > CEPHFS_FEATURE_MULTI_RECONNECT,
> > > > + CEPHFS_FEATURE_NAUTILUS,
> > > > CEPHFS_FEATURE_DELEG_INO,
> > > > + CEPHFS_FEATURE_OCTOPUS,
> > > > CEPHFS_FEATURE_METRIC_COLLECT,
> > > >
> > > > CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > > > @@ -43,7 +45,9 @@ enum ceph_feature_type {
> > > > CEPHFS_FEATURE_REPLY_ENCODING, \
> > > > CEPHFS_FEATURE_LAZY_CAP_WANTED, \
> > > > CEPHFS_FEATURE_MULTI_RECONNECT, \
> > > > + CEPHFS_FEATURE_NAUTILUS, \
> > > > CEPHFS_FEATURE_DELEG_INO, \
> > > > + CEPHFS_FEATURE_OCTOPUS, \
> > > > CEPHFS_FEATURE_METRIC_COLLECT, \
> > > > \
> > > > CEPHFS_FEATURE_MAX, \
> > > Do we need this? I thought we had decided to deprecate the whole concept
> > > of release-based feature flags.
> >
> > This was inconsistent with the MDS side, that means if the MDS only
> > support CEPHFS_FEATURE_DELEG_INO at most in lower version of ceph
> > cluster, then the kclients will try to send the metric messages to
> > MDSes, which could crash the MDS daemons.
> >
> > For the ceph version feature flags they are redundant since we can check
> > this from the con's, since pacific the MDS code stopped updating it. I
> > assume we should deprecate it since Pacific ?
> >
>
> I believe so. Basically the version-based features aren't terribly
> useful. Mostly we want to check feature flags for specific features
> themselves. Since there are no other occurrences of
> CEPHFS_FEATURE_NAUTILUS or CEPHFS_FEATURE_OCTOPUS symbols, it's probably
> best not to define them at all.
Not only that but this patch as is would break CEPHFS_FEATURE_DELEG_INO
and CEPHFS_FEATURE_METRIC_COLLECT checks in the kernel because their bit
numbers would change...
Thanks,
Ilya
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 5/5] ceph: fix ceph feature bits
2021-06-30 12:52 ` Ilya Dryomov
@ 2021-07-01 1:07 ` Xiubo Li
2021-07-01 1:08 ` Xiubo Li
2021-07-01 3:35 ` Xiubo Li
2 siblings, 0 replies; 35+ messages in thread
From: Xiubo Li @ 2021-07-01 1:07 UTC (permalink / raw)
To: Ilya Dryomov, Jeff Layton; +Cc: Patrick Donnelly, Ceph Development
On 6/30/21 8:52 PM, Ilya Dryomov wrote:
> On Wed, Jun 30, 2021 at 2:05 PM Jeff Layton <jlayton@kernel.org> wrote:
>> On Wed, 2021-06-30 at 08:52 +0800, Xiubo Li wrote:
>>> On 6/29/21 11:38 PM, Jeff Layton wrote:
>>>> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>>
>>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>>> ---
>>>>> fs/ceph/mds_client.h | 4 ++++
>>>>> 1 file changed, 4 insertions(+)
>>>>>
>>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>>> index 79d5b8ed62bf..b18eded84ede 100644
>>>>> --- a/fs/ceph/mds_client.h
>>>>> +++ b/fs/ceph/mds_client.h
>>>>> @@ -27,7 +27,9 @@ enum ceph_feature_type {
>>>>> CEPHFS_FEATURE_RECLAIM_CLIENT,
>>>>> CEPHFS_FEATURE_LAZY_CAP_WANTED,
>>>>> CEPHFS_FEATURE_MULTI_RECONNECT,
>>>>> + CEPHFS_FEATURE_NAUTILUS,
>>>>> CEPHFS_FEATURE_DELEG_INO,
>>>>> + CEPHFS_FEATURE_OCTOPUS,
>>>>> CEPHFS_FEATURE_METRIC_COLLECT,
>>>>>
>>>>> CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>>>>> @@ -43,7 +45,9 @@ enum ceph_feature_type {
>>>>> CEPHFS_FEATURE_REPLY_ENCODING, \
>>>>> CEPHFS_FEATURE_LAZY_CAP_WANTED, \
>>>>> CEPHFS_FEATURE_MULTI_RECONNECT, \
>>>>> + CEPHFS_FEATURE_NAUTILUS, \
>>>>> CEPHFS_FEATURE_DELEG_INO, \
>>>>> + CEPHFS_FEATURE_OCTOPUS, \
>>>>> CEPHFS_FEATURE_METRIC_COLLECT, \
>>>>> \
>>>>> CEPHFS_FEATURE_MAX, \
>>>> Do we need this? I thought we had decided to deprecate the whole concept
>>>> of release-based feature flags.
>>> This was inconsistent with the MDS side, that means if the MDS only
>>> support CEPHFS_FEATURE_DELEG_INO at most in lower version of ceph
>>> cluster, then the kclients will try to send the metric messages to
>>> MDSes, which could crash the MDS daemons.
>>>
>>> For the ceph version feature flags they are redundant since we can check
>>> this from the con's, since pacific the MDS code stopped updating it. I
>>> assume we should deprecate it since Pacific ?
>>>
>> I believe so. Basically the version-based features aren't terribly
>> useful. Mostly we want to check feature flags for specific features
>> themselves. Since there are no other occurrences of
>> CEPHFS_FEATURE_NAUTILUS or CEPHFS_FEATURE_OCTOPUS symbols, it's probably
>> best not to define them at all.
Without adding these two symbols, we could just correct their numbers —
does that make sense?
> Not only that but this patch as is would break CEPHFS_FEATURE_DELEG_INO
> and CEPHFS_FEATURE_METRIC_COLLECT checks in the kernel because their bit
> numbers would change...
Yeah, the problem is that their numbers were incorrect, which was
introducing crash bugs.
Thanks
Xiubo
> Thanks,
>
> Ilya
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 5/5] ceph: fix ceph feature bits
2021-06-30 12:52 ` Ilya Dryomov
2021-07-01 1:07 ` Xiubo Li
@ 2021-07-01 1:08 ` Xiubo Li
2021-07-01 3:35 ` Xiubo Li
2 siblings, 0 replies; 35+ messages in thread
From: Xiubo Li @ 2021-07-01 1:08 UTC (permalink / raw)
To: Ilya Dryomov, Jeff Layton; +Cc: Patrick Donnelly, Ceph Development
On 6/30/21 8:52 PM, Ilya Dryomov wrote:
> On Wed, Jun 30, 2021 at 2:05 PM Jeff Layton <jlayton@kernel.org> wrote:
>> On Wed, 2021-06-30 at 08:52 +0800, Xiubo Li wrote:
>>> On 6/29/21 11:38 PM, Jeff Layton wrote:
>>>> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>>
>>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>>> ---
>>>>> fs/ceph/mds_client.h | 4 ++++
>>>>> 1 file changed, 4 insertions(+)
>>>>>
>>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>>> index 79d5b8ed62bf..b18eded84ede 100644
>>>>> --- a/fs/ceph/mds_client.h
>>>>> +++ b/fs/ceph/mds_client.h
>>>>> @@ -27,7 +27,9 @@ enum ceph_feature_type {
>>>>> CEPHFS_FEATURE_RECLAIM_CLIENT,
>>>>> CEPHFS_FEATURE_LAZY_CAP_WANTED,
>>>>> CEPHFS_FEATURE_MULTI_RECONNECT,
>>>>> + CEPHFS_FEATURE_NAUTILUS,
>>>>> CEPHFS_FEATURE_DELEG_INO,
>>>>> + CEPHFS_FEATURE_OCTOPUS,
>>>>> CEPHFS_FEATURE_METRIC_COLLECT,
>>>>>
>>>>> CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>>>>> @@ -43,7 +45,9 @@ enum ceph_feature_type {
>>>>> CEPHFS_FEATURE_REPLY_ENCODING, \
>>>>> CEPHFS_FEATURE_LAZY_CAP_WANTED, \
>>>>> CEPHFS_FEATURE_MULTI_RECONNECT, \
>>>>> + CEPHFS_FEATURE_NAUTILUS, \
>>>>> CEPHFS_FEATURE_DELEG_INO, \
>>>>> + CEPHFS_FEATURE_OCTOPUS, \
>>>>> CEPHFS_FEATURE_METRIC_COLLECT, \
>>>>> \
>>>>> CEPHFS_FEATURE_MAX, \
>>>> Do we need this? I thought we had decided to deprecate the whole concept
>>>> of release-based feature flags.
>>> This was inconsistent with the MDS side, that means if the MDS only
>>> support CEPHFS_FEATURE_DELEG_INO at most in lower version of ceph
>>> cluster, then the kclients will try to send the metric messages to
>>> MDSes, which could crash the MDS daemons.
>>>
>>> For the ceph version feature flags they are redundant since we can check
>>> this from the con's, since pacific the MDS code stopped updating it. I
>>> assume we should deprecate it since Pacific ?
>>>
>> I believe so. Basically the version-based features aren't terribly
>> useful. Mostly we want to check feature flags for specific features
>> themselves. Since there are no other occurrences of
>> CEPHFS_FEATURE_NAUTILUS or CEPHFS_FEATURE_OCTOPUS symbols, it's probably
>> best not to define them at all.
Without adding these two symbols, we could just correct their numbers —
does that make sense?
> Not only that but this patch as is would break CEPHFS_FEATURE_DELEG_INO
> and CEPHFS_FEATURE_METRIC_COLLECT checks in the kernel because their bit
> numbers would change...
Yeah, the problem is that their numbers were incorrect, which was
introducing crash bugs.
Thanks
Xiubo
> Thanks,
>
> Ilya
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 5/5] ceph: fix ceph feature bits
2021-06-30 12:52 ` Ilya Dryomov
2021-07-01 1:07 ` Xiubo Li
2021-07-01 1:08 ` Xiubo Li
@ 2021-07-01 3:35 ` Xiubo Li
2 siblings, 0 replies; 35+ messages in thread
From: Xiubo Li @ 2021-07-01 3:35 UTC (permalink / raw)
To: Ilya Dryomov, Jeff Layton; +Cc: Patrick Donnelly, Ceph Development
On 6/30/21 8:52 PM, Ilya Dryomov wrote:
> On Wed, Jun 30, 2021 at 2:05 PM Jeff Layton <jlayton@kernel.org> wrote:
>> On Wed, 2021-06-30 at 08:52 +0800, Xiubo Li wrote:
>>> On 6/29/21 11:38 PM, Jeff Layton wrote:
>>>> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>>
>>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>>> ---
>>>>> fs/ceph/mds_client.h | 4 ++++
>>>>> 1 file changed, 4 insertions(+)
>>>>>
>>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>>> index 79d5b8ed62bf..b18eded84ede 100644
>>>>> --- a/fs/ceph/mds_client.h
>>>>> +++ b/fs/ceph/mds_client.h
>>>>> @@ -27,7 +27,9 @@ enum ceph_feature_type {
>>>>> CEPHFS_FEATURE_RECLAIM_CLIENT,
>>>>> CEPHFS_FEATURE_LAZY_CAP_WANTED,
>>>>> CEPHFS_FEATURE_MULTI_RECONNECT,
>>>>> + CEPHFS_FEATURE_NAUTILUS,
>>>>> CEPHFS_FEATURE_DELEG_INO,
>>>>> + CEPHFS_FEATURE_OCTOPUS,
>>>>> CEPHFS_FEATURE_METRIC_COLLECT,
>>>>>
>>>>> CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>>>>> @@ -43,7 +45,9 @@ enum ceph_feature_type {
>>>>> CEPHFS_FEATURE_REPLY_ENCODING, \
>>>>> CEPHFS_FEATURE_LAZY_CAP_WANTED, \
>>>>> CEPHFS_FEATURE_MULTI_RECONNECT, \
>>>>> + CEPHFS_FEATURE_NAUTILUS, \
>>>>> CEPHFS_FEATURE_DELEG_INO, \
>>>>> + CEPHFS_FEATURE_OCTOPUS, \
>>>>> CEPHFS_FEATURE_METRIC_COLLECT, \
>>>>> \
>>>>> CEPHFS_FEATURE_MAX, \
>>>> Do we need this? I thought we had decided to deprecate the whole concept
>>>> of release-based feature flags.
>>> This was inconsistent with the MDS side, that means if the MDS only
>>> support CEPHFS_FEATURE_DELEG_INO at most in lower version of ceph
>>> cluster, then the kclients will try to send the metric messages to
>>> MDSes, which could crash the MDS daemons.
>>>
>>> For the ceph version feature flags they are redundant since we can check
>>> this from the con's, since pacific the MDS code stopped updating it. I
>>> assume we should deprecate it since Pacific ?
>>>
>> I believe so. Basically the version-based features aren't terribly
>> useful. Mostly we want to check feature flags for specific features
>> themselves. Since there are no other occurrences of
>> CEPHFS_FEATURE_NAUTILUS or CEPHFS_FEATURE_OCTOPUS symbols, it's probably
>> best not to define them at all.
> Not only that but this patch as is would break CEPHFS_FEATURE_DELEG_INO
> and CEPHFS_FEATURE_METRIC_COLLECT checks in the kernel because their bit
> numbers would change...
Sorry, please ignore this patch.
In the mds side:
#define CEPHFS_FEATURE_MULTI_RECONNECT 12
#define CEPHFS_FEATURE_NAUTILUS 12
#define CEPHFS_FEATURE_DELEG_INO 13
#define CEPHFS_FEATURE_OCTOPUS 13
#define CEPHFS_FEATURE_METRIC_COLLECT 14
#define CEPHFS_FEATURE_ALTERNATE_NAME 15
#define CEPHFS_FEATURE_MAX 15
So, this fix makes no sense any more.
BRs
> Thanks,
>
> Ilya
>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 0/5] flush the mdlog before waiting on unsafe reqs
2021-06-29 4:42 [PATCH 0/5] flush the mdlog before waiting on unsafe reqs xiubli
` (4 preceding siblings ...)
2021-06-29 4:42 ` [PATCH 5/5] ceph: fix ceph feature bits xiubli
@ 2021-06-29 15:27 ` Jeff Layton
2021-06-30 0:35 ` Xiubo Li
5 siblings, 1 reply; 35+ messages in thread
From: Jeff Layton @ 2021-06-29 15:27 UTC (permalink / raw)
To: xiubli; +Cc: idryomov, pdonnell, ceph-devel
On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
> For the client requests who will have unsafe and safe replies from
> MDS daemons, in the MDS side the MDS daemons won't flush the mdlog
> (journal log) immediatelly, because they think it's unnecessary.
> That's true for most cases but not all, likes the fsync request.
> The fsync will wait until all the unsafe replied requests to be
> safely replied.
>
> Normally if there have multiple threads or clients are running, the
> whole mdlog in MDS daemons could be flushed in time if any request
> will trigger the mdlog submit thread. So usually we won't experience
> the normal operations will stuck for a long time. But in case there
> has only one client with only thread is running, the stuck phenomenon
> maybe obvious and the worst case it must wait at most 5 seconds to
> wait the mdlog to be flushed by the MDS's tick thread periodically.
>
> This patch will trigger to flush the mdlog in all the MDSes manually
> just before waiting the unsafe requests to finish.
>
>
This seems a bit heavyweight, given that you may end up sending
FLUSH_MDLOG messages to mds's on which you're not waiting. What might be
best is to scan the list of requests you're waiting on and just send
these messages to those mds's.
Is that possible here?
>
>
> Xiubo Li (5):
> ceph: export ceph_create_session_msg
> ceph: export iterate_sessions
> ceph: flush mdlog before umounting
> ceph: flush the mdlog before waiting on unsafe reqs
> ceph: fix ceph feature bits
>
> fs/ceph/caps.c | 35 ++++----------
> fs/ceph/mds_client.c | 91 +++++++++++++++++++++++++++---------
> fs/ceph/mds_client.h | 9 ++++
> include/linux/ceph/ceph_fs.h | 1 +
> 4 files changed, 88 insertions(+), 48 deletions(-)
>
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH 0/5] flush the mdlog before waiting on unsafe reqs
2021-06-29 15:27 ` [PATCH 0/5] flush the mdlog before waiting on unsafe reqs Jeff Layton
@ 2021-06-30 0:35 ` Xiubo Li
0 siblings, 0 replies; 35+ messages in thread
From: Xiubo Li @ 2021-06-30 0:35 UTC (permalink / raw)
To: Jeff Layton; +Cc: idryomov, pdonnell, ceph-devel
On 6/29/21 11:27 PM, Jeff Layton wrote:
> On Tue, 2021-06-29 at 12:42 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> For the client requests who will have unsafe and safe replies from
>> MDS daemons, in the MDS side the MDS daemons won't flush the mdlog
>> (journal log) immediatelly, because they think it's unnecessary.
>> That's true for most cases but not all, likes the fsync request.
>> The fsync will wait until all the unsafe replied requests to be
>> safely replied.
>>
>> Normally if there have multiple threads or clients are running, the
>> whole mdlog in MDS daemons could be flushed in time if any request
>> will trigger the mdlog submit thread. So usually we won't experience
>> the normal operations will stuck for a long time. But in case there
>> has only one client with only thread is running, the stuck phenomenon
>> maybe obvious and the worst case it must wait at most 5 seconds to
>> wait the mdlog to be flushed by the MDS's tick thread periodically.
>>
>> This patch will trigger to flush the mdlog in all the MDSes manually
>> just before waiting the unsafe requests to finish.
>>
>>
> This seems a bit heavyweight, given that you may end up sending
> FLUSH_MDLOG messages to mds's on which you're not waiting. What might be
> best is to scan the list of requests you're waiting on and just send
> these messages to those mds's.
>
> Is that possible here?
Yeah, possibly and let me try that.
Thanks.
>>
>> Xiubo Li (5):
>> ceph: export ceph_create_session_msg
>> ceph: export iterate_sessions
>> ceph: flush mdlog before umounting
>> ceph: flush the mdlog before waiting on unsafe reqs
>> ceph: fix ceph feature bits
>>
>> fs/ceph/caps.c | 35 ++++----------
>> fs/ceph/mds_client.c | 91 +++++++++++++++++++++++++++---------
>> fs/ceph/mds_client.h | 9 ++++
>> include/linux/ceph/ceph_fs.h | 1 +
>> 4 files changed, 88 insertions(+), 48 deletions(-)
>>
^ permalink raw reply [flat|nested] 35+ messages in thread