* [PATCH RFC] ceph: flush the delayed caps in time
@ 2021-07-21  8:27 xiubli
  2021-07-21 11:23 ` Jeff Layton
  0 siblings, 1 reply; 9+ messages in thread
From: xiubli @ 2021-07-21  8:27 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

The delayed_work is executed every 5 seconds, and during that window
the cap_delay_list may accumulate thousands of caps that need to be
flushed. This fills the MDS's dispatch queue and takes a very long
time to drain. Any other operation, such as a rmdir request, is then
added to the tail of the dispatch queue and may have to wait for
several seconds, or even tens of seconds.

On the client side we shouldn't queue too many cap requests; flush
them once more than 100 items have accumulated.

URL: https://tracker.ceph.com/issues/51734
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/caps.c       | 21 ++++++++++++++++++++-
 fs/ceph/mds_client.c |  3 ++-
 fs/ceph/mds_client.h |  3 +++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 4b966c29d9b5..064865761d2b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -507,6 +507,8 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
 				struct ceph_inode_info *ci)
 {
+	int num = 0;
+
 	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
 	     ci->i_ceph_flags, ci->i_hold_caps_max);
 	if (!mdsc->stopping) {
@@ -515,12 +517,19 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
 			if (ci->i_ceph_flags & CEPH_I_FLUSH)
 				goto no_change;
 			list_del_init(&ci->i_cap_delay_list);
+			mdsc->num_cap_delay--;
 		}
 		__cap_set_timeouts(mdsc, ci);
 		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+		num = ++mdsc->num_cap_delay;
 no_change:
 		spin_unlock(&mdsc->cap_delay_lock);
 	}
+
+	if (num > 100) {
+		flush_delayed_work(&mdsc->delayed_work);
+		schedule_delayed(mdsc);
+	}
 }
 
 /*
@@ -531,13 +540,23 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
 static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
 				      struct ceph_inode_info *ci)
 {
+	int num;
+
 	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
 	spin_lock(&mdsc->cap_delay_lock);
 	ci->i_ceph_flags |= CEPH_I_FLUSH;
-	if (!list_empty(&ci->i_cap_delay_list))
+	if (!list_empty(&ci->i_cap_delay_list)) {
 		list_del_init(&ci->i_cap_delay_list);
+		mdsc->num_cap_delay--;
+	}
 	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+	num = ++mdsc->num_cap_delay;
 	spin_unlock(&mdsc->cap_delay_lock);
+
+	if (num > 100) {
+		flush_delayed_work(&mdsc->delayed_work);
+		schedule_delayed(mdsc);
+	}
 }
 
 /*
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 79aa4ce3a388..14e44de05812 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4514,7 +4514,7 @@ void inc_session_sequence(struct ceph_mds_session *s)
 /*
  * delayed work -- periodically trim expired leases, renew caps with mds
  */
-static void schedule_delayed(struct ceph_mds_client *mdsc)
+void schedule_delayed(struct ceph_mds_client *mdsc)
 {
 	int delay = 5;
 	unsigned hz = round_jiffies_relative(HZ * delay);
@@ -4616,6 +4616,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	mdsc->request_tree = RB_ROOT;
 	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
 	mdsc->last_renew_caps = jiffies;
+	mdsc->num_cap_delay = 0;
 	INIT_LIST_HEAD(&mdsc->cap_delay_list);
 	INIT_LIST_HEAD(&mdsc->cap_wait_list);
 	spin_lock_init(&mdsc->cap_delay_lock);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a7af09257382..b4289b8d23ec 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -423,6 +423,7 @@ struct ceph_mds_client {
 	struct rb_root         request_tree;  /* pending mds requests */
 	struct delayed_work    delayed_work;  /* delayed work */
 	unsigned long    last_renew_caps;  /* last time we renewed our caps */
+	unsigned long    num_cap_delay;    /* caps in the cap_delay_list */
 	struct list_head cap_delay_list;   /* caps with delayed release */
 	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
 	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
@@ -568,6 +569,8 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
 			  struct ceph_mds_session *session,
 			  int max_caps);
 
+extern void schedule_delayed(struct ceph_mds_client *mdsc);
+
 static inline int ceph_wait_on_async_create(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-- 
2.27.0



* Re: [PATCH RFC] ceph: flush the delayed caps in time
  2021-07-21  8:27 [PATCH RFC] ceph: flush the delayed caps in time xiubli
@ 2021-07-21 11:23 ` Jeff Layton
  2021-07-21 11:54   ` Xiubo Li
  0 siblings, 1 reply; 9+ messages in thread
From: Jeff Layton @ 2021-07-21 11:23 UTC (permalink / raw)
  To: xiubli, idryomov; +Cc: pdonnell, ceph-devel

On Wed, 2021-07-21 at 16:27 +0800, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> The delayed_work is executed every 5 seconds, and during that window
> the cap_delay_list may accumulate thousands of caps that need to be
> flushed. This fills the MDS's dispatch queue and takes a very long
> time to drain. Any other operation, such as a rmdir request, is then
> added to the tail of the dispatch queue and may have to wait for
> several seconds, or even tens of seconds.
> 
> On the client side we shouldn't queue too many cap requests; flush
> them once more than 100 items have accumulated.
> 

Why 100? My inclination is to say NAK on this.

I'm not a fan of this sort of arbitrary throttling in order to alleviate
a scalability problem in the MDS. The cap delay logic is already
horrifically complex as well, and this just makes it worse. If the MDS
happens to be processing requests from a lot of clients, this may not
help at all.

> URL: https://tracker.ceph.com/issues/51734

The core problem sounds like the MDS got a call and just took way too
long to get to it. Shouldn't we be addressing this in the MDS instead?
It sounds like it needs to do a better job of parallelizing cap flushes.

> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/caps.c       | 21 ++++++++++++++++++++-
>  fs/ceph/mds_client.c |  3 ++-
>  fs/ceph/mds_client.h |  3 +++
>  3 files changed, 25 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 4b966c29d9b5..064865761d2b 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -507,6 +507,8 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
>  static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>  				struct ceph_inode_info *ci)
>  {
> +	int num = 0;
> +
>  	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
>  	     ci->i_ceph_flags, ci->i_hold_caps_max);
>  	if (!mdsc->stopping) {
> @@ -515,12 +517,19 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>  			if (ci->i_ceph_flags & CEPH_I_FLUSH)
>  				goto no_change;
>  			list_del_init(&ci->i_cap_delay_list);
> +			mdsc->num_cap_delay--;
>  		}
>  		__cap_set_timeouts(mdsc, ci);
>  		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
> +		num = ++mdsc->num_cap_delay;
>  no_change:
>  		spin_unlock(&mdsc->cap_delay_lock);
>  	}
> +
> +	if (num > 100) {
> +		flush_delayed_work(&mdsc->delayed_work);
> +		schedule_delayed(mdsc);
> +	}
>  }
>  
>  /*
> @@ -531,13 +540,23 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>  static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
>  				      struct ceph_inode_info *ci)
>  {
> +	int num;
> +
>  	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
>  	spin_lock(&mdsc->cap_delay_lock);
>  	ci->i_ceph_flags |= CEPH_I_FLUSH;
> -	if (!list_empty(&ci->i_cap_delay_list))
> +	if (!list_empty(&ci->i_cap_delay_list)) {
>  		list_del_init(&ci->i_cap_delay_list);
> +		mdsc->num_cap_delay--;
> +	}
>  	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
> +	num = ++mdsc->num_cap_delay;
>  	spin_unlock(&mdsc->cap_delay_lock);
> +
> +	if (num > 100) {
> +		flush_delayed_work(&mdsc->delayed_work);
> +		schedule_delayed(mdsc);
> +	}
>  }
>  
>  /*
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 79aa4ce3a388..14e44de05812 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4514,7 +4514,7 @@ void inc_session_sequence(struct ceph_mds_session *s)
>  /*
>   * delayed work -- periodically trim expired leases, renew caps with mds
>   */
> -static void schedule_delayed(struct ceph_mds_client *mdsc)
> +void schedule_delayed(struct ceph_mds_client *mdsc)
>  {
>  	int delay = 5;
>  	unsigned hz = round_jiffies_relative(HZ * delay);
> @@ -4616,6 +4616,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>  	mdsc->request_tree = RB_ROOT;
>  	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
>  	mdsc->last_renew_caps = jiffies;
> +	mdsc->num_cap_delay = 0;
>  	INIT_LIST_HEAD(&mdsc->cap_delay_list);
>  	INIT_LIST_HEAD(&mdsc->cap_wait_list);
>  	spin_lock_init(&mdsc->cap_delay_lock);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index a7af09257382..b4289b8d23ec 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -423,6 +423,7 @@ struct ceph_mds_client {
>  	struct rb_root         request_tree;  /* pending mds requests */
>  	struct delayed_work    delayed_work;  /* delayed work */
>  	unsigned long    last_renew_caps;  /* last time we renewed our caps */
> +	unsigned long    num_cap_delay;    /* caps in the cap_delay_list */
>  	struct list_head cap_delay_list;   /* caps with delayed release */
>  	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
>  	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
> @@ -568,6 +569,8 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
>  			  struct ceph_mds_session *session,
>  			  int max_caps);
>  
> +extern void schedule_delayed(struct ceph_mds_client *mdsc);
> +
>  static inline int ceph_wait_on_async_create(struct inode *inode)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);

-- 
Jeff Layton <jlayton@kernel.org>



* Re: [PATCH RFC] ceph: flush the delayed caps in time
  2021-07-21 11:23 ` Jeff Layton
@ 2021-07-21 11:54   ` Xiubo Li
  2021-07-21 12:57     ` Jeff Layton
  0 siblings, 1 reply; 9+ messages in thread
From: Xiubo Li @ 2021-07-21 11:54 UTC (permalink / raw)
  To: Jeff Layton, idryomov; +Cc: pdonnell, ceph-devel


On 7/21/21 7:23 PM, Jeff Layton wrote:
> On Wed, 2021-07-21 at 16:27 +0800, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> The delayed_work is executed every 5 seconds, and during that window
>> the cap_delay_list may accumulate thousands of caps that need to be
>> flushed. This fills the MDS's dispatch queue and takes a very long
>> time to drain. Any other operation, such as a rmdir request, is then
>> added to the tail of the dispatch queue and may have to wait for
>> several seconds, or even tens of seconds.
>>
>> On the client side we shouldn't queue too many cap requests; flush
>> them once more than 100 items have accumulated.
>>
> Why 100? My inclination is to say NAK on this.

This is just from my testing: around 100 queued client_caps requests
work fine in most cases and don't take too long to handle. Sometimes
the client will send thousands of requests in a short time, and that
becomes a problem.
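
If a threshold like this survives review it probably shouldn't stay
hard-coded. A minimal, untested sketch would expose it as a module
parameter (the 'caps_flush_threshold' knob name here is made up, not
an existing option):

/* hypothetical tunable: queued delayed caps before forcing a flush */
static unsigned int caps_flush_threshold = 100;
module_param(caps_flush_threshold, uint, 0644);
MODULE_PARM_DESC(caps_flush_threshold,
		 "number of queued delayed caps before forcing a flush");

	/* __cap_delay_requeue() would then compare against the knob */
	if (num > caps_flush_threshold) {
		flush_delayed_work(&mdsc->delayed_work);
		schedule_delayed(mdsc);
	}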

>
> I'm not a fan of this sort of arbitrary throttling in order to alleviate
> a scalability problem in the MDS. The cap delay logic is already
> horrifically complex as well, and this just makes it worse. If the MDS
> happens to be processing requests from a lot of clients, this may not
> help at all.

Yeah, right; in that case the MDS dispatch queue could still
accumulate even more requests in a short time.


>> URL: https://tracker.ceph.com/issues/51734
> The core problem sounds like the MDS got a call and just took way too
> long to get to it. Shouldn't we be addressing this in the MDS instead?
> It sounds like it needs to do a better job of parallelizing cap flushes.

The problem is that almost all requests need to acquire the global
'mds_lock', so parallelizing won't improve things much.

Currently all the client-side requests are sent with the priority
CEPH_MSG_PRIO_DEFAULT:

m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);

I'm not sure whether we could just give the cap flushing requests a
lower priority? If that doesn't introduce other potential issues it
would work, because the MDS dispatch queue handles high-priority
requests first.
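
For illustration only, and assuming the CEPH_MSG_PRIO_LOW constant
from include/linux/ceph/msgr.h, the client-side change would be
roughly a one-liner on the cap message before it is sent (an untested
sketch, not a proposal yet):

	/* deprioritize cap flush messages so the MDS dispatcher
	 * services interactive requests first; whether the MDS
	 * copes with reordered cap messages is the open question */
	msg->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_LOW);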

Thanks




>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/caps.c       | 21 ++++++++++++++++++++-
>>   fs/ceph/mds_client.c |  3 ++-
>>   fs/ceph/mds_client.h |  3 +++
>>   3 files changed, 25 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index 4b966c29d9b5..064865761d2b 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -507,6 +507,8 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
>>   static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>   				struct ceph_inode_info *ci)
>>   {
>> +	int num = 0;
>> +
>>   	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
>>   	     ci->i_ceph_flags, ci->i_hold_caps_max);
>>   	if (!mdsc->stopping) {
>> @@ -515,12 +517,19 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>   			if (ci->i_ceph_flags & CEPH_I_FLUSH)
>>   				goto no_change;
>>   			list_del_init(&ci->i_cap_delay_list);
>> +			mdsc->num_cap_delay--;
>>   		}
>>   		__cap_set_timeouts(mdsc, ci);
>>   		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
>> +		num = ++mdsc->num_cap_delay;
>>   no_change:
>>   		spin_unlock(&mdsc->cap_delay_lock);
>>   	}
>> +
>> +	if (num > 100) {
>> +		flush_delayed_work(&mdsc->delayed_work);
>> +		schedule_delayed(mdsc);
>> +	}
>>   }
>>   
>>   /*
>> @@ -531,13 +540,23 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>   static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
>>   				      struct ceph_inode_info *ci)
>>   {
>> +	int num;
>> +
>>   	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
>>   	spin_lock(&mdsc->cap_delay_lock);
>>   	ci->i_ceph_flags |= CEPH_I_FLUSH;
>> -	if (!list_empty(&ci->i_cap_delay_list))
>> +	if (!list_empty(&ci->i_cap_delay_list)) {
>>   		list_del_init(&ci->i_cap_delay_list);
>> +		mdsc->num_cap_delay--;
>> +	}
>>   	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
>> +	num = ++mdsc->num_cap_delay;
>>   	spin_unlock(&mdsc->cap_delay_lock);
>> +
>> +	if (num > 100) {
>> +		flush_delayed_work(&mdsc->delayed_work);
>> +		schedule_delayed(mdsc);
>> +	}
>>   }
>>   
>>   /*
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 79aa4ce3a388..14e44de05812 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -4514,7 +4514,7 @@ void inc_session_sequence(struct ceph_mds_session *s)
>>   /*
>>    * delayed work -- periodically trim expired leases, renew caps with mds
>>    */
>> -static void schedule_delayed(struct ceph_mds_client *mdsc)
>> +void schedule_delayed(struct ceph_mds_client *mdsc)
>>   {
>>   	int delay = 5;
>>   	unsigned hz = round_jiffies_relative(HZ * delay);
>> @@ -4616,6 +4616,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>>   	mdsc->request_tree = RB_ROOT;
>>   	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
>>   	mdsc->last_renew_caps = jiffies;
>> +	mdsc->num_cap_delay = 0;
>>   	INIT_LIST_HEAD(&mdsc->cap_delay_list);
>>   	INIT_LIST_HEAD(&mdsc->cap_wait_list);
>>   	spin_lock_init(&mdsc->cap_delay_lock);
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index a7af09257382..b4289b8d23ec 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -423,6 +423,7 @@ struct ceph_mds_client {
>>   	struct rb_root         request_tree;  /* pending mds requests */
>>   	struct delayed_work    delayed_work;  /* delayed work */
>>   	unsigned long    last_renew_caps;  /* last time we renewed our caps */
>> +	unsigned long    num_cap_delay;    /* caps in the cap_delay_list */
>>   	struct list_head cap_delay_list;   /* caps with delayed release */
>>   	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
>>   	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
>> @@ -568,6 +569,8 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
>>   			  struct ceph_mds_session *session,
>>   			  int max_caps);
>>   
>> +extern void schedule_delayed(struct ceph_mds_client *mdsc);
>> +
>>   static inline int ceph_wait_on_async_create(struct inode *inode)
>>   {
>>   	struct ceph_inode_info *ci = ceph_inode(inode);



* Re: [PATCH RFC] ceph: flush the delayed caps in time
  2021-07-21 11:54   ` Xiubo Li
@ 2021-07-21 12:57     ` Jeff Layton
  2021-07-21 13:42       ` Xiubo Li
  2021-07-22  9:36       ` Xiubo Li
  0 siblings, 2 replies; 9+ messages in thread
From: Jeff Layton @ 2021-07-21 12:57 UTC (permalink / raw)
  To: Xiubo Li, idryomov; +Cc: pdonnell, ceph-devel

On Wed, 2021-07-21 at 19:54 +0800, Xiubo Li wrote:
> On 7/21/21 7:23 PM, Jeff Layton wrote:
> > On Wed, 2021-07-21 at 16:27 +0800, xiubli@redhat.com wrote:
> > > From: Xiubo Li <xiubli@redhat.com>
> > > 
> > > The delayed_work is executed every 5 seconds, and during that window
> > > the cap_delay_list may accumulate thousands of caps that need to be
> > > flushed. This fills the MDS's dispatch queue and takes a very long
> > > time to drain. Any other operation, such as a rmdir request, is then
> > > added to the tail of the dispatch queue and may have to wait for
> > > several seconds, or even tens of seconds.
> > > 
> > > On the client side we shouldn't queue too many cap requests; flush
> > > them once more than 100 items have accumulated.
> > > 
> > Why 100? My inclination is to say NAK on this.
> 
> This is just from my testing: around 100 queued client_caps requests
> work fine in most cases and don't take too long to handle. Sometimes
> the client will send thousands of requests in a short time, and that
> becomes a problem.

What may be a better approach is to figure out why we're holding on to
so many caps and trying to flush them all at once. Maybe if we were to
more aggressively flush sooner, we'd not end up with such a backlog?

> > I'm not a fan of this sort of arbitrary throttling in order to alleviate
> > a scalability problem in the MDS. The cap delay logic is already
> > horrifically complex as well, and this just makes it worse. If the MDS
> > happens to be processing requests from a lot of clients, this may not
> > help at all.
> 
> Yeah, right; in that case the MDS dispatch queue could still
> accumulate even more requests in a short time.
> 
> 
> > > URL: https://tracker.ceph.com/issues/51734
> > The core problem sounds like the MDS got a call and just took way too
> > long to get to it. Shouldn't we be addressing this in the MDS instead?
> > It sounds like it needs to do a better job of parallelizing cap flushes.
> 
> The problem is that almost all requests need to acquire the global
> 'mds_lock', so parallelizing won't improve things much.
> 

Yuck. That sounds like the root cause right there. For the record, I'm
against papering over that with client patches.

> Currently all the client-side requests are sent with the priority
> CEPH_MSG_PRIO_DEFAULT:
> 
> m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
> 
> I'm not sure whether we could just give the cap flushing requests a
> lower priority? If that doesn't introduce other potential issues it
> would work, because the MDS dispatch queue handles high-priority
> requests first.
> 

Tough call. You might mark it for lower priority, but then someone calls
fsync() and suddenly you need it to finish sooner. I think that sort of
change might backfire in some cases.

Usually when faced with this sort of issue, the right answer is to try
to work ahead of the "spikes" in activity so that you don't have quite
such a backlog when you do need to flush everything out. I'm not sure
how possible that is here.

One underlying issue may be that some of the client's background
activity is currently driven by arbitrary timers (e.g. the delayed_work
timer). When the timer pops, it suddenly has to do a bunch of work, when
some of it could have been done earlier so that the load was spread out.
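
As a minimal, untested sketch of that idea, reusing the num_cap_delay
counter proposed in this patch: instead of flushing synchronously at a
threshold, the delayed_work interval could shrink while the list is
backed up, so the flush load gets spread out:

static void schedule_delayed_adaptive(struct ceph_mds_client *mdsc)
{
	int delay = 5;

	/* drain sooner when the delay list is backed up, rather
	 * than dumping thousands of cap messages on the MDS at once */
	spin_lock(&mdsc->cap_delay_lock);
	if (mdsc->num_cap_delay > 100)
		delay = 1;
	spin_unlock(&mdsc->cap_delay_lock);

	schedule_delayed_work(&mdsc->delayed_work,
			      round_jiffies_relative(HZ * delay));
}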


> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > ---
> > >   fs/ceph/caps.c       | 21 ++++++++++++++++++++-
> > >   fs/ceph/mds_client.c |  3 ++-
> > >   fs/ceph/mds_client.h |  3 +++
> > >   3 files changed, 25 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index 4b966c29d9b5..064865761d2b 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -507,6 +507,8 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
> > >   static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > >   				struct ceph_inode_info *ci)
> > >   {
> > > +	int num = 0;
> > > +
> > >   	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
> > >   	     ci->i_ceph_flags, ci->i_hold_caps_max);
> > >   	if (!mdsc->stopping) {
> > > @@ -515,12 +517,19 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > >   			if (ci->i_ceph_flags & CEPH_I_FLUSH)
> > >   				goto no_change;
> > >   			list_del_init(&ci->i_cap_delay_list);
> > > +			mdsc->num_cap_delay--;
> > >   		}
> > >   		__cap_set_timeouts(mdsc, ci);
> > >   		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
> > > +		num = ++mdsc->num_cap_delay;
> > >   no_change:
> > >   		spin_unlock(&mdsc->cap_delay_lock);
> > >   	}
> > > +
> > > +	if (num > 100) {
> > > +		flush_delayed_work(&mdsc->delayed_work);
> > > +		schedule_delayed(mdsc);
> > > +	}
> > >   }
> > >   
> > >   /*
> > > @@ -531,13 +540,23 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > >   static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
> > >   				      struct ceph_inode_info *ci)
> > >   {
> > > +	int num;
> > > +
> > >   	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
> > >   	spin_lock(&mdsc->cap_delay_lock);
> > >   	ci->i_ceph_flags |= CEPH_I_FLUSH;
> > > -	if (!list_empty(&ci->i_cap_delay_list))
> > > +	if (!list_empty(&ci->i_cap_delay_list)) {
> > >   		list_del_init(&ci->i_cap_delay_list);
> > > +		mdsc->num_cap_delay--;
> > > +	}
> > >   	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
> > > +	num = ++mdsc->num_cap_delay;
> > >   	spin_unlock(&mdsc->cap_delay_lock);
> > > +
> > > +	if (num > 100) {
> > > +		flush_delayed_work(&mdsc->delayed_work);
> > > +		schedule_delayed(mdsc);
> > > +	}
> > >   }
> > >   
> > >   /*
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index 79aa4ce3a388..14e44de05812 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -4514,7 +4514,7 @@ void inc_session_sequence(struct ceph_mds_session *s)
> > >   /*
> > >    * delayed work -- periodically trim expired leases, renew caps with mds
> > >    */
> > > -static void schedule_delayed(struct ceph_mds_client *mdsc)
> > > +void schedule_delayed(struct ceph_mds_client *mdsc)
> > >   {
> > >   	int delay = 5;
> > >   	unsigned hz = round_jiffies_relative(HZ * delay);
> > > @@ -4616,6 +4616,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
> > >   	mdsc->request_tree = RB_ROOT;
> > >   	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
> > >   	mdsc->last_renew_caps = jiffies;
> > > +	mdsc->num_cap_delay = 0;
> > >   	INIT_LIST_HEAD(&mdsc->cap_delay_list);
> > >   	INIT_LIST_HEAD(&mdsc->cap_wait_list);
> > >   	spin_lock_init(&mdsc->cap_delay_lock);
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index a7af09257382..b4289b8d23ec 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -423,6 +423,7 @@ struct ceph_mds_client {
> > >   	struct rb_root         request_tree;  /* pending mds requests */
> > >   	struct delayed_work    delayed_work;  /* delayed work */
> > >   	unsigned long    last_renew_caps;  /* last time we renewed our caps */
> > > +	unsigned long    num_cap_delay;    /* caps in the cap_delay_list */
> > >   	struct list_head cap_delay_list;   /* caps with delayed release */
> > >   	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
> > >   	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
> > > @@ -568,6 +569,8 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
> > >   			  struct ceph_mds_session *session,
> > >   			  int max_caps);
> > >   
> > > +extern void schedule_delayed(struct ceph_mds_client *mdsc);
> > > +
> > >   static inline int ceph_wait_on_async_create(struct inode *inode)
> > >   {
> > >   	struct ceph_inode_info *ci = ceph_inode(inode);
> 

-- 
Jeff Layton <jlayton@kernel.org>



* Re: [PATCH RFC] ceph: flush the delayed caps in time
  2021-07-21 12:57     ` Jeff Layton
@ 2021-07-21 13:42       ` Xiubo Li
  2021-07-21 14:04         ` Jeff Layton
  2021-07-22  9:36       ` Xiubo Li
  1 sibling, 1 reply; 9+ messages in thread
From: Xiubo Li @ 2021-07-21 13:42 UTC (permalink / raw)
  To: Jeff Layton, idryomov; +Cc: pdonnell, ceph-devel


On 7/21/21 8:57 PM, Jeff Layton wrote:
> On Wed, 2021-07-21 at 19:54 +0800, Xiubo Li wrote:
>> On 7/21/21 7:23 PM, Jeff Layton wrote:
>>> On Wed, 2021-07-21 at 16:27 +0800, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> The delayed_work is executed every 5 seconds, and during that window
>>>> the cap_delay_list may accumulate thousands of caps that need to be
>>>> flushed. This fills the MDS's dispatch queue and takes a very long
>>>> time to drain. Any other operation, such as a rmdir request, is then
>>>> added to the tail of the dispatch queue and may have to wait for
>>>> several seconds, or even tens of seconds.
>>>>
>>>> On the client side we shouldn't queue too many cap requests; flush
>>>> them once more than 100 items have accumulated.
>>>>
>>> Why 100? My inclination is to say NAK on this.
>> This is just from my testing: around 100 queued client_caps requests
>> work fine in most cases and don't take too long to handle. Sometimes
>> the client will send thousands of requests in a short time, and that
>> becomes a problem.
> What may be a better approach is to figure out why we're holding on to
> so many caps and trying to flush them all at once. Maybe if we were to
> more aggressively flush sooner, we'd not end up with such a backlog?

Yeah, I am still working on this.

From my analysis, a fix or improvement like this one will help reduce it:

commit e64f44a884657358812e6c057957be546db03cbe
Author: Xiubo Li <xiubli@redhat.com>
Date:   Wed May 27 09:09:27 2020 -0400

     ceph: skip checking caps when session reconnecting and releasing reqs

     It makes no sense to check the caps when reconnecting to the mds.
     And the async dirop caps will be put by their _cb() function, so it
     makes no sense to check them when releasing the requests either.

From the test results, it seems that some cap-related requests were
still pending in the cap_delay_list even after the rmdir requests for
the previous directories had been sent out and completed. I am still
confirming this.
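
One throwaway way to confirm that is to count what is still queued
after the rmdirs complete (a debugging sketch only, not part of the
patch):

/* debugging aid: how many inodes are still on the delay list? */
static int count_delayed_caps(struct ceph_mds_client *mdsc)
{
	struct ceph_inode_info *ci;
	int n = 0;

	spin_lock(&mdsc->cap_delay_lock);
	list_for_each_entry(ci, &mdsc->cap_delay_list, i_cap_delay_list)
		n++;
	spin_unlock(&mdsc->cap_delay_lock);
	return n;
}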



>
>>> I'm not a fan of this sort of arbitrary throttling in order to alleviate
>>> a scalability problem in the MDS. The cap delay logic is already
>>> horrifically complex as well, and this just makes it worse. If the MDS
>>> happens to be processing requests from a lot of clients, this may not
>>> help at all.
>> Yeah, right; in that case the MDS dispatch queue could still
>> accumulate even more requests in a short time.
>>
>>
>>>> URL: https://tracker.ceph.com/issues/51734
>>> The core problem sounds like the MDS got a call and just took way too
>>> long to get to it. Shouldn't we be addressing this in the MDS instead?
>>> It sounds like it needs to do a better job of parallelizing cap flushes.
>> The problem is that almost all requests need to acquire the global
>> 'mds_lock', so parallelizing won't improve things much.
>>
> Yuck. That sounds like the root cause right there. For the record, I'm
> against papering over that with client patches.
>
>> Currently all the client-side requests are sent with the priority
>> CEPH_MSG_PRIO_DEFAULT:
>>
>> m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
>>
>> I'm not sure whether we could just give the cap flushing requests a
>> lower priority? If that doesn't introduce other potential issues it
>> would work, because the MDS dispatch queue handles high-priority
>> requests first.
>>
> Tough call. You might mark it for lower priority, but then someone calls
> fsync() and suddenly you need it to finish sooner. I think that sort of
> change might backfire in some cases.

Yeah, I am also worried about this.


>
> Usually when faced with this sort of issue, the right answer is to try
> to work ahead of the "spikes" in activity so that you don't have quite
> such a backlog when you do need to flush everything out. I'm not sure
> how possible that is here.
>
> One underlying issue may be that some of the client's background
> activity is currently driven by arbitrary timers (e.g. the delayed_work
> timer). When the timer pops, it suddenly has to do a bunch of work, when
> some of it could have been done earlier so that the load was spread out.
>
>
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    fs/ceph/caps.c       | 21 ++++++++++++++++++++-
>>>>    fs/ceph/mds_client.c |  3 ++-
>>>>    fs/ceph/mds_client.h |  3 +++
>>>>    3 files changed, 25 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>>> index 4b966c29d9b5..064865761d2b 100644
>>>> --- a/fs/ceph/caps.c
>>>> +++ b/fs/ceph/caps.c
>>>> @@ -507,6 +507,8 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
>>>>    static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>>>    				struct ceph_inode_info *ci)
>>>>    {
>>>> +	int num = 0;
>>>> +
>>>>    	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
>>>>    	     ci->i_ceph_flags, ci->i_hold_caps_max);
>>>>    	if (!mdsc->stopping) {
>>>> @@ -515,12 +517,19 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>>>    			if (ci->i_ceph_flags & CEPH_I_FLUSH)
>>>>    				goto no_change;
>>>>    			list_del_init(&ci->i_cap_delay_list);
>>>> +			mdsc->num_cap_delay--;
>>>>    		}
>>>>    		__cap_set_timeouts(mdsc, ci);
>>>>    		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
>>>> +		num = ++mdsc->num_cap_delay;
>>>>    no_change:
>>>>    		spin_unlock(&mdsc->cap_delay_lock);
>>>>    	}
>>>> +
>>>> +	if (num > 100) {
>>>> +		flush_delayed_work(&mdsc->delayed_work);
>>>> +		schedule_delayed(mdsc);
>>>> +	}
>>>>    }
>>>>    
>>>>    /*
>>>> @@ -531,13 +540,23 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>>>    static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
>>>>    				      struct ceph_inode_info *ci)
>>>>    {
>>>> +	int num;
>>>> +
>>>>    	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
>>>>    	spin_lock(&mdsc->cap_delay_lock);
>>>>    	ci->i_ceph_flags |= CEPH_I_FLUSH;
>>>> -	if (!list_empty(&ci->i_cap_delay_list))
>>>> +	if (!list_empty(&ci->i_cap_delay_list)) {
>>>>    		list_del_init(&ci->i_cap_delay_list);
>>>> +		mdsc->num_cap_delay--;
>>>> +	}
>>>>    	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
>>>> +	num = ++mdsc->num_cap_delay;
>>>>    	spin_unlock(&mdsc->cap_delay_lock);
>>>> +
>>>> +	if (num > 100) {
>>>> +		flush_delayed_work(&mdsc->delayed_work);
>>>> +		schedule_delayed(mdsc);
>>>> +	}
>>>>    }
>>>>    
>>>>    /*
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index 79aa4ce3a388..14e44de05812 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -4514,7 +4514,7 @@ void inc_session_sequence(struct ceph_mds_session *s)
>>>>    /*
>>>>     * delayed work -- periodically trim expired leases, renew caps with mds
>>>>     */
>>>> -static void schedule_delayed(struct ceph_mds_client *mdsc)
>>>> +void schedule_delayed(struct ceph_mds_client *mdsc)
>>>>    {
>>>>    	int delay = 5;
>>>>    	unsigned hz = round_jiffies_relative(HZ * delay);
>>>> @@ -4616,6 +4616,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>>>>    	mdsc->request_tree = RB_ROOT;
>>>>    	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
>>>>    	mdsc->last_renew_caps = jiffies;
>>>> +	mdsc->num_cap_delay = 0;
>>>>    	INIT_LIST_HEAD(&mdsc->cap_delay_list);
>>>>    	INIT_LIST_HEAD(&mdsc->cap_wait_list);
>>>>    	spin_lock_init(&mdsc->cap_delay_lock);
>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>> index a7af09257382..b4289b8d23ec 100644
>>>> --- a/fs/ceph/mds_client.h
>>>> +++ b/fs/ceph/mds_client.h
>>>> @@ -423,6 +423,7 @@ struct ceph_mds_client {
>>>>    	struct rb_root         request_tree;  /* pending mds requests */
>>>>    	struct delayed_work    delayed_work;  /* delayed work */
>>>>    	unsigned long    last_renew_caps;  /* last time we renewed our caps */
>>>> +	unsigned long    num_cap_delay;    /* caps in the cap_delay_list */
>>>>    	struct list_head cap_delay_list;   /* caps with delayed release */
>>>>    	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
>>>>    	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
>>>> @@ -568,6 +569,8 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
>>>>    			  struct ceph_mds_session *session,
>>>>    			  int max_caps);
>>>>    
>>>> +extern void schedule_delayed(struct ceph_mds_client *mdsc);
>>>> +
>>>>    static inline int ceph_wait_on_async_create(struct inode *inode)
>>>>    {
>>>>    	struct ceph_inode_info *ci = ceph_inode(inode);



* Re: [PATCH RFC] ceph: flush the delayed caps in time
  2021-07-21 13:42       ` Xiubo Li
@ 2021-07-21 14:04         ` Jeff Layton
  0 siblings, 0 replies; 9+ messages in thread
From: Jeff Layton @ 2021-07-21 14:04 UTC (permalink / raw)
  To: Xiubo Li, idryomov; +Cc: pdonnell, ceph-devel

On Wed, 2021-07-21 at 21:42 +0800, Xiubo Li wrote:
> On 7/21/21 8:57 PM, Jeff Layton wrote:
> > On Wed, 2021-07-21 at 19:54 +0800, Xiubo Li wrote:
> > > On 7/21/21 7:23 PM, Jeff Layton wrote:
> > > > On Wed, 2021-07-21 at 16:27 +0800, xiubli@redhat.com wrote:
> > > > > From: Xiubo Li <xiubli@redhat.com>
> > > > > 
> > > > > The delayed_work is executed every 5 seconds, and during that window
> > > > > the cap_delay_list may accumulate thousands of caps that need to be
> > > > > flushed. This fills the MDS's dispatch queue and takes a very long
> > > > > time to drain. Any other operation, such as a rmdir request, is then
> > > > > added to the tail of the dispatch queue and may have to wait for
> > > > > several seconds, or even tens of seconds.
> > > > > 
> > > > > On the client side we shouldn't queue too many cap requests; flush
> > > > > them once more than 100 items have accumulated.
> > > > > 
> > > > Why 100? My inclination is to say NAK on this.
> > > This is just from my testing: around 100 queued client_caps requests
> > > work fine in most cases and don't take too long to handle. Sometimes
> > > the client will send thousands of requests in a short time, and that
> > > becomes a problem.
> > What may be a better approach is to figure out why we're holding on to
> > so many caps and trying to flush them all at once. Maybe if we were to
> > more aggressively flush sooner, we'd not end up with such a backlog?
> 
> Yeah, I am still working on this.
> 
> From my analysis, a fix or improvement like this one will help reduce it:
> 
> commit e64f44a884657358812e6c057957be546db03cbe
> Author: Xiubo Li <xiubli@redhat.com>
> Date:   Wed May 27 09:09:27 2020 -0400
> 
>      ceph: skip checking caps when session reconnecting and releasing reqs
> 
>      It makes no sense to check the caps when reconnecting to the mds.
>      And the async dirop caps will be put by their _cb() function, so it
>      makes no sense to check them when releasing the requests either.
> 
> From the test results, it seems that some cap-related requests were
> still pending in the cap_delay_list even after the rmdir requests for
> the previous directories had been sent out and completed. I am still
> confirming this.
> 

Ok, that at least sounds superficially similar to this bug that Patrick
and I have been chasing:

    https://tracker.ceph.com/issues/51279

It's possible that some of these have the same root cause.

> 
> 
> > 
> > > > I'm not a fan of this sort of arbitrary throttling in order to alleviate
> > > > a scalability problem in the MDS. The cap delay logic is already
> > > > horrifically complex as well, and this just makes it worse. If the MDS
> > > > happens to be processing requests from a lot of clients, this may not
> > > > help at all.
> > > Yeah, right; in that case the MDS dispatch queue could still
> > > accumulate even more requests in a short time.
> > > 
> > > 
> > > > > URL: https://tracker.ceph.com/issues/51734
> > > > The core problem sounds like the MDS got a call and just took way too
> > > > long to get to it. Shouldn't we be addressing this in the MDS instead?
> > > > It sounds like it needs to do a better job of parallelizing cap flushes.
> > > The problem is that almost all requests need to acquire the global
> > > 'mds_lock', so parallelizing won't improve things much.
> > > 
> > Yuck. That sounds like the root cause right there. For the record, I'm
> > against papering over that with client patches.
> > 
> > > Currently all the client-side requests are sent with the priority
> > > CEPH_MSG_PRIO_DEFAULT:
> > > 
> > > m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
> > > 
> > > I'm not sure whether we could just give the cap flushing requests a
> > > lower priority? If that doesn't introduce other potential issues it
> > > would work, because the MDS dispatch queue handles high-priority
> > > requests first.
> > > 
> > Tough call. You might mark it for lower priority, but then someone calls
> > fsync() and suddenly you need it to finish sooner. I think that sort of
> > change might backfire in some cases.
> 
> Yeah, I am also worried about this.
> 
> 
> > 
> > Usually when faced with this sort of issue, the right answer is to try
> > to work ahead of the "spikes" in activity so that you don't have quite
> > such a backlog when you do need to flush everything out. I'm not sure
> > how possible that is here.
> > 
> > One underlying issue may be that some of the client's background
> > activity is currently driven by arbitrary timers (e.g. the delayed_work
> > timer). When the timer pops, it suddenly has to do a bunch of work, when
> > some of it could have been done earlier so that the load was spread out.
> > 
> > 
> > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > > > ---
> > > > >    fs/ceph/caps.c       | 21 ++++++++++++++++++++-
> > > > >    fs/ceph/mds_client.c |  3 ++-
> > > > >    fs/ceph/mds_client.h |  3 +++
> > > > >    3 files changed, 25 insertions(+), 2 deletions(-)
> > > > > 
> > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > index 4b966c29d9b5..064865761d2b 100644
> > > > > --- a/fs/ceph/caps.c
> > > > > +++ b/fs/ceph/caps.c
> > > > > @@ -507,6 +507,8 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
> > > > >    static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > > > >    				struct ceph_inode_info *ci)
> > > > >    {
> > > > > +	int num = 0;
> > > > > +
> > > > >    	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
> > > > >    	     ci->i_ceph_flags, ci->i_hold_caps_max);
> > > > >    	if (!mdsc->stopping) {
> > > > > @@ -515,12 +517,19 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > > > >    			if (ci->i_ceph_flags & CEPH_I_FLUSH)
> > > > >    				goto no_change;
> > > > >    			list_del_init(&ci->i_cap_delay_list);
> > > > > +			mdsc->num_cap_delay--;
> > > > >    		}
> > > > >    		__cap_set_timeouts(mdsc, ci);
> > > > >    		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
> > > > > +		num = ++mdsc->num_cap_delay;
> > > > >    no_change:
> > > > >    		spin_unlock(&mdsc->cap_delay_lock);
> > > > >    	}
> > > > > +
> > > > > +	if (num > 100) {
> > > > > +		flush_delayed_work(&mdsc->delayed_work);
> > > > > +		schedule_delayed(mdsc);
> > > > > +	}
> > > > >    }
> > > > >    
> > > > >    /*
> > > > > @@ -531,13 +540,23 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > > > >    static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
> > > > >    				      struct ceph_inode_info *ci)
> > > > >    {
> > > > > +	int num;
> > > > > +
> > > > >    	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
> > > > >    	spin_lock(&mdsc->cap_delay_lock);
> > > > >    	ci->i_ceph_flags |= CEPH_I_FLUSH;
> > > > > -	if (!list_empty(&ci->i_cap_delay_list))
> > > > > +	if (!list_empty(&ci->i_cap_delay_list)) {
> > > > >    		list_del_init(&ci->i_cap_delay_list);
> > > > > +		mdsc->num_cap_delay--;
> > > > > +	}
> > > > >    	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
> > > > > +	num = ++mdsc->num_cap_delay;
> > > > >    	spin_unlock(&mdsc->cap_delay_lock);
> > > > > +
> > > > > +	if (num > 100) {
> > > > > +		flush_delayed_work(&mdsc->delayed_work);
> > > > > +		schedule_delayed(mdsc);
> > > > > +	}
> > > > >    }
> > > > >    
> > > > >    /*
> > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > index 79aa4ce3a388..14e44de05812 100644
> > > > > --- a/fs/ceph/mds_client.c
> > > > > +++ b/fs/ceph/mds_client.c
> > > > > @@ -4514,7 +4514,7 @@ void inc_session_sequence(struct ceph_mds_session *s)
> > > > >    /*
> > > > >     * delayed work -- periodically trim expired leases, renew caps with mds
> > > > >     */
> > > > > -static void schedule_delayed(struct ceph_mds_client *mdsc)
> > > > > +void schedule_delayed(struct ceph_mds_client *mdsc)
> > > > >    {
> > > > >    	int delay = 5;
> > > > >    	unsigned hz = round_jiffies_relative(HZ * delay);
> > > > > @@ -4616,6 +4616,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
> > > > >    	mdsc->request_tree = RB_ROOT;
> > > > >    	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
> > > > >    	mdsc->last_renew_caps = jiffies;
> > > > > +	mdsc->num_cap_delay = 0;
> > > > >    	INIT_LIST_HEAD(&mdsc->cap_delay_list);
> > > > >    	INIT_LIST_HEAD(&mdsc->cap_wait_list);
> > > > >    	spin_lock_init(&mdsc->cap_delay_lock);
> > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > index a7af09257382..b4289b8d23ec 100644
> > > > > --- a/fs/ceph/mds_client.h
> > > > > +++ b/fs/ceph/mds_client.h
> > > > > @@ -423,6 +423,7 @@ struct ceph_mds_client {
> > > > >    	struct rb_root         request_tree;  /* pending mds requests */
> > > > >    	struct delayed_work    delayed_work;  /* delayed work */
> > > > >    	unsigned long    last_renew_caps;  /* last time we renewed our caps */
> > > > > +	unsigned long    num_cap_delay;    /* caps in the cap_delay_list */
> > > > >    	struct list_head cap_delay_list;   /* caps with delayed release */
> > > > >    	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
> > > > >    	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
> > > > > @@ -568,6 +569,8 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
> > > > >    			  struct ceph_mds_session *session,
> > > > >    			  int max_caps);
> > > > >    
> > > > > +extern void schedule_delayed(struct ceph_mds_client *mdsc);
> > > > > +
> > > > >    static inline int ceph_wait_on_async_create(struct inode *inode)
> > > > >    {
> > > > >    	struct ceph_inode_info *ci = ceph_inode(inode);
> 

-- 
Jeff Layton <jlayton@kernel.org>



* Re: [PATCH RFC] ceph: flush the delayed caps in time
  2021-07-21 12:57     ` Jeff Layton
  2021-07-21 13:42       ` Xiubo Li
@ 2021-07-22  9:36       ` Xiubo Li
  2021-07-22 11:42         ` Jeff Layton
  1 sibling, 1 reply; 9+ messages in thread
From: Xiubo Li @ 2021-07-22  9:36 UTC (permalink / raw)
  To: Jeff Layton, idryomov; +Cc: pdonnell, ceph-devel


On 7/21/21 8:57 PM, Jeff Layton wrote:
> On Wed, 2021-07-21 at 19:54 +0800, Xiubo Li wrote:
>> On 7/21/21 7:23 PM, Jeff Layton wrote:
>>> On Wed, 2021-07-21 at 16:27 +0800, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> The delayed_work is executed every 5 seconds, and during that window
>>>> the cap_delay_list may accumulate thousands of caps that need to be
>>>> flushed. This fills the MDS's dispatch queue and takes a very long
>>>> time to drain. Any other operation, such as a rmdir request, is then
>>>> added to the tail of the dispatch queue and may have to wait for
>>>> several seconds, or even tens of seconds.
>>>>
>>>> On the client side we shouldn't queue too many cap requests; flush
>>>> them once more than 100 items have accumulated.
>>>>
>>> Why 100? My inclination is to say NAK on this.
>> This is just from my testing: around 100 queued client_caps requests
>> work fine in most cases and don't take too long to handle. Sometimes
>> the client will send thousands of requests in a short time, and that
>> becomes a problem.
> What may be a better approach is to figure out why we're holding on to
> so many caps and trying to flush them all at once. Maybe if we were to
> more aggressively flush sooner, we'd not end up with such a backlog?

 881912 Jul 22 10:36:14 lxbceph1 kernel: ceph:  00000000f7ee4ccf mode 040755 uid.gid 0.0
 881913 Jul 22 10:36:14 lxbceph1 kernel: ceph:  size 0 -> 0
 881914 Jul 22 10:36:14 lxbceph1 kernel: ceph:  truncate_seq 0 -> 1
 881915 Jul 22 10:36:14 lxbceph1 kernel: ceph:  truncate_size 0 -> 18446744073709551615
 881916 Jul 22 10:36:14 lxbceph1 kernel: ceph:  add_cap 00000000f7ee4ccf mds0 cap 152d pAsxLsXsxFsx seq 1
 881917 Jul 22 10:36:14 lxbceph1 kernel: ceph:  lookup_snap_realm 1 0000000095ff27ff
 881918 Jul 22 10:36:14 lxbceph1 kernel: ceph:  get_realm 0000000095ff27ff 5421 -> 5422
 881919 Jul 22 10:36:14 lxbceph1 kernel: ceph:  __ceph_caps_issued 00000000f7ee4ccf cap 000000003c8bc134 issued -
 881920 Jul 22 10:36:14 lxbceph1 kernel: ceph:   marking 00000000f7ee4ccf NOT complete
 881921 Jul 22 10:36:14 lxbceph1 kernel: ceph:   issued pAsxLsXsxFsx, mds wanted -, actual -, queueing
 881922 Jul 22 10:36:14 lxbceph1 kernel: ceph:  __cap_set_timeouts 00000000f7ee4ccf min 5036 max 60036
 881923 Jul 22 10:36:14 lxbceph1 kernel: ceph: __cap_delay_requeue 00000000f7ee4ccf flags 0 at 4294896928
 881924 Jul 22 10:36:14 lxbceph1 kernel: ceph:  add_cap inode 00000000f7ee4ccf (1000000152a.fffffffffffffffe) cap 000000003c8bc134 pAsxLsXsxFsx now pAsxLsXsxFsx seq 1 mds0
 881925 Jul 22 10:36:14 lxbceph1 kernel: ceph:   marking 00000000f7ee4ccf complete (empty)
 881926 Jul 22 10:36:14 lxbceph1 kernel: ceph:  dn 0000000079fd7e04 attached to 00000000f7ee4ccf ino 1000000152a.fffffffffffffffe
 881927 Jul 22 10:36:14 lxbceph1 kernel: ceph: update_dentry_lease 0000000079fd7e04 duration 30000 ms ttl 4294866888
 881928 Jul 22 10:36:14 lxbceph1 kernel: ceph:  dentry_lru_touch 000000006fddf4b0 0000000079fd7e04 'removalC.test01806' (offset 0)


From the kernel logs, many of these delayed caps come from the
previous thousands of 'mkdir' requests: when a mkdir reply arrives and
the new cap is created, the MDS has issued extra caps the client
doesn't want, so the cap is added to the delay queue, and within 5
seconds the delay list can accumulate several thousand caps.

And most of them come from stale dentries.

So 5 seconds later, when the delayed work fires, all of these
client_caps requests are queued in the MDS dispatch queue at once.

The following commit improves the performance significantly:

commit 37c4efc1ddf98ba8b234d116d863a9464445901e
Author: Yan, Zheng <zyan@redhat.com>
Date:   Thu Jan 31 16:55:51 2019 +0800

     ceph: periodically trim stale dentries

     The previous commit made the VFS delete a stale dentry when its last
     reference is dropped. A lease can also become invalid when the
     corresponding dentry has no references. This patch makes cephfs
     periodically scan the lease list and delete the corresponding dentry
     if its lease is invalid.

     There are two types of lease: dentry lease and dir lease. A dentry
     lease has a lifetime and applies to a single dentry. A dentry lease
     is added to the tail of a list when it is updated; leases at the
     front of the list expire first. A dir lease is CEPH_CAP_FILE_SHARED
     on a directory inode and applies to all dentries in that directory.
     Dentries that have dir leases are added to another list, and dentries
     in that list are periodically checked in a round-robin manner.

With this commit, my test finishes in 3~4 minutes:

real    3m33.998s
user    0m0.644s
sys    0m2.341s
[1]   Done                    ( cd /mnt/kcephfs.$i && time strace -Tv -o ~/removal${i}.log -- rm -rf removal$i* )
real    3m34.028s
user    0m0.620s
sys    0m2.342s
[2]-  Done                    ( cd /mnt/kcephfs.$i && time strace -Tv -o ~/removal${i}.log -- rm -rf removal$i* )
real    3m34.049s
user    0m0.638s
sys    0m2.315s
[3]+  Done                    ( cd /mnt/kcephfs.$i && time strace -Tv -o ~/removal${i}.log -- rm -rf removal$i* )

Without it, the above 3 'rm' threads take more than 12 minutes.

Thanks.





>>> I'm not a fan of this sort of arbitrary throttling in order to alleviate
>>> a scalability problem in the MDS. The cap delay logic is already
>>> horrifically complex as well, and this just makes it worse. If the MDS
>>> happens to be processing requests from a lot of clients, this may not
>>> help at all.
>> Yeah, right; in that case the MDS dispatch queue could still
>> accumulate even more requests in a short time.
>>
>>
>>>> URL: https://tracker.ceph.com/issues/51734
>>> The core problem sounds like the MDS got a call and just took way too
>>> long to get to it. Shouldn't we be addressing this in the MDS instead?
>>> It sounds like it needs to do a better job of parallelizing cap flushes.
>> The problem is that almost all requests need to acquire the global
>> 'mds_lock', so parallelizing won't improve things much.
>>
> Yuck. That sounds like the root cause right there. For the record, I'm
> against papering over that with client patches.
>
>> Currently all the client-side requests are sent with the priority
>> CEPH_MSG_PRIO_DEFAULT:
>>
>> m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
>>
>> I'm not sure whether we could just give the cap flushing requests a
>> lower priority? If that doesn't introduce other potential issues it
>> would work, because the MDS dispatch queue handles high-priority
>> requests first.
>>
> Tough call. You might mark it for lower priority, but then someone calls
> fsync() and suddenly you need it to finish sooner. I think that sort of
> change might backfire in some cases.
>
> Usually when faced with this sort of issue, the right answer is to try
> to work ahead of the "spikes" in activity so that you don't have quite
> such a backlog when you do need to flush everything out. I'm not sure
> how possible that is here.
>
> One underlying issue may be that some of the client's background
> activity is currently driven by arbitrary timers (e.g. the delayed_work
> timer). When the timer pops, it suddenly has to do a bunch of work, when
> some of it could have been done earlier so that the load was spread out.
>
>
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    fs/ceph/caps.c       | 21 ++++++++++++++++++++-
>>>>    fs/ceph/mds_client.c |  3 ++-
>>>>    fs/ceph/mds_client.h |  3 +++
>>>>    3 files changed, 25 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>>> index 4b966c29d9b5..064865761d2b 100644
>>>> --- a/fs/ceph/caps.c
>>>> +++ b/fs/ceph/caps.c
>>>> @@ -507,6 +507,8 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
>>>>    static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>>>    				struct ceph_inode_info *ci)
>>>>    {
>>>> +	int num = 0;
>>>> +
>>>>    	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
>>>>    	     ci->i_ceph_flags, ci->i_hold_caps_max);
>>>>    	if (!mdsc->stopping) {
>>>> @@ -515,12 +517,19 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>>>    			if (ci->i_ceph_flags & CEPH_I_FLUSH)
>>>>    				goto no_change;
>>>>    			list_del_init(&ci->i_cap_delay_list);
>>>> +			mdsc->num_cap_delay--;
>>>>    		}
>>>>    		__cap_set_timeouts(mdsc, ci);
>>>>    		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
>>>> +		num = ++mdsc->num_cap_delay;
>>>>    no_change:
>>>>    		spin_unlock(&mdsc->cap_delay_lock);
>>>>    	}
>>>> +
>>>> +	if (num > 100) {
>>>> +		flush_delayed_work(&mdsc->delayed_work);
>>>> +		schedule_delayed(mdsc);
>>>> +	}
>>>>    }
>>>>    
>>>>    /*
>>>> @@ -531,13 +540,23 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>>>    static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
>>>>    				      struct ceph_inode_info *ci)
>>>>    {
>>>> +	int num;
>>>> +
>>>>    	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
>>>>    	spin_lock(&mdsc->cap_delay_lock);
>>>>    	ci->i_ceph_flags |= CEPH_I_FLUSH;
>>>> -	if (!list_empty(&ci->i_cap_delay_list))
>>>> +	if (!list_empty(&ci->i_cap_delay_list)) {
>>>>    		list_del_init(&ci->i_cap_delay_list);
>>>> +		mdsc->num_cap_delay--;
>>>> +	}
>>>>    	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
>>>> +	num = ++mdsc->num_cap_delay;
>>>>    	spin_unlock(&mdsc->cap_delay_lock);
>>>> +
>>>> +	if (num > 100) {
>>>> +		flush_delayed_work(&mdsc->delayed_work);
>>>> +		schedule_delayed(mdsc);
>>>> +	}
>>>>    }
>>>>    
>>>>    /*
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index 79aa4ce3a388..14e44de05812 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -4514,7 +4514,7 @@ void inc_session_sequence(struct ceph_mds_session *s)
>>>>    /*
>>>>     * delayed work -- periodically trim expired leases, renew caps with mds
>>>>     */
>>>> -static void schedule_delayed(struct ceph_mds_client *mdsc)
>>>> +void schedule_delayed(struct ceph_mds_client *mdsc)
>>>>    {
>>>>    	int delay = 5;
>>>>    	unsigned hz = round_jiffies_relative(HZ * delay);
>>>> @@ -4616,6 +4616,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>>>>    	mdsc->request_tree = RB_ROOT;
>>>>    	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
>>>>    	mdsc->last_renew_caps = jiffies;
>>>> +	mdsc->num_cap_delay = 0;
>>>>    	INIT_LIST_HEAD(&mdsc->cap_delay_list);
>>>>    	INIT_LIST_HEAD(&mdsc->cap_wait_list);
>>>>    	spin_lock_init(&mdsc->cap_delay_lock);
>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>> index a7af09257382..b4289b8d23ec 100644
>>>> --- a/fs/ceph/mds_client.h
>>>> +++ b/fs/ceph/mds_client.h
>>>> @@ -423,6 +423,7 @@ struct ceph_mds_client {
>>>>    	struct rb_root         request_tree;  /* pending mds requests */
>>>>    	struct delayed_work    delayed_work;  /* delayed work */
>>>>    	unsigned long    last_renew_caps;  /* last time we renewed our caps */
>>>> +	unsigned long    num_cap_delay;    /* caps in the cap_delay_list */
>>>>    	struct list_head cap_delay_list;   /* caps with delayed release */
>>>>    	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
>>>>    	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
>>>> @@ -568,6 +569,8 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
>>>>    			  struct ceph_mds_session *session,
>>>>    			  int max_caps);
>>>>    
>>>> +extern void schedule_delayed(struct ceph_mds_client *mdsc);
>>>> +
>>>>    static inline int ceph_wait_on_async_create(struct inode *inode)
>>>>    {
>>>>    	struct ceph_inode_info *ci = ceph_inode(inode);



* Re: [PATCH RFC] ceph: flush the delayed caps in time
  2021-07-22  9:36       ` Xiubo Li
@ 2021-07-22 11:42         ` Jeff Layton
  2021-07-22 13:53           ` Xiubo Li
  0 siblings, 1 reply; 9+ messages in thread
From: Jeff Layton @ 2021-07-22 11:42 UTC (permalink / raw)
  To: Xiubo Li, idryomov; +Cc: pdonnell, ceph-devel

On Thu, 2021-07-22 at 17:36 +0800, Xiubo Li wrote:
> On 7/21/21 8:57 PM, Jeff Layton wrote:
> > On Wed, 2021-07-21 at 19:54 +0800, Xiubo Li wrote:
> > > On 7/21/21 7:23 PM, Jeff Layton wrote:
> > > > On Wed, 2021-07-21 at 16:27 +0800, xiubli@redhat.com wrote:
> > > > > From: Xiubo Li <xiubli@redhat.com>
> > > > > 
> > > > > The delayed_work is executed every 5 seconds, and during that time
> > > > > the cap_delay_list may accumulate thousands of caps that need to be
> > > > > flushed. This can fill the MDS's dispatch queue, which then needs a
> > > > > very long time to drain. And if there are other operations, like a
> > > > > rmdir request, they are added at the tail of the dispatch queue and
> > > > > may have to wait for several or tens of seconds.
> > > > > 
> > > > > On the client side we shouldn't queue too many cap requests; flush
> > > > > them once there are more than 100 items.
> > > > > 
> > > > Why 100? My inclination is to say NAK on this.
> > > This is just from my testing: around 100 queued client_caps requests
> > > works fine in most cases and doesn't take too long to handle. Sometimes
> > > the client will send thousands of requests in a short time, and that
> > > becomes a problem.
> > What may be a better approach is to figure out why we're holding on to
> > so many caps and trying to flush them all at once. Maybe if we were to
> > more aggressively flush sooner, we'd not end up with such a backlog?
> 
>   881912 Jul 22 10:36:14 lxbceph1 kernel: ceph:  00000000f7ee4ccf mode 
> 040755 uid.gid 0.0
>   881913 Jul 22 10:36:14 lxbceph1 kernel: ceph:  size 0 -> 0
>   881914 Jul 22 10:36:14 lxbceph1 kernel: ceph:  truncate_seq 0 -> 1
>   881915 Jul 22 10:36:14 lxbceph1 kernel: ceph:  truncate_size 0 -> 
> 18446744073709551615
>   881916 Jul 22 10:36:14 lxbceph1 kernel: ceph:  add_cap 
> 00000000f7ee4ccf mds0 cap 152d pAsxLsXsxFsx seq 1
>   881917 Jul 22 10:36:14 lxbceph1 kernel: ceph:  lookup_snap_realm 1 
> 0000000095ff27ff
>   881918 Jul 22 10:36:14 lxbceph1 kernel: ceph:  get_realm 
> 0000000095ff27ff 5421 -> 5422
>   881919 Jul 22 10:36:14 lxbceph1 kernel: ceph:  __ceph_caps_issued 
> 00000000f7ee4ccf cap 000000003c8bc134 issued -
>   881920 Jul 22 10:36:14 lxbceph1 kernel: ceph:   marking 
> 00000000f7ee4ccf NOT complete
>   881921 Jul 22 10:36:14 lxbceph1 kernel: ceph:   issued pAsxLsXsxFsx, 
> mds wanted -, actual -, queueing
>   881922 Jul 22 10:36:14 lxbceph1 kernel: ceph:  __cap_set_timeouts 
> 00000000f7ee4ccf min 5036 max 60036
>   881923 Jul 22 10:36:14 lxbceph1 kernel: ceph: __cap_delay_requeue 
> 00000000f7ee4ccf flags 0 at 4294896928
>   881924 Jul 22 10:36:14 lxbceph1 kernel: ceph:  add_cap inode 
> 00000000f7ee4ccf (1000000152a.fffffffffffffffe) cap 000000003c8bc134 
> pAsxLsXsxFsx now pAsxLsXsxFsx seq 1 mds0
>   881925 Jul 22 10:36:14 lxbceph1 kernel: ceph:   marking 
> 00000000f7ee4ccf complete (empty)
>   881926 Jul 22 10:36:14 lxbceph1 kernel: ceph:  dn 0000000079fd7e04 
> attached to 00000000f7ee4ccf ino 1000000152a.fffffffffffffffe
>   881927 Jul 22 10:36:14 lxbceph1 kernel: ceph: update_dentry_lease 
> 0000000079fd7e04 duration 30000 ms ttl 4294866888
>   881928 Jul 22 10:36:14 lxbceph1 kernel: ceph:  dentry_lru_touch 
> 000000006fddf4b0 0000000079fd7e04 'removalC.test01806' (offset 0)
> 
> 
> From the kernel logs, many of these delayed caps come from the earlier 
> thousands of 'mkdir' requests: when a mkdir reply arrives and the new 
> caps are created, the MDS has issued extra caps the client doesn't 
> want, so those caps are added to the delay queue, and within 5 seconds 
> the delay list can accumulate several thousands of caps.
> 
> Most of them come from stale dentries.
> 
> So 5 seconds later, when the delayed work fires, all these client_caps 
> requests are queued in the MDS dispatch queue at once.
> 
> The following commit improves the performance considerably:
> 
> commit 37c4efc1ddf98ba8b234d116d863a9464445901e
> Author: Yan, Zheng <zyan@redhat.com>
> Date:   Thu Jan 31 16:55:51 2019 +0800
> 
>      ceph: periodically trim stale dentries
> 
>      A previous commit makes the VFS delete a stale dentry when its
>      last reference is dropped. A lease can also become invalid when
>      the corresponding dentry has no reference. This patch makes cephfs
>      periodically scan the lease list and delete the corresponding
>      dentry if its lease is invalid.
> 
>      There are two types of lease: dentry lease and dir lease. A dentry
>      lease has a lifetime and applies to a single dentry. A dentry
>      lease is added to the tail of a list when it's updated; leases at
>      the front of the list expire first. A dir lease is
>      CEPH_CAP_FILE_SHARED on a directory inode and applies to all
>      dentries in the directory. Dentries with dir leases are added to
>      another list and are periodically checked in a round-robin manner.
> 
> With this commit, it takes 3~4 minutes to finish my test:
> 
> real    3m33.998s
> user    0m0.644s
> sys    0m2.341s
> [1]   Done                    ( cd /mnt/kcephfs.$i && time strace -Tv -o 
> ~/removal${i}.log -- rm -rf removal$i* )
> real    3m34.028s
> user    0m0.620s
> sys    0m2.342s
> [2]-  Done                    ( cd /mnt/kcephfs.$i && time strace -Tv -o 
> ~/removal${i}.log -- rm -rf removal$i* )
> real    3m34.049s
> user    0m0.638s
> sys    0m2.315s
> [3]+  Done                    ( cd /mnt/kcephfs.$i && time strace -Tv -o 
> ~/removal${i}.log -- rm -rf removal$i* )
> 
> Without it, the above 3 'rm' threads take more than 12 minutes.
> 

That commit was already merged a couple of years ago. Does this mean you
think the client behavior can't be improved further?
-- 
Jeff Layton <jlayton@kernel.org>



* Re: [PATCH RFC] ceph: flush the delayed caps in time
  2021-07-22 11:42         ` Jeff Layton
@ 2021-07-22 13:53           ` Xiubo Li
  0 siblings, 0 replies; 9+ messages in thread
From: Xiubo Li @ 2021-07-22 13:53 UTC (permalink / raw)
  To: Jeff Layton, idryomov; +Cc: pdonnell, ceph-devel


On 7/22/21 7:42 PM, Jeff Layton wrote:
> On Thu, 2021-07-22 at 17:36 +0800, Xiubo Li wrote:
>> [...]
>
> That commit was already merged a couple of years ago. Does this mean you
> think the client behavior can't be improved further?

No, I think we can.

The customer is mainly using a downstream kernel which doesn't include 
it yet.

I have backported it, and it saves a lot of time in my test. In theory, 
and in rare cases in my testing, delayed_work() can still send a large 
number of cap requests in a short time, but that is very hard for me to 
reproduce with this fix together with the Finisher fix in 
https://github.com/ceph/ceph/pull/42394.

I checked that downstream code: it has some buggy code which could hurt 
performance, but numerous later downstream patches have removed or fixed 
it, by coincidence, while fixing other issues. In the upstream kernel I 
haven't found any new issues yet.
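
One further mitigation (purely a sketch under my own assumptions, not 
something from the patch above or from the ceph tree) would be to cap 
how many flushes a single run of the worker may send, and re-kick the 
worker if a backlog remains, so one run can never dump thousands of cap 
requests into the MDS dispatch queue at once. Reusing the delay_queue 
type from the earlier sketch:

#define CAP_FLUSH_BATCH 100

static void delayed_flush_run(struct delay_queue *q)
{
	int sent = 0;
	int more;

	pthread_mutex_lock(&q->lock);
	while (q->num_delayed > 0 && sent < CAP_FLUSH_BATCH) {
		q->num_delayed--;	/* stand-in for sending one cap flush */
		sent++;
	}
	more = q->num_delayed > 0;
	pthread_mutex_unlock(&q->lock);

	if (more)
		pthread_cond_signal(&q->kick);	/* re-arm soon, avoid one big burst */
}

This bounds the burst the MDS sees per run, at the cost of flushing a 
deep backlog slightly later.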

Thanks.