rcu.vger.kernel.org archive mirror
* [PATCH 2/5] rcu/tree: Add multiple in-flight batches of kfree_rcu work
@ 2019-08-27 19:01 Joel Fernandes (Google)
  2019-08-27 23:52 ` Boqun Feng
  2019-08-28 14:09 ` [PATCH v2] " Joel Fernandes (Google)
  0 siblings, 2 replies; 6+ messages in thread
From: Joel Fernandes (Google) @ 2019-08-27 19:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joel Fernandes (Google),
	Paul E . McKenney, byungchul.park, Josh Triplett, Lai Jiangshan,
	linux-doc, Mathieu Desnoyers, rcu, Steven Rostedt, kernel-team

During testing, it was observed that the amount of memory consumed due to
kfree_rcu() batching is 300-400MB. Previously we had only a single
head_free pointer pointing to the list of rcu_head(s) that are to be
freed after a grace period. Until this list is drained, we cannot queue
any more objects on it, since such objects may not be ready to be
reclaimed when the worker thread eventually gets to draining the
head_free list.

We can do better by maintaining multiple lists as done by this patch.
Testing shows that memory consumption came down by around 100-150MB by
just adding another list. Adding more than one additional list did not
show any further improvement.
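
To illustrate the idea, here is a minimal, self-contained user-space
sketch (hypothetical, simplified types; the real implementation is the
kernel diff below). Each CPU has KFREE_N_BATCHES batch slots, so a new
batch can be handed off whenever at least one slot is idle:

	#include <stdbool.h>
	#include <stddef.h>

	#define KFREE_N_BATCHES 2

	struct obj_head { struct obj_head *next; };

	/* One in-flight batch: objects waiting out a grace period. */
	struct batch_slot {
		struct obj_head *head_free;
	};

	/* Per-CPU state: objects still accumulating, plus the batch slots. */
	struct cpu_batches {
		struct obj_head *head;
		struct batch_slot slot[KFREE_N_BATCHES];
	};

	/* Hand ->head off to a free slot; fails only if every slot is busy. */
	static bool hand_off_batch(struct cpu_batches *cb)
	{
		struct batch_slot *s = NULL;
		int i;

		for (i = 0; i < KFREE_N_BATCHES; i++) {
			if (!cb->slot[i].head_free) {
				s = &cb->slot[i];
				break;
			}
		}
		if (!s)
			return false;	/* all batches in flight; retry later */

		s->head_free = cb->head;
		cb->head = NULL;
		/* the kernel would now call queue_rcu_work() on this slot */
		return true;
	}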

Suggested-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 kernel/rcu/tree.c | 64 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4f7c3096d786..9b9ae4db1c2d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2688,28 +2688,38 @@ EXPORT_SYMBOL_GPL(call_rcu);
 
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_N_BATCHES 2
+
+struct kfree_rcu_work {
+	/* The rcu_work node for queuing work with queue_rcu_work(). The work
+	 * is done after a grace period.
+	 */
+	struct rcu_work rcu_work;
+
+	/* The list of objects that have now left ->head and are queued for
+	 * freeing after a grace period.
+	 */
+	struct rcu_head *head_free;
+
+	struct kfree_rcu_cpu *krcp;
+};
+static DEFINE_PER_CPU(__typeof__(struct kfree_rcu_work)[KFREE_N_BATCHES], krw);
 
 /*
  * Maximum number of kfree(s) to batch, if this limit is hit then the batch of
  * kfree(s) is queued for freeing after a grace period, right away.
  */
 struct kfree_rcu_cpu {
-	/* The rcu_work node for queuing work with queue_rcu_work(). The work
-	 * is done after a grace period.
-	 */
-	struct rcu_work rcu_work;
 
 	/* The list of objects being queued in a batch but are not yet
 	 * scheduled to be freed.
 	 */
 	struct rcu_head *head;
 
-	/* The list of objects that have now left ->head and are queued for
-	 * freeing after a grace period.
-	 */
-	struct rcu_head *head_free;
+	/* Pointer to the per-cpu array of kfree_rcu_work structures */
+	struct kfree_rcu_work *krwp;
 
-	/* Protect concurrent access to this structure. */
+	/* Protect concurrent access to this structure and kfree_rcu_work. */
 	spinlock_t lock;
 
 	/* The delayed work that flushes ->head to ->head_free incase ->head
@@ -2730,12 +2740,14 @@ static void kfree_rcu_work(struct work_struct *work)
 {
 	unsigned long flags;
 	struct rcu_head *head, *next;
-	struct kfree_rcu_cpu *krcp = container_of(to_rcu_work(work),
-					struct kfree_rcu_cpu, rcu_work);
+	struct kfree_rcu_work *krwp = container_of(to_rcu_work(work),
+					struct kfree_rcu_work, rcu_work);
+	struct kfree_rcu_cpu *krcp;
+
+	krcp = krwp->krcp;
 
 	spin_lock_irqsave(&krcp->lock, flags);
-	head = krcp->head_free;
-	krcp->head_free = NULL;
+	head = xchg(&krwp->head_free, NULL);
 	spin_unlock_irqrestore(&krcp->lock, flags);
 
 	/*
@@ -2758,19 +2770,28 @@ static void kfree_rcu_work(struct work_struct *work)
  */
 static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
+	int i = 0;
+	struct kfree_rcu_work *krwp = NULL;
+
 	lockdep_assert_held(&krcp->lock);
+	while (i < KFREE_N_BATCHES) {
+		if (!krcp->krwp[i].head_free) {
+			krwp = &(krcp->krwp[i]);
+			break;
+		}
+		i++;
+	}
 
-	/* If a previous RCU batch work is already in progress, we cannot queue
+	/* If both RCU batches are already in progress, we cannot queue
 	 * another one, just refuse the optimization and it will be retried
 	 * again in KFREE_DRAIN_JIFFIES time.
 	 */
-	if (krcp->head_free)
+	if (!krwp)
 		return false;
 
-	krcp->head_free = krcp->head;
-	krcp->head = NULL;
-	INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work);
-	queue_rcu_work(system_wq, &krcp->rcu_work);
+	krwp->head_free = xchg(&krcp->head, NULL);
+	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
+	queue_rcu_work(system_wq, &krwp->rcu_work);
 
 	return true;
 }
@@ -3736,8 +3757,13 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+		struct kfree_rcu_work *krwp = &(per_cpu(krw, cpu)[0]);
+		int i = KFREE_N_BATCHES;
 
 		spin_lock_init(&krcp->lock);
+		krcp->krwp = krwp;
+		while (i--)
+			krwp[i].krcp = krcp;
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 	}
 }
-- 
2.23.0.187.g17f5b7556c-goog


* Re: [PATCH 2/5] rcu/tree: Add multiple in-flight batches of kfree_rcu work
  2019-08-27 19:01 [PATCH 2/5] rcu/tree: Add multiple in-flight batches of kfree_rcu work Joel Fernandes (Google)
@ 2019-08-27 23:52 ` Boqun Feng
  2019-08-28 14:02   ` Joel Fernandes
  2019-08-28 14:09 ` [PATCH v2] " Joel Fernandes (Google)
  1 sibling, 1 reply; 6+ messages in thread
From: Boqun Feng @ 2019-08-27 23:52 UTC (permalink / raw)
  To: Joel Fernandes (Google)
  Cc: linux-kernel, Paul E . McKenney, byungchul.park, Josh Triplett,
	Lai Jiangshan, linux-doc, Mathieu Desnoyers, rcu, Steven Rostedt,
	kernel-team

Hi Joel,

On Tue, Aug 27, 2019 at 03:01:56PM -0400, Joel Fernandes (Google) wrote:
> During testing, it was observed that the amount of memory consumed due to
> kfree_rcu() batching is 300-400MB. Previously we had only a single
> head_free pointer pointing to the list of rcu_head(s) that are to be
> freed after a grace period. Until this list is drained, we cannot queue
> any more objects on it, since such objects may not be ready to be
> reclaimed when the worker thread eventually gets to draining the
> head_free list.
> 
> We can do better by maintaining multiple lists as done by this patch.
> Testing shows that memory consumption came down by around 100-150MB by
> just adding another list. Adding more than one additional list did not
> show any further improvement.
> 
> Suggested-by: Paul E. McKenney <paulmck@linux.ibm.com>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>  kernel/rcu/tree.c | 64 +++++++++++++++++++++++++++++++++--------------
>  1 file changed, 45 insertions(+), 19 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 4f7c3096d786..9b9ae4db1c2d 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2688,28 +2688,38 @@ EXPORT_SYMBOL_GPL(call_rcu);
>  
>  /* Maximum number of jiffies to wait before draining a batch. */
>  #define KFREE_DRAIN_JIFFIES (HZ / 50)
> +#define KFREE_N_BATCHES 2
> +
> +struct kfree_rcu_work {
> +	/* The rcu_work node for queuing work with queue_rcu_work(). The work
> +	 * is done after a grace period.
> +	 */
> +	struct rcu_work rcu_work;
> +
> +	/* The list of objects that have now left ->head and are queued for
> +	 * freeing after a grace period.
> +	 */
> +	struct rcu_head *head_free;
> +
> +	struct kfree_rcu_cpu *krcp;
> +};
> +static DEFINE_PER_CPU(__typeof__(struct kfree_rcu_work)[KFREE_N_BATCHES], krw);
>  

Why not

	static DEFINE_PER_CPU(struct kfree_rcu_work[KFREE_N_BATCHES], krw);

here? Am I missing something?

Further, given that "struct kfree_rcu_cpu" is only used for defining percpu
variables, how about organizing the data structure like:

	struct kfree_rcu_cpu {
		...
		struct kfree_rcu_work krws[KFREE_N_BATCHES];
		...
	}

This could save one pointer in kfree_rcu_cpu, and I think it provides
better cache locality for accessing _cpu and _work on the same cpu.

Thoughts?

Regards,
Boqun


>  /*
>   * Maximum number of kfree(s) to batch, if this limit is hit then the batch of
>   * kfree(s) is queued for freeing after a grace period, right away.
>   */
>  struct kfree_rcu_cpu {
> -	/* The rcu_work node for queuing work with queue_rcu_work(). The work
> -	 * is done after a grace period.
> -	 */
> -	struct rcu_work rcu_work;
>  
>  	/* The list of objects being queued in a batch but are not yet
>  	 * scheduled to be freed.
>  	 */
>  	struct rcu_head *head;
>  
> -	/* The list of objects that have now left ->head and are queued for
> -	 * freeing after a grace period.
> -	 */
> -	struct rcu_head *head_free;
> +	/* Pointer to the per-cpu array of kfree_rcu_work structures */
> +	struct kfree_rcu_work *krwp;
>  
> -	/* Protect concurrent access to this structure. */
> +	/* Protect concurrent access to this structure and kfree_rcu_work. */
>  	spinlock_t lock;
>  
>  	/* The delayed work that flushes ->head to ->head_free incase ->head
> @@ -2730,12 +2740,14 @@ static void kfree_rcu_work(struct work_struct *work)
>  {
>  	unsigned long flags;
>  	struct rcu_head *head, *next;
> -	struct kfree_rcu_cpu *krcp = container_of(to_rcu_work(work),
> -					struct kfree_rcu_cpu, rcu_work);
> +	struct kfree_rcu_work *krwp = container_of(to_rcu_work(work),
> +					struct kfree_rcu_work, rcu_work);
> +	struct kfree_rcu_cpu *krcp;
> +
> +	krcp = krwp->krcp;
>  
>  	spin_lock_irqsave(&krcp->lock, flags);
> -	head = krcp->head_free;
> -	krcp->head_free = NULL;
> +	head = xchg(&krwp->head_free, NULL);
>  	spin_unlock_irqrestore(&krcp->lock, flags);
>  
>  	/*
> @@ -2758,19 +2770,28 @@ static void kfree_rcu_work(struct work_struct *work)
>   */
>  static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
>  {
> +	int i = 0;
> +	struct kfree_rcu_work *krwp = NULL;
> +
>  	lockdep_assert_held(&krcp->lock);
> +	while (i < KFREE_N_BATCHES) {
> +		if (!krcp->krwp[i].head_free) {
> +			krwp = &(krcp->krwp[i]);
> +			break;
> +		}
> +		i++;
> +	}
>  
> -	/* If a previous RCU batch work is already in progress, we cannot queue
> +	/* If both RCU batches are already in progress, we cannot queue
>  	 * another one, just refuse the optimization and it will be retried
>  	 * again in KFREE_DRAIN_JIFFIES time.
>  	 */
> -	if (krcp->head_free)
> +	if (!krwp)
>  		return false;
>  
> -	krcp->head_free = krcp->head;
> -	krcp->head = NULL;
> -	INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work);
> -	queue_rcu_work(system_wq, &krcp->rcu_work);
> +	krwp->head_free = xchg(&krcp->head, NULL);
> +	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
> +	queue_rcu_work(system_wq, &krwp->rcu_work);
>  
>  	return true;
>  }
> @@ -3736,8 +3757,13 @@ static void __init kfree_rcu_batch_init(void)
>  
>  	for_each_possible_cpu(cpu) {
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> +		struct kfree_rcu_work *krwp = &(per_cpu(krw, cpu)[0]);
> +		int i = KFREE_N_BATCHES;
>  
>  		spin_lock_init(&krcp->lock);
> +		krcp->krwp = krwp;
> +		while (i--)
> +			krwp[i].krcp = krcp;
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
>  	}
>  }
> -- 
> 2.23.0.187.g17f5b7556c-goog
> 

* Re: [PATCH 2/5] rcu/tree: Add multiple in-flight batches of kfree_rcu work
  2019-08-27 23:52 ` Boqun Feng
@ 2019-08-28 14:02   ` Joel Fernandes
  0 siblings, 0 replies; 6+ messages in thread
From: Joel Fernandes @ 2019-08-28 14:02 UTC (permalink / raw)
  To: Boqun Feng
  Cc: linux-kernel, Paul E . McKenney, byungchul.park, Josh Triplett,
	Lai Jiangshan, linux-doc, Mathieu Desnoyers, rcu, Steven Rostedt,
	kernel-team

On Wed, Aug 28, 2019 at 07:52:53AM +0800, Boqun Feng wrote:
> Hi Joel,
> 
> On Tue, Aug 27, 2019 at 03:01:56PM -0400, Joel Fernandes (Google) wrote:
> > During testing, it was observed that the amount of memory consumed due to
> > kfree_rcu() batching is 300-400MB. Previously we had only a single
> > head_free pointer pointing to the list of rcu_head(s) that are to be
> > freed after a grace period. Until this list is drained, we cannot queue
> > any more objects on it, since such objects may not be ready to be
> > reclaimed when the worker thread eventually gets to draining the
> > head_free list.
> > 
> > We can do better by maintaining multiple lists as done by this patch.
> > Testing shows that memory consumption came down by around 100-150MB by
> > just adding another list. Adding more than one additional list did not
> > show any further improvement.
> > 
> > Suggested-by: Paul E. McKenney <paulmck@linux.ibm.com>
> > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > ---
> >  kernel/rcu/tree.c | 64 +++++++++++++++++++++++++++++++++--------------
> >  1 file changed, 45 insertions(+), 19 deletions(-)
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 4f7c3096d786..9b9ae4db1c2d 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -2688,28 +2688,38 @@ EXPORT_SYMBOL_GPL(call_rcu);
> >  
> >  /* Maximum number of jiffies to wait before draining a batch. */
> >  #define KFREE_DRAIN_JIFFIES (HZ / 50)
> > +#define KFREE_N_BATCHES 2
> > +
> > +struct kfree_rcu_work {
> > +	/* The rcu_work node for queuing work with queue_rcu_work(). The work
> > +	 * is done after a grace period.
> > +	 */
> > +	struct rcu_work rcu_work;
> > +
> > +	/* The list of objects that have now left ->head and are queued for
> > +	 * freeing after a grace period.
> > +	 */
> > +	struct rcu_head *head_free;
> > +
> > +	struct kfree_rcu_cpu *krcp;
> > +};
> > +static DEFINE_PER_CPU(__typeof__(struct kfree_rcu_work)[KFREE_N_BATCHES], krw);
> >  
> 
> Why not
> 
> 	static DEFINE_PER_CPU(struct kfree_rcu_work[KFREE_N_BATCHES], krw);
> 
> here? Am I missing something?

Yes, that's better.

> Further, given "struct kfree_rcu_cpu" is only for defining percpu
> variables, how about orginazing the data structure like:
> 
> 	struct kfree_rcu_cpu {
> 		...
> 		struct kfree_rcu_work krws[KFREE_N_BATCHES];
> 		...
> 	}
> 
> This could save one pointer in kfree_rcu_cpu, and I think it provides
> better cache locality for accessing _cpu and _work on the same cpu.
> 
> Thoughts?

Yes, that's better. Thanks, Boqun! Following is the diff which I will fold
into this patch:

---8<-----------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b3259306b7a5..fac5ae96d8b1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2717,7 +2717,6 @@ struct kfree_rcu_work {
 
 	struct kfree_rcu_cpu *krcp;
 };
-static DEFINE_PER_CPU(__typeof__(struct kfree_rcu_work)[KFREE_N_BATCHES], krw);
 
 /*
  * Maximum number of kfree(s) to batch, if this limit is hit then the batch of
@@ -2731,7 +2730,7 @@ struct kfree_rcu_cpu {
 	struct rcu_head *head;
 
 	/* Pointer to the per-cpu array of kfree_rcu_work structures */
-	struct kfree_rcu_work *krwp;
+	struct kfree_rcu_work krw_arr[KFREE_N_BATCHES];
 
 	/* Protect concurrent access to this structure and kfree_rcu_work. */
 	spinlock_t lock;
@@ -2800,8 +2799,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 
 	lockdep_assert_held(&krcp->lock);
 	while (i < KFREE_N_BATCHES) {
-		if (!krcp->krwp[i].head_free) {
-			krwp = &(krcp->krwp[i]);
+		if (!krcp->krw_arr[i].head_free) {
+			krwp = &(krcp->krw_arr[i]);
 			break;
 		}
 		i++;
@@ -3780,13 +3779,11 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-		struct kfree_rcu_work *krwp = &(per_cpu(krw, cpu)[0]);
 		int i = KFREE_N_BATCHES;
 
 		spin_lock_init(&krcp->lock);
-		krcp->krwp = krwp;
 		while (i--)
-			krwp[i].krcp = krcp;
+			krcp->krw_arr[i].krcp = krcp;
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 	}
 }

* [PATCH v2] rcu/tree: Add multiple in-flight batches of kfree_rcu work
  2019-08-27 19:01 [PATCH 2/5] rcu/tree: Add multiple in-flight batches of kfree_rcu work Joel Fernandes (Google)
  2019-08-27 23:52 ` Boqun Feng
@ 2019-08-28 14:09 ` Joel Fernandes (Google)
  2019-08-28 20:45   ` Paul E. McKenney
  1 sibling, 1 reply; 6+ messages in thread
From: Joel Fernandes (Google) @ 2019-08-28 14:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joel Fernandes (Google),
	Paul E . McKenney, byungchul.park, Josh Triplett, Lai Jiangshan,
	linux-doc, Mathieu Desnoyers, rcu, Steven Rostedt, kernel-team

During testing, it was observed that the amount of memory consumed due to
kfree_rcu() batching is 300-400MB. Previously we had only a single
head_free pointer pointing to the list of rcu_head(s) that are to be
freed after a grace period. Until this list is drained, we cannot queue
any more objects on it, since such objects may not be ready to be
reclaimed when the worker thread eventually gets to draining the
head_free list.

We can do better by maintaining multiple lists as done by this patch.
Testing shows that memory consumption came down by around 100-150MB by
just adding another list. Adding more than one additional list did not
show any further improvement.

Suggested-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 kernel/rcu/tree.c | 61 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 19 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4f7c3096d786..5bf8f7e793ea 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2688,28 +2688,37 @@ EXPORT_SYMBOL_GPL(call_rcu);
 
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_N_BATCHES 2
+
+struct kfree_rcu_work {
+	/* The rcu_work node for queuing work with queue_rcu_work(). The work
+	 * is done after a grace period.
+	 */
+	struct rcu_work rcu_work;
+
+	/* The list of objects that have now left ->head and are queued for
+	 * freeing after a grace period.
+	 */
+	struct rcu_head *head_free;
+
+	struct kfree_rcu_cpu *krcp;
+};
 
 /*
  * Maximum number of kfree(s) to batch, if this limit is hit then the batch of
  * kfree(s) is queued for freeing after a grace period, right away.
  */
 struct kfree_rcu_cpu {
-	/* The rcu_work node for queuing work with queue_rcu_work(). The work
-	 * is done after a grace period.
-	 */
-	struct rcu_work rcu_work;
 
 	/* The list of objects being queued in a batch but are not yet
 	 * scheduled to be freed.
 	 */
 	struct rcu_head *head;
 
-	/* The list of objects that have now left ->head and are queued for
-	 * freeing after a grace period.
-	 */
-	struct rcu_head *head_free;
+	/* Pointer to the per-cpu array of kfree_rcu_work structures */
+	struct kfree_rcu_work krw_arr[KFREE_N_BATCHES];
 
-	/* Protect concurrent access to this structure. */
+	/* Protect concurrent access to this structure and kfree_rcu_work. */
 	spinlock_t lock;
 
 	/* The delayed work that flushes ->head to ->head_free incase ->head
@@ -2730,12 +2739,14 @@ static void kfree_rcu_work(struct work_struct *work)
 {
 	unsigned long flags;
 	struct rcu_head *head, *next;
-	struct kfree_rcu_cpu *krcp = container_of(to_rcu_work(work),
-					struct kfree_rcu_cpu, rcu_work);
+	struct kfree_rcu_work *krwp = container_of(to_rcu_work(work),
+					struct kfree_rcu_work, rcu_work);
+	struct kfree_rcu_cpu *krcp;
+
+	krcp = krwp->krcp;
 
 	spin_lock_irqsave(&krcp->lock, flags);
-	head = krcp->head_free;
-	krcp->head_free = NULL;
+	head = xchg(&krwp->head_free, NULL);
 	spin_unlock_irqrestore(&krcp->lock, flags);
 
 	/*
@@ -2758,19 +2769,28 @@ static void kfree_rcu_work(struct work_struct *work)
  */
 static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
+	int i = 0;
+	struct kfree_rcu_work *krwp = NULL;
+
 	lockdep_assert_held(&krcp->lock);
+	while (i < KFREE_N_BATCHES) {
+		if (!krcp->krw_arr[i].head_free) {
+			krwp = &(krcp->krw_arr[i]);
+			break;
+		}
+		i++;
+	}
 
-	/* If a previous RCU batch work is already in progress, we cannot queue
+	/* If both RCU batches are already in progress, we cannot queue
 	 * another one, just refuse the optimization and it will be retried
 	 * again in KFREE_DRAIN_JIFFIES time.
 	 */
-	if (krcp->head_free)
+	if (!krwp)
 		return false;
 
-	krcp->head_free = krcp->head;
-	krcp->head = NULL;
-	INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work);
-	queue_rcu_work(system_wq, &krcp->rcu_work);
+	krwp->head_free = xchg(&krcp->head, NULL);
+	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
+	queue_rcu_work(system_wq, &krwp->rcu_work);
 
 	return true;
 }
@@ -3736,8 +3756,11 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+		int i = KFREE_N_BATCHES;
 
 		spin_lock_init(&krcp->lock);
+		while (i--)
+			krcp->krw_arr[i].krcp = krcp;
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 	}
 }
-- 
2.23.0.187.g17f5b7556c-goog


* Re: [PATCH v2] rcu/tree: Add multiple in-flight batches of kfree_rcu work
  2019-08-28 14:09 ` [PATCH v2] " Joel Fernandes (Google)
@ 2019-08-28 20:45   ` Paul E. McKenney
  2019-08-29 21:26     ` Joel Fernandes
  0 siblings, 1 reply; 6+ messages in thread
From: Paul E. McKenney @ 2019-08-28 20:45 UTC (permalink / raw)
  To: Joel Fernandes (Google)
  Cc: linux-kernel, byungchul.park, Josh Triplett, Lai Jiangshan,
	linux-doc, Mathieu Desnoyers, rcu, Steven Rostedt, kernel-team

On Wed, Aug 28, 2019 at 10:09:52AM -0400, Joel Fernandes (Google) wrote:
> During testing, it was observed that the amount of memory consumed due to
> kfree_rcu() batching is 300-400MB. Previously we had only a single
> head_free pointer pointing to the list of rcu_head(s) that are to be
> freed after a grace period. Until this list is drained, we cannot queue
> any more objects on it, since such objects may not be ready to be
> reclaimed when the worker thread eventually gets to draining the
> head_free list.
> 
> We can do better by maintaining multiple lists as done by this patch.
> Testing shows that memory consumption came down by around 100-150MB by
> just adding another list. Adding more than one additional list did not
> show any further improvement.

Nice!  A few comments below.  Please address them and post a full v3.
(I am off the next two days, and I guarantee you that upon return I will
mix and match the wrong patches otherwise!)

> Suggested-by: Paul E. McKenney <paulmck@linux.ibm.com>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>  kernel/rcu/tree.c | 61 ++++++++++++++++++++++++++++++++---------------
>  1 file changed, 42 insertions(+), 19 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 4f7c3096d786..5bf8f7e793ea 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2688,28 +2688,37 @@ EXPORT_SYMBOL_GPL(call_rcu);
>  
>  /* Maximum number of jiffies to wait before draining a batch. */
>  #define KFREE_DRAIN_JIFFIES (HZ / 50)
> +#define KFREE_N_BATCHES 2
> +
> +struct kfree_rcu_work {
> +	/* The rcu_work node for queuing work with queue_rcu_work(). The work
> +	 * is done after a grace period.
> +	 */
> +	struct rcu_work rcu_work;
> +
> +	/* The list of objects that have now left ->head and are queued for
> +	 * freeing after a grace period.
> +	 */
> +	struct rcu_head *head_free;
> +
> +	struct kfree_rcu_cpu *krcp;
> +};
>  
>  /*
>   * Maximum number of kfree(s) to batch, if this limit is hit then the batch of
>   * kfree(s) is queued for freeing after a grace period, right away.
>   */
>  struct kfree_rcu_cpu {
> -	/* The rcu_work node for queuing work with queue_rcu_work(). The work
> -	 * is done after a grace period.
> -	 */
> -	struct rcu_work rcu_work;
>  
>  	/* The list of objects being queued in a batch but are not yet
>  	 * scheduled to be freed.
>  	 */
>  	struct rcu_head *head;
>  
> -	/* The list of objects that have now left ->head and are queued for
> -	 * freeing after a grace period.
> -	 */
> -	struct rcu_head *head_free;
> +	/* Pointer to the per-cpu array of kfree_rcu_work structures */
> +	struct kfree_rcu_work krw_arr[KFREE_N_BATCHES];
>  
> -	/* Protect concurrent access to this structure. */
> +	/* Protect concurrent access to this structure and kfree_rcu_work. */
>  	spinlock_t lock;
>  
>  	/* The delayed work that flushes ->head to ->head_free incase ->head
> @@ -2730,12 +2739,14 @@ static void kfree_rcu_work(struct work_struct *work)
>  {
>  	unsigned long flags;
>  	struct rcu_head *head, *next;
> -	struct kfree_rcu_cpu *krcp = container_of(to_rcu_work(work),
> -					struct kfree_rcu_cpu, rcu_work);
> +	struct kfree_rcu_work *krwp = container_of(to_rcu_work(work),
> +					struct kfree_rcu_work, rcu_work);
> +	struct kfree_rcu_cpu *krcp;
> +
> +	krcp = krwp->krcp;
>  
>  	spin_lock_irqsave(&krcp->lock, flags);
> -	head = krcp->head_free;
> -	krcp->head_free = NULL;
> +	head = xchg(&krwp->head_free, NULL);

Given that we hold the lock, why the xchg()?  Alternatively, why not
just acquire the lock in the other places you use xchg()?  This is a
per-CPU lock, so contention should not be a problem, should it?
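
That is, something along these lines (an illustrative rework of the hunk
above, not the posted patch):

	spin_lock_irqsave(&krcp->lock, flags);
	/* krcp->lock is held, so a plain pointer move is enough here and
	 * no xchg() is needed.
	 */
	head = krwp->head_free;
	krwp->head_free = NULL;
	spin_unlock_irqrestore(&krcp->lock, flags);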

>  	spin_unlock_irqrestore(&krcp->lock, flags);
>  
>  	/*
> @@ -2758,19 +2769,28 @@ static void kfree_rcu_work(struct work_struct *work)
>   */
>  static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
>  {
> +	int i = 0;
> +	struct kfree_rcu_work *krwp = NULL;
> +
>  	lockdep_assert_held(&krcp->lock);
> +	while (i < KFREE_N_BATCHES) {
> +		if (!krcp->krw_arr[i].head_free) {
> +			krwp = &(krcp->krw_arr[i]);
> +			break;
> +		}
> +		i++;
> +	}
>  
> -	/* If a previous RCU batch work is already in progress, we cannot queue
> +	/* If both RCU batches are already in progress, we cannot queue
>  	 * another one, just refuse the optimization and it will be retried
>  	 * again in KFREE_DRAIN_JIFFIES time.
>  	 */

If you are going to remove the traditional first "/*" line of a comment,
why not go all the way and cut the last one as well?  "//".
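
That is, either of these forms (illustrative only), rather than keeping
only the trailing "*/" of the traditional style:

	/*
	 * If both RCU batches are already in progress, we cannot queue
	 * another one; it will be retried in KFREE_DRAIN_JIFFIES time.
	 */

	// If both RCU batches are already in progress, we cannot queue
	// another one; it will be retried in KFREE_DRAIN_JIFFIES time.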

> -	if (krcp->head_free)
> +	if (!krwp)
>  		return false;
>  
> -	krcp->head_free = krcp->head;
> -	krcp->head = NULL;
> -	INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work);
> -	queue_rcu_work(system_wq, &krcp->rcu_work);
> +	krwp->head_free = xchg(&krcp->head, NULL);

This isn't anywhere near a fastpath, so just acquiring the lock is a
better choice here.

> +	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
> +	queue_rcu_work(system_wq, &krwp->rcu_work);
>  
>  	return true;
>  }
> @@ -3736,8 +3756,11 @@ static void __init kfree_rcu_batch_init(void)
>  
>  	for_each_possible_cpu(cpu) {
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> +		int i = KFREE_N_BATCHES;
>  
>  		spin_lock_init(&krcp->lock);
> +		while (i--)
> +			krcp->krw_arr[i].krcp = krcp;

This was indeed a nice trick back in the PDP-11 days of 64-kilobyte
address spaces, so thank you for the nostalgia!  However, a straight-up
"for" loop is less vulnerable to code being added between the declaration
of "i" and the "while" loop.

>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
>  	}
>  }
> -- 
> 2.23.0.187.g17f5b7556c-goog
> 

* Re: [PATCH v2] rcu/tree: Add multiple in-flight batches of kfree_rcu work
  2019-08-28 20:45   ` Paul E. McKenney
@ 2019-08-29 21:26     ` Joel Fernandes
  0 siblings, 0 replies; 6+ messages in thread
From: Joel Fernandes @ 2019-08-29 21:26 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, byungchul.park, Josh Triplett, Lai Jiangshan,
	linux-doc, Mathieu Desnoyers, rcu, Steven Rostedt

On Wed, Aug 28, 2019 at 01:45:21PM -0700, Paul E. McKenney wrote:
> On Wed, Aug 28, 2019 at 10:09:52AM -0400, Joel Fernandes (Google) wrote:
> > During testing, it was observed that the amount of memory consumed due to
> > kfree_rcu() batching is 300-400MB. Previously we had only a single
> > head_free pointer pointing to the list of rcu_head(s) that are to be
> > freed after a grace period. Until this list is drained, we cannot queue
> > any more objects on it, since such objects may not be ready to be
> > reclaimed when the worker thread eventually gets to draining the
> > head_free list.
> > 
> > We can do better by maintaining multiple lists as done by this patch.
> > Testing shows that memory consumption came down by around 100-150MB by
> > just adding another list. Adding more than one additional list did not
> > show any further improvement.
[snip]
> > @@ -2730,12 +2739,14 @@ static void kfree_rcu_work(struct work_struct *work)
> >  {
> >  	unsigned long flags;
> >  	struct rcu_head *head, *next;
> > -	struct kfree_rcu_cpu *krcp = container_of(to_rcu_work(work),
> > -					struct kfree_rcu_cpu, rcu_work);
> > +	struct kfree_rcu_work *krwp = container_of(to_rcu_work(work),
> > +					struct kfree_rcu_work, rcu_work);
> > +	struct kfree_rcu_cpu *krcp;
> > +
> > +	krcp = krwp->krcp;
> >  
> >  	spin_lock_irqsave(&krcp->lock, flags);
> > -	head = krcp->head_free;
> > -	krcp->head_free = NULL;
> > +	head = xchg(&krwp->head_free, NULL);
> 
> Given that we hold the lock, why the xchg()?  Alternatively, why not
> just acquire the lock in the other places you use xchg()?  This is a
> per-CPU lock, so contention should not be a problem, should it?

I realized I was being silly :(. I was trying to reduce lines of code and
hence implemented it as a one-liner. The xchg() is not needed for (or
intended as) any locking protocol since, as you pointed out, the lock is
already held.

> >  	spin_unlock_irqrestore(&krcp->lock, flags);
> >  
> >  	/*
> > @@ -2758,19 +2769,28 @@ static void kfree_rcu_work(struct work_struct *work)
> >   */
> >  static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
> >  {
> > +	int i = 0;
> > +	struct kfree_rcu_work *krwp = NULL;
> > +
> >  	lockdep_assert_held(&krcp->lock);
> > +	while (i < KFREE_N_BATCHES) {
> > +		if (!krcp->krw_arr[i].head_free) {
> > +			krwp = &(krcp->krw_arr[i]);
> > +			break;
> > +		}
> > +		i++;
> > +	}
> >  
> > -	/* If a previous RCU batch work is already in progress, we cannot queue
> > +	/* If both RCU batches are already in progress, we cannot queue
> >  	 * another one, just refuse the optimization and it will be retried
> >  	 * again in KFREE_DRAIN_JIFFIES time.
> >  	 */
> 
> If you are going to remove the traditional first "/*" line of a comment,
> why not go all the way and cut the last one as well?  "//".

Will add the /* in the beginning :)

> > -	if (krcp->head_free)
> > +	if (!krwp)
> >  		return false;
> >  
> > -	krcp->head_free = krcp->head;
> > -	krcp->head = NULL;
> > -	INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work);
> > -	queue_rcu_work(system_wq, &krcp->rcu_work);
> > +	krwp->head_free = xchg(&krcp->head, NULL);
> 
> This isn't anywhere near a fastpath, so just acquiring the lock is a
> better choice here.

My reasoning was the same as above. I will change it to two statements since
the lock is already held.

> > +	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
> > +	queue_rcu_work(system_wq, &krwp->rcu_work);
> >  
> >  	return true;
> >  }
> > @@ -3736,8 +3756,11 @@ static void __init kfree_rcu_batch_init(void)
> >  
> >  	for_each_possible_cpu(cpu) {
> >  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > +		int i = KFREE_N_BATCHES;
> >  
> >  		spin_lock_init(&krcp->lock);
> > +		while (i--)
> > +			krcp->krw_arr[i].krcp = krcp;
> 
> This was indeed a nice trick back in the PDP-11 days of 64-kilobyte
> address spaces, so thank you for the nostalgia!  However, a straight-up
> "for" loop is less vulnerable to code being added between the declaration
> of "i" and the "while" loop.

Ok, will do.

thanks,

 - Joel


Thread overview: 6+ messages
2019-08-27 19:01 [PATCH 2/5] rcu/tree: Add multiple in-flight batches of kfree_rcu work Joel Fernandes (Google)
2019-08-27 23:52 ` Boqun Feng
2019-08-28 14:02   ` Joel Fernandes
2019-08-28 14:09 ` [PATCH v2] " Joel Fernandes (Google)
2019-08-28 20:45   ` Paul E. McKenney
2019-08-29 21:26     ` Joel Fernandes