RCU Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
@ 2019-12-31 12:22 Uladzislau Rezki (Sony)
  2020-01-13 19:03 ` Paul E. McKenney
  2020-01-16  1:14 ` Joel Fernandes
  0 siblings, 2 replies; 18+ messages in thread
From: Uladzislau Rezki (Sony) @ 2019-12-31 12:22 UTC (permalink / raw)
  To: LKML
  Cc: Paul E . McKenney, Joel Fernandes, RCU, Uladzislau Rezki,
	Steven Rostedt, Oleksiy Avramchenko

kfree_rcu() logic can be improved further by using kfree_bulk()
interface along with "basic batching support" introduced earlier.

There are at least two advantages of using the "bulk" interface:
- in case of large number of kfree_rcu() requests kfree_bulk()
  reduces the per-object overhead caused by calling kfree()
  per-object.

- reduces the number of cache-misses due to "pointer chasing"
  between objects which can be far spread between each other.

This approach defines a new kfree_rcu_bulk_data structure that
stores pointers in an array of a fixed size. The number of entries
in that array depends on PAGE_SIZE, so that the kfree_rcu_bulk_data
structure is exactly one page.

Since it uses a "block-chain" technique, a dynamic allocation is
needed whenever a new block is required. Memory is allocated with
the GFP_NOWAIT | __GFP_NOWARN flags, i.e. the allocation skips
direct reclaim under low-memory conditions to prevent stalling,
and it fails silently under high memory pressure.

The "emergency path" is used when the system runs out of memory.
In that case objects are linked into the regular list, and that
is it.

In order to evaluate it, the "rcuperf" was run to analyze how
much memory is consumed and what is kfree_bulk() throughput.

Testing on the HiKey-960, arm64, 8xCPUs with below parameters:

CONFIG_SLAB=y
kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1

102898760401 ns, loops: 200000, batches: 5822, memory footprint: 158MB
89947009882  ns, loops: 200000, batches: 6715, memory footprint: 115MB

rcuperf shows approximately 12% better throughput (total time)
when the "bulk" interface is used. The "drain logic" (or its RCU
callback) does the work faster, which leads to better throughput.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/rcu/tree.c | 154 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 130 insertions(+), 24 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48fba2257748..4ee5c737558b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2754,22 +2754,45 @@ EXPORT_SYMBOL_GPL(call_rcu);
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
 #define KFREE_N_BATCHES 2
 
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kfree_rcu_bulk_data structure becomes exactly one page.
+ */
+#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 2)
+
+/**
+ * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * @nr_records: Number of active pointers in the array
+ * @records: Array of the kfree_rcu() pointers
+ * @next: Next bulk object in the block chain
+ */
+struct kfree_rcu_bulk_data {
+	unsigned long nr_records;
+	void *records[KFREE_BULK_MAX_ENTR];
+	struct kfree_rcu_bulk_data *next;
+};
+
 /**
  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
  * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
  * @krcp: Pointer to @kfree_rcu_cpu structure
  */
 
 struct kfree_rcu_cpu_work {
 	struct rcu_work rcu_work;
 	struct rcu_head *head_free;
+	struct kfree_rcu_bulk_data *bhead_free;
 	struct kfree_rcu_cpu *krcp;
 };
 
 /**
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
+ * @bcached: Keeps at most one object for later reuse when build chain blocks
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -2783,6 +2806,8 @@ struct kfree_rcu_cpu_work {
  */
 struct kfree_rcu_cpu {
 	struct rcu_head *head;
+	struct kfree_rcu_bulk_data *bhead;
+	struct kfree_rcu_bulk_data *bcached;
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	spinlock_t lock;
 	struct delayed_work monitor_work;
@@ -2800,6 +2825,7 @@ static void kfree_rcu_work(struct work_struct *work)
 {
 	unsigned long flags;
 	struct rcu_head *head, *next;
+	struct kfree_rcu_bulk_data *bhead, *bnext;
 	struct kfree_rcu_cpu *krcp;
 	struct kfree_rcu_cpu_work *krwp;
 
@@ -2809,22 +2835,39 @@ static void kfree_rcu_work(struct work_struct *work)
 	spin_lock_irqsave(&krcp->lock, flags);
 	head = krwp->head_free;
 	krwp->head_free = NULL;
+	bhead = krwp->bhead_free;
+	krwp->bhead_free = NULL;
 	spin_unlock_irqrestore(&krcp->lock, flags);
 
-	// List "head" is now private, so traverse locklessly.
+	/* List "bhead" is now private, so traverse locklessly. */
+	for (; bhead; bhead = bnext) {
+		bnext = bhead->next;
+
+		rcu_lock_acquire(&rcu_callback_map);
+		kfree_bulk(bhead->nr_records, bhead->records);
+		rcu_lock_release(&rcu_callback_map);
+
+		if (cmpxchg(&krcp->bcached, NULL, bhead))
+			free_page((unsigned long) bhead);
+
+		cond_resched_tasks_rcu_qs();
+	}
+
+	/*
+	 * Emergency case only. It can happen under low memory
+	 * condition when an allocation gets failed, so the "bulk"
+	 * path can not be temporary maintained.
+	 */
 	for (; head; head = next) {
 		unsigned long offset = (unsigned long)head->func;
 
 		next = head->next;
-		// Potentially optimize with kfree_bulk in future.
 		debug_rcu_head_unqueue(head);
 		rcu_lock_acquire(&rcu_callback_map);
 		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
 
-		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
-			/* Could be optimized with kfree_bulk() in future. */
+		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
 			kfree((void *)head - offset);
-		}
 
 		rcu_lock_release(&rcu_callback_map);
 		cond_resched_tasks_rcu_qs();
@@ -2839,26 +2882,45 @@ static void kfree_rcu_work(struct work_struct *work)
  */
 static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
+	struct kfree_rcu_cpu_work *krwp;
+	bool queued = false;
 	int i;
-	struct kfree_rcu_cpu_work *krwp = NULL;
 
 	lockdep_assert_held(&krcp->lock);
-	for (i = 0; i < KFREE_N_BATCHES; i++)
-		if (!krcp->krw_arr[i].head_free) {
-			krwp = &(krcp->krw_arr[i]);
-			break;
-		}
 
-	// If a previous RCU batch is in progress, we cannot immediately
-	// queue another one, so return false to tell caller to retry.
-	if (!krwp)
-		return false;
+	for (i = 0; i < KFREE_N_BATCHES; i++) {
+		krwp = &(krcp->krw_arr[i]);
 
-	krwp->head_free = krcp->head;
-	krcp->head = NULL;
-	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
-	queue_rcu_work(system_wq, &krwp->rcu_work);
-	return true;
+		/*
+		 * Try to detach bhead or head and attach it over any
+		 * available corresponding free channel. It can be that
+		 * a previous RCU batch is in progress, it means that
+		 * immediately to queue another one is not possible so
+		 * return false to tell caller to retry.
+		 */
+		if ((krcp->bhead && !krwp->bhead_free) ||
+				(krcp->head && !krwp->head_free)) {
+			if (!krwp->bhead_free) {
+				krwp->bhead_free = krcp->bhead;
+				krcp->bhead = NULL;
+			}
+
+			if (!krwp->head_free) {
+				krwp->head_free = krcp->head;
+				krcp->head = NULL;
+			}
+
+			/*
+			 * The work can already be queued. If so, it means that
+			 * within a short time, second, either head or bhead has
+			 * been detached as well.
+			 */
+			queue_rcu_work(system_wq, &krwp->rcu_work);
+			queued = true;
+		}
+	}
+
+	return queued;
 }
 
 static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
@@ -2895,6 +2957,39 @@ static void kfree_rcu_monitor(struct work_struct *work)
 		spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
+static inline bool
+kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
+{
+	struct kfree_rcu_bulk_data *bnode;
+
+	if (unlikely(!krcp->initialized))
+		return false;
+
+	lockdep_assert_held(&krcp->lock);
+
+	/* Check if a new block is required. */
+	if (!krcp->bhead ||
+			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
+		bnode = xchg(&krcp->bcached, NULL);
+		if (!bnode)
+			bnode = (struct kfree_rcu_bulk_data *)
+				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+		/* No cache or an allocation got failed. */
+		if (unlikely(!bnode))
+			return false;
+
+		/* Initialize the new block. */
+		bnode->nr_records = 0;
+		bnode->next = krcp->bhead;
+		krcp->bhead = bnode;
+	}
+
+	/* Finally insert. */
+	krcp->bhead->records[krcp->bhead->nr_records++] = ptr;
+	return true;
+}
+
 /*
  * Queue a request for lazy invocation of kfree() after a grace period.
  *
@@ -2926,9 +3021,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 			  __func__, head);
 		goto unlock_return;
 	}
-	head->func = func;
-	head->next = krcp->head;
-	krcp->head = head;
+
+	/*
+	 * Under high memory pressure GFP_NOWAIT can fail,
+	 * in that case the emergency path is maintained.
+	 */
+	if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp,
+			(void *) head - (unsigned long) func))) {
+		head->func = func;
+		head->next = krcp->head;
+		krcp->head = head;
+	}
 
 	// Set timer to drain after KFREE_DRAIN_JIFFIES.
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
@@ -3834,8 +3937,11 @@ static void __init kfree_rcu_batch_init(void)
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
 		spin_lock_init(&krcp->lock);
-		for (i = 0; i < KFREE_N_BATCHES; i++)
+		for (i = 0; i < KFREE_N_BATCHES; i++) {
+			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
+		}
+
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 		krcp->initialized = true;
 	}
-- 
2.20.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2019-12-31 12:22 [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu() Uladzislau Rezki (Sony)
@ 2020-01-13 19:03 ` Paul E. McKenney
  2020-01-14 16:49   ` Joel Fernandes
  2020-01-16  1:14 ` Joel Fernandes
  1 sibling, 1 reply; 18+ messages in thread
From: Paul E. McKenney @ 2020-01-13 19:03 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, Joel Fernandes, RCU, Steven Rostedt, Oleksiy Avramchenko

On Tue, Dec 31, 2019 at 01:22:41PM +0100, Uladzislau Rezki (Sony) wrote:
> kfree_rcu() logic can be improved further by using kfree_bulk()
> interface along with "basic batching support" introduced earlier.
> 
> The are at least two advantages of using "bulk" interface:
> - in case of large number of kfree_rcu() requests kfree_bulk()
>   reduces the per-object overhead caused by calling kfree()
>   per-object.
> 
> - reduces the number of cache-misses due to "pointer chasing"
>   between objects which can be far spread between each other.
> 
> This approach defines a new kfree_rcu_bulk_data structure that
> stores pointers in an array with a specific size. Number of entries
> in that array depends on PAGE_SIZE making kfree_rcu_bulk_data
> structure to be exactly one page.
> 
> Since it deals with "block-chain" technique there is an extra
> need in dynamic allocation when a new block is required. Memory
> is allocated with GFP_NOWAIT | __GFP_NOWARN flags, i.e. that
> allows to skip direct reclaim under low memory condition to
> prevent stalling and fails silently under high memory pressure.
> 
> The "emergency path" gets maintained when a system is run out
> of memory. In that case objects are linked into regular list
> and that is it.
> 
> In order to evaluate it, the "rcuperf" was run to analyze how
> much memory is consumed and what is kfree_bulk() throughput.
> 
> Testing on the HiKey-960, arm64, 8xCPUs with below parameters:
> 
> CONFIG_SLAB=y
> kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1
> 
> 102898760401 ns, loops: 200000, batches: 5822, memory footprint: 158MB
> 89947009882  ns, loops: 200000, batches: 6715, memory footprint: 115MB
> 
> rcuperf shows approximately ~12% better throughput(Total time)
> in case of using "bulk" interface. The "drain logic" or its RCU
> callback does the work faster that leads to better throughput.

Nice improvement!

But rcuperf uses a single block size, which turns into kfree_bulk() using
a single slab, which results in good locality of reference.  So I have to
ask...  Is this performance result representative of production workloads?

							Thanx, Paul

> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  kernel/rcu/tree.c | 154 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 130 insertions(+), 24 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 48fba2257748..4ee5c737558b 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2754,22 +2754,45 @@ EXPORT_SYMBOL_GPL(call_rcu);
>  #define KFREE_DRAIN_JIFFIES (HZ / 50)
>  #define KFREE_N_BATCHES 2
>  
> +/*
> + * This macro defines how many entries the "records" array
> + * will contain. It is based on the fact that the size of
> + * kfree_rcu_bulk_data structure becomes exactly one page.
> + */
> +#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 2)
> +
> +/**
> + * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
> + * @nr_records: Number of active pointers in the array
> + * @records: Array of the kfree_rcu() pointers
> + * @next: Next bulk object in the block chain
> + */
> +struct kfree_rcu_bulk_data {
> +	unsigned long nr_records;
> +	void *records[KFREE_BULK_MAX_ENTR];
> +	struct kfree_rcu_bulk_data *next;
> +};
> +
>  /**
>   * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
>   * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
>   * @head_free: List of kfree_rcu() objects waiting for a grace period
> + * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
>   * @krcp: Pointer to @kfree_rcu_cpu structure
>   */
>  
>  struct kfree_rcu_cpu_work {
>  	struct rcu_work rcu_work;
>  	struct rcu_head *head_free;
> +	struct kfree_rcu_bulk_data *bhead_free;
>  	struct kfree_rcu_cpu *krcp;
>  };
>  
>  /**
>   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
>   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> + * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
> + * @bcached: Keeps at most one object for later reuse when build chain blocks
>   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
>   * @lock: Synchronize access to this structure
>   * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
> @@ -2783,6 +2806,8 @@ struct kfree_rcu_cpu_work {
>   */
>  struct kfree_rcu_cpu {
>  	struct rcu_head *head;
> +	struct kfree_rcu_bulk_data *bhead;
> +	struct kfree_rcu_bulk_data *bcached;
>  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
>  	spinlock_t lock;
>  	struct delayed_work monitor_work;
> @@ -2800,6 +2825,7 @@ static void kfree_rcu_work(struct work_struct *work)
>  {
>  	unsigned long flags;
>  	struct rcu_head *head, *next;
> +	struct kfree_rcu_bulk_data *bhead, *bnext;
>  	struct kfree_rcu_cpu *krcp;
>  	struct kfree_rcu_cpu_work *krwp;
>  
> @@ -2809,22 +2835,39 @@ static void kfree_rcu_work(struct work_struct *work)
>  	spin_lock_irqsave(&krcp->lock, flags);
>  	head = krwp->head_free;
>  	krwp->head_free = NULL;
> +	bhead = krwp->bhead_free;
> +	krwp->bhead_free = NULL;
>  	spin_unlock_irqrestore(&krcp->lock, flags);
>  
> -	// List "head" is now private, so traverse locklessly.
> +	/* List "bhead" is now private, so traverse locklessly. */
> +	for (; bhead; bhead = bnext) {
> +		bnext = bhead->next;
> +
> +		rcu_lock_acquire(&rcu_callback_map);
> +		kfree_bulk(bhead->nr_records, bhead->records);
> +		rcu_lock_release(&rcu_callback_map);
> +
> +		if (cmpxchg(&krcp->bcached, NULL, bhead))
> +			free_page((unsigned long) bhead);
> +
> +		cond_resched_tasks_rcu_qs();
> +	}
> +
> +	/*
> +	 * Emergency case only. It can happen under low memory
> +	 * condition when an allocation gets failed, so the "bulk"
> +	 * path can not be temporary maintained.
> +	 */
>  	for (; head; head = next) {
>  		unsigned long offset = (unsigned long)head->func;
>  
>  		next = head->next;
> -		// Potentially optimize with kfree_bulk in future.
>  		debug_rcu_head_unqueue(head);
>  		rcu_lock_acquire(&rcu_callback_map);
>  		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
>  
> -		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
> -			/* Could be optimized with kfree_bulk() in future. */
> +		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
>  			kfree((void *)head - offset);
> -		}
>  
>  		rcu_lock_release(&rcu_callback_map);
>  		cond_resched_tasks_rcu_qs();
> @@ -2839,26 +2882,45 @@ static void kfree_rcu_work(struct work_struct *work)
>   */
>  static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
>  {
> +	struct kfree_rcu_cpu_work *krwp;
> +	bool queued = false;
>  	int i;
> -	struct kfree_rcu_cpu_work *krwp = NULL;
>  
>  	lockdep_assert_held(&krcp->lock);
> -	for (i = 0; i < KFREE_N_BATCHES; i++)
> -		if (!krcp->krw_arr[i].head_free) {
> -			krwp = &(krcp->krw_arr[i]);
> -			break;
> -		}
>  
> -	// If a previous RCU batch is in progress, we cannot immediately
> -	// queue another one, so return false to tell caller to retry.
> -	if (!krwp)
> -		return false;
> +	for (i = 0; i < KFREE_N_BATCHES; i++) {
> +		krwp = &(krcp->krw_arr[i]);
>  
> -	krwp->head_free = krcp->head;
> -	krcp->head = NULL;
> -	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
> -	queue_rcu_work(system_wq, &krwp->rcu_work);
> -	return true;
> +		/*
> +		 * Try to detach bhead or head and attach it over any
> +		 * available corresponding free channel. It can be that
> +		 * a previous RCU batch is in progress, it means that
> +		 * immediately to queue another one is not possible so
> +		 * return false to tell caller to retry.
> +		 */
> +		if ((krcp->bhead && !krwp->bhead_free) ||
> +				(krcp->head && !krwp->head_free)) {
> +			if (!krwp->bhead_free) {
> +				krwp->bhead_free = krcp->bhead;
> +				krcp->bhead = NULL;
> +			}
> +
> +			if (!krwp->head_free) {
> +				krwp->head_free = krcp->head;
> +				krcp->head = NULL;
> +			}
> +
> +			/*
> +			 * The work can already be queued. If so, it means that
> +			 * within a short time, second, either head or bhead has
> +			 * been detached as well.
> +			 */
> +			queue_rcu_work(system_wq, &krwp->rcu_work);
> +			queued = true;
> +		}
> +	}
> +
> +	return queued;
>  }
>  
>  static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
> @@ -2895,6 +2957,39 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  		spin_unlock_irqrestore(&krcp->lock, flags);
>  }
>  
> +static inline bool
> +kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
> +{
> +	struct kfree_rcu_bulk_data *bnode;
> +
> +	if (unlikely(!krcp->initialized))
> +		return false;
> +
> +	lockdep_assert_held(&krcp->lock);
> +
> +	/* Check if a new block is required. */
> +	if (!krcp->bhead ||
> +			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
> +		bnode = xchg(&krcp->bcached, NULL);
> +		if (!bnode)
> +			bnode = (struct kfree_rcu_bulk_data *)
> +				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> +
> +		/* No cache or an allocation got failed. */
> +		if (unlikely(!bnode))
> +			return false;
> +
> +		/* Initialize the new block. */
> +		bnode->nr_records = 0;
> +		bnode->next = krcp->bhead;
> +		krcp->bhead = bnode;
> +	}
> +
> +	/* Finally insert. */
> +	krcp->bhead->records[krcp->bhead->nr_records++] = ptr;
> +	return true;
> +}
> +
>  /*
>   * Queue a request for lazy invocation of kfree() after a grace period.
>   *
> @@ -2926,9 +3021,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  			  __func__, head);
>  		goto unlock_return;
>  	}
> -	head->func = func;
> -	head->next = krcp->head;
> -	krcp->head = head;
> +
> +	/*
> +	 * Under high memory pressure GFP_NOWAIT can fail,
> +	 * in that case the emergency path is maintained.
> +	 */
> +	if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp,
> +			(void *) head - (unsigned long) func))) {
> +		head->func = func;
> +		head->next = krcp->head;
> +		krcp->head = head;
> +	}
>  
>  	// Set timer to drain after KFREE_DRAIN_JIFFIES.
>  	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> @@ -3834,8 +3937,11 @@ static void __init kfree_rcu_batch_init(void)
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>  
>  		spin_lock_init(&krcp->lock);
> -		for (i = 0; i < KFREE_N_BATCHES; i++)
> +		for (i = 0; i < KFREE_N_BATCHES; i++) {
> +			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>  			krcp->krw_arr[i].krcp = krcp;
> +		}
> +
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
>  		krcp->initialized = true;
>  	}
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-13 19:03 ` Paul E. McKenney
@ 2020-01-14 16:49   ` Joel Fernandes
  2020-01-15 13:14     ` Uladzislau Rezki
  0 siblings, 1 reply; 18+ messages in thread
From: Joel Fernandes @ 2020-01-14 16:49 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki (Sony), LKML, RCU, Steven Rostedt, Oleksiy Avramchenko

Hi Paul,

On Mon, Jan 13, 2020 at 11:03:15AM -0800, Paul E. McKenney wrote:
> On Tue, Dec 31, 2019 at 01:22:41PM +0100, Uladzislau Rezki (Sony) wrote:
> > kfree_rcu() logic can be improved further by using kfree_bulk()
> > interface along with "basic batching support" introduced earlier.
> > 
> > The are at least two advantages of using "bulk" interface:
> > - in case of large number of kfree_rcu() requests kfree_bulk()
> >   reduces the per-object overhead caused by calling kfree()
> >   per-object.
> > 
> > - reduces the number of cache-misses due to "pointer chasing"
> >   between objects which can be far spread between each other.
> > 
> > This approach defines a new kfree_rcu_bulk_data structure that
> > stores pointers in an array with a specific size. Number of entries
> > in that array depends on PAGE_SIZE making kfree_rcu_bulk_data
> > structure to be exactly one page.
> > 
> > Since it deals with "block-chain" technique there is an extra
> > need in dynamic allocation when a new block is required. Memory
> > is allocated with GFP_NOWAIT | __GFP_NOWARN flags, i.e. that
> > allows to skip direct reclaim under low memory condition to
> > prevent stalling and fails silently under high memory pressure.
> > 
> > The "emergency path" gets maintained when a system is run out
> > of memory. In that case objects are linked into regular list
> > and that is it.
> > 
> > In order to evaluate it, the "rcuperf" was run to analyze how
> > much memory is consumed and what is kfree_bulk() throughput.
> > 
> > Testing on the HiKey-960, arm64, 8xCPUs with below parameters:
> > 
> > CONFIG_SLAB=y
> > kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1
> > 
> > 102898760401 ns, loops: 200000, batches: 5822, memory footprint: 158MB
> > 89947009882  ns, loops: 200000, batches: 6715, memory footprint: 115MB
> > 
> > rcuperf shows approximately ~12% better throughput(Total time)
> > in case of using "bulk" interface. The "drain logic" or its RCU
> > callback does the work faster that leads to better throughput.
> 
> Nice improvement!
> 
> But rcuperf uses a single block size, which turns into kfree_bulk() using
> a single slab, which results in good locality of reference.  So I have to

You meant a "single cache" category when you say "single slab"? Just to
mention, the number of slabs (in a single cache) when a large number of
objects are allocated is more than 1 (not single). With current rcuperf, I
see 100s of slabs (each slab being one page) in the kmalloc-32 cache. Each
slab contains around 128 objects of type kfree_rcu (24 byte object aligned to
32-byte slab object).

> ask...  Is this performance result representative of production workloads?

I added more variation to allocation sizes to rcuperf (patch below) to distribute
allocations across 4 kmalloc slabs (32, 64, 96 and 128) and I see a significant
improvement with Ulad's patch in SLAB in terms of completion time of the
test. Below are the results. With SLUB I see a slightly higher memory
footprint. I have never used SLUB and am not sure who is using it, so I am not
too concerned, since the degradation in memory footprint is only slight, with
SLAB having the significant improvement.

with SLAB:

with Ulad's patch:
[   19.096052] Total time taken by all kfree'ers: 17519684419 ns, loops: 10000, batches: 3378, memory footprint: 319MB
[   18.980837] Total time taken by all kfree'ers: 17460918969 ns, loops: 10000, batches: 3399, memory footprint: 312MB
[   18.671535] Total time taken by all kfree'ers: 17116640301 ns, loops: 10000, batches: 3331, memory footprint: 268MB
[   18.737601] Total time taken by all kfree'ers: 17227635828 ns, loops: 10000, batches: 3311, memory footprint: 329MB

without Ulad's patch:
[   22.679112] Total time taken by all kfree'ers: 21174999896 ns, loops: 10000, batches: 2722, memory footprint: 314MB
[   22.099168] Total time taken by all kfree'ers: 20528110989 ns, loops: 10000, batches: 2611, memory footprint: 240MB
[   22.477571] Total time taken by all kfree'ers: 20975674614 ns, loops: 10000, batches: 2763, memory footprint: 341MB
[   22.772915] Total time taken by all kfree'ers: 21207270347 ns, loops: 10000, batches: 2765, memory footprint: 329MB

with SLUB:

without Ulad's patch:
[   10.714471] Total time taken by all kfree'ers: 9216968353 ns, loops: 10000, batches: 1099, memory footprint: 393MB
[   11.188174] Total time taken by all kfree'ers: 9613032449 ns, loops: 10000, batches: 1147, memory footprint: 387MB
[   11.077431] Total time taken by all kfree'ers: 9547675890 ns, loops: 10000, batches: 1292, memory footprint: 296MB
[   11.212767] Total time taken by all kfree'ers: 9712869591 ns, loops: 10000, batches: 1155, memory footprint: 387MB


with Ulad's patch
[   11.241949] Total time taken by all kfree'ers: 9681912225 ns, loops: 10000, batches: 1087, memory footprint: 417MB
[   11.651831] Total time taken by all kfree'ers: 10154268745 ns, loops: 10000, batches: 1184, memory footprint: 416MB
[   11.342659] Total time taken by all kfree'ers: 9844937317 ns, loops: 10000, batches: 1137, memory footprint: 477MB
[   11.718769] Total time taken by all kfree'ers: 10138649532 ns, loops: 10000, batches: 1159, memory footprint: 395MB

Test patch for rcuperf is below. The memory footprint measurement for rcuperf
is still under discussion in another thread, but I tested based on that anyway:

---8<-----------------------

From d44e4c6112c388d39f7c2241e061dd77cca28d9e Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Tue, 14 Jan 2020 09:59:23 -0500
Subject: [PATCH] rcuperf: Add support to vary the slab object sizes

Signed-off-by: Joel Fernandes <joelaf@google.com>
---
 kernel/rcu/rcuperf.c | 43 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index a4a8d097d84d..216d7c072ca2 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -600,17 +600,29 @@ static int kfree_nrealthreads;
 static atomic_t n_kfree_perf_thread_started;
 static atomic_t n_kfree_perf_thread_ended;
 
-struct kfree_obj {
-	char kfree_obj[8];
-	struct rcu_head rh;
-};
+/*
+ * Define a kfree_obj with size as the @size parameter + the size of rcu_head
+ * (rcu_head is 16 bytes on 64-bit arch).
+ */
+#define DEFINE_KFREE_OBJ(size)	\
+struct kfree_obj_ ## size {	\
+	char kfree_obj[size];	\
+	struct rcu_head rh;	\
+}
+
+/* This should goto the right sized slabs on both 32-bit and 64-bit arch */
+DEFINE_KFREE_OBJ(16); // goes on kmalloc-32 slab
+DEFINE_KFREE_OBJ(32); // goes on kmalloc-64 slab
+DEFINE_KFREE_OBJ(64); // goes on kmalloc-96 slab
+DEFINE_KFREE_OBJ(96); // goes on kmalloc-128 slab
 
 static int
 kfree_perf_thread(void *arg)
 {
 	int i, loop = 0;
 	long me = (long)arg;
-	struct kfree_obj *alloc_ptr;
+	void *alloc_ptr;
+
 	u64 start_time, end_time;
 	long long mem_begin, mem_during = 0;
 
@@ -635,11 +647,28 @@ kfree_perf_thread(void *arg)
 		}
 
 		for (i = 0; i < kfree_alloc_num; i++) {
-			alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
+			int kfree_type = i % 4;
+
+			if (kfree_type == 0)
+				alloc_ptr = kmalloc(sizeof(struct kfree_obj_16), GFP_KERNEL);
+			else if (kfree_type == 1)
+				alloc_ptr = kmalloc(sizeof(struct kfree_obj_32), GFP_KERNEL);
+			else if (kfree_type == 2)
+				alloc_ptr = kmalloc(sizeof(struct kfree_obj_64), GFP_KERNEL);
+			else
+				alloc_ptr = kmalloc(sizeof(struct kfree_obj_96),  GFP_KERNEL);
+
 			if (!alloc_ptr)
 				return -ENOMEM;
 
-			kfree_rcu(alloc_ptr, rh);
+			if (kfree_type == 0)
+				kfree_rcu((struct kfree_obj_16 *)alloc_ptr, rh);
+			else if (kfree_type == 1)
+				kfree_rcu((struct kfree_obj_32 *)alloc_ptr, rh);
+			else if (kfree_type == 2)
+				kfree_rcu((struct kfree_obj_64 *)alloc_ptr, rh);
+			else
+				kfree_rcu((struct kfree_obj_96 *)alloc_ptr, rh);
 		}
 
 		cond_resched();
-- 
2.25.0.rc1.283.g88dfdc4193-goog


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-14 16:49   ` Joel Fernandes
@ 2020-01-15 13:14     ` Uladzislau Rezki
  2020-01-15 22:53       ` Joel Fernandes
  0 siblings, 1 reply; 18+ messages in thread
From: Uladzislau Rezki @ 2020-01-15 13:14 UTC (permalink / raw)
  To: Joel Fernandes, Paul E. McKenney
  Cc: Paul E. McKenney, Uladzislau Rezki (Sony),
	LKML, RCU, Steven Rostedt, Oleksiy Avramchenko

Hello, Joel, Paul.

Thank you for comments and testing!

> > 
> > Nice improvement!
> > 
> > But rcuperf uses a single block size, which turns into kfree_bulk() using
> > a single slab, which results in good locality of reference.  So I have to
> 
> You meant a "single cache" category when you say "single slab"? Just to
> mention, the number of slabs (in a single cache) when a large number of
> objects are allocated is more than 1 (not single). With current rcuperf, I
> see 100s of slabs (each slab being one page) in the kmalloc-32 cache. Each
> slab contains around 128 objects of type kfree_rcu (24 byte object aligned to
> 32-byte slab object).
> 
I think that is about using different slab caches to break locality. It
makes sense, IMHO, because usually the system makes use of different slabs
due to different object sizes. On the other hand, I guess there are
test cases where only one slab gets used.

> > ask...  Is this performance result representative of production workloads?
> 
> I added more variation to allocation sizes to rcuperf (patch below) to distribute
> allocations across 4 kmalloc slabs (32,64,96 and 128) and I see a signficant
> improvement with Ulad's patch in SLAB in terms of completion time of the
> test. Below are the results. With SLUB I see slightly higher memory
> footprint, I have never used SLUB and not sure who is using it so I am not
> too concerned since the degradation in memory footprint is only slight with
> SLAB having the signifcant improvement.
> 
Nice patch! I think it would be useful to have it in the "rcuperf" tool with
an extra parameter like "different_obj_sizes".

> with SLAB:
> 
> with Ulad's patch:
> [   19.096052] Total time taken by all kfree'ers: 17519684419 ns, loops: 10000, batches: 3378, memory footprint: 319MB
> [   18.980837] Total time taken by all kfree'ers: 17460918969 ns, loops: 10000, batches: 3399, memory footprint: 312MB
> [   18.671535] Total time taken by all kfree'ers: 17116640301 ns, loops: 10000, batches: 3331, memory footprint: 268MB
> [   18.737601] Total time taken by all kfree'ers: 17227635828 ns, loops: 10000, batches: 3311, memory footprint: 329MB
> 
> without Ulad's patch:
> [   22.679112] Total time taken by all kfree'ers: 21174999896 ns, loops: 10000, batches: 2722, memory footprint: 314MB
> [   22.099168] Total time taken by all kfree'ers: 20528110989 ns, loops: 10000, batches: 2611, memory footprint: 240MB
> [   22.477571] Total time taken by all kfree'ers: 20975674614 ns, loops: 10000, batches: 2763, memory footprint: 341MB
> [   22.772915] Total time taken by all kfree'ers: 21207270347 ns, loops: 10000, batches: 2765, memory footprint: 329MB
> 
> with SLUB:
> 
> without Ulad's patch:
> [   10.714471] Total time taken by all kfree'ers: 9216968353 ns, loops: 10000, batches: 1099, memory footprint: 393MB
> [   11.188174] Total time taken by all kfree'ers: 9613032449 ns, loops: 10000, batches: 1147, memory footprint: 387MB
> [   11.077431] Total time taken by all kfree'ers: 9547675890 ns, loops: 10000, batches: 1292, memory footprint: 296MB
> [   11.212767] Total time taken by all kfree'ers: 9712869591 ns, loops: 10000, batches: 1155, memory footprint: 387MB
> 
> 
> with Ulad's patch
> [   11.241949] Total time taken by all kfree'ers: 9681912225 ns, loops: 10000, batches: 1087, memory footprint: 417MB
> [   11.651831] Total time taken by all kfree'ers: 10154268745 ns, loops: 10000, batches: 1184, memory footprint: 416MB
> [   11.342659] Total time taken by all kfree'ers: 9844937317 ns, loops: 10000, batches: 1137, memory footprint: 477MB
> [   11.718769] Total time taken by all kfree'ers: 10138649532 ns, loops: 10000, batches: 1159, memory footprint: 395MB
> 
> Test patch for rcuperf is below. The memory footprint measurement for rcuperf
> is still under discussion in another thread, but I tested based on that anyway:
> 
> ---8<-----------------------
> 
> From d44e4c6112c388d39f7c2241e061dd77cca28d9e Mon Sep 17 00:00:00 2001
> From: Joel Fernandes <joelaf@google.com>
> Date: Tue, 14 Jan 2020 09:59:23 -0500
> Subject: [PATCH] rcuperf: Add support to vary the slab object sizes
> 
> Signed-off-by: Joel Fernandes <joelaf@google.com>
> ---
>  kernel/rcu/rcuperf.c | 43 ++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 36 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
> index a4a8d097d84d..216d7c072ca2 100644
> --- a/kernel/rcu/rcuperf.c
> +++ b/kernel/rcu/rcuperf.c
> @@ -600,17 +600,29 @@ static int kfree_nrealthreads;
>  static atomic_t n_kfree_perf_thread_started;
>  static atomic_t n_kfree_perf_thread_ended;
>  
> -struct kfree_obj {
> -	char kfree_obj[8];
> -	struct rcu_head rh;
> -};
> +/*
> + * Define a kfree_obj with size as the @size parameter + the size of rcu_head
> + * (rcu_head is 16 bytes on 64-bit arch).
> + */
> +#define DEFINE_KFREE_OBJ(size)	\
> +struct kfree_obj_ ## size {	\
> +	char kfree_obj[size];	\
> +	struct rcu_head rh;	\
> +}
> +
> +/* This should goto the right sized slabs on both 32-bit and 64-bit arch */
> +DEFINE_KFREE_OBJ(16); // goes on kmalloc-32 slab
> +DEFINE_KFREE_OBJ(32); // goes on kmalloc-64 slab
> +DEFINE_KFREE_OBJ(64); // goes on kmalloc-96 slab
> +DEFINE_KFREE_OBJ(96); // goes on kmalloc-128 slab
>  
>  static int
>  kfree_perf_thread(void *arg)
>  {
>  	int i, loop = 0;
>  	long me = (long)arg;
> -	struct kfree_obj *alloc_ptr;
> +	void *alloc_ptr;
> +
>  	u64 start_time, end_time;
>  	long long mem_begin, mem_during = 0;
>  
> @@ -635,11 +647,28 @@ kfree_perf_thread(void *arg)
>  		}
>  
>  		for (i = 0; i < kfree_alloc_num; i++) {
> -			alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
> +			int kfree_type = i % 4;
> +
> +			if (kfree_type == 0)
> +				alloc_ptr = kmalloc(sizeof(struct kfree_obj_16), GFP_KERNEL);
> +			else if (kfree_type == 1)
> +				alloc_ptr = kmalloc(sizeof(struct kfree_obj_32), GFP_KERNEL);
> +			else if (kfree_type == 2)
> +				alloc_ptr = kmalloc(sizeof(struct kfree_obj_64), GFP_KERNEL);
> +			else
> +				alloc_ptr = kmalloc(sizeof(struct kfree_obj_96),  GFP_KERNEL);
> +
>  			if (!alloc_ptr)
>  				return -ENOMEM;
>  
> -			kfree_rcu(alloc_ptr, rh);
> +			if (kfree_type == 0)
> +				kfree_rcu((struct kfree_obj_16 *)alloc_ptr, rh);
> +			else if (kfree_type == 1)
> +				kfree_rcu((struct kfree_obj_32 *)alloc_ptr, rh);
> +			else if (kfree_type == 2)
> +				kfree_rcu((struct kfree_obj_64 *)alloc_ptr, rh);
> +			else
> +				kfree_rcu((struct kfree_obj_96 *)alloc_ptr, rh);
>  		}
>  
>  		cond_resched();
> -- 
> 2.25.0.rc1.283.g88dfdc4193-goog
I also have done some tests with your patch on my Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz, 12xCPUs
machine to simulate different slab usage:

dev.2020.01.10a branch

# Default, CONFIG_SLAB, kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1, 16, 32, 64, 96 obj sizes
[   83.762963] Total time taken by all kfree'ers: 53607352517 ns, loops: 200000, batches: 1885, memory footprint: 1248MB
[   80.108401] Total time taken by all kfree'ers: 53529637912 ns, loops: 200000, batches: 1921, memory footprint: 1193MB
[   76.622252] Total time taken by all kfree'ers: 53570175705 ns, loops: 200000, batches: 1929, memory footprint: 1250MB

# With the patch, CONFIG_SLAB, kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1, 16, 32, 64, 96 obj sizes
[   48.265008] Total time taken by all kfree'ers: 23981587315 ns, loops: 200000, batches: 810, memory footprint: 1219MB
[   53.263943] Total time taken by all kfree'ers: 23879375281 ns, loops: 200000, batches: 822, memory footprint: 1190MB
[   50.366440] Total time taken by all kfree'ers: 24086841707 ns, loops: 200000, batches: 794, memory footprint: 1380MB

# Default, CONFIG_SLUB, kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1, 16, 32, 64, 96 obj sizes
[   81.818576] Total time taken by all kfree'ers: 51291025022 ns, loops: 200000, batches: 1713, memory footprint: 741MB
[   77.854866] Total time taken by all kfree'ers: 51278911477 ns, loops: 200000, batches: 1671, memory footprint: 719MB
[   76.329577] Total time taken by all kfree'ers: 51256183045 ns, loops: 200000, batches: 1719, memory footprint: 647MB

# With the patch, CONFIG_SLUB, kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1, 16, 32, 64, 96 obj sizes
[   76.254485] Total time taken by all kfree'ers: 50709919132 ns, loops: 200000, batches: 1618, memory footprint: 456MB
[   75.891521] Total time taken by all kfree'ers: 50736297452 ns, loops: 200000, batches: 1633, memory footprint: 507MB
[   76.172573] Total time taken by all kfree'ers: 50660403893 ns, loops: 200000, batches: 1628, memory footprint: 429MB

In the case of CONFIG_SLAB, performance roughly doubles, at the cost of slightly higher memory usage.
As for CONFIG_SLUB, I still see higher performance figures plus lower memory usage with the patch.

Apart from that, I have received a report from the "kernel test robot":

<snip>
[   13.957168] ------------[ cut here ]------------
[   13.958256] ODEBUG: free active (active state 1) object type: rcu_head hint: 0x0
[   13.962148] WARNING: CPU: 0 PID: 212 at lib/debugobjects.c:484 debug_print_object+0x95/0xd0
[   13.964298] Modules linked in:
[   13.964960] CPU: 0 PID: 212 Comm: kworker/0:2 Not tainted 5.5.0-rc1-00136-g883a2cefc0684 #1
[   13.966712] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[   13.968528] Workqueue: events kfree_rcu_work
[   13.969466] RIP: 0010:debug_print_object+0x95/0xd0
[   13.970480] Code: d2 e8 2f 06 d6 ff 8b 43 10 4d 89 f1 4c 89 e6 8b 4b 14 48 c7 c7 88 73 be 82 4d 8b 45 00 48 8b 14 c5 a0 5f 6d 82 e8 7b 65 c6 ff <0f> 0b b9 01 00 00 00 31 d2 be 01 00 00 00 48 c7 c7 98 b8 0c 83 e8
[   13.974435] RSP: 0000:ffff888231677bf8 EFLAGS: 00010282
[   13.975531] RAX: 0000000000000000 RBX: ffff88822d4200e0 RCX: 0000000000000000
[   13.976730] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff8306e028
[   13.977568] RBP: ffff888231677c18 R08: 0000000000000000 R09: ffff888231670790
[   13.978412] R10: ffff888231670000 R11: 0000000000000003 R12: ffffffff82bc5299
[   13.979250] R13: ffffffff82e77360 R14: 0000000000000000 R15: dead000000000100
[   13.980089] FS:  0000000000000000(0000) GS:ffffffff82e4f000(0000) knlGS:0000000000000000
[   13.981069] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   13.981746] CR2: 00007f1e913fc77c CR3: 0000000225ce9000 CR4: 00000000000006f0
[   13.982587] Call Trace:
[   13.982911]  __debug_check_no_obj_freed+0x19a/0x200
[   13.983494]  debug_check_no_obj_freed+0x14/0x20
[   13.984036]  free_pcp_prepare+0xee/0x1d0
[   13.984541]  free_unref_page+0x1b/0x80
[   13.984994]  __free_pages+0x19/0x20
[   13.985503]  __free_pages+0x13/0x20
[   13.985924]  slob_free_pages+0x7d/0x90
[   13.986373]  slob_free+0x34f/0x530
[   13.986784]  kfree+0x154/0x210
[   13.987155]  __kmem_cache_free_bulk+0x44/0x60
[   13.987673]  kmem_cache_free_bulk+0xe/0x10
[   13.988163]  kfree_rcu_work+0x95/0x310
[   13.989010]  ? kfree_rcu_work+0x64/0x310
[   13.989884]  process_one_work+0x378/0x7c0
[   13.990770]  worker_thread+0x40/0x600
[   13.991587]  kthread+0x14e/0x170
[   13.992344]  ? process_one_work+0x7c0/0x7c0
[   13.993256]  ? kthread_create_on_node+0x70/0x70
[   13.994246]  ret_from_fork+0x3a/0x50
[   13.995039] ---[ end trace cdf242638b0e32a0 ]---
[child0:632] trace_fd was -1
<snip>

The trace happens when the kernel is built with CONFIG_DEBUG_OBJECTS_FREE
and CONFIG_DEBUG_OBJECTS_RCU_HEAD. Basically, it is not a problem with the patch
itself, nor is there any real bug there. It is just that debug_rcu_head_queue(head)
is not paired in the kfree_rcu_work() function, which is why the kernel thinks
an active object is being freed when it is not active in reality.

I will upload a V2 to fix that.

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-15 13:14     ` Uladzislau Rezki
@ 2020-01-15 22:53       ` Joel Fernandes
  2020-01-17 17:52         ` Uladzislau Rezki
  0 siblings, 1 reply; 18+ messages in thread
From: Joel Fernandes @ 2020-01-15 22:53 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Paul E. McKenney, LKML, RCU, Steven Rostedt, Oleksiy Avramchenko

On Wed, Jan 15, 2020 at 02:14:46PM +0100, Uladzislau Rezki wrote:
> Hello, Joel, Paul.
> 
> Thank you for comments and testing!
> 
> > > 
> > > Nice improvement!
> > > 
> > > But rcuperf uses a single block size, which turns into kfree_bulk() using
> > > a single slab, which results in good locality of reference.  So I have to
> > 
> > You meant a "single cache" category when you say "single slab"? Just to
> > mention, the number of slabs (in a single cache) when a large number of
> > objects are allocated is more than 1 (not single). With current rcuperf, I
> > see 100s of slabs (each slab being one page) in the kmalloc-32 cache. Each
> > slab contains around 128 objects of type kfree_rcu (24 byte object aligned to
> > 32-byte slab object).
> > 
> I think that is about using different slab caches to break locality. It
> makes sense, IMHO, because usually the system make use of different slabs,
> because of different object sizes. From the other hand i guess there are
> test cases when only one slab gets used.

I was wondering about "locality". A cache can be split into many slabs. Only
the data on a page is local (contiguous). If there are a large number of
objects, then it goes to a new slab (on the same cache). At least on the
kmalloc slabs, there is only 1 slab per page. So for example, if on
kmalloc-32 slab, there are more than 128 objects, then it goes to a different
slab / page. So how is there still locality?

Furthermore, SLAB (not sure about SLUB) doesn't seem to do anything at the
moment to take advantage of locality within a slab.

That said, I am fully supportive of your patch and see the same
improvements as well which are for the reasons you mentioned in the changelog.

> > > ask...  Is this performance result representative of production workloads?
> > 
> > I added more variation to allocation sizes to rcuperf (patch below) to distribute
> > allocations across 4 kmalloc slabs (32,64,96 and 128) and I see a signficant
> > improvement with Ulad's patch in SLAB in terms of completion time of the
> > test. Below are the results. With SLUB I see slightly higher memory
> > footprint, I have never used SLUB and not sure who is using it so I am not
> > too concerned since the degradation in memory footprint is only slight with
> > SLAB having the signifcant improvement.
> > 
> Nice patch! I think, it would be useful to have it in "rcuperf" tool with
> extra parameter like "different_obj_sizes".

cool, I posted something like this.

> > 2.25.0.rc1.283.g88dfdc4193-goog
> I also have done some tests with your patch on my Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz, 12xCPUs
> machine to simulate different slab usage:
> 
> dev.2020.01.10a branch
> 
> # Default, CONFIG_SLAB, kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1, 16, 32, 64, 96 obj sizes
> [   83.762963] Total time taken by all kfree'ers: 53607352517 ns, loops: 200000, batches: 1885, memory footprint: 1248MB
> [   80.108401] Total time taken by all kfree'ers: 53529637912 ns, loops: 200000, batches: 1921, memory footprint: 1193MB
> [   76.622252] Total time taken by all kfree'ers: 53570175705 ns, loops: 200000, batches: 1929, memory footprint: 1250MB
> 
> # With the patch, CONFIG_SLAB, kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1, 16, 32, 64, 96 obj sizes
> [   48.265008] Total time taken by all kfree'ers: 23981587315 ns, loops: 200000, batches: 810, memory footprint: 1219MB
> [   53.263943] Total time taken by all kfree'ers: 23879375281 ns, loops: 200000, batches: 822, memory footprint: 1190MB
> [   50.366440] Total time taken by all kfree'ers: 24086841707 ns, loops: 200000, batches: 794, memory footprint: 1380MB
> 
> # Default, CONFIG_SLUB, kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1, 16, 32, 64, 96 obj sizes
> [   81.818576] Total time taken by all kfree'ers: 51291025022 ns, loops: 200000, batches: 1713, memory footprint: 741MB
> [   77.854866] Total time taken by all kfree'ers: 51278911477 ns, loops: 200000, batches: 1671, memory footprint: 719MB
> [   76.329577] Total time taken by all kfree'ers: 51256183045 ns, loops: 200000, batches: 1719, memory footprint: 647MB
> 
> # With the patch, CONFIG_SLUB, kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1, 16, 32, 64, 96 obj sizes
> [   76.254485] Total time taken by all kfree'ers: 50709919132 ns, loops: 200000, batches: 1618, memory footprint: 456MB
> [   75.891521] Total time taken by all kfree'ers: 50736297452 ns, loops: 200000, batches: 1633, memory footprint: 507MB
> [   76.172573] Total time taken by all kfree'ers: 50660403893 ns, loops: 200000, batches: 1628, memory footprint: 429MB
> 
> in case of CONFIG_SLAB there is double increase in performance but slightly higher memory usage.
> As for CONFIG_SLUB, i still see higher performance figures + lower memory usage with the patch.

Ok, testing today, our results are quite similar.

> 
> Apart of that, I have got the report from the "kernel test robot":
> <snip>
> [   13.957168] ------------[ cut here ]------------
> [   13.958256] ODEBUG: free active (active state 1) object type: rcu_head hint: 0x0
> [   13.962148] WARNING: CPU: 0 PID: 212 at lib/debugobjects.c:484 debug_print_object+0x95/0xd0
> [   13.964298] Modules linked in:
> [   13.964960] CPU: 0 PID: 212 Comm: kworker/0:2 Not tainted 5.5.0-rc1-00136-g883a2cefc0684 #1
> [   13.966712] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> [   13.968528] Workqueue: events kfree_rcu_work
> [   13.969466] RIP: 0010:debug_print_object+0x95/0xd0
> [   13.970480] Code: d2 e8 2f 06 d6 ff 8b 43 10 4d 89 f1 4c 89 e6 8b 4b 14 48 c7 c7 88 73 be 82 4d 8b 45 00 48 8b 14 c5 a0 5f 6d 82 e8 7b 65 c6 ff <0f> 0b b9 01 00 00 00 31 d2 be 01 00 00 00 48 c7 c7 98 b8 0c 83 e8
> [   13.974435] RSP: 0000:ffff888231677bf8 EFLAGS: 00010282
> [   13.975531] RAX: 0000000000000000 RBX: ffff88822d4200e0 RCX: 0000000000000000
> [   13.976730] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff8306e028
> [   13.977568] RBP: ffff888231677c18 R08: 0000000000000000 R09: ffff888231670790
> [   13.978412] R10: ffff888231670000 R11: 0000000000000003 R12: ffffffff82bc5299
> [   13.979250] R13: ffffffff82e77360 R14: 0000000000000000 R15: dead000000000100
> [   13.980089] FS:  0000000000000000(0000) GS:ffffffff82e4f000(0000) knlGS:0000000000000000
> [   13.981069] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   13.981746] CR2: 00007f1e913fc77c CR3: 0000000225ce9000 CR4: 00000000000006f0
> [   13.982587] Call Trace:
> [   13.982911]  __debug_check_no_obj_freed+0x19a/0x200
> [   13.983494]  debug_check_no_obj_freed+0x14/0x20
> [   13.984036]  free_pcp_prepare+0xee/0x1d0
> [   13.984541]  free_unref_page+0x1b/0x80
> [   13.984994]  __free_pages+0x19/0x20
> [   13.985503]  __free_pages+0x13/0x20
> [   13.985924]  slob_free_pages+0x7d/0x90
> [   13.986373]  slob_free+0x34f/0x530
> [   13.986784]  kfree+0x154/0x210
> [   13.987155]  __kmem_cache_free_bulk+0x44/0x60
> [   13.987673]  kmem_cache_free_bulk+0xe/0x10
> [   13.988163]  kfree_rcu_work+0x95/0x310
> [   13.989010]  ? kfree_rcu_work+0x64/0x310
> [   13.989884]  process_one_work+0x378/0x7c0
> [   13.990770]  worker_thread+0x40/0x600
> [   13.991587]  kthread+0x14e/0x170
> [   13.992344]  ? process_one_work+0x7c0/0x7c0
> [   13.993256]  ? kthread_create_on_node+0x70/0x70
> [   13.994246]  ret_from_fork+0x3a/0x50
> [   13.995039] ---[ end trace cdf242638b0e32a0 ]---
> [child0:632] trace_fd was -1
> <snip>
> 
> the trace happens when the kernel is built with CONFIG_DEBUG_OBJECTS_FREE
> and CONFIG_DEBUG_OBJECTS_RCU_HEAD. Basically it is not a problem of the patch
> itself or there is any bug there. It just does not pair with debug_rcu_head_queue(head)
> in the kfree_rcu_work() function, that is why the kernel thinks about freeing
> an active object that is not active in reality.
> 
> I will upload a V2 to fix that.

Oh good point. Thanks for fixing that.

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2019-12-31 12:22 [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu() Uladzislau Rezki (Sony)
  2020-01-13 19:03 ` Paul E. McKenney
@ 2020-01-16  1:14 ` Joel Fernandes
  2020-01-16  2:41   ` Paul E. McKenney
  2020-01-16 17:24   ` Uladzislau Rezki
  1 sibling, 2 replies; 18+ messages in thread
From: Joel Fernandes @ 2020-01-16  1:14 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, Paul E . McKenney, RCU, Steven Rostedt, Oleksiy Avramchenko

On Tue, Dec 31, 2019 at 01:22:41PM +0100, Uladzislau Rezki (Sony) wrote:
> kfree_rcu() logic can be improved further by using kfree_bulk()
> interface along with "basic batching support" introduced earlier.
> 
> The are at least two advantages of using "bulk" interface:
> - in case of large number of kfree_rcu() requests kfree_bulk()
>   reduces the per-object overhead caused by calling kfree()
>   per-object.
> 
> - reduces the number of cache-misses due to "pointer chasing"
>   between objects which can be far spread between each other.
> 
> This approach defines a new kfree_rcu_bulk_data structure that
> stores pointers in an array with a specific size. Number of entries
> in that array depends on PAGE_SIZE making kfree_rcu_bulk_data
> structure to be exactly one page.
> 
> Since it deals with "block-chain" technique there is an extra
> need in dynamic allocation when a new block is required. Memory
> is allocated with GFP_NOWAIT | __GFP_NOWARN flags, i.e. that
> allows to skip direct reclaim under low memory condition to
> prevent stalling and fails silently under high memory pressure.
> 
> The "emergency path" gets maintained when a system is run out
> of memory. In that case objects are linked into regular list
> and that is it.
> 
> In order to evaluate it, the "rcuperf" was run to analyze how
> much memory is consumed and what is kfree_bulk() throughput.
> 
> Testing on the HiKey-960, arm64, 8xCPUs with below parameters:
> 
> CONFIG_SLAB=y
> kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1
> 
> 102898760401 ns, loops: 200000, batches: 5822, memory footprint: 158MB
> 89947009882  ns, loops: 200000, batches: 6715, memory footprint: 115MB
> 
> rcuperf shows approximately ~12% better throughput(Total time)
> in case of using "bulk" interface. The "drain logic" or its RCU
> callback does the work faster that leads to better throughput.

Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>

(Vlad is going to post a v2 which fixes a debugobjects bug but that should
not have any impact on testing).

thanks,

 - Joel



> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  kernel/rcu/tree.c | 154 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 130 insertions(+), 24 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 48fba2257748..4ee5c737558b 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2754,22 +2754,45 @@ EXPORT_SYMBOL_GPL(call_rcu);
>  #define KFREE_DRAIN_JIFFIES (HZ / 50)
>  #define KFREE_N_BATCHES 2
>  
> +/*
> + * This macro defines how many entries the "records" array
> + * will contain. It is based on the fact that the size of
> + * kfree_rcu_bulk_data structure becomes exactly one page.
> + */
> +#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 2)
> +
> +/**
> + * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
> + * @nr_records: Number of active pointers in the array
> + * @records: Array of the kfree_rcu() pointers
> + * @next: Next bulk object in the block chain
> + */
> +struct kfree_rcu_bulk_data {
> +	unsigned long nr_records;
> +	void *records[KFREE_BULK_MAX_ENTR];
> +	struct kfree_rcu_bulk_data *next;
> +};
> +
>  /**
>   * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
>   * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
>   * @head_free: List of kfree_rcu() objects waiting for a grace period
> + * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
>   * @krcp: Pointer to @kfree_rcu_cpu structure
>   */
>  
>  struct kfree_rcu_cpu_work {
>  	struct rcu_work rcu_work;
>  	struct rcu_head *head_free;
> +	struct kfree_rcu_bulk_data *bhead_free;
>  	struct kfree_rcu_cpu *krcp;
>  };
>  
>  /**
>   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
>   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> + * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
> + * @bcached: Keeps at most one object for later reuse when build chain blocks
>   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
>   * @lock: Synchronize access to this structure
>   * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
> @@ -2783,6 +2806,8 @@ struct kfree_rcu_cpu_work {
>   */
>  struct kfree_rcu_cpu {
>  	struct rcu_head *head;
> +	struct kfree_rcu_bulk_data *bhead;
> +	struct kfree_rcu_bulk_data *bcached;
>  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
>  	spinlock_t lock;
>  	struct delayed_work monitor_work;
> @@ -2800,6 +2825,7 @@ static void kfree_rcu_work(struct work_struct *work)
>  {
>  	unsigned long flags;
>  	struct rcu_head *head, *next;
> +	struct kfree_rcu_bulk_data *bhead, *bnext;
>  	struct kfree_rcu_cpu *krcp;
>  	struct kfree_rcu_cpu_work *krwp;
>  
> @@ -2809,22 +2835,39 @@ static void kfree_rcu_work(struct work_struct *work)
>  	spin_lock_irqsave(&krcp->lock, flags);
>  	head = krwp->head_free;
>  	krwp->head_free = NULL;
> +	bhead = krwp->bhead_free;
> +	krwp->bhead_free = NULL;
>  	spin_unlock_irqrestore(&krcp->lock, flags);
>  
> -	// List "head" is now private, so traverse locklessly.
> +	/* List "bhead" is now private, so traverse locklessly. */
> +	for (; bhead; bhead = bnext) {
> +		bnext = bhead->next;
> +
> +		rcu_lock_acquire(&rcu_callback_map);
> +		kfree_bulk(bhead->nr_records, bhead->records);
> +		rcu_lock_release(&rcu_callback_map);
> +
> +		if (cmpxchg(&krcp->bcached, NULL, bhead))
> +			free_page((unsigned long) bhead);
> +
> +		cond_resched_tasks_rcu_qs();
> +	}
> +
> +	/*
> +	 * Emergency case only. It can happen under low memory
> +	 * condition when an allocation gets failed, so the "bulk"
> +	 * path can not be temporary maintained.
> +	 */
>  	for (; head; head = next) {
>  		unsigned long offset = (unsigned long)head->func;
>  
>  		next = head->next;
> -		// Potentially optimize with kfree_bulk in future.
>  		debug_rcu_head_unqueue(head);
>  		rcu_lock_acquire(&rcu_callback_map);
>  		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
>  
> -		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
> -			/* Could be optimized with kfree_bulk() in future. */
> +		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
>  			kfree((void *)head - offset);
> -		}
>  
>  		rcu_lock_release(&rcu_callback_map);
>  		cond_resched_tasks_rcu_qs();
> @@ -2839,26 +2882,45 @@ static void kfree_rcu_work(struct work_struct *work)
>   */
>  static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
>  {
> +	struct kfree_rcu_cpu_work *krwp;
> +	bool queued = false;
>  	int i;
> -	struct kfree_rcu_cpu_work *krwp = NULL;
>  
>  	lockdep_assert_held(&krcp->lock);
> -	for (i = 0; i < KFREE_N_BATCHES; i++)
> -		if (!krcp->krw_arr[i].head_free) {
> -			krwp = &(krcp->krw_arr[i]);
> -			break;
> -		}
>  
> -	// If a previous RCU batch is in progress, we cannot immediately
> -	// queue another one, so return false to tell caller to retry.
> -	if (!krwp)
> -		return false;
> +	for (i = 0; i < KFREE_N_BATCHES; i++) {
> +		krwp = &(krcp->krw_arr[i]);
>  
> -	krwp->head_free = krcp->head;
> -	krcp->head = NULL;
> -	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
> -	queue_rcu_work(system_wq, &krwp->rcu_work);
> -	return true;
> +		/*
> +		 * Try to detach bhead or head and attach it over any
> +		 * available corresponding free channel. It can be that
> +		 * a previous RCU batch is in progress, it means that
> +		 * immediately to queue another one is not possible so
> +		 * return false to tell caller to retry.
> +		 */
> +		if ((krcp->bhead && !krwp->bhead_free) ||
> +				(krcp->head && !krwp->head_free)) {
> +			if (!krwp->bhead_free) {
> +				krwp->bhead_free = krcp->bhead;
> +				krcp->bhead = NULL;
> +			}
> +
> +			if (!krwp->head_free) {
> +				krwp->head_free = krcp->head;
> +				krcp->head = NULL;
> +			}
> +
> +			/*
> +			 * The work can already be queued. If so, it means that
> +			 * within a short time, second, either head or bhead has
> +			 * been detached as well.
> +			 */
> +			queue_rcu_work(system_wq, &krwp->rcu_work);
> +			queued = true;
> +		}
> +	}
> +
> +	return queued;
>  }
>  
>  static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
> @@ -2895,6 +2957,39 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  		spin_unlock_irqrestore(&krcp->lock, flags);
>  }
>  
> +static inline bool
> +kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
> +{
> +	struct kfree_rcu_bulk_data *bnode;
> +
> +	if (unlikely(!krcp->initialized))
> +		return false;
> +
> +	lockdep_assert_held(&krcp->lock);
> +
> +	/* Check if a new block is required. */
> +	if (!krcp->bhead ||
> +			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
> +		bnode = xchg(&krcp->bcached, NULL);
> +		if (!bnode)
> +			bnode = (struct kfree_rcu_bulk_data *)
> +				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> +
> +		/* No cache or an allocation got failed. */
> +		if (unlikely(!bnode))
> +			return false;
> +
> +		/* Initialize the new block. */
> +		bnode->nr_records = 0;
> +		bnode->next = krcp->bhead;
> +		krcp->bhead = bnode;
> +	}
> +
> +	/* Finally insert. */
> +	krcp->bhead->records[krcp->bhead->nr_records++] = ptr;
> +	return true;
> +}
> +
>  /*
>   * Queue a request for lazy invocation of kfree() after a grace period.
>   *
> @@ -2926,9 +3021,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  			  __func__, head);
>  		goto unlock_return;
>  	}
> -	head->func = func;
> -	head->next = krcp->head;
> -	krcp->head = head;
> +
> +	/*
> +	 * Under high memory pressure GFP_NOWAIT can fail,
> +	 * in that case the emergency path is maintained.
> +	 */
> +	if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp,
> +			(void *) head - (unsigned long) func))) {
> +		head->func = func;
> +		head->next = krcp->head;
> +		krcp->head = head;
> +	}
>  
>  	// Set timer to drain after KFREE_DRAIN_JIFFIES.
>  	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> @@ -3834,8 +3937,11 @@ static void __init kfree_rcu_batch_init(void)
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>  
>  		spin_lock_init(&krcp->lock);
> -		for (i = 0; i < KFREE_N_BATCHES; i++)
> +		for (i = 0; i < KFREE_N_BATCHES; i++) {
> +			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>  			krcp->krw_arr[i].krcp = krcp;
> +		}
> +
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
>  		krcp->initialized = true;
>  	}
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-16  1:14 ` Joel Fernandes
@ 2020-01-16  2:41   ` Paul E. McKenney
  2020-01-16 17:27     ` Uladzislau Rezki
  2020-01-16 17:24   ` Uladzislau Rezki
  1 sibling, 1 reply; 18+ messages in thread
From: Paul E. McKenney @ 2020-01-16  2:41 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki (Sony), LKML, RCU, Steven Rostedt, Oleksiy Avramchenko

On Wed, Jan 15, 2020 at 08:14:10PM -0500, Joel Fernandes wrote:
> On Tue, Dec 31, 2019 at 01:22:41PM +0100, Uladzislau Rezki (Sony) wrote:
> > kfree_rcu() logic can be improved further by using kfree_bulk()
> > interface along with "basic batching support" introduced earlier.
> > 
> > The are at least two advantages of using "bulk" interface:
> > - in case of large number of kfree_rcu() requests kfree_bulk()
> >   reduces the per-object overhead caused by calling kfree()
> >   per-object.
> > 
> > - reduces the number of cache-misses due to "pointer chasing"
> >   between objects which can be far spread between each other.
> > 
> > This approach defines a new kfree_rcu_bulk_data structure that
> > stores pointers in an array with a specific size. Number of entries
> > in that array depends on PAGE_SIZE making kfree_rcu_bulk_data
> > structure to be exactly one page.
> > 
> > Since it deals with "block-chain" technique there is an extra
> > need in dynamic allocation when a new block is required. Memory
> > is allocated with GFP_NOWAIT | __GFP_NOWARN flags, i.e. that
> > allows to skip direct reclaim under low memory condition to
> > prevent stalling and fails silently under high memory pressure.
> > 
> > The "emergency path" gets maintained when a system is run out
> > of memory. In that case objects are linked into regular list
> > and that is it.
> > 
> > In order to evaluate it, the "rcuperf" was run to analyze how
> > much memory is consumed and what is kfree_bulk() throughput.
> > 
> > Testing on the HiKey-960, arm64, 8xCPUs with below parameters:
> > 
> > CONFIG_SLAB=y
> > kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1
> > 
> > 102898760401 ns, loops: 200000, batches: 5822, memory footprint: 158MB
> > 89947009882  ns, loops: 200000, batches: 6715, memory footprint: 115MB
> > 
> > rcuperf shows approximately ~12% better throughput(Total time)
> > in case of using "bulk" interface. The "drain logic" or its RCU
> > callback does the work faster that leads to better throughput.
> 
> Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> 
> (Vlad is going to post a v2 which fixes a debugobjects bug but that should
> not have any impact on testing).

Very good!  Uladzislau, could you please add Joel's Tested-by in
your next posting?

							Thanx, Paul

> thanks,
> 
>  - Joel
> 
> 
> 
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > ---
> >  kernel/rcu/tree.c | 154 ++++++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 130 insertions(+), 24 deletions(-)
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 48fba2257748..4ee5c737558b 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -2754,22 +2754,45 @@ EXPORT_SYMBOL_GPL(call_rcu);
> >  #define KFREE_DRAIN_JIFFIES (HZ / 50)
> >  #define KFREE_N_BATCHES 2
> >  
> > +/*
> > + * This macro defines how many entries the "records" array
> > + * will contain. It is based on the fact that the size of
> > + * kfree_rcu_bulk_data structure becomes exactly one page.
> > + */
> > +#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 2)
> > +
> > +/**
> > + * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
> > + * @nr_records: Number of active pointers in the array
> > + * @records: Array of the kfree_rcu() pointers
> > + * @next: Next bulk object in the block chain
> > + */
> > +struct kfree_rcu_bulk_data {
> > +	unsigned long nr_records;
> > +	void *records[KFREE_BULK_MAX_ENTR];
> > +	struct kfree_rcu_bulk_data *next;
> > +};
> > +
> >  /**
> >   * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
> >   * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
> >   * @head_free: List of kfree_rcu() objects waiting for a grace period
> > + * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
> >   * @krcp: Pointer to @kfree_rcu_cpu structure
> >   */
> >  
> >  struct kfree_rcu_cpu_work {
> >  	struct rcu_work rcu_work;
> >  	struct rcu_head *head_free;
> > +	struct kfree_rcu_bulk_data *bhead_free;
> >  	struct kfree_rcu_cpu *krcp;
> >  };
> >  
> >  /**
> >   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> >   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> > + * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
> > + * @bcached: Keeps at most one object for later reuse when build chain blocks
> >   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> >   * @lock: Synchronize access to this structure
> >   * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
> > @@ -2783,6 +2806,8 @@ struct kfree_rcu_cpu_work {
> >   */
> >  struct kfree_rcu_cpu {
> >  	struct rcu_head *head;
> > +	struct kfree_rcu_bulk_data *bhead;
> > +	struct kfree_rcu_bulk_data *bcached;
> >  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> >  	spinlock_t lock;
> >  	struct delayed_work monitor_work;
> > @@ -2800,6 +2825,7 @@ static void kfree_rcu_work(struct work_struct *work)
> >  {
> >  	unsigned long flags;
> >  	struct rcu_head *head, *next;
> > +	struct kfree_rcu_bulk_data *bhead, *bnext;
> >  	struct kfree_rcu_cpu *krcp;
> >  	struct kfree_rcu_cpu_work *krwp;
> >  
> > @@ -2809,22 +2835,39 @@ static void kfree_rcu_work(struct work_struct *work)
> >  	spin_lock_irqsave(&krcp->lock, flags);
> >  	head = krwp->head_free;
> >  	krwp->head_free = NULL;
> > +	bhead = krwp->bhead_free;
> > +	krwp->bhead_free = NULL;
> >  	spin_unlock_irqrestore(&krcp->lock, flags);
> >  
> > -	// List "head" is now private, so traverse locklessly.
> > +	/* List "bhead" is now private, so traverse locklessly. */
> > +	for (; bhead; bhead = bnext) {
> > +		bnext = bhead->next;
> > +
> > +		rcu_lock_acquire(&rcu_callback_map);
> > +		kfree_bulk(bhead->nr_records, bhead->records);
> > +		rcu_lock_release(&rcu_callback_map);
> > +
> > +		if (cmpxchg(&krcp->bcached, NULL, bhead))
> > +			free_page((unsigned long) bhead);
> > +
> > +		cond_resched_tasks_rcu_qs();
> > +	}
> > +
> > +	/*
> > +	 * Emergency case only. It can happen under low memory
> > +	 * condition when an allocation gets failed, so the "bulk"
> > +	 * path can not be temporary maintained.
> > +	 */
> >  	for (; head; head = next) {
> >  		unsigned long offset = (unsigned long)head->func;
> >  
> >  		next = head->next;
> > -		// Potentially optimize with kfree_bulk in future.
> >  		debug_rcu_head_unqueue(head);
> >  		rcu_lock_acquire(&rcu_callback_map);
> >  		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
> >  
> > -		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
> > -			/* Could be optimized with kfree_bulk() in future. */
> > +		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
> >  			kfree((void *)head - offset);
> > -		}
> >  
> >  		rcu_lock_release(&rcu_callback_map);
> >  		cond_resched_tasks_rcu_qs();
> > @@ -2839,26 +2882,45 @@ static void kfree_rcu_work(struct work_struct *work)
> >   */
> >  static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
> >  {
> > +	struct kfree_rcu_cpu_work *krwp;
> > +	bool queued = false;
> >  	int i;
> > -	struct kfree_rcu_cpu_work *krwp = NULL;
> >  
> >  	lockdep_assert_held(&krcp->lock);
> > -	for (i = 0; i < KFREE_N_BATCHES; i++)
> > -		if (!krcp->krw_arr[i].head_free) {
> > -			krwp = &(krcp->krw_arr[i]);
> > -			break;
> > -		}
> >  
> > -	// If a previous RCU batch is in progress, we cannot immediately
> > -	// queue another one, so return false to tell caller to retry.
> > -	if (!krwp)
> > -		return false;
> > +	for (i = 0; i < KFREE_N_BATCHES; i++) {
> > +		krwp = &(krcp->krw_arr[i]);
> >  
> > -	krwp->head_free = krcp->head;
> > -	krcp->head = NULL;
> > -	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
> > -	queue_rcu_work(system_wq, &krwp->rcu_work);
> > -	return true;
> > +		/*
> > +		 * Try to detach bhead or head and attach it over any
> > +		 * available corresponding free channel. It can be that
> > +		 * a previous RCU batch is in progress, it means that
> > +		 * immediately to queue another one is not possible so
> > +		 * return false to tell caller to retry.
> > +		 */
> > +		if ((krcp->bhead && !krwp->bhead_free) ||
> > +				(krcp->head && !krwp->head_free)) {
> > +			if (!krwp->bhead_free) {
> > +				krwp->bhead_free = krcp->bhead;
> > +				krcp->bhead = NULL;
> > +			}
> > +
> > +			if (!krwp->head_free) {
> > +				krwp->head_free = krcp->head;
> > +				krcp->head = NULL;
> > +			}
> > +
> > +			/*
> > +			 * The work can already be queued. If so, it means that
> > +			 * within a short time, second, either head or bhead has
> > +			 * been detached as well.
> > +			 */
> > +			queue_rcu_work(system_wq, &krwp->rcu_work);
> > +			queued = true;
> > +		}
> > +	}
> > +
> > +	return queued;
> >  }
> >  
> >  static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
> > @@ -2895,6 +2957,39 @@ static void kfree_rcu_monitor(struct work_struct *work)
> >  		spin_unlock_irqrestore(&krcp->lock, flags);
> >  }
> >  
> > +static inline bool
> > +kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
> > +{
> > +	struct kfree_rcu_bulk_data *bnode;
> > +
> > +	if (unlikely(!krcp->initialized))
> > +		return false;
> > +
> > +	lockdep_assert_held(&krcp->lock);
> > +
> > +	/* Check if a new block is required. */
> > +	if (!krcp->bhead ||
> > +			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
> > +		bnode = xchg(&krcp->bcached, NULL);
> > +		if (!bnode)
> > +			bnode = (struct kfree_rcu_bulk_data *)
> > +				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > +
> > +		/* No cache or an allocation got failed. */
> > +		if (unlikely(!bnode))
> > +			return false;
> > +
> > +		/* Initialize the new block. */
> > +		bnode->nr_records = 0;
> > +		bnode->next = krcp->bhead;
> > +		krcp->bhead = bnode;
> > +	}
> > +
> > +	/* Finally insert. */
> > +	krcp->bhead->records[krcp->bhead->nr_records++] = ptr;
> > +	return true;
> > +}
> > +
> >  /*
> >   * Queue a request for lazy invocation of kfree() after a grace period.
> >   *
> > @@ -2926,9 +3021,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> >  			  __func__, head);
> >  		goto unlock_return;
> >  	}
> > -	head->func = func;
> > -	head->next = krcp->head;
> > -	krcp->head = head;
> > +
> > +	/*
> > +	 * Under high memory pressure GFP_NOWAIT can fail,
> > +	 * in that case the emergency path is maintained.
> > +	 */
> > +	if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp,
> > +			(void *) head - (unsigned long) func))) {
> > +		head->func = func;
> > +		head->next = krcp->head;
> > +		krcp->head = head;
> > +	}
> >  
> >  	// Set timer to drain after KFREE_DRAIN_JIFFIES.
> >  	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> > @@ -3834,8 +3937,11 @@ static void __init kfree_rcu_batch_init(void)
> >  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> >  
> >  		spin_lock_init(&krcp->lock);
> > -		for (i = 0; i < KFREE_N_BATCHES; i++)
> > +		for (i = 0; i < KFREE_N_BATCHES; i++) {
> > +			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> >  			krcp->krw_arr[i].krcp = krcp;
> > +		}
> > +
> >  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> >  		krcp->initialized = true;
> >  	}
> > -- 
> > 2.20.1
> > 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-16  1:14 ` Joel Fernandes
  2020-01-16  2:41   ` Paul E. McKenney
@ 2020-01-16 17:24   ` Uladzislau Rezki
  1 sibling, 0 replies; 18+ messages in thread
From: Uladzislau Rezki @ 2020-01-16 17:24 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki (Sony),
	LKML, Paul E . McKenney, RCU, Steven Rostedt,
	Oleksiy Avramchenko

> 
> Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> 
Thank you and appreciate your help, Joel.

>
> (Vlad is going to post a v2 which fixes a debugobjects bug but that should
> not have any impact on testing).
> 
I will do that :)

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-16  2:41   ` Paul E. McKenney
@ 2020-01-16 17:27     ` Uladzislau Rezki
  2020-01-16 17:44       ` Paul E. McKenney
  0 siblings, 1 reply; 18+ messages in thread
From: Uladzislau Rezki @ 2020-01-16 17:27 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Joel Fernandes, Uladzislau Rezki (Sony),
	LKML, RCU, Steven Rostedt, Oleksiy Avramchenko

On Wed, Jan 15, 2020 at 06:41:26PM -0800, Paul E. McKenney wrote:
> On Wed, Jan 15, 2020 at 08:14:10PM -0500, Joel Fernandes wrote:
> > On Tue, Dec 31, 2019 at 01:22:41PM +0100, Uladzislau Rezki (Sony) wrote:
> > > kfree_rcu() logic can be improved further by using kfree_bulk()
> > > interface along with "basic batching support" introduced earlier.
> > > 
> > > The are at least two advantages of using "bulk" interface:
> > > - in case of large number of kfree_rcu() requests kfree_bulk()
> > >   reduces the per-object overhead caused by calling kfree()
> > >   per-object.
> > > 
> > > - reduces the number of cache-misses due to "pointer chasing"
> > >   between objects which can be far spread between each other.
> > > 
> > > This approach defines a new kfree_rcu_bulk_data structure that
> > > stores pointers in an array with a specific size. Number of entries
> > > in that array depends on PAGE_SIZE making kfree_rcu_bulk_data
> > > structure to be exactly one page.
> > > 
> > > Since it deals with "block-chain" technique there is an extra
> > > need in dynamic allocation when a new block is required. Memory
> > > is allocated with GFP_NOWAIT | __GFP_NOWARN flags, i.e. that
> > > allows to skip direct reclaim under low memory condition to
> > > prevent stalling and fails silently under high memory pressure.
> > > 
> > > The "emergency path" gets maintained when a system is run out
> > > of memory. In that case objects are linked into regular list
> > > and that is it.
> > > 
> > > In order to evaluate it, the "rcuperf" was run to analyze how
> > > much memory is consumed and what is kfree_bulk() throughput.
> > > 
> > > Testing on the HiKey-960, arm64, 8xCPUs with below parameters:
> > > 
> > > CONFIG_SLAB=y
> > > kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1
> > > 
> > > 102898760401 ns, loops: 200000, batches: 5822, memory footprint: 158MB
> > > 89947009882  ns, loops: 200000, batches: 6715, memory footprint: 115MB
> > > 
> > > rcuperf shows approximately ~12% better throughput(Total time)
> > > in case of using "bulk" interface. The "drain logic" or its RCU
> > > callback does the work faster that leads to better throughput.
> > 
> > Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > 
> > (Vlad is going to post a v2 which fixes a debugobjects bug but that should
> > not have any impact on testing).
> 
> Very good!  Uladzislau, could you please add Joel's Tested-by in
> your next posting?
> 
I will add it for sure, in the V2 version. Also, I will update the
commit message by adding the results related to different slab cache
usage, I mean with Joel's recent patch.

Thank you.

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-16 17:27     ` Uladzislau Rezki
@ 2020-01-16 17:44       ` Paul E. McKenney
  0 siblings, 0 replies; 18+ messages in thread
From: Paul E. McKenney @ 2020-01-16 17:44 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Joel Fernandes, LKML, RCU, Steven Rostedt, Oleksiy Avramchenko

On Thu, Jan 16, 2020 at 06:27:53PM +0100, Uladzislau Rezki wrote:
> On Wed, Jan 15, 2020 at 06:41:26PM -0800, Paul E. McKenney wrote:
> > On Wed, Jan 15, 2020 at 08:14:10PM -0500, Joel Fernandes wrote:
> > > On Tue, Dec 31, 2019 at 01:22:41PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > kfree_rcu() logic can be improved further by using kfree_bulk()
> > > > interface along with "basic batching support" introduced earlier.
> > > > 
> > > > The are at least two advantages of using "bulk" interface:
> > > > - in case of large number of kfree_rcu() requests kfree_bulk()
> > > >   reduces the per-object overhead caused by calling kfree()
> > > >   per-object.
> > > > 
> > > > - reduces the number of cache-misses due to "pointer chasing"
> > > >   between objects which can be far spread between each other.
> > > > 
> > > > This approach defines a new kfree_rcu_bulk_data structure that
> > > > stores pointers in an array with a specific size. Number of entries
> > > > in that array depends on PAGE_SIZE making kfree_rcu_bulk_data
> > > > structure to be exactly one page.
> > > > 
> > > > Since it deals with "block-chain" technique there is an extra
> > > > need in dynamic allocation when a new block is required. Memory
> > > > is allocated with GFP_NOWAIT | __GFP_NOWARN flags, i.e. that
> > > > allows to skip direct reclaim under low memory condition to
> > > > prevent stalling and fails silently under high memory pressure.
> > > > 
> > > > The "emergency path" gets maintained when a system is run out
> > > > of memory. In that case objects are linked into regular list
> > > > and that is it.
> > > > 
> > > > In order to evaluate it, the "rcuperf" was run to analyze how
> > > > much memory is consumed and what is kfree_bulk() throughput.
> > > > 
> > > > Testing on the HiKey-960, arm64, 8xCPUs with below parameters:
> > > > 
> > > > CONFIG_SLAB=y
> > > > kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1
> > > > 
> > > > 102898760401 ns, loops: 200000, batches: 5822, memory footprint: 158MB
> > > > 89947009882  ns, loops: 200000, batches: 6715, memory footprint: 115MB
> > > > 
> > > > rcuperf shows approximately ~12% better throughput(Total time)
> > > > in case of using "bulk" interface. The "drain logic" or its RCU
> > > > callback does the work faster that leads to better throughput.
> > > 
> > > Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > 
> > > (Vlad is going to post a v2 which fixes a debugobjects bug but that should
> > > not have any impact on testing).
> > 
> > Very good!  Uladzislau, could you please add Joel's Tested-by in
> > your next posting?
> > 
> I will add for sure, with the a V2 version. Also, i will update the
> commit message by adding the results related to different slab cache
> usage, i mean with Joel's recent patch.

Sounds good, looking forward to it!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-15 22:53       ` Joel Fernandes
@ 2020-01-17 17:52         ` Uladzislau Rezki
  2020-01-17 18:57           ` Joel Fernandes
  0 siblings, 1 reply; 18+ messages in thread
From: Uladzislau Rezki @ 2020-01-17 17:52 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki, Paul E. McKenney, LKML, RCU, Steven Rostedt,
	Oleksiy Avramchenko

> > > > But rcuperf uses a single block size, which turns into kfree_bulk() using
> > > > a single slab, which results in good locality of reference.  So I have to
> > > 
> > > You meant a "single cache" category when you say "single slab"? Just to
> > > mention, the number of slabs (in a single cache) when a large number of
> > > objects are allocated is more than 1 (not single). With current rcuperf, I
> > > see 100s of slabs (each slab being one page) in the kmalloc-32 cache. Each
> > > slab contains around 128 objects of type kfree_rcu (24 byte object aligned to
> > > 32-byte slab object).
> > > 
> > I think that is about using different slab caches to break locality. It
> > makes sense, IMHO, because usually the system make use of different slabs,
> > because of different object sizes. From the other hand i guess there are
> > test cases when only one slab gets used.
> 
> I was wondering about "locality". A cache can be split into many slabs. Only
> the data on a page is local (contiguous). If there are a large number of
> objects, then it goes to a new slab (on the same cache). At least on the
> kmalloc slabs, there is only 1 slab per page. So for example, if on
> kmalloc-32 slab, there are more than 128 objects, then it goes to a different
> slab / page. So how is there still locality?
> 
Hmm.. On a high level:

One slab cache manages a specific object size, i.e. the slab memory consists of
contiguous pages (probably not contiguous once the cache grows) of memory (4096
bytes or so) divided into equal-sized objects. For example, when kmalloc() gets
called, the appropriate cache (the one that serves only that specific size) is
selected and an object allocated from it is returned.

But that is theory and i have not deeply analyzed how the SLAB works internally,
so i can be wrong :)

You mentioned 128 objects per one slab in the kmalloc-32 slab-cache. But all of
them follow each other, I mean they are sequential, like a regular array. In
that sense freeing can be beneficial because when any object is accessed the
whole CPU cache-line is fetched (if it was not already), usually it is 64K.

That is what I meant by "locality". In order to "break it" I meant to allocate from
different slabs to see how kfree_bulk() behaves in that sense, which is a more
realistic scenario and workload, I think.

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-17 17:52         ` Uladzislau Rezki
@ 2020-01-17 18:57           ` Joel Fernandes
  2020-01-17 21:37             ` Paul E. McKenney
  0 siblings, 1 reply; 18+ messages in thread
From: Joel Fernandes @ 2020-01-17 18:57 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Paul E. McKenney, LKML, RCU, Steven Rostedt, Oleksiy Avramchenko

On Fri, Jan 17, 2020 at 06:52:17PM +0100, Uladzislau Rezki wrote:
> > > > > But rcuperf uses a single block size, which turns into kfree_bulk() using
> > > > > a single slab, which results in good locality of reference.  So I have to
> > > > 
> > > > You meant a "single cache" category when you say "single slab"? Just to
> > > > mention, the number of slabs (in a single cache) when a large number of
> > > > objects are allocated is more than 1 (not single). With current rcuperf, I
> > > > see 100s of slabs (each slab being one page) in the kmalloc-32 cache. Each
> > > > slab contains around 128 objects of type kfree_rcu (24 byte object aligned to
> > > > 32-byte slab object).
> > > > 
> > > I think that is about using different slab caches to break locality. It
> > > makes sense, IMHO, because usually the system make use of different slabs,
> > > because of different object sizes. From the other hand i guess there are
> > > test cases when only one slab gets used.
> > 
> > I was wondering about "locality". A cache can be split into many slabs. Only
> > the data on a page is local (contiguous). If there are a large number of
> > objects, then it goes to a new slab (on the same cache). At least on the
> > kmalloc slabs, there is only 1 slab per page. So for example, if on
> > kmalloc-32 slab, there are more than 128 objects, then it goes to a different
> > slab / page. So how is there still locality?
> > 
> Hmm.. On a high level:
> 
> one slab cache manages a specific object size, i.e. the slab memory consists of
> contiguous pages(when increased probably not) of memory(4096 bytes or so) divided
> into equal object size. For example when kmalloc() gets called, the appropriate
> cache size(slab that serves only specific size) is selected and an object assigned
> from it is returned.
> 
> But that is theory and i have not deeply analyzed how the SLAB works internally,
> so i can be wrong :)
> 
> You mentioned 128 objects per one slab in the kmalloc-32 slab-cache. But all of
> them follows each other, i mean it is sequential and is like regular array. In

Yes, for these 128 objects it is sequential. But the next 128 could be on
some other page, is what I was saying. And we are allocating 10s of 1000s of
objects in this test.  (I believe pages are sequential only per slab and not
for a different slab within same cache).

> that sense freeing can be beneficial because when an access is done to any object
> whole CPU cache-line is fetched(if it was not before), usually it is 64K.

You mean size of the whole L1 cache right? cachelines are in the order of bytes.

> That is what i meant "locality". In order to "break it" i meant to allocate from
> different slabs to see how kfree_slub() behaves in that sense, what is more real
> scenario and workload, i think.

Ok, agreed.
(BTW I do agree your patch is beneficial, just wanted to get the slab
discussion right).

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-17 18:57           ` Joel Fernandes
@ 2020-01-17 21:37             ` Paul E. McKenney
  2020-01-17 21:59               ` Joel Fernandes
  0 siblings, 1 reply; 18+ messages in thread
From: Paul E. McKenney @ 2020-01-17 21:37 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki, LKML, RCU, Steven Rostedt, Oleksiy Avramchenko

On Fri, Jan 17, 2020 at 01:57:32PM -0500, Joel Fernandes wrote:
> On Fri, Jan 17, 2020 at 06:52:17PM +0100, Uladzislau Rezki wrote:
> > > > > > But rcuperf uses a single block size, which turns into kfree_bulk() using
> > > > > > a single slab, which results in good locality of reference.  So I have to
> > > > > 
> > > > > You meant a "single cache" category when you say "single slab"? Just to
> > > > > mention, the number of slabs (in a single cache) when a large number of
> > > > > objects are allocated is more than 1 (not single). With current rcuperf, I
> > > > > see 100s of slabs (each slab being one page) in the kmalloc-32 cache. Each
> > > > > slab contains around 128 objects of type kfree_rcu (24 byte object aligned to
> > > > > 32-byte slab object).
> > > > > 
> > > > I think that is about using different slab caches to break locality. It
> > > > makes sense, IMHO, because usually the system make use of different slabs,
> > > > because of different object sizes. From the other hand i guess there are
> > > > test cases when only one slab gets used.
> > > 
> > > I was wondering about "locality". A cache can be split into many slabs. Only
> > > the data on a page is local (contiguous). If there are a large number of
> > > objects, then it goes to a new slab (on the same cache). At least on the
> > > kmalloc slabs, there is only 1 slab per page. So for example, if on
> > > kmalloc-32 slab, there are more than 128 objects, then it goes to a different
> > > slab / page. So how is there still locality?
> > > 
> > Hmm.. On a high level:
> > 
> > one slab cache manages a specific object size, i.e. the slab memory consists of
> > contiguous pages(when increased probably not) of memory(4096 bytes or so) divided
> > into equal object size. For example when kmalloc() gets called, the appropriate
> > cache size(slab that serves only specific size) is selected and an object assigned
> > from it is returned.
> > 
> > But that is theory and i have not deeply analyzed how the SLAB works internally,
> > so i can be wrong :)
> > 
> > You mentioned 128 objects per one slab in the kmalloc-32 slab-cache. But all of
> > them follows each other, i mean it is sequential and is like regular array. In
> 
> Yes, for these 128 objects it is sequential. But the next 128 could be on
> some other page is what I was saying  And we are allocating 10s of 1000s of
> objects in this test.  (I believe pages are sequential only per slab and not
> for a different slab within same cache).
> 
> > that sense freeing can be beneficial because when an access is done to any object
> > whole CPU cache-line is fetched(if it was not before), usually it is 64K.
> 
> You mean size of the whole L1 cache right? cachelines are in the order of bytes.
> 
> > That is what i meant "locality". In order to "break it" i meant to allocate from
> > different slabs to see how kfree_slub() behaves in that sense, what is more real
> > scenario and workload, i think.
> 
> Ok, agreed.
> (BTW I do agree your patch is beneficial, just wanted to get the slab
> discussion right).

Thank you both!

Then I should be looking for an updated version of the patch with an upgraded
commit log?  Or is there more investigation/testing/review in process?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-17 21:37             ` Paul E. McKenney
@ 2020-01-17 21:59               ` Joel Fernandes
  2020-01-19 13:03                 ` Uladzislau Rezki
  0 siblings, 1 reply; 18+ messages in thread
From: Joel Fernandes @ 2020-01-17 21:59 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki, LKML, RCU, Steven Rostedt, Oleksiy Avramchenko

On Fri, Jan 17, 2020 at 01:37:21PM -0800, Paul E. McKenney wrote:
> On Fri, Jan 17, 2020 at 01:57:32PM -0500, Joel Fernandes wrote:
> > On Fri, Jan 17, 2020 at 06:52:17PM +0100, Uladzislau Rezki wrote:
> > > > > > > But rcuperf uses a single block size, which turns into kfree_bulk() using
> > > > > > > a single slab, which results in good locality of reference.  So I have to
> > > > > > 
> > > > > > You meant a "single cache" category when you say "single slab"? Just to
> > > > > > mention, the number of slabs (in a single cache) when a large number of
> > > > > > objects are allocated is more than 1 (not single). With current rcuperf, I
> > > > > > see 100s of slabs (each slab being one page) in the kmalloc-32 cache. Each
> > > > > > slab contains around 128 objects of type kfree_rcu (24 byte object aligned to
> > > > > > 32-byte slab object).
> > > > > > 
> > > > > I think that is about using different slab caches to break locality. It
> > > > > makes sense, IMHO, because usually the system make use of different slabs,
> > > > > because of different object sizes. From the other hand i guess there are
> > > > > test cases when only one slab gets used.
> > > > 
> > > > I was wondering about "locality". A cache can be split into many slabs. Only
> > > > the data on a page is local (contiguous). If there are a large number of
> > > > objects, then it goes to a new slab (on the same cache). At least on the
> > > > kmalloc slabs, there is only 1 slab per page. So for example, if on
> > > > kmalloc-32 slab, there are more than 128 objects, then it goes to a different
> > > > slab / page. So how is there still locality?
> > > > 
> > > Hmm.. On a high level:
> > > 
> > > one slab cache manages a specific object size, i.e. the slab memory consists of
> > > contiguous pages(when increased probably not) of memory(4096 bytes or so) divided
> > > into equal object size. For example when kmalloc() gets called, the appropriate
> > > cache size(slab that serves only specific size) is selected and an object assigned
> > > from it is returned.
> > > 
> > > But that is theory and i have not deeply analyzed how the SLAB works internally,
> > > so i can be wrong :)
> > > 
> > > You mentioned 128 objects per one slab in the kmalloc-32 slab-cache. But all of
> > > them follows each other, i mean it is sequential and is like regular array. In
> > 
> > Yes, for these 128 objects it is sequential. But the next 128 could be on
> > some other page is what I was saying  And we are allocating 10s of 1000s of
> > objects in this test.  (I believe pages are sequential only per slab and not
> > for a different slab within same cache).
> > 
> > > that sense freeing can be beneficial because when an access is done to any object
> > > whole CPU cache-line is fetched(if it was not before), usually it is 64K.
> > 
> > You mean size of the whole L1 cache right? cachelines are in the order of bytes.
> > 
> > > That is what i meant "locality". In order to "break it" i meant to allocate from
> > > different slabs to see how kfree_slub() behaves in that sense, what is more real
> > > scenario and workload, i think.
> > 
> > Ok, agreed.
> > (BTW I do agree your patch is beneficial, just wanted to get the slab
> > discussion right).
> 
> Thank you both!
> 
> Then I should be looking for an updated version of the patch with an upgraded
> commit log?  Or is there more investigation/testing/review in process?
> 

From my side the review is complete. I believe he will repost with
debugobjects fix and we should be good.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2020-01-17 21:59               ` Joel Fernandes
@ 2020-01-19 13:03                 ` Uladzislau Rezki
  0 siblings, 0 replies; 18+ messages in thread
From: Uladzislau Rezki @ 2020-01-19 13:03 UTC (permalink / raw)
  To: Joel Fernandes, Paul E. McKenney
  Cc: Paul E. McKenney, Uladzislau Rezki, LKML, RCU, Steven Rostedt,
	Oleksiy Avramchenko

Hello, Paul, Joel.

> > 
> > Thank you both!
> > 
> > Then I should be looking for an updated version of the patch with an upgraded
> > commit log?  Or is there more investigation/testing/review in process?
> > 
> 
> From my side the review is complete. I believe he will repost with
> debugobjects fix and we should be good.
>
I have put the V2 under test over the weekend, so I will post it next week.
Yes, V2 will contain the debugobjects fix. Also I need to add a tracing probe,
something like:

..
trace_rcu_invoke_kfree_bulk_callback();
kfree_bulk(bhead->nr_records, bhead->records);
..

Probably it can be done as a separate patch.

Thank you.

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2019-12-21 23:21 ` Joel Fernandes
@ 2019-12-24 18:49   ` Uladzislau Rezki
  0 siblings, 0 replies; 18+ messages in thread
From: Uladzislau Rezki @ 2019-12-24 18:49 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki (Sony),
	LKML, Paul E . McKenney, RCU, Steven Rostedt,
	Oleksiy Avramchenko

Hello, Joel.

> 
> Hi Uladzislau,
> 
> Your patch is based on an older version of the kfree_rcu work. The latest
> version is in Paul's -dev branch. There is also additional work done in that
> branch as well "rcu: Add multiple in-flight batches of kfree_rcu() work" :
> https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git/commit/?h=dev&id=e38fa01b94c87dfa945afa603ed50b4f7955934b
> 
Ahh. I see there are some differences and my baseline is wrong. I will
double check and rebase on Paul's -dev branch.

>
> Could you rebase your patch on Paul's -dev branch? The branch also has an
> rcuperf patch for measuring memory footprint automatically (memory footprint
> value is printed by rcuperf). Although I'd say try to use the latest version
> of the rcuperf patch by reverting that and applying:
> https://lore.kernel.org/patchwork/patch/1170895/ . I can then add your
> Tested-by tag to any future postings of the patch for rcuperf as well!
> 
I will do that and run all tests based on the latest code base.

Thanks for your comments.

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
  2019-12-20 12:56 Uladzislau Rezki (Sony)
@ 2019-12-21 23:21 ` Joel Fernandes
  2019-12-24 18:49   ` Uladzislau Rezki
  0 siblings, 1 reply; 18+ messages in thread
From: Joel Fernandes @ 2019-12-21 23:21 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, Paul E . McKenney, RCU, Steven Rostedt, Oleksiy Avramchenko

On Fri, Dec 20, 2019 at 01:56:24PM +0100, Uladzislau Rezki (Sony) wrote:
> kfree_rcu() logic can be improved further by using kfree_bulk()
> interface along with "basic batching support" introduced earlier.
> 
> The are at least two advantages of using "bulk" interface:
> - in case of large number of kfree_rcu() requests kfree_bulk()
>   reduces the per-object overhead caused by calling kfree()
>   per-object.
> 
> - reduces the number of cache-misses due to "pointer chasing"
>   between objects which can be far spread between each other.
> 
> This approach defines a new kfree_rcu_bulk_data structure that
> stores pointers in an array with a specific size. Number of
> entries in that array depends on PAGE_SIZE, i.e. it is based
> on the fact that the size of kfree_rcu_bulk_data should not
> exceed one page therefore there is such dependency.
> 
> Since it deals with "block-chain" technique there is an extra
> need in dynamic allocation when a new block is required. Memory
> is allocated with GFP_NOWAIT | __GFP_NOWARN flags, i.e. that
> allows to skip direct reclaim under low memory condition to
> prevent stalling and fail silently under high memory pressure.
> 
> The "emergency path" gets maintained when a system is run out
> of memory. In that case objects are linked into regular list
> and that is it.
> 
> In order to evaluate it, the "rcuperf" was run to analyze how
> much memory is consumed and what is kfree_bulk() throughput.
> 
> Testing on the Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz 12xCPUs
> with below parameters:
> 
> CONFIG_SLAB=y
> kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1
> 
> Total time taken by all kfree'ers: 56828146341 ns, loops: 200000, batches: 2096
> Total time taken by all kfree'ers: 57329844331 ns, loops: 200000, batches: 2379
> 
> Total time taken by all kfree'ers: 45498404821 ns, loops: 200000, batches: 2271
> Total time taken by all kfree'ers: 45313811813 ns, loops: 200000, batches: 2263
> 
> rcuperf shows approximately ~21% better throughput(Total time)
> in case of using "bulk" interface. The "drain logic" or its RCU
> callback does the work faster that leads to better throughput.
> 
> During the test an average memory usage(see below run_2) is ~469MB
> with "Default" configuration and ~399MB in the "Bulk interface" case.
> 
> See below detailed plots of three run:
> 
> ftp://vps418301.ovh.net/incoming/rcuperf_mem_usage_run_0.png
> ftp://vps418301.ovh.net/incoming/rcuperf_mem_usage_run_1.png
> ftp://vps418301.ovh.net/incoming/rcuperf_mem_usage_run_2.png

Hi Uladzislau,

Your patch is based on an older version of the kfree_rcu work. The latest
version is in Paul's -dev branch. There is also additional work done in that
branch as well "rcu: Add multiple in-flight batches of kfree_rcu() work" :
https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git/commit/?h=dev&id=e38fa01b94c87dfa945afa603ed50b4f7955934b

Could you rebase your patch on Paul's -dev branch? The branch also has an
rcuperf patch for measuring memory footprint automatically (memory footprint
value is printed by rcuperf). Although I'd say try to use the latest version
of the rcuperf patch by reverting that and applying:
https://lore.kernel.org/patchwork/patch/1170895/ . I can then add your
Tested-by tag to any future postings of the patch for rcuperf as well!

thanks,

 - Joel


> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  kernel/rcu/tree.c | 123 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 103 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index d8e250c8a48f..942a1beb06bb 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2655,6 +2655,28 @@ EXPORT_SYMBOL_GPL(call_rcu);
>  /* Maximum number of jiffies to wait before draining a batch. */
>  #define KFREE_DRAIN_JIFFIES (HZ / 50)
>  
> +/*
> + * This macro defines how many entries the "records" array
> + * will contain. It is based on the fact that the size of
> + * kfree_rcu_bulk_data structure should not exceed one page
> + * therefore there is a dependency on PAGE_SIZE.
> + *
> + * To be more specific it is set to half of the PAGE_SIZE.
> + * For example if the PAGE_SIZE is 4096, the record size
> + * is 8, the structure size becomes 2048 thus number of
> + * entries are 254.
> + *
> + * We also can reserve exactly one page for that purpose
> + * and switch to using directly "page allocator" instead.
> + */
> +#define KFREE_BULK_MAX_ENTR (((PAGE_SIZE / sizeof(void *)) >> 1) - 2)
> +
> +struct kfree_rcu_bulk_data {
> +	unsigned long nr_records;
> +	void *records[KFREE_BULK_MAX_ENTR];
> +	struct kfree_rcu_bulk_data *next;
> +};
> +
>  /*
>   * Maximum number of kfree(s) to batch, if this limit is hit then the batch of
>   * kfree(s) is queued for freeing after a grace period, right away.
> @@ -2666,21 +2688,40 @@ struct kfree_rcu_cpu {
>  	struct rcu_work rcu_work;
>  
>  	/* The list of objects being queued in a batch but are not yet
> -	 * scheduled to be freed.
> +	 * scheduled to be freed. For emergency path only.
>  	 */
>  	struct rcu_head *head;
>  
>  	/* The list of objects that have now left ->head and are queued for
> -	 * freeing after a grace period.
> +	 * freeing after a grace period. For emergency path only.
>  	 */
>  	struct rcu_head *head_free;
>  
> +	/*
> +	 * The bulk list that keeps pointers in the array of
> +	 * specific size for later take over to bhead_free.
> +	 */
> +	struct kfree_rcu_bulk_data *bhead;
> +
> +	/*
> +	 * The bulk list that is detached from the bhead to
> +	 * perform draining using kfree_bulk() interface.
> +	 */
> +	struct kfree_rcu_bulk_data *bhead_free;
> +
> +	/*
> +	 * Keeps at most one object for late reuse.
> +	 */
> +	struct kfree_rcu_bulk_data *bcached;
> +
>  	/* Protect concurrent access to this structure. */
>  	spinlock_t lock;
>  
> -	/* The delayed work that flushes ->head to ->head_free incase ->head
> -	 * within KFREE_DRAIN_JIFFIES. In case flushing cannot be done if RCU
> -	 * is busy, ->head just continues to grow and we retry flushing later.
> +	/*
> +	 * The delayed work that flushes ->bhead/head to ->bhead_free/head_free
> +	 * incase ->bhead/head within KFREE_DRAIN_JIFFIES. In case flushing cannot
> +	 * be done if RCU is busy, ->bhead/head just continues to grow and we retry
> +	 * flushing later.
>  	 */
>  	struct delayed_work monitor_work;
>  	bool monitor_todo;      /* Is a delayed work pending execution? */
> @@ -2690,27 +2731,44 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
>  
>  /*
>   * This function is invoked in workqueue context after a grace period.
> - * It frees all the objects queued on ->head_free.
> + * It frees all the objects queued on ->head_free or bhead_free.
>   */
>  static void kfree_rcu_work(struct work_struct *work)
>  {
>  	unsigned long flags;
>  	struct rcu_head *head, *next;
> +	struct kfree_rcu_bulk_data *bhead, *bnext;
>  	struct kfree_rcu_cpu *krcp = container_of(to_rcu_work(work),
>  											  struct kfree_rcu_cpu, rcu_work);
>  
>  	spin_lock_irqsave(&krcp->lock, flags);
>  	head = krcp->head_free;
>  	krcp->head_free = NULL;
> +	bhead = krcp->bhead_free;
> +	krcp->bhead_free = NULL;
>  	spin_unlock_irqrestore(&krcp->lock, flags);
>  
>  	/*
>  	 * The head is detached and not referenced from anywhere, so lockless
>  	 * access is Ok.
>  	 */
> +	for (; bhead; bhead = bnext) {
> +		bnext = bhead->next;
> +		kfree_bulk(bhead->nr_records, bhead->records);
> +
> +		if (cmpxchg(&krcp->bcached, NULL, bhead))
> +			kfree(bhead);
> +
> +		cond_resched_tasks_rcu_qs();
> +	}
> +
> +	/*
> +	 * Emergency case only. It can happen under low
> +	 * memory condition when kmalloc gets failed, so
> +	 * the "bulk" path can not be temporary maintained.
> +	 */
>  	for (; head; head = next) {
>  		next = head->next;
> -		/* Could be possible to optimize with kfree_bulk in future */
>  		__rcu_reclaim(rcu_state.name, head);
>  		cond_resched_tasks_rcu_qs();
>  	}
> @@ -2730,11 +2788,15 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
>  	 * another one, just refuse the optimization and it will be retried
>  	 * again in KFREE_DRAIN_JIFFIES time.
>  	 */
> -	if (krcp->head_free)
> +	if (krcp->bhead_free || krcp->head_free)
>  		return false;
>  
>  	krcp->head_free = krcp->head;
>  	krcp->head = NULL;
> +
> +	krcp->bhead_free = krcp->bhead;
> +	krcp->bhead = NULL;
> +
>  	INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work);
>  	queue_rcu_work(system_wq, &krcp->rcu_work);
>  
> @@ -2744,8 +2806,9 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
>  static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
>  										  unsigned long flags)
>  {
> -	/* Flush ->head to ->head_free, all objects on ->head_free will be
> -	 * kfree'd after a grace period.
> +	/*
> +	 * Flush ->bhead/head to ->bhead_free/head_free, so all objects
> +	 * on ->bhead_free/head_free will be freed after a grace period.
>  	 */
>  	if (queue_kfree_rcu_work(krcp)) {
>  		/* Success! Our job is done here. */
> @@ -2763,7 +2826,7 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
>  
>  /*
>   * This function is invoked after the KFREE_DRAIN_JIFFIES timeout has elapsed,
> - * and it drains the specified kfree_rcu_cpu structure's ->head list.
> + * and it drains the specified kfree_rcu_cpu structure's ->bhead/head list.
>   */
>  static void kfree_rcu_monitor(struct work_struct *work)
>  {
> @@ -2795,17 +2858,15 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
>   * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
>   * will be kfree'd in workqueue context. This allows us to:
>   *
> - * 1. Batch requests together to reduce the number of grace periods during
> + * Batch requests together to reduce the number of grace periods during
>   * heavy kfree_rcu() load.
> - *
> - * 2. In the future, makes it possible to use kfree_bulk() on a large number of
> - * kfree_rcu() requests thus reducing the per-object overhead of kfree() and
> - * also reducing cache misses.
>   */
>  void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  {
>  	unsigned long flags;
>  	struct kfree_rcu_cpu *krcp;
> +	struct kfree_rcu_bulk_data *bnode;
> +	bool maintain_bulk_list = true;
>  
>  	/* kfree_call_rcu() batching requires timers to be up. If the scheduler
>  	 * is not yet up, just skip batching and do the non-batched version.
> @@ -2813,15 +2874,37 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
>  		return kfree_call_rcu_nobatch(head, func);
>  
> -	head->func = func;
> -
>  	local_irq_save(flags);  /* For safely calling this_cpu_ptr(). */
>  	krcp = this_cpu_ptr(&krc);
>  	spin_lock(&krcp->lock);
>  
> +	/* Check if we need a new block. */
> +	if (!krcp->bhead ||
> +			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
> +		bnode = xchg(&krcp->bcached, NULL);
> +		if (!bnode)
> +			bnode = kmalloc(sizeof(struct kfree_rcu_bulk_data),
> +				GFP_NOWAIT | __GFP_NOWARN);
> +
> +		if (likely(bnode)) {
> +			bnode->nr_records = 0;
> +			bnode->next = krcp->bhead;
> +			krcp->bhead = bnode;
> +		} else {
> +			/* If gets failed, maintain the list instead. */
> +			maintain_bulk_list = false;
> +		}
> +	}
> +
>  	/* Queue the kfree but don't yet schedule the batch. */
> -	head->next = krcp->head;
> -	krcp->head = head;
> +	if (likely(maintain_bulk_list)) {
> +		krcp->bhead->records[krcp->bhead->nr_records++] =
> +			(void *) head - (unsigned long) func;
> +	} else {
> +		head->func = func;
> +		head->next = krcp->head;
> +		krcp->head = head;
> +	}
>  
>  	/* Schedule monitor for timely drain after KFREE_DRAIN_JIFFIES. */
>  	if (!xchg(&krcp->monitor_todo, true))
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu()
@ 2019-12-20 12:56 Uladzislau Rezki (Sony)
  2019-12-21 23:21 ` Joel Fernandes
  0 siblings, 1 reply; 18+ messages in thread
From: Uladzislau Rezki (Sony) @ 2019-12-20 12:56 UTC (permalink / raw)
  To: LKML
  Cc: Paul E . McKenney, Joel Fernandes, RCU, Uladzislau Rezki,
	Steven Rostedt, Oleksiy Avramchenko

kfree_rcu() logic can be improved further by using kfree_bulk()
interface along with "basic batching support" introduced earlier.

There are at least two advantages of using the "bulk" interface:
- in case of large number of kfree_rcu() requests kfree_bulk()
  reduces the per-object overhead caused by calling kfree()
  per-object.

- reduces the number of cache-misses due to "pointer chasing"
  between objects which can be far spread between each other.

This approach defines a new kfree_rcu_bulk_data structure that
stores pointers in an array with a specific size. Number of
entries in that array depends on PAGE_SIZE, i.e. it is based
on the fact that the size of kfree_rcu_bulk_data should not
exceed one page therefore there is such dependency.

Since it uses a "block-chain" technique, a dynamic allocation is
needed whenever a new block is required. Memory is allocated with
the GFP_NOWAIT | __GFP_NOWARN flags, i.e. direct reclaim is skipped
under low-memory conditions to prevent stalling, and the allocation
fails silently under high memory pressure.

The "emergency path" is used when the system runs out of memory.
In that case objects are linked into the regular list and that
is it.

In order to evaluate it, the "rcuperf" was run to analyze how
much memory is consumed and what is kfree_bulk() throughput.

Testing on the Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz 12xCPUs
with below parameters:

CONFIG_SLAB=y
kfree_loops=200000 kfree_alloc_num=1000 kfree_rcu_test=1

Total time taken by all kfree'ers: 56828146341 ns, loops: 200000, batches: 2096
Total time taken by all kfree'ers: 57329844331 ns, loops: 200000, batches: 2379

Total time taken by all kfree'ers: 45498404821 ns, loops: 200000, batches: 2271
Total time taken by all kfree'ers: 45313811813 ns, loops: 200000, batches: 2263

rcuperf shows approximately 21% better throughput (Total time)
when using the "bulk" interface. The "drain logic" or its RCU
callback does the work faster, which leads to better throughput.

During the test the average memory usage (see run_2 below) is ~469MB
with the "Default" configuration and ~399MB in the "Bulk interface" case.

See below detailed plots of three runs:

ftp://vps418301.ovh.net/incoming/rcuperf_mem_usage_run_0.png
ftp://vps418301.ovh.net/incoming/rcuperf_mem_usage_run_1.png
ftp://vps418301.ovh.net/incoming/rcuperf_mem_usage_run_2.png

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/rcu/tree.c | 123 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 103 insertions(+), 20 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d8e250c8a48f..942a1beb06bb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2655,6 +2655,28 @@ EXPORT_SYMBOL_GPL(call_rcu);
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
 
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kfree_rcu_bulk_data structure should not exceed one page
+ * therefore there is a dependency on PAGE_SIZE.
+ *
+ * To be more specific it is set to half of the PAGE_SIZE.
+ * For example if the PAGE_SIZE is 4096, the record size
+ * is 8, the structure size becomes 2048 thus number of
+ * entries are 254.
+ *
+ * We also can reserve exactly one page for that purpose
+ * and switch to using directly "page allocator" instead.
+ */
+#define KFREE_BULK_MAX_ENTR (((PAGE_SIZE / sizeof(void *)) >> 1) - 2)
+
+struct kfree_rcu_bulk_data {
+	unsigned long nr_records;
+	void *records[KFREE_BULK_MAX_ENTR];
+	struct kfree_rcu_bulk_data *next;
+};
+
 /*
  * Maximum number of kfree(s) to batch, if this limit is hit then the batch of
  * kfree(s) is queued for freeing after a grace period, right away.
@@ -2666,21 +2688,40 @@ struct kfree_rcu_cpu {
 	struct rcu_work rcu_work;
 
 	/* The list of objects being queued in a batch but are not yet
-	 * scheduled to be freed.
+	 * scheduled to be freed. For emergency path only.
 	 */
 	struct rcu_head *head;
 
 	/* The list of objects that have now left ->head and are queued for
-	 * freeing after a grace period.
+	 * freeing after a grace period. For emergency path only.
 	 */
 	struct rcu_head *head_free;
 
+	/*
+	 * The bulk list that keeps pointers in the array of
+	 * specific size for later take over to bhead_free.
+	 */
+	struct kfree_rcu_bulk_data *bhead;
+
+	/*
+	 * The bulk list that is detached from the bhead to
+	 * perform draining using kfree_bulk() interface.
+	 */
+	struct kfree_rcu_bulk_data *bhead_free;
+
+	/*
+	 * Keeps at most one object for late reuse.
+	 */
+	struct kfree_rcu_bulk_data *bcached;
+
 	/* Protect concurrent access to this structure. */
 	spinlock_t lock;
 
-	/* The delayed work that flushes ->head to ->head_free incase ->head
-	 * within KFREE_DRAIN_JIFFIES. In case flushing cannot be done if RCU
-	 * is busy, ->head just continues to grow and we retry flushing later.
+	/*
+	 * The delayed work that flushes ->bhead/head to ->bhead_free/head_free
+	 * incase ->bhead/head within KFREE_DRAIN_JIFFIES. In case flushing cannot
+	 * be done if RCU is busy, ->bhead/head just continues to grow and we retry
+	 * flushing later.
 	 */
 	struct delayed_work monitor_work;
 	bool monitor_todo;      /* Is a delayed work pending execution? */
@@ -2690,27 +2731,44 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
 
 /*
  * This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->head_free.
+ * It frees all the objects queued on ->head_free or bhead_free.
  */
 static void kfree_rcu_work(struct work_struct *work)
 {
 	unsigned long flags;
 	struct rcu_head *head, *next;
+	struct kfree_rcu_bulk_data *bhead, *bnext;
 	struct kfree_rcu_cpu *krcp = container_of(to_rcu_work(work),
 											  struct kfree_rcu_cpu, rcu_work);
 
 	spin_lock_irqsave(&krcp->lock, flags);
 	head = krcp->head_free;
 	krcp->head_free = NULL;
+	bhead = krcp->bhead_free;
+	krcp->bhead_free = NULL;
 	spin_unlock_irqrestore(&krcp->lock, flags);
 
 	/*
 	 * The head is detached and not referenced from anywhere, so lockless
 	 * access is Ok.
 	 */
+	for (; bhead; bhead = bnext) {
+		bnext = bhead->next;
+		kfree_bulk(bhead->nr_records, bhead->records);
+
+		if (cmpxchg(&krcp->bcached, NULL, bhead))
+			kfree(bhead);
+
+		cond_resched_tasks_rcu_qs();
+	}
+
+	/*
+	 * Emergency case only. It can happen under low
+	 * memory condition when kmalloc gets failed, so
+	 * the "bulk" path can not be temporary maintained.
+	 */
 	for (; head; head = next) {
 		next = head->next;
-		/* Could be possible to optimize with kfree_bulk in future */
 		__rcu_reclaim(rcu_state.name, head);
 		cond_resched_tasks_rcu_qs();
 	}
@@ -2730,11 +2788,15 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 	 * another one, just refuse the optimization and it will be retried
 	 * again in KFREE_DRAIN_JIFFIES time.
 	 */
-	if (krcp->head_free)
+	if (krcp->bhead_free || krcp->head_free)
 		return false;
 
 	krcp->head_free = krcp->head;
 	krcp->head = NULL;
+
+	krcp->bhead_free = krcp->bhead;
+	krcp->bhead = NULL;
+
 	INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work);
 	queue_rcu_work(system_wq, &krcp->rcu_work);
 
@@ -2744,8 +2806,9 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
 										  unsigned long flags)
 {
-	/* Flush ->head to ->head_free, all objects on ->head_free will be
-	 * kfree'd after a grace period.
+	/*
+	 * Flush ->bhead/head to ->bhead_free/head_free, so all objects
+	 * on ->bhead_free/head_free will be freed after a grace period.
 	 */
 	if (queue_kfree_rcu_work(krcp)) {
 		/* Success! Our job is done here. */
@@ -2763,7 +2826,7 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
 
 /*
  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout has elapsed,
- * and it drains the specified kfree_rcu_cpu structure's ->head list.
+ * and it drains the specified kfree_rcu_cpu structure's ->bhead/head list.
  */
 static void kfree_rcu_monitor(struct work_struct *work)
 {
@@ -2795,17 +2858,15 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
  * will be kfree'd in workqueue context. This allows us to:
  *
- * 1. Batch requests together to reduce the number of grace periods during
+ * Batch requests together to reduce the number of grace periods during
  * heavy kfree_rcu() load.
- *
- * 2. In the future, makes it possible to use kfree_bulk() on a large number of
- * kfree_rcu() requests thus reducing the per-object overhead of kfree() and
- * also reducing cache misses.
  */
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
+	struct kfree_rcu_bulk_data *bnode;
+	bool maintain_bulk_list = true;
 
 	/* kfree_call_rcu() batching requires timers to be up. If the scheduler
 	 * is not yet up, just skip batching and do the non-batched version.
@@ -2813,15 +2874,37 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
 		return kfree_call_rcu_nobatch(head, func);
 
-	head->func = func;
-
 	local_irq_save(flags);  /* For safely calling this_cpu_ptr(). */
 	krcp = this_cpu_ptr(&krc);
 	spin_lock(&krcp->lock);
 
+	/* Check if we need a new block. */
+	if (!krcp->bhead ||
+			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
+		bnode = xchg(&krcp->bcached, NULL);
+		if (!bnode)
+			bnode = kmalloc(sizeof(struct kfree_rcu_bulk_data),
+				GFP_NOWAIT | __GFP_NOWARN);
+
+		if (likely(bnode)) {
+			bnode->nr_records = 0;
+			bnode->next = krcp->bhead;
+			krcp->bhead = bnode;
+		} else {
+			/* If gets failed, maintain the list instead. */
+			maintain_bulk_list = false;
+		}
+	}
+
 	/* Queue the kfree but don't yet schedule the batch. */
-	head->next = krcp->head;
-	krcp->head = head;
+	if (likely(maintain_bulk_list)) {
+		krcp->bhead->records[krcp->bhead->nr_records++] =
+			(void *) head - (unsigned long) func;
+	} else {
+		head->func = func;
+		head->next = krcp->head;
+		krcp->head = head;
+	}
 
 	/* Schedule monitor for timely drain after KFREE_DRAIN_JIFFIES. */
 	if (!xchg(&krcp->monitor_todo, true))
-- 
2.20.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, back to index

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-31 12:22 [PATCH 1/1] rcu/tree: support kfree_bulk() interface in kfree_rcu() Uladzislau Rezki (Sony)
2020-01-13 19:03 ` Paul E. McKenney
2020-01-14 16:49   ` Joel Fernandes
2020-01-15 13:14     ` Uladzislau Rezki
2020-01-15 22:53       ` Joel Fernandes
2020-01-17 17:52         ` Uladzislau Rezki
2020-01-17 18:57           ` Joel Fernandes
2020-01-17 21:37             ` Paul E. McKenney
2020-01-17 21:59               ` Joel Fernandes
2020-01-19 13:03                 ` Uladzislau Rezki
2020-01-16  1:14 ` Joel Fernandes
2020-01-16  2:41   ` Paul E. McKenney
2020-01-16 17:27     ` Uladzislau Rezki
2020-01-16 17:44       ` Paul E. McKenney
2020-01-16 17:24   ` Uladzislau Rezki
  -- strict thread matches above, loose matches on Subject: below --
2019-12-20 12:56 Uladzislau Rezki (Sony)
2019-12-21 23:21 ` Joel Fernandes
2019-12-24 18:49   ` Uladzislau Rezki

RCU Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/rcu/0 rcu/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 rcu rcu/ https://lore.kernel.org/rcu \
		rcu@vger.kernel.org
	public-inbox-index rcu

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.rcu


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git