linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/1] rcu/tree: add emergency pool for headless case
@ 2020-04-03 17:30 Uladzislau Rezki (Sony)
  2020-04-03 18:16 ` Matthew Wilcox
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-04-03 17:30 UTC (permalink / raw)
  To: LKML, Paul E . McKenney, Joel Fernandes
  Cc: RCU, linux-mm, Andrew Morton, Uladzislau Rezki, Steven Rostedt,
	Oleksiy Avramchenko

Maintain an emergency pool for each CPU with some
extra objects. There is a read-only sysfs attribute
named "rcu_nr_emergency_objs". It reflects
the size of the pool. For now the default value
is 3.

The pool is populated when a low memory condition is
detected. Please note it is only for the headless
case: when the regular SLAB is not able to serve
a request, the pool is used.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/rcu/tree.c | 133 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 97 insertions(+), 36 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5e26145e9ead..f9f1f935ab0b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -114,6 +114,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
 int rcu_kfree_nowarn;
 module_param(rcu_kfree_nowarn, int, 0444);
 
+/*
+ * For headless variant. Under memory pressure an
+ * emergency pool can be used if the regular SLAB
+ * is not able to serve some memory for us.
+ */
+int rcu_nr_emergency_objs = 3;
+module_param(rcu_nr_emergency_objs, int, 0444);
+
 /* Number of rcu_nodes at specified level. */
 int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
@@ -2877,6 +2885,12 @@ struct kfree_rcu_cpu {
 	bool initialized;
 	// Number of objects for which GP not started
 	int count;
+
+	/*
+	 * Reserved emergency pool for headless variant.
+	 */
+	int nr_emergency;
+	void **emergency;
 };
 
 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
@@ -2892,6 +2906,27 @@ debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
 #endif
 }
 
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+	struct kfree_rcu_cpu *krcp;
+
+	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
+	krcp = this_cpu_ptr(&krc);
+	if (likely(krcp->initialized))
+		spin_lock(&krcp->lock);
+
+	return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+	if (likely(krcp->initialized))
+		spin_unlock(&krcp->lock);
+	local_irq_restore(flags);
+}
+
 /*
  * This function is invoked in workqueue context after a grace period.
  * It frees all the objects queued on ->bhead_free or ->head_free.
@@ -2974,6 +3009,7 @@ static void kfree_rcu_work(struct work_struct *work)
 	 */
 	for (; head; head = next) {
 		unsigned long offset = (unsigned long)head->func;
+		unsigned long flags;
 		bool headless;
 		void *ptr;
 
@@ -2991,10 +3027,23 @@ static void kfree_rcu_work(struct work_struct *work)
 		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
 
 		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) {
-			if (headless)
+			if (headless) {
 				kvfree((void *) *((unsigned long *) ptr));
 
-			kvfree(ptr);
+				krcp = krc_this_cpu_lock(&flags);
+				if (krcp->emergency) {
+					if (krcp->nr_emergency < rcu_nr_emergency_objs) {
+						krcp->emergency[krcp->nr_emergency++] = ptr;
+
+						/* Bypass freeing of it, it is in emergency pool. */
+						ptr = NULL;
+					}
+				}
+				krc_this_cpu_unlock(krcp, flags);
+			}
+
+			if (ptr)
+				kvfree(ptr);
 		}
 
 		rcu_lock_release(&rcu_callback_map);
@@ -3144,40 +3193,26 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 	return true;
 }
 
-static inline struct rcu_head *attach_rcu_head_to_object(void *obj)
+static inline struct rcu_head *
+set_ptr_in_rcu_head_obj(void *ptr, unsigned long *rho)
+{
+	rho[0] = (unsigned long) ptr;
+	return ((struct rcu_head *) ++rho);
+}
+
+static inline struct rcu_head *
+alloc_rcu_head_obj(void *ptr)
 {
-	unsigned long *ptr;
+	unsigned long *rho;
 
 	/* Try hard to get the memory. */
-	ptr = kmalloc(sizeof(unsigned long *) +
+	rho = kmalloc(sizeof(unsigned long *) +
 		sizeof(struct rcu_head), GFP_KERNEL |
 			__GFP_ATOMIC | __GFP_HIGH | __GFP_RETRY_MAYFAIL);
-	if (!ptr)
+	if (!rho)
 		return NULL;
 
-	ptr[0] = (unsigned long) obj;
-	return ((struct rcu_head *) ++ptr);
-}
-
-static inline struct kfree_rcu_cpu *
-krc_this_cpu_lock(unsigned long *flags)
-{
-	struct kfree_rcu_cpu *krcp;
-
-	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
-	krcp = this_cpu_ptr(&krc);
-	if (likely(krcp->initialized))
-		spin_lock(&krcp->lock);
-
-	return krcp;
-}
-
-static inline void
-krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
-{
-	if (likely(krcp->initialized))
-		spin_unlock(&krcp->lock);
-	local_irq_restore(flags);
+	return set_ptr_in_rcu_head_obj(ptr, rho);
 }
 
 /*
@@ -3237,15 +3272,31 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	if (!success) {
 		/* Is headless object? */
 		if (head == NULL) {
-			/* Drop the lock. */
+			/*
+			 * Drop the lock to use more permissive
+			 * parameters, after that take it back.
+			 */
 			krc_this_cpu_unlock(krcp, flags);
+			head = alloc_rcu_head_obj(ptr);
+			krcp = krc_this_cpu_lock(&flags);
 
-			head = attach_rcu_head_to_object(ptr);
-			if (head == NULL)
-				goto inline_return;
+			/*
+			 * Use emergency pool if still fails.
+			 */
+			if (head == NULL) {
+				if (!krcp->nr_emergency)
+					goto unlock_return;
 
-			/* Take it back. */
-			krcp = krc_this_cpu_lock(&flags);
+				head = set_ptr_in_rcu_head_obj(ptr,
+					krcp->emergency[--krcp->nr_emergency]);
+
+				/*
+				 * We do not need to do it. But just in case
+				 * let's set the pulled slot to NULL to avoid
+				 * magic issues.
+				 */
+				krcp->emergency[krcp->nr_emergency] = NULL;
+			}
 
 			/*
 			 * Tag the headless object. Such objects have a back-pointer
@@ -3282,7 +3333,6 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 unlock_return:
 	krc_this_cpu_unlock(krcp, flags);
 
-inline_return:
 	/*
 	 * High memory pressure, so inline kvfree() after
 	 * synchronize_rcu(). We can do it from might_sleep()
@@ -4272,6 +4322,17 @@ static void __init kfree_rcu_batch_init(void)
 		}
 
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+
+		/*
+		 * The pool will be populated when a low memory condition
+		 * is detected. Therefore we do not fill it in here.
+		 */
+		krcp->emergency = kmalloc_array(rcu_nr_emergency_objs,
+			sizeof(void *), GFP_NOWAIT);
+
+		if (!krcp->emergency)
+			pr_err("Failed to create emergency pool for %d CPU!\n", cpu);
+
 		krcp->initialized = true;
 	}
 	if (register_shrinker(&kfree_rcu_shrinker))
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-03 17:30 [PATCH 1/1] rcu/tree: add emergency pool for headless case Uladzislau Rezki (Sony)
@ 2020-04-03 18:16 ` Matthew Wilcox
  2020-04-04 19:09   ` Uladzislau Rezki
  2020-04-03 19:14 ` Paul E. McKenney
  2020-04-04 19:51 ` Joel Fernandes
  2 siblings, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2020-04-03 18:16 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, Paul E . McKenney, Joel Fernandes, RCU, linux-mm,
	Andrew Morton, Steven Rostedt, Oleksiy Avramchenko

On Fri, Apr 03, 2020 at 07:30:51PM +0200, Uladzislau Rezki (Sony) wrote:
> @@ -2877,6 +2885,12 @@ struct kfree_rcu_cpu {
>  	bool initialized;
>  	// Number of objects for which GP not started
>  	int count;
> +
> +	/*
> +	 * Reserved emergency pool for headless variant.
> +	 */
> +	int nr_emergency;
> +	void **emergency;

This is a pretty expensive way to maintain an emergency pool.

Try something like this ...

struct emergency_pool_object {
	union {
		struct whatever foo;
		struct {
			int remaining;
			void *next;
		};
	};
};

struct kfree_rcu_cpu {
	...
	struct emergency_pool_object *epo;
};

struct whatever *get_emergency_object(struct kfree_rcu_cpu *krc)
{
	struct emergency_pool_object *epo = krc->epo;
	if (epo)
		krc->epo = epo->next;
	return &epo->foo;
}

void alloc_emergency_objects(struct kfree_rcu_cpu *krc, int n)
{
	int i = 0;

	if (krc->epo)
		i = krc->epo->remaining;

	while (++i < n) {
		struct emergency_pool_object *epo = kmalloc(sizeof(epo), GFP);
		epo->remaining = i;
		epo->next = krc->epo;
		krc->epo = epo;
	}
}


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-03 17:30 [PATCH 1/1] rcu/tree: add emergency pool for headless case Uladzislau Rezki (Sony)
  2020-04-03 18:16 ` Matthew Wilcox
@ 2020-04-03 19:14 ` Paul E. McKenney
  2020-04-04 19:10   ` Uladzislau Rezki
  2020-04-04 19:51 ` Joel Fernandes
  2 siblings, 1 reply; 14+ messages in thread
From: Paul E. McKenney @ 2020-04-03 19:14 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, Joel Fernandes, RCU, linux-mm, Andrew Morton,
	Steven Rostedt, Oleksiy Avramchenko

On Fri, Apr 03, 2020 at 07:30:51PM +0200, Uladzislau Rezki (Sony) wrote:
> Maintain an emergency pool for each CPU with some
> extra objects. There is read-only sysfs attribute,
> the name is "rcu_nr_emergency_objs". It reflects
> the size of the pool. As for now the default value
> is 3.
> 
> The pool is populated when low memory condition is
> detected. Please note it is only for headless case
> it means when the regular SLAB is not able to serve
> any request, the pool is used.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  kernel/rcu/tree.c | 133 +++++++++++++++++++++++++++++++++-------------
>  1 file changed, 97 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 5e26145e9ead..f9f1f935ab0b 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -114,6 +114,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
>  int rcu_kfree_nowarn;
>  module_param(rcu_kfree_nowarn, int, 0444);
>  
> +/*
> + * For headless variant. Under memory pressure an
> + * emergency pool can be used if the regular SLAB
> + * is not able to serve some memory for us.
> + */
> +int rcu_nr_emergency_objs = 3;
> +module_param(rcu_nr_emergency_objs, int, 0444);

Please document this in Documentation/admin-guide/kernel-parameters.txt.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-03 18:16 ` Matthew Wilcox
@ 2020-04-04 19:09   ` Uladzislau Rezki
  0 siblings, 0 replies; 14+ messages in thread
From: Uladzislau Rezki @ 2020-04-04 19:09 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Uladzislau Rezki (Sony),
	LKML, Paul E . McKenney, Joel Fernandes, RCU, linux-mm,
	Andrew Morton, Steven Rostedt, Oleksiy Avramchenko

Hello, Matthew.

> On Fri, Apr 03, 2020 at 07:30:51PM +0200, Uladzislau Rezki (Sony) wrote:
> > @@ -2877,6 +2885,12 @@ struct kfree_rcu_cpu {
> >  	bool initialized;
> >  	// Number of objects for which GP not started
> >  	int count;
> > +
> > +	/*
> > +	 * Reserved emergency pool for headless variant.
> > +	 */
> > +	int nr_emergency;
> > +	void **emergency;
> 
> This is a pretty expensive way to maintain an emergency pool.
> 
Well. I do not see what is expensive there, really. But I see
some drawbacks I would like to fix. First of all, get rid of

<snip>
    krcp->emergency = kmalloc_array(rcu_nr_emergency_objs,
        sizeof(void *), GFP_NOWAIT);
<snip>

and second, as you pointed out below, to use a list instead of
an array. There are some advantages: first, there is no need for
an array bound check; second, in case of a list we can dynamically
control its length via the exposed sysfs attribute.

> Try something like this ...
> 
> struct emergency_pool_object {
> 	union {
> 		struct whatever foo;
> 		struct {
> 			int remaining;
> 			void *next;
> 		};
> 	};
> };
> 
> struct kfree_rcu_cpu {
> 	...
> 	struct emergency_pool_object *epo;
> };
> 
> struct whatever *get_emergency_object(struct kfree_rcu_cpu *krc)
> {
> 	struct emergency_pool_object *epo = krc->epo;
> 	if (epo)
> 		krc->epo = epo->next;
> 	return &epo->foo;
> }
> 
> void alloc_emergency_objects(struct kfree_rcu_cpu *krc, int n)
> {
> 	int i = 0;
> 
> 	if (krc->epo)
> 		i = krc->epo->remaining;
> 
> 	while (++i < n) {
> 		struct emergency_pool_object *epo = kmalloc(sizeof(epo), GFP);
> 		epo->remaining = i;
> 		epo->next = krc->epo;
> 		krc->epo = epo;
> 	}
> }
> 
I will upload v2. I just stash objects in the list. Something like:

<snip>
@@ -2877,6 +2888,18 @@ struct kfree_rcu_cpu {
        bool initialized;
        // Number of objects for which GP not started
        int count;
+
+       /*
+        * Reserved emergency objects for headless variant.
+        * The objects are queued into the lock-less list,
+        * the length of the list is limited therefore we
+        * also have a counter.
+        *
+        * Actually we have the room for embedding a counter
+        * into our cached object, but let's keep it simple.
+        */
+       int nr_objs_elist;
+       struct llist_head elist;
 };
...
+static inline unsigned long *
+get_emergency_object(struct kfree_rcu_cpu *krcp)
+{
+       if (!krcp->nr_objs_elist)
+               return NULL;
+
+       krcp->nr_objs_elist--;
+       return (unsigned long *)
+               llist_del_first(&krcp->elist);
+}
+
+static inline bool
+put_emergency_object(struct kfree_rcu_cpu *krcp,
+       unsigned long *obj)
+{
+       /* Check the limit. */
+       if (krcp->nr_objs_elist >= rcu_nr_emergency_objs)
+               return false;
+
+       llist_add((struct llist_node *) obj, &krcp->elist);
+       krcp->nr_objs_elist++;
+       return true;
+}
<snip>

Thanks for your comments!

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-03 19:14 ` Paul E. McKenney
@ 2020-04-04 19:10   ` Uladzislau Rezki
  0 siblings, 0 replies; 14+ messages in thread
From: Uladzislau Rezki @ 2020-04-04 19:10 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki (Sony),
	LKML, Joel Fernandes, RCU, linux-mm, Andrew Morton,
	Steven Rostedt, Oleksiy Avramchenko

On Fri, Apr 03, 2020 at 12:14:19PM -0700, Paul E. McKenney wrote:
> On Fri, Apr 03, 2020 at 07:30:51PM +0200, Uladzislau Rezki (Sony) wrote:
> > Maintain an emergency pool for each CPU with some
> > extra objects. There is read-only sysfs attribute,
> > the name is "rcu_nr_emergency_objs". It reflects
> > the size of the pool. As for now the default value
> > is 3.
> > 
> > The pool is populated when low memory condition is
> > detected. Please note it is only for headless case
> > it means when the regular SLAB is not able to serve
> > any request, the pool is used.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > ---
> >  kernel/rcu/tree.c | 133 +++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 97 insertions(+), 36 deletions(-)
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 5e26145e9ead..f9f1f935ab0b 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -114,6 +114,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
> >  int rcu_kfree_nowarn;
> >  module_param(rcu_kfree_nowarn, int, 0444);
> >  
> > +/*
> > + * For headless variant. Under memory pressure an
> > + * emergency pool can be used if the regular SLAB
> > + * is not able to serve some memory for us.
> > + */
> > +int rcu_nr_emergency_objs = 3;
> > +module_param(rcu_nr_emergency_objs, int, 0444);
> 
> Please document this in Documentation/admin-guide/kernel-parameters.txt.
> 
Will do that, Paul!

Thanks for good point :)

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-03 17:30 [PATCH 1/1] rcu/tree: add emergency pool for headless case Uladzislau Rezki (Sony)
  2020-04-03 18:16 ` Matthew Wilcox
  2020-04-03 19:14 ` Paul E. McKenney
@ 2020-04-04 19:51 ` Joel Fernandes
  2020-04-05 17:21   ` Uladzislau Rezki
  2 siblings, 1 reply; 14+ messages in thread
From: Joel Fernandes @ 2020-04-04 19:51 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, Paul E . McKenney, RCU, linux-mm, Andrew Morton,
	Steven Rostedt, Oleksiy Avramchenko

On Fri, Apr 03, 2020 at 07:30:51PM +0200, Uladzislau Rezki (Sony) wrote:
> Maintain an emergency pool for each CPU with some
> extra objects. There is read-only sysfs attribute,
> the name is "rcu_nr_emergency_objs". It reflects
> the size of the pool. As for now the default value
> is 3.
> 
> The pool is populated when low memory condition is
> detected. Please note it is only for headless case
> it means when the regular SLAB is not able to serve
> any request, the pool is used.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>

Hi Vlad,

One concern I have is this moves the problem a bit further down. My belief is
we should avoid the likelihood of even needing an rcu_head allocated for the
headless case, to begin with - than trying to do damage-control when it does
happen. The only way we would end up needing an rcu_head is if we could not
allocate an array.

So instead of adding a pool for rcu_head allocations, how do you feel about
pre-allocation of the per-cpu cache array instead, which has the same effect
as you are intending?

This has 3 benefits:
1. It scales with number of CPUs, no configuration needed.
2. It makes the first kfree_rcu() faster and less dependent on an allocation
   succeeding.
3. Much simpler code, no new structures or special handling.
4. In the future we can extend it to allocate more than 2 pages per CPU using
   the same caching mechanism.

The obvious drawback being its 2 pages per CPU but at least it scales by
number of CPUs. Something like the following (just lightly tested):

---8<-----------------------

From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Subject: [PATCH] rcu/tree: Preallocate the per-cpu cache for kfree_rcu()

In recent changes, we have made it possible to use kfree_rcu() without
embedding an rcu_head in the object being free'd. This requires dynamic
allocation. In case dynamic allocation fails due to memory pressure, we
would end up synchronously waiting for an RCU grace period thus hurting
kfree_rcu() latency.

To make this less probable, let us pre-allocate the per-cpu cache so we
depend less on the dynamic allocation succeeding. This also has the
effect of making kfree_rcu() slightly faster at run time.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 kernel/rcu/tree.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6172e6296dd7d..9fbdeb4048425 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4251,6 +4251,11 @@ static void __init kfree_rcu_batch_init(void)
 			krcp->krw_arr[i].krcp = krcp;
 		}
 
+		krcp->bkvcache[0] =  (struct kvfree_rcu_bulk_data *)
+					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+		krcp->bkvcache[1] =  (struct kvfree_rcu_bulk_data *)
+					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 		krcp->initialized = true;
 	}
-- 
2.26.0.292.g33ef6b2f38-goog


> ---
>  kernel/rcu/tree.c | 133 +++++++++++++++++++++++++++++++++-------------
>  1 file changed, 97 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 5e26145e9ead..f9f1f935ab0b 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -114,6 +114,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
>  int rcu_kfree_nowarn;
>  module_param(rcu_kfree_nowarn, int, 0444);
>  
> +/*
> + * For headless variant. Under memory pressure an
> + * emergency pool can be used if the regular SLAB
> + * is not able to serve some memory for us.
> + */
> +int rcu_nr_emergency_objs = 3;
> +module_param(rcu_nr_emergency_objs, int, 0444);
> +
>  /* Number of rcu_nodes at specified level. */
>  int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
>  int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
> @@ -2877,6 +2885,12 @@ struct kfree_rcu_cpu {
>  	bool initialized;
>  	// Number of objects for which GP not started
>  	int count;
> +
> +	/*
> +	 * Reserved emergency pool for headless variant.
> +	 */
> +	int nr_emergency;
> +	void **emergency;
>  };
>  
>  static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
> @@ -2892,6 +2906,27 @@ debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
>  #endif
>  }
>  
> +static inline struct kfree_rcu_cpu *
> +krc_this_cpu_lock(unsigned long *flags)
> +{
> +	struct kfree_rcu_cpu *krcp;
> +
> +	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
> +	krcp = this_cpu_ptr(&krc);
> +	if (likely(krcp->initialized))
> +		spin_lock(&krcp->lock);
> +
> +	return krcp;
> +}
> +
> +static inline void
> +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
> +{
> +	if (likely(krcp->initialized))
> +		spin_unlock(&krcp->lock);
> +	local_irq_restore(flags);
> +}
> +
>  /*
>   * This function is invoked in workqueue context after a grace period.
>   * It frees all the objects queued on ->bhead_free or ->head_free.
> @@ -2974,6 +3009,7 @@ static void kfree_rcu_work(struct work_struct *work)
>  	 */
>  	for (; head; head = next) {
>  		unsigned long offset = (unsigned long)head->func;
> +		unsigned long flags;
>  		bool headless;
>  		void *ptr;
>  
> @@ -2991,10 +3027,23 @@ static void kfree_rcu_work(struct work_struct *work)
>  		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
>  
>  		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) {
> -			if (headless)
> +			if (headless) {
>  				kvfree((void *) *((unsigned long *) ptr));
>  
> -			kvfree(ptr);
> +				krcp = krc_this_cpu_lock(&flags);
> +				if (krcp->emergency) {
> +					if (krcp->nr_emergency < rcu_nr_emergency_objs) {
> +						krcp->emergency[krcp->nr_emergency++] = ptr;
> +
> +						/* Bypass freeing of it, it is in emergency pool. */
> +						ptr = NULL;
> +					}
> +				}
> +				krc_this_cpu_unlock(krcp, flags);
> +			}
> +
> +			if (ptr)
> +				kvfree(ptr);
>  		}
>  
>  		rcu_lock_release(&rcu_callback_map);
> @@ -3144,40 +3193,26 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
>  	return true;
>  }
>  
> -static inline struct rcu_head *attach_rcu_head_to_object(void *obj)
> +static inline struct rcu_head *
> +set_ptr_in_rcu_head_obj(void *ptr, unsigned long *rho)
> +{
> +	rho[0] = (unsigned long) ptr;
> +	return ((struct rcu_head *) ++rho);
> +}
> +
> +static inline struct rcu_head *
> +alloc_rcu_head_obj(void *ptr)
>  {
> -	unsigned long *ptr;
> +	unsigned long *rho;
>  
>  	/* Try hard to get the memory. */
> -	ptr = kmalloc(sizeof(unsigned long *) +
> +	rho = kmalloc(sizeof(unsigned long *) +
>  		sizeof(struct rcu_head), GFP_KERNEL |
>  			__GFP_ATOMIC | __GFP_HIGH | __GFP_RETRY_MAYFAIL);
> -	if (!ptr)
> +	if (!rho)
>  		return NULL;
>  
> -	ptr[0] = (unsigned long) obj;
> -	return ((struct rcu_head *) ++ptr);
> -}
> -
> -static inline struct kfree_rcu_cpu *
> -krc_this_cpu_lock(unsigned long *flags)
> -{
> -	struct kfree_rcu_cpu *krcp;
> -
> -	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
> -	krcp = this_cpu_ptr(&krc);
> -	if (likely(krcp->initialized))
> -		spin_lock(&krcp->lock);
> -
> -	return krcp;
> -}
> -
> -static inline void
> -krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
> -{
> -	if (likely(krcp->initialized))
> -		spin_unlock(&krcp->lock);
> -	local_irq_restore(flags);
> +	return set_ptr_in_rcu_head_obj(ptr, rho);
>  }
>  
>  /*
> @@ -3237,15 +3272,31 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  	if (!success) {
>  		/* Is headless object? */
>  		if (head == NULL) {
> -			/* Drop the lock. */
> +			/*
> +			 * Drop the lock to use more permissive
> +			 * parameters, after that take it back.
> +			 */
>  			krc_this_cpu_unlock(krcp, flags);
> +			head = alloc_rcu_head_obj(ptr);
> +			krcp = krc_this_cpu_lock(&flags);
>  
> -			head = attach_rcu_head_to_object(ptr);
> -			if (head == NULL)
> -				goto inline_return;
> +			/*
> +			 * Use emergency pool if still fails.
> +			 */
> +			if (head == NULL) {
> +				if (!krcp->nr_emergency)
> +					goto unlock_return;
>  
> -			/* Take it back. */
> -			krcp = krc_this_cpu_lock(&flags);
> +				head = set_ptr_in_rcu_head_obj(ptr,
> +					krcp->emergency[--krcp->nr_emergency]);
> +
> +				/*
> +				 * We do not need to do it. But just in case
> +				 * let's set the pulled slot to NULL to avoid
> +				 * magic issues.
> +				 */
> +				krcp->emergency[krcp->nr_emergency] = NULL;
> +			}
>  
>  			/*
>  			 * Tag the headless object. Such objects have a back-pointer
> @@ -3282,7 +3333,6 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  unlock_return:
>  	krc_this_cpu_unlock(krcp, flags);
>  
> -inline_return:
>  	/*
>  	 * High memory pressure, so inline kvfree() after
>  	 * synchronize_rcu(). We can do it from might_sleep()
> @@ -4272,6 +4322,17 @@ static void __init kfree_rcu_batch_init(void)
>  		}
>  
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> +
> +		/*
> +		 * The poll will be populated when low memory condition
> +		 * is detected. Therefore we do not fill it in here.
> +		 */
> +		krcp->emergency = kmalloc_array(rcu_nr_emergency_objs,
> +			sizeof(void *), GFP_NOWAIT);
> +
> +		if (!krcp->emergency)
> +			pr_err("Failed to create emergency pool for %d CPU!\n", cpu);
> +
>  		krcp->initialized = true;
>  	}
>  	if (register_shrinker(&kfree_rcu_shrinker))
> -- 
> 2.20.1
> 


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-04 19:51 ` Joel Fernandes
@ 2020-04-05 17:21   ` Uladzislau Rezki
  2020-04-05 23:30     ` Joel Fernandes
  0 siblings, 1 reply; 14+ messages in thread
From: Uladzislau Rezki @ 2020-04-05 17:21 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki (Sony),
	LKML, Paul E . McKenney, RCU, linux-mm, Andrew Morton,
	Steven Rostedt, Oleksiy Avramchenko

On Sat, Apr 04, 2020 at 03:51:29PM -0400, Joel Fernandes wrote:
> On Fri, Apr 03, 2020 at 07:30:51PM +0200, Uladzislau Rezki (Sony) wrote:
> > Maintain an emergency pool for each CPU with some
> > extra objects. There is read-only sysfs attribute,
> > the name is "rcu_nr_emergency_objs". It reflects
> > the size of the pool. As for now the default value
> > is 3.
> > 
> > The pool is populated when low memory condition is
> > detected. Please note it is only for headless case
> > it means when the regular SLAB is not able to serve
> > any request, the pool is used.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> 
> Hi Vlad,
> 
> One concern I have is this moves the problem a bit further down. My belief is
> we should avoid the likelihood of even needing an rcu_head allocated for the
> headless case, to begin with - than trying to do damage-control when it does
> happen. The only way we would end up needing an rcu_head is if we could not
> allocate an array.
> 
Let me share my view on all such caching. I think that it now becomes less of
an issue, because we now have the https://lkml.org/lkml/2020/4/2/383 patch.
I see that it does help a lot. I tried to simulate a low memory condition and
apply high memory pressure with that. I did not manage to trigger the
"synchronize rcu" path at all. This is because much more permissive
parameters are used when we request memory from the SLAB (direct reclaim, etc...).

>
> So instead of adding a pool for rcu_head allocations, how do you feel about
> pre-allocation of the per-cpu cache array instead, which has the same effect
> as you are intending?
> 
In v2 I have a list of such objects. It is also per-CPU (it scales with the number of
CPUs), but the difference is that those objects require much less memory: 8 + sizeof(struct
rcu_head) bytes compared to one page. Therefore the memory footprint is lower.

I have doubts that we would ever hit this emergency list, because of the
above-mentioned patch, but on the other hand I cannot guarantee it 100%.
Just in case, we may keep it.

Paul, could you please share your view and opinion? It would be appreciated :)

> This has 3 benefits:
> 1. It scales with number of CPUs, no configuration needed.
> 2. It makes the first kfree_rcu() faster and less dependent on an allocation
>    succeeding.
> 3. Much simpler code, no new structures or special handling.
> 4. In the future we can extend it to allocate more than 2 pages per CPU using
>    the same caching mechanism.
> 
> The obvious drawback being its 2 pages per CPU but at least it scales by
> number of CPUs. Something like the following (just lightly tested):
> 
> ---8<-----------------------
> 
> From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
> Subject: [PATCH] rcu/tree: Preallocate the per-cpu cache for kfree_rcu()
> 
> In recent changes, we have made it possible to use kfree_rcu() without
> embedding an rcu_head in the object being free'd. This requires dynamic
> allocation. In case dynamic allocation fails due to memory pressure, we
> would end up synchronously waiting for an RCU grace period thus hurting
> kfree_rcu() latency.
> 
> To make this less probable, let us pre-allocate the per-cpu cache so we
> depend less on the dynamic allocation succeeding. This also has the
> effect of making kfree_rcu() slightly faster at run time.
> 
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>  kernel/rcu/tree.c | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 6172e6296dd7d..9fbdeb4048425 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -4251,6 +4251,11 @@ static void __init kfree_rcu_batch_init(void)
>  			krcp->krw_arr[i].krcp = krcp;
>  		}
>  
> +		krcp->bkvcache[0] =  (struct kvfree_rcu_bulk_data *)
> +					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> +		krcp->bkvcache[1] =  (struct kvfree_rcu_bulk_data *)
> +					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> +
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
>  		krcp->initialized = true;
>  	}
We pre-allocate it, but differently compared with your proposal :) I do not see
how it can improve things. The difference is that you do it during the initialization
or boot phase. In case of the current code, it will pre-allocate and cache one page
after the first call to kvfree_call_rcu(), say within one second. So basically both
variants are the same.

But I think that we should allow two pages to be used as cached ones, no matter
whether they hold vmalloc ptrs. or SLAB ones. So basically, two cached pages can be
used by both the vmalloc path and the SLAB path. And it probably makes sense because
of the two phases: one is when we collect pointers, the second one is the memory
reclaim path. Thus one page per phase, i.e. it would be paired.

Thanks, Joel!

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-05 17:21   ` Uladzislau Rezki
@ 2020-04-05 23:30     ` Joel Fernandes
  2020-04-06 12:56       ` Uladzislau Rezki
  0 siblings, 1 reply; 14+ messages in thread
From: Joel Fernandes @ 2020-04-05 23:30 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: LKML, Paul E . McKenney, RCU, linux-mm, Andrew Morton,
	Steven Rostedt, Oleksiy Avramchenko

Hi Vlad,

On Sun, Apr 05, 2020 at 07:21:05PM +0200, Uladzislau Rezki wrote:
> On Sat, Apr 04, 2020 at 03:51:29PM -0400, Joel Fernandes wrote:
> > On Fri, Apr 03, 2020 at 07:30:51PM +0200, Uladzislau Rezki (Sony) wrote:
> > > Maintain an emergency pool for each CPU with some
> > > extra objects. There is read-only sysfs attribute,
> > > the name is "rcu_nr_emergency_objs". It reflects
> > > the size of the pool. As for now the default value
> > > is 3.
> > > 
> > > The pool is populated when low memory condition is
> > > detected. Please note it is only for headless case
> > > it means when the regular SLAB is not able to serve
> > > any request, the pool is used.
> > > 
> > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > 
> > Hi Vlad,
> > 
> > One concern I have is this moves the problem a bit further down. My belief is
> > we should avoid the likelihood of even needing an rcu_head allocated for the
> > headless case, to begin with - than trying to do damage-control when it does
> > happen. The only way we would end up needing an rcu_head is if we could not
> > allocate an array.
> > 
> Let me share my view on all such caching. I think that now it becomes less as
> the issue, because of we have now https://lkml.org/lkml/2020/4/2/383 patch.
> I see that it does help a lot. I tried to simulate low memory condition and 
> apply high memory pressure with that. I did not manage to trigger the
> "synchronize rcu" path at all. It is because of using much more permissive
> parameters when we request a memory from the SLAB(direct reclaim, etc...).

That's a good sign that we don't hit this path in your tests.

I guess also, with your latest patch on releasing the lock to be in a
non-atomic context, and then doing the allocation, it became even more
permissive? If you drop that patch and tried, do you still not hit the
synchronous path more often?

Could you also try introducing memory pressure by reducing your system's
total memory and see how it behaves?

> > So instead of adding a pool for rcu_head allocations, how do you feel about
> > pre-allocation of the per-cpu cache array instead, which has the same effect
> > as you are intending?
> > 
> In the v2 i have a list of such objects. It is also per-CPU(it is scaled to CPUs),
> but the difference is, those objects require much less memory, it is 8 + sizeof(struct
> rcu_head) bytes comparing to one page. Therefore the memory footprint is lower.

Yes, true. The one drawback is its higher memory usage. But if you have at
least 1 kfree_rcu() request on each CPU, then pre-allocation does not
increase memory usage any more than it already does right now. Unless we
extend my proposal to cache more than 2 pages per-cpu, which I think you
mentioned below.

> I have doubts that we would ever hit this emergency list, because of mentioned
> above patch, but from the other hand i can not say and guarantee 100%. Just in
> case, we may keep it. 

You really have to hit OOM in your tests to trigger it I suppose. Basically
the emergency pool improves situation under OOM, but otherwise does not
improve it due to the direct-reclaim that happens as you mentioned. Right?

> Paul, could you please share your view and opinion? It would be appreciated :)
> 
> > This has 3 benefits:
> > 1. It scales with number of CPUs, no configuration needed.
> > 2. It makes the first kfree_rcu() faster and less dependent on an allocation
> >    succeeding.
> > 3. Much simpler code, no new structures or special handling.
> > 4. In the future we can extend it to allocate more than 2 pages per CPU using
> >    the same caching mechanism.
> > 
> > The obvious drawback being its 2 pages per CPU but at least it scales by
> > number of CPUs. Something like the following (just lightly tested):
> > 
> > ---8<-----------------------
> > 
> > From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
> > Subject: [PATCH] rcu/tree: Preallocate the per-cpu cache for kfree_rcu()
> > 
> > In recent changes, we have made it possible to use kfree_rcu() without
> > embedding an rcu_head in the object being free'd. This requires dynamic
> > allocation. In case dynamic allocation fails due to memory pressure, we
> > would end up synchronously waiting for an RCU grace period thus hurting
> > kfree_rcu() latency.
> > 
> > To make this less probable, let us pre-allocate the per-cpu cache so we
> > depend less on the dynamic allocation succeeding. This also has the
> > effect of making kfree_rcu() slightly faster at run time.
> > 
> > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > ---
> >  kernel/rcu/tree.c | 5 +++++
> >  1 file changed, 5 insertions(+)
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 6172e6296dd7d..9fbdeb4048425 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -4251,6 +4251,11 @@ static void __init kfree_rcu_batch_init(void)
> >  			krcp->krw_arr[i].krcp = krcp;
> >  		}
> >  
> > +		krcp->bkvcache[0] =  (struct kvfree_rcu_bulk_data *)
> > +					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > +		krcp->bkvcache[1] =  (struct kvfree_rcu_bulk_data *)
> > +					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > +
> >  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> >  		krcp->initialized = true;
> >  	}
> We pre-allocate it, but differently comparing with your proposal :) I do not see
> how it can improve things. The difference is you do it during initializing or booting  
> phase. In case of current code it will pre-allocate and cache one page after first
> calling of the kvfree_call_rcu(), say in one second. So basically both variants are
> the same.

Well, one proposal is only 5 lines extra ;-). That has got to be at least a
bit appealing ;-) ;-).

> But i think that we should allow to be used two pages as cached ones, no matter 
> whether it is vmalloc ptrs. or SLAB ones. So basically, two cached pages can be
> used by vmalloc path and SLAB path. And probably it makes sense because of two
> phases: one is when we collect pointers, second one is memory reclaim path. Thus
> one page per one phase, i.e. it would be paired.

You are saying this with regard to my proposal right?  I agree number of
pages could be increased. The caching mechanism already in-place could be
starting point for that extension.

Thanks, Vlad!

 - Joel

> Thanks, Joel!
> 
> --
> Vlad Rezki

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-05 23:30     ` Joel Fernandes
@ 2020-04-06 12:56       ` Uladzislau Rezki
  2020-04-06 15:18         ` Joel Fernandes
  2020-04-06 15:31         ` Paul E. McKenney
  0 siblings, 2 replies; 14+ messages in thread
From: Uladzislau Rezki @ 2020-04-06 12:56 UTC (permalink / raw)
  To: Joel Fernandes, Paul E . McKenney
  Cc: Uladzislau Rezki, LKML, Paul E . McKenney, RCU, linux-mm,
	Andrew Morton, Steven Rostedt, Oleksiy Avramchenko

Hello, Joel.

> > > 
> > > Hi Vlad,
> > > 
> > > One concern I have is this moves the problem a bit further down. My belief is
> > > we should avoid the likelihood of even needing an rcu_head allocated for the
> > > headless case, to begin with - than trying to do damage-control when it does
> > > happen. The only way we would end up needing an rcu_head is if we could not
> > > allocate an array.
> > > 
> > Let me share my view on all such caching. I think that now it becomes less as
> > the issue, because of we have now https://lkml.org/lkml/2020/4/2/383 patch.
> > I see that it does help a lot. I tried to simulate low memory condition and 
> > apply high memory pressure with that. I did not manage to trigger the
> > "synchronize rcu" path at all. It is because of using much more permissive
> > parameters when we request a memory from the SLAB(direct reclaim, etc...).
> 
> That's a good sign that we don't hit this path in your tests.
> 
Just one request, of course if you have time :) Could you please
stress the system in your test environment to double-check
whether you also cannot hit it?

How i test it. Please apply below patch:
<snip>
t a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5e26145e9ead..25f7ac8583e1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3203,6 +3203,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 
        if (head) {
                ptr = (void *) head - (unsigned long) func;
+               head = NULL;
        } else {
                /*
                 * Please note there is a limitation for the head-less
@@ -3233,16 +3234,18 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
         * Under high memory pressure GFP_NOWAIT can fail,
         * in that case the emergency path is maintained.
         */
-       success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
-       if (!success) {
+       /* success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); */
+       /* if (!success) { */
                /* Is headless object? */
                if (head == NULL) {
                        /* Drop the lock. */
                        krc_this_cpu_unlock(krcp, flags);
 
                        head = attach_rcu_head_to_object(ptr);
-                       if (head == NULL)
+                       if (head == NULL) {
+                               success = false;
                                goto inline_return;
+                       }
 
                        /* Take it back. */
                        krcp = krc_this_cpu_lock(&flags);
@@ -3267,7 +3270,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
                 */
                expedited_drain = true;
                success = true;
-       }
+       /* } */
 
        WRITE_ONCE(krcp->count, krcp->count + 1);
 
@@ -3297,7 +3300,9 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
                if (!rcu_kfree_nowarn)
                        WARN_ON_ONCE(1);
                debug_rcu_head_unqueue(ptr);
-               synchronize_rcu();
+               /* synchronize_rcu(); */
+               printk(KERN_ERR "-> hit synchronize_rcu() path.\n");
+               trace_printk("-> hit synchronize_rcu() path.\n");
                kvfree(ptr);
        }
 }
<snip>

lower the memory size and run kfree rcu tests. It would be appreciated.

>
> I guess also, with your latest patch on releasing the lock to be in a
> non-atomic context, and then doing the allocation, it became even more
> permissive? If you drop that patch and tried, do you still not hit the
> synchronous path more often?
> 
Yep. If i drop the patch, i can hit it.

>
> Could you also try introducing memory pressure by reducing your system's
> total memory and see how it behaves?
> 
I simulated low memory condition by setting the system memory to 145MB.
That was the minimum amount the KVM system was capable of booting. After
that i used kfree rcu tests to simulate memory pressure.

> > > So instead of adding a pool for rcu_head allocations, how do you feel about
> > > pre-allocation of the per-cpu cache array instead, which has the same effect
> > > as you are intending?
> > > 
> > In the v2 i have a list of such objects. It is also per-CPU(it is scaled to CPUs),
> > but the difference is, those objects require much less memory, it is 8 + sizeof(struct
> > rcu_head) bytes comparing to one page. Therefore the memory footprint is lower.
> 
> Yes, true. That is one drawback is it higher memory usage. But if you have at
> least 1 kfree_rcu() request an each CPU, then pre-allocation does not
> increase memory usage any more than it already has right now. Unless, we
> extend my proposal to cache more than 2 pages per-cpu which I think you
> mentioned below.
> 
If we cache two pages per-CPU, i think that is fine. When it comes to increasing
it, it can be a bit wasteful. For example, consider a 128-CPU system.

> > I have doubts that we would ever hit this emergency list, because of mentioned
> > above patch, but from the other hand i can not say and guarantee 100%. Just in
> > case, we may keep it. 
> 
> You really have to hit OOM in your tests to trigger it I suppose. Basically
> the emergency pool improves situation under OOM, but otherwise does not
> improve it due to the direct-reclaim that happens as you mentioned. Right?
>
See above how i simulated it. Direct reclaim is our GFP_KERNEL + other flags
helper. If there is still no memory even after the reclaim process, then the
emergency list is supposed to be used. But we can drop this patch, i mean the
"emergency list", if we agree on it. It would be good if you could stress your
system the way i did. See the description above :)

> > Paul, could you please share your view and opinion? It would be appreciated :)
> > 
> > > This has 3 benefits:
> > > 1. It scales with number of CPUs, no configuration needed.
> > > 2. It makes the first kfree_rcu() faster and less dependent on an allocation
> > >    succeeding.
> > > 3. Much simpler code, no new structures or special handling.
> > > 4. In the future we can extend it to allocate more than 2 pages per CPU using
> > >    the same caching mechanism.
> > > 
> > > The obvious drawback being its 2 pages per CPU but at least it scales by
> > > number of CPUs. Something like the following (just lightly tested):
> > > 
> > > ---8<-----------------------
> > > 
> > > From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
> > > Subject: [PATCH] rcu/tree: Preallocate the per-cpu cache for kfree_rcu()
> > > 
> > > In recent changes, we have made it possible to use kfree_rcu() without
> > > embedding an rcu_head in the object being free'd. This requires dynamic
> > > allocation. In case dynamic allocation fails due to memory pressure, we
> > > would end up synchronously waiting for an RCU grace period thus hurting
> > > kfree_rcu() latency.
> > > 
> > > To make this less probable, let us pre-allocate the per-cpu cache so we
> > > depend less on the dynamic allocation succeeding. This also has the
> > > effect of making kfree_rcu() slightly faster at run time.
> > > 
> > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > ---
> > >  kernel/rcu/tree.c | 5 +++++
> > >  1 file changed, 5 insertions(+)
> > > 
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index 6172e6296dd7d..9fbdeb4048425 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -4251,6 +4251,11 @@ static void __init kfree_rcu_batch_init(void)
> > >  			krcp->krw_arr[i].krcp = krcp;
> > >  		}
> > >  
> > > +		krcp->bkvcache[0] =  (struct kvfree_rcu_bulk_data *)
> > > +					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > > +		krcp->bkvcache[1] =  (struct kvfree_rcu_bulk_data *)
> > > +					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > > +
> > >  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> > >  		krcp->initialized = true;
> > >  	}
> > We pre-allocate it, but differently comparing with your proposal :) I do not see
> > how it can improve things. The difference is you do it during initializing or booting  
> > phase. In case of current code it will pre-allocate and cache one page after first
> > calling of the kvfree_call_rcu(), say in one second. So basically both variants are
> > the same.
> 
> Well, one proposal is only 5 lines extra ;-). That has got to be at least a
> bit appealing ;-) ;-).
> 
:)

> > But i think that we should allow to be used two pages as cached ones, no matter 
> > whether it is vmalloc ptrs. or SLAB ones. So basically, two cached pages can be
> > used by vmalloc path and SLAB path. And probably it makes sense because of two
> > phases: one is when we collect pointers, second one is memory reclaim path. Thus
> > one page per one phase, i.e. it would be paired.
> 
> You are saying this with regard to my proposal right?  I agree number of
> pages could be increased. The caching mechanism already in-place could be
> starting point for that extension.
> 
We already have two pages. What we need is to allow them to be used in both
paths, the vmalloc one and the SLAB one, i mean the reclaim path. At least
that fits well with the collecting/reclaim phases.

Thanks for comments!

--
Vlad Rezki

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-06 12:56       ` Uladzislau Rezki
@ 2020-04-06 15:18         ` Joel Fernandes
  2020-04-06 16:17           ` Uladzislau Rezki
  2020-04-06 15:31         ` Paul E. McKenney
  1 sibling, 1 reply; 14+ messages in thread
From: Joel Fernandes @ 2020-04-06 15:18 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Paul E . McKenney, LKML, RCU, linux-mm, Andrew Morton,
	Steven Rostedt, Oleksiy Avramchenko

Hi Vlad,

On Mon, Apr 06, 2020 at 02:56:40PM +0200, Uladzislau Rezki wrote:
> Hello, Joel.
> 
> > > > 
> > > > Hi Vlad,
> > > > 
> > > > One concern I have is this moves the problem a bit further down. My belief is
> > > > we should avoid the likelihood of even needing an rcu_head allocated for the
> > > > headless case, to begin with - than trying to do damage-control when it does
> > > > happen. The only way we would end up needing an rcu_head is if we could not
> > > > allocate an array.
> > > > 
> > > Let me share my view on all such caching. I think that now it becomes less as
> > > the issue, because of we have now https://lkml.org/lkml/2020/4/2/383 patch.
> > > I see that it does help a lot. I tried to simulate low memory condition and 
> > > apply high memory pressure with that. I did not manage to trigger the
> > > "synchronize rcu" path at all. It is because of using much more permissive
> > > parameters when we request a memory from the SLAB(direct reclaim, etc...).
> > 
> > That's a good sign that we don't hit this path in your tests.
> > 
> Just one request, of course if you have a time :)
> Could you please double check on your test environment to stress the system
> to check if you also can not hit it?

Sure, I am planning to do so and happy to spend time on it :) One question I
had about the below test:

> How i test it. Please apply below patch:
> <snip>
> t a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 5e26145e9ead..25f7ac8583e1 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3203,6 +3203,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  
>         if (head) {
>                 ptr = (void *) head - (unsigned long) func;
> +               head = NULL;
>         } else {
>                 /*
>                  * Please note there is a limitation for the head-less
> @@ -3233,16 +3234,18 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>          * Under high memory pressure GFP_NOWAIT can fail,
>          * in that case the emergency path is maintained.
>          */
> -       success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
> -       if (!success) {
> +       /* success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); */

If you drop this, then it is not realistic right? I mean it changes behavior
of the code completely. We need to try to allocate array and then try to
allocate the head.

> +       /* if (!success) { */
>                 /* Is headless object? */
>                 if (head == NULL) {
>                         /* Drop the lock. */
>                         krc_this_cpu_unlock(krcp, flags);
>  
>                         head = attach_rcu_head_to_object(ptr);
> -                       if (head == NULL)
> +                       if (head == NULL) {
> +                               success = false;
>                                 goto inline_return;
> +                       }
>  
>                         /* Take it back. */
>                         krcp = krc_this_cpu_lock(&flags);
> @@ -3267,7 +3270,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>                  */
>                 expedited_drain = true;
>                 success = true;
> -       }
> +       /* } */
>  
>         WRITE_ONCE(krcp->count, krcp->count + 1);
>  
> @@ -3297,7 +3300,9 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>                 if (!rcu_kfree_nowarn)
>                         WARN_ON_ONCE(1);
>                 debug_rcu_head_unqueue(ptr);
> -               synchronize_rcu();
> +               /* synchronize_rcu(); */
> +               printk(KERN_ERR "-> hit synchronize_rcu() path.\n");
> +               trace_printk("-> hit synchronize_rcu() path.\n");
>                 kvfree(ptr);
>         }
>  }
> <snip>
> 
> lower the memory size and run kfree rcu tests. It would be appreciated.

I am happy to try out the diff if I can understand how the above diff is
close enough with current code's behavior, if we are not using the array. One
other issue with current kfree rcu tests is, the test is itself the reason
for the pressure -- I believe we should also have some testing that shows
that the memory pressure is caused else where (such as a real user workload
causing OOM), and then we see how RCU behaves under OOM -- if we have too
many synchronous latencies, does the additional caching remove such latencies
under OOM? etc.  I also want to look into your vmalloc tests.

> > I guess also, with your latest patch on releasing the lock to be in a
> > non-atomic context, and then doing the allocation, it became even more
> > permissive? If you drop that patch and tried, do you still not hit the
> > synchronous path more often?
> > 
> Yep. If i drop the patch, i can hit it.

Ah, cool. So basically the direct-reclaim path does the synchronous waiting,
instead of synchronize_rcu(). Either way, we wait synchronously. How to chose
which way is better though? If direct reclaim improves the memory situation,
then we should enter that path. But if direct reclaim takes too much time
(thus hurting the kfree_rcu() latency), then perhaps it is better for
kfree_rcu() to just do the synchronize_rcu() and let someone else enter the
direct-reclaim path. We should probably quantify and see which approach works
better.

> > Could you also try introducing memory pressure by reducing your system's
> > total memory and see how it behaves?
> > 
> I simulated low memory condition by setting the system memory to 145MB.
> That was the minimum amount the KVM system was capable of booting. After
> that i used kfree rcu tests to simulate memory pressure.

Ah, ok. I do a similar thing. Thanks for sharing. It would be nice if we can
both commit something into the tree (like modify the rcu torture KVM scripts
to simulate this automatically (while also generating memory pressure
external to RCU).

> > > > So instead of adding a pool for rcu_head allocations, how do you feel about
> > > > pre-allocation of the per-cpu cache array instead, which has the same effect
> > > > as you are intending?
> > > > 
> > > In the v2 i have a list of such objects. It is also per-CPU(it is scaled to CPUs),
> > > but the difference is, those objects require much less memory, it is 8 + sizeof(struct
> > > rcu_head) bytes comparing to one page. Therefore the memory footprint is lower.
> > 
> > Yes, true. That is one drawback is it higher memory usage. But if you have at
> > least 1 kfree_rcu() request an each CPU, then pre-allocation does not
> > increase memory usage any more than it already has right now. Unless, we
> > extend my proposal to cache more than 2 pages per-cpu which I think you
> > mentioned below.
> > 
> If we cache two pages per-CPU, i think that is fine. When it comes to increasing
> it, it can be a bit wasting. For example consider 128 CPUs system.

Right, if we could assume that the system's memory scales with the number of
CPUs, it could be reasonable.

> > > I have doubts that we would ever hit this emergency list, because of mentioned
> > > above patch, but from the other hand i can not say and guarantee 100%. Just in
> > > case, we may keep it. 
> > 
> > You really have to hit OOM in your tests to trigger it I suppose. Basically
> > the emergency pool improves situation under OOM, but otherwise does not
> > improve it due to the direct-reclaim that happens as you mentioned. Right?
> >
> See above how i simulated it. Direct reclaim is our GFP_KERNEL + other flags
> helper. If even after reclaim process there is no memory, then emergency list
> is supposed to be used. But we can drop this patch, i mean "emergency list"
> if we agree on it. The good point would be if you could stress your system
> by the i did. See above description :)

Yes I will stress it and make time to do so today :)
 
> > > But i think that we should allow to be used two pages as cached ones, no matter 
> > > whether it is vmalloc ptrs. or SLAB ones. So basically, two cached pages can be
> > > used by vmalloc path and SLAB path. And probably it makes sense because of two
> > > phases: one is when we collect pointers, second one is memory reclaim path. Thus
> > > one page per one phase, i.e. it would be paired.
> > 
> > You are saying this with regard to my proposal right?  I agree number of
> > pages could be increased. The caching mechanism already in-place could be
> > starting point for that extension.
> > 
> We already have two pages. What we need is to allow to use them in both
> paths, vmalloc one and SLAB one, i mean reclaim path. At least that fits
> well to the collecting/reclaim phases.

Ah yes, that's even better.

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-06 12:56       ` Uladzislau Rezki
  2020-04-06 15:18         ` Joel Fernandes
@ 2020-04-06 15:31         ` Paul E. McKenney
  2020-04-06 16:32           ` Uladzislau Rezki
  1 sibling, 1 reply; 14+ messages in thread
From: Paul E. McKenney @ 2020-04-06 15:31 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Joel Fernandes, LKML, RCU, linux-mm, Andrew Morton,
	Steven Rostedt, Oleksiy Avramchenko

On Mon, Apr 06, 2020 at 02:56:40PM +0200, Uladzislau Rezki wrote:
> Hello, Joel.
> 
> > > > 
> > > > Hi Vlad,
> > > > 
> > > > One concern I have is this moves the problem a bit further down. My belief is
> > > > we should avoid the likelihood of even needing an rcu_head allocated for the
> > > > headless case, to begin with - than trying to do damage-control when it does
> > > > happen. The only way we would end up needing an rcu_head is if we could not
> > > > allocate an array.
> > > > 
> > > Let me share my view on all such caching. I think that now it becomes less as
> > > the issue, because of we have now https://lkml.org/lkml/2020/4/2/383 patch.
> > > I see that it does help a lot. I tried to simulate low memory condition and 
> > > apply high memory pressure with that. I did not manage to trigger the
> > > "synchronize rcu" path at all. It is because of using much more permissive
> > > parameters when we request a memory from the SLAB(direct reclaim, etc...).
> > 
> > That's a good sign that we don't hit this path in your tests.
> > 
> Just one request, of course if you have a time :) Could you please
> double check on your test environment to stress the system to check
> if you also can not hit it?
> 
> How i test it. Please apply below patch:

This is of course a double challenge.

I can assure you that even if we cannot make it happen in the comfort and
safety of our tests systems, someone somewhere will make it happen all
the time.  Because there is a very large number of Linux systems running
out there.

Which leads to the other challenge:  How do we test this code path?

							Thanx, Paul

> <snip>
> t a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 5e26145e9ead..25f7ac8583e1 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3203,6 +3203,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  
>         if (head) {
>                 ptr = (void *) head - (unsigned long) func;
> +               head = NULL;
>         } else {
>                 /*
>                  * Please note there is a limitation for the head-less
> @@ -3233,16 +3234,18 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>          * Under high memory pressure GFP_NOWAIT can fail,
>          * in that case the emergency path is maintained.
>          */
> -       success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
> -       if (!success) {
> +       /* success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); */
> +       /* if (!success) { */
>                 /* Is headless object? */
>                 if (head == NULL) {
>                         /* Drop the lock. */
>                         krc_this_cpu_unlock(krcp, flags);
>  
>                         head = attach_rcu_head_to_object(ptr);
> -                       if (head == NULL)
> +                       if (head == NULL) {
> +                               success = false;
>                                 goto inline_return;
> +                       }
>  
>                         /* Take it back. */
>                         krcp = krc_this_cpu_lock(&flags);
> @@ -3267,7 +3270,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>                  */
>                 expedited_drain = true;
>                 success = true;
> -       }
> +       /* } */
>  
>         WRITE_ONCE(krcp->count, krcp->count + 1);
>  
> @@ -3297,7 +3300,9 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>                 if (!rcu_kfree_nowarn)
>                         WARN_ON_ONCE(1);
>                 debug_rcu_head_unqueue(ptr);
> -               synchronize_rcu();
> +               /* synchronize_rcu(); */
> +               printk(KERN_ERR "-> hit synchronize_rcu() path.\n");
> +               trace_printk("-> hit synchronize_rcu() path.\n");
>                 kvfree(ptr);
>         }
>  }
> <snip>
> 
> lower the memory size and run kfree rcu tests. It would be appreciated.
> 
> >
> > I guess also, with your latest patch on releasing the lock to be in a
> > non-atomic context, and then doing the allocation, it became even more
> > permissive? If you drop that patch and tried, do you still not hit the
> > synchronous path more often?
> > 
> Yep. If i drop the patch, i can hit it.
> 
> >
> > Could you also try introducing memory pressure by reducing your system's
> > total memory and see how it behaves?
> > 
> I simulated low memory condition by setting the system memory to 145MB.
> That was the minimum amount the KVM system was capable of booting. After
> that i used kfree rcu tests to simulate memory pressure.
> 
> > > > So instead of adding a pool for rcu_head allocations, how do you feel about
> > > > pre-allocation of the per-cpu cache array instead, which has the same effect
> > > > as you are intending?
> > > > 
> > > In the v2 i have a list of such objects. It is also per-CPU(it is scaled to CPUs),
> > > but the difference is, those objects require much less memory, it is 8 + sizeof(struct
> > > rcu_head) bytes comparing to one page. Therefore the memory footprint is lower.
> > 
> > Yes, true. That is one drawback is it higher memory usage. But if you have at
> > least 1 kfree_rcu() request an each CPU, then pre-allocation does not
> > increase memory usage any more than it already has right now. Unless, we
> > extend my proposal to cache more than 2 pages per-cpu which I think you
> > mentioned below.
> > 
> If we cache two pages per-CPU, i think that is fine. When it comes to increasing
> it, it can be a bit wasting. For example consider 128 CPUs system.
> 
> > > I have doubts that we would ever hit this emergency list, because of mentioned
> > > above patch, but from the other hand i can not say and guarantee 100%. Just in
> > > case, we may keep it. 
> > 
> > You really have to hit OOM in your tests to trigger it I suppose. Basically
> > the emergency pool improves situation under OOM, but otherwise does not
> > improve it due to the direct-reclaim that happens as you mentioned. Right?
> >
> See above how i simulated it. Direct reclaim is our GFP_KERNEL + other flags
> helper. If even after reclaim process there is no memory, then emergency list
> is supposed to be used. But we can drop this patch, i mean "emergency list"
> if we agree on it. The good point would be if you could stress your system
> the way i did. See above description :)
> 
> > > Paul, could you please share your view and opinion? It would be appreciated :)
> > > 
> > > > This has 3 benefits:
> > > > 1. It scales with number of CPUs, no configuration needed.
> > > > 2. It makes the first kfree_rcu() faster and less dependent on an allocation
> > > >    succeeding.
> > > > 3. Much simpler code, no new structures or special handling.
> > > > 4. In the future we can extend it to allocate more than 2 pages per CPU using
> > > >    the same caching mechanism.
> > > > 
> > > > The obvious drawback being its 2 pages per CPU but at least it scales by
> > > > number of CPUs. Something like the following (just lightly tested):
> > > > 
> > > > ---8<-----------------------
> > > > 
> > > > From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
> > > > Subject: [PATCH] rcu/tree: Preallocate the per-cpu cache for kfree_rcu()
> > > > 
> > > > In recent changes, we have made it possible to use kfree_rcu() without
> > > > embedding an rcu_head in the object being free'd. This requires dynamic
> > > > allocation. In case dynamic allocation fails due to memory pressure, we
> > > > would end up synchronously waiting for an RCU grace period thus hurting
> > > > kfree_rcu() latency.
> > > > 
> > > > To make this less probable, let us pre-allocate the per-cpu cache so we
> > > > depend less on the dynamic allocation succeeding. This also has the
> > > > effect of making kfree_rcu() slightly faster at run time.
> > > > 
> > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > ---
> > > >  kernel/rcu/tree.c | 5 +++++
> > > >  1 file changed, 5 insertions(+)
> > > > 
> > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > index 6172e6296dd7d..9fbdeb4048425 100644
> > > > --- a/kernel/rcu/tree.c
> > > > +++ b/kernel/rcu/tree.c
> > > > @@ -4251,6 +4251,11 @@ static void __init kfree_rcu_batch_init(void)
> > > >  			krcp->krw_arr[i].krcp = krcp;
> > > >  		}
> > > >  
> > > > +		krcp->bkvcache[0] =  (struct kvfree_rcu_bulk_data *)
> > > > +					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > > > +		krcp->bkvcache[1] =  (struct kvfree_rcu_bulk_data *)
> > > > +					__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > > > +
> > > >  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> > > >  		krcp->initialized = true;
> > > >  	}
> > > We pre-allocate it, but differently comparing with your proposal :) I do not see
> > how it can improve things. The difference is that you do it during the initialization
> > or boot phase. In case of the current code it will pre-allocate and cache one page after the first
> > > calling of the kvfree_call_rcu(), say in one second. So basically both variants are
> > > the same.
> > 
> > Well, one proposal is only 5 lines extra ;-). That has got to be at least a
> > bit appealing ;-) ;-).
> > 
> :)
> 
> > > But i think that we should allow to be used two pages as cached ones, no matter 
> > > whether it is vmalloc ptrs. or SLAB ones. So basically, two cached pages can be
> > > used by vmalloc path and SLAB path. And probably it makes sense because of two
> > > phases: one is when we collect pointers, second one is memory reclaim path. Thus
> > > one page per one phase, i.e. it would be paired.
> > 
> > You are saying this with regard to my proposal right?  I agree number of
> > pages could be increased. The caching mechanism already in-place could be
> > starting point for that extension.
> > 
> We already have two pages. What we need is to allow to use them in both
> paths, vmalloc one and SLAB one, i mean reclaim path. At least that fits
> well to the collecting/reclaim phases.
> 
> Thanks for comments!
> 
> --
> Vlad Rezki

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-06 15:18         ` Joel Fernandes
@ 2020-04-06 16:17           ` Uladzislau Rezki
  2020-04-07  1:47             ` Joel Fernandes
  0 siblings, 1 reply; 14+ messages in thread
From: Uladzislau Rezki @ 2020-04-06 16:17 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki, Paul E . McKenney, LKML, RCU, linux-mm,
	Andrew Morton, Steven Rostedt, Oleksiy Avramchenko

On Mon, Apr 06, 2020 at 11:18:51AM -0400, Joel Fernandes wrote:
> Hi Vlad,
> 
> On Mon, Apr 06, 2020 at 02:56:40PM +0200, Uladzislau Rezki wrote:
> > Hello, Joel.
> > 
> > > > > 
> > > > > Hi Vlad,
> > > > > 
> > > > > One concern I have is this moves the problem a bit further down. My belief is
> > > > > we should avoid the likelihood of even needing an rcu_head allocated for the
> > > > > headless case, to begin with - than trying to do damage-control when it does
> > > > > happen. The only way we would end up needing an rcu_head is if we could not
> > > > > allocate an array.
> > > > > 
> > > > Let me share my view on all such caching. I think that now it becomes less of
> > > > an issue, because we now have the https://lkml.org/lkml/2020/4/2/383 patch.
> > > > I see that it does help a lot. I tried to simulate low memory condition and 
> > > > apply high memory pressure with that. I did not manage to trigger the
> > > > "synchronize rcu" path at all. It is because of using much more permissive
> > > > parameters when we request a memory from the SLAB(direct reclaim, etc...).
> > > 
> > > That's a good sign that we don't hit this path in your tests.
> > > 
> > Just one request, of course if you have a time :)
> > Could you please double check on your test environment to stress the system
> > to check if you also can not hit it?
> 
> Sure, I am planning to do so and happy to spend time on it :) One question I
> had about the below test:
> 
> > How i test it. Please apply below patch:
> > <snip>
> > t a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 5e26145e9ead..25f7ac8583e1 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -3203,6 +3203,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> >  
> >         if (head) {
> >                 ptr = (void *) head - (unsigned long) func;
> > +               head = NULL;
> >         } else {
> >                 /*
> >                  * Please note there is a limitation for the head-less
> > @@ -3233,16 +3234,18 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> >          * Under high memory pressure GFP_NOWAIT can fail,
> >          * in that case the emergency path is maintained.
> >          */
> > -       success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
> > -       if (!success) {
> > +       /* success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); */
> 
> If you drop this, then it is not realistic right? I mean it changes behavior
> of the code completely. We need to try to allocate array and then try to
> allocate the head.
> 
That just bypasses an allocation for the array, to make it more simple
and move forward toward the path we would like to test. Also head is
set to NULL to simulated headless freeing.

> > +       /* if (!success) { */
> >                 /* Is headless object? */
> >                 if (head == NULL) {
> >                         /* Drop the lock. */
> >                         krc_this_cpu_unlock(krcp, flags);
> >  
> >                         head = attach_rcu_head_to_object(ptr);
> > -                       if (head == NULL)
> > +                       if (head == NULL) {
> > +                               success = false;
> >                                 goto inline_return;
> > +                       }
> >  
> >                         /* Take it back. */
> >                         krcp = krc_this_cpu_lock(&flags);
> > @@ -3267,7 +3270,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> >                  */
> >                 expedited_drain = true;
> >                 success = true;
> > -       }
> > +       /* } */
> >  
> >         WRITE_ONCE(krcp->count, krcp->count + 1);
> >  
> > @@ -3297,7 +3300,9 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> >                 if (!rcu_kfree_nowarn)
> >                         WARN_ON_ONCE(1);
> >                 debug_rcu_head_unqueue(ptr);
> > -               synchronize_rcu();
> > +               /* synchronize_rcu(); */
> > +               printk(KERN_ERR "-> hit synchronize_rcu() path.\n");
> > +               trace_printk("-> hit synchronize_rcu() path.\n");
> >                 kvfree(ptr);
> >         }
> >  }
> > <snip>
> > 
> > lower the memory size and run kfree rcu tests. It would be appreciated.
> 
> I am happy to try out the diff if I can understand how the above diff is
> close enough with current code's behavior, if we are not using the array. One
> other issue with current kfree rcu tests is, the test is itself the reason
> for the pressure -- I believe we should also have some testing that shows
> that the memory pressure is caused else where (such as a real user workload
> causing OOM), and then we see how RCU behaves under OOM -- if we have too
> many synchronous latencies, does the additional caching remove such latenies
> under OOM? etc.  I also want to look into your vmalloc tests.
> 
Of course to have real tests would be good. 

> > > I guess also, with your latest patch on releasing the lock to be in a
> > > non-atomic context, and then doing the allocation, it became even more
> > > permissive? If you drop that patch and tried, do you still not hit the
> > > synchronous path more often?
> > > 
> > Yep. If i drop the patch, i can hit it.
> 
> Ah, cool. So basically the direct-reclaim path does the synchronous waiting,
> instead of synchronize_rcu(). Either way, we wait synchronously. How to chose
> which way is better though? If direct reclaim improves the memory situation,
> then we should enter that path. But if direct reclaim takes too much time
> (thus hurting the kfree_rcu() latency), then perhaps it is better for
> kfree_rcu() to just do the synchronize_rcu() and let someone else enter the
> direct-reclaim path. We should probably quantify and see which approach works
> better.
> 
I see it like this: the headless variant has to be called from a sleeping context,
therefore it can sleep. What is better to call synchronize_rcu() or doing direct
reclaim depends on how many CPUs in a system we have. I suspect that doing
direct reclaim is better, at least it will free some memory for us. We
could also extend that patch and make it a bit different, for example do
NOWAIT then try ATOMIC and as a last step do GFP_KERNEL alloc.

> > > Could you also try introducing memory pressure by reducing your system's
> > > total memory and see how it behaves?
> > > 
> > I simulated low memory condition by setting the system memory to 145MB.
> > That was the minimum amount the KVM system was capable of booting. After
> > that i used kfree rcu tests to simulate memory pressure.
> 
> Ah, ok. I do a similar thing. Thanks for sharing. It would be nice if we can
> both commit something into the tree (like modify the rcu torture KVM scripts
> to simulate this automatically (while also generating memory pressure
> external to RCU).
>
That makes sense.

> > > > > So instead of adding a pool for rcu_head allocations, how do you feel about
> > > > > pre-allocation of the per-cpu cache array instead, which has the same effect
> > > > > as you are intending?
> > > > > 
> > > > In the v2 i have a list of such objects. It is also per-CPU(it is scaled to CPUs),
> > > > but the difference is, those objects require much less memory, it is 8 + sizeof(struct
> > > > rcu_head) bytes comparing to one page. Therefore the memory footprint is lower.
> > > 
> > > Yes, true. That is one drawback: its higher memory usage. But if you have at
> > > least 1 kfree_rcu() request on each CPU, then pre-allocation does not
> > > increase memory usage any more than it already has right now. Unless, we
> > > extend my proposal to cache more than 2 pages per-cpu which I think you
> > > mentioned below.
> > > 
> > If we cache two pages per-CPU, i think that is fine. When it comes to increasing
> > it, it can be a bit wasting. For example consider 128 CPUs system.
> 
> Right, if we could we assume that the system's memory scales with number of
> CPUs, it could be reasonable.
> 
> > > > I have doubts that we would ever hit this emergency list, because of mentioned
> > > > above patch, but from the other hand i can not say and guarantee 100%. Just in
> > > > case, we may keep it. 
> > > 
> > > You really have to hit OOM in your tests to trigger it I suppose. Basically
> > > the emergency pool improves situation under OOM, but otherwise does not
> > > improve it due to the direct-reclaim that happens as you mentioned. Right?
> > >
> > See above how i simulated it. Direct reclaim is our GFP_KERNEL + other flags
> > helper. If even after reclaim process there is no memory, then emergency list
> > is supposed to be used. But we can drop this patch, i mean "emergency list"
> > if we agree on it. The good point would be if you could stress your system
> > the way i did. See above description :)
> 
> Yes I will stress it and make time to do so today :)
>  
> > > > But i think that we should allow to be used two pages as cached ones, no matter 
> > > > whether it is vmalloc ptrs. or SLAB ones. So basically, two cached pages can be
> > > > used by vmalloc path and SLAB path. And probably it makes sense because of two
> > > > phases: one is when we collect pointers, second one is memory reclaim path. Thus
> > > > one page per one phase, i.e. it would be paired.
> > > 
> > > You are saying this with regard to my proposal right?  I agree number of
> > > pages could be increased. The caching mechanism already in-place could be
> > > starting point for that extension.
> > > 
> > We already have two pages. What we need is to allow to use them in both
> > paths, vmalloc one and SLAB one, i mean reclaim path. At least that fits
> > well to the collecting/reclaim phases.
> 
> Ah yes, that's even better.
> 
Good :)

Thanks!

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-06 15:31         ` Paul E. McKenney
@ 2020-04-06 16:32           ` Uladzislau Rezki
  0 siblings, 0 replies; 14+ messages in thread
From: Uladzislau Rezki @ 2020-04-06 16:32 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki, Joel Fernandes, LKML, RCU, linux-mm,
	Andrew Morton, Steven Rostedt, Oleksiy Avramchenko

> On Mon, Apr 06, 2020 at 02:56:40PM +0200, Uladzislau Rezki wrote:
> > Hello, Joel.
> > 
> > > > > 
> > > > > Hi Vlad,
> > > > > 
> > > > > One concern I have is this moves the problem a bit further down. My belief is
> > > > > we should avoid the likelihood of even needing an rcu_head allocated for the
> > > > > headless case, to begin with - than trying to do damage-control when it does
> > > > > happen. The only way we would end up needing an rcu_head is if we could not
> > > > > allocate an array.
> > > > > 
> > > > Let me share my view on all such caching. I think that now it becomes less of
> > > > an issue, because we now have the https://lkml.org/lkml/2020/4/2/383 patch.
> > > > I see that it does help a lot. I tried to simulate low memory condition and 
> > > > apply high memory pressure with that. I did not manage to trigger the
> > > > "synchronize rcu" path at all. It is because of using much more permissive
> > > > parameters when we request a memory from the SLAB(direct reclaim, etc...).
> > > 
> > > That's a good sign that we don't hit this path in your tests.
> > > 
> > Just one request, of course if you have a time :) Could you please
> > double check on your test environment to stress the system to check
> > if you also can not hit it?
> > 
> > How i test it. Please apply below patch:
> 
> This is of course a double challenge.
> 
> I can assure you that even if we cannot make it happen in the comfort and
> safety of our tests systems, someone somewhere will make it happen all
> the time.  Because there is a very large number of Linux systems running
> out there.
> 
> Which leads to the other challenge:  How do we test this code path?
> 
I have added extra tests to my "vmalloc tests" https://lkml.org/lkml/2020/4/2/384
for stressing head/headless variants. Also we have rcuperf module. Running them
together under KVM(selftests) would be good. Plus we can add a counter of the
path we think is bad, synchronize_rcu() and so on.

Thanks!

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/1] rcu/tree: add emergency pool for headless case
  2020-04-06 16:17           ` Uladzislau Rezki
@ 2020-04-07  1:47             ` Joel Fernandes
  0 siblings, 0 replies; 14+ messages in thread
From: Joel Fernandes @ 2020-04-07  1:47 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Paul E . McKenney, LKML, RCU, linux-mm, Andrew Morton,
	Steven Rostedt, Oleksiy Avramchenko

On Mon, Apr 06, 2020 at 06:17:08PM +0200, Uladzislau Rezki wrote:
> On Mon, Apr 06, 2020 at 11:18:51AM -0400, Joel Fernandes wrote:
> > Hi Vlad,
> > 
> > On Mon, Apr 06, 2020 at 02:56:40PM +0200, Uladzislau Rezki wrote:
> > > Hello, Joel.
> > > 
> > > > > > 
> > > > > > Hi Vlad,
> > > > > > 
> > > > > > One concern I have is this moves the problem a bit further down. My belief is
> > > > > > we should avoid the likelihood of even needing an rcu_head allocated for the
> > > > > > headless case, to begin with - than trying to do damage-control when it does
> > > > > > happen. The only way we would end up needing an rcu_head is if we could not
> > > > > > allocate an array.
> > > > > > 
> > > > > Let me share my view on all such caching. I think that now it becomes less of
> > > > > an issue, because we now have the https://lkml.org/lkml/2020/4/2/383 patch.
> > > > > I see that it does help a lot. I tried to simulate low memory condition and 
> > > > > apply high memory pressure with that. I did not manage to trigger the
> > > > > "synchronize rcu" path at all. It is because of using much more permissive
> > > > > parameters when we request a memory from the SLAB(direct reclaim, etc...).
> > > > 
> > > > That's a good sign that we don't hit this path in your tests.
> > > > 
> > > Just one request, of course if you have a time :)
> > > Could you please double check on your test environment to stress the system
> > > to check if you also can not hit it?
> > 
> > Sure, I am planning to do so and happy to spend time on it :) One question I
> > had about the below test:
> > 
> > > How i test it. Please apply below patch:
> > > <snip>
> > > t a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index 5e26145e9ead..25f7ac8583e1 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -3203,6 +3203,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > >  
> > >         if (head) {
> > >                 ptr = (void *) head - (unsigned long) func;
> > > +               head = NULL;
> > >         } else {
> > >                 /*
> > >                  * Please note there is a limitation for the head-less
> > > @@ -3233,16 +3234,18 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > >          * Under high memory pressure GFP_NOWAIT can fail,
> > >          * in that case the emergency path is maintained.
> > >          */
> > > -       success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
> > > -       if (!success) {
> > > +       /* success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); */
> > 
> > If you drop this, then it is not realistic right? I mean it changes behavior
> > of the code completely. We need to try to allocate array and then try to
> > allocate the head.
> > 
> That just bypasses an allocation for the array, to make it more simple
> and move forward toward the path we would like to test. Also head is
> set to NULL to simulated headless freeing.

Makes sense, I know you are forcing code to invoke the bad case more often
but I was concerned the hack would change dynamics of code enough to make it
an unrealistic situation. But I see your point.

> > > +       /* if (!success) { */
> > >                 /* Is headless object? */
> > >                 if (head == NULL) {
> > >                         /* Drop the lock. */
> > >                         krc_this_cpu_unlock(krcp, flags);
> > >  
> > >                         head = attach_rcu_head_to_object(ptr);
> > > -                       if (head == NULL)
> > > +                       if (head == NULL) {
> > > +                               success = false;
> > >                                 goto inline_return;
> > > +                       }
> > >  
> > >                         /* Take it back. */
> > >                         krcp = krc_this_cpu_lock(&flags);
> > > @@ -3267,7 +3270,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > >                  */
> > >                 expedited_drain = true;
> > >                 success = true;
> > > -       }
> > > +       /* } */
> > >  
> > >         WRITE_ONCE(krcp->count, krcp->count + 1);
> > >  
> > > @@ -3297,7 +3300,9 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > >                 if (!rcu_kfree_nowarn)
> > >                         WARN_ON_ONCE(1);
> > >                 debug_rcu_head_unqueue(ptr);
> > > -               synchronize_rcu();
> > > +               /* synchronize_rcu(); */
> > > +               printk(KERN_ERR "-> hit synchronize_rcu() path.\n");
> > > +               trace_printk("-> hit synchronize_rcu() path.\n");
> > >                 kvfree(ptr);
> > >         }
> > >  }
> > > <snip>
> > > 
> > > lower the memory size and run kfree rcu tests. It would be appreciated.
> > 
> > I am happy to try out the diff if I can understand how the above diff is
> > close enough with current code's behavior, if we are not using the array. One
> > other issue with current kfree rcu tests is, the test is itself the reason
> > for the pressure -- I believe we should also have some testing that shows
> > that the memory pressure is caused else where (such as a real user workload
> > causing OOM), and then we see how RCU behaves under OOM -- if we have too
> > many synchronous latencies, does the additional caching remove such latenies
> > under OOM? etc.  I also want to look into your vmalloc tests.
> > 
> Of course to have real tests would be good. 

Agreed.

> > > > I guess also, with your latest patch on releasing the lock to be in a
> > > > non-atomic context, and then doing the allocation, it became even more
> > > > permissive? If you drop that patch and tried, do you still not hit the
> > > > synchronous path more often?
> > > > 
> > > Yep. If i drop the patch, i can hit it.
> > 
> > Ah, cool. So basically the direct-reclaim path does the synchronous waiting,
> > instead of synchronize_rcu(). Either way, we wait synchronously. How to chose
> > which way is better though? If direct reclaim improves the memory situation,
> > then we should enter that path. But if direct reclaim takes too much time
> > (thus hurting the kfree_rcu() latency), then perhaps it is better for
> > kfree_rcu() to just do the synchronize_rcu() and let someone else enter the
> > direct-reclaim path. We should probably quantify and see which approach works
> > better.
> > 
> I see it like this: the headless variant has to be called from a sleeping context,
> therefore it can sleep. What is better to call synchronize_rcu() or doing direct
> reclaim depends on how many CPUs in a system we have. I suspect that doing
> direct reclaim is better, at least it will free some memory for us. We
> could also extend that patch and make it a bit different, for example do
> NOWAIT then try ATOMIC and as a last step do GFP_KERNEL alloc.

Yes, that's a good idea. That way perhaps we reduce chance that kfree_rcu()
enters into direct-reclaim. Let us do it that way. At least I don't see any
drawbacks in such approach.

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-04-07  1:47 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-03 17:30 [PATCH 1/1] rcu/tree: add emergency pool for headless case Uladzislau Rezki (Sony)
2020-04-03 18:16 ` Matthew Wilcox
2020-04-04 19:09   ` Uladzislau Rezki
2020-04-03 19:14 ` Paul E. McKenney
2020-04-04 19:10   ` Uladzislau Rezki
2020-04-04 19:51 ` Joel Fernandes
2020-04-05 17:21   ` Uladzislau Rezki
2020-04-05 23:30     ` Joel Fernandes
2020-04-06 12:56       ` Uladzislau Rezki
2020-04-06 15:18         ` Joel Fernandes
2020-04-06 16:17           ` Uladzislau Rezki
2020-04-07  1:47             ` Joel Fernandes
2020-04-06 15:31         ` Paul E. McKenney
2020-04-06 16:32           ` Uladzislau Rezki

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).