linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* Re: [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode
       [not found] <20200507004240.GA9156@paulmck-ThinkPad-P72>
@ 2020-05-07  9:36 ` Hillf Danton
  2020-05-07 15:49   ` Paul E. McKenney
  0 siblings, 1 reply; 4+ messages in thread
From: Hillf Danton @ 2020-05-07  9:36 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: linux-kernel, rcu, akpm, Hillf Danton, linux-mm


Hello Paul

On Wed, 6 May 2020 17:42:40 Paul E. McKenney wrote:
> 
> This commit adds a shrinker so as to inform RCU when memory is scarce.

A simpler hook is added in the logic of kswapd for subscribing the info
that memory pressure is high, and then on top of it make rcu a subscriber
by copying your code for the shrinker, wishing it makes a sense to you.

What's not yet included is to make the hook per node to help make every
reviewer convinced that memory is becoming tight. Of course without the
cost of making subscribers node aware.

Hillf

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -49,6 +49,16 @@ static inline void set_max_mapnr(unsigne
 static inline void set_max_mapnr(unsigned long limit) { }
 #endif
 
+/* subscriber of kswapd's memory_pressure_high signal */
+struct mph_subscriber {
+	struct list_head node;
+	void (*info) (void *data);
+	void *data;
+};
+
+int mph_subscribe(struct mph_subscriber *ms);
+void mph_unsubscribe(struct mph_subscriber *ms);
+
 extern atomic_long_t _totalram_pages;
 static inline unsigned long totalram_pages(void)
 {
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3536,6 +3536,40 @@ static bool kswapd_shrink_node(pg_data_t
 }
 
 /*
+ * subscribers of kswapd's signal that memory pressure is high
+ */
+static LIST_HEAD(mph_subs);
+static DEFINE_MUTEX(mph_lock);
+
+int mph_subscribe(struct mph_subscriber *ms)
+{
+	if (!ms->info)
+		return -EAGAIN;
+
+	mutex_lock(&mph_lock);
+	list_add_tail(&ms->node, &mph_subs);
+	mutex_unlock(&mph_lock);
+	return 0;
+}
+
+void mph_unsubscribe(struct mph_subscriber *ms)
+{
+	mutex_lock(&mph_lock);
+	list_del(&ms->node);
+	mutex_unlock(&mph_lock);
+}
+
+static void kswapd_bbc_mph(void)
+{
+	struct mph_subscriber *ms;
+
+	mutex_lock(&mph_lock);
+	list_for_each_entry(ms, &mph_subs, node)
+		ms->info(ms->data);
+	mutex_unlock(&mph_lock);
+}
+
+/*
  * For kswapd, balance_pgdat() will reclaim pages across a node from zones
  * that are eligible for use by the caller until at least one zone is
  * balanced.
@@ -3663,8 +3697,11 @@ restart:
 		 * If we're getting trouble reclaiming, start doing writepage
 		 * even in laptop mode.
 		 */
-		if (sc.priority < DEF_PRIORITY - 2)
+		if (sc.priority < DEF_PRIORITY - 2) {
 			sc.may_writepage = 1;
+			if (sc.priority == DEF_PRIORITY - 3)
+				kswapd_bbc_mph();
+		}
 
 		/* Call soft limit reclaim before calling shrink_node. */
 		sc.nr_scanned = 0;
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -325,6 +325,8 @@ struct rcu_state {
 	int ncpus_snap;				/* # CPUs seen last time. */
 	u8 cbovld;				/* Callback overload now? */
 	u8 cbovldnext;				/* ^        ^  next time? */
+	u8 mph;					/* mm pressure high signal from kswapd */
+	unsigned long mph_end;			/* time stamp in jiffies */
 
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
 						/*  force_quiescent_state(). */
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -52,6 +52,7 @@
 #include <linux/kprobes.h>
 #include <linux/gfp.h>
 #include <linux/oom.h>
+#include <linux/mm.h>
 #include <linux/smpboot.h>
 #include <linux/jiffies.h>
 #include <linux/slab.h>
@@ -2314,8 +2315,15 @@ static void force_qs_rnp(int (*f)(struct
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
-	rcu_state.cbovld = rcu_state.cbovldnext;
+	rcu_state.cbovld = smp_load_acquire(&rcu_state.mph) ||
+					rcu_state.cbovldnext;
 	rcu_state.cbovldnext = false;
+
+	if (READ_ONCE(rcu_state.mph) &&
+	    time_after(jiffies, READ_ONCE(rcu_state.mph_end))) {
+		WRITE_ONCE(rcu_state.mph, false);
+		pr_info("%s: Ending OOM-mode grace periods.\n", __func__);
+	}
 	rcu_for_each_leaf_node(rnp) {
 		cond_resched_tasks_rcu_qs();
 		mask = 0;
@@ -2643,6 +2651,20 @@ static void check_cb_ovld(struct rcu_dat
 	raw_spin_unlock_rcu_node(rnp);
 }
 
+static void rcu_mph_info(void *data)
+{
+	struct rcu_state *state = data;
+
+	WRITE_ONCE(state->mph_end, jiffies + HZ / 10);
+	smp_store_release(&state->mph, true);
+	rcu_force_quiescent_state();
+}
+
+static struct mph_subscriber rcu_mph_subscriber = {
+	.info = rcu_mph_info,
+	.data = &rcu_state,
+};
+
 /* Helper function for call_rcu() and friends.  */
 static void
 __call_rcu(struct rcu_head *head, rcu_callback_t func)
@@ -4036,6 +4058,8 @@ void __init rcu_init(void)
 		qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
 	else
 		qovld_calc = qovld;
+
+	mph_subscribe(&rcu_mph_subscriber);
 }
 
 #include "tree_stall.h"



^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode
  2020-05-07  9:36 ` [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode Hillf Danton
@ 2020-05-07 15:49   ` Paul E. McKenney
  2020-05-08 13:37     ` Hillf Danton
  0 siblings, 1 reply; 4+ messages in thread
From: Paul E. McKenney @ 2020-05-07 15:49 UTC (permalink / raw)
  To: Hillf Danton; +Cc: linux-kernel, rcu, akpm, linux-mm

On Thu, May 07, 2020 at 05:36:47PM +0800, Hillf Danton wrote:
> 
> Hello Paul
> 
> On Wed, 6 May 2020 17:42:40 Paul E. McKenney wrote:
> > 
> > This commit adds a shrinker so as to inform RCU when memory is scarce.
> 
> A simpler hook is added in the logic of kswapd for subscribing the info
> that memory pressure is high, and then on top of it make rcu a subscriber
> by copying your code for the shrinker, wishing it makes a sense to you.
> 
> What's not yet included is to make the hook per node to help make every
> reviewer convinced that memory is becoming tight. Of course without the
> cost of making subscribers node aware.
> 
> Hillf

I must defer to the MM folks on the MM portion of this patch, but early
warning of impending memory pressure would be extremely good.  A few
RCU-related notes inline below, though.

							Thanx, Paul

> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -49,6 +49,16 @@ static inline void set_max_mapnr(unsigne
>  static inline void set_max_mapnr(unsigned long limit) { }
>  #endif
>  
> +/* subscriber of kswapd's memory_pressure_high signal */
> +struct mph_subscriber {
> +	struct list_head node;
> +	void (*info) (void *data);
> +	void *data;
> +};
> +
> +int mph_subscribe(struct mph_subscriber *ms);
> +void mph_unsubscribe(struct mph_subscriber *ms);
> +
>  extern atomic_long_t _totalram_pages;
>  static inline unsigned long totalram_pages(void)
>  {
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3536,6 +3536,40 @@ static bool kswapd_shrink_node(pg_data_t
>  }
>  
>  /*
> + * subscribers of kswapd's signal that memory pressure is high
> + */
> +static LIST_HEAD(mph_subs);
> +static DEFINE_MUTEX(mph_lock);
> +
> +int mph_subscribe(struct mph_subscriber *ms)
> +{
> +	if (!ms->info)
> +		return -EAGAIN;
> +
> +	mutex_lock(&mph_lock);
> +	list_add_tail(&ms->node, &mph_subs);
> +	mutex_unlock(&mph_lock);
> +	return 0;
> +}
> +
> +void mph_unsubscribe(struct mph_subscriber *ms)
> +{
> +	mutex_lock(&mph_lock);
> +	list_del(&ms->node);
> +	mutex_unlock(&mph_lock);
> +}
> +
> +static void kswapd_bbc_mph(void)
> +{
> +	struct mph_subscriber *ms;
> +
> +	mutex_lock(&mph_lock);
> +	list_for_each_entry(ms, &mph_subs, node)
> +		ms->info(ms->data);
> +	mutex_unlock(&mph_lock);
> +}
> +
> +/*
>   * For kswapd, balance_pgdat() will reclaim pages across a node from zones
>   * that are eligible for use by the caller until at least one zone is
>   * balanced.
> @@ -3663,8 +3697,11 @@ restart:
>  		 * If we're getting trouble reclaiming, start doing writepage
>  		 * even in laptop mode.
>  		 */
> -		if (sc.priority < DEF_PRIORITY - 2)
> +		if (sc.priority < DEF_PRIORITY - 2) {
>  			sc.may_writepage = 1;
> +			if (sc.priority == DEF_PRIORITY - 3)
> +				kswapd_bbc_mph();
> +		}
>  
>  		/* Call soft limit reclaim before calling shrink_node. */
>  		sc.nr_scanned = 0;
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -325,6 +325,8 @@ struct rcu_state {
>  	int ncpus_snap;				/* # CPUs seen last time. */
>  	u8 cbovld;				/* Callback overload now? */
>  	u8 cbovldnext;				/* ^        ^  next time? */
> +	u8 mph;					/* mm pressure high signal from kswapd */
> +	unsigned long mph_end;			/* time stamp in jiffies */
>  
>  	unsigned long jiffies_force_qs;		/* Time at which to invoke */
>  						/*  force_quiescent_state(). */
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -52,6 +52,7 @@
>  #include <linux/kprobes.h>
>  #include <linux/gfp.h>
>  #include <linux/oom.h>
> +#include <linux/mm.h>
>  #include <linux/smpboot.h>
>  #include <linux/jiffies.h>
>  #include <linux/slab.h>
> @@ -2314,8 +2315,15 @@ static void force_qs_rnp(int (*f)(struct
>  	struct rcu_data *rdp;
>  	struct rcu_node *rnp;
>  
> -	rcu_state.cbovld = rcu_state.cbovldnext;
> +	rcu_state.cbovld = smp_load_acquire(&rcu_state.mph) ||
> +					rcu_state.cbovldnext;
>  	rcu_state.cbovldnext = false;
> +
> +	if (READ_ONCE(rcu_state.mph) &&
> +	    time_after(jiffies, READ_ONCE(rcu_state.mph_end))) {
> +		WRITE_ONCE(rcu_state.mph, false);
> +		pr_info("%s: Ending OOM-mode grace periods.\n", __func__);
> +	}
>  	rcu_for_each_leaf_node(rnp) {
>  		cond_resched_tasks_rcu_qs();
>  		mask = 0;
> @@ -2643,6 +2651,20 @@ static void check_cb_ovld(struct rcu_dat
>  	raw_spin_unlock_rcu_node(rnp);
>  }
>  
> +static void rcu_mph_info(void *data)

This pointer will always be &rcu_state, so why not ignore the pointer
and use "rcu_state" below?

RCU grace periods are inherently global, so I don't know of any way
for RCU to focus on a given NUMA node.  All or nothing.  But on the
other hand, speeding up RCU grace periods will also help specific
NUMA nodes, so I believe that it is all good.

> +{
> +	struct rcu_state *state = data;
> +
> +	WRITE_ONCE(state->mph_end, jiffies + HZ / 10);
> +	smp_store_release(&state->mph, true);
> +	rcu_force_quiescent_state();
> +}
> +
> +static struct mph_subscriber rcu_mph_subscriber = {
> +	.info = rcu_mph_info,
> +	.data = &rcu_state,

Then this ".data" entry can be omitted, correct?

> +};
> +
>  /* Helper function for call_rcu() and friends.  */
>  static void
>  __call_rcu(struct rcu_head *head, rcu_callback_t func)
> @@ -4036,6 +4058,8 @@ void __init rcu_init(void)
>  		qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
>  	else
>  		qovld_calc = qovld;
> +
> +	mph_subscribe(&rcu_mph_subscriber);
>  }
>  
>  #include "tree_stall.h"
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode
  2020-05-07 15:49   ` Paul E. McKenney
@ 2020-05-08 13:37     ` Hillf Danton
  2020-05-08 14:47       ` Paul E. McKenney
  0 siblings, 1 reply; 4+ messages in thread
From: Hillf Danton @ 2020-05-08 13:37 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: Hillf Danton, linux-kernel, rcu, akpm, linux-mm


On Thu, 7 May 2020 08:49:10 Paul E. McKenney wrote:
>  
> > +static void rcu_mph_info(void *data)
>
> This pointer will always be &rcu_state, so why not ignore the pointer
> and use "rcu_state" below?
>
Yes you're right.

> RCU grace periods are inherently global, so I don't know of any way
> for RCU to focus on a given NUMA node.  All or nothing.

Or is it feasible to expose certain RCU thing to VM, say, with which kswapd
can kick grace period every time the kthreads think it's needed? That way
the work to gauge memory pressure can be off RCU's shoulders.

> But on the
> other hand, speeding up RCU grace periods will also help specific
> NUMA nodes, so I believe that it is all good.
> 
> > +{
> > +	struct rcu_state *state = data;
> > +
> > +	WRITE_ONCE(state->mph_end, jiffies + HZ / 10);
> > +	smp_store_release(&state->mph, true);
> > +	rcu_force_quiescent_state();
> > +}
> > +
> > +static struct mph_subscriber rcu_mph_subscriber = {
> > +	.info = rcu_mph_info,
> > +	.data = &rcu_state,
> 
> Then this ".data" entry can be omitted, correct?

Yes :)

Hillf



^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode
  2020-05-08 13:37     ` Hillf Danton
@ 2020-05-08 14:47       ` Paul E. McKenney
  0 siblings, 0 replies; 4+ messages in thread
From: Paul E. McKenney @ 2020-05-08 14:47 UTC (permalink / raw)
  To: Hillf Danton; +Cc: linux-kernel, rcu, akpm, linux-mm

On Fri, May 08, 2020 at 09:37:43PM +0800, Hillf Danton wrote:
> 
> On Thu, 7 May 2020 08:49:10 Paul E. McKenney wrote:
> >  
> > > +static void rcu_mph_info(void *data)
> >
> > This pointer will always be &rcu_state, so why not ignore the pointer
> > and use "rcu_state" below?
> >
> Yes you're right.
> 
> > RCU grace periods are inherently global, so I don't know of any way
> > for RCU to focus on a given NUMA node.  All or nothing.
> 
> Or is it feasible to expose certain RCU thing to VM, say, with which kswapd
> can kick grace period every time the kthreads think it's needed? That way
> the work to gauge memory pressure can be off RCU's shoulders.

A pair of functions RCU provides is easy for me.  ;-)

							Thanx, Paul

> > But on the
> > other hand, speeding up RCU grace periods will also help specific
> > NUMA nodes, so I believe that it is all good.
> > 
> > > +{
> > > +	struct rcu_state *state = data;
> > > +
> > > +	WRITE_ONCE(state->mph_end, jiffies + HZ / 10);
> > > +	smp_store_release(&state->mph, true);
> > > +	rcu_force_quiescent_state();
> > > +}
> > > +
> > > +static struct mph_subscriber rcu_mph_subscriber = {
> > > +	.info = rcu_mph_info,
> > > +	.data = &rcu_state,
> > 
> > Then this ".data" entry can be omitted, correct?
> 
> Yes :)
> 
> Hillf
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-05-08 14:47 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20200507004240.GA9156@paulmck-ThinkPad-P72>
2020-05-07  9:36 ` [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode Hillf Danton
2020-05-07 15:49   ` Paul E. McKenney
2020-05-08 13:37     ` Hillf Danton
2020-05-08 14:47       ` Paul E. McKenney

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).