linux-kernel.vger.kernel.org archive mirror
* [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
@ 2020-10-29 16:50 Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 02/16] lib/debug: Remove pointless ARCH_NO_PREEMPT dependencies Uladzislau Rezki (Sony)
                   ` (16 more replies)
  0 siblings, 17 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko

The current memory-allocation interface presents the following
difficulties that this patch is designed to overcome:

a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, lockdep will
   complain about a violation ("BUG: Invalid wait context") of the
   nesting rules. It performs raw_spinlock vs. spinlock nesting
   checks, i.e. it is not legal to acquire a spinlock_t while
   holding a raw_spinlock_t.

   Internally kfree_rcu() uses a raw_spinlock_t, whereas the page
   allocator uses spinlock_t to access its zones. The code can also
   be broken from a higher-level point of view:
   <snip>
       raw_spin_lock(&some_lock);
       kfree_rcu(some_pointer, some_field_offset);
   <snip>

b) If built with CONFIG_PREEMPT_RT, spinlock_t is converted into a
   sleepable variant. Invoking the page allocator from atomic
   contexts then leads to "BUG: scheduling while atomic".
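
   As an illustration only (the lock name is hypothetical), such a
   splat can be triggered like this, because the allocator takes
   zone->lock, which becomes a sleeping lock on PREEMPT_RT:
   <snip>
       raw_spin_lock(&some_lock);
       page = __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
   <snip>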

c) call_rcu() is invoked from raw atomic context, and kfree_rcu()
   and kvfree_rcu() are expected to be callable from raw atomic
   context as well.

Move the page allocation out of the contexts which trigger the
kvfree_rcu() function and into a separate worker. When the
k[v]free_rcu() per-CPU page cache is empty, a fallback mechanism is
used and a special job is scheduled to refill the per-CPU cache, as
sketched below.
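
A rough sketch of that refill path (the function names are the ones
introduced by the diff below):
<snip>
    kvfree_call_rcu()               /* may run with raw locks held  */
      run_page_cache_worker()       /* arms a one-shot hrtimer      */
        schedule_page_work_fn()     /* queues page_cache_work       */
          fill_page_cache_func()    /* refills cache via GFP_KERNEL */
<snip>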

Link: https://lore.kernel.org/lkml/20200630164543.4mdcf6zb4zfclhln@linutronix.de/
Fixes: 3042f83f19be ("rcu: Support reclaim for head-less object")
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 43 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 06895ef85d69..f2da2a1cc716 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
  * per-CPU. Object size is equal to one page. This value
  * can be changed at boot time.
  */
-static int rcu_min_cached_objs = 2;
+static int rcu_min_cached_objs = 5;
 module_param(rcu_min_cached_objs, int, 0444);
 
 /* Retrieve RCU kthreads priority for rcutorture */
@@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
  *	In order to save some per-cpu space the list is singular.
  *	Even though it is lockless an access has to be protected by the
  *	per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
  * @nr_bkv_objs: number of allocated objects at @bkvcache.
  *
  * This is a per-CPU structure.  The reason that it is not included in
@@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
 	bool monitor_todo;
 	bool initialized;
 	int count;
+
+	struct work_struct page_cache_work;
+	atomic_t work_in_progress;
+	struct hrtimer hrtimer;
+
 	struct llist_head bkvcache;
 	int nr_bkv_objs;
 };
@@ -3217,10 +3225,10 @@ static void kfree_rcu_work(struct work_struct *work)
 			}
 			rcu_lock_release(&rcu_callback_map);
 
-			krcp = krc_this_cpu_lock(&flags);
+			raw_spin_lock_irqsave(&krcp->lock, flags);
 			if (put_cached_bnode(krcp, bkvhead[i]))
 				bkvhead[i] = NULL;
-			krc_this_cpu_unlock(krcp, flags);
+			raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
 			if (bkvhead[i])
 				free_page((unsigned long) bkvhead[i]);
@@ -3347,6 +3355,57 @@ static void kfree_rcu_monitor(struct work_struct *work)
 		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+	struct kfree_rcu_cpu *krcp =
+		container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+	queue_work(system_highpri_wq, &krcp->page_cache_work);
+	return HRTIMER_NORESTART;
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+	struct kvfree_rcu_bulk_data *bnode;
+	struct kfree_rcu_cpu *krcp =
+		container_of(work, struct kfree_rcu_cpu,
+			page_cache_work);
+	unsigned long flags;
+	bool pushed;
+	int i;
+
+	for (i = 0; i < rcu_min_cached_objs; i++) {
+		bnode = (struct kvfree_rcu_bulk_data *)
+			__get_free_page(GFP_KERNEL | __GFP_NOWARN);
+
+		if (bnode) {
+			raw_spin_lock_irqsave(&krcp->lock, flags);
+			pushed = put_cached_bnode(krcp, bnode);
+			raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+			if (!pushed) {
+				free_page((unsigned long) bnode);
+				break;
+			}
+		}
+	}
+
+	atomic_set(&krcp->work_in_progress, 0);
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+			!atomic_xchg(&krcp->work_in_progress, 1)) {
+		hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
+			HRTIMER_MODE_REL);
+		krcp->hrtimer.function = schedule_page_work_fn;
+		hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+	}
+}
+
 static inline bool
 kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 {
@@ -3363,32 +3422,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 	if (!krcp->bkvhead[idx] ||
 			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
 		bnode = get_cached_bnode(krcp);
-		if (!bnode) {
-			/*
-			 * To keep this path working on raw non-preemptible
-			 * sections, prevent the optional entry into the
-			 * allocator as it uses sleeping locks. In fact, even
-			 * if the caller of kfree_rcu() is preemptible, this
-			 * path still is not, as krcp->lock is a raw spinlock.
-			 * With additional page pre-allocation in the works,
-			 * hitting this return is going to be much less likely.
-			 */
-			if (IS_ENABLED(CONFIG_PREEMPT_RT))
-				return false;
-
-			/*
-			 * NOTE: For one argument of kvfree_rcu() we can
-			 * drop the lock and get the page in sleepable
-			 * context. That would allow to maintain an array
-			 * for the CONFIG_PREEMPT_RT as well if no cached
-			 * pages are available.
-			 */
-			bnode = (struct kvfree_rcu_bulk_data *)
-				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-		}
-
 		/* Switch to emergency path. */
-		if (unlikely(!bnode))
+		if (!bnode)
 			return false;
 
 		/* Initialize the new block. */
@@ -3452,12 +3487,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 		goto unlock_return;
 	}
 
-	/*
-	 * Under high memory pressure GFP_NOWAIT can fail,
-	 * in that case the emergency path is maintained.
-	 */
 	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
 	if (!success) {
+		run_page_cache_worker(krcp);
+
 		if (head == NULL)
 			// Inline if kvfree_rcu(one_arg) call.
 			goto unlock_return;
@@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-		struct kvfree_rcu_bulk_data *bnode;
 
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
 		}
 
-		for (i = 0; i < rcu_min_cached_objs; i++) {
-			bnode = (struct kvfree_rcu_bulk_data *)
-				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-
-			if (bnode)
-				put_cached_bnode(krcp, bnode);
-			else
-				pr_err("Failed to preallocate for %d CPU!\n", cpu);
-		}
-
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
 		krcp->initialized = true;
 	}
 	if (register_shrinker(&kfree_rcu_shrinker))
-- 
2.20.1



* [PATCH 02/16] lib/debug: Remove pointless ARCH_NO_PREEMPT dependencies
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 03/16] preempt: Make preempt count unconditional Uladzislau Rezki (Sony)
                   ` (15 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko

From: Thomas Gleixner <tglx@linutronix.de>

ARCH_NO_PREEMPT disables the selection of CONFIG_PREEMPT_VOLUNTARY and
CONFIG_PREEMPT, but architectures which set this config option still
support preempt count for hard and softirq accounting.

There is absolutely no reason to prevent lockdep from using the preempt
counter, nor is there a reason to prevent the enablement of
CONFIG_DEBUG_ATOMIC_SLEEP on such architectures.

Remove the dependencies.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 lib/Kconfig.debug | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d7a7bc3b6098..89c9a177fb9b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1159,7 +1159,7 @@ config PROVE_LOCKING
 	select DEBUG_RWSEMS
 	select DEBUG_WW_MUTEX_SLOWPATH
 	select DEBUG_LOCK_ALLOC
-	select PREEMPT_COUNT if !ARCH_NO_PREEMPT
+	select PREEMPT_COUNT
 	select TRACE_IRQFLAGS
 	default n
 	help
@@ -1321,7 +1321,6 @@ config DEBUG_ATOMIC_SLEEP
 	bool "Sleep inside atomic section checking"
 	select PREEMPT_COUNT
 	depends on DEBUG_KERNEL
-	depends on !ARCH_NO_PREEMPT
 	help
 	  If you say Y here, various routines which may sleep will become very
 	  noisy if they are called inside atomic sections: when a spinlock is
-- 
2.20.1



* [PATCH 03/16] preempt: Make preempt count unconditional
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 02/16] lib/debug: Remove pointless ARCH_NO_PREEMPT dependencies Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 04/16] preempt: Cleanup PREEMPT_COUNT leftovers Uladzislau Rezki (Sony)
                   ` (14 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko

From: Thomas Gleixner <tglx@linutronix.de>

The handling of preempt_count() is inconsistent across kernel
configurations. On kernels which have PREEMPT_COUNT=n,
preempt_disable/enable() and the lock/unlock functions do not affect
the preempt count; only local_bh_disable/enable() and the _bh variants
of locking, soft interrupt delivery, hard interrupt and NMI context
affect it.

It's therefore impossible to have a consistent set of checks which
provide information about the context in which a function is called.
In many cases it makes sense to have separate functions for separate
contexts, but there are valid reasons to avoid that and handle
different calling contexts conditionally.

The lack of such indicators which work on all kernel configurations is
a constant source of trouble because developers either do not
understand the implications or try to work around this inconsistency
in weird ways. Nor do these issues seem to be caught by reviewers and
testing.

Recently merged code does:

	 gfp = preemptible() ? GFP_KERNEL : GFP_ATOMIC;

This looks obviously correct, except for the fact that preemptible()
is unconditionally false for CONFIG_PREEMPT_COUNT=n, i.e. all
allocations in that code use GFP_ATOMIC on such kernels.
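
The reason is the !CONFIG_PREEMPT_COUNT stub in include/linux/preempt.h
(removed later in this series):

	 #define preemptible()	0

so the GFP_KERNEL branch above can never be taken on such kernels.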

Attempts to make preempt count unconditional and consistent have been
rejected in the past with handwaving performance arguments.

Freshly conducted benchmarks did not reveal any measurable impact from
enabling the preempt count unconditionally. On kernels with
CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY the preempt count is
only incremented and decremented, but the result of the decrement is
not tested. Contrary to that, enabling CONFIG_PREEMPT, which tests the
result, has a small but measurable impact due to the conditional
branch/call.
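
For reference, the two flavours of preempt_enable() look roughly as
follows (paraphrased from include/linux/preempt.h; exact guards differ
slightly across kernel versions):

	/* CONFIG_PREEMPT_NONE / CONFIG_PREEMPT_VOLUNTARY: count only */
	#define preempt_enable() \
	do { \
		barrier(); \
		preempt_count_dec(); \
	} while (0)

	/* CONFIG_PREEMPT: the decrement result is tested */
	#define preempt_enable() \
	do { \
		barrier(); \
		if (unlikely(preempt_count_dec_and_test())) \
			__preempt_schedule(); \
	} while (0)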

It's about time to make essential functionality of the kernel
consistent across the various preemption models.

Enable CONFIG_PREEMPT_COUNT unconditionally. Follow up changes will remove
the #ifdeffery and remove the config option at the end.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/Kconfig.preempt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf82259cff96..3f4712ff073b 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -75,8 +75,7 @@ config PREEMPT_RT
 endchoice
 
 config PREEMPT_COUNT
-       bool
+       def_bool y
 
 config PREEMPTION
        bool
-       select PREEMPT_COUNT
-- 
2.20.1



* [PATCH 04/16] preempt: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 02/16] lib/debug: Remove pointless ARCH_NO_PREEMPT dependencies Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 03/16] preempt: Make preempt count unconditional Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 05/16] lockdep: " Uladzislau Rezki (Sony)
                   ` (13 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/preempt.h | 37 ++++---------------------------------
 1 file changed, 4 insertions(+), 33 deletions(-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 7d9c1c0e149c..513769b1edf8 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -56,8 +56,7 @@
 #define PREEMPT_DISABLED	(PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 
 /*
- * Disable preemption until the scheduler is running -- use an unconditional
- * value so that it also works on !PREEMPT_COUNT kernels.
+ * Disable preemption until the scheduler is running.
  *
  * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
  */
@@ -69,7 +68,6 @@
  *
  *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
  *
- * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
  * Note: See finish_task_switch().
  */
 #define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
@@ -106,11 +104,7 @@
 /*
  * The preempt_count offset after preempt_disable();
  */
-#if defined(CONFIG_PREEMPT_COUNT)
-# define PREEMPT_DISABLE_OFFSET	PREEMPT_OFFSET
-#else
-# define PREEMPT_DISABLE_OFFSET	0
-#endif
+#define PREEMPT_DISABLE_OFFSET	PREEMPT_OFFSET
 
 /*
  * The preempt_count offset after spin_lock()
@@ -122,8 +116,8 @@
  *
  *  spin_lock_bh()
  *
- * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
- * softirqs, such that unlock sequences of:
+ * Which need to disable both preemption and softirqs, such that unlock
+ * sequences of:
  *
  *  spin_unlock();
  *  local_bh_enable();
@@ -164,8 +158,6 @@ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
 
-#ifdef CONFIG_PREEMPT_COUNT
-
 #define preempt_disable() \
 do { \
 	preempt_count_inc(); \
@@ -231,27 +223,6 @@ do { \
 	__preempt_count_dec(); \
 } while (0)
 
-#else /* !CONFIG_PREEMPT_COUNT */
-
-/*
- * Even if we don't have any preemption, we need preempt disable/enable
- * to be barriers, so that we don't have things like get_user/put_user
- * that can cause faults and scheduling migrate into our preempt-protected
- * region.
- */
-#define preempt_disable()			barrier()
-#define sched_preempt_enable_no_resched()	barrier()
-#define preempt_enable_no_resched()		barrier()
-#define preempt_enable()			barrier()
-#define preempt_check_resched()			do { } while (0)
-
-#define preempt_disable_notrace()		barrier()
-#define preempt_enable_no_resched_notrace()	barrier()
-#define preempt_enable_notrace()		barrier()
-#define preemptible()				0
-
-#endif /* CONFIG_PREEMPT_COUNT */
-
 #ifdef MODULE
 /*
  * Modules have no business playing preemption tricks.
-- 
2.20.1



* [PATCH 05/16] lockdep: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (2 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 04/16] preempt: Cleanup PREEMPT_COUNT leftovers Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 06/16] mm/pagemap: " Uladzislau Rezki (Sony)
                   ` (12 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko, Ingo Molnar, Will Deacon

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Will Deacon <will@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
[ Rezki: Adapted for the 5.10.0-rc1 kernel. ]
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/lockdep.h | 6 ++----
 lib/Kconfig.debug       | 1 -
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index f5594879175a..d05db575f60f 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -580,16 +580,14 @@ do {									\
 
 #define lockdep_assert_preemption_enabled()				\
 do {									\
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)	&&		\
-		     __lockdep_enabled			&&		\
+	WARN_ON_ONCE(__lockdep_enabled			&&		\
 		     (preempt_count() != 0		||		\
 		      !this_cpu_read(hardirqs_enabled)));		\
 } while (0)
 
 #define lockdep_assert_preemption_disabled()				\
 do {									\
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)	&&		\
-		     __lockdep_enabled			&&		\
+	WARN_ON_ONCE(__lockdep_enabled			&&		\
 		     (preempt_count() == 0		&&		\
 		      this_cpu_read(hardirqs_enabled)));		\
 } while (0)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 89c9a177fb9b..03a85065805e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1159,7 +1159,6 @@ config PROVE_LOCKING
 	select DEBUG_RWSEMS
 	select DEBUG_WW_MUTEX_SLOWPATH
 	select DEBUG_LOCK_ALLOC
-	select PREEMPT_COUNT
 	select TRACE_IRQFLAGS
 	default n
 	help
-- 
2.20.1



* [PATCH 06/16] mm/pagemap: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (3 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 05/16] lockdep: " Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 20:57   ` Uladzislau Rezki
  2020-10-29 16:50 ` [PATCH 07/16] locking/bitspinlock: " Uladzislau Rezki (Sony)
                   ` (11 subsequent siblings)
  16 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko, linux-mm

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/pagemap.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c77b7c31b2e4..cbfbe2bcca75 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -204,9 +204,7 @@ void release_pages(struct page **pages, int nr);
 static inline int __page_cache_add_speculative(struct page *page, int count)
 {
 #ifdef CONFIG_TINY_RCU
-# ifdef CONFIG_PREEMPT_COUNT
-	VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
+	VM_BUG_ON(preemptible())
 	/*
 	 * Preempt must be disabled here - we rely on rcu_read_lock doing
 	 * this for us.
-- 
2.20.1



* [PATCH 07/16] locking/bitspinlock: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (4 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 06/16] mm/pagemap: " Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 08/16] uaccess: " Uladzislau Rezki (Sony)
                   ` (10 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko, Will Deacon

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/bit_spinlock.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h
index bbc4730a6505..1e03d54b0b6f 100644
--- a/include/linux/bit_spinlock.h
+++ b/include/linux/bit_spinlock.h
@@ -90,10 +90,8 @@ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
 {
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 	return test_bit(bitnum, addr);
-#elif defined CONFIG_PREEMPT_COUNT
-	return preempt_count();
 #else
-	return 1;
+	return preempt_count();
 #endif
 }
 
-- 
2.20.1



* [PATCH 08/16] uaccess: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (5 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 07/16] locking/bitspinlock: " Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 09/16] sched: " Uladzislau Rezki (Sony)
                   ` (9 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/uaccess.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index c7c6e8b8344d..d6473a72a336 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -275,9 +275,9 @@ static inline bool pagefault_disabled(void)
  *
  * This function should only be used by the fault handlers. Other users should
  * stick to pagefault_disabled().
- * Please NEVER use preempt_disable() to disable the fault handler. With
- * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
- * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
+ *
+ * Please NEVER use preempt_disable() or local_irq_disable() to disable the
+ * fault handler.
  */
 #define faulthandler_disabled() (pagefault_disabled() || in_atomic())
 
-- 
2.20.1



* [PATCH 09/16] sched: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (6 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 08/16] uaccess: " Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 10/16] ARM: " Uladzislau Rezki (Sony)
                   ` (8 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/sched/core.c | 6 +-----
 lib/Kconfig.debug   | 1 -
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2003a7d5ab5..e172f2ddfa16 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3702,8 +3702,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	 * finish_task_switch() for details.
 	 *
 	 * finish_task_switch() will drop rq->lock() and lower preempt_count
-	 * and the preempt_enable() will end up enabling preemption (on
-	 * PREEMPT_COUNT kernels).
+	 * and the preempt_enable() will end up enabling preemption.
 	 */
 
 	rq = finish_task_switch(prev);
@@ -7307,9 +7306,6 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
 	if (irqs_disabled())
 		return;
 
-	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
-		return;
-
 	if (preempt_count() > preempt_offset)
 		return;
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 03a85065805e..d62806c81f6d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1318,7 +1318,6 @@ config DEBUG_LOCKDEP
 
 config DEBUG_ATOMIC_SLEEP
 	bool "Sleep inside atomic section checking"
-	select PREEMPT_COUNT
 	depends on DEBUG_KERNEL
 	help
 	  If you say Y here, various routines which may sleep will become very
-- 
2.20.1



* [PATCH 10/16] ARM: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (7 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 09/16] sched: " Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 11/16] xtensa: " Uladzislau Rezki (Sony)
                   ` (7 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko, Russell King,
	linux-arm-kernel

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 arch/arm/include/asm/assembler.h   | 11 -----------
 arch/arm/kernel/iwmmxt.S           |  2 --
 arch/arm/mach-ep93xx/crunch-bits.S |  2 --
 3 files changed, 15 deletions(-)

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index feac2c8b86f2..fce52eedc6e1 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -212,7 +212,6 @@
 /*
  * Increment/decrement the preempt count.
  */
-#ifdef CONFIG_PREEMPT_COUNT
 	.macro	inc_preempt_count, ti, tmp
 	ldr	\tmp, [\ti, #TI_PREEMPT]	@ get preempt count
 	add	\tmp, \tmp, #1			@ increment it
@@ -229,16 +228,6 @@
 	get_thread_info \ti
 	dec_preempt_count \ti, \tmp
 	.endm
-#else
-	.macro	inc_preempt_count, ti, tmp
-	.endm
-
-	.macro	dec_preempt_count, ti, tmp
-	.endm
-
-	.macro	dec_preempt_count_ti, ti, tmp
-	.endm
-#endif
 
 #define USERL(l, x...)				\
 9999:	x;					\
diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S
index 0dcae787b004..1f845be78c43 100644
--- a/arch/arm/kernel/iwmmxt.S
+++ b/arch/arm/kernel/iwmmxt.S
@@ -94,9 +94,7 @@ ENTRY(iwmmxt_task_enable)
 	mov	r2, r2				@ cpwait
 	bl	concan_save
 
-#ifdef CONFIG_PREEMPT_COUNT
 	get_thread_info r10
-#endif
 4:	dec_preempt_count r10, r3
 	ret	r9				@ normal exit from exception
 
diff --git a/arch/arm/mach-ep93xx/crunch-bits.S b/arch/arm/mach-ep93xx/crunch-bits.S
index fb2dbf76f09e..0aabcf4ebe8e 100644
--- a/arch/arm/mach-ep93xx/crunch-bits.S
+++ b/arch/arm/mach-ep93xx/crunch-bits.S
@@ -191,9 +191,7 @@ crunch_load:
 	cfldr64		mvdx15, [r0, #CRUNCH_MVDX15]
 
 1:
-#ifdef CONFIG_PREEMPT_COUNT
 	get_thread_info r10
-#endif
 2:	dec_preempt_count r10, r3
 	ret	lr
 
-- 
2.20.1



* [PATCH 11/16] xtensa: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (8 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 10/16] ARM: " Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 12/16] drm/i915: " Uladzislau Rezki (Sony)
                   ` (6 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko, Chris Zankel,
	Max Filippov, linux-xtensa

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Zankel <chris@zankel.net>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: linux-xtensa@linux-xtensa.org
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 arch/xtensa/kernel/entry.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S
index 703cf6205efe..5a27dd34d3fc 100644
--- a/arch/xtensa/kernel/entry.S
+++ b/arch/xtensa/kernel/entry.S
@@ -819,7 +819,7 @@ ENTRY(debug_exception)
 	 * preemption if we have HW breakpoints to preserve DEBUGCAUSE.DBNUM
 	 * meaning.
 	 */
-#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_HAVE_HW_BREAKPOINT)
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
 	GET_THREAD_INFO(a2, a1)
 	l32i	a3, a2, TI_PRE_COUNT
 	addi	a3, a3, 1
-- 
2.20.1



* [PATCH 12/16] drm/i915: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (9 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 11/16] xtensa: " Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 13/16] rcutorture: " Uladzislau Rezki (Sony)
                   ` (5 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko, Jani Nikula,
	Joonas Lahtinen, Rodrigo Vivi, David Airlie, Daniel Vetter,
	intel-gfx, dri-devel

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: intel-gfx@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 drivers/gpu/drm/i915/Kconfig.debug | 1 -
 drivers/gpu/drm/i915/i915_utils.h  | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug
index 25cd9788a4d5..b149f76d3ccd 100644
--- a/drivers/gpu/drm/i915/Kconfig.debug
+++ b/drivers/gpu/drm/i915/Kconfig.debug
@@ -20,7 +20,6 @@ config DRM_I915_DEBUG
 	bool "Enable additional driver debugging"
 	depends on DRM_I915
 	select DEBUG_FS
-	select PREEMPT_COUNT
 	select I2C_CHARDEV
 	select STACKDEPOT
 	select DRM_DP_AUX_CHARDEV
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 54773371e6bd..ecfed860b3f6 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -337,8 +337,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
 						   (Wmax))
 #define wait_for(COND, MS)		_wait_for((COND), (MS) * 1000, 10, 1000)
 
-/* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */
-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT)
+#ifdef CONFIG_DRM_I915_DEBUG
 # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic())
 #else
 # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0)
-- 
2.20.1



* [PATCH 13/16] rcutorture: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (10 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 12/16] drm/i915: " Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 14/16] preempt: Remove PREEMPT_COUNT from Kconfig Uladzislau Rezki (Sony)
                   ` (4 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko, Josh Triplett,
	Steven Rostedt, Mathieu Desnoyers, Lai Jiangshan, Shuah Khan,
	linux-kselftest

From: Thomas Gleixner <tglx@linutronix.de>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: rcu@vger.kernel.org
Cc: linux-kselftest@vger.kernel.org
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 tools/testing/selftests/rcutorture/configs/rcu/SRCU-t        | 1 -
 tools/testing/selftests/rcutorture/configs/rcu/SRCU-u        | 1 -
 tools/testing/selftests/rcutorture/configs/rcu/TINY01        | 1 -
 tools/testing/selftests/rcutorture/doc/TINY_RCU.txt          | 5 ++---
 tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt  | 1 -
 .../selftests/rcutorture/formal/srcu-cbmc/src/config.h       | 1 -
 6 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
index 6c78022c8cd8..553cf6534e67 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
@@ -7,4 +7,3 @@ CONFIG_RCU_TRACE=n
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_DEBUG_ATOMIC_SLEEP=y
-#CHECK#CONFIG_PREEMPT_COUNT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
index c15ada821e45..99563da21732 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
@@ -7,4 +7,3 @@ CONFIG_RCU_TRACE=n
 CONFIG_DEBUG_LOCK_ALLOC=y
 CONFIG_PROVE_LOCKING=y
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
-CONFIG_PREEMPT_COUNT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY01 b/tools/testing/selftests/rcutorture/configs/rcu/TINY01
index 6db705e55487..9b22b8e768d5 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY01
@@ -10,4 +10,3 @@ CONFIG_RCU_TRACE=n
 #CHECK#CONFIG_RCU_STALL_COMMON=n
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
-CONFIG_PREEMPT_COUNT=n
diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
index a75b16991a92..d30cedf07826 100644
--- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
+++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
@@ -3,11 +3,10 @@ This document gives a brief rationale for the TINY_RCU test cases.
 
 Kconfig Parameters:
 
-CONFIG_DEBUG_LOCK_ALLOC -- Do all three and none of the three.
-CONFIG_PREEMPT_COUNT
+CONFIG_DEBUG_LOCK_ALLOC -- Do both and none of the two.
 CONFIG_RCU_TRACE
 
-The theory here is that randconfig testing will hit the other six possible
+The theory here is that randconfig testing will hit the other two possible
 combinations of these parameters.
 
 
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 1b96d68473b8..cfdd48f689de 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -43,7 +43,6 @@ CONFIG_64BIT
 
 	Used only to check CONFIG_RCU_FANOUT value, inspection suffices.
 
-CONFIG_PREEMPT_COUNT
 CONFIG_PREEMPT_RCU
 
 	Redundant with CONFIG_PREEMPT, ignore.
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
index 283d7103334f..d0d485d48649 100644
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
@@ -8,7 +8,6 @@
 #undef CONFIG_HOTPLUG_CPU
 #undef CONFIG_MODULES
 #undef CONFIG_NO_HZ_FULL_SYSIDLE
-#undef CONFIG_PREEMPT_COUNT
 #undef CONFIG_PREEMPT_RCU
 #undef CONFIG_PROVE_RCU
 #undef CONFIG_RCU_NOCB_CPU
-- 
2.20.1



* [PATCH 14/16] preempt: Remove PREEMPT_COUNT from Kconfig
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (11 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 13/16] rcutorture: " Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 16:50 ` [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible Uladzislau Rezki (Sony)
                   ` (3 subsequent siblings)
  16 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko

From: Thomas Gleixner <tglx@linutronix.de>

All conditionals and irritations are gone.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/Kconfig.preempt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 3f4712ff073b..120b63f0c55a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -74,8 +74,5 @@ config PREEMPT_RT
 
 endchoice
 
-config PREEMPT_COUNT
-       def_bool y
-
 config PREEMPTION
        bool
-- 
2.20.1



* [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (12 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 14/16] preempt: Remove PREEMPT_COUNT from Kconfig Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-11-03 18:03   ` Joel Fernandes
  2020-10-29 16:50 ` [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache Uladzislau Rezki (Sony)
                   ` (2 subsequent siblings)
  16 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko

Given that CONFIG_PREEMPT_COUNT is unconditionally enabled by the
earlier commits in this series, the preemptible() macro now properly
detects preempt-disable code regions even in kernels built with
CONFIG_PREEMPT_NONE.

This commit therefore uses preemptible() to determine whether allocation
is possible at all for double-argument kvfree_rcu().  If !preemptible(),
then allocation is not possible, and kvfree_rcu() falls back to using
the less cache-friendly rcu_head approach.  Even when preemptible(),
the caller might be involved in reclaim, so the GFP_ flags used by
double-argument kvfree_rcu() must avoid invoking reclaim processing.

Note that single-argument kvfree_rcu() must be invoked in sleepable
contexts, and that its fallback is the relatively high latency
synchronize_rcu().  Single-argument kvfree_rcu() therefore uses
GFP_KERNEL|__GFP_RETRY_MAYFAIL to allow limited sleeping within the
memory allocator.
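
As a brief illustration of the two call forms described above (the
variable and field names are illustrative only):

	// Double-argument form: callable from any context.  If the
	// caller is !preemptible() and the per-CPU cache is empty,
	// it falls back to using the object's embedded rcu_head.
	kvfree_rcu(p, rh);

	// Single-argument (headless) form: sleepable context only,
	// which permits GFP_KERNEL | __GFP_RETRY_MAYFAIL; the fallback
	// is a direct synchronize_rcu() followed by kvfree().
	kvfree_rcu(p);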

[ paulmck: Add add_ptr_to_bulk_krc_lock header comment per Michal Hocko. ]
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 48 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f2da2a1cc716..3f9b016a44dc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3406,37 +3406,55 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
 	}
 }
 
+// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+// state specified by flags.  If can_sleep is true, the caller must
+// be schedulable and not be holding any locks or mutexes that might be
+// acquired by the memory allocator or anything that it might invoke.
+// If !can_sleep, then if !preemptible() no allocation will be undertaken,
+// otherwise the allocation will use GFP_ATOMIC to avoid the remainder of
+// the aforementioned deadlock possibilities.  Returns true if ptr was
+// successfully recorded, else the caller must use a fallback.
 static inline bool
-kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+	unsigned long *flags, void *ptr, bool can_sleep)
 {
 	struct kvfree_rcu_bulk_data *bnode;
+	bool can_alloc_page = preemptible();
+	gfp_t gfp = (can_sleep ? GFP_KERNEL | __GFP_RETRY_MAYFAIL :
+		GFP_ATOMIC) | __GFP_NOWARN;
 	int idx;
 
-	if (unlikely(!krcp->initialized))
+	*krcp = krc_this_cpu_lock(flags);
+	if (unlikely(!(*krcp)->initialized))
 		return false;
 
-	lockdep_assert_held(&krcp->lock);
 	idx = !!is_vmalloc_addr(ptr);
 
 	/* Check if a new block is required. */
-	if (!krcp->bkvhead[idx] ||
-			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
-		bnode = get_cached_bnode(krcp);
-		/* Switch to emergency path. */
+	if (!(*krcp)->bkvhead[idx] ||
+			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+		bnode = get_cached_bnode(*krcp);
+		if (!bnode && can_alloc_page) {
+			krc_this_cpu_unlock(*krcp, *flags);
+			bnode = (struct kvfree_rcu_bulk_data *)
+				__get_free_page(gfp);
+			*krcp = krc_this_cpu_lock(flags);
+		}
+
 		if (!bnode)
 			return false;
 
 		/* Initialize the new block. */
 		bnode->nr_records = 0;
-		bnode->next = krcp->bkvhead[idx];
+		bnode->next = (*krcp)->bkvhead[idx];
 
 		/* Attach it to the head. */
-		krcp->bkvhead[idx] = bnode;
+		(*krcp)->bkvhead[idx] = bnode;
 	}
 
 	/* Finally insert. */
-	krcp->bkvhead[idx]->records
-		[krcp->bkvhead[idx]->nr_records++] = ptr;
+	(*krcp)->bkvhead[idx]->records
+		[(*krcp)->bkvhead[idx]->nr_records++] = ptr;
 
 	return true;
 }
@@ -3474,20 +3492,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 		ptr = (unsigned long *) func;
 	}
 
-	krcp = krc_this_cpu_lock(&flags);
-
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
 		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
 			  __func__, head);
 
-		// Mark as success and leave.
-		success = true;
-		goto unlock_return;
+		return;
 	}
 
-	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
 	if (!success) {
 		run_page_cache_worker(krcp);
 
-- 
2.20.1



* [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (13 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible Uladzislau Rezki (Sony)
@ 2020-10-29 16:50 ` Uladzislau Rezki (Sony)
  2020-10-29 19:47   ` Paul E. McKenney
  2020-11-03 15:47 ` [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Joel Fernandes
  2020-11-03 17:54 ` Joel Fernandes
  16 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki (Sony) @ 2020-10-29 16:50 UTC (permalink / raw)
  To: LKML, RCU, Paul E . McKenney
  Cc: Andrew Morton, Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Joel Fernandes, Sebastian Andrzej Siewior,
	Uladzislau Rezki, Oleksiy Avramchenko

Now that CONFIG_PREEMPT_COUNT is unconditionally enabled, a page
can be obtained directly from a kvfree_rcu() path. The preemptible()
macro is used to detect when it is safe to enter the allocator.

This means that refilling the cache is not time-critical. Switch to
a delayed work, so the actual work is queued from the timer interrupt
with a 1 jiffy delay. Immediately placing a task on the current CPU
can lead to an rq->lock double lock. That is why the delayed method
is in place.
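
In short (taken from the diff below):

	/* Before: a zero-delay hrtimer whose handler queued the work. */
	hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);

	/* After: defer by one jiffy, so the work is queued from the
	 * timer tick rather than from the kvfree_rcu() call site. */
	schedule_delayed_work(&krcp->page_cache_work, 1);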

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/rcu/tree.c | 26 +++++---------------------
 1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3f9b016a44dc..9ea55f800b4b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3086,7 +3086,6 @@ struct kfree_rcu_cpu_work {
  *	per-cpu lock.
  * @page_cache_work: A work to refill the cache when it is empty
  * @work_in_progress: Indicates that page_cache_work is running
- * @hrtimer: A hrtimer for scheduling a page_cache_work
  * @nr_bkv_objs: number of allocated objects at @bkvcache.
  *
  * This is a per-CPU structure.  The reason that it is not included in
@@ -3104,9 +3103,8 @@ struct kfree_rcu_cpu {
 	bool initialized;
 	int count;
 
-	struct work_struct page_cache_work;
+	struct delayed_work page_cache_work;
 	atomic_t work_in_progress;
-	struct hrtimer hrtimer;
 
 	struct llist_head bkvcache;
 	int nr_bkv_objs;
@@ -3355,22 +3353,12 @@ static void kfree_rcu_monitor(struct work_struct *work)
 		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
-static enum hrtimer_restart
-schedule_page_work_fn(struct hrtimer *t)
-{
-	struct kfree_rcu_cpu *krcp =
-		container_of(t, struct kfree_rcu_cpu, hrtimer);
-
-	queue_work(system_highpri_wq, &krcp->page_cache_work);
-	return HRTIMER_NORESTART;
-}
-
 static void fill_page_cache_func(struct work_struct *work)
 {
 	struct kvfree_rcu_bulk_data *bnode;
 	struct kfree_rcu_cpu *krcp =
 		container_of(work, struct kfree_rcu_cpu,
-			page_cache_work);
+			page_cache_work.work);
 	unsigned long flags;
 	bool pushed;
 	int i;
@@ -3398,12 +3386,8 @@ static void
 run_page_cache_worker(struct kfree_rcu_cpu *krcp)
 {
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
-			!atomic_xchg(&krcp->work_in_progress, 1)) {
-		hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
-			HRTIMER_MODE_REL);
-		krcp->hrtimer.function = schedule_page_work_fn;
-		hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
-	}
+			!atomic_xchg(&krcp->work_in_progress, 1))
+		schedule_delayed_work(&krcp->page_cache_work, 1);
 }
 
 // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
@@ -4503,7 +4487,7 @@ static void __init kfree_rcu_batch_init(void)
 		}
 
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
-		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
 		krcp->initialized = true;
 	}
 	if (register_shrinker(&kfree_rcu_shrinker))
-- 
2.20.1



* Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache
  2020-10-29 16:50 ` [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache Uladzislau Rezki (Sony)
@ 2020-10-29 19:47   ` Paul E. McKenney
  2020-10-29 20:13     ` Uladzislau Rezki
  0 siblings, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2020-10-29 19:47 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, RCU, Andrew Morton, Peter Zijlstra, Michal Hocko,
	Thomas Gleixner, Theodore Y . Ts'o, Joel Fernandes,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko

On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> Now that CONFIG_PREEMPT_COUNT is unconditionally enabled, a page
> can be obtained directly from a kvfree_rcu() path. The preemptible()
> macro is used to detect when it is safe to enter the allocator.
> 
> This means that refilling the cache is not time-critical. Switch to
> a delayed work, so the actual work is queued from the timer interrupt
> with a 1 jiffy delay. Immediately placing a task on the current CPU
> can lead to an rq->lock double lock. That is why the delayed method
> is in place.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>

Thank you, Uladzislau!

I applied this on top of v5.10-rc1 and got the following from the
single-CPU builds:

  SYNC    include/config/auto.conf.cmd
  DESCEND  objtool
  CC      kernel/bounds.s
  CALL    scripts/atomic/check-atomics.sh
  UPD     include/generated/bounds.h
  CC      arch/x86/kernel/asm-offsets.s
In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
                 from ./include/linux/atomic.h:82,
                 from ./include/linux/crypto.h:15,
                 from arch/x86/kernel/asm-offsets.c:9:
./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
 #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
                                 ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
 #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
                         ^~~~~~~~~~~~~~~~~~~~
./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
  VM_BUG_ON(preemptible())
  ^~~~~~~~~
scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
Makefile:1199: recipe for target 'prepare0' failed
make: *** [prepare0] Error 2

I vaguely recall something like this showing up in the previous series
and that we did something or another to address it.  Could you please
check against the old series at -rcu branch dev.2020.10.22a?  (I verified
that the old series does run correctly in the single-CPU scenarios.)

							Thanx, Paul

> ---
>  kernel/rcu/tree.c | 26 +++++---------------------
>  1 file changed, 5 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 3f9b016a44dc..9ea55f800b4b 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3086,7 +3086,6 @@ struct kfree_rcu_cpu_work {
>   *	per-cpu lock.
>   * @page_cache_work: A work to refill the cache when it is empty
>   * @work_in_progress: Indicates that page_cache_work is running
> - * @hrtimer: A hrtimer for scheduling a page_cache_work
>   * @nr_bkv_objs: number of allocated objects at @bkvcache.
>   *
>   * This is a per-CPU structure.  The reason that it is not included in
> @@ -3104,9 +3103,8 @@ struct kfree_rcu_cpu {
>  	bool initialized;
>  	int count;
>  
> -	struct work_struct page_cache_work;
> +	struct delayed_work page_cache_work;
>  	atomic_t work_in_progress;
> -	struct hrtimer hrtimer;
>  
>  	struct llist_head bkvcache;
>  	int nr_bkv_objs;
> @@ -3355,22 +3353,12 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  		raw_spin_unlock_irqrestore(&krcp->lock, flags);
>  }
>  
> -static enum hrtimer_restart
> -schedule_page_work_fn(struct hrtimer *t)
> -{
> -	struct kfree_rcu_cpu *krcp =
> -		container_of(t, struct kfree_rcu_cpu, hrtimer);
> -
> -	queue_work(system_highpri_wq, &krcp->page_cache_work);
> -	return HRTIMER_NORESTART;
> -}
> -
>  static void fill_page_cache_func(struct work_struct *work)
>  {
>  	struct kvfree_rcu_bulk_data *bnode;
>  	struct kfree_rcu_cpu *krcp =
>  		container_of(work, struct kfree_rcu_cpu,
> -			page_cache_work);
> +			page_cache_work.work);
>  	unsigned long flags;
>  	bool pushed;
>  	int i;
> @@ -3398,12 +3386,8 @@ static void
>  run_page_cache_worker(struct kfree_rcu_cpu *krcp)
>  {
>  	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> -			!atomic_xchg(&krcp->work_in_progress, 1)) {
> -		hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
> -			HRTIMER_MODE_REL);
> -		krcp->hrtimer.function = schedule_page_work_fn;
> -		hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
> -	}
> +			!atomic_xchg(&krcp->work_in_progress, 1))
> +		schedule_delayed_work(&krcp->page_cache_work, 1);
>  }
>  
>  // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
> @@ -4503,7 +4487,7 @@ static void __init kfree_rcu_batch_init(void)
>  		}
>  
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> -		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> +		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
>  		krcp->initialized = true;
>  	}
>  	if (register_shrinker(&kfree_rcu_shrinker))
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache
  2020-10-29 19:47   ` Paul E. McKenney
@ 2020-10-29 20:13     ` Uladzislau Rezki
  2020-10-29 20:22       ` Uladzislau Rezki
  0 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki @ 2020-10-29 20:13 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki (Sony),
	LKML, RCU, Andrew Morton, Peter Zijlstra, Michal Hocko,
	Thomas Gleixner, Theodore Y . Ts'o, Joel Fernandes,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko

On Thu, Oct 29, 2020 at 12:47:24PM -0700, Paul E. McKenney wrote:
> On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> > A CONFIG_PREEMPT_COUNT is unconditionally enabled, thus a page
> > can be obtained directly from a kvfree_rcu() path. To distinguish
> > that and take a decision the preemptable() macro is used when it
> > is save to enter allocator.
> > 
> > It means that refilling a cache is not important from timing point
> > of view. Switch to a delayed work, so the actual work is queued from
> > the timer interrupt with 1 jiffy delay. An immediate placing a task
> > on a current CPU can lead to rq->lock double lock. That is why a
> > delayed method is in place.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> 
> Thank you, Uladzislau!
> 
> I applied this on top of v5.10-rc1 and got the following from the
> single-CPU builds:
> 
>   SYNC    include/config/auto.conf.cmd
>   DESCEND  objtool
>   CC      kernel/bounds.s
>   CALL    scripts/atomic/check-atomics.sh
>   UPD     include/generated/bounds.h
>   CC      arch/x86/kernel/asm-offsets.s
> In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
>                  from ./include/linux/atomic.h:82,
>                  from ./include/linux/crypto.h:15,
>                  from arch/x86/kernel/asm-offsets.c:9:
> ./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
> ./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
>  #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
>                                  ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> ./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
>  #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
>                          ^~~~~~~~~~~~~~~~~~~~
> ./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
>   VM_BUG_ON(preemptible())
>   ^~~~~~~~~
> scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
> make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
> Makefile:1199: recipe for target 'prepare0' failed
> make: *** [prepare0] Error 2
> 
> I vaguely recall something like this showing up in the previous series
> and that we did something or another to address it.  Could you please
> check against the old series at -rcu branch dev.2020.10.22a?  (I verified
> that the old series does run correctly in the single-CPU scenarios.)
> 
I see the same build error. I will double-check whether we have a similar one
in the previous series as well. It looks like the error is caused by Thomas's series.

Will check!

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache
  2020-10-29 20:13     ` Uladzislau Rezki
@ 2020-10-29 20:22       ` Uladzislau Rezki
  2020-10-29 20:33         ` Paul E. McKenney
  0 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki @ 2020-10-29 20:22 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: LKML, RCU, Andrew Morton, Peter Zijlstra, Michal Hocko,
	Thomas Gleixner, Theodore Y . Ts'o, Joel Fernandes,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko, urezki

On Thu, Oct 29, 2020 at 09:13:42PM +0100, Uladzislau Rezki wrote:
> On Thu, Oct 29, 2020 at 12:47:24PM -0700, Paul E. McKenney wrote:
> > On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> > > A CONFIG_PREEMPT_COUNT is unconditionally enabled, thus a page
> > > can be obtained directly from a kvfree_rcu() path. To distinguish
> > > that and take a decision the preemptable() macro is used when it
> > > is save to enter allocator.
> > > 
> > > It means that refilling a cache is not important from timing point
> > > of view. Switch to a delayed work, so the actual work is queued from
> > > the timer interrupt with 1 jiffy delay. An immediate placing a task
> > > on a current CPU can lead to rq->lock double lock. That is why a
> > > delayed method is in place.
> > > 
> > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > 
> > Thank you, Uladzislau!
> > 
> > I applied this on top of v5.10-rc1 and got the following from the
> > single-CPU builds:
> > 
> >   SYNC    include/config/auto.conf.cmd
> >   DESCEND  objtool
> >   CC      kernel/bounds.s
> >   CALL    scripts/atomic/check-atomics.sh
> >   UPD     include/generated/bounds.h
> >   CC      arch/x86/kernel/asm-offsets.s
> > In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
> >                  from ./include/linux/atomic.h:82,
> >                  from ./include/linux/crypto.h:15,
> >                  from arch/x86/kernel/asm-offsets.c:9:
> > ./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
> > ./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
> >  #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
> >                                  ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > ./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
> >  #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
> >                          ^~~~~~~~~~~~~~~~~~~~
> > ./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
> >   VM_BUG_ON(preemptible())
> >   ^~~~~~~~~
> > scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
> > make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
> > Makefile:1199: recipe for target 'prepare0' failed
> > make: *** [prepare0] Error 2
> > 
> > I vaguely recall something like this showing up in the previous series
> > and that we did something or another to address it.  Could you please
> > check against the old series at -rcu branch dev.2020.10.22a?  (I verified
> > that the old series does run correctly in the single-CPU scenarios.)
> > 
> I see the same build error. Will double check if we have similar in the
> previous series also. It looks like the error is caused by the Thomas series.
> 
> Will check!
> 
OK. Found it:

urezki@pc638:~/data/coding/linux.git$ git diff
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cbfbe2bcca75..7dd3523093db 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -204,7 +204,7 @@ void release_pages(struct page **pages, int nr);
 static inline int __page_cache_add_speculative(struct page *page, int count)
 {
 #ifdef CONFIG_TINY_RCU
-       VM_BUG_ON(preemptible())
+       VM_BUG_ON(preemptible());
        /*
         * Preempt must be disabled here - we rely on rcu_read_lock doing
         * this for us.
urezki@pc638:~/data/coding/linux.git$

I guess we had some extra patch that fixes the kernel compilation for the !SMP
case. Will check dev.2020.10.22a.

--
Vlad Rezki

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache
  2020-10-29 20:22       ` Uladzislau Rezki
@ 2020-10-29 20:33         ` Paul E. McKenney
  2020-10-29 21:00           ` Uladzislau Rezki
  0 siblings, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2020-10-29 20:33 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: LKML, RCU, Andrew Morton, Peter Zijlstra, Michal Hocko,
	Thomas Gleixner, Theodore Y . Ts'o, Joel Fernandes,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko

On Thu, Oct 29, 2020 at 09:22:41PM +0100, Uladzislau Rezki wrote:
> On Thu, Oct 29, 2020 at 09:13:42PM +0100, Uladzislau Rezki wrote:
> > On Thu, Oct 29, 2020 at 12:47:24PM -0700, Paul E. McKenney wrote:
> > > On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > A CONFIG_PREEMPT_COUNT is unconditionally enabled, thus a page
> > > > can be obtained directly from a kvfree_rcu() path. To distinguish
> > > > that and take a decision the preemptable() macro is used when it
> > > > is save to enter allocator.
> > > > 
> > > > It means that refilling a cache is not important from timing point
> > > > of view. Switch to a delayed work, so the actual work is queued from
> > > > the timer interrupt with 1 jiffy delay. An immediate placing a task
> > > > on a current CPU can lead to rq->lock double lock. That is why a
> > > > delayed method is in place.
> > > > 
> > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > 
> > > Thank you, Uladzislau!
> > > 
> > > I applied this on top of v5.10-rc1 and got the following from the
> > > single-CPU builds:
> > > 
> > >   SYNC    include/config/auto.conf.cmd
> > >   DESCEND  objtool
> > >   CC      kernel/bounds.s
> > >   CALL    scripts/atomic/check-atomics.sh
> > >   UPD     include/generated/bounds.h
> > >   CC      arch/x86/kernel/asm-offsets.s
> > > In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
> > >                  from ./include/linux/atomic.h:82,
> > >                  from ./include/linux/crypto.h:15,
> > >                  from arch/x86/kernel/asm-offsets.c:9:
> > > ./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
> > > ./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
> > >  #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
> > >                                  ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > > ./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
> > >  #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
> > >                          ^~~~~~~~~~~~~~~~~~~~
> > > ./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
> > >   VM_BUG_ON(preemptible())
> > >   ^~~~~~~~~
> > > scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
> > > make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
> > > Makefile:1199: recipe for target 'prepare0' failed
> > > make: *** [prepare0] Error 2
> > > 
> > > I vaguely recall something like this showing up in the previous series
> > > and that we did something or another to address it.  Could you please
> > > check against the old series at -rcu branch dev.2020.10.22a?  (I verified
> > > that the old series does run correctly in the single-CPU scenarios.)
> > > 
> > I see the same build error. Will double check if we have similar in the
> > previous series also. It looks like the error is caused by the Thomas series.
> > 
> > Will check!
> > 
> OK. Found it:
> 
> urezki@pc638:~/data/coding/linux.git$ git diff
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index cbfbe2bcca75..7dd3523093db 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -204,7 +204,7 @@ void release_pages(struct page **pages, int nr);
>  static inline int __page_cache_add_speculative(struct page *page, int count)
>  {
>  #ifdef CONFIG_TINY_RCU
> -       VM_BUG_ON(preemptible())
> +       VM_BUG_ON(preemptible());
>         /*
>          * Preempt must be disabled here - we rely on rcu_read_lock doing
>          * this for us.
> urezki@pc638:~/data/coding/linux.git$
> 
> I guess we had some extra patch that fixes a kernel compilation for !SMP
> case. Will check dev.2020.10.22a.

I guess I am blind today!

And yes, that semicolon is in the corresponding commit in -tip.

So, an important safety tip:  When forward porting, start from the
commits that have been tested rather than from the original series
of patches.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 06/16] mm/pagemap: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 16:50 ` [PATCH 06/16] mm/pagemap: " Uladzislau Rezki (Sony)
@ 2020-10-29 20:57   ` Uladzislau Rezki
  2020-10-29 21:26     ` Paul E. McKenney
  0 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki @ 2020-10-29 20:57 UTC (permalink / raw)
  To: Paul E . McKenney
  Cc: LKML, RCU, Paul E . McKenney, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Joel Fernandes, Sebastian Andrzej Siewior, Oleksiy Avramchenko,
	linux-mm, urezki

On Thu, Oct 29, 2020 at 05:50:09PM +0100, Uladzislau Rezki (Sony) wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
> removed. Cleanup the leftovers before doing so.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: linux-mm@kvack.org
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  include/linux/pagemap.h | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index c77b7c31b2e4..cbfbe2bcca75 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -204,9 +204,7 @@ void release_pages(struct page **pages, int nr);
>  static inline int __page_cache_add_speculative(struct page *page, int count)
>  {
>  #ifdef CONFIG_TINY_RCU
> -# ifdef CONFIG_PREEMPT_COUNT
> -	VM_BUG_ON(!in_atomic() && !irqs_disabled());
> -# endif
> +	VM_BUG_ON(preemptible())
>  	/*
>  	 * Preempt must be disabled here - we rely on rcu_read_lock doing
>  	 * this for us.
> -- 
> 2.20.1
> 
Hello, Paul.

Sorry for a small mistake: you had already fixed it before, whereas I took an
old version of the patch in question. Please use the version below instead of
the posted one:

Author: Thomas Gleixner <tglx@linutronix.de>
Date:   Mon Sep 14 19:25:00 2020 +0200

    mm/pagemap: Cleanup PREEMPT_COUNT leftovers
    
    CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
    removed. Cleanup the leftovers before doing so.
    
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
    Cc: Andrew Morton <akpm@linux-foundation.org>
    Cc: linux-mm@kvack.org
    [ paulmck: Fix !SMP build error per kernel test robot feedback. ]
    Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7de11dcd534d..b3d9d9217ea0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -168,9 +168,7 @@ void release_pages(struct page **pages, int nr);
 static inline int __page_cache_add_speculative(struct page *page, int count)
 {
 #ifdef CONFIG_TINY_RCU
-# ifdef CONFIG_PREEMPT_COUNT
-       VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
+       VM_BUG_ON(preemptible());
        /*
         * Preempt must be disabled here - we rely on rcu_read_lock doing
         * this for us.

--
Vlad Rezki

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache
  2020-10-29 20:33         ` Paul E. McKenney
@ 2020-10-29 21:00           ` Uladzislau Rezki
  0 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki @ 2020-10-29 21:00 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki, LKML, RCU, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Joel Fernandes, Sebastian Andrzej Siewior, Oleksiy Avramchenko

On Thu, Oct 29, 2020 at 01:33:55PM -0700, Paul E. McKenney wrote:
> On Thu, Oct 29, 2020 at 09:22:41PM +0100, Uladzislau Rezki wrote:
> > On Thu, Oct 29, 2020 at 09:13:42PM +0100, Uladzislau Rezki wrote:
> > > On Thu, Oct 29, 2020 at 12:47:24PM -0700, Paul E. McKenney wrote:
> > > > On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > > A CONFIG_PREEMPT_COUNT is unconditionally enabled, thus a page
> > > > > can be obtained directly from a kvfree_rcu() path. To distinguish
> > > > > that and take a decision the preemptable() macro is used when it
> > > > > is save to enter allocator.
> > > > > 
> > > > > It means that refilling a cache is not important from timing point
> > > > > of view. Switch to a delayed work, so the actual work is queued from
> > > > > the timer interrupt with 1 jiffy delay. An immediate placing a task
> > > > > on a current CPU can lead to rq->lock double lock. That is why a
> > > > > delayed method is in place.
> > > > > 
> > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > 
> > > > Thank you, Uladzislau!
> > > > 
> > > > I applied this on top of v5.10-rc1 and got the following from the
> > > > single-CPU builds:
> > > > 
> > > >   SYNC    include/config/auto.conf.cmd
> > > >   DESCEND  objtool
> > > >   CC      kernel/bounds.s
> > > >   CALL    scripts/atomic/check-atomics.sh
> > > >   UPD     include/generated/bounds.h
> > > >   CC      arch/x86/kernel/asm-offsets.s
> > > > In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
> > > >                  from ./include/linux/atomic.h:82,
> > > >                  from ./include/linux/crypto.h:15,
> > > >                  from arch/x86/kernel/asm-offsets.c:9:
> > > > ./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
> > > > ./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
> > > >  #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
> > > >                                  ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > > > ./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
> > > >  #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
> > > >                          ^~~~~~~~~~~~~~~~~~~~
> > > > ./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
> > > >   VM_BUG_ON(preemptible())
> > > >   ^~~~~~~~~
> > > > scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
> > > > make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
> > > > Makefile:1199: recipe for target 'prepare0' failed
> > > > make: *** [prepare0] Error 2
> > > > 
> > > > I vaguely recall something like this showing up in the previous series
> > > > and that we did something or another to address it.  Could you please
> > > > check against the old series at -rcu branch dev.2020.10.22a?  (I verified
> > > > that the old series does run correctly in the single-CPU scenarios.)
> > > > 
> > > I see the same build error. Will double check if we have similar in the
> > > previous series also. It looks like the error is caused by the Thomas series.
> > > 
> > > Will check!
> > > 
> > OK. Found it:
> > 
> > urezki@pc638:~/data/coding/linux.git$ git diff
> > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > index cbfbe2bcca75..7dd3523093db 100644
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -204,7 +204,7 @@ void release_pages(struct page **pages, int nr);
> >  static inline int __page_cache_add_speculative(struct page *page, int count)
> >  {
> >  #ifdef CONFIG_TINY_RCU
> > -       VM_BUG_ON(preemptible())
> > +       VM_BUG_ON(preemptible());
> >         /*
> >          * Preempt must be disabled here - we rely on rcu_read_lock doing
> >          * this for us.
> > urezki@pc638:~/data/coding/linux.git$
> > 
> > I guess we had some extra patch that fixes a kernel compilation for !SMP
> > case. Will check dev.2020.10.22a.
> 
> I guess I am blind today!
> 
> And yes, that semicolon is in the corresponding commit in -tip.
> 
> So, an important safety tip:  When forward porting, start from the
> commits that have been tested rather than from the original series
> of patches.  ;-)
> 
> 							Thanx, Paul
Right. Sorry for that. I remember you mentioned a build error
some time ago. My bad :)

I have updated the: [PATCH 06/16] mm/pagemap: Cleanup PREEMPT_COUNT leftovers
in this series.

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 06/16] mm/pagemap: Cleanup PREEMPT_COUNT leftovers
  2020-10-29 20:57   ` Uladzislau Rezki
@ 2020-10-29 21:26     ` Paul E. McKenney
  0 siblings, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2020-10-29 21:26 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: LKML, RCU, Andrew Morton, Peter Zijlstra, Michal Hocko,
	Thomas Gleixner, Theodore Y . Ts'o, Joel Fernandes,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko, linux-mm

On Thu, Oct 29, 2020 at 09:57:17PM +0100, Uladzislau Rezki wrote:
> On Thu, Oct 29, 2020 at 05:50:09PM +0100, Uladzislau Rezki (Sony) wrote:
> > From: Thomas Gleixner <tglx@linutronix.de>
> > 
> > CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
> > removed. Cleanup the leftovers before doing so.
> > 
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: linux-mm@kvack.org
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > ---
> >  include/linux/pagemap.h | 4 +---
> >  1 file changed, 1 insertion(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > index c77b7c31b2e4..cbfbe2bcca75 100644
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -204,9 +204,7 @@ void release_pages(struct page **pages, int nr);
> >  static inline int __page_cache_add_speculative(struct page *page, int count)
> >  {
> >  #ifdef CONFIG_TINY_RCU
> > -# ifdef CONFIG_PREEMPT_COUNT
> > -	VM_BUG_ON(!in_atomic() && !irqs_disabled());
> > -# endif
> > +	VM_BUG_ON(preemptible())
> >  	/*
> >  	 * Preempt must be disabled here - we rely on rcu_read_lock doing
> >  	 * this for us.
> > -- 
> > 2.20.1
> > 
> Hello, Paul.
> 
> Sorry for a small mistake, it was fixed by you before, whereas i took an
> old version of the patch that is question. Please use below one instead of
> posted one:

We have all been there and done that!  ;-)

I will give this update a spin and see what happens.

							Thanx, Paul

> Author: Thomas Gleixner <tglx@linutronix.de>
> Date:   Mon Sep 14 19:25:00 2020 +0200
> 
>     mm/pagemap: Cleanup PREEMPT_COUNT leftovers
>     
>     CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
>     removed. Cleanup the leftovers before doing so.
>     
>     Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>     Cc: Andrew Morton <akpm@linux-foundation.org>
>     Cc: linux-mm@kvack.org
>     [ paulmck: Fix !SMP build error per kernel test robot feedback. ]
>     Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> 
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 7de11dcd534d..b3d9d9217ea0 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -168,9 +168,7 @@ void release_pages(struct page **pages, int nr);
>  static inline int __page_cache_add_speculative(struct page *page, int count)
>  {
>  #ifdef CONFIG_TINY_RCU
> -# ifdef CONFIG_PREEMPT_COUNT
> -       VM_BUG_ON(!in_atomic() && !irqs_disabled());
> -# endif
> +       VM_BUG_ON(preemptible());
>         /*
>          * Preempt must be disabled here - we rely on rcu_read_lock doing
>          * this for us.
> 
> --
> Vlad Rezki

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (14 preceding siblings ...)
  2020-10-29 16:50 ` [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache Uladzislau Rezki (Sony)
@ 2020-11-03 15:47 ` Joel Fernandes
  2020-11-03 16:33   ` Uladzislau Rezki
  2020-11-03 17:54 ` Joel Fernandes
  16 siblings, 1 reply; 36+ messages in thread
From: Joel Fernandes @ 2020-11-03 15:47 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, RCU, Paul E . McKenney, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko, willy

On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> The current memmory-allocation interface presents to following
> difficulties that this patch is designed to overcome:
> 
> a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
>    complain about violation("BUG: Invalid wait context") of the
>    nesting rules. It does the raw_spinlock vs. spinlock nesting
>    checks, i.e. it is not legal to acquire a spinlock_t while
>    holding a raw_spinlock_t.
> 
>    Internally the kfree_rcu() uses raw_spinlock_t whereas the
>    "page allocator" internally deals with spinlock_t to access
>    to its zones. The code also can be broken from higher level
>    of view:
>    <snip>
>        raw_spin_lock(&some_lock);
>        kfree_rcu(some_pointer, some_field_offset);
>    <snip>
> 
> b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
>    is converted into sleepable variant. Invoking the page allocator from
>    atomic contexts leads to "BUG: scheduling while atomic".
> 
> c) call_rcu() is invoked from raw atomic context and kfree_rcu()
>    and kvfree_rcu() are expected to be called from atomic raw context
>    as well.
> 
> Move out a page allocation from contexts which trigger kvfree_rcu()
> function to the separate worker. When a k[v]free_rcu() per-cpu page
> cache is empty a fallback mechanism is used and a special job is
> scheduled to refill the per-cpu cache.

Looks good, still reviewing here. BTW just for my education, I was wondering
about Thomas's email:
https://lkml.org/lkml/2020/8/11/939

If slab allocations in pure raw-atomic context on RT are not allowed or
recommended, should kfree_rcu() be allowed?
slab can have the same issue, right? If the per-cpu cache is drained, it has to
allocate a page from the buddy allocator, and there is no GFP flag to tell it
about the context the allocation is happening from.
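
For instance, the kind of pattern I am wondering about would be (purely
illustrative sketch, not code from this patch):

<snip>
    raw_spin_lock(&some_raw_lock);          /* stays a real spinning lock on RT */
    p = kmalloc(sizeof(*p), GFP_ATOMIC);    /* if the slab per-cpu cache is empty,
                                             * this can reach the buddy allocator,
                                             * whose zone->lock is a sleeping
                                             * spinlock_t on RT */
    raw_spin_unlock(&some_raw_lock);
<snip>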

Or are we saying that we want to support kfree on RT from raw atomic
context, even though kmalloc is not supported? I hate to bring up this
elephant in the room, but since I am one of the people maintaining this
code, I would rather set some rules than support unsupported
usages. :-\ (Once I know what is supported and what isn't, that is.) If
raw atomic kfree_rcu() is indeed a bogus use case because of -RT, then we ought to
put up a giant warning rather than support it :-(.
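
(If we went the warning route, I am thinking of something as trivial as the
sketch below; illustrative only, not a real patch.)

<snip>
    /* sketch of the "giant warning" idea */
    WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible());
<snip>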

But I don't stand in the way of this patch if all agree on it. thanks,

 - Joel


> Link: https://lore.kernel.org/lkml/20200630164543.4mdcf6zb4zfclhln@linutronix.de/
> Fixes: 3042f83f19be ("rcu: Support reclaim for head-less object")
> Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
>  1 file changed, 66 insertions(+), 43 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 06895ef85d69..f2da2a1cc716 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
>   * per-CPU. Object size is equal to one page. This value
>   * can be changed at boot time.
>   */
> -static int rcu_min_cached_objs = 2;
> +static int rcu_min_cached_objs = 5;
>  module_param(rcu_min_cached_objs, int, 0444);
>  
>  /* Retrieve RCU kthreads priority for rcutorture */
> @@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
>   *	In order to save some per-cpu space the list is singular.
>   *	Even though it is lockless an access has to be protected by the
>   *	per-cpu lock.
> + * @page_cache_work: A work to refill the cache when it is empty
> + * @work_in_progress: Indicates that page_cache_work is running
> + * @hrtimer: A hrtimer for scheduling a page_cache_work
>   * @nr_bkv_objs: number of allocated objects at @bkvcache.
>   *
>   * This is a per-CPU structure.  The reason that it is not included in
> @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
>  	bool monitor_todo;
>  	bool initialized;
>  	int count;
> +
> +	struct work_struct page_cache_work;
> +	atomic_t work_in_progress;
> +	struct hrtimer hrtimer;
> +
>  	struct llist_head bkvcache;
>  	int nr_bkv_objs;
>  };
> @@ -3217,10 +3225,10 @@ static void kfree_rcu_work(struct work_struct *work)
>  			}
>  			rcu_lock_release(&rcu_callback_map);
>  
> -			krcp = krc_this_cpu_lock(&flags);
> +			raw_spin_lock_irqsave(&krcp->lock, flags);
>  			if (put_cached_bnode(krcp, bkvhead[i]))
>  				bkvhead[i] = NULL;
> -			krc_this_cpu_unlock(krcp, flags);
> +			raw_spin_unlock_irqrestore(&krcp->lock, flags);
>  
>  			if (bkvhead[i])
>  				free_page((unsigned long) bkvhead[i]);
> @@ -3347,6 +3355,57 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  		raw_spin_unlock_irqrestore(&krcp->lock, flags);
>  }
>  
> +static enum hrtimer_restart
> +schedule_page_work_fn(struct hrtimer *t)
> +{
> +	struct kfree_rcu_cpu *krcp =
> +		container_of(t, struct kfree_rcu_cpu, hrtimer);
> +
> +	queue_work(system_highpri_wq, &krcp->page_cache_work);
> +	return HRTIMER_NORESTART;
> +}
> +
> +static void fill_page_cache_func(struct work_struct *work)
> +{
> +	struct kvfree_rcu_bulk_data *bnode;
> +	struct kfree_rcu_cpu *krcp =
> +		container_of(work, struct kfree_rcu_cpu,
> +			page_cache_work);
> +	unsigned long flags;
> +	bool pushed;
> +	int i;
> +
> +	for (i = 0; i < rcu_min_cached_objs; i++) {
> +		bnode = (struct kvfree_rcu_bulk_data *)
> +			__get_free_page(GFP_KERNEL | __GFP_NOWARN);
> +
> +		if (bnode) {
> +			raw_spin_lock_irqsave(&krcp->lock, flags);
> +			pushed = put_cached_bnode(krcp, bnode);
> +			raw_spin_unlock_irqrestore(&krcp->lock, flags);
> +
> +			if (!pushed) {
> +				free_page((unsigned long) bnode);
> +				break;
> +			}
> +		}
> +	}
> +
> +	atomic_set(&krcp->work_in_progress, 0);
> +}
> +
> +static void
> +run_page_cache_worker(struct kfree_rcu_cpu *krcp)
> +{
> +	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> +			!atomic_xchg(&krcp->work_in_progress, 1)) {
> +		hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
> +			HRTIMER_MODE_REL);
> +		krcp->hrtimer.function = schedule_page_work_fn;
> +		hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
> +	}
> +}
> +
>  static inline bool
>  kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
>  {
> @@ -3363,32 +3422,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
>  	if (!krcp->bkvhead[idx] ||
>  			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
>  		bnode = get_cached_bnode(krcp);
> -		if (!bnode) {
> -			/*
> -			 * To keep this path working on raw non-preemptible
> -			 * sections, prevent the optional entry into the
> -			 * allocator as it uses sleeping locks. In fact, even
> -			 * if the caller of kfree_rcu() is preemptible, this
> -			 * path still is not, as krcp->lock is a raw spinlock.
> -			 * With additional page pre-allocation in the works,
> -			 * hitting this return is going to be much less likely.
> -			 */
> -			if (IS_ENABLED(CONFIG_PREEMPT_RT))
> -				return false;
> -
> -			/*
> -			 * NOTE: For one argument of kvfree_rcu() we can
> -			 * drop the lock and get the page in sleepable
> -			 * context. That would allow to maintain an array
> -			 * for the CONFIG_PREEMPT_RT as well if no cached
> -			 * pages are available.
> -			 */
> -			bnode = (struct kvfree_rcu_bulk_data *)
> -				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> -		}
> -
>  		/* Switch to emergency path. */
> -		if (unlikely(!bnode))
> +		if (!bnode)
>  			return false;
>  
>  		/* Initialize the new block. */
> @@ -3452,12 +3487,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  		goto unlock_return;
>  	}
>  
> -	/*
> -	 * Under high memory pressure GFP_NOWAIT can fail,
> -	 * in that case the emergency path is maintained.
> -	 */
>  	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
>  	if (!success) {
> +		run_page_cache_worker(krcp);
> +
>  		if (head == NULL)
>  			// Inline if kvfree_rcu(one_arg) call.
>  			goto unlock_return;
> @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
>  
>  	for_each_possible_cpu(cpu) {
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> -		struct kvfree_rcu_bulk_data *bnode;
>  
>  		for (i = 0; i < KFREE_N_BATCHES; i++) {
>  			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>  			krcp->krw_arr[i].krcp = krcp;
>  		}
>  
> -		for (i = 0; i < rcu_min_cached_objs; i++) {
> -			bnode = (struct kvfree_rcu_bulk_data *)
> -				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> -
> -			if (bnode)
> -				put_cached_bnode(krcp, bnode);
> -			else
> -				pr_err("Failed to preallocate for %d CPU!\n", cpu);
> -		}
> -
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> +		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
>  		krcp->initialized = true;
>  	}
>  	if (register_shrinker(&kfree_rcu_shrinker))
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-11-03 15:47 ` [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Joel Fernandes
@ 2020-11-03 16:33   ` Uladzislau Rezki
  2020-11-03 19:18     ` Paul E. McKenney
  0 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki @ 2020-11-03 16:33 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki (Sony),
	LKML, RCU, Paul E . McKenney, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko, willy

On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > The current memmory-allocation interface presents to following
> > difficulties that this patch is designed to overcome:
> > 
> > a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
> >    complain about violation("BUG: Invalid wait context") of the
> >    nesting rules. It does the raw_spinlock vs. spinlock nesting
> >    checks, i.e. it is not legal to acquire a spinlock_t while
> >    holding a raw_spinlock_t.
> > 
> >    Internally the kfree_rcu() uses raw_spinlock_t whereas the
> >    "page allocator" internally deals with spinlock_t to access
> >    to its zones. The code also can be broken from higher level
> >    of view:
> >    <snip>
> >        raw_spin_lock(&some_lock);
> >        kfree_rcu(some_pointer, some_field_offset);
> >    <snip>
> > 
> > b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
> >    is converted into sleepable variant. Invoking the page allocator from
> >    atomic contexts leads to "BUG: scheduling while atomic".
> > 
> > c) call_rcu() is invoked from raw atomic context and kfree_rcu()
> >    and kvfree_rcu() are expected to be called from atomic raw context
> >    as well.
> > 
> > Move out a page allocation from contexts which trigger kvfree_rcu()
> > function to the separate worker. When a k[v]free_rcu() per-cpu page
> > cache is empty a fallback mechanism is used and a special job is
> > scheduled to refill the per-cpu cache.
> 
> Looks good, still reviewing here. BTW just for my education, I was wondering
> about Thomas's email:
> https://lkml.org/lkml/2020/8/11/939
> 
> If slab allocations in pure raw-atomic context on RT is not allowed or
> recommended, should kfree_rcu() be allowed?
>
Thanks for reviewing, Joel :)

The decision was made that we need to support kfree_rcu() from "real atomic contexts",
to align with how it used to behave before. We could just convert our local locks
to the spinlock_t variant, but that was not Paul's goal; it may be that some users
need kfree_rcu() from raw atomics.
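
Something like the below is the kind of caller I mean (names are made up,
sketch only):

<snip>
    raw_spin_lock_irqsave(&obj_lock, flags);
    list_del(&obj->node);
    kfree_rcu(obj, rcu);    /* must keep working even with a raw lock held */
    raw_spin_unlock_irqrestore(&obj_lock, flags);
<snip>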

>
> slab can have same issue right? If per-cpu cache is drained, it has to
> allocate page from buddy allocator and there's no GFP flag to tell it about
> context where alloc is happening from.
> 
Sounds like that. Apart from that, it might soon turn out that we or somebody
else will raise the question one more time about something like GFP_RAW or GFP_NOLOCKS.
So who knows...

>
> Or are we saying that we want to support kfree on RT from raw atomic atomic
> context, even though kmalloc is not supported? I hate to bring up this
> elephant in the room, but since I am a part of the people maintaining this
> code, I believe I would rather set some rules than supporting unsupported
> usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> put a giant warning than supporting it :-(.
> 
We discussed it several times; the conclusion was that we need to support
kfree_rcu() from raw contexts. At least that was a clear signal from Paul
to me. I think that once preemptible() becomes universally meaningful, we
can drop the patch in question later on.

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
                   ` (15 preceding siblings ...)
  2020-11-03 15:47 ` [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Joel Fernandes
@ 2020-11-03 17:54 ` Joel Fernandes
  2020-11-04 12:12   ` Uladzislau Rezki
  16 siblings, 1 reply; 36+ messages in thread
From: Joel Fernandes @ 2020-11-03 17:54 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, RCU, Paul E . McKenney, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko

On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> The current memmory-allocation interface presents to following
> difficulties that this patch is designed to overcome
[...]
> ---
>  kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
>  1 file changed, 66 insertions(+), 43 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 06895ef85d69..f2da2a1cc716 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
>   * per-CPU. Object size is equal to one page. This value
>   * can be changed at boot time.
>   */
> -static int rcu_min_cached_objs = 2;
> +static int rcu_min_cached_objs = 5;
>  module_param(rcu_min_cached_objs, int, 0444);
>  
>  /* Retrieve RCU kthreads priority for rcutorture */
> @@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
>   *	In order to save some per-cpu space the list is singular.
>   *	Even though it is lockless an access has to be protected by the
>   *	per-cpu lock.
> + * @page_cache_work: A work to refill the cache when it is empty
> + * @work_in_progress: Indicates that page_cache_work is running
> + * @hrtimer: A hrtimer for scheduling a page_cache_work
>   * @nr_bkv_objs: number of allocated objects at @bkvcache.
>   *
>   * This is a per-CPU structure.  The reason that it is not included in
> @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
>  	bool monitor_todo;
>  	bool initialized;
>  	int count;
> +
> +	struct work_struct page_cache_work;
> +	atomic_t work_in_progress;

Does it need to be atomic? run_page_cache_worker() is only called under a lock,
so you can use xchg() there. And where you do the atomic_set(), you can use
WRITE_ONCE() since it is a data race.
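
Something along these lines is what I have in mind (just a sketch, assuming
work_in_progress becomes a plain int):

<snip>
    /* run_page_cache_worker() runs under krc_this_cpu_lock(), so a plain
     * int plus xchg()/WRITE_ONCE() would be sufficient */
    if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
            !xchg(&krcp->work_in_progress, 1)) {
        /* ... start the hrtimer / queue the refill work as before ... */
    }

    /* and in fill_page_cache_func(), once refilling is done: */
    WRITE_ONCE(krcp->work_in_progress, 0);
<snip>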

> @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
>  
>  	for_each_possible_cpu(cpu) {
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> -		struct kvfree_rcu_bulk_data *bnode;
>  
>  		for (i = 0; i < KFREE_N_BATCHES; i++) {
>  			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>  			krcp->krw_arr[i].krcp = krcp;
>  		}
>  
> -		for (i = 0; i < rcu_min_cached_objs; i++) {
> -			bnode = (struct kvfree_rcu_bulk_data *)
> -				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> -
> -			if (bnode)
> -				put_cached_bnode(krcp, bnode);
> -			else
> -				pr_err("Failed to preallocate for %d CPU!\n", cpu);
> -		}
> -
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> +		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
>  		krcp->initialized = true;

During initialization, is it not better to still pre-allocate? That way you
don't have to wait until you run into a situation where you need to allocate
for the first time.

AFAICS after the above line deletions, the bulk pages are initially empty.

thanks,

 - Joel


>  	}
>  	if (register_shrinker(&kfree_rcu_shrinker))
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible
  2020-10-29 16:50 ` [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible Uladzislau Rezki (Sony)
@ 2020-11-03 18:03   ` Joel Fernandes
  2020-11-04 11:39     ` Uladzislau Rezki
  0 siblings, 1 reply; 36+ messages in thread
From: Joel Fernandes @ 2020-11-03 18:03 UTC (permalink / raw)
  To: Uladzislau Rezki (Sony)
  Cc: LKML, RCU, Paul E . McKenney, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko

Hi Vlad,
Few minor nits:

On Thu, Oct 29, 2020 at 05:50:18PM +0100, Uladzislau Rezki (Sony) wrote:
> Given that CONFIG_PREEMPT_COUNT is unconditionally enabled by the
> earlier commits in this series, the preemptible() macro now properly
> detects preempt-disable code regions even in kernels built with
> CONFIG_PREEMPT_NONE.
> 
> This commit therefore uses preemptible() to determine whether allocation
> is possible at all for double-argument kvfree_rcu().  If !preemptible(),
> then allocation is not possible, and kvfree_rcu() falls back to using
> the less cache-friendly rcu_head approach.  Even when preemptible(),
> the caller might be involved in reclaim, so the GFP_ flags used by
> double-argument kvfree_rcu() must avoid invoking reclaim processing.
> 
> Note that single-argument kvfree_rcu() must be invoked in sleepable
> contexts, and that its fallback is the relatively high latency
> synchronize_rcu().  Single-argument kvfree_rcu() therefore uses
> GFP_KERNEL|__GFP_RETRY_MAYFAIL to allow limited sleeping within the
> memory allocator.
> 
> [ paulmck: Add add_ptr_to_bulk_krc_lock header comment per Michal Hocko. ]
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> ---
>  kernel/rcu/tree.c | 48 ++++++++++++++++++++++++++++++-----------------
>  1 file changed, 31 insertions(+), 17 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index f2da2a1cc716..3f9b016a44dc 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3406,37 +3406,55 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
>  	}
>  }
>  
> +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
> +// state specified by flags.  If can_sleep is true, the caller must
> +// be schedulable and not be holding any locks or mutexes that might be
> +// acquired by the memory allocator or anything that it might invoke.
> +// If !can_sleep, then if !preemptible() no allocation will be undertaken,
> +// otherwise the allocation will use GFP_ATOMIC to avoid the remainder of
> +// the aforementioned deadlock possibilities.  Returns true if ptr was
> +// successfully recorded, else the caller must use a fallback.
>  static inline bool
> -kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
> +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
> +	unsigned long *flags, void *ptr, bool can_sleep)
>  {
>  	struct kvfree_rcu_bulk_data *bnode;
> +	bool can_alloc_page = preemptible();
> +	gfp_t gfp = (can_sleep ? GFP_KERNEL | __GFP_RETRY_MAYFAIL :
> +		GFP_ATOMIC) | __GFP_NOWARN;
>  	int idx;
>  
> -	if (unlikely(!krcp->initialized))
> +	*krcp = krc_this_cpu_lock(flags);
> +	if (unlikely(!(*krcp)->initialized))
>  		return false;
>  
> -	lockdep_assert_held(&krcp->lock);
>  	idx = !!is_vmalloc_addr(ptr);
>  
>  	/* Check if a new block is required. */

Maybe convert this comment also to //... like the new ones you added (and the
ones below).

> -	if (!krcp->bkvhead[idx] ||
> -			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> -		bnode = get_cached_bnode(krcp);
> -		/* Switch to emergency path. */
> +	if (!(*krcp)->bkvhead[idx] ||
> +			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> +		bnode = get_cached_bnode(*krcp);
> +		if (!bnode && can_alloc_page) {

I think you can directly put preemptible() here with a comment saying
allocate only if preemptible and get rid of can_alloc_page.

> +			krc_this_cpu_unlock(*krcp, *flags);
> +			bnode = (struct kvfree_rcu_bulk_data *)
> +				__get_free_page(gfp);
> +			*krcp = krc_this_cpu_lock(flags);
> +		}
> +

I think the "Switch to emergency path" comment should stay here before the
if().

thanks,

 - Joel

>  		if (!bnode)
>  			return false;
>  
>  		/* Initialize the new block. */
>  		bnode->nr_records = 0;
> -		bnode->next = krcp->bkvhead[idx];
> +		bnode->next = (*krcp)->bkvhead[idx];
>  
>  		/* Attach it to the head. */
> -		krcp->bkvhead[idx] = bnode;
> +		(*krcp)->bkvhead[idx] = bnode;
>  	}
>  
>  	/* Finally insert. */
> -	krcp->bkvhead[idx]->records
> -		[krcp->bkvhead[idx]->nr_records++] = ptr;
> +	(*krcp)->bkvhead[idx]->records
> +		[(*krcp)->bkvhead[idx]->nr_records++] = ptr;
>  
>  	return true;
>  }
> @@ -3474,20 +3492,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  		ptr = (unsigned long *) func;
>  	}
>  
> -	krcp = krc_this_cpu_lock(&flags);
> -
>  	// Queue the object but don't yet schedule the batch.
>  	if (debug_rcu_head_queue(ptr)) {
>  		// Probable double kfree_rcu(), just leak.
>  		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
>  			  __func__, head);
>  
> -		// Mark as success and leave.
> -		success = true;
> -		goto unlock_return;
> +		return;
>  	}
>  
> -	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
> +	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
>  	if (!success) {
>  		run_page_cache_worker(krcp);
>  
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-11-03 16:33   ` Uladzislau Rezki
@ 2020-11-03 19:18     ` Paul E. McKenney
  2020-11-04 12:35       ` Uladzislau Rezki
  0 siblings, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2020-11-03 19:18 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Joel Fernandes, LKML, RCU, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko, willy

On Tue, Nov 03, 2020 at 05:33:50PM +0100, Uladzislau Rezki wrote:
> On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > The current memmory-allocation interface presents to following
> > > difficulties that this patch is designed to overcome:
> > > 
> > > a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
> > >    complain about violation("BUG: Invalid wait context") of the
> > >    nesting rules. It does the raw_spinlock vs. spinlock nesting
> > >    checks, i.e. it is not legal to acquire a spinlock_t while
> > >    holding a raw_spinlock_t.
> > > 
> > >    Internally the kfree_rcu() uses raw_spinlock_t whereas the
> > >    "page allocator" internally deals with spinlock_t to access
> > >    to its zones. The code also can be broken from higher level
> > >    of view:
> > >    <snip>
> > >        raw_spin_lock(&some_lock);
> > >        kfree_rcu(some_pointer, some_field_offset);
> > >    <snip>
> > > 
> > > b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
> > >    is converted into sleepable variant. Invoking the page allocator from
> > >    atomic contexts leads to "BUG: scheduling while atomic".
> > > 
> > > c) call_rcu() is invoked from raw atomic context and kfree_rcu()
> > >    and kvfree_rcu() are expected to be called from atomic raw context
> > >    as well.
> > > 
> > > Move out a page allocation from contexts which trigger kvfree_rcu()
> > > function to the separate worker. When a k[v]free_rcu() per-cpu page
> > > cache is empty a fallback mechanism is used and a special job is
> > > scheduled to refill the per-cpu cache.
> > 
> > Looks good, still reviewing here. BTW just for my education, I was wondering
> > about Thomas's email:
> > https://lkml.org/lkml/2020/8/11/939
> > 
> > If slab allocations in pure raw-atomic context on RT is not allowed or
> > recommended, should kfree_rcu() be allowed?
> >
> Thanks for reviewing, Joel :)
> 
> The decision was made that we need to support kfree_rcu() from "real atomic contexts",
> to align with how it used to be before. We can go and just convert our local locks
> to the spinlock_t variant but that was not Paul goal, it can be that some users need
> kfree_rcu() for raw atomics.

People invoke call_rcu() from raw atomics, and so we should provide
the same for kfree_rcu().  Yes, people could work around a raw-atomic
prohibition, but such prohibitions incur constant costs over time in
terms of development effort, increased bug rate, and increased complexity.
Yes, this does increase all of those for RCU, but the relative increase
is negligible, RCU being what it is.

> > slab can have same issue right? If per-cpu cache is drained, it has to
> > allocate page from buddy allocator and there's no GFP flag to tell it about
> > context where alloc is happening from.
> > 
> Sounds like that. Apart of that, it might turn out soon that we or somebody
> else will rise a question one more time about something GFP_RAW or GFP_NOLOCKS.
> So who knows..

I would prefer that slab provide some way of dealing with raw atomic
context, but the maintainers are thus far unconvinced.

> > Or are we saying that we want to support kfree on RT from raw atomic atomic
> > context, even though kmalloc is not supported? I hate to bring up this
> > elephant in the room, but since I am a part of the people maintaining this
> > code, I believe I would rather set some rules than supporting unsupported
> > usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> > raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> > put a giant warning than supporting it :-(.
> > 
> We discussed it several times, the conclusion was that we need to support 
> kfree_rcu() from raw contexts. At least that was a clear signal from Paul 
> to me. I think, if we obtain the preemtable(), so it becomes versatile, we
> can drop the patch that is in question later on in the future.

Given a universally meaningful preemptible(), we could directly call
the allocator in some cases.  It might (or might not) still make sense
to defer the allocation when preemptible() indicated that a direct call
to the allocator was unsafe.
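
Roughly along these lines (just a sketch, not a patch):

<snip>
    /* sketch: allocate directly when it is safe, otherwise defer */
    if (preemptible()) {
        /* the krc lock would need to be dropped around this,
         * as patch 15 in this series already does */
        bnode = (struct kvfree_rcu_bulk_data *)
            __get_free_page(GFP_KERNEL | __GFP_NOWARN);
    } else {
        run_page_cache_worker(krcp);    /* refill from the worker */
    }
<snip>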

							Thanx, Paul

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible
  2020-11-03 18:03   ` Joel Fernandes
@ 2020-11-04 11:39     ` Uladzislau Rezki
  2020-11-04 14:36       ` Joel Fernandes
  0 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki @ 2020-11-04 11:39 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki (Sony),
	LKML, RCU, Paul E . McKenney, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko

Hello, Joel.

> 
> On Thu, Oct 29, 2020 at 05:50:18PM +0100, Uladzislau Rezki (Sony) wrote:
> > Given that CONFIG_PREEMPT_COUNT is unconditionally enabled by the
> > earlier commits in this series, the preemptible() macro now properly
> > detects preempt-disable code regions even in kernels built with
> > CONFIG_PREEMPT_NONE.
> > 
> > This commit therefore uses preemptible() to determine whether allocation
> > is possible at all for double-argument kvfree_rcu().  If !preemptible(),
> > then allocation is not possible, and kvfree_rcu() falls back to using
> > the less cache-friendly rcu_head approach.  Even when preemptible(),
> > the caller might be involved in reclaim, so the GFP_ flags used by
> > double-argument kvfree_rcu() must avoid invoking reclaim processing.
> > 
> > Note that single-argument kvfree_rcu() must be invoked in sleepable
> > contexts, and that its fallback is the relatively high latency
> > synchronize_rcu().  Single-argument kvfree_rcu() therefore uses
> > GFP_KERNEL|__GFP_RETRY_MAYFAIL to allow limited sleeping within the
> > memory allocator.
> > 
> > [ paulmck: Add add_ptr_to_bulk_krc_lock header comment per Michal Hocko. ]
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> > ---
> >  kernel/rcu/tree.c | 48 ++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 31 insertions(+), 17 deletions(-)
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index f2da2a1cc716..3f9b016a44dc 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -3406,37 +3406,55 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
> >  	}
> >  }
> >  
> > +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
> > +// state specified by flags.  If can_sleep is true, the caller must
> > +// be schedulable and not be holding any locks or mutexes that might be
> > +// acquired by the memory allocator or anything that it might invoke.
> > +// If !can_sleep, then if !preemptible() no allocation will be undertaken,
> > +// otherwise the allocation will use GFP_ATOMIC to avoid the remainder of
> > +// the aforementioned deadlock possibilities.  Returns true if ptr was
> > +// successfully recorded, else the caller must use a fallback.
> >  static inline bool
> > -kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
> > +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
> > +	unsigned long *flags, void *ptr, bool can_sleep)
> >  {
> >  	struct kvfree_rcu_bulk_data *bnode;
> > +	bool can_alloc_page = preemptible();
> > +	gfp_t gfp = (can_sleep ? GFP_KERNEL | __GFP_RETRY_MAYFAIL :
> > +		GFP_ATOMIC) | __GFP_NOWARN;
> >  	int idx;
> >  
> > -	if (unlikely(!krcp->initialized))
> > +	*krcp = krc_this_cpu_lock(flags);
> > +	if (unlikely(!(*krcp)->initialized))
> >  		return false;
> >  
> > -	lockdep_assert_held(&krcp->lock);
> >  	idx = !!is_vmalloc_addr(ptr);
> >  
> >  	/* Check if a new block is required. */
> 
> Maybe convert this comment also to //... like the new ones you added (and the
> ones below).
> 
No problem. I can convert it.

> > -	if (!krcp->bkvhead[idx] ||
> > -			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> > -		bnode = get_cached_bnode(krcp);
> > -		/* Switch to emergency path. */
> > +	if (!(*krcp)->bkvhead[idx] ||
> > +			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> > +		bnode = get_cached_bnode(*krcp);
> > +		if (!bnode && can_alloc_page) {
> 
> I think you can directly put preemptible() here with a comment saying
> allocate only if preemptible and get rid of can_alloc_page.
> 
Not really. We check preemptible() before acquiring the internal lock, because
otherwise it would always return "false". That is why it is checked at the
very beginning, on entry.
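
In other words (sketch only):

<snip>
    bool can_alloc_page = preemptible();    /* must be sampled here ... */

    *krcp = krc_this_cpu_lock(flags);       /* ... because this disables
                                             * interrupts, after which
                                             * preemptible() is always false */
<snip>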

> > +			krc_this_cpu_unlock(*krcp, *flags);
> > +			bnode = (struct kvfree_rcu_bulk_data *)
> > +				__get_free_page(gfp);
> > +			*krcp = krc_this_cpu_lock(flags);
> > +		}
> > +
> 
> I think the "Switch to emergency path" comment should stay here before the
> if().
> 
Right, we need to keep it. I will put it back.
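
For reference, with both comments folded back in, the block would read
roughly as below (a sketch based on the hunk above; the trailing
"if (!bnode)" check is assumed to follow as in the existing code, returning
false so that the caller falls back to the emergency rcu_head path):

<snip>
	// Check if a new block is required.
	if (!(*krcp)->bkvhead[idx] ||
			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
		bnode = get_cached_bnode(*krcp);
		if (!bnode && can_alloc_page) {
			krc_this_cpu_unlock(*krcp, *flags);
			bnode = (struct kvfree_rcu_bulk_data *)
				__get_free_page(gfp);
			*krcp = krc_this_cpu_lock(flags);
		}

		// Switch to emergency path.
		if (!bnode)
			return false;
		...
	}
<snip>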

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-11-03 17:54 ` Joel Fernandes
@ 2020-11-04 12:12   ` Uladzislau Rezki
  2020-11-04 15:01     ` Joel Fernandes
  0 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki @ 2020-11-04 12:12 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki (Sony),
	LKML, RCU, Paul E . McKenney, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko

On Tue, Nov 03, 2020 at 12:54:22PM -0500, Joel Fernandes wrote:
> On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > The current memmory-allocation interface presents to following
> > difficulties that this patch is designed to overcome
> [...]
> > ---
> >  kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
> >  1 file changed, 66 insertions(+), 43 deletions(-)
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 06895ef85d69..f2da2a1cc716 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
> >   * per-CPU. Object size is equal to one page. This value
> >   * can be changed at boot time.
> >   */
> > -static int rcu_min_cached_objs = 2;
> > +static int rcu_min_cached_objs = 5;
> >  module_param(rcu_min_cached_objs, int, 0444);
> >  
> >  /* Retrieve RCU kthreads priority for rcutorture */
> > @@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
> >   *	In order to save some per-cpu space the list is singular.
> >   *	Even though it is lockless an access has to be protected by the
> >   *	per-cpu lock.
> > + * @page_cache_work: A work to refill the cache when it is empty
> > + * @work_in_progress: Indicates that page_cache_work is running
> > + * @hrtimer: A hrtimer for scheduling a page_cache_work
> >   * @nr_bkv_objs: number of allocated objects at @bkvcache.
> >   *
> >   * This is a per-CPU structure.  The reason that it is not included in
> > @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
> >  	bool monitor_todo;
> >  	bool initialized;
> >  	int count;
> > +
> > +	struct work_struct page_cache_work;
> > +	atomic_t work_in_progress;
> 
> Does it need to be atomic? run_page_cache_work() is only called under a lock.
> You can use xchg() there. And when you do the atomic_set, you can use
> WRITE_ONCE as it is a data-race.
> 
We can use xchg() together with the *_ONCE() macros. Could you please
clarify what your concern is about using atomic_t? Both xchg() and
atomic_xchg() guarantee atomicity, same as WRITE_ONCE() or atomic_set().

> > @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
> >  
> >  	for_each_possible_cpu(cpu) {
> >  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > -		struct kvfree_rcu_bulk_data *bnode;
> >  
> >  		for (i = 0; i < KFREE_N_BATCHES; i++) {
> >  			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> >  			krcp->krw_arr[i].krcp = krcp;
> >  		}
> >  
> > -		for (i = 0; i < rcu_min_cached_objs; i++) {
> > -			bnode = (struct kvfree_rcu_bulk_data *)
> > -				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > -
> > -			if (bnode)
> > -				put_cached_bnode(krcp, bnode);
> > -			else
> > -				pr_err("Failed to preallocate for %d CPU!\n", cpu);
> > -		}
> > -
> >  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> > +		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> >  		krcp->initialized = true;
> 
> During initialization, is it not better to still pre-allocate? That way you
> don't have to wait to get into a situation where you need to initially
> allocate.
> 
Since we have a worker that refills the cache when it is empty, there is
no strong need to do it during the initialization phase. If we can reduce
the amount of code it is always good :)
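
For completeness, the refill worker is small. Simplified, it does roughly
the following (a sketch only; the real fill_page_cache_func() also handles
the case where the cache is already full):

<snip>
static void fill_page_cache_func(struct work_struct *work)
{
	struct kfree_rcu_cpu *krcp =
		container_of(work, struct kfree_rcu_cpu, page_cache_work);
	struct kvfree_rcu_bulk_data *bnode;
	unsigned long flags;
	int i;

	for (i = 0; i < rcu_min_cached_objs; i++) {
		bnode = (struct kvfree_rcu_bulk_data *)
			__get_free_page(GFP_KERNEL | __GFP_NOWARN);
		if (!bnode)
			break;

		raw_spin_lock_irqsave(&krcp->lock, flags);
		put_cached_bnode(krcp, bnode);
		raw_spin_unlock_irqrestore(&krcp->lock, flags);
	}

	atomic_set(&krcp->work_in_progress, 0);
}
<snip>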

Thanks, Joel.

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-11-03 19:18     ` Paul E. McKenney
@ 2020-11-04 12:35       ` Uladzislau Rezki
  2020-11-04 14:12         ` Paul E. McKenney
  0 siblings, 1 reply; 36+ messages in thread
From: Uladzislau Rezki @ 2020-11-04 12:35 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki, Joel Fernandes, LKML, RCU, Andrew Morton,
	Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Sebastian Andrzej Siewior,
	Oleksiy Avramchenko, willy

On Tue, Nov 03, 2020 at 11:18:22AM -0800, Paul E. McKenney wrote:
> On Tue, Nov 03, 2020 at 05:33:50PM +0100, Uladzislau Rezki wrote:
> > On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> > > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > The current memmory-allocation interface presents to following
> > > > difficulties that this patch is designed to overcome:
> > > > 
> > > > a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
> > > >    complain about violation("BUG: Invalid wait context") of the
> > > >    nesting rules. It does the raw_spinlock vs. spinlock nesting
> > > >    checks, i.e. it is not legal to acquire a spinlock_t while
> > > >    holding a raw_spinlock_t.
> > > > 
> > > >    Internally the kfree_rcu() uses raw_spinlock_t whereas the
> > > >    "page allocator" internally deals with spinlock_t to access
> > > >    to its zones. The code also can be broken from higher level
> > > >    of view:
> > > >    <snip>
> > > >        raw_spin_lock(&some_lock);
> > > >        kfree_rcu(some_pointer, some_field_offset);
> > > >    <snip>
> > > > 
> > > > b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
> > > >    is converted into sleepable variant. Invoking the page allocator from
> > > >    atomic contexts leads to "BUG: scheduling while atomic".
> > > > 
> > > > c) call_rcu() is invoked from raw atomic context and kfree_rcu()
> > > >    and kvfree_rcu() are expected to be called from atomic raw context
> > > >    as well.
> > > > 
> > > > Move out a page allocation from contexts which trigger kvfree_rcu()
> > > > function to the separate worker. When a k[v]free_rcu() per-cpu page
> > > > cache is empty a fallback mechanism is used and a special job is
> > > > scheduled to refill the per-cpu cache.
> > > 
> > > Looks good, still reviewing here. BTW just for my education, I was wondering
> > > about Thomas's email:
> > > https://lkml.org/lkml/2020/8/11/939
> > > 
> > > If slab allocations in pure raw-atomic context on RT is not allowed or
> > > recommended, should kfree_rcu() be allowed?
> > >
> > Thanks for reviewing, Joel :)
> > 
> > The decision was made that we need to support kfree_rcu() from "real atomic contexts",
> > to align with how it used to be before. We can go and just convert our local locks
> > to the spinlock_t variant but that was not Paul goal, it can be that some users need
> > kfree_rcu() for raw atomics.
> 
> People invoke call_rcu() from raw atomics, and so we should provide
> the same for kfree_rcu().  Yes, people could work around a raw-atomic
> prohibition, but such prohibitions incur constant costs over time in
> terms of development effort, increased bug rate, and increased complexity.
> Yes, this does increase all of those for RCU, but the relative increase
> is negligible, RCU being what it is.
> 
I see your point.

> > > slab can have same issue right? If per-cpu cache is drained, it has to
> > > allocate page from buddy allocator and there's no GFP flag to tell it about
> > > context where alloc is happening from.
> > > 
> > Sounds like that. Apart of that, it might turn out soon that we or somebody
> > else will rise a question one more time about something GFP_RAW or GFP_NOLOCKS.
> > So who knows..
> 
> I would prefer that slab provide some way of dealing with raw atomic
> context, but the maintainers are thus far unconvinced.
> 
I think that when preempt_rt is fully integrated into the kernel, we might
get new users with such a demand. So it is not a closed topic so far, IMHO.

> > > Or are we saying that we want to support kfree on RT from raw atomic atomic
> > > context, even though kmalloc is not supported? I hate to bring up this
> > > elephant in the room, but since I am a part of the people maintaining this
> > > code, I believe I would rather set some rules than supporting unsupported
> > > usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> > > raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> > > put a giant warning than supporting it :-(.
> > > 
> > We discussed it several times, the conclusion was that we need to support 
> > kfree_rcu() from raw contexts. At least that was a clear signal from Paul 
> > to me. I think, if we obtain the preemtable(), so it becomes versatile, we
> > can drop the patch that is in question later on in the future.
> 
> Given a universally meaningful preemptible(), we could directly call
> the allocator in some cases.  It might (or might not) still make sense
> to defer the allocation when preemptible() indicated that a direct call
> to the allocator was unsafe.
> 
I do not have a strong opinion here. Given the fact that maintaining such
"deferring" is not considered a big effort, I think we can live with it.
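
To illustrate the idea (only a sketch of the decision, reusing the names
from the patch, not actual tree.c code):

<snip>
	if (preemptible()) {
		// Safe to call the page allocator directly.
		bnode = (struct kvfree_rcu_bulk_data *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
	} else {
		// Unsafe: defer to the worker that refills the cache.
		run_page_cache_worker(krcp);
	}
<snip>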

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-11-04 12:35       ` Uladzislau Rezki
@ 2020-11-04 14:12         ` Paul E. McKenney
  2020-11-04 14:40           ` Uladzislau Rezki
  0 siblings, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2020-11-04 14:12 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: Joel Fernandes, LKML, RCU, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko, willy

On Wed, Nov 04, 2020 at 01:35:53PM +0100, Uladzislau Rezki wrote:
> On Tue, Nov 03, 2020 at 11:18:22AM -0800, Paul E. McKenney wrote:
> > On Tue, Nov 03, 2020 at 05:33:50PM +0100, Uladzislau Rezki wrote:
> > > On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> > > > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > > The current memmory-allocation interface presents to following
> > > > > difficulties that this patch is designed to overcome:
> > > > > [...]
> > > > 
> > > > Looks good, still reviewing here. BTW just for my education, I was wondering
> > > > about Thomas's email:
> > > > https://lkml.org/lkml/2020/8/11/939
> > > > 
> > > > If slab allocations in pure raw-atomic context on RT is not allowed or
> > > > recommended, should kfree_rcu() be allowed?
> > > >
> > > Thanks for reviewing, Joel :)
> > > 
> > > The decision was made that we need to support kfree_rcu() from "real atomic contexts",
> > > to align with how it used to be before. We can go and just convert our local locks
> > > to the spinlock_t variant but that was not Paul goal, it can be that some users need
> > > kfree_rcu() for raw atomics.
> > 
> > People invoke call_rcu() from raw atomics, and so we should provide
> > the same for kfree_rcu().  Yes, people could work around a raw-atomic
> > prohibition, but such prohibitions incur constant costs over time in
> > terms of development effort, increased bug rate, and increased complexity.
> > Yes, this does increase all of those for RCU, but the relative increase
> > is negligible, RCU being what it is.
> > 
> I see your point.
> 
> > > > slab can have same issue right? If per-cpu cache is drained, it has to
> > > > allocate page from buddy allocator and there's no GFP flag to tell it about
> > > > context where alloc is happening from.
> > > > 
> > > Sounds like that. Apart of that, it might turn out soon that we or somebody
> > > else will rise a question one more time about something GFP_RAW or GFP_NOLOCKS.
> > > So who knows..
> > 
> > I would prefer that slab provide some way of dealing with raw atomic
> > context, but the maintainers are thus far unconvinced.
> > 
> I think that when preempt_rt is fully integrated into the kernel, we might
> get new users with such a demand. So it is not a closed topic so far, IMHO.

Agreed!  ;-)

> > > > Or are we saying that we want to support kfree on RT from raw atomic atomic
> > > > context, even though kmalloc is not supported? I hate to bring up this
> > > > elephant in the room, but since I am a part of the people maintaining this
> > > > code, I believe I would rather set some rules than supporting unsupported
> > > > usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> > > > raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> > > > put a giant warning than supporting it :-(.
> > > > 
> > > We discussed it several times, the conclusion was that we need to support 
> > > kfree_rcu() from raw contexts. At least that was a clear signal from Paul 
> > > to me. I think, if we obtain the preemtable(), so it becomes versatile, we
> > > can drop the patch that is in question later on in the future.
> > 
> > Given a universally meaningful preemptible(), we could directly call
> > the allocator in some cases.  It might (or might not) still make sense
> > to defer the allocation when preemptible() indicated that a direct call
> > to the allocator was unsafe.
> > 
> I do not have a strong opinion here. Given the fact that maintaining such
> "deferring" is not considered a big effort, I think we can live with it.

And agreed here as well.  If this were instead a large body of complex
code, I might feel otherwise.  But as it is, why worry?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible
  2020-11-04 11:39     ` Uladzislau Rezki
@ 2020-11-04 14:36       ` Joel Fernandes
  0 siblings, 0 replies; 36+ messages in thread
From: Joel Fernandes @ 2020-11-04 14:36 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: LKML, RCU, Paul E . McKenney, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko

On Wed, Nov 04, 2020 at 12:39:31PM +0100, Uladzislau Rezki wrote:
[..] 
> > > -	if (!krcp->bkvhead[idx] ||
> > > -			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> > > -		bnode = get_cached_bnode(krcp);
> > > -		/* Switch to emergency path. */
> > > +	if (!(*krcp)->bkvhead[idx] ||
> > > +			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> > > +		bnode = get_cached_bnode(*krcp);
> > > +		if (!bnode && can_alloc_page) {
> > 
> > I think you can directly put preemptible() here with a comment saying
> > allocate only if preemptible and get rid of can_alloc_page.
> > 
> Not really. We check preemptible() before acquiring the internal lock;
> otherwise it would always return "false". That is why it is checked once
> at the entry, at the beginning of the function.

You are right. Sorry. Sounds good to me.

 - Joel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-11-04 14:12         ` Paul E. McKenney
@ 2020-11-04 14:40           ` Uladzislau Rezki
  0 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki @ 2020-11-04 14:40 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Uladzislau Rezki, Joel Fernandes, LKML, RCU, Andrew Morton,
	Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Sebastian Andrzej Siewior,
	Oleksiy Avramchenko, willy

On Wed, Nov 04, 2020 at 06:12:00AM -0800, Paul E. McKenney wrote:
> On Wed, Nov 04, 2020 at 01:35:53PM +0100, Uladzislau Rezki wrote:
> > On Tue, Nov 03, 2020 at 11:18:22AM -0800, Paul E. McKenney wrote:
> > > On Tue, Nov 03, 2020 at 05:33:50PM +0100, Uladzislau Rezki wrote:
> > > > On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> > > > > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > > > The current memmory-allocation interface presents to following
> > > > > > difficulties that this patch is designed to overcome:
> > > > > > [...]
> > > > > 
> > > > > Looks good, still reviewing here. BTW just for my education, I was wondering
> > > > > about Thomas's email:
> > > > > https://lkml.org/lkml/2020/8/11/939
> > > > > 
> > > > > If slab allocations in pure raw-atomic context on RT is not allowed or
> > > > > recommended, should kfree_rcu() be allowed?
> > > > >
> > > > Thanks for reviewing, Joel :)
> > > > 
> > > > The decision was made that we need to support kfree_rcu() from "real atomic contexts",
> > > > to align with how it used to be before. We can go and just convert our local locks
> > > > to the spinlock_t variant but that was not Paul goal, it can be that some users need
> > > > kfree_rcu() for raw atomics.
> > > 
> > > People invoke call_rcu() from raw atomics, and so we should provide
> > > the same for kfree_rcu().  Yes, people could work around a raw-atomic
> > > prohibition, but such prohibitions incur constant costs over time in
> > > terms of development effort, increased bug rate, and increased complexity.
> > > Yes, this does increase all of those for RCU, but the relative increase
> > > is negligible, RCU being what it is.
> > > 
> > I see your point.
> > 
> > > > > slab can have same issue right? If per-cpu cache is drained, it has to
> > > > > allocate page from buddy allocator and there's no GFP flag to tell it about
> > > > > context where alloc is happening from.
> > > > > 
> > > > Sounds like that. Apart of that, it might turn out soon that we or somebody
> > > > else will rise a question one more time about something GFP_RAW or GFP_NOLOCKS.
> > > > So who knows..
> > > 
> > > I would prefer that slab provide some way of dealing with raw atomic
> > > context, but the maintainers are thus far unconvinced.
> > > 
> > I think that when preempt_rt is fully integrated into the kernel, we might
> > get new users with such a demand. So it is not a closed topic so far, IMHO.
> 
> Agreed!  ;-)
> 
> > > > > Or are we saying that we want to support kfree on RT from raw atomic atomic
> > > > > context, even though kmalloc is not supported? I hate to bring up this
> > > > > elephant in the room, but since I am a part of the people maintaining this
> > > > > code, I believe I would rather set some rules than supporting unsupported
> > > > > usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> > > > > raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> > > > > put a giant warning than supporting it :-(.
> > > > > 
> > > > We discussed it several times, the conclusion was that we need to support 
> > > > kfree_rcu() from raw contexts. At least that was a clear signal from Paul 
> > > > to me. I think, if we obtain the preemtable(), so it becomes versatile, we
> > > > can drop the patch that is in question later on in the future.
> > > 
> > > Given a universally meaningful preemptible(), we could directly call
> > > the allocator in some cases.  It might (or might not) still make sense
> > > to defer the allocation when preemptible() indicated that a direct call
> > > to the allocator was unsafe.
> > > 
> > I do not have a strong opinion here. Given the fact that maintaining such
> > "deferring" is not considered a big effort, I think we can live with it.
> 
> And agreed here as well.  If this were instead a large body of complex
> code, I might feel otherwise.  But as it is, why worry?
> 
Agreed! I do not consider it to be extra complexity.

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-11-04 12:12   ` Uladzislau Rezki
@ 2020-11-04 15:01     ` Joel Fernandes
  2020-11-04 18:38       ` Uladzislau Rezki
  0 siblings, 1 reply; 36+ messages in thread
From: Joel Fernandes @ 2020-11-04 15:01 UTC (permalink / raw)
  To: Uladzislau Rezki
  Cc: LKML, RCU, Paul E . McKenney, Andrew Morton, Peter Zijlstra,
	Michal Hocko, Thomas Gleixner, Theodore Y . Ts'o,
	Sebastian Andrzej Siewior, Oleksiy Avramchenko

On Wed, Nov 04, 2020 at 01:12:03PM +0100, Uladzislau Rezki wrote:
> On Tue, Nov 03, 2020 at 12:54:22PM -0500, Joel Fernandes wrote:
> > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > The current memmory-allocation interface presents to following
> > > difficulties that this patch is designed to overcome
> > [...]
> > > ---
> > >  kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
> > >  1 file changed, 66 insertions(+), 43 deletions(-)
> > > 
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index 06895ef85d69..f2da2a1cc716 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
> > >   * per-CPU. Object size is equal to one page. This value
> > >   * can be changed at boot time.
> > >   */
> > > -static int rcu_min_cached_objs = 2;
> > > +static int rcu_min_cached_objs = 5;
> > >  module_param(rcu_min_cached_objs, int, 0444);
> > >  
> > >  /* Retrieve RCU kthreads priority for rcutorture */
> > > @@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
> > >   *	In order to save some per-cpu space the list is singular.
> > >   *	Even though it is lockless an access has to be protected by the
> > >   *	per-cpu lock.
> > > + * @page_cache_work: A work to refill the cache when it is empty
> > > + * @work_in_progress: Indicates that page_cache_work is running
> > > + * @hrtimer: A hrtimer for scheduling a page_cache_work
> > >   * @nr_bkv_objs: number of allocated objects at @bkvcache.
> > >   *
> > >   * This is a per-CPU structure.  The reason that it is not included in
> > > @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
> > >  	bool monitor_todo;
> > >  	bool initialized;
> > >  	int count;
> > > +
> > > +	struct work_struct page_cache_work;
> > > +	atomic_t work_in_progress;
> > 
> > Does it need to be atomic? run_page_cache_work() is only called under a lock.
> > You can use xchg() there. And when you do the atomic_set, you can use
> > WRITE_ONCE as it is a data-race.
> > 
> We can use xchg() together with the *_ONCE() macros. Could you please
> clarify what your concern is about using atomic_t? Both xchg() and
> atomic_xchg() guarantee atomicity, same as WRITE_ONCE() or atomic_set().

Right, whether there is a lock or not does not matter, as xchg() is also an
atomic swap.

atomic_t is a more complex type though; I would directly use an int since
atomic_t is not needed here and there is no lost-update issue. It could be
a matter of style as well.

BTW, I did think atomic_xchg() adds additional memory barriers, but I could
not find that to be the case in the implementation. Is that not the case?
The docs say "atomic_xchg must provide explicit memory barriers around the
operation.".

> > > @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
> > >  
> > >  	for_each_possible_cpu(cpu) {
> > >  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > -		struct kvfree_rcu_bulk_data *bnode;
> > >  
> > >  		for (i = 0; i < KFREE_N_BATCHES; i++) {
> > >  			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> > >  			krcp->krw_arr[i].krcp = krcp;
> > >  		}
> > >  
> > > -		for (i = 0; i < rcu_min_cached_objs; i++) {
> > > -			bnode = (struct kvfree_rcu_bulk_data *)
> > > -				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > > -
> > > -			if (bnode)
> > > -				put_cached_bnode(krcp, bnode);
> > > -			else
> > > -				pr_err("Failed to preallocate for %d CPU!\n", cpu);
> > > -		}
> > > -
> > >  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> > > +		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> > >  		krcp->initialized = true;
> > 
> > During initialization, is it not better to still pre-allocate? That way you
> > don't have to wait to get into a situation where you need to initially
> > allocate.
> > 
> Since we have a worker that refills the cache when it is empty, there is
> no strong need to do it during the initialization phase. If we can reduce
> the amount of code it is always good :)

I am all for not having more code than needed. But you would hit
synchronize_rcu() slow path immediately on first headless kfree_rcu() right?
That seems like a step back from the current code :)
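
By "slow path" I mean something like the following for the headless case
when no page can be obtained ("success", "head" and "ptr" are the local
names I am assuming in kvfree_call_rcu(); this is only an illustration):

<snip>
	if (!success && !head) {
		/*
		 * Headless (single-argument) kvfree_rcu() with no space
		 * to record the pointer: wait for a grace period
		 * synchronously and then free the object.
		 */
		synchronize_rcu();
		kvfree(ptr);
	}
<snip>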

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
  2020-11-04 15:01     ` Joel Fernandes
@ 2020-11-04 18:38       ` Uladzislau Rezki
  0 siblings, 0 replies; 36+ messages in thread
From: Uladzislau Rezki @ 2020-11-04 18:38 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Uladzislau Rezki, LKML, RCU, Paul E . McKenney, Andrew Morton,
	Peter Zijlstra, Michal Hocko, Thomas Gleixner,
	Theodore Y . Ts'o, Sebastian Andrzej Siewior,
	Oleksiy Avramchenko

> > > >   * This is a per-CPU structure.  The reason that it is not included in
> > > > @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
> > > >  	bool monitor_todo;
> > > >  	bool initialized;
> > > >  	int count;
> > > > +
> > > > +	struct work_struct page_cache_work;
> > > > +	atomic_t work_in_progress;
> > > 
> > > Does it need to be atomic? run_page_cache_work() is only called under a lock.
> > > You can use xchg() there. And when you do the atomic_set, you can use
> > > WRITE_ONCE as it is a data-race.
> > > 
> > We can use xchg() together with the *_ONCE() macros. Could you please
> > clarify what your concern is about using atomic_t? Both xchg() and
> > atomic_xchg() guarantee atomicity, same as WRITE_ONCE() or atomic_set().
> 
> Right, whether there is a lock or not does not matter, as xchg() is also an
> atomic swap.
> 
> atomic_t is a more complex type though; I would directly use an int since
> atomic_t is not needed here and there is no lost-update issue. It could be
> a matter of style as well.
> 
> BTW, I did think atomic_xchg() adds additional memory barriers, but I could
> not find that to be the case in the implementation. Is that not the case?
> The docs say "atomic_xchg must provide explicit memory barriers around the
> operation.".
> 
On most systems atomic_xchg() is the same as xchg() and atomic_set() is
the same as WRITE_ONCE(). But there are exceptions, for example "parisc":

*** arch/parisc/include/asm/atomic.h:
<snip>
...
#define _atomic_spin_lock_irqsave(l,f) do { \
    arch_spinlock_t *s = ATOMIC_HASH(l); \
    local_irq_save(f);   \
    arch_spin_lock(s);   \
} while(0)
...
static __inline__ void atomic_set(atomic_t *v, int i)
{
     unsigned long flags;
     _atomic_spin_lock_irqsave(v, flags);

     v->counter = i;

     _atomic_spin_unlock_irqrestore(v, flags);
}
<snip>

I will switch to xchg() and WRITE_ONCE(), because of such specific ARCHs.
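
So the flag becomes a plain int, roughly as below (just a sketch;
queue_work() here is only a stand-in for however the refill work actually
gets kicked off):

<snip>
	struct work_struct page_cache_work;
	int work_in_progress;		/* was atomic_t */
	...
	/* Claim the flag; xchg() is fully ordered, like atomic_xchg(). */
	if (!xchg(&krcp->work_in_progress, 1))
		queue_work(system_wq, &krcp->page_cache_work);
	...
	/* Release it from the worker once the cache has been refilled. */
	WRITE_ONCE(krcp->work_in_progress, 0);
<snip>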

> > > > @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
> > > >  
> > > >  	for_each_possible_cpu(cpu) {
> > > >  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > > -		struct kvfree_rcu_bulk_data *bnode;
> > > >  
> > > >  		for (i = 0; i < KFREE_N_BATCHES; i++) {
> > > >  			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> > > >  			krcp->krw_arr[i].krcp = krcp;
> > > >  		}
> > > >  
> > > > -		for (i = 0; i < rcu_min_cached_objs; i++) {
> > > > -			bnode = (struct kvfree_rcu_bulk_data *)
> > > > -				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > > > -
> > > > -			if (bnode)
> > > > -				put_cached_bnode(krcp, bnode);
> > > > -			else
> > > > -				pr_err("Failed to preallocate for %d CPU!\n", cpu);
> > > > -		}
> > > > -
> > > >  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> > > > +		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> > > >  		krcp->initialized = true;
> > > 
> > > During initialization, is it not better to still pre-allocate? That way you
> > > don't have to wait to get into a situation where you need to initially
> > > allocate.
> > > 
> > Since we have a worker that refills the cache when it is empty, there is
> > no strong need to do it during the initialization phase. If we can reduce
> > the amount of code it is always good :)
> 
> I am all for not having more code than needed. But you would hit
> synchronize_rcu() slow path immediately on first headless kfree_rcu() right?
> That seems like a step back from the current code :)
> 
As for the slow path and hitting synchronize_rcu() immediately: yes, the
slow-hit "counter" will be increased by 1, so the difference between the
two variants is N versus N + 1 times. I do not consider N + 1 to be a big
difference or a big impact on performance.

Should we guarantee that the first user does not hit a fallback path that
invokes synchronize_rcu()? If not, I would rather remove the redundant code.

Any thoughts here?

Thanks!

--
Vlad Rezki

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2020-11-04 18:38 UTC | newest]

Thread overview: 36+ messages
2020-10-29 16:50 [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 02/16] lib/debug: Remove pointless ARCH_NO_PREEMPT dependencies Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 03/16] preempt: Make preempt count unconditional Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 04/16] preempt: Cleanup PREEMPT_COUNT leftovers Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 05/16] lockdep: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 06/16] mm/pagemap: " Uladzislau Rezki (Sony)
2020-10-29 20:57   ` Uladzislau Rezki
2020-10-29 21:26     ` Paul E. McKenney
2020-10-29 16:50 ` [PATCH 07/16] locking/bitspinlock: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 08/16] uaccess: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 09/16] sched: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 10/16] ARM: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 11/16] xtensa: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 12/16] drm/i915: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 13/16] rcutorture: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 14/16] preempt: Remove PREEMPT_COUNT from Kconfig Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible Uladzislau Rezki (Sony)
2020-11-03 18:03   ` Joel Fernandes
2020-11-04 11:39     ` Uladzislau Rezki
2020-11-04 14:36       ` Joel Fernandes
2020-10-29 16:50 ` [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache Uladzislau Rezki (Sony)
2020-10-29 19:47   ` Paul E. McKenney
2020-10-29 20:13     ` Uladzislau Rezki
2020-10-29 20:22       ` Uladzislau Rezki
2020-10-29 20:33         ` Paul E. McKenney
2020-10-29 21:00           ` Uladzislau Rezki
2020-11-03 15:47 ` [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Joel Fernandes
2020-11-03 16:33   ` Uladzislau Rezki
2020-11-03 19:18     ` Paul E. McKenney
2020-11-04 12:35       ` Uladzislau Rezki
2020-11-04 14:12         ` Paul E. McKenney
2020-11-04 14:40           ` Uladzislau Rezki
2020-11-03 17:54 ` Joel Fernandes
2020-11-04 12:12   ` Uladzislau Rezki
2020-11-04 15:01     ` Joel Fernandes
2020-11-04 18:38       ` Uladzislau Rezki
