* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
       [not found] <20230411130854.46795-1-zhengqi.arch@bytedance.com>
@ 2023-04-11 13:40 ` Vlastimil Babka
  2023-04-11 14:08   ` Qi Zheng
  0 siblings, 1 reply; 23+ messages in thread
From: Vlastimil Babka @ 2023-04-11 13:40 UTC (permalink / raw)
  To: Qi Zheng, 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim,
	rientjes, penberg, cl
  Cc: linux-mm, linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney

On 4/11/23 15:08, Qi Zheng wrote:
> The list_lock can be held in the critical section of
> raw_spinlock, and then lockdep will complain about it
> like below:
> 
>  =============================
>  [ BUG: Invalid wait context ]
>  6.3.0-rc6-next-20230411 #7 Not tainted
>  -----------------------------
>  swapper/0/1 is trying to lock:
>  ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
>  other info that might help us debug this:
>  context-{5:5}
>  2 locks held by swapper/0/1:
>   #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
>   #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>  stack backtrace:
>  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
>  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
>  Call Trace:
>   <TASK>
>   dump_stack_lvl+0x77/0xc0
>   __lock_acquire+0xa65/0x2950
>   ? arch_stack_walk+0x65/0xf0
>   ? arch_stack_walk+0x65/0xf0
>   ? unwind_next_frame+0x602/0x8d0
>   lock_acquire+0xe0/0x300
>   ? ___slab_alloc+0x73d/0x1330
>   ? find_usage_forwards+0x39/0x50
>   ? check_irq_usage+0x162/0xa70
>   ? __bfs+0x10c/0x2c0
>   _raw_spin_lock_irqsave+0x4f/0x90
>   ? ___slab_alloc+0x73d/0x1330
>   ___slab_alloc+0x73d/0x1330
>   ? fill_pool+0x16b/0x2a0
>   ? look_up_lock_class+0x5d/0x160
>   ? register_lock_class+0x48/0x500
>   ? __lock_acquire+0xabc/0x2950
>   ? fill_pool+0x16b/0x2a0
>   kmem_cache_alloc+0x358/0x3b0
>   ? __lock_acquire+0xabc/0x2950
>   fill_pool+0x16b/0x2a0
>   ? __debug_object_init+0x292/0x560
>   ? lock_acquire+0xe0/0x300
>   ? cblist_init_generic+0x232/0x2d0
>   __debug_object_init+0x2c/0x560
>   cblist_init_generic+0x147/0x2d0
>   rcu_init_tasks_generic+0x15/0x190
>   kernel_init_freeable+0x6e/0x3e0
>   ? rest_init+0x1e0/0x1e0
>   kernel_init+0x1b/0x1d0
>   ? rest_init+0x1e0/0x1e0
>   ret_from_fork+0x1f/0x30
>   </TASK>
> 
> The fill_pool() can only be called in the !PREEMPT_RT kernel
> or in the preemptible context of the PREEMPT_RT kernel, so
> the above warning is not a real issue, but it's better to
> annotate kmem_cache_node->list_lock as raw_spinlock to get
> rid of such issue.

+ CC some RT and RCU people

AFAIK raw_spinlock is not just an annotation, but on RT it changes the
implementation from preemptible mutex to actual spin lock, so it would be
rather unfortunate to do that for a spurious warning. Can it be somehow
fixed in a better way?
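
To illustrate the distinction, a minimal kernel-style C sketch follows
(hypothetical locks, not from the patch): with CONFIG_PREEMPT_RT a
spinlock_t is backed by an rt_mutex and its critical section stays
preemptible, while a raw_spinlock_t really spins with preemption disabled
on every configuration.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(sleeping_lock);		/* an rt_mutex on PREEMPT_RT */
static DEFINE_RAW_SPINLOCK(hard_lock);		/* a real spinning lock everywhere */

static void lock_model_sketch(void)
{
	unsigned long flags;

	spin_lock_irqsave(&sleeping_lock, flags);
	/* On PREEMPT_RT the holder may be preempted here. */
	spin_unlock_irqrestore(&sleeping_lock, flags);

	raw_spin_lock_irqsave(&hard_lock, flags);
	/* Preemption and interrupts stay off on every config; keep it short. */
	raw_spin_unlock_irqrestore(&hard_lock, flags);
}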

> Reported-by: Zhao Gongyi <zhaogongyi@bytedance.com>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  mm/slab.h |  4 ++--
>  mm/slub.c | 66 +++++++++++++++++++++++++++----------------------------
>  2 files changed, 35 insertions(+), 35 deletions(-)
> 
> diff --git a/mm/slab.h b/mm/slab.h
> index f01ac256a8f5..43f3436d13b4 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -723,8 +723,9 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
>   * The slab lists for all objects.
>   */
>  struct kmem_cache_node {
> -#ifdef CONFIG_SLAB
>  	raw_spinlock_t list_lock;
> +
> +#ifdef CONFIG_SLAB
>  	struct list_head slabs_partial;	/* partial list first, better asm code */
>  	struct list_head slabs_full;
>  	struct list_head slabs_free;
> @@ -740,7 +741,6 @@ struct kmem_cache_node {
>  #endif
>  
>  #ifdef CONFIG_SLUB
> -	spinlock_t list_lock;
>  	unsigned long nr_partial;
>  	struct list_head partial;
>  #ifdef CONFIG_SLUB_DEBUG
> diff --git a/mm/slub.c b/mm/slub.c
> index c87628cd8a9a..e66a35643624 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1331,7 +1331,7 @@ static void add_full(struct kmem_cache *s,
>  	if (!(s->flags & SLAB_STORE_USER))
>  		return;
>  
> -	lockdep_assert_held(&n->list_lock);
> +	assert_raw_spin_locked(&n->list_lock);
>  	list_add(&slab->slab_list, &n->full);
>  }
>  
> @@ -1340,7 +1340,7 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct
>  	if (!(s->flags & SLAB_STORE_USER))
>  		return;
>  
> -	lockdep_assert_held(&n->list_lock);
> +	assert_raw_spin_locked(&n->list_lock);
>  	list_del(&slab->slab_list);
>  }
>  
> @@ -2113,14 +2113,14 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
>  static inline void add_partial(struct kmem_cache_node *n,
>  				struct slab *slab, int tail)
>  {
> -	lockdep_assert_held(&n->list_lock);
> +	assert_raw_spin_locked(&n->list_lock);
>  	__add_partial(n, slab, tail);
>  }
>  
>  static inline void remove_partial(struct kmem_cache_node *n,
>  					struct slab *slab)
>  {
> -	lockdep_assert_held(&n->list_lock);
> +	assert_raw_spin_locked(&n->list_lock);
>  	list_del(&slab->slab_list);
>  	n->nr_partial--;
>  }
> @@ -2136,7 +2136,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
>  {
>  	void *object;
>  
> -	lockdep_assert_held(&n->list_lock);
> +	assert_raw_spin_locked(&n->list_lock);
>  
>  	object = slab->freelist;
>  	slab->freelist = get_freepointer(s, object);
> @@ -2181,7 +2181,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s,
>  		 */
>  		return NULL;
>  
> -	spin_lock_irqsave(&n->list_lock, flags);
> +	raw_spin_lock_irqsave(&n->list_lock, flags);
>  
>  	if (slab->inuse == slab->objects)
>  		add_full(s, n, slab);
> @@ -2189,7 +2189,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s,
>  		add_partial(n, slab, DEACTIVATE_TO_HEAD);
>  
>  	inc_slabs_node(s, nid, slab->objects);
> -	spin_unlock_irqrestore(&n->list_lock, flags);
> +	raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  
>  	return object;
>  }
> @@ -2208,7 +2208,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
>  	unsigned long counters;
>  	struct slab new;
>  
> -	lockdep_assert_held(&n->list_lock);
> +	assert_raw_spin_locked(&n->list_lock);
>  
>  	/*
>  	 * Zap the freelist and set the frozen bit.
> @@ -2267,7 +2267,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
>  	if (!n || !n->nr_partial)
>  		return NULL;
>  
> -	spin_lock_irqsave(&n->list_lock, flags);
> +	raw_spin_lock_irqsave(&n->list_lock, flags);
>  	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
>  		void *t;
>  
> @@ -2304,7 +2304,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
>  #endif
>  
>  	}
> -	spin_unlock_irqrestore(&n->list_lock, flags);
> +	raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  	return object;
>  }
>  
> @@ -2548,7 +2548,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
>  		 * Taking the spinlock removes the possibility that
>  		 * acquire_slab() will see a slab that is frozen
>  		 */
> -		spin_lock_irqsave(&n->list_lock, flags);
> +		raw_spin_lock_irqsave(&n->list_lock, flags);
>  	} else {
>  		mode = M_FULL_NOLIST;
>  	}
> @@ -2559,14 +2559,14 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
>  				new.freelist, new.counters,
>  				"unfreezing slab")) {
>  		if (mode == M_PARTIAL)
> -			spin_unlock_irqrestore(&n->list_lock, flags);
> +			raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  		goto redo;
>  	}
>  
>  
>  	if (mode == M_PARTIAL) {
>  		add_partial(n, slab, tail);
> -		spin_unlock_irqrestore(&n->list_lock, flags);
> +		raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  		stat(s, tail);
>  	} else if (mode == M_FREE) {
>  		stat(s, DEACTIVATE_EMPTY);
> @@ -2594,10 +2594,10 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
>  		n2 = get_node(s, slab_nid(slab));
>  		if (n != n2) {
>  			if (n)
> -				spin_unlock_irqrestore(&n->list_lock, flags);
> +				raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  
>  			n = n2;
> -			spin_lock_irqsave(&n->list_lock, flags);
> +			raw_spin_lock_irqsave(&n->list_lock, flags);
>  		}
>  
>  		do {
> @@ -2626,7 +2626,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
>  	}
>  
>  	if (n)
> -		spin_unlock_irqrestore(&n->list_lock, flags);
> +		raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  
>  	while (slab_to_discard) {
>  		slab = slab_to_discard;
> @@ -2951,10 +2951,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
>  	unsigned long x = 0;
>  	struct slab *slab;
>  
> -	spin_lock_irqsave(&n->list_lock, flags);
> +	raw_spin_lock_irqsave(&n->list_lock, flags);
>  	list_for_each_entry(slab, &n->partial, slab_list)
>  		x += get_count(slab);
> -	spin_unlock_irqrestore(&n->list_lock, flags);
> +	raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  	return x;
>  }
>  #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
> @@ -3515,7 +3515,7 @@ static noinline void free_to_partial_list(
>  	if (s->flags & SLAB_STORE_USER)
>  		handle = set_track_prepare();
>  
> -	spin_lock_irqsave(&n->list_lock, flags);
> +	raw_spin_lock_irqsave(&n->list_lock, flags);
>  
>  	if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
>  		void *prior = slab->freelist;
> @@ -3554,7 +3554,7 @@ static noinline void free_to_partial_list(
>  		dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
>  	}
>  
> -	spin_unlock_irqrestore(&n->list_lock, flags);
> +	raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  
>  	if (slab_free) {
>  		stat(s, FREE_SLAB);
> @@ -3594,7 +3594,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  
>  	do {
>  		if (unlikely(n)) {
> -			spin_unlock_irqrestore(&n->list_lock, flags);
> +			raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  			n = NULL;
>  		}
>  		prior = slab->freelist;
> @@ -3626,7 +3626,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  				 * Otherwise the list_lock will synchronize with
>  				 * other processors updating the list of slabs.
>  				 */
> -				spin_lock_irqsave(&n->list_lock, flags);
> +				raw_spin_lock_irqsave(&n->list_lock, flags);
>  
>  			}
>  		}
> @@ -3668,7 +3668,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  		add_partial(n, slab, DEACTIVATE_TO_TAIL);
>  		stat(s, FREE_ADD_PARTIAL);
>  	}
> -	spin_unlock_irqrestore(&n->list_lock, flags);
> +	raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  	return;
>  
>  slab_empty:
> @@ -3683,7 +3683,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  		remove_full(s, n, slab);
>  	}
>  
> -	spin_unlock_irqrestore(&n->list_lock, flags);
> +	raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  	stat(s, FREE_SLAB);
>  	discard_slab(s, slab);
>  }
> @@ -4180,7 +4180,7 @@ static void
>  init_kmem_cache_node(struct kmem_cache_node *n)
>  {
>  	n->nr_partial = 0;
> -	spin_lock_init(&n->list_lock);
> +	raw_spin_lock_init(&n->list_lock);
>  	INIT_LIST_HEAD(&n->partial);
>  #ifdef CONFIG_SLUB_DEBUG
>  	atomic_long_set(&n->nr_slabs, 0);
> @@ -4576,7 +4576,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
>  	struct slab *slab, *h;
>  
>  	BUG_ON(irqs_disabled());
> -	spin_lock_irq(&n->list_lock);
> +	raw_spin_lock_irq(&n->list_lock);
>  	list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
>  		if (!slab->inuse) {
>  			remove_partial(n, slab);
> @@ -4586,7 +4586,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
>  			  "Objects remaining in %s on __kmem_cache_shutdown()");
>  		}
>  	}
> -	spin_unlock_irq(&n->list_lock);
> +	raw_spin_unlock_irq(&n->list_lock);
>  
>  	list_for_each_entry_safe(slab, h, &discard, slab_list)
>  		discard_slab(s, slab);
> @@ -4790,7 +4790,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
>  		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
>  			INIT_LIST_HEAD(promote + i);
>  
> -		spin_lock_irqsave(&n->list_lock, flags);
> +		raw_spin_lock_irqsave(&n->list_lock, flags);
>  
>  		/*
>  		 * Build lists of slabs to discard or promote.
> @@ -4822,7 +4822,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
>  		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
>  			list_splice(promote + i, &n->partial);
>  
> -		spin_unlock_irqrestore(&n->list_lock, flags);
> +		raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  
>  		/* Release empty slabs */
>  		list_for_each_entry_safe(slab, t, &discard, slab_list)
> @@ -5147,7 +5147,7 @@ static int validate_slab_node(struct kmem_cache *s,
>  	struct slab *slab;
>  	unsigned long flags;
>  
> -	spin_lock_irqsave(&n->list_lock, flags);
> +	raw_spin_lock_irqsave(&n->list_lock, flags);
>  
>  	list_for_each_entry(slab, &n->partial, slab_list) {
>  		validate_slab(s, slab, obj_map);
> @@ -5173,7 +5173,7 @@ static int validate_slab_node(struct kmem_cache *s,
>  	}
>  
>  out:
> -	spin_unlock_irqrestore(&n->list_lock, flags);
> +	raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  	return count;
>  }
>  
> @@ -6399,12 +6399,12 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
>  		if (!atomic_long_read(&n->nr_slabs))
>  			continue;
>  
> -		spin_lock_irqsave(&n->list_lock, flags);
> +		raw_spin_lock_irqsave(&n->list_lock, flags);
>  		list_for_each_entry(slab, &n->partial, slab_list)
>  			process_slab(t, s, slab, alloc, obj_map);
>  		list_for_each_entry(slab, &n->full, slab_list)
>  			process_slab(t, s, slab, alloc, obj_map);
> -		spin_unlock_irqrestore(&n->list_lock, flags);
> +		raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  	}
>  
>  	/* Sort locations by count */


* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-11 13:40 ` [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock Vlastimil Babka
@ 2023-04-11 14:08   ` Qi Zheng
  2023-04-11 14:19     ` Vlastimil Babka
  0 siblings, 1 reply; 23+ messages in thread
From: Qi Zheng @ 2023-04-11 14:08 UTC (permalink / raw)
  To: Vlastimil Babka, 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim,
	rientjes, penberg, cl
  Cc: linux-mm, linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney



On 2023/4/11 21:40, Vlastimil Babka wrote:
> On 4/11/23 15:08, Qi Zheng wrote:
>> The list_lock can be held in the critical section of
>> raw_spinlock, and then lockdep will complain about it
>> like below:
>>
>>   =============================
>>   [ BUG: Invalid wait context ]
>>   6.3.0-rc6-next-20230411 #7 Not tainted
>>   -----------------------------
>>   swapper/0/1 is trying to lock:
>>   ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
>>   other info that might help us debug this:
>>   context-{5:5}
>>   2 locks held by swapper/0/1:
>>    #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
>>    #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>   stack backtrace:
>>   CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
>>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
>>   Call Trace:
>>    <TASK>
>>    dump_stack_lvl+0x77/0xc0
>>    __lock_acquire+0xa65/0x2950
>>    ? arch_stack_walk+0x65/0xf0
>>    ? arch_stack_walk+0x65/0xf0
>>    ? unwind_next_frame+0x602/0x8d0
>>    lock_acquire+0xe0/0x300
>>    ? ___slab_alloc+0x73d/0x1330
>>    ? find_usage_forwards+0x39/0x50
>>    ? check_irq_usage+0x162/0xa70
>>    ? __bfs+0x10c/0x2c0
>>    _raw_spin_lock_irqsave+0x4f/0x90
>>    ? ___slab_alloc+0x73d/0x1330
>>    ___slab_alloc+0x73d/0x1330
>>    ? fill_pool+0x16b/0x2a0
>>    ? look_up_lock_class+0x5d/0x160
>>    ? register_lock_class+0x48/0x500
>>    ? __lock_acquire+0xabc/0x2950
>>    ? fill_pool+0x16b/0x2a0
>>    kmem_cache_alloc+0x358/0x3b0
>>    ? __lock_acquire+0xabc/0x2950
>>    fill_pool+0x16b/0x2a0
>>    ? __debug_object_init+0x292/0x560
>>    ? lock_acquire+0xe0/0x300
>>    ? cblist_init_generic+0x232/0x2d0
>>    __debug_object_init+0x2c/0x560
>>    cblist_init_generic+0x147/0x2d0
>>    rcu_init_tasks_generic+0x15/0x190
>>    kernel_init_freeable+0x6e/0x3e0
>>    ? rest_init+0x1e0/0x1e0
>>    kernel_init+0x1b/0x1d0
>>    ? rest_init+0x1e0/0x1e0
>>    ret_from_fork+0x1f/0x30
>>    </TASK>
>>
>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>> or in the preemptible context of the PREEMPT_RT kernel, so
>> the above warning is not a real issue, but it's better to
>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>> rid of such issue.
> 
> + CC some RT and RCU people

Thanks.

> 
> AFAIK raw_spinlock is not just an annotation, but on RT it changes the
> implementation from preemptible mutex to actual spin lock, so it would be

Yeah.

> rather unfortunate to do that for a spurious warning. Can it be somehow
> fixed in a better way?

It's indeed unfortunate for the warning in the commit message. But
functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
in the critical section of raw_spinlock or in the hardirq context, which
will cause problem in the PREEMPT_RT kernel. So I still think it is
reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.
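
As a hypothetical example of such a caller (illustrative only, the lock and
function names are made up): on PREEMPT_RT the slab slowpath behind
kmem_cache_alloc() can take n->list_lock, currently a spinlock_t, and doing
that from inside a raw_spinlock_t section is exactly the nesting the
"Invalid wait context" check complains about:

#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_raw_lock);	/* hypothetical lock */

static void *alloc_under_raw_lock(struct kmem_cache *cachep)
{
	unsigned long flags;
	void *obj;

	raw_spin_lock_irqsave(&example_raw_lock, flags);
	/* The slowpath may take n->list_lock (a spinlock_t) on PREEMPT_RT. */
	obj = kmem_cache_alloc(cachep, GFP_ATOMIC);
	raw_spin_unlock_irqrestore(&example_raw_lock, flags);

	return obj;
}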

In addition, there are many fix patches for this kind of warning in the
git log, so I also think there should be a general and better solution. :)

> 

-- 
Thanks,
Qi

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-11 14:08   ` Qi Zheng
@ 2023-04-11 14:19     ` Vlastimil Babka
  2023-04-11 14:25       ` Qi Zheng
  0 siblings, 1 reply; 23+ messages in thread
From: Vlastimil Babka @ 2023-04-11 14:19 UTC (permalink / raw)
  To: Qi Zheng, 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim,
	rientjes, penberg, cl
  Cc: linux-mm, linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney

On 4/11/23 16:08, Qi Zheng wrote:
> 
> 
> On 2023/4/11 21:40, Vlastimil Babka wrote:
>> On 4/11/23 15:08, Qi Zheng wrote:
>>> The list_lock can be held in the critical section of
>>> raw_spinlock, and then lockdep will complain about it
>>> like below:
>>>
>>>   =============================
>>>   [ BUG: Invalid wait context ]
>>>   6.3.0-rc6-next-20230411 #7 Not tainted
>>>   -----------------------------
>>>   swapper/0/1 is trying to lock:
>>>   ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
>>>   other info that might help us debug this:
>>>   context-{5:5}
>>>   2 locks held by swapper/0/1:
>>>    #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
>>>    #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>>   stack backtrace:
>>>   CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
>>>   Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
>>>   Call Trace:
>>>    <TASK>
>>>    dump_stack_lvl+0x77/0xc0
>>>    __lock_acquire+0xa65/0x2950
>>>    ? arch_stack_walk+0x65/0xf0
>>>    ? arch_stack_walk+0x65/0xf0
>>>    ? unwind_next_frame+0x602/0x8d0
>>>    lock_acquire+0xe0/0x300
>>>    ? ___slab_alloc+0x73d/0x1330
>>>    ? find_usage_forwards+0x39/0x50
>>>    ? check_irq_usage+0x162/0xa70
>>>    ? __bfs+0x10c/0x2c0
>>>    _raw_spin_lock_irqsave+0x4f/0x90
>>>    ? ___slab_alloc+0x73d/0x1330
>>>    ___slab_alloc+0x73d/0x1330
>>>    ? fill_pool+0x16b/0x2a0
>>>    ? look_up_lock_class+0x5d/0x160
>>>    ? register_lock_class+0x48/0x500
>>>    ? __lock_acquire+0xabc/0x2950
>>>    ? fill_pool+0x16b/0x2a0
>>>    kmem_cache_alloc+0x358/0x3b0
>>>    ? __lock_acquire+0xabc/0x2950
>>>    fill_pool+0x16b/0x2a0
>>>    ? __debug_object_init+0x292/0x560
>>>    ? lock_acquire+0xe0/0x300
>>>    ? cblist_init_generic+0x232/0x2d0
>>>    __debug_object_init+0x2c/0x560
>>>    cblist_init_generic+0x147/0x2d0
>>>    rcu_init_tasks_generic+0x15/0x190
>>>    kernel_init_freeable+0x6e/0x3e0
>>>    ? rest_init+0x1e0/0x1e0
>>>    kernel_init+0x1b/0x1d0
>>>    ? rest_init+0x1e0/0x1e0
>>>    ret_from_fork+0x1f/0x30
>>>    </TASK>
>>>
>>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>>> or in the preemptible context of the PREEMPT_RT kernel, so
>>> the above warning is not a real issue, but it's better to
>>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>>> rid of such issue.
>> 
>> + CC some RT and RCU people
> 
> Thanks.
> 
>> 
>> AFAIK raw_spinlock is not just an annotation, but on RT it changes the
>> implementation from preemptible mutex to actual spin lock, so it would be
> 
> Yeah.
> 
>> rather unfortunate to do that for a spurious warning. Can it be somehow
>> fixed in a better way?
> 
> It's indeed unfortunate for the warning in the commit message. But
> functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
> in the critical section of raw_spinlock or in the hardirq context, which

Hmm, I thought they may not, actually.

> will cause problem in the PREEMPT_RT kernel. So I still think it is
> reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.

It wouldn't be the complete solution anyway. Once we allow even a GFP_ATOMIC
slab allocation for such context, it means also page allocation can happen
to refill the slabs, so lockdep will eventually complain about zone->lock,
and who knows what else.
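
Roughly, the refill path in question (approximate call chain; the locks
noted stay spinlock_t on PREEMPT_RT even if list_lock were converted):

kmem_cache_alloc(GFP_ATOMIC)
--> ___slab_alloc			/* takes n->list_lock */
    --> new_slab
        --> allocate_slab
            --> alloc_slab_page
                --> __alloc_pages
                    --> get_page_from_freelist
                        --> rmqueue	/* takes zone->lock */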

> In addition, there are many fix patches for this kind of warning in the
> git log, so I also think there should be a general and better solution. :)

Maybe, but given above, I doubt it's this one.

> 
>> 
> 


* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-11 14:19     ` Vlastimil Babka
@ 2023-04-11 14:25       ` Qi Zheng
  2023-04-12  5:51         ` Boqun Feng
  0 siblings, 1 reply; 23+ messages in thread
From: Qi Zheng @ 2023-04-11 14:25 UTC (permalink / raw)
  To: Vlastimil Babka, 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim,
	rientjes, penberg, cl
  Cc: linux-mm, linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney



On 2023/4/11 22:19, Vlastimil Babka wrote:
> On 4/11/23 16:08, Qi Zheng wrote:
>>
>>
>> On 2023/4/11 21:40, Vlastimil Babka wrote:
>>> On 4/11/23 15:08, Qi Zheng wrote:
>>>> The list_lock can be held in the critical section of
>>>> raw_spinlock, and then lockdep will complain about it
>>>> like below:
>>>>
>>>>    =============================
>>>>    [ BUG: Invalid wait context ]
>>>>    6.3.0-rc6-next-20230411 #7 Not tainted
>>>>    -----------------------------
>>>>    swapper/0/1 is trying to lock:
>>>>    ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
>>>>    other info that might help us debug this:
>>>>    context-{5:5}
>>>>    2 locks held by swapper/0/1:
>>>>     #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
>>>>     #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>>>    stack backtrace:
>>>>    CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
>>>>    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
>>>>    Call Trace:
>>>>     <TASK>
>>>>     dump_stack_lvl+0x77/0xc0
>>>>     __lock_acquire+0xa65/0x2950
>>>>     ? arch_stack_walk+0x65/0xf0
>>>>     ? arch_stack_walk+0x65/0xf0
>>>>     ? unwind_next_frame+0x602/0x8d0
>>>>     lock_acquire+0xe0/0x300
>>>>     ? ___slab_alloc+0x73d/0x1330
>>>>     ? find_usage_forwards+0x39/0x50
>>>>     ? check_irq_usage+0x162/0xa70
>>>>     ? __bfs+0x10c/0x2c0
>>>>     _raw_spin_lock_irqsave+0x4f/0x90
>>>>     ? ___slab_alloc+0x73d/0x1330
>>>>     ___slab_alloc+0x73d/0x1330
>>>>     ? fill_pool+0x16b/0x2a0
>>>>     ? look_up_lock_class+0x5d/0x160
>>>>     ? register_lock_class+0x48/0x500
>>>>     ? __lock_acquire+0xabc/0x2950
>>>>     ? fill_pool+0x16b/0x2a0
>>>>     kmem_cache_alloc+0x358/0x3b0
>>>>     ? __lock_acquire+0xabc/0x2950
>>>>     fill_pool+0x16b/0x2a0
>>>>     ? __debug_object_init+0x292/0x560
>>>>     ? lock_acquire+0xe0/0x300
>>>>     ? cblist_init_generic+0x232/0x2d0
>>>>     __debug_object_init+0x2c/0x560
>>>>     cblist_init_generic+0x147/0x2d0
>>>>     rcu_init_tasks_generic+0x15/0x190
>>>>     kernel_init_freeable+0x6e/0x3e0
>>>>     ? rest_init+0x1e0/0x1e0
>>>>     kernel_init+0x1b/0x1d0
>>>>     ? rest_init+0x1e0/0x1e0
>>>>     ret_from_fork+0x1f/0x30
>>>>     </TASK>
>>>>
>>>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>>>> or in the preemptible context of the PREEMPT_RT kernel, so
>>>> the above warning is not a real issue, but it's better to
>>>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>>>> rid of such issue.
>>>
>>> + CC some RT and RCU people
>>
>> Thanks.
>>
>>>
>>> AFAIK raw_spinlock is not just an annotation, but on RT it changes the
>>> implementation from preemptible mutex to actual spin lock, so it would be
>>
>> Yeah.
>>
>>> rather unfortunate to do that for a spurious warning. Can it be somehow
>>> fixed in a better way?
>>
>> It's indeed unfortunate for the warning in the commit message. But
>> functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
>> in the critical section of raw_spinlock or in the hardirq context, which
> 
> Hmm, I thought they may not, actually.
> 
>> will cause problem in the PREEMPT_RT kernel. So I still think it is
>> reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.
> 
> It wouldn't be the complete solution anyway. Once we allow even a GFP_ATOMIC
> slab allocation for such context, it means also page allocation can happen
> to refill the slabs, so lockdep will eventually complain about zone->lock,
> and who knows what else.

Oh, indeed. :(

> 
>> In addition, there are many fix patches for this kind of warning in the
>> git log, so I also think there should be a general and better solution. :)
> 
> Maybe, but given above, I doubt it's this one.
> 
>>
>>>
>>
> 

-- 
Thanks,
Qi

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-11 14:25       ` Qi Zheng
@ 2023-04-12  5:51         ` Boqun Feng
  2023-04-12  6:44           ` Zhang, Qiang1
  2023-04-12  6:45           ` Qi Zheng
  0 siblings, 2 replies; 23+ messages in thread
From: Boqun Feng @ 2023-04-12  5:51 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Vlastimil Babka, 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim,
	rientjes, penberg, cl, linux-mm, linux-kernel, Zhao Gongyi,
	Sebastian Andrzej Siewior, Thomas Gleixner, RCU,
	Paul E . McKenney

On Tue, Apr 11, 2023 at 10:25:06PM +0800, Qi Zheng wrote:
> 
> 
> On 2023/4/11 22:19, Vlastimil Babka wrote:
> > On 4/11/23 16:08, Qi Zheng wrote:
> > > 
> > > 
> > > On 2023/4/11 21:40, Vlastimil Babka wrote:
> > > > On 4/11/23 15:08, Qi Zheng wrote:
> > > > > The list_lock can be held in the critical section of
> > > > > raw_spinlock, and then lockdep will complain about it
> > > > > like below:
> > > > > 
> > > > >    =============================
> > > > >    [ BUG: Invalid wait context ]
> > > > >    6.3.0-rc6-next-20230411 #7 Not tainted
> > > > >    -----------------------------
> > > > >    swapper/0/1 is trying to lock:
> > > > >    ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
> > > > >    other info that might help us debug this:
> > > > >    context-{5:5}
> > > > >    2 locks held by swapper/0/1:
> > > > >     #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
> > > > >     #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
> > > > >    stack backtrace:
> > > > >    CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
> > > > >    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
> > > > >    Call Trace:
> > > > >     <TASK>
> > > > >     dump_stack_lvl+0x77/0xc0
> > > > >     __lock_acquire+0xa65/0x2950
> > > > >     ? arch_stack_walk+0x65/0xf0
> > > > >     ? arch_stack_walk+0x65/0xf0
> > > > >     ? unwind_next_frame+0x602/0x8d0
> > > > >     lock_acquire+0xe0/0x300
> > > > >     ? ___slab_alloc+0x73d/0x1330
> > > > >     ? find_usage_forwards+0x39/0x50
> > > > >     ? check_irq_usage+0x162/0xa70
> > > > >     ? __bfs+0x10c/0x2c0
> > > > >     _raw_spin_lock_irqsave+0x4f/0x90
> > > > >     ? ___slab_alloc+0x73d/0x1330
> > > > >     ___slab_alloc+0x73d/0x1330
> > > > >     ? fill_pool+0x16b/0x2a0
> > > > >     ? look_up_lock_class+0x5d/0x160
> > > > >     ? register_lock_class+0x48/0x500
> > > > >     ? __lock_acquire+0xabc/0x2950
> > > > >     ? fill_pool+0x16b/0x2a0
> > > > >     kmem_cache_alloc+0x358/0x3b0
> > > > >     ? __lock_acquire+0xabc/0x2950
> > > > >     fill_pool+0x16b/0x2a0
> > > > >     ? __debug_object_init+0x292/0x560
> > > > >     ? lock_acquire+0xe0/0x300
> > > > >     ? cblist_init_generic+0x232/0x2d0
> > > > >     __debug_object_init+0x2c/0x560

This "__debug_object_init" is because INIT_WORK() is called in
cblist_init_generic(), so..

> > > > >     cblist_init_generic+0x147/0x2d0
> > > > >     rcu_init_tasks_generic+0x15/0x190
> > > > >     kernel_init_freeable+0x6e/0x3e0
> > > > >     ? rest_init+0x1e0/0x1e0
> > > > >     kernel_init+0x1b/0x1d0
> > > > >     ? rest_init+0x1e0/0x1e0
> > > > >     ret_from_fork+0x1f/0x30
> > > > >     </TASK>
> > > > > 
> > > > > The fill_pool() can only be called in the !PREEMPT_RT kernel
> > > > > or in the preemptible context of the PREEMPT_RT kernel, so
> > > > > the above warning is not a real issue, but it's better to
> > > > > annotate kmem_cache_node->list_lock as raw_spinlock to get
> > > > > rid of such issue.
> > > > 
> > > > + CC some RT and RCU people
> > > 
> > > Thanks.
> > > 
> > > > 
> > > > AFAIK raw_spinlock is not just an annotation, but on RT it changes the
> > > > implementation from preemptible mutex to actual spin lock, so it would be
> > > 
> > > Yeah.
> > > 
> > > > rather unfortunate to do that for a spurious warning. Can it be somehow
> > > > fixed in a better way?

... probably a better fix is to drop locks and call INIT_WORK(), or make
the cblist_init_generic() lockless (or part lockless), given it's just
initializing the cblist, it's probably doable. But I haven't taken a
careful look yet.
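
A rough sketch of that first option (field names follow kernel/rcu/tasks.h
of that time, other details elided): do the INIT_WORK() calls before taking
the raw locks, so any debugobjects pool refill still runs in preemptible
context.

static void cblist_init_generic(struct rcu_tasks *rtp)
{
	int cpu;
	unsigned long flags;

	/*
	 * Pass 1: no locks held yet. INIT_WORK() can reach debugobjects
	 * (and thus the slab allocator) via lockdep's lazy initialization,
	 * so do it while still preemptible.
	 */
	for_each_possible_cpu(cpu) {
		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);

		INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
	}

	/* Pass 2: the existing locked initialization, minus INIT_WORK(). */
	raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
	/* ... per-CPU cblist setup as before ... */
	raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
}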

Regards,
Boqun

> > > 
> > > It's indeed unfortunate for the warning in the commit message. But
> > > functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
> > > in the critical section of raw_spinlock or in the hardirq context, which
> > 
> > Hmm, I thought they may not, actually.
> > 
> > > will cause problem in the PREEMPT_RT kernel. So I still think it is
> > > reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.
> > 
> > It wouldn't be the complete solution anyway. Once we allow even a GFP_ATOMIC
> > slab allocation for such context, it means also page allocation can happen
> > to refill the slabs, so lockdep will eventually complain about zone->lock,
> > and who knows what else.
> 
> Oh, indeed. :(
> 
> > 
> > > In addition, there are many fix patches for this kind of warning in the
> > > git log, so I also think there should be a general and better solution. :)
> > 
> > Maybe, but given above, I doubt it's this one.
> > 
> > > 
> > > > 
> > > 
> > 
> 
> -- 
> Thanks,
> Qi

* RE: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  5:51         ` Boqun Feng
@ 2023-04-12  6:44           ` Zhang, Qiang1
  2023-04-12  6:50             ` Vlastimil Babka
  2023-04-12  6:57             ` Qi Zheng
  2023-04-12  6:45           ` Qi Zheng
  1 sibling, 2 replies; 23+ messages in thread
From: Zhang, Qiang1 @ 2023-04-12  6:44 UTC (permalink / raw)
  To: Boqun Feng, Qi Zheng
  Cc: Vlastimil Babka, 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim,
	rientjes, penberg, cl, linux-mm, linux-kernel, Zhao Gongyi,
	Sebastian Andrzej Siewior, Thomas Gleixner, RCU,
	Paul E . McKenney

> 
> 
> On 2023/4/11 22:19, Vlastimil Babka wrote:
> > On 4/11/23 16:08, Qi Zheng wrote:
> > > 
> > > 
> > > On 2023/4/11 21:40, Vlastimil Babka wrote:
> > > > On 4/11/23 15:08, Qi Zheng wrote:
> > > > > The list_lock can be held in the critical section of
> > > > > raw_spinlock, and then lockdep will complain about it
> > > > > like below:
> > > > > 
> > > > >    =============================
> > > > >    [ BUG: Invalid wait context ]
> > > > >    6.3.0-rc6-next-20230411 #7 Not tainted
> > > > >    -----------------------------
> > > > >    swapper/0/1 is trying to lock:
> > > > >    ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
> > > > >    other info that might help us debug this:
> > > > >    context-{5:5}
> > > > >    2 locks held by swapper/0/1:
> > > > >     #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
> > > > >     #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
> > > > >    stack backtrace:
> > > > >    CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
> > > > >    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
> > > > >    Call Trace:
> > > > >     <TASK>
> > > > >     dump_stack_lvl+0x77/0xc0
> > > > >     __lock_acquire+0xa65/0x2950
> > > > >     ? arch_stack_walk+0x65/0xf0
> > > > >     ? arch_stack_walk+0x65/0xf0
> > > > >     ? unwind_next_frame+0x602/0x8d0
> > > > >     lock_acquire+0xe0/0x300
> > > > >     ? ___slab_alloc+0x73d/0x1330
> > > > >     ? find_usage_forwards+0x39/0x50
> > > > >     ? check_irq_usage+0x162/0xa70
> > > > >     ? __bfs+0x10c/0x2c0
> > > > >     _raw_spin_lock_irqsave+0x4f/0x90
> > > > >     ? ___slab_alloc+0x73d/0x1330
> > > > >     ___slab_alloc+0x73d/0x1330
> > > > >     ? fill_pool+0x16b/0x2a0
> > > > >     ? look_up_lock_class+0x5d/0x160
> > > > >     ? register_lock_class+0x48/0x500
> > > > >     ? __lock_acquire+0xabc/0x2950
> > > > >     ? fill_pool+0x16b/0x2a0
> > > > >     kmem_cache_alloc+0x358/0x3b0
> > > > >     ? __lock_acquire+0xabc/0x2950
> > > > >     fill_pool+0x16b/0x2a0
> > > > >     ? __debug_object_init+0x292/0x560
> > > > >     ? lock_acquire+0xe0/0x300
> > > > >     ? cblist_init_generic+0x232/0x2d0
> > > > >     __debug_object_init+0x2c/0x560
>
>This "__debug_object_init" is because INIT_WORK() is called in
>cblist_init_generic(), so..
>
> > > > >     cblist_init_generic+0x147/0x2d0
> > > > >     rcu_init_tasks_generic+0x15/0x190
> > > > >     kernel_init_freeable+0x6e/0x3e0
> > > > >     ? rest_init+0x1e0/0x1e0
> > > > >     kernel_init+0x1b/0x1d0
> > > > >     ? rest_init+0x1e0/0x1e0
> > > > >     ret_from_fork+0x1f/0x30
> > > > >     </TASK>
> > > > > 
> > > > > The fill_pool() can only be called in the !PREEMPT_RT kernel
> > > > > or in the preemptible context of the PREEMPT_RT kernel, so
> > > > > the above warning is not a real issue, but it's better to
> > > > > annotate kmem_cache_node->list_lock as raw_spinlock to get
> > > > > rid of such issue.
> > > > 
> > > > + CC some RT and RCU people
> > > 
> > > Thanks.
> > > 
> > > > 
> > > > AFAIK raw_spinlock is not just an annotation, but on RT it changes the
> > > > implementation from preemptible mutex to actual spin lock, so it would be
> > > 
> > > Yeah.
> > > 
> > > > rather unfortunate to do that for a spurious warning. Can it be somehow
> > > > fixed in a better way?
>
>... probably a better fix is to drop locks and call INIT_WORK(), or make
>the cblist_init_generic() lockless (or part lockless), given it's just
>initializing the cblist, it's probably doable. But I haven't taken a
>careful look yet.
>


This is just one of the paths that triggers an invalid wait context warning; the following path can also trigger it:

[  129.914547] [ BUG: Invalid wait context ]
[  129.914775] 6.3.0-rc1-yocto-standard+ #2 Not tainted
[  129.915044] -----------------------------
[  129.915272] kworker/2:0/28 is trying to lock:
[  129.915516] ffff88815660f570 (&c->lock){-.-.}-{3:3}, at: ___slab_alloc+0x68/0x12e0
[  129.915967] other info that might help us debug this:
[  129.916241] context-{5:5}
[  129.916392] 3 locks held by kworker/2:0/28:
[  129.916642]  #0: ffff888100084d48 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x515/0xba0
[  129.917145]  #1: ffff888100c17dd0 ((work_completion)(&(&krcp->monitor_work)->work)){+.+.}-{0:0}, at: process_on0
[  129.917758]  #2: ffff8881565f8508 (krc.lock){....}-{2:2}, at: kfree_rcu_monitor+0x29f/0x810
[  129.918207] stack backtrace:
[  129.918374] CPU: 2 PID: 28 Comm: kworker/2:0 Not tainted 6.3.0-rc1-yocto-standard+ #2
[  129.918784] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.o4
[  129.919397] Workqueue: events kfree_rcu_monitor
[  129.919662] Call Trace:
[  129.919812]  <TASK>
[  129.919941]  dump_stack_lvl+0x64/0xb0
[  129.920171]  dump_stack+0x10/0x20
[  129.920372]  __lock_acquire+0xeb8/0x3a80
[  129.920603]  ? ret_from_fork+0x2c/0x50
[  129.920824]  ? __pfx___lock_acquire+0x10/0x10
[  129.921068]  ? unwind_next_frame.part.0+0x1ba/0x3c0
[  129.921343]  ? ret_from_fork+0x2c/0x50
[  129.921573]  ? __this_cpu_preempt_check+0x13/0x20
[  129.921847]  lock_acquire+0x194/0x480
[  129.922060]  ? ___slab_alloc+0x68/0x12e0
[  129.922293]  ? __pfx_lock_acquire+0x10/0x10
[  129.922529]  ? __pfx_mark_lock.part.0+0x10/0x10
[  129.922778]  ? __kasan_check_read+0x11/0x20
[  129.922998]  ___slab_alloc+0x9a/0x12e0
[  129.923222]  ? ___slab_alloc+0x68/0x12e0
[  129.923452]  ? __pfx_mark_lock.part.0+0x10/0x10
[  129.923706]  ? __kasan_check_read+0x11/0x20
[  129.923937]  ? fill_pool+0x22a/0x370
[  129.924161]  ? __lock_acquire+0xf5b/0x3a80
[  129.924387]  ? fill_pool+0x22a/0x370
[  129.924590]  __slab_alloc.constprop.0+0x5b/0x90
[  129.924832]  kmem_cache_alloc+0x296/0x3d0
[  129.925073]  ? fill_pool+0x22a/0x370
[  129.925291]  fill_pool+0x22a/0x370
[  129.925495]  ? __pfx_fill_pool+0x10/0x10
[  129.925718]  ? __pfx___lock_acquire+0x10/0x10
[  129.926034]  ? __kasan_check_read+0x11/0x20
[  129.926269]  ? check_chain_key+0x200/0x2b0
[  129.926503]  __debug_object_init+0x82/0x8c0
[  129.926734]  ? __pfx_lock_release+0x10/0x10
[  129.926984]  ? __pfx___debug_object_init+0x10/0x10
[  129.927249]  ? __kasan_check_read+0x11/0x20
[  129.927498]  ? do_raw_spin_unlock+0x9c/0x100
[  129.927758]  debug_object_activate+0x2d1/0x2f0
[  129.928022]  ? __pfx_debug_object_activate+0x10/0x10
[  129.928300]  ? __this_cpu_preempt_check+0x13/0x20
[  129.928583]  __call_rcu_common.constprop.0+0x94/0xeb0
[  129.928897]  ? __this_cpu_preempt_check+0x13/0x20
[  129.929186]  ? __pfx_rcu_work_rcufn+0x10/0x10
[  129.929459]  ? __pfx___call_rcu_common.constprop.0+0x10/0x10
[  129.929803]  ? __pfx_lock_acquired+0x10/0x10
[  129.930067]  ? __pfx_do_raw_spin_trylock+0x10/0x10
[  129.930363]  ? kfree_rcu_monitor+0x29f/0x810
[  129.930627]  call_rcu+0xe/0x20
[  129.930821]  queue_rcu_work+0x4f/0x60
[  129.931050]  kfree_rcu_monitor+0x5d3/0x810
[  129.931302]  ? __pfx_kfree_rcu_monitor+0x10/0x10
[  129.931587]  ? __this_cpu_preempt_check+0x13/0x20
[  129.931878]  process_one_work+0x607/0xba0
[  129.932129]  ? __pfx_process_one_work+0x10/0x10
[  129.932408]  ? worker_thread+0xd6/0x710
[  129.932653]  worker_thread+0x2d4/0x710
[  129.932888]  ? __pfx_worker_thread+0x10/0x10
[  129.933154]  kthread+0x18b/0x1c0
[  129.933363]  ? __pfx_kthread+0x10/0x10
[  129.933598]  ret_from_fork+0x2c/0x50
[  129.933825]  </TASK>

Maybe no need to convert ->list_lock to raw_spinlock.

--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
        unsigned long flags;

        /*
-        * On RT enabled kernels the pool refill must happen in preemptible
+        * The pool refill must happen in preemptible
         * context:
         */
-       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
+       if (preemptible())
                fill_pool();

        db = get_bucket((unsigned long) addr);



Thanks
Zqiang

>
>
>Regards,
>Boqun
>
> > > 
> > > It's indeed unfortunate for the warning in the commit message. But
> > > functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
> > > in the critical section of raw_spinlock or in the hardirq context, which
> > 
> > Hmm, I thought they may not, actually.
> > 
> > > will cause problem in the PREEMPT_RT kernel. So I still think it is
> > > reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.
> > 
> > It wouldn't be the complete solution anyway. Once we allow even a GFP_ATOMIC
> > slab allocation for such context, it means also page allocation can happen
> > to refill the slabs, so lockdep will eventually complain about zone->lock,
> > and who knows what else.
> 
> Oh, indeed. :(
> 
> > 
> > > In addition, there are many fix patches for this kind of warning in the
> > > git log, so I also think there should be a general and better solution. :)
> > 
> > Maybe, but given above, I doubt it's this one.
> > 
> > > 
> > > > 
> > > 
> > 
> 
> -- 
> Thanks,
> Qi

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  5:51         ` Boqun Feng
  2023-04-12  6:44           ` Zhang, Qiang1
@ 2023-04-12  6:45           ` Qi Zheng
  1 sibling, 0 replies; 23+ messages in thread
From: Qi Zheng @ 2023-04-12  6:45 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Vlastimil Babka, 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim,
	rientjes, penberg, cl, linux-mm, linux-kernel, Zhao Gongyi,
	Sebastian Andrzej Siewior, Thomas Gleixner, RCU,
	Paul E . McKenney



On 2023/4/12 13:51, Boqun Feng wrote:
> On Tue, Apr 11, 2023 at 10:25:06PM +0800, Qi Zheng wrote:
>>
>>
>> On 2023/4/11 22:19, Vlastimil Babka wrote:
>>> On 4/11/23 16:08, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2023/4/11 21:40, Vlastimil Babka wrote:
>>>>> On 4/11/23 15:08, Qi Zheng wrote:
>>>>>> The list_lock can be held in the critical section of
>>>>>> raw_spinlock, and then lockdep will complain about it
>>>>>> like below:
>>>>>>
>>>>>>     =============================
>>>>>>     [ BUG: Invalid wait context ]
>>>>>>     6.3.0-rc6-next-20230411 #7 Not tainted
>>>>>>     -----------------------------
>>>>>>     swapper/0/1 is trying to lock:
>>>>>>     ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
>>>>>>     other info that might help us debug this:
>>>>>>     context-{5:5}
>>>>>>     2 locks held by swapper/0/1:
>>>>>>      #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
>>>>>>      #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>>>>>     stack backtrace:
>>>>>>     CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
>>>>>>     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
>>>>>>     Call Trace:
>>>>>>      <TASK>
>>>>>>      dump_stack_lvl+0x77/0xc0
>>>>>>      __lock_acquire+0xa65/0x2950
>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>      ? unwind_next_frame+0x602/0x8d0
>>>>>>      lock_acquire+0xe0/0x300
>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>      ? find_usage_forwards+0x39/0x50
>>>>>>      ? check_irq_usage+0x162/0xa70
>>>>>>      ? __bfs+0x10c/0x2c0
>>>>>>      _raw_spin_lock_irqsave+0x4f/0x90
>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>      ___slab_alloc+0x73d/0x1330
>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>      ? look_up_lock_class+0x5d/0x160
>>>>>>      ? register_lock_class+0x48/0x500
>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>      kmem_cache_alloc+0x358/0x3b0
>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>      fill_pool+0x16b/0x2a0
>>>>>>      ? __debug_object_init+0x292/0x560
>>>>>>      ? lock_acquire+0xe0/0x300
>>>>>>      ? cblist_init_generic+0x232/0x2d0
>>>>>>      __debug_object_init+0x2c/0x560
> 
> This "__debug_object_init" is because INIT_WORK() is called in
> cblist_init_generic(), so..

Yes, a more precise call stack is as follows:

cblist_init_generic
--> INIT_WORK
     --> lockdep_init_map
         --> lockdep_init_map_type
             --> register_lock_class
                 --> init_data_structures_once
                     --> init_rcu_head
                         --> debug_object_init
                             --> __debug_object_init

> 
>>>>>>      cblist_init_generic+0x147/0x2d0
>>>>>>      rcu_init_tasks_generic+0x15/0x190
>>>>>>      kernel_init_freeable+0x6e/0x3e0
>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>      kernel_init+0x1b/0x1d0
>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>      ret_from_fork+0x1f/0x30
>>>>>>      </TASK>
>>>>>>
>>>>>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>>>>>> or in the preemptible context of the PREEMPT_RT kernel, so
>>>>>> the above warning is not a real issue, but it's better to
>>>>>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>>>>>> rid of such issue.
>>>>>
>>>>> + CC some RT and RCU people
>>>>
>>>> Thanks.
>>>>
>>>>>
>>>>> AFAIK raw_spinlock is not just an annotation, but on RT it changes the
>>>>> implementation from preemptible mutex to actual spin lock, so it would be
>>>>
>>>> Yeah.
>>>>
>>>>> rather unfortunate to do that for a spurious warning. Can it be somehow
>>>>> fixed in a better way?
> 
> ... probably a better fix is to drop locks and call INIT_WORK(), or make
> the cblist_init_generic() lockless (or part lockless), given it's just
> initializing the cblist, it's probably doable. But I haven't taken a
> careful look yet.

This might be a doable solution for this warning, but I also saw other
stacks like the following on v5.15:

[   30.349171] Call Trace:
[   30.349171]  <TASK>
[   30.349171]  dump_stack_lvl+0x69/0x97
[   30.349171]  __lock_acquire+0x4a0/0x1aa0
[   30.349171]  lock_acquire+0x275/0x2e0
[   30.349171]  _raw_spin_lock_irqsave+0x4c/0x90
[   30.349171]  ___slab_alloc.constprop.95+0x3ea/0xa80
[   30.349171]  __slab_alloc.isra.89.constprop.94+0x1c/0x30
[   30.349171]  kmem_cache_alloc+0x2bd/0x320
[   30.349171]  fill_pool+0x1b2/0x2d0
[   30.349171]  __debug_object_init+0x2c/0x500
[   30.349171]  debug_object_activate+0x136/0x200
[   30.349171]  add_timer+0x10b/0x170
[   30.349171]  queue_delayed_work_on+0x63/0xa0
[   30.349171]  init_mm_internals+0x226/0x2b0
[   30.349171]  kernel_init_freeable+0x82/0x24e
[   30.349171]  kernel_init+0x17/0x140
[   30.349171]  ret_from_fork+0x1f/0x30
[   30.349171]  </TASK>

So I'm a bit confused about whether to fix individual cases or whether there
should be a general solution.

Thanks,
Qi

> 
> Regards,
> Boqun
> 
>>>>
>>>> It's indeed unfortunate for the warning in the commit message. But
>>>> functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
>>>> in the critical section of raw_spinlock or in the hardirq context, which
>>>
>>> Hmm, I thought they may not, actually.
>>>
>>>> will cause problem in the PREEMPT_RT kernel. So I still think it is
>>>> reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.
>>>
>>> It wouldn't be the complete solution anyway. Once we allow even a GFP_ATOMIC
>>> slab allocation for such context, it means also page allocation can happen
>>> to refill the slabs, so lockdep will eventually complain about zone->lock,
>>> and who knows what else.
>>
>> Oh, indeed. :(
>>
>>>
>>>> In addition, there are many fix patches for this kind of warning in the
>>>> git log, so I also think there should be a general and better solution. :)
>>>
>>> Maybe, but given above, I doubt it's this one.
>>>
>>>>
>>>>>
>>>>
>>>
>>
>> -- 
>> Thanks,
>> Qi

-- 
Thanks,
Qi

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  6:44           ` Zhang, Qiang1
@ 2023-04-12  6:50             ` Vlastimil Babka
  2023-04-12  7:30               ` Qi Zheng
  2023-04-12 12:47               ` Peter Zijlstra
  2023-04-12  6:57             ` Qi Zheng
  1 sibling, 2 replies; 23+ messages in thread
From: Vlastimil Babka @ 2023-04-12  6:50 UTC (permalink / raw)
  To: Zhang, Qiang1, Boqun Feng, Qi Zheng
  Cc: 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim, rientjes,
	penberg, cl, linux-mm, linux-kernel, Zhao Gongyi,
	Sebastian Andrzej Siewior, Thomas Gleixner, RCU,
	Paul E . McKenney, Peter Zijlstra



On 4/12/23 08:44, Zhang, Qiang1 wrote:
>>
>>
>> On 2023/4/11 22:19, Vlastimil Babka wrote:
>>> On 4/11/23 16:08, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2023/4/11 21:40, Vlastimil Babka wrote:
>>>>> On 4/11/23 15:08, Qi Zheng wrote:
>>>>>> The list_lock can be held in the critical section of
>>>>>> raw_spinlock, and then lockdep will complain about it
>>>>>> like below:
>>>>>>
>>>>>>    =============================
>>>>>>    [ BUG: Invalid wait context ]
>>>>>>    6.3.0-rc6-next-20230411 #7 Not tainted
>>>>>>    -----------------------------
>>>>>>    swapper/0/1 is trying to lock:
>>>>>>    ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
>>>>>>    other info that might help us debug this:
>>>>>>    context-{5:5}
>>>>>>    2 locks held by swapper/0/1:
>>>>>>     #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
>>>>>>     #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>>>>>    stack backtrace:
>>>>>>    CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
>>>>>>    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
>>>>>>    Call Trace:
>>>>>>     <TASK>
>>>>>>     dump_stack_lvl+0x77/0xc0
>>>>>>     __lock_acquire+0xa65/0x2950
>>>>>>     ? arch_stack_walk+0x65/0xf0
>>>>>>     ? arch_stack_walk+0x65/0xf0
>>>>>>     ? unwind_next_frame+0x602/0x8d0
>>>>>>     lock_acquire+0xe0/0x300
>>>>>>     ? ___slab_alloc+0x73d/0x1330
>>>>>>     ? find_usage_forwards+0x39/0x50
>>>>>>     ? check_irq_usage+0x162/0xa70
>>>>>>     ? __bfs+0x10c/0x2c0
>>>>>>     _raw_spin_lock_irqsave+0x4f/0x90
>>>>>>     ? ___slab_alloc+0x73d/0x1330
>>>>>>     ___slab_alloc+0x73d/0x1330
>>>>>>     ? fill_pool+0x16b/0x2a0
>>>>>>     ? look_up_lock_class+0x5d/0x160
>>>>>>     ? register_lock_class+0x48/0x500
>>>>>>     ? __lock_acquire+0xabc/0x2950
>>>>>>     ? fill_pool+0x16b/0x2a0
>>>>>>     kmem_cache_alloc+0x358/0x3b0
>>>>>>     ? __lock_acquire+0xabc/0x2950
>>>>>>     fill_pool+0x16b/0x2a0
>>>>>>     ? __debug_object_init+0x292/0x560
>>>>>>     ? lock_acquire+0xe0/0x300
>>>>>>     ? cblist_init_generic+0x232/0x2d0
>>>>>>     __debug_object_init+0x2c/0x560
>>
>> This "__debug_object_init" is because INIT_WORK() is called in
>> cblist_init_generic(), so..
>>
>>>>>>     cblist_init_generic+0x147/0x2d0
>>>>>>     rcu_init_tasks_generic+0x15/0x190
>>>>>>     kernel_init_freeable+0x6e/0x3e0
>>>>>>     ? rest_init+0x1e0/0x1e0
>>>>>>     kernel_init+0x1b/0x1d0
>>>>>>     ? rest_init+0x1e0/0x1e0
>>>>>>     ret_from_fork+0x1f/0x30
>>>>>>     </TASK>
>>>>>>
>>>>>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>>>>>> or in the preemptible context of the PREEMPT_RT kernel, so
>>>>>> the above warning is not a real issue, but it's better to
>>>>>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>>>>>> rid of such issue.
>>>>>
>>>>> + CC some RT and RCU people
>>>>
>>>> Thanks.
>>>>
>>>>>
>>>>> AFAIK raw_spinlock is not just an annotation, but on RT it changes the
>>>>> implementation from preemptible mutex to actual spin lock, so it would be
>>>>
>>>> Yeah.
>>>>
>>>>> rather unfortunate to do that for a spurious warning. Can it be somehow
>>>>> fixed in a better way?
>>
>> ... probably a better fix is to drop locks and call INIT_WORK(), or make
>> the cblist_init_generic() lockless (or part lockless), given it's just
>> initializing the cblist, it's probably doable. But I haven't taken a
>> careful look yet.
>>
> 
> 
> This is just one of the paths that triggers an invalid wait context warning; the following path can also trigger it:
> 
> [  129.914547] [ BUG: Invalid wait context ]
> [  129.914775] 6.3.0-rc1-yocto-standard+ #2 Not tainted
> [  129.915044] -----------------------------
> [  129.915272] kworker/2:0/28 is trying to lock:
> [  129.915516] ffff88815660f570 (&c->lock){-.-.}-{3:3}, at: ___slab_alloc+0x68/0x12e0
> [  129.915967] other info that might help us debug this:
> [  129.916241] context-{5:5}
> [  129.916392] 3 locks held by kworker/2:0/28:
> [  129.916642]  #0: ffff888100084d48 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x515/0xba0
> [  129.917145]  #1: ffff888100c17dd0 ((work_completion)(&(&krcp->monitor_work)->work)){+.+.}-{0:0}, at: process_on0
> [  129.917758]  #2: ffff8881565f8508 (krc.lock){....}-{2:2}, at: kfree_rcu_monitor+0x29f/0x810
> [  129.918207] stack backtrace:
> [  129.918374] CPU: 2 PID: 28 Comm: kworker/2:0 Not tainted 6.3.0-rc1-yocto-standard+ #2
> [  129.918784] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.o4
> [  129.919397] Workqueue: events kfree_rcu_monitor
> [  129.919662] Call Trace:
> [  129.919812]  <TASK>
> [  129.919941]  dump_stack_lvl+0x64/0xb0
> [  129.920171]  dump_stack+0x10/0x20
> [  129.920372]  __lock_acquire+0xeb8/0x3a80
> [  129.920603]  ? ret_from_fork+0x2c/0x50
> [  129.920824]  ? __pfx___lock_acquire+0x10/0x10
> [  129.921068]  ? unwind_next_frame.part.0+0x1ba/0x3c0
> [  129.921343]  ? ret_from_fork+0x2c/0x50
> [  129.921573]  ? __this_cpu_preempt_check+0x13/0x20
> [  129.921847]  lock_acquire+0x194/0x480
> [  129.922060]  ? ___slab_alloc+0x68/0x12e0
> [  129.922293]  ? __pfx_lock_acquire+0x10/0x10
> [  129.922529]  ? __pfx_mark_lock.part.0+0x10/0x10
> [  129.922778]  ? __kasan_check_read+0x11/0x20
> [  129.922998]  ___slab_alloc+0x9a/0x12e0
> [  129.923222]  ? ___slab_alloc+0x68/0x12e0
> [  129.923452]  ? __pfx_mark_lock.part.0+0x10/0x10
> [  129.923706]  ? __kasan_check_read+0x11/0x20
> [  129.923937]  ? fill_pool+0x22a/0x370
> [  129.924161]  ? __lock_acquire+0xf5b/0x3a80
> [  129.924387]  ? fill_pool+0x22a/0x370
> [  129.924590]  __slab_alloc.constprop.0+0x5b/0x90
> [  129.924832]  kmem_cache_alloc+0x296/0x3d0
> [  129.925073]  ? fill_pool+0x22a/0x370
> [  129.925291]  fill_pool+0x22a/0x370
> [  129.925495]  ? __pfx_fill_pool+0x10/0x10
> [  129.925718]  ? __pfx___lock_acquire+0x10/0x10
> [  129.926034]  ? __kasan_check_read+0x11/0x20
> [  129.926269]  ? check_chain_key+0x200/0x2b0
> [  129.926503]  __debug_object_init+0x82/0x8c0
> [  129.926734]  ? __pfx_lock_release+0x10/0x10
> [  129.926984]  ? __pfx___debug_object_init+0x10/0x10
> [  129.927249]  ? __kasan_check_read+0x11/0x20
> [  129.927498]  ? do_raw_spin_unlock+0x9c/0x100
> [  129.927758]  debug_object_activate+0x2d1/0x2f0
> [  129.928022]  ? __pfx_debug_object_activate+0x10/0x10
> [  129.928300]  ? __this_cpu_preempt_check+0x13/0x20
> [  129.928583]  __call_rcu_common.constprop.0+0x94/0xeb0
> [  129.928897]  ? __this_cpu_preempt_check+0x13/0x20
> [  129.929186]  ? __pfx_rcu_work_rcufn+0x10/0x10
> [  129.929459]  ? __pfx___call_rcu_common.constprop.0+0x10/0x10
> [  129.929803]  ? __pfx_lock_acquired+0x10/0x10
> [  129.930067]  ? __pfx_do_raw_spin_trylock+0x10/0x10
> [  129.930363]  ? kfree_rcu_monitor+0x29f/0x810
> [  129.930627]  call_rcu+0xe/0x20
> [  129.930821]  queue_rcu_work+0x4f/0x60
> [  129.931050]  kfree_rcu_monitor+0x5d3/0x810
> [  129.931302]  ? __pfx_kfree_rcu_monitor+0x10/0x10
> [  129.931587]  ? __this_cpu_preempt_check+0x13/0x20
> [  129.931878]  process_one_work+0x607/0xba0
> [  129.932129]  ? __pfx_process_one_work+0x10/0x10
> [  129.932408]  ? worker_thread+0xd6/0x710
> [  129.932653]  worker_thread+0x2d4/0x710
> [  129.932888]  ? __pfx_worker_thread+0x10/0x10
> [  129.933154]  kthread+0x18b/0x1c0
> [  129.933363]  ? __pfx_kthread+0x10/0x10
> [  129.933598]  ret_from_fork+0x2c/0x50
> [  129.933825]  </TASK>
> 
> Maybe no need to convert ->list_lock to raw_spinlock.
> 
> --- a/lib/debugobjects.c
> +++ b/lib/debugobjects.c
> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>         unsigned long flags;
> 
>         /*
> -        * On RT enabled kernels the pool refill must happen in preemptible
> +        * The pool refill must happen in preemptible
>          * context:
>          */
> -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
> +       if (preemptible())
>                 fill_pool();

+CC Peterz

Aha so this is in fact another case where the code is written with
actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?

>         db = get_bucket((unsigned long) addr);
> 
> 
> 
> Thanks
> Zqiang
> 
>>
>>
>> Regards,
>> Boqun
>>
>>>>
>>>> It's indeed unfortunate for the warning in the commit message. But
>>>> functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
>>>> in the critical section of raw_spinlock or in the hardirq context, which
>>>
>>> Hmm, I thought they may not, actually.
>>>
>>>> will cause problem in the PREEMPT_RT kernel. So I still think it is
>>>> reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.
>>>
>>> It wouldn't be the complete solution anyway. Once we allow even a GFP_ATOMIC
>>> slab allocation for such context, it means also page allocation can happen
>>> to refill the slabs, so lockdep will eventually complain about zone->lock,
>>> and who knows what else.
>>
>> Oh, indeed. :(
>>
>>>
>>>> In addition, there are many fix patches for this kind of warning in the
>>>> git log, so I also think there should be a general and better solution. :)
>>>
>>> Maybe, but given above, I doubt it's this one.
>>>
>>>>
>>>>>
>>>>
>>>
>>
>> -- 
>> Thanks,
>> Qi

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  6:44           ` Zhang, Qiang1
  2023-04-12  6:50             ` Vlastimil Babka
@ 2023-04-12  6:57             ` Qi Zheng
  2023-04-13  0:24               ` Joel Fernandes
  1 sibling, 1 reply; 23+ messages in thread
From: Qi Zheng @ 2023-04-12  6:57 UTC (permalink / raw)
  To: Zhang, Qiang1, Boqun Feng
  Cc: Vlastimil Babka, 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim,
	rientjes, penberg, cl, linux-mm, linux-kernel, Zhao Gongyi,
	Sebastian Andrzej Siewior, Thomas Gleixner, RCU,
	Paul E . McKenney



On 2023/4/12 14:44, Zhang, Qiang1 wrote:
>>
>>
>> On 2023/4/11 22:19, Vlastimil Babka wrote:
>>> On 4/11/23 16:08, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2023/4/11 21:40, Vlastimil Babka wrote:
>>>>> On 4/11/23 15:08, Qi Zheng wrote:
>>>>>> The list_lock can be held in the critical section of
>>>>>> raw_spinlock, and then lockdep will complain about it
>>>>>> like below:
>>>>>>
>>>>>>     =============================
>>>>>>     [ BUG: Invalid wait context ]
>>>>>>     6.3.0-rc6-next-20230411 #7 Not tainted
>>>>>>     -----------------------------
>>>>>>     swapper/0/1 is trying to lock:
>>>>>>     ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
>>>>>>     other info that might help us debug this:
>>>>>>     context-{5:5}
>>>>>>     2 locks held by swapper/0/1:
>>>>>>      #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
>>>>>>      #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>>>>>     stack backtrace:
>>>>>>     CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
>>>>>>     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
>>>>>>     Call Trace:
>>>>>>      <TASK>
>>>>>>      dump_stack_lvl+0x77/0xc0
>>>>>>      __lock_acquire+0xa65/0x2950
>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>      ? unwind_next_frame+0x602/0x8d0
>>>>>>      lock_acquire+0xe0/0x300
>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>      ? find_usage_forwards+0x39/0x50
>>>>>>      ? check_irq_usage+0x162/0xa70
>>>>>>      ? __bfs+0x10c/0x2c0
>>>>>>      _raw_spin_lock_irqsave+0x4f/0x90
>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>      ___slab_alloc+0x73d/0x1330
>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>      ? look_up_lock_class+0x5d/0x160
>>>>>>      ? register_lock_class+0x48/0x500
>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>      kmem_cache_alloc+0x358/0x3b0
>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>      fill_pool+0x16b/0x2a0
>>>>>>      ? __debug_object_init+0x292/0x560
>>>>>>      ? lock_acquire+0xe0/0x300
>>>>>>      ? cblist_init_generic+0x232/0x2d0
>>>>>>      __debug_object_init+0x2c/0x560
>>
>> This "__debug_object_init" is because INIT_WORK() is called in
>> cblist_init_generic(), so..
>>
>>>>>>      cblist_init_generic+0x147/0x2d0
>>>>>>      rcu_init_tasks_generic+0x15/0x190
>>>>>>      kernel_init_freeable+0x6e/0x3e0
>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>      kernel_init+0x1b/0x1d0
>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>      ret_from_fork+0x1f/0x30
>>>>>>      </TASK>
>>>>>>
>>>>>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>>>>>> or in the preemptible context of the PREEMPT_RT kernel, so
>>>>>> the above warning is not a real issue, but it's better to
>>>>>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>>>>>> rid of such issue.
>>>>>
>>>>> + CC some RT and RCU people
>>>>
>>>> Thanks.
>>>>
>>>>>
>>>>> AFAIK raw_spinlock is not just an annotation, but on RT it changes the
>>>>> implementation from preemptible mutex to actual spin lock, so it would be
>>>>
>>>> Yeah.
>>>>
>>>>> rather unfortunate to do that for a spurious warning. Can it be somehow
>>>>> fixed in a better way?
>>
>> ... probably a better fix is to drop locks and call INIT_WORK(), or make
>> the cblist_init_generic() lockless (or part lockless), given it's just
>> initializing the cblist, it's probably doable. But I haven't taken a
>> careful look yet.
>>
> 
> 
> This is just one of the paths that triggers an invalid wait,  the following paths can also trigger:
> 
> [  129.914547] [ BUG: Invalid wait context ]
> [  129.914775] 6.3.0-rc1-yocto-standard+ #2 Not tainted
> [  129.915044] -----------------------------
> [  129.915272] kworker/2:0/28 is trying to lock:
> [  129.915516] ffff88815660f570 (&c->lock){-.-.}-{3:3}, at: ___slab_alloc+0x68/0x12e0
> [  129.915967] other info that might help us debug this:
> [  129.916241] context-{5:5}
> [  129.916392] 3 locks held by kworker/2:0/28:
> [  129.916642]  #0: ffff888100084d48 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x515/0xba0
> [  129.917145]  #1: ffff888100c17dd0 ((work_completion)(&(&krcp->monitor_work)->work)){+.+.}-{0:0}, at: process_on0
> [  129.917758]  #2: ffff8881565f8508 (krc.lock){....}-{2:2}, at: kfree_rcu_monitor+0x29f/0x810
> [  129.918207] stack backtrace:
> [  129.918374] CPU: 2 PID: 28 Comm: kworker/2:0 Not tainted 6.3.0-rc1-yocto-standard+ #2
> [  129.918784] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.o4
> [  129.919397] Workqueue: events kfree_rcu_monitor
> [  129.919662] Call Trace:
> [  129.919812]  <TASK>
> [  129.919941]  dump_stack_lvl+0x64/0xb0
> [  129.920171]  dump_stack+0x10/0x20
> [  129.920372]  __lock_acquire+0xeb8/0x3a80
> [  129.920603]  ? ret_from_fork+0x2c/0x50
> [  129.920824]  ? __pfx___lock_acquire+0x10/0x10
> [  129.921068]  ? unwind_next_frame.part.0+0x1ba/0x3c0
> [  129.921343]  ? ret_from_fork+0x2c/0x50
> [  129.921573]  ? __this_cpu_preempt_check+0x13/0x20
> [  129.921847]  lock_acquire+0x194/0x480
> [  129.922060]  ? ___slab_alloc+0x68/0x12e0
> [  129.922293]  ? __pfx_lock_acquire+0x10/0x10
> [  129.922529]  ? __pfx_mark_lock.part.0+0x10/0x10
> [  129.922778]  ? __kasan_check_read+0x11/0x20
> [  129.922998]  ___slab_alloc+0x9a/0x12e0
> [  129.923222]  ? ___slab_alloc+0x68/0x12e0
> [  129.923452]  ? __pfx_mark_lock.part.0+0x10/0x10
> [  129.923706]  ? __kasan_check_read+0x11/0x20
> [  129.923937]  ? fill_pool+0x22a/0x370
> [  129.924161]  ? __lock_acquire+0xf5b/0x3a80
> [  129.924387]  ? fill_pool+0x22a/0x370
> [  129.924590]  __slab_alloc.constprop.0+0x5b/0x90
> [  129.924832]  kmem_cache_alloc+0x296/0x3d0
> [  129.925073]  ? fill_pool+0x22a/0x370
> [  129.925291]  fill_pool+0x22a/0x370
> [  129.925495]  ? __pfx_fill_pool+0x10/0x10
> [  129.925718]  ? __pfx___lock_acquire+0x10/0x10
> [  129.926034]  ? __kasan_check_read+0x11/0x20
> [  129.926269]  ? check_chain_key+0x200/0x2b0
> [  129.926503]  __debug_object_init+0x82/0x8c0
> [  129.926734]  ? __pfx_lock_release+0x10/0x10
> [  129.926984]  ? __pfx___debug_object_init+0x10/0x10
> [  129.927249]  ? __kasan_check_read+0x11/0x20
> [  129.927498]  ? do_raw_spin_unlock+0x9c/0x100
> [  129.927758]  debug_object_activate+0x2d1/0x2f0
> [  129.928022]  ? __pfx_debug_object_activate+0x10/0x10
> [  129.928300]  ? __this_cpu_preempt_check+0x13/0x20
> [  129.928583]  __call_rcu_common.constprop.0+0x94/0xeb0
> [  129.928897]  ? __this_cpu_preempt_check+0x13/0x20
> [  129.929186]  ? __pfx_rcu_work_rcufn+0x10/0x10
> [  129.929459]  ? __pfx___call_rcu_common.constprop.0+0x10/0x10
> [  129.929803]  ? __pfx_lock_acquired+0x10/0x10
> [  129.930067]  ? __pfx_do_raw_spin_trylock+0x10/0x10
> [  129.930363]  ? kfree_rcu_monitor+0x29f/0x810
> [  129.930627]  call_rcu+0xe/0x20
> [  129.930821]  queue_rcu_work+0x4f/0x60
> [  129.931050]  kfree_rcu_monitor+0x5d3/0x810
> [  129.931302]  ? __pfx_kfree_rcu_monitor+0x10/0x10
> [  129.931587]  ? __this_cpu_preempt_check+0x13/0x20
> [  129.931878]  process_one_work+0x607/0xba0
> [  129.932129]  ? __pfx_process_one_work+0x10/0x10
> [  129.932408]  ? worker_thread+0xd6/0x710
> [  129.932653]  worker_thread+0x2d4/0x710
> [  129.932888]  ? __pfx_worker_thread+0x10/0x10
> [  129.933154]  kthread+0x18b/0x1c0
> [  129.933363]  ? __pfx_kthread+0x10/0x10
> [  129.933598]  ret_from_fork+0x2c/0x50
> [  129.933825]  </TASK>
> 
> Maybe no need to convert ->list_lock to raw_spinlock.
> 
> --- a/lib/debugobjects.c
> +++ b/lib/debugobjects.c
> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>          unsigned long flags;
> 
>          /*
> -        * On RT enabled kernels the pool refill must happen in preemptible
> +        * The pool refill must happen in preemptible
>           * context:
>           */
> -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
> +       if (preemptible())
>                  fill_pool();
> 
>          db = get_bucket((unsigned long) addr);

Ah, this does fix the warning I encountered!
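
(For illustration only — a tiny user-space model of the two guards compared in
the hunk above. The struct and function names below are made up; "preemptible"
stands in for the kernel's preemptible() macro, which checks preempt_count()
and irqs_disabled() under CONFIG_PREEMPT_COUNT and is hard-coded to 0
otherwise.)

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for kernel state; not kernel code. */
struct ctx {
        const char *name;
        bool rt_kernel;    /* IS_ENABLED(CONFIG_PREEMPT_RT) */
        bool preemptible;  /* preemptible(); always 0 without CONFIG_PREEMPT_COUNT */
};

static void compare(const struct ctx *c)
{
        bool old_guard = !c->rt_kernel || c->preemptible;  /* current check */
        bool new_guard = c->preemptible;                   /* proposed check */

        printf("%-32s old: %-4s new: %s\n", c->name,
               old_guard ? "fill" : "skip", new_guard ? "fill" : "skip");
}

int main(void)
{
        const struct ctx cases[] = {
                { "!RT, preemptible",             false, true  },
                { "!RT, raw lock held / hardirq", false, false },
                { "RT, preemptible",              true,  true  },
                { "RT, raw lock held / hardirq",  true,  false },
        };

        for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
                compare(&cases[i]);
        return 0;
}

(The only row where the two guards differ is "!RT, raw lock held / hardirq":
the proposed check skips the refill there, which is the case the splats quoted
above are about.)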

> 
> 
> 
> Thanks
> Zqiang
> 
>>
>>
>> Regards,
>> Boqun
>>
>>>>
>>>> It's indeed unfortunate for the warning in the commit message. But
>>>> functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
>>>> in the critical section of raw_spinlock or in the hardirq context, which
>>>
>>> Hmm, I thought they may not, actually.
>>>
>>>> will cause problem in the PREEMPT_RT kernel. So I still think it is
>>>> reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.
>>>
>>> It wouldn't be the complete solution anyway. Once we allow even a GFP_ATOMIC
>>> slab allocation for such context, it means also page allocation can happen
>>> to refill the slabs, so lockdep will eventually complain about zone->lock,
>>> and who knows what else.
>>
>> Oh, indeed. :(
>>
>>>
>>>> In addition, there are many fix patches for this kind of warning in the
>>>> git log, so I also think there should be a general and better solution. :)
>>>
>>> Maybe, but given above, I doubt it's this one.
>>>
>>>>
>>>>>
>>>>
>>>
>>
>> -- 
>> Thanks,
>> Qi

-- 
Thanks,
Qi

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  6:50             ` Vlastimil Babka
@ 2023-04-12  7:30               ` Qi Zheng
  2023-04-12  8:32                 ` Qi Zheng
  2023-04-12 12:48                 ` Peter Zijlstra
  2023-04-12 12:47               ` Peter Zijlstra
  1 sibling, 2 replies; 23+ messages in thread
From: Qi Zheng @ 2023-04-12  7:30 UTC (permalink / raw)
  To: Vlastimil Babka, Zhang, Qiang1, Boqun Feng
  Cc: 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim, rientjes,
	penberg, cl, linux-mm, linux-kernel, Zhao Gongyi,
	Sebastian Andrzej Siewior, Thomas Gleixner, RCU,
	Paul E . McKenney, Peter Zijlstra



On 2023/4/12 14:50, Vlastimil Babka wrote:
> 
> 
> On 4/12/23 08:44, Zhang, Qiang1 wrote:
>>>
>>>
>>> On 2023/4/11 22:19, Vlastimil Babka wrote:
>>>> On 4/11/23 16:08, Qi Zheng wrote:
>>>>>
>>>>>
>>>>> On 2023/4/11 21:40, Vlastimil Babka wrote:
>>>>>> On 4/11/23 15:08, Qi Zheng wrote:
>>>>>>> The list_lock can be held in the critical section of
>>>>>>> raw_spinlock, and then lockdep will complain about it
>>>>>>> like below:
>>>>>>>
>>>>>>>     =============================
>>>>>>>     [ BUG: Invalid wait context ]
>>>>>>>     6.3.0-rc6-next-20230411 #7 Not tainted
>>>>>>>     -----------------------------
>>>>>>>     swapper/0/1 is trying to lock:
>>>>>>>     ffff888100055418 (&n->list_lock){....}-{3:3}, at: ___slab_alloc+0x73d/0x1330
>>>>>>>     other info that might help us debug this:
>>>>>>>     context-{5:5}
>>>>>>>     2 locks held by swapper/0/1:
>>>>>>>      #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: cblist_init_generic+0x22/0x2d0
>>>>>>>      #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>>>>>>     stack backtrace:
>>>>>>>     CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc6-next-20230411 #7
>>>>>>>     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
>>>>>>>     Call Trace:
>>>>>>>      <TASK>
>>>>>>>      dump_stack_lvl+0x77/0xc0
>>>>>>>      __lock_acquire+0xa65/0x2950
>>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>>      ? unwind_next_frame+0x602/0x8d0
>>>>>>>      lock_acquire+0xe0/0x300
>>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>>      ? find_usage_forwards+0x39/0x50
>>>>>>>      ? check_irq_usage+0x162/0xa70
>>>>>>>      ? __bfs+0x10c/0x2c0
>>>>>>>      _raw_spin_lock_irqsave+0x4f/0x90
>>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>>      ___slab_alloc+0x73d/0x1330
>>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>>      ? look_up_lock_class+0x5d/0x160
>>>>>>>      ? register_lock_class+0x48/0x500
>>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>>      kmem_cache_alloc+0x358/0x3b0
>>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>>      fill_pool+0x16b/0x2a0
>>>>>>>      ? __debug_object_init+0x292/0x560
>>>>>>>      ? lock_acquire+0xe0/0x300
>>>>>>>      ? cblist_init_generic+0x232/0x2d0
>>>>>>>      __debug_object_init+0x2c/0x560
>>>
>>> This "__debug_object_init" is because INIT_WORK() is called in
>>> cblist_init_generic(), so..
>>>
>>>>>>>      cblist_init_generic+0x147/0x2d0
>>>>>>>      rcu_init_tasks_generic+0x15/0x190
>>>>>>>      kernel_init_freeable+0x6e/0x3e0
>>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>>      kernel_init+0x1b/0x1d0
>>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>>      ret_from_fork+0x1f/0x30
>>>>>>>      </TASK>
>>>>>>>
>>>>>>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>>>>>>> or in the preemptible context of the PREEMPT_RT kernel, so
>>>>>>> the above warning is not a real issue, but it's better to
>>>>>>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>>>>>>> rid of such issue.
>>>>>>
>>>>>> + CC some RT and RCU people
>>>>>
>>>>> Thanks.
>>>>>
>>>>>>
>>>>>> AFAIK raw_spinlock is not just an annotation, but on RT it changes the
>>>>>> implementation from preemptible mutex to actual spin lock, so it would be
>>>>>
>>>>> Yeah.
>>>>>
>>>>>> rather unfortunate to do that for a spurious warning. Can it be somehow
>>>>>> fixed in a better way?
>>>
>>> ... probably a better fix is to drop locks and call INIT_WORK(), or make
>>> the cblist_init_generic() lockless (or part lockless), given it's just
>>> initializing the cblist, it's probably doable. But I haven't taken a
>>> careful look yet.
>>>
>>
>>
>> This is just one of the paths that triggers an invalid wait,  the following paths can also trigger:
>>
>> [  129.914547] [ BUG: Invalid wait context ]
>> [  129.914775] 6.3.0-rc1-yocto-standard+ #2 Not tainted
>> [  129.915044] -----------------------------
>> [  129.915272] kworker/2:0/28 is trying to lock:
>> [  129.915516] ffff88815660f570 (&c->lock){-.-.}-{3:3}, at: ___slab_alloc+0x68/0x12e0
>> [  129.915967] other info that might help us debug this:
>> [  129.916241] context-{5:5}
>> [  129.916392] 3 locks held by kworker/2:0/28:
>> [  129.916642]  #0: ffff888100084d48 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x515/0xba0
>> [  129.917145]  #1: ffff888100c17dd0 ((work_completion)(&(&krcp->monitor_work)->work)){+.+.}-{0:0}, at: process_on0
>> [  129.917758]  #2: ffff8881565f8508 (krc.lock){....}-{2:2}, at: kfree_rcu_monitor+0x29f/0x810
>> [  129.918207] stack backtrace:
>> [  129.918374] CPU: 2 PID: 28 Comm: kworker/2:0 Not tainted 6.3.0-rc1-yocto-standard+ #2
>> [  129.918784] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.o4
>> [  129.919397] Workqueue: events kfree_rcu_monitor
>> [  129.919662] Call Trace:
>> [  129.919812]  <TASK>
>> [  129.919941]  dump_stack_lvl+0x64/0xb0
>> [  129.920171]  dump_stack+0x10/0x20
>> [  129.920372]  __lock_acquire+0xeb8/0x3a80
>> [  129.920603]  ? ret_from_fork+0x2c/0x50
>> [  129.920824]  ? __pfx___lock_acquire+0x10/0x10
>> [  129.921068]  ? unwind_next_frame.part.0+0x1ba/0x3c0
>> [  129.921343]  ? ret_from_fork+0x2c/0x50
>> [  129.921573]  ? __this_cpu_preempt_check+0x13/0x20
>> [  129.921847]  lock_acquire+0x194/0x480
>> [  129.922060]  ? ___slab_alloc+0x68/0x12e0
>> [  129.922293]  ? __pfx_lock_acquire+0x10/0x10
>> [  129.922529]  ? __pfx_mark_lock.part.0+0x10/0x10
>> [  129.922778]  ? __kasan_check_read+0x11/0x20
>> [  129.922998]  ___slab_alloc+0x9a/0x12e0
>> [  129.923222]  ? ___slab_alloc+0x68/0x12e0
>> [  129.923452]  ? __pfx_mark_lock.part.0+0x10/0x10
>> [  129.923706]  ? __kasan_check_read+0x11/0x20
>> [  129.923937]  ? fill_pool+0x22a/0x370
>> [  129.924161]  ? __lock_acquire+0xf5b/0x3a80
>> [  129.924387]  ? fill_pool+0x22a/0x370
>> [  129.924590]  __slab_alloc.constprop.0+0x5b/0x90
>> [  129.924832]  kmem_cache_alloc+0x296/0x3d0
>> [  129.925073]  ? fill_pool+0x22a/0x370
>> [  129.925291]  fill_pool+0x22a/0x370
>> [  129.925495]  ? __pfx_fill_pool+0x10/0x10
>> [  129.925718]  ? __pfx___lock_acquire+0x10/0x10
>> [  129.926034]  ? __kasan_check_read+0x11/0x20
>> [  129.926269]  ? check_chain_key+0x200/0x2b0
>> [  129.926503]  __debug_object_init+0x82/0x8c0
>> [  129.926734]  ? __pfx_lock_release+0x10/0x10
>> [  129.926984]  ? __pfx___debug_object_init+0x10/0x10
>> [  129.927249]  ? __kasan_check_read+0x11/0x20
>> [  129.927498]  ? do_raw_spin_unlock+0x9c/0x100
>> [  129.927758]  debug_object_activate+0x2d1/0x2f0
>> [  129.928022]  ? __pfx_debug_object_activate+0x10/0x10
>> [  129.928300]  ? __this_cpu_preempt_check+0x13/0x20
>> [  129.928583]  __call_rcu_common.constprop.0+0x94/0xeb0
>> [  129.928897]  ? __this_cpu_preempt_check+0x13/0x20
>> [  129.929186]  ? __pfx_rcu_work_rcufn+0x10/0x10
>> [  129.929459]  ? __pfx___call_rcu_common.constprop.0+0x10/0x10
>> [  129.929803]  ? __pfx_lock_acquired+0x10/0x10
>> [  129.930067]  ? __pfx_do_raw_spin_trylock+0x10/0x10
>> [  129.930363]  ? kfree_rcu_monitor+0x29f/0x810
>> [  129.930627]  call_rcu+0xe/0x20
>> [  129.930821]  queue_rcu_work+0x4f/0x60
>> [  129.931050]  kfree_rcu_monitor+0x5d3/0x810
>> [  129.931302]  ? __pfx_kfree_rcu_monitor+0x10/0x10
>> [  129.931587]  ? __this_cpu_preempt_check+0x13/0x20
>> [  129.931878]  process_one_work+0x607/0xba0
>> [  129.932129]  ? __pfx_process_one_work+0x10/0x10
>> [  129.932408]  ? worker_thread+0xd6/0x710
>> [  129.932653]  worker_thread+0x2d4/0x710
>> [  129.932888]  ? __pfx_worker_thread+0x10/0x10
>> [  129.933154]  kthread+0x18b/0x1c0
>> [  129.933363]  ? __pfx_kthread+0x10/0x10
>> [  129.933598]  ret_from_fork+0x2c/0x50
>> [  129.933825]  </TASK>
>>
>> Maybe no need to convert ->list_lock to raw_spinlock.
>>
>> --- a/lib/debugobjects.c
>> +++ b/lib/debugobjects.c
>> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>>          unsigned long flags;
>>
>>          /*
>> -        * On RT enabled kernels the pool refill must happen in preemptible
>> +        * The pool refill must happen in preemptible
>>           * context:
>>           */
>> -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>> +       if (preemptible())
>>                  fill_pool();
> 
> +CC Peterz
> 
> Aha so this is in fact another case where the code is written with
> actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
> CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?

Maybe we should make CONFIG_PROVE_RAW_LOCK_NESTING depend on
CONFIG_PREEMPT_RT:

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f0d5b82e478d..257b170aacb6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1262,6 +1262,7 @@ config PROVE_LOCKING
  config PROVE_RAW_LOCK_NESTING
         bool "Enable raw_spinlock - spinlock nesting checks"
         depends on PROVE_LOCKING
+       depends on PREEMPT_RT
         default n
         help
          Enable the raw_spinlock vs. spinlock nesting checks which ensure

> 
>>          db = get_bucket((unsigned long) addr);
>>
>>
>>
>> Thanks
>> Zqiang
>>
>>>
>>>
>>> Regards,
>>> Boqun
>>>
>>>>>
>>>>> It's indeed unfortunate for the warning in the commit message. But
>>>>> functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
>>>>> in the critical section of raw_spinlock or in the hardirq context, which
>>>>
>>>> Hmm, I thought they may not, actually.
>>>>
>>>>> will cause problem in the PREEMPT_RT kernel. So I still think it is
>>>>> reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.
>>>>
>>>> It wouldn't be the complete solution anyway. Once we allow even a GFP_ATOMIC
>>>> slab allocation for such context, it means also page allocation can happen
>>>> to refill the slabs, so lockdep will eventually complain about zone->lock,
>>>> and who knows what else.
>>>
>>> Oh, indeed. :(
>>>
>>>>
>>>>> In addition, there are many fix patches for this kind of warning in the
>>>>> git log, so I also think there should be a general and better solution. :)
>>>>
>>>> Maybe, but given above, I doubt it's this one.
>>>>
>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>> -- 
>>> Thanks,
>>> Qi

-- 
Thanks,
Qi

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  7:30               ` Qi Zheng
@ 2023-04-12  8:32                 ` Qi Zheng
  2023-04-12 13:09                   ` Waiman Long
  2023-04-12 12:48                 ` Peter Zijlstra
  1 sibling, 1 reply; 23+ messages in thread
From: Qi Zheng @ 2023-04-12  8:32 UTC (permalink / raw)
  To: Vlastimil Babka, Zhang, Qiang1, Boqun Feng, longman
  Cc: 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim, rientjes,
	penberg, cl, linux-mm, linux-kernel, Zhao Gongyi,
	Sebastian Andrzej Siewior, Thomas Gleixner, RCU,
	Paul E . McKenney, Peter Zijlstra



On 2023/4/12 15:30, Qi Zheng wrote:
> 
> 
> On 2023/4/12 14:50, Vlastimil Babka wrote:
>>
>>
>> On 4/12/23 08:44, Zhang, Qiang1 wrote:
>>>>
>>>>
>>>> On 2023/4/11 22:19, Vlastimil Babka wrote:
>>>>> On 4/11/23 16:08, Qi Zheng wrote:
>>>>>>
>>>>>>
>>>>>> On 2023/4/11 21:40, Vlastimil Babka wrote:
>>>>>>> On 4/11/23 15:08, Qi Zheng wrote:
>>>>>>>> The list_lock can be held in the critical section of
>>>>>>>> raw_spinlock, and then lockdep will complain about it
>>>>>>>> like below:
>>>>>>>>
>>>>>>>>     =============================
>>>>>>>>     [ BUG: Invalid wait context ]
>>>>>>>>     6.3.0-rc6-next-20230411 #7 Not tainted
>>>>>>>>     -----------------------------
>>>>>>>>     swapper/0/1 is trying to lock:
>>>>>>>>     ffff888100055418 (&n->list_lock){....}-{3:3}, at: 
>>>>>>>> ___slab_alloc+0x73d/0x1330
>>>>>>>>     other info that might help us debug this:
>>>>>>>>     context-{5:5}
>>>>>>>>     2 locks held by swapper/0/1:
>>>>>>>>      #0: ffffffff824e8160 (rcu_tasks.cbs_gbl_lock){....}-{2:2}, 
>>>>>>>> at: cblist_init_generic+0x22/0x2d0
>>>>>>>>      #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, 
>>>>>>>> lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>>>>>>>     stack backtrace:
>>>>>>>>     CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
>>>>>>>> 6.3.0-rc6-next-20230411 #7
>>>>>>>>     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
>>>>>>>> 1.14.0-2 04/01/2014
>>>>>>>>     Call Trace:
>>>>>>>>      <TASK>
>>>>>>>>      dump_stack_lvl+0x77/0xc0
>>>>>>>>      __lock_acquire+0xa65/0x2950
>>>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>>>      ? unwind_next_frame+0x602/0x8d0
>>>>>>>>      lock_acquire+0xe0/0x300
>>>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>>>      ? find_usage_forwards+0x39/0x50
>>>>>>>>      ? check_irq_usage+0x162/0xa70
>>>>>>>>      ? __bfs+0x10c/0x2c0
>>>>>>>>      _raw_spin_lock_irqsave+0x4f/0x90
>>>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>>>      ___slab_alloc+0x73d/0x1330
>>>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>>>      ? look_up_lock_class+0x5d/0x160
>>>>>>>>      ? register_lock_class+0x48/0x500
>>>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>>>      kmem_cache_alloc+0x358/0x3b0
>>>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>>>      fill_pool+0x16b/0x2a0
>>>>>>>>      ? __debug_object_init+0x292/0x560
>>>>>>>>      ? lock_acquire+0xe0/0x300
>>>>>>>>      ? cblist_init_generic+0x232/0x2d0
>>>>>>>>      __debug_object_init+0x2c/0x560
>>>>
>>>> This "__debug_object_init" is because INIT_WORK() is called in
>>>> cblist_init_generic(), so..
>>>>
>>>>>>>>      cblist_init_generic+0x147/0x2d0
>>>>>>>>      rcu_init_tasks_generic+0x15/0x190
>>>>>>>>      kernel_init_freeable+0x6e/0x3e0
>>>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>>>      kernel_init+0x1b/0x1d0
>>>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>>>      ret_from_fork+0x1f/0x30
>>>>>>>>      </TASK>
>>>>>>>>
>>>>>>>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>>>>>>>> or in the preemptible context of the PREEMPT_RT kernel, so
>>>>>>>> the above warning is not a real issue, but it's better to
>>>>>>>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>>>>>>>> rid of such issue.
>>>>>>>
>>>>>>> + CC some RT and RCU people
>>>>>>
>>>>>> Thanks.
>>>>>>
>>>>>>>
>>>>>>> AFAIK raw_spinlock is not just an annotation, but on RT it 
>>>>>>> changes the
>>>>>>> implementation from preemptible mutex to actual spin lock, so it 
>>>>>>> would be
>>>>>>
>>>>>> Yeah.
>>>>>>
>>>>>>> rather unfortunate to do that for a spurious warning. Can it be 
>>>>>>> somehow
>>>>>>> fixed in a better way?
>>>>
>>>> ... probably a better fix is to drop locks and call INIT_WORK(), or 
>>>> make
>>>> the cblist_init_generic() lockless (or part lockless), given it's just
>>>> initializing the cblist, it's probably doable. But I haven't taken a
>>>> careful look yet.
>>>>
>>>
>>>
>>> This is just one of the paths that triggers an invalid wait,  the 
>>> following paths can also trigger:
>>>
>>> [  129.914547] [ BUG: Invalid wait context ]
>>> [  129.914775] 6.3.0-rc1-yocto-standard+ #2 Not tainted
>>> [  129.915044] -----------------------------
>>> [  129.915272] kworker/2:0/28 is trying to lock:
>>> [  129.915516] ffff88815660f570 (&c->lock){-.-.}-{3:3}, at: 
>>> ___slab_alloc+0x68/0x12e0
>>> [  129.915967] other info that might help us debug this:
>>> [  129.916241] context-{5:5}
>>> [  129.916392] 3 locks held by kworker/2:0/28:
>>> [  129.916642]  #0: ffff888100084d48 
>>> ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x515/0xba0
>>> [  129.917145]  #1: ffff888100c17dd0 
>>> ((work_completion)(&(&krcp->monitor_work)->work)){+.+.}-{0:0}, at: 
>>> process_on0
>>> [  129.917758]  #2: ffff8881565f8508 (krc.lock){....}-{2:2}, at: 
>>> kfree_rcu_monitor+0x29f/0x810
>>> [  129.918207] stack backtrace:
>>> [  129.918374] CPU: 2 PID: 28 Comm: kworker/2:0 Not tainted 
>>> 6.3.0-rc1-yocto-standard+ #2
>>> [  129.918784] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), 
>>> BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.o4
>>> [  129.919397] Workqueue: events kfree_rcu_monitor
>>> [  129.919662] Call Trace:
>>> [  129.919812]  <TASK>
>>> [  129.919941]  dump_stack_lvl+0x64/0xb0
>>> [  129.920171]  dump_stack+0x10/0x20
>>> [  129.920372]  __lock_acquire+0xeb8/0x3a80
>>> [  129.920603]  ? ret_from_fork+0x2c/0x50
>>> [  129.920824]  ? __pfx___lock_acquire+0x10/0x10
>>> [  129.921068]  ? unwind_next_frame.part.0+0x1ba/0x3c0
>>> [  129.921343]  ? ret_from_fork+0x2c/0x50
>>> [  129.921573]  ? __this_cpu_preempt_check+0x13/0x20
>>> [  129.921847]  lock_acquire+0x194/0x480
>>> [  129.922060]  ? ___slab_alloc+0x68/0x12e0
>>> [  129.922293]  ? __pfx_lock_acquire+0x10/0x10
>>> [  129.922529]  ? __pfx_mark_lock.part.0+0x10/0x10
>>> [  129.922778]  ? __kasan_check_read+0x11/0x20
>>> [  129.922998]  ___slab_alloc+0x9a/0x12e0
>>> [  129.923222]  ? ___slab_alloc+0x68/0x12e0
>>> [  129.923452]  ? __pfx_mark_lock.part.0+0x10/0x10
>>> [  129.923706]  ? __kasan_check_read+0x11/0x20
>>> [  129.923937]  ? fill_pool+0x22a/0x370
>>> [  129.924161]  ? __lock_acquire+0xf5b/0x3a80
>>> [  129.924387]  ? fill_pool+0x22a/0x370
>>> [  129.924590]  __slab_alloc.constprop.0+0x5b/0x90
>>> [  129.924832]  kmem_cache_alloc+0x296/0x3d0
>>> [  129.925073]  ? fill_pool+0x22a/0x370
>>> [  129.925291]  fill_pool+0x22a/0x370
>>> [  129.925495]  ? __pfx_fill_pool+0x10/0x10
>>> [  129.925718]  ? __pfx___lock_acquire+0x10/0x10
>>> [  129.926034]  ? __kasan_check_read+0x11/0x20
>>> [  129.926269]  ? check_chain_key+0x200/0x2b0
>>> [  129.926503]  __debug_object_init+0x82/0x8c0
>>> [  129.926734]  ? __pfx_lock_release+0x10/0x10
>>> [  129.926984]  ? __pfx___debug_object_init+0x10/0x10
>>> [  129.927249]  ? __kasan_check_read+0x11/0x20
>>> [  129.927498]  ? do_raw_spin_unlock+0x9c/0x100
>>> [  129.927758]  debug_object_activate+0x2d1/0x2f0
>>> [  129.928022]  ? __pfx_debug_object_activate+0x10/0x10
>>> [  129.928300]  ? __this_cpu_preempt_check+0x13/0x20
>>> [  129.928583]  __call_rcu_common.constprop.0+0x94/0xeb0
>>> [  129.928897]  ? __this_cpu_preempt_check+0x13/0x20
>>> [  129.929186]  ? __pfx_rcu_work_rcufn+0x10/0x10
>>> [  129.929459]  ? __pfx___call_rcu_common.constprop.0+0x10/0x10
>>> [  129.929803]  ? __pfx_lock_acquired+0x10/0x10
>>> [  129.930067]  ? __pfx_do_raw_spin_trylock+0x10/0x10
>>> [  129.930363]  ? kfree_rcu_monitor+0x29f/0x810
>>> [  129.930627]  call_rcu+0xe/0x20
>>> [  129.930821]  queue_rcu_work+0x4f/0x60
>>> [  129.931050]  kfree_rcu_monitor+0x5d3/0x810
>>> [  129.931302]  ? __pfx_kfree_rcu_monitor+0x10/0x10
>>> [  129.931587]  ? __this_cpu_preempt_check+0x13/0x20
>>> [  129.931878]  process_one_work+0x607/0xba0
>>> [  129.932129]  ? __pfx_process_one_work+0x10/0x10
>>> [  129.932408]  ? worker_thread+0xd6/0x710
>>> [  129.932653]  worker_thread+0x2d4/0x710
>>> [  129.932888]  ? __pfx_worker_thread+0x10/0x10
>>> [  129.933154]  kthread+0x18b/0x1c0
>>> [  129.933363]  ? __pfx_kthread+0x10/0x10
>>> [  129.933598]  ret_from_fork+0x2c/0x50
>>> [  129.933825]  </TASK>
>>>
>>> Maybe no need to convert ->list_lock to raw_spinlock.
>>>
>>> --- a/lib/debugobjects.c
>>> +++ b/lib/debugobjects.c
>>> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct 
>>> debug_obj_descr *descr, int onstack
>>>          unsigned long flags;
>>>
>>>          /*
>>> -        * On RT enabled kernels the pool refill must happen in 
>>> preemptible
>>> +        * The pool refill must happen in preemptible
>>>           * context:
>>>           */
>>> -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>>> +       if (preemptible())
>>>                  fill_pool();
>>
>> +CC Peterz
>>
>> Aha so this is in fact another case where the code is written with
>> actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
>> CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?
> 
> Maybe we should make CONFIG_PROVE_RAW_LOCK_NESTING depend on
> CONFIG_PREEMPT_RT:

I found a discussion [1] in the commit history of why
CONFIG_PROVE_RAW_LOCK_NESTING was not made to depend on CONFIG_PREEMPT_RT:

```
 >>> We now always get a "Invalid wait context" warning with
 >>> CONFIG_PROVE_RAW_LOCK_NESTING=y, see the full warning below:
 >>>
 >>>        [    0.705900] =============================
 >>>        [    0.706002] [ BUG: Invalid wait context ]
 >>>        [    0.706180] 5.13.0+ #4 Not tainted
 >>>        [    0.706349] -----------------------------
 >> I believe the purpose of CONFIG_PROVE_RAW_LOCK_NESTING is experimental
 >> and it is turned off by default. Turning it on can cause problem as
 >> shown in your lockdep splat. Limiting it to just PREEMPT_RT will defeat
 >> its purpose to find potential spinlock nesting problem in non-PREEMPT_RT
 >> kernel.
 > As far as I know, a spinlock can nest another spinlock. In
 > non-PREEMPT_RT kernel
 > spin_lock and raw_spin_lock are same , so here acquiring a spin_lock 
in hardirq
 > context is acceptable, the warning is not needed. My knowledge on this
 > is not enough,
 > Will dig into this.
 >
 >> The point is to fix the issue found,
 > Agree. I thought there was a spinlock usage issue, but by checking
 > deactivate_slab context,
 > looks like the spinlock usage is well. Maybe I'm missing something?

Yes, spinlock and raw spinlock are the same in non-RT kernel. They are
only different in RT kernel. However, non-RT kernel is also more heavily
tested than the RT kernel counterpart. The purpose of this config option
is to expose spinlock nesting problem in more areas of the code. If you
look at the config help text of PROVE_RAW_LOCK_NESTING:

          help
           Enable the raw_spinlock vs. spinlock nesting checks which ensure
           that the lock nesting rules for PREEMPT_RT enabled kernels are
           not violated.

           NOTE: There are known nesting problems. So if you enable this
           option expect lockdep splats until these problems have been fully
           addressed which is work in progress. This config switch allows to
           identify and analyze these problems. It will be removed and the
           check permanentely enabled once the main issues have been fixed.

           If unsure, select N.

So lockdep splat is expected. It will take time to address all the
issues found.
```

Also +Waiman Long.


[1]. 
https://lore.kernel.org/lkml/1c4c058b-3745-5586-4961-79d83fb5b049@redhat.com/

> 
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index f0d5b82e478d..257b170aacb6 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -1262,6 +1262,7 @@ config PROVE_LOCKING
>   config PROVE_RAW_LOCK_NESTING
>          bool "Enable raw_spinlock - spinlock nesting checks"
>          depends on PROVE_LOCKING
> +       depends on PREEMPT_RT
>          default n
>          help
>           Enable the raw_spinlock vs. spinlock nesting checks which ensure
> 
>>
>>>          db = get_bucket((unsigned long) addr);
>>>
>>>
>>>
>>> Thanks
>>> Zqiang
>>>
>>>>
>>>>
>>>> Regards,
>>>> Boqun
>>>>
>>>>>>
>>>>>> It's indeed unfortunate for the warning in the commit message. But
>>>>>> functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
>>>>>> in the critical section of raw_spinlock or in the hardirq context, 
>>>>>> which
>>>>>
>>>>> Hmm, I thought they may not, actually.
>>>>>
>>>>>> will cause problem in the PREEMPT_RT kernel. So I still think it is
>>>>>> reasonable to convert kmem_cache_node->list_lock to raw_spinlock 
>>>>>> type.
>>>>>
>>>>> It wouldn't be the complete solution anyway. Once we allow even a 
>>>>> GFP_ATOMIC
>>>>> slab allocation for such context, it means also page allocation can 
>>>>> happen
>>>>> to refill the slabs, so lockdep will eventually complain about 
>>>>> zone->lock,
>>>>> and who knows what else.
>>>>
>>>> Oh, indeed. :(
>>>>
>>>>>
>>>>>> In addition, there are many fix patches for this kind of warning 
>>>>>> in the
>>>>>> git log, so I also think there should be a general and better 
>>>>>> solution. :)
>>>>>
>>>>> Maybe, but given above, I doubt it's this one.
>>>>>
>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>> -- 
>>>> Thanks,
>>>> Qi
> 

-- 
Thanks,
Qi

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  6:50             ` Vlastimil Babka
  2023-04-12  7:30               ` Qi Zheng
@ 2023-04-12 12:47               ` Peter Zijlstra
  2023-04-12 16:44                 ` Qi Zheng
  1 sibling, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2023-04-12 12:47 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Zhang, Qiang1, Boqun Feng, Qi Zheng, 42.hyeyoo, akpm,
	roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl, linux-mm,
	linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney

On Wed, Apr 12, 2023 at 08:50:29AM +0200, Vlastimil Babka wrote:

> > --- a/lib/debugobjects.c
> > +++ b/lib/debugobjects.c
> > @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
> >         unsigned long flags;
> > 
> >         /*
> > -        * On RT enabled kernels the pool refill must happen in preemptible
> > +        * The pool refill must happen in preemptible
> >          * context:
> >          */
> > -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
> > +       if (preemptible())
> >                 fill_pool();
> 
> +CC Peterz
> 
> Aha so this is in fact another case where the code is written with
> actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
> CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?

Ooh, tricky, yes. PROVE_RAW_LOCK_NESTING always follows the PREEMPT_RT
rules and does not expect trickery like the above.

Something like the completely untested below might be of help.

---
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index d22430840b53..f3120d6a7d9e 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -33,6 +33,7 @@ enum lockdep_wait_type {
 enum lockdep_lock_type {
 	LD_LOCK_NORMAL = 0,	/* normal, catch all */
 	LD_LOCK_PERCPU,		/* percpu */
+	LD_LOCK_WAIT,		/* annotation */
 	LD_LOCK_MAX,
 };
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 50d4863974e7..a4077f5bb75b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2279,8 +2279,9 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
 	 * As a result, we will skip local_lock(), when we search for irq
 	 * inversion bugs.
 	 */
-	if (entry->class->lock_type == LD_LOCK_PERCPU) {
-		if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+	if (entry->class->lock_type != LD_LOCK_NORMAL) {
+		if (entry->class->lock_type == LD_LOCK_PERCPU &&
+		    DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
 			return false;
 
 		return true;
@@ -4752,7 +4753,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
 
 	for (; depth < curr->lockdep_depth; depth++) {
 		struct held_lock *prev = curr->held_locks + depth;
-		u8 prev_inner = hlock_class(prev)->wait_type_inner;
+		struct lock_class *class = hlock_class(prev);
+		u8 prev_inner = class->wait_type_inner;
 
 		if (prev_inner) {
 			/*
@@ -4762,6 +4764,12 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
 			 * Also due to trylocks.
 			 */
 			curr_inner = min(curr_inner, prev_inner);
+
+			/*
+			 * Allow override for annotations.
+			 */
+			if (unlikely(class->lock_type == LD_LOCK_WAIT))
+				curr_inner = prev_inner;
 		}
 	}
 
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index df86e649d8be..fae71ef72a16 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -565,8 +565,16 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
 	 * On RT enabled kernels the pool refill must happen in preemptible
 	 * context:
 	 */
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+		static lockdep_map dep_map = {
+			.name = "wait-type-override",
+			.wait_type_inner = LD_WAIT_SLEEP,
+			.lock_type = LD_LOCK_WAIT,
+		};
+		lock_map_acquire(&dep_map);
 		fill_pool();
+		lock_map_release(&dep_map);
+	}
 
 	db = get_bucket((unsigned long) addr);
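
(To make the intent of the hunks above more concrete, here is a heavily
simplified user-space model of check_wait_context() plus the proposed
LD_LOCK_WAIT override — an illustration under stated assumptions, not kernel
code: irq contexts, outer wait types and trylocks are ignored, and the names
below only mirror the kernel ones. The numeric values match the {n:n}
annotations in the splats quoted in this thread: raw_spinlock_t = 2,
spinlock_t = 3, sleeping locks = 4.)

#include <stdio.h>

enum wait_type { WAIT_SPIN = 2, WAIT_CONFIG = 3, WAIT_SLEEP = 4, WAIT_MAX = 5 };
enum lock_type { LOCK_NORMAL = 0, LOCK_WAIT };  /* LOCK_WAIT ~ LD_LOCK_WAIT */

struct held {
        enum wait_type inner;
        enum lock_type type;
};

/* Nonzero means lockdep would report "Invalid wait context". */
static int invalid_wait_context(const struct held *held, int depth,
                                enum wait_type next_inner)
{
        enum wait_type curr_inner = WAIT_MAX;   /* plain task context */
        int i;

        for (i = 0; i < depth; i++) {
                if (held[i].inner < curr_inner)
                        curr_inner = held[i].inner;
                if (held[i].type == LOCK_WAIT)  /* the proposed override */
                        curr_inner = held[i].inner;
        }
        return next_inner > curr_inner;
}

int main(void)
{
        /* kfree_rcu_monitor(): krc.lock (raw, 2) is held and fill_pool()
         * then takes the slub spinlock_t (3) -> the splat quoted above. */
        struct held plain[] = { { WAIT_SPIN, LOCK_NORMAL } };
        printf("no annotation:   %s\n",
               invalid_wait_context(plain, 1, WAIT_CONFIG) ? "splat" : "ok");

        /* With "wait-type-override" (4) acquired around fill_pool(), the
         * slub lock itself passes. Real lockdep also checks the acquisition
         * of the override map, though, which is what the report further
         * down in the thread trips over. */
        struct held annotated[] = { { WAIT_SPIN, LOCK_NORMAL },
                                    { WAIT_SLEEP, LOCK_WAIT } };
        printf("with annotation: %s\n",
               invalid_wait_context(annotated, 2, WAIT_CONFIG) ? "splat" : "ok");
        return 0;
}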
 

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  7:30               ` Qi Zheng
  2023-04-12  8:32                 ` Qi Zheng
@ 2023-04-12 12:48                 ` Peter Zijlstra
  1 sibling, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2023-04-12 12:48 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Vlastimil Babka, Zhang, Qiang1, Boqun Feng, 42.hyeyoo, akpm,
	roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl, linux-mm,
	linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney

On Wed, Apr 12, 2023 at 03:30:20PM +0800, Qi Zheng wrote:
> Maybe we should make CONFIG_PROVE_RAW_LOCK_NESTING depend on
> CONFIG_PREEMPT_RT:
> 
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index f0d5b82e478d..257b170aacb6 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -1262,6 +1262,7 @@ config PROVE_LOCKING
>  config PROVE_RAW_LOCK_NESTING
>         bool "Enable raw_spinlock - spinlock nesting checks"
>         depends on PROVE_LOCKING
> +       depends on PREEMPT_RT
>         default n
>         help
>          Enable the raw_spinlock vs. spinlock nesting checks which ensure

No, very much not. The idea is that we want to work towards having this
always enabled (when PROVE_LOCKING) to ensure code gets and stays
PREEMPT_RT clean.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  8:32                 ` Qi Zheng
@ 2023-04-12 13:09                   ` Waiman Long
  2023-04-12 16:47                     ` Qi Zheng
  0 siblings, 1 reply; 23+ messages in thread
From: Waiman Long @ 2023-04-12 13:09 UTC (permalink / raw)
  To: Qi Zheng, Vlastimil Babka, Zhang, Qiang1, Boqun Feng
  Cc: 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim, rientjes,
	penberg, cl, linux-mm, linux-kernel, Zhao Gongyi,
	Sebastian Andrzej Siewior, Thomas Gleixner, RCU,
	Paul E . McKenney, Peter Zijlstra

On 4/12/23 04:32, Qi Zheng wrote:
>
>
> On 2023/4/12 15:30, Qi Zheng wrote:
>>
>>
>> On 2023/4/12 14:50, Vlastimil Babka wrote:
>>>
>>>
>>> On 4/12/23 08:44, Zhang, Qiang1 wrote:
>>>>>
>>>>>
>>>>> On 2023/4/11 22:19, Vlastimil Babka wrote:
>>>>>> On 4/11/23 16:08, Qi Zheng wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 2023/4/11 21:40, Vlastimil Babka wrote:
>>>>>>>> On 4/11/23 15:08, Qi Zheng wrote:
>>>>>>>>> The list_lock can be held in the critical section of
>>>>>>>>> raw_spinlock, and then lockdep will complain about it
>>>>>>>>> like below:
>>>>>>>>>
>>>>>>>>>     =============================
>>>>>>>>>     [ BUG: Invalid wait context ]
>>>>>>>>>     6.3.0-rc6-next-20230411 #7 Not tainted
>>>>>>>>>     -----------------------------
>>>>>>>>>     swapper/0/1 is trying to lock:
>>>>>>>>>     ffff888100055418 (&n->list_lock){....}-{3:3}, at: 
>>>>>>>>> ___slab_alloc+0x73d/0x1330
>>>>>>>>>     other info that might help us debug this:
>>>>>>>>>     context-{5:5}
>>>>>>>>>     2 locks held by swapper/0/1:
>>>>>>>>>      #0: ffffffff824e8160 
>>>>>>>>> (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: 
>>>>>>>>> cblist_init_generic+0x22/0x2d0
>>>>>>>>>      #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, 
>>>>>>>>> lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>>>>>>>>     stack backtrace:
>>>>>>>>>     CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
>>>>>>>>> 6.3.0-rc6-next-20230411 #7
>>>>>>>>>     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
>>>>>>>>> BIOS 1.14.0-2 04/01/2014
>>>>>>>>>     Call Trace:
>>>>>>>>>      <TASK>
>>>>>>>>>      dump_stack_lvl+0x77/0xc0
>>>>>>>>>      __lock_acquire+0xa65/0x2950
>>>>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>>>>      ? unwind_next_frame+0x602/0x8d0
>>>>>>>>>      lock_acquire+0xe0/0x300
>>>>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>>>>      ? find_usage_forwards+0x39/0x50
>>>>>>>>>      ? check_irq_usage+0x162/0xa70
>>>>>>>>>      ? __bfs+0x10c/0x2c0
>>>>>>>>>      _raw_spin_lock_irqsave+0x4f/0x90
>>>>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>>>>      ___slab_alloc+0x73d/0x1330
>>>>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>>>>      ? look_up_lock_class+0x5d/0x160
>>>>>>>>>      ? register_lock_class+0x48/0x500
>>>>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>>>>      kmem_cache_alloc+0x358/0x3b0
>>>>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>>>>      fill_pool+0x16b/0x2a0
>>>>>>>>>      ? __debug_object_init+0x292/0x560
>>>>>>>>>      ? lock_acquire+0xe0/0x300
>>>>>>>>>      ? cblist_init_generic+0x232/0x2d0
>>>>>>>>>      __debug_object_init+0x2c/0x560
>>>>>
>>>>> This "__debug_object_init" is because INIT_WORK() is called in
>>>>> cblist_init_generic(), so..
>>>>>
>>>>>>>>> cblist_init_generic+0x147/0x2d0
>>>>>>>>>      rcu_init_tasks_generic+0x15/0x190
>>>>>>>>>      kernel_init_freeable+0x6e/0x3e0
>>>>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>>>>      kernel_init+0x1b/0x1d0
>>>>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>>>>      ret_from_fork+0x1f/0x30
>>>>>>>>>      </TASK>
>>>>>>>>>
>>>>>>>>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>>>>>>>>> or in the preemptible context of the PREEMPT_RT kernel, so
>>>>>>>>> the above warning is not a real issue, but it's better to
>>>>>>>>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>>>>>>>>> rid of such issue.
>>>>>>>>
>>>>>>>> + CC some RT and RCU people
>>>>>>>
>>>>>>> Thanks.
>>>>>>>
>>>>>>>>
>>>>>>>> AFAIK raw_spinlock is not just an annotation, but on RT it 
>>>>>>>> changes the
>>>>>>>> implementation from preemptible mutex to actual spin lock, so 
>>>>>>>> it would be
>>>>>>>
>>>>>>> Yeah.
>>>>>>>
>>>>>>>> rather unfortunate to do that for a spurious warning. Can it be 
>>>>>>>> somehow
>>>>>>>> fixed in a better way?
>>>>>
>>>>> ... probably a better fix is to drop locks and call INIT_WORK(), 
>>>>> or make
>>>>> the cblist_init_generic() lockless (or part lockless), given it's 
>>>>> just
>>>>> initializing the cblist, it's probably doable. But I haven't taken a
>>>>> careful look yet.
>>>>>
>>>>
>>>>
>>>> This is just one of the paths that triggers an invalid wait,  the 
>>>> following paths can also trigger:
>>>>
>>>> [  129.914547] [ BUG: Invalid wait context ]
>>>> [  129.914775] 6.3.0-rc1-yocto-standard+ #2 Not tainted
>>>> [  129.915044] -----------------------------
>>>> [  129.915272] kworker/2:0/28 is trying to lock:
>>>> [  129.915516] ffff88815660f570 (&c->lock){-.-.}-{3:3}, at: 
>>>> ___slab_alloc+0x68/0x12e0
>>>> [  129.915967] other info that might help us debug this:
>>>> [  129.916241] context-{5:5}
>>>> [  129.916392] 3 locks held by kworker/2:0/28:
>>>> [  129.916642]  #0: ffff888100084d48 
>>>> ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x515/0xba0
>>>> [  129.917145]  #1: ffff888100c17dd0 
>>>> ((work_completion)(&(&krcp->monitor_work)->work)){+.+.}-{0:0}, at: 
>>>> process_on0
>>>> [  129.917758]  #2: ffff8881565f8508 (krc.lock){....}-{2:2}, at: 
>>>> kfree_rcu_monitor+0x29f/0x810
>>>> [  129.918207] stack backtrace:
>>>> [  129.918374] CPU: 2 PID: 28 Comm: kworker/2:0 Not tainted 
>>>> 6.3.0-rc1-yocto-standard+ #2
>>>> [  129.918784] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), 
>>>> BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.o4
>>>> [  129.919397] Workqueue: events kfree_rcu_monitor
>>>> [  129.919662] Call Trace:
>>>> [  129.919812]  <TASK>
>>>> [  129.919941]  dump_stack_lvl+0x64/0xb0
>>>> [  129.920171]  dump_stack+0x10/0x20
>>>> [  129.920372]  __lock_acquire+0xeb8/0x3a80
>>>> [  129.920603]  ? ret_from_fork+0x2c/0x50
>>>> [  129.920824]  ? __pfx___lock_acquire+0x10/0x10
>>>> [  129.921068]  ? unwind_next_frame.part.0+0x1ba/0x3c0
>>>> [  129.921343]  ? ret_from_fork+0x2c/0x50
>>>> [  129.921573]  ? __this_cpu_preempt_check+0x13/0x20
>>>> [  129.921847]  lock_acquire+0x194/0x480
>>>> [  129.922060]  ? ___slab_alloc+0x68/0x12e0
>>>> [  129.922293]  ? __pfx_lock_acquire+0x10/0x10
>>>> [  129.922529]  ? __pfx_mark_lock.part.0+0x10/0x10
>>>> [  129.922778]  ? __kasan_check_read+0x11/0x20
>>>> [  129.922998]  ___slab_alloc+0x9a/0x12e0
>>>> [  129.923222]  ? ___slab_alloc+0x68/0x12e0
>>>> [  129.923452]  ? __pfx_mark_lock.part.0+0x10/0x10
>>>> [  129.923706]  ? __kasan_check_read+0x11/0x20
>>>> [  129.923937]  ? fill_pool+0x22a/0x370
>>>> [  129.924161]  ? __lock_acquire+0xf5b/0x3a80
>>>> [  129.924387]  ? fill_pool+0x22a/0x370
>>>> [  129.924590]  __slab_alloc.constprop.0+0x5b/0x90
>>>> [  129.924832]  kmem_cache_alloc+0x296/0x3d0
>>>> [  129.925073]  ? fill_pool+0x22a/0x370
>>>> [  129.925291]  fill_pool+0x22a/0x370
>>>> [  129.925495]  ? __pfx_fill_pool+0x10/0x10
>>>> [  129.925718]  ? __pfx___lock_acquire+0x10/0x10
>>>> [  129.926034]  ? __kasan_check_read+0x11/0x20
>>>> [  129.926269]  ? check_chain_key+0x200/0x2b0
>>>> [  129.926503]  __debug_object_init+0x82/0x8c0
>>>> [  129.926734]  ? __pfx_lock_release+0x10/0x10
>>>> [  129.926984]  ? __pfx___debug_object_init+0x10/0x10
>>>> [  129.927249]  ? __kasan_check_read+0x11/0x20
>>>> [  129.927498]  ? do_raw_spin_unlock+0x9c/0x100
>>>> [  129.927758]  debug_object_activate+0x2d1/0x2f0
>>>> [  129.928022]  ? __pfx_debug_object_activate+0x10/0x10
>>>> [  129.928300]  ? __this_cpu_preempt_check+0x13/0x20
>>>> [  129.928583]  __call_rcu_common.constprop.0+0x94/0xeb0
>>>> [  129.928897]  ? __this_cpu_preempt_check+0x13/0x20
>>>> [  129.929186]  ? __pfx_rcu_work_rcufn+0x10/0x10
>>>> [  129.929459]  ? __pfx___call_rcu_common.constprop.0+0x10/0x10
>>>> [  129.929803]  ? __pfx_lock_acquired+0x10/0x10
>>>> [  129.930067]  ? __pfx_do_raw_spin_trylock+0x10/0x10
>>>> [  129.930363]  ? kfree_rcu_monitor+0x29f/0x810
>>>> [  129.930627]  call_rcu+0xe/0x20
>>>> [  129.930821]  queue_rcu_work+0x4f/0x60
>>>> [  129.931050]  kfree_rcu_monitor+0x5d3/0x810
>>>> [  129.931302]  ? __pfx_kfree_rcu_monitor+0x10/0x10
>>>> [  129.931587]  ? __this_cpu_preempt_check+0x13/0x20
>>>> [  129.931878]  process_one_work+0x607/0xba0
>>>> [  129.932129]  ? __pfx_process_one_work+0x10/0x10
>>>> [  129.932408]  ? worker_thread+0xd6/0x710
>>>> [  129.932653]  worker_thread+0x2d4/0x710
>>>> [  129.932888]  ? __pfx_worker_thread+0x10/0x10
>>>> [  129.933154]  kthread+0x18b/0x1c0
>>>> [  129.933363]  ? __pfx_kthread+0x10/0x10
>>>> [  129.933598]  ret_from_fork+0x2c/0x50
>>>> [  129.933825]  </TASK>
>>>>
>>>> Maybe no need to convert ->list_lock to raw_spinlock.
>>>>
>>>> --- a/lib/debugobjects.c
>>>> +++ b/lib/debugobjects.c
>>>> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct 
>>>> debug_obj_descr *descr, int onstack
>>>>          unsigned long flags;
>>>>
>>>>          /*
>>>> -        * On RT enabled kernels the pool refill must happen in 
>>>> preemptible
>>>> +        * The pool refill must happen in preemptible
>>>>           * context:
>>>>           */
>>>> -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>>>> +       if (preemptible())
>>>>                  fill_pool();
>>>
>>> +CC Peterz
>>>
>>> Aha so this is in fact another case where the code is written with
>>> actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
>>> CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?
>>
>> Maybe we should make CONFIG_PROVE_RAW_LOCK_NESTING depend on
>> CONFIG_PREEMPT_RT:
>
> I found a discussion [1] of why CONFIG_PROVE_RAW_LOCK_NESTING didn't
> depend on CONFIG_PREEMPT_RT before in the commit history:
>
> ```
> >>> We now always get a "Invalid wait context" warning with
> >>> CONFIG_PROVE_RAW_LOCK_NESTING=y, see the full warning below:
> >>>
> >>>        [    0.705900] =============================
> >>>        [    0.706002] [ BUG: Invalid wait context ]
> >>>        [    0.706180] 5.13.0+ #4 Not tainted
> >>>        [    0.706349] -----------------------------
> >> I believe the purpose of CONFIG_PROVE_RAW_LOCK_NESTING is experimental
> >> and it is turned off by default. Turning it on can cause problem as
> >> shown in your lockdep splat. Limiting it to just PREEMPT_RT will 
> defeat
> >> its purpose to find potential spinlock nesting problem in 
> non-PREEMPT_RT
> >> kernel.
> > As far as I know, a spinlock can nest another spinlock. In
> > non-PREEMPT_RT kernel
> > spin_lock and raw_spin_lock are same , so here acquiring a spin_lock 
> in hardirq
> > context is acceptable, the warning is not needed. My knowledge on this
> > is not enough,
> > Will dig into this.
> >
> >> The point is to fix the issue found,
> > Agree. I thought there was a spinlock usage issue, but by checking
> > deactivate_slab context,
> > looks like the spinlock usage is well. Maybe I'm missing something?
>
> Yes, spinlock and raw spinlock are the same in non-RT kernel. They are
> only different in RT kernel. However, non-RT kernel is also more heavily
> tested than the RT kernel counterpart. The purpose of this config option
> is to expose spinlock nesting problem in more areas of the code. If you
> look at the config help text of PROVE_RAW_LOCK_NESTING:
>
>          help
>           Enable the raw_spinlock vs. spinlock nesting checks which 
> ensure
>           that the lock nesting rules for PREEMPT_RT enabled kernels are
>           not violated.
>
>           NOTE: There are known nesting problems. So if you enable this
>           option expect lockdep splats until these problems have been 
> fully
>           addressed which is work in progress. This config switch 
> allows to
>           identify and analyze these problems. It will be removed and the
>           check permanentely enabled once the main issues have been 
> fixed.
>
>           If unsure, select N.
>
> So lockdep splat is expected. It will take time to address all the
> issues found.
> ```
>
> Also +Waiman Long.

I believe the purpose of not making PROVE_RAW_LOCK_NESTING depend on
PREEMPT_RT is to allow people to discover this kind of lock nesting
problem without enabling PREEMPT_RT.

Anyway, I don't think you can change list_lock to a raw spinlock. 
According to mm/slub.c:

  * Lock order:
  *   1. slab_mutex (Global Mutex)
  *   2. node->list_lock (Spinlock)
  *   3. kmem_cache->cpu_slab->lock (Local lock)
  *   4. slab_lock(slab) (Only on some arches)
  *   5. object_map_lock (Only for debugging)

For PREEMPT_RT, the local lock is a per-cpu spinlock (rt_mutex), so
list_lock has to remain a spinlock as well.
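
(A user-space sketch of that ordering argument — illustration only, not kernel
code. Only the first three levels of the quoted lock order are modeled, using
lockdep's wait-type values: raw_spinlock_t = 2, spinlock_t and local_lock = 3,
mutex = 4. The rule modeled is that a lock may only be taken if its wait type
does not exceed that of the strictest lock already held.)

#include <stdio.h>

enum wait_type { WAIT_SPIN = 2, WAIT_CONFIG = 3, WAIT_SLEEP = 4 };

struct lock {
        const char *name;
        enum wait_type type;
};

static void nest(const struct lock *outer, const struct lock *inner)
{
        printf("%-30s -> %-30s : %s\n", outer->name, inner->name,
               inner->type > outer->type ? "invalid wait context" : "ok");
}

int main(void)
{
        const struct lock slab_mutex = { "slab_mutex (mutex)",          WAIT_SLEEP  };
        const struct lock list_lock  = { "list_lock (spinlock_t)",      WAIT_CONFIG };
        const struct lock list_raw   = { "list_lock (raw_spinlock_t)",  WAIT_SPIN   };
        const struct lock cpu_lock   = { "cpu_slab->lock (local_lock)", WAIT_CONFIG };

        /* the current order is fine on both RT and !RT */
        nest(&slab_mutex, &list_lock);
        nest(&list_lock, &cpu_lock);

        /* with the proposed conversion, a lock that sleeps on RT would
         * nest inside a raw (never-sleeping) lock */
        nest(&list_raw, &cpu_lock);
        return 0;
}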

Cheers,
Longman



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12 12:47               ` Peter Zijlstra
@ 2023-04-12 16:44                 ` Qi Zheng
  2023-04-13  7:40                   ` Peter Zijlstra
  2023-04-13 14:49                   ` [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock Qi Zheng
  0 siblings, 2 replies; 23+ messages in thread
From: Qi Zheng @ 2023-04-12 16:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Vlastimil Babka, Zhang, Qiang1, Boqun Feng, Qi Zheng, 42.hyeyoo,
	akpm, roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl,
	linux-mm, linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney



On 2023/4/12 20:47, Peter Zijlstra wrote:
> On Wed, Apr 12, 2023 at 08:50:29AM +0200, Vlastimil Babka wrote:
> 
>>> --- a/lib/debugobjects.c
>>> +++ b/lib/debugobjects.c
>>> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>>>          unsigned long flags;
>>>
>>>          /*
>>> -        * On RT enabled kernels the pool refill must happen in preemptible
>>> +        * The pool refill must happen in preemptible
>>>           * context:
>>>           */
>>> -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>>> +       if (preemptible())
>>>                  fill_pool();
>>
>> +CC Peterz
>>
>> Aha so this is in fact another case where the code is written with
>> actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
>> CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?
> 
> Ooh, tricky, yes. PROVE_RAW_LOCK_NESTING always follows the PREEMP_RT
> rules and does not expect trickery like the above.
> 
> Something like the completely untested below might be of help..
> 
> ---
> diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
> index d22430840b53..f3120d6a7d9e 100644
> --- a/include/linux/lockdep_types.h
> +++ b/include/linux/lockdep_types.h
> @@ -33,6 +33,7 @@ enum lockdep_wait_type {
>   enum lockdep_lock_type {
>   	LD_LOCK_NORMAL = 0,	/* normal, catch all */
>   	LD_LOCK_PERCPU,		/* percpu */
> +	LD_LOCK_WAIT,		/* annotation */
>   	LD_LOCK_MAX,
>   };
>   
> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> index 50d4863974e7..a4077f5bb75b 100644
> --- a/kernel/locking/lockdep.c
> +++ b/kernel/locking/lockdep.c
> @@ -2279,8 +2279,9 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
>   	 * As a result, we will skip local_lock(), when we search for irq
>   	 * inversion bugs.
>   	 */
> -	if (entry->class->lock_type == LD_LOCK_PERCPU) {
> -		if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
> +	if (entry->class->lock_type != LD_LOCK_NORMAL) {
> +		if (entry->class->lock_type == LD_LOCK_PERCPU &&
> +		    DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
>   			return false;
>   
>   		return true;
> @@ -4752,7 +4753,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
>   
>   	for (; depth < curr->lockdep_depth; depth++) {
>   		struct held_lock *prev = curr->held_locks + depth;
> -		u8 prev_inner = hlock_class(prev)->wait_type_inner;
> +		struct lock_class *class = hlock_class(prev);
> +		u8 prev_inner = class->wait_type_inner;
>   
>   		if (prev_inner) {
>   			/*
> @@ -4762,6 +4764,12 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
>   			 * Also due to trylocks.
>   			 */
>   			curr_inner = min(curr_inner, prev_inner);
> +
> +			/*
> +			 * Allow override for annotations.
> +			 */
> +			if (unlikely(class->lock_type == LD_LOCK_WAIT))
> +				curr_inner = prev_inner;
>   		}
>   	}
>   
> diff --git a/lib/debugobjects.c b/lib/debugobjects.c
> index df86e649d8be..fae71ef72a16 100644
> --- a/lib/debugobjects.c
> +++ b/lib/debugobjects.c
> @@ -565,8 +565,16 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>   	 * On RT enabled kernels the pool refill must happen in preemptible
>   	 * context:
>   	 */
> -	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
> +		static lockdep_map dep_map = {

                 static struct lockdep_map dep_map = {

> +			.name = "wait-type-override",
> +			.wait_type_inner = LD_WAIT_SLEEP,
> +			.lock_type = LD_LOCK_WAIT,
> +		};
> +		lock_map_acquire(&dep_map);
>   		fill_pool();
> +		lock_map_release(&dep_map);
> +	}
>   
>   	db = get_bucket((unsigned long) addr);
>   

I just tested the above code, and then got the following
warning:

[    0.001000][    T0] =============================
[    0.001000][    T0] [ BUG: Invalid wait context ]
[    0.001000][    T0] 6.3.0-rc6-next-20230412+ #21 Not tainted
[    0.001000][    T0] -----------------------------
[    0.001000][    T0] swapper/0/0 is trying to lock:
[    0.001000][    T0] ffffffff825bcb80 
(wait-type-override){....}-{4:4}, at: __debug_object_init+0x0/0x590
[    0.001000][    T0] other info that might help us debug this:
[    0.001000][    T0] context-{5:5}
[    0.001000][    T0] 2 locks held by swapper/0/0:
[    0.001000][    T0]  #0: ffffffff824f5178 
(timekeeper_lock){....}-{2:2}, at: timekeeping_init+0xf1/0x270
[    0.001000][    T0]  #1: ffffffff824f5008 
(tk_core.seq.seqcount){....}-{0:0}, at: start_kernel+0x31a/0x800
[    0.001000][    T0] stack backtrace:
[    0.001000][    T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 
6.3.0-rc6-next-20230412+ #21
[    0.001000][    T0] Hardware name: QEMU Standard PC (i440FX + PIIX, 
1996), BIOS 1.14.0-2 04/01/2014
[    0.001000][    T0] Call Trace:
[    0.001000][    T0]  <TASK>
[    0.001000][    T0]  dump_stack_lvl+0x77/0xc0
[    0.001000][    T0]  __lock_acquire+0xa74/0x2960
[    0.001000][    T0]  ? save_trace+0x3f/0x320
[    0.001000][    T0]  ? add_lock_to_list+0x97/0x130
[    0.001000][    T0]  lock_acquire+0xe0/0x300
[    0.001000][    T0]  ? debug_object_active_state+0x180/0x180
[    0.001000][    T0]  __debug_object_init+0x47/0x590
[    0.001000][    T0]  ? debug_object_active_state+0x180/0x180
[    0.001000][    T0]  ? lock_acquire+0x100/0x300
[    0.001000][    T0]  hrtimer_init+0x23/0xc0
[    0.001000][    T0]  ntp_init+0x70/0x80
[    0.001000][    T0]  timekeeping_init+0x12c/0x270
[    0.001000][    T0]  ? start_kernel+0x31a/0x800
[    0.001000][    T0]  ? _printk+0x5c/0x80
[    0.001000][    T0]  start_kernel+0x31a/0x800
[    0.001000][    T0]  secondary_startup_64_no_verify+0xf4/0xfb
[    0.001000][    T0]  </TASK>

It seems that the LD_WAIT_SLEEP we set is already greater than the
LD_WAIT_SPIN of the current context.
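
For reference, the wait types are ordered values; at the time of this
thread include/linux/lockdep_types.h has roughly:

	enum lockdep_wait_type {
		LD_WAIT_INV = 0,	/* not checked, catch all */
		LD_WAIT_FREE,		/* wait free, rcu etc.. */
		LD_WAIT_SPIN,		/* spin loops, raw_spinlock_t etc.. */
	#ifdef CONFIG_PROVE_RAW_LOCK_NESTING
		LD_WAIT_CONFIG,		/* preemptible in PREEMPT_RT, spinlock_t etc.. */
	#else
		LD_WAIT_CONFIG = LD_WAIT_SPIN,
	#endif
		LD_WAIT_SLEEP,		/* sleeping locks, mutex_t etc.. */
		LD_WAIT_MAX,		/* must be last */
	};

With the raw timekeeper_lock held the allowed inner wait type is
LD_WAIT_SPIN, so acquiring the LD_WAIT_SLEEP annotation map is rejected
even though nothing actually sleeps there.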

-- 
Thanks,
Qi


* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12 13:09                   ` Waiman Long
@ 2023-04-12 16:47                     ` Qi Zheng
  0 siblings, 0 replies; 23+ messages in thread
From: Qi Zheng @ 2023-04-12 16:47 UTC (permalink / raw)
  To: Waiman Long
  Cc: 42.hyeyoo, akpm, roman.gushchin, iamjoonsoo.kim, rientjes,
	penberg, cl, linux-mm, linux-kernel, Zhao Gongyi,
	Sebastian Andrzej Siewior, Thomas Gleixner, RCU,
	Paul E . McKenney, Peter Zijlstra, Vlastimil Babka, Boqun Feng,
	Zhang, Qiang1



On 2023/4/12 21:09, Waiman Long wrote:
> On 4/12/23 04:32, Qi Zheng wrote:
>>
>>
>> On 2023/4/12 15:30, Qi Zheng wrote:
>>>
>>>
>>> On 2023/4/12 14:50, Vlastimil Babka wrote:
>>>>
>>>>
>>>> On 4/12/23 08:44, Zhang, Qiang1 wrote:
>>>>>>
>>>>>>
>>>>>> On 2023/4/11 22:19, Vlastimil Babka wrote:
>>>>>>> On 4/11/23 16:08, Qi Zheng wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 2023/4/11 21:40, Vlastimil Babka wrote:
>>>>>>>>> On 4/11/23 15:08, Qi Zheng wrote:
>>>>>>>>>> The list_lock can be held in the critical section of
>>>>>>>>>> raw_spinlock, and then lockdep will complain about it
>>>>>>>>>> like below:
>>>>>>>>>>
>>>>>>>>>>     =============================
>>>>>>>>>>     [ BUG: Invalid wait context ]
>>>>>>>>>>     6.3.0-rc6-next-20230411 #7 Not tainted
>>>>>>>>>>     -----------------------------
>>>>>>>>>>     swapper/0/1 is trying to lock:
>>>>>>>>>>     ffff888100055418 (&n->list_lock){....}-{3:3}, at: 
>>>>>>>>>> ___slab_alloc+0x73d/0x1330
>>>>>>>>>>     other info that might help us debug this:
>>>>>>>>>>     context-{5:5}
>>>>>>>>>>     2 locks held by swapper/0/1:
>>>>>>>>>>      #0: ffffffff824e8160 
>>>>>>>>>> (rcu_tasks.cbs_gbl_lock){....}-{2:2}, at: 
>>>>>>>>>> cblist_init_generic+0x22/0x2d0
>>>>>>>>>>      #1: ffff888136bede50 (&ACCESS_PRIVATE(rtpcp, 
>>>>>>>>>> lock)){....}-{2:2}, at: cblist_init_generic+0x232/0x2d0
>>>>>>>>>>     stack backtrace:
>>>>>>>>>>     CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
>>>>>>>>>> 6.3.0-rc6-next-20230411 #7
>>>>>>>>>>     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
>>>>>>>>>> BIOS 1.14.0-2 04/01/2014
>>>>>>>>>>     Call Trace:
>>>>>>>>>>      <TASK>
>>>>>>>>>>      dump_stack_lvl+0x77/0xc0
>>>>>>>>>>      __lock_acquire+0xa65/0x2950
>>>>>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>>>>>      ? arch_stack_walk+0x65/0xf0
>>>>>>>>>>      ? unwind_next_frame+0x602/0x8d0
>>>>>>>>>>      lock_acquire+0xe0/0x300
>>>>>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>>>>>      ? find_usage_forwards+0x39/0x50
>>>>>>>>>>      ? check_irq_usage+0x162/0xa70
>>>>>>>>>>      ? __bfs+0x10c/0x2c0
>>>>>>>>>>      _raw_spin_lock_irqsave+0x4f/0x90
>>>>>>>>>>      ? ___slab_alloc+0x73d/0x1330
>>>>>>>>>>      ___slab_alloc+0x73d/0x1330
>>>>>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>>>>>      ? look_up_lock_class+0x5d/0x160
>>>>>>>>>>      ? register_lock_class+0x48/0x500
>>>>>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>>>>>      ? fill_pool+0x16b/0x2a0
>>>>>>>>>>      kmem_cache_alloc+0x358/0x3b0
>>>>>>>>>>      ? __lock_acquire+0xabc/0x2950
>>>>>>>>>>      fill_pool+0x16b/0x2a0
>>>>>>>>>>      ? __debug_object_init+0x292/0x560
>>>>>>>>>>      ? lock_acquire+0xe0/0x300
>>>>>>>>>>      ? cblist_init_generic+0x232/0x2d0
>>>>>>>>>>      __debug_object_init+0x2c/0x560
>>>>>>
>>>>>> This "__debug_object_init" is because INIT_WORK() is called in
>>>>>> cblist_init_generic(), so..
>>>>>>
>>>>>>>>>> cblist_init_generic+0x147/0x2d0
>>>>>>>>>>      rcu_init_tasks_generic+0x15/0x190
>>>>>>>>>>      kernel_init_freeable+0x6e/0x3e0
>>>>>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>>>>>      kernel_init+0x1b/0x1d0
>>>>>>>>>>      ? rest_init+0x1e0/0x1e0
>>>>>>>>>>      ret_from_fork+0x1f/0x30
>>>>>>>>>>      </TASK>
>>>>>>>>>>
>>>>>>>>>> The fill_pool() can only be called in the !PREEMPT_RT kernel
>>>>>>>>>> or in the preemptible context of the PREEMPT_RT kernel, so
>>>>>>>>>> the above warning is not a real issue, but it's better to
>>>>>>>>>> annotate kmem_cache_node->list_lock as raw_spinlock to get
>>>>>>>>>> rid of such issue.
>>>>>>>>>
>>>>>>>>> + CC some RT and RCU people
>>>>>>>>
>>>>>>>> Thanks.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> AFAIK raw_spinlock is not just an annotation, but on RT it 
>>>>>>>>> changes the
>>>>>>>>> implementation from preemptible mutex to actual spin lock, so 
>>>>>>>>> it would be
>>>>>>>>
>>>>>>>> Yeah.
>>>>>>>>
>>>>>>>>> rather unfortunate to do that for a spurious warning. Can it be 
>>>>>>>>> somehow
>>>>>>>>> fixed in a better way?
>>>>>>
>>>>>> ... probably a better fix is to drop locks and call INIT_WORK(), 
>>>>>> or make
>>>>>> the cblist_init_generic() lockless (or part lockless), given it's 
>>>>>> just
>>>>>> initializing the cblist, it's probably doable. But I haven't taken a
>>>>>> careful look yet.
>>>>>>
>>>>>
>>>>>
>>>>> This is just one of the paths that triggers an invalid wait,  the 
>>>>> following paths can also trigger:
>>>>>
>>>>> [  129.914547] [ BUG: Invalid wait context ]
>>>>> [  129.914775] 6.3.0-rc1-yocto-standard+ #2 Not tainted
>>>>> [  129.915044] -----------------------------
>>>>> [  129.915272] kworker/2:0/28 is trying to lock:
>>>>> [  129.915516] ffff88815660f570 (&c->lock){-.-.}-{3:3}, at: 
>>>>> ___slab_alloc+0x68/0x12e0
>>>>> [  129.915967] other info that might help us debug this:
>>>>> [  129.916241] context-{5:5}
>>>>> [  129.916392] 3 locks held by kworker/2:0/28:
>>>>> [  129.916642]  #0: ffff888100084d48 
>>>>> ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x515/0xba0
>>>>> [  129.917145]  #1: ffff888100c17dd0 
>>>>> ((work_completion)(&(&krcp->monitor_work)->work)){+.+.}-{0:0}, at: 
>>>>> process_on0
>>>>> [  129.917758]  #2: ffff8881565f8508 (krc.lock){....}-{2:2}, at: 
>>>>> kfree_rcu_monitor+0x29f/0x810
>>>>> [  129.918207] stack backtrace:
>>>>> [  129.918374] CPU: 2 PID: 28 Comm: kworker/2:0 Not tainted 
>>>>> 6.3.0-rc1-yocto-standard+ #2
>>>>> [  129.918784] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), 
>>>>> BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.o4
>>>>> [  129.919397] Workqueue: events kfree_rcu_monitor
>>>>> [  129.919662] Call Trace:
>>>>> [  129.919812]  <TASK>
>>>>> [  129.919941]  dump_stack_lvl+0x64/0xb0
>>>>> [  129.920171]  dump_stack+0x10/0x20
>>>>> [  129.920372]  __lock_acquire+0xeb8/0x3a80
>>>>> [  129.920603]  ? ret_from_fork+0x2c/0x50
>>>>> [  129.920824]  ? __pfx___lock_acquire+0x10/0x10
>>>>> [  129.921068]  ? unwind_next_frame.part.0+0x1ba/0x3c0
>>>>> [  129.921343]  ? ret_from_fork+0x2c/0x50
>>>>> [  129.921573]  ? __this_cpu_preempt_check+0x13/0x20
>>>>> [  129.921847]  lock_acquire+0x194/0x480
>>>>> [  129.922060]  ? ___slab_alloc+0x68/0x12e0
>>>>> [  129.922293]  ? __pfx_lock_acquire+0x10/0x10
>>>>> [  129.922529]  ? __pfx_mark_lock.part.0+0x10/0x10
>>>>> [  129.922778]  ? __kasan_check_read+0x11/0x20
>>>>> [  129.922998]  ___slab_alloc+0x9a/0x12e0
>>>>> [  129.923222]  ? ___slab_alloc+0x68/0x12e0
>>>>> [  129.923452]  ? __pfx_mark_lock.part.0+0x10/0x10
>>>>> [  129.923706]  ? __kasan_check_read+0x11/0x20
>>>>> [  129.923937]  ? fill_pool+0x22a/0x370
>>>>> [  129.924161]  ? __lock_acquire+0xf5b/0x3a80
>>>>> [  129.924387]  ? fill_pool+0x22a/0x370
>>>>> [  129.924590]  __slab_alloc.constprop.0+0x5b/0x90
>>>>> [  129.924832]  kmem_cache_alloc+0x296/0x3d0
>>>>> [  129.925073]  ? fill_pool+0x22a/0x370
>>>>> [  129.925291]  fill_pool+0x22a/0x370
>>>>> [  129.925495]  ? __pfx_fill_pool+0x10/0x10
>>>>> [  129.925718]  ? __pfx___lock_acquire+0x10/0x10
>>>>> [  129.926034]  ? __kasan_check_read+0x11/0x20
>>>>> [  129.926269]  ? check_chain_key+0x200/0x2b0
>>>>> [  129.926503]  __debug_object_init+0x82/0x8c0
>>>>> [  129.926734]  ? __pfx_lock_release+0x10/0x10
>>>>> [  129.926984]  ? __pfx___debug_object_init+0x10/0x10
>>>>> [  129.927249]  ? __kasan_check_read+0x11/0x20
>>>>> [  129.927498]  ? do_raw_spin_unlock+0x9c/0x100
>>>>> [  129.927758]  debug_object_activate+0x2d1/0x2f0
>>>>> [  129.928022]  ? __pfx_debug_object_activate+0x10/0x10
>>>>> [  129.928300]  ? __this_cpu_preempt_check+0x13/0x20
>>>>> [  129.928583]  __call_rcu_common.constprop.0+0x94/0xeb0
>>>>> [  129.928897]  ? __this_cpu_preempt_check+0x13/0x20
>>>>> [  129.929186]  ? __pfx_rcu_work_rcufn+0x10/0x10
>>>>> [  129.929459]  ? __pfx___call_rcu_common.constprop.0+0x10/0x10
>>>>> [  129.929803]  ? __pfx_lock_acquired+0x10/0x10
>>>>> [  129.930067]  ? __pfx_do_raw_spin_trylock+0x10/0x10
>>>>> [  129.930363]  ? kfree_rcu_monitor+0x29f/0x810
>>>>> [  129.930627]  call_rcu+0xe/0x20
>>>>> [  129.930821]  queue_rcu_work+0x4f/0x60
>>>>> [  129.931050]  kfree_rcu_monitor+0x5d3/0x810
>>>>> [  129.931302]  ? __pfx_kfree_rcu_monitor+0x10/0x10
>>>>> [  129.931587]  ? __this_cpu_preempt_check+0x13/0x20
>>>>> [  129.931878]  process_one_work+0x607/0xba0
>>>>> [  129.932129]  ? __pfx_process_one_work+0x10/0x10
>>>>> [  129.932408]  ? worker_thread+0xd6/0x710
>>>>> [  129.932653]  worker_thread+0x2d4/0x710
>>>>> [  129.932888]  ? __pfx_worker_thread+0x10/0x10
>>>>> [  129.933154]  kthread+0x18b/0x1c0
>>>>> [  129.933363]  ? __pfx_kthread+0x10/0x10
>>>>> [  129.933598]  ret_from_fork+0x2c/0x50
>>>>> [  129.933825]  </TASK>
>>>>>
>>>>> Maybe no need to convert ->list_lock to raw_spinlock.
>>>>>
>>>>> --- a/lib/debugobjects.c
>>>>> +++ b/lib/debugobjects.c
>>>>> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct 
>>>>> debug_obj_descr *descr, int onstack
>>>>>          unsigned long flags;
>>>>>
>>>>>          /*
>>>>> -        * On RT enabled kernels the pool refill must happen in 
>>>>> preemptible
>>>>> +        * The pool refill must happen in preemptible
>>>>>           * context:
>>>>>           */
>>>>> -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>>>>> +       if (preemptible())
>>>>>                  fill_pool();
>>>>
>>>> +CC Peterz
>>>>
>>>> Aha so this is in fact another case where the code is written with
>>>> actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
>>>> CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?
>>>
>>> Maybe we should make CONFIG_PROVE_RAW_LOCK_NESTING depend on
>>> CONFIG_PREEMPT_RT:
>>
>> I found a discussion [1] of why CONFIG_PROVE_RAW_LOCK_NESTING didn't
>> depend on CONFIG_PREEMPT_RT before in the commit history:
>>
>> ```
>> >>> We now always get a "Invalid wait context" warning with
>> >>> CONFIG_PROVE_RAW_LOCK_NESTING=y, see the full warning below:
>> >>>
>> >>>        [    0.705900] =============================
>> >>>        [    0.706002] [ BUG: Invalid wait context ]
>> >>>        [    0.706180] 5.13.0+ #4 Not tainted
>> >>>        [    0.706349] -----------------------------
>> >> I believe the purpose of CONFIG_PROVE_RAW_LOCK_NESTING is experimental
>> >> and it is turned off by default. Turning it on can cause problem as
>> >> shown in your lockdep splat. Limiting it to just PREEMPT_RT will 
>> defeat
>> >> its purpose to find potential spinlock nesting problem in 
>> non-PREEMPT_RT
>> >> kernel.
>> > As far as I know, a spinlock can nest another spinlock. In
>> > non-PREEMPT_RT kernel
>> > spin_lock and raw_spin_lock are same , so here acquiring a spin_lock 
>> in hardirq
>> > context is acceptable, the warning is not needed. My knowledge on this
>> > is not enough,
>> > Will dig into this.
>> >
>> >> The point is to fix the issue found,
>> > Agree. I thought there was a spinlock usage issue, but by checking
>> > deactivate_slab context,
>> > looks like the spinlock usage is well. Maybe I'm missing something?
>>
>> Yes, spinlock and raw spinlock are the same in non-RT kernel. They are
>> only different in RT kernel. However, non-RT kernel is also more heavily
>> tested than the RT kernel counterpart. The purpose of this config option
>> is to expose spinlock nesting problem in more areas of the code. If you
>> look at the config help text of PROVE_RAW_LOCK_NESTING:
>>
>>          help
>>           Enable the raw_spinlock vs. spinlock nesting checks which 
>> ensure
>>           that the lock nesting rules for PREEMPT_RT enabled kernels are
>>           not violated.
>>
>>           NOTE: There are known nesting problems. So if you enable this
>>           option expect lockdep splats until these problems have been 
>> fully
>>           addressed which is work in progress. This config switch 
>> allows to
>>           identify and analyze these problems. It will be removed and the
>>           check permanentely enabled once the main issues have been 
>> fixed.
>>
>>           If unsure, select N.
>>
>> So lockdep splat is expected. It will take time to address all the
>> issues found.
>> ```
>>
>> Also +Waiman Long.
> 
> I believe the purpose of not making PROVE_RAW_LOCK_NESTING depending on 
> PREEMPT_RT is to allow people to discover this kind of nest locking 
> problem without enabling PREEMPT_RT.
> 
> Anyway, I don't think you can change list_lock to a raw spinlock. 
> According to mm/slub.c:
> 
>   * Lock order:
>   *   1. slab_mutex (Global Mutex)
>   *   2. node->list_lock (Spinlock)
>   *   3. kmem_cache->cpu_slab->lock (Local lock)
>   *   4. slab_lock(slab) (Only on some arches)
>   *   5. object_map_lock (Only for debugging)
> 
> For PREEMPT_RT, local lock is a per-cpu spinlock (rt_mutex). So 
> list_lock has to be spinlock also.

Got it. Thanks for such a detailed explanation!

> 
> Cheers,
> Longman
> 
> 

-- 
Thanks,
Qi


* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12  6:57             ` Qi Zheng
@ 2023-04-13  0:24               ` Joel Fernandes
  2023-04-13  1:55                 ` Steven Rostedt
  0 siblings, 1 reply; 23+ messages in thread
From: Joel Fernandes @ 2023-04-13  0:24 UTC (permalink / raw)
  To: Qi Zheng, Steven Rostedt
  Cc: Zhang, Qiang1, Boqun Feng, Vlastimil Babka, 42.hyeyoo, akpm,
	roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl, linux-mm,
	linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney

On Wed, Apr 12, 2023 at 2:57 AM Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>
>
>
> On 2023/4/12 14:44, Zhang, Qiang1 wrote:
[..]
> > Maybe no need to convert ->list_lock to raw_spinlock.
> >
> > --- a/lib/debugobjects.c
> > +++ b/lib/debugobjects.c
> > @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
> >          unsigned long flags;
> >
> >          /*
> > -        * On RT enabled kernels the pool refill must happen in preemptible
> > +        * The pool refill must happen in preemptible
> >           * context:
> >           */
> > -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
> > +       if (preemptible())
> >                  fill_pool();
> >
> >          db = get_bucket((unsigned long) addr);
>
> Ah, this does fix the warning I was encountered!

Actually fill_pool() should be safe to call on !CONFIG_PREEMPT_RT
kernels, as it uses GFP_ATOMIC; however, with the above change that
goes away just to satisfy a false-positive report. Now all
!preemptible() sections on !CONFIG_PREEMPT_RT kernels can no longer
call fill_pool(), right? So you will not end up filling the pool even
when it is safe to do so?
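
To make that concrete, a side-by-side sketch of the guard before and
after the proposed change:

	/* current: !RT kernels always refill, RT only when preemptible */
	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
		fill_pool();

	/* proposed: atomic contexts on !RT kernels no longer refill the pool */
	if (preemptible())
		fill_pool();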

I think it would be better to fix PROVE_LOCKING / CONFIG_PREEMPT_RT
instead of degrading !CONFIG_PREEMPT_RT just to satisfy a
false-positive report.

+Steven Rostedt as well.

thanks,

 - Joel


>
> >
> >
> >
> > Thanks
> > Zqiang
> >
> >>
> >>
> >> Regards,
> >> Boqun
> >>
> >>>>
> >>>> It's indeed unfortunate for the warning in the commit message. But
> >>>> functions like kmem_cache_alloc(GFP_ATOMIC) may indeed be called
> >>>> in the critical section of raw_spinlock or in the hardirq context, which
> >>>
> >>> Hmm, I thought they may not, actually.
> >>>
> >>>> will cause problem in the PREEMPT_RT kernel. So I still think it is
> >>>> reasonable to convert kmem_cache_node->list_lock to raw_spinlock type.
> >>>
> >>> It wouldn't be the complete solution anyway. Once we allow even a GFP_ATOMIC
> >>> slab allocation for such context, it means also page allocation can happen
> >>> to refill the slabs, so lockdep will eventually complain about zone->lock,
> >>> and who knows what else.
> >>
> >> Oh, indeed. :(
> >>
> >>>
> >>>> In addition, there are many fix patches for this kind of warning in the
> >>>> git log, so I also think there should be a general and better solution. :)
> >>>
> >>> Maybe, but given above, I doubt it's this one.
> >>>
> >>>>
> >>>>>
> >>>>
> >>>
> >>
> >> --
> >> Thanks,
> >> Qi
>
> --
> Thanks,
> Qi


* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-13  0:24               ` Joel Fernandes
@ 2023-04-13  1:55                 ` Steven Rostedt
  0 siblings, 0 replies; 23+ messages in thread
From: Steven Rostedt @ 2023-04-13  1:55 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Qi Zheng, Zhang, Qiang1, Boqun Feng, Vlastimil Babka, 42.hyeyoo,
	akpm, roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl,
	linux-mm, linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney

On Wed, 12 Apr 2023 20:24:19 -0400
Joel Fernandes <joel@joelfernandes.org> wrote:

> I think it would be better to fix PROVE_LOCKING / CONFIG_PREEMPT_RT
> instead of degrading !CONFIG_PREEMPT_RT just to satisfy a
> false-positive report.

I think getting Peter's patch working:

  https://lore.kernel.org/all/20230412124735.GE628377@hirez.programming.kicks-ass.net/

is the real solution.

-- Steve


* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12 16:44                 ` Qi Zheng
@ 2023-04-13  7:40                   ` Peter Zijlstra
  2023-04-25 15:03                     ` Peter Zijlstra
  2023-04-13 14:49                   ` [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock Qi Zheng
  1 sibling, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2023-04-13  7:40 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Vlastimil Babka, Zhang, Qiang1, Boqun Feng, 42.hyeyoo, akpm,
	roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl, linux-mm,
	linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney

On Thu, Apr 13, 2023 at 12:44:42AM +0800, Qi Zheng wrote:
> > Something like the completely untested below might be of help..

> I just tested the above code, and then got the following
> warning:
> 

> It seems that the LD_WAIT_SLEEP we set is already greater than the
> LD_WAIT_SPIN of the current context.

Yeah, I'm an idiot and got it wrong.. I'll try again later if I manage
to wake up today :-)


* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-12 16:44                 ` Qi Zheng
  2023-04-13  7:40                   ` Peter Zijlstra
@ 2023-04-13 14:49                   ` Qi Zheng
  1 sibling, 0 replies; 23+ messages in thread
From: Qi Zheng @ 2023-04-13 14:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Vlastimil Babka, Zhang, Qiang1, Boqun Feng, 42.hyeyoo, akpm,
	roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl, linux-mm,
	linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney



On 2023/4/13 00:44, Qi Zheng wrote:
> 
> 
> On 2023/4/12 20:47, Peter Zijlstra wrote:
>> On Wed, Apr 12, 2023 at 08:50:29AM +0200, Vlastimil Babka wrote:
>>
>>>> --- a/lib/debugobjects.c
>>>> +++ b/lib/debugobjects.c
>>>> @@ -562,10 +562,10 @@ __debug_object_init(void *addr, const struct 
>>>> debug_obj_descr *descr, int onstack
>>>>          unsigned long flags;
>>>>
>>>>          /*
>>>> -        * On RT enabled kernels the pool refill must happen in 
>>>> preemptible
>>>> +        * The pool refill must happen in preemptible
>>>>           * context:
>>>>           */
>>>> -       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>>>> +       if (preemptible())
>>>>                  fill_pool();
>>>
>>> +CC Peterz
>>>
>>> Aha so this is in fact another case where the code is written with
>>> actual differences between PREEMPT_RT and !PREEMPT_RT in mind, but
>>> CONFIG_PROVE_RAW_LOCK_NESTING always assumes PREEMPT_RT?
>>
>> Ooh, tricky, yes. PROVE_RAW_LOCK_NESTING always follows the PREEMP_RT
>> rules and does not expect trickery like the above.
>>
>> Something like the completely untested below might be of help..
>>
>> ---
>> diff --git a/include/linux/lockdep_types.h 
>> b/include/linux/lockdep_types.h
>> index d22430840b53..f3120d6a7d9e 100644
>> --- a/include/linux/lockdep_types.h
>> +++ b/include/linux/lockdep_types.h
>> @@ -33,6 +33,7 @@ enum lockdep_wait_type {
>>   enum lockdep_lock_type {
>>       LD_LOCK_NORMAL = 0,    /* normal, catch all */
>>       LD_LOCK_PERCPU,        /* percpu */
>> +    LD_LOCK_WAIT,        /* annotation */
>>       LD_LOCK_MAX,
>>   };
>> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
>> index 50d4863974e7..a4077f5bb75b 100644
>> --- a/kernel/locking/lockdep.c
>> +++ b/kernel/locking/lockdep.c
>> @@ -2279,8 +2279,9 @@ static inline bool usage_skip(struct lock_list 
>> *entry, void *mask)
>>        * As a result, we will skip local_lock(), when we search for irq
>>        * inversion bugs.
>>        */
>> -    if (entry->class->lock_type == LD_LOCK_PERCPU) {
>> -        if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < 
>> LD_WAIT_CONFIG))
>> +    if (entry->class->lock_type != LD_LOCK_NORMAL) {
>> +        if (entry->class->lock_type == LD_LOCK_PERCPU &&
>> +            DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < 
>> LD_WAIT_CONFIG))
>>               return false;
>>           return true;
>> @@ -4752,7 +4753,8 @@ static int check_wait_context(struct task_struct 
>> *curr, struct held_lock *next)
>>       for (; depth < curr->lockdep_depth; depth++) {
>>           struct held_lock *prev = curr->held_locks + depth;
>> -        u8 prev_inner = hlock_class(prev)->wait_type_inner;
>> +        struct lock_class *class = hlock_class(prev);
>> +        u8 prev_inner = class->wait_type_inner;
>>           if (prev_inner) {
>>               /*
>> @@ -4762,6 +4764,12 @@ static int check_wait_context(struct 
>> task_struct *curr, struct held_lock *next)
>>                * Also due to trylocks.
>>                */
>>               curr_inner = min(curr_inner, prev_inner);
>> +
>> +            /*
>> +             * Allow override for annotations.
>> +             */
>> +            if (unlikely(class->lock_type == LD_LOCK_WAIT))
>> +                curr_inner = prev_inner;
>>           }
>>       }
>> diff --git a/lib/debugobjects.c b/lib/debugobjects.c
>> index df86e649d8be..fae71ef72a16 100644
>> --- a/lib/debugobjects.c
>> +++ b/lib/debugobjects.c
>> @@ -565,8 +565,16 @@ __debug_object_init(void *addr, const struct 
>> debug_obj_descr *descr, int onstack
>>        * On RT enabled kernels the pool refill must happen in preemptible
>>        * context:
>>        */
>> -    if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
>> +    if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
>> +        static lockdep_map dep_map = {
> 
>                  static struct lockdep_map dep_map = {
> 
>> +            .name = "wait-type-override",
>> +            .wait_type_inner = LD_WAIT_SLEEP,
>> +            .lock_type = LD_LOCK_WAIT,
>> +        };
>> +        lock_map_acquire(&dep_map);
>>           fill_pool();
>> +        lock_map_release(&dep_map);
>> +    }
>>       db = get_bucket((unsigned long) addr);
> 
> I just tested the above code, and then got the following
> warning:
> 

> 
> It seems that the LD_WAIT_SLEEP we set is already greater than the
> LD_WAIT_SPIN of the current context.
> 

Can we do something like below? This solves the warning I encountered.

diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index d22430840b53..f3120d6a7d9e 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -33,6 +33,7 @@ enum lockdep_wait_type {
  enum lockdep_lock_type {
         LD_LOCK_NORMAL = 0,     /* normal, catch all */
         LD_LOCK_PERCPU,         /* percpu */
+       LD_LOCK_WAIT,           /* annotation */
         LD_LOCK_MAX,
  };

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dcd1d5bfc1e0..6859dba8a3aa 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2289,8 +2289,9 @@ static inline bool usage_skip(struct lock_list 
*entry, void *mask)
          * As a result, we will skip local_lock(), when we search for irq
          * inversion bugs.
          */
-       if (entry->class->lock_type == LD_LOCK_PERCPU) {
-               if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < 
LD_WAIT_CONFIG))
+       if (entry->class->lock_type != LD_LOCK_NORMAL) {
+               if (entry->class->lock_type == LD_LOCK_PERCPU &&
+                   DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < 
LD_WAIT_CONFIG))
                         return false;

                 return true;
@@ -3981,6 +3982,9 @@ static inline int
  valid_state(struct task_struct *curr, struct held_lock *this,
             enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
  {
+       if (unlikely(hlock_class(this)->lock_type == LD_LOCK_WAIT))
+               return 1;
+
         if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
                 graph_unlock();
                 print_usage_bug(curr, this, bad_bit, new_bit);
@@ -4768,7 +4772,8 @@ static int check_wait_context(struct task_struct 
*curr, struct held_lock *next)

         for (; depth < curr->lockdep_depth; depth++) {
                 struct held_lock *prev = curr->held_locks + depth;
-               u8 prev_inner = hlock_class(prev)->wait_type_inner;
+               struct lock_class *class = hlock_class(prev);
+               u8 prev_inner = class->wait_type_inner;

                 if (prev_inner) {
                         /*
@@ -4778,9 +4783,19 @@ static int check_wait_context(struct task_struct 
*curr, struct held_lock *next)
                          * Also due to trylocks.
                          */
                         curr_inner = min(curr_inner, prev_inner);
+
+                       /*
+                        * Allow override for annotations.
+                        */
+                       if (unlikely(class->lock_type == LD_LOCK_WAIT))
+                               curr_inner = prev_inner;
                 }
         }

+       if (unlikely(hlock_class(next)->lock_type == LD_LOCK_WAIT &&
+                    curr_inner == LD_WAIT_SPIN))
+               curr_inner = LD_WAIT_CONFIG;
+
         if (next_outer > curr_inner)
                 return print_lock_invalid_wait_context(curr, next);

diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index df86e649d8be..a8a69991b0d0 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -565,8 +565,16 @@ __debug_object_init(void *addr, const struct 
debug_obj_descr *descr, int onstack
          * On RT enabled kernels the pool refill must happen in preemptible
          * context:
          */
-       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
+       if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+               static struct lockdep_map dep_map = {
+                       .name = "wait-type-override",
+                       .wait_type_inner = LD_WAIT_CONFIG,
+                       .lock_type = LD_LOCK_WAIT,
+               };
+               lock_map_acquire(&dep_map);
                 fill_pool();
+               lock_map_release(&dep_map);
+       }

         db = get_bucket((unsigned long) addr);

-- 
Thanks,
Qi


* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-13  7:40                   ` Peter Zijlstra
@ 2023-04-25 15:03                     ` Peter Zijlstra
  2023-04-25 15:51                       ` Qi Zheng
  0 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2023-04-25 15:03 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Vlastimil Babka, Zhang, Qiang1, Boqun Feng, 42.hyeyoo, akpm,
	roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl, linux-mm,
	linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney

On Thu, Apr 13, 2023 at 09:40:13AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 13, 2023 at 12:44:42AM +0800, Qi Zheng wrote:
> > > Something like the completely untested below might be of help..
> 
> > I just tested the above code, and then got the following
> > warning:
> > 
> 
> > It seems that the LD_WAIT_SLEEP we set is already greater than the
> > LD_WAIT_SPIN of the current context.
> 
> Yeah, I'm an idiot and got it wrong.. I'll try again later if I manage
> to wake up today :-)

And then I forgot of course :/ Can you give the below (still mostly
untested) a spin? The crucial difference is the new
lock_map_acquire_try(). By making the annotation a 'trylock', it will
skip the acquire of the annotation itself (since trylocks don't block).
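
For reference, the reason the trylock form sidesteps the check is the
early exit at the top of check_wait_context(), which is roughly:

	static int check_wait_context(struct task_struct *curr, struct held_lock *next)
	{
		u8 next_inner = hlock_class(next)->wait_type_inner;
		u8 next_outer = hlock_class(next)->wait_type_outer;
		u8 curr_inner;
		int depth;

		if (!next_inner || next->trylock)
			return 0;
		...

So the annotation map itself is never checked against the current
context, while it still overrides curr_inner for everything acquired
while it is held.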

---
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 1023f349af71..435a3b0f8ea6 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -551,6 +551,7 @@ do {									\
 #define rwsem_release(l, i)			lock_release(l, i)
 
 #define lock_map_acquire(l)			lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
+#define lock_map_acquire_try(l)			lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
 #define lock_map_acquire_read(l)		lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
 #define lock_map_acquire_tryread(l)		lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
 #define lock_map_release(l)			lock_release(l, _THIS_IP_)
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index d22430840b53..f3120d6a7d9e 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -33,6 +33,7 @@ enum lockdep_wait_type {
 enum lockdep_lock_type {
 	LD_LOCK_NORMAL = 0,	/* normal, catch all */
 	LD_LOCK_PERCPU,		/* percpu */
+	LD_LOCK_WAIT,		/* annotation */
 	LD_LOCK_MAX,
 };
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 50d4863974e7..d254f9e53c0e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2253,6 +2253,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
 
 static inline bool usage_skip(struct lock_list *entry, void *mask)
 {
+	if (entry->class->lock_type == LD_LOCK_NORMAL)
+		return false;
+
 	/*
 	 * Skip local_lock() for irq inversion detection.
 	 *
@@ -2279,14 +2282,11 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
 	 * As a result, we will skip local_lock(), when we search for irq
 	 * inversion bugs.
 	 */
-	if (entry->class->lock_type == LD_LOCK_PERCPU) {
-		if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
-			return false;
-
-		return true;
-	}
+	if (entry->class->lock_type == LD_LOCK_PERCPU &&
+	    DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+		return false;
 
-	return false;
+	return true;
 }
 
 /*
@@ -4752,7 +4752,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
 
 	for (; depth < curr->lockdep_depth; depth++) {
 		struct held_lock *prev = curr->held_locks + depth;
-		u8 prev_inner = hlock_class(prev)->wait_type_inner;
+		struct lock_class *class = hlock_class(prev);
+		u8 prev_inner = class->wait_type_inner;
 
 		if (prev_inner) {
 			/*
@@ -4762,6 +4763,12 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
 			 * Also due to trylocks.
 			 */
 			curr_inner = min(curr_inner, prev_inner);
+
+			/*
+			 * Allow override for annotations.
+			 */
+			if (unlikely(class->lock_type == LD_LOCK_WAIT))
+				curr_inner = prev_inner;
 		}
 	}
 
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index df86e649d8be..0e089882146b 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -565,8 +565,16 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
 	 * On RT enabled kernels the pool refill must happen in preemptible
 	 * context:
 	 */
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+		static struct lockdep_map dep_map = {
+			.name = "wait-type-override",
+			.wait_type_inner = LD_WAIT_SLEEP,
+			.lock_type = LD_LOCK_WAIT,
+		};
+		lock_map_acquire_try(&dep_map);
 		fill_pool();
+		lock_map_release(&dep_map);
+	}
 
 	db = get_bucket((unsigned long) addr);
 


* Re: [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock
  2023-04-25 15:03                     ` Peter Zijlstra
@ 2023-04-25 15:51                       ` Qi Zheng
  2023-04-29 10:06                         ` [PATCH] debugobjects,locking: Annotate __debug_object_init() wait type violation Peter Zijlstra
  0 siblings, 1 reply; 23+ messages in thread
From: Qi Zheng @ 2023-04-25 15:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Vlastimil Babka, Zhang, Qiang1, Boqun Feng, 42.hyeyoo, akpm,
	roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl, linux-mm,
	linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney



On 2023/4/25 23:03, Peter Zijlstra wrote:
> On Thu, Apr 13, 2023 at 09:40:13AM +0200, Peter Zijlstra wrote:
>> On Thu, Apr 13, 2023 at 12:44:42AM +0800, Qi Zheng wrote:
>>>> Something like the completely untested below might be of help..
>>
>>> I just tested the above code, and then got the following
>>> warning:
>>>
>>
>>> It seems that the LD_WAIT_SLEEP we set is already greater than the
>>> LD_WAIT_SPIN of the current context.
>>
>> Yeah, I'm an idiot and got it wrong.. I'll try again later if I manage
>> to wake up today :-)
> 
> And then I forgot ofcourse :/ Can you give the below (still mostly
> untested) a spin? The crucial difference is the new
> lock_map_acquire_try(). By making the annotation a 'trylock' it will
> skip the acquire of the annotation itself (since trylocks don't block).

Oh, 'trylock' is a good idea. I just tested the following code and
it can resolve the warning I encountered. :)

Thanks,
Qi

> 
> ---
> diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
> index 1023f349af71..435a3b0f8ea6 100644
> --- a/include/linux/lockdep.h
> +++ b/include/linux/lockdep.h
> @@ -551,6 +551,7 @@ do {									\
>   #define rwsem_release(l, i)			lock_release(l, i)
>   
>   #define lock_map_acquire(l)			lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
> +#define lock_map_acquire_try(l)			lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
>   #define lock_map_acquire_read(l)		lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
>   #define lock_map_acquire_tryread(l)		lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
>   #define lock_map_release(l)			lock_release(l, _THIS_IP_)
> diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
> index d22430840b53..f3120d6a7d9e 100644
> --- a/include/linux/lockdep_types.h
> +++ b/include/linux/lockdep_types.h
> @@ -33,6 +33,7 @@ enum lockdep_wait_type {
>   enum lockdep_lock_type {
>   	LD_LOCK_NORMAL = 0,	/* normal, catch all */
>   	LD_LOCK_PERCPU,		/* percpu */
> +	LD_LOCK_WAIT,		/* annotation */
>   	LD_LOCK_MAX,
>   };
>   
> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> index 50d4863974e7..d254f9e53c0e 100644
> --- a/kernel/locking/lockdep.c
> +++ b/kernel/locking/lockdep.c
> @@ -2253,6 +2253,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask)
>   
>   static inline bool usage_skip(struct lock_list *entry, void *mask)
>   {
> +	if (entry->class->lock_type == LD_LOCK_NORMAL)
> +		return false;
> +
>   	/*
>   	 * Skip local_lock() for irq inversion detection.
>   	 *
> @@ -2279,14 +2282,11 @@ static inline bool usage_skip(struct lock_list *entry, void *mask)
>   	 * As a result, we will skip local_lock(), when we search for irq
>   	 * inversion bugs.
>   	 */
> -	if (entry->class->lock_type == LD_LOCK_PERCPU) {
> -		if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
> -			return false;
> -
> -		return true;
> -	}
> +	if (entry->class->lock_type == LD_LOCK_PERCPU &&
> +	    DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
> +		return false;
>   
> -	return false;
> +	return true;
>   }
>   
>   /*
> @@ -4752,7 +4752,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
>   
>   	for (; depth < curr->lockdep_depth; depth++) {
>   		struct held_lock *prev = curr->held_locks + depth;
> -		u8 prev_inner = hlock_class(prev)->wait_type_inner;
> +		struct lock_class *class = hlock_class(prev);
> +		u8 prev_inner = class->wait_type_inner;
>   
>   		if (prev_inner) {
>   			/*
> @@ -4762,6 +4763,12 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
>   			 * Also due to trylocks.
>   			 */
>   			curr_inner = min(curr_inner, prev_inner);
> +
> +			/*
> +			 * Allow override for annotations.
> +			 */
> +			if (unlikely(class->lock_type == LD_LOCK_WAIT))
> +				curr_inner = prev_inner;
>   		}
>   	}
>   
> diff --git a/lib/debugobjects.c b/lib/debugobjects.c
> index df86e649d8be..0e089882146b 100644
> --- a/lib/debugobjects.c
> +++ b/lib/debugobjects.c
> @@ -565,8 +565,16 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack
>   	 * On RT enabled kernels the pool refill must happen in preemptible
>   	 * context:
>   	 */
> -	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
> +		static struct lockdep_map dep_map = {
> +			.name = "wait-type-override",
> +			.wait_type_inner = LD_WAIT_SLEEP,
> +			.lock_type = LD_LOCK_WAIT,
> +		};
> +		lock_map_acquire_try(&dep_map);
>   		fill_pool();
> +		lock_map_release(&dep_map);
> +	}
>   
>   	db = get_bucket((unsigned long) addr);
>   

-- 
Thanks,
Qi


* [PATCH] debugobjects,locking: Annotate __debug_object_init() wait type violation
  2023-04-25 15:51                       ` Qi Zheng
@ 2023-04-29 10:06                         ` Peter Zijlstra
  0 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2023-04-29 10:06 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Vlastimil Babka, Zhang, Qiang1, Boqun Feng, 42.hyeyoo, akpm,
	roman.gushchin, iamjoonsoo.kim, rientjes, penberg, cl, linux-mm,
	linux-kernel, Zhao Gongyi, Sebastian Andrzej Siewior,
	Thomas Gleixner, RCU, Paul E . McKenney

On Tue, Apr 25, 2023 at 11:51:05PM +0800, Qi Zheng wrote:
> I just tested the following code and
> it can resolve the warning I encountered. :)

Excellent; I've spruced up the code with a few comments and helpers
(as suggested by Vlastimil on IRC) and will queue the below commit.

---
Subject: debugobjects,locking: Annotate __debug_object_init() wait type violation
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 Apr 2023 17:03:13 +0200

There is an explicit wait-type violation in __debug_object_init() for
PREEMPT_RT=n kernels which allows them to more easily fill the object
pool and reduce the chance of allocation failures.

Lockdep's wait-type checks are designed to check the PREEMPT_RT
locking rules even for PREEMPT_RT=n kernels and object to this, so
create a lockdep annotation to allow this to stand.

Specifically, create a 'lock' type that overrides the inner wait-type
while it is held -- allowing one to temporarily raise it, such that
the violation is hidden.

Reported-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/lockdep.h       |   14 ++++++++++++++
 include/linux/lockdep_types.h |    1 +
 kernel/locking/lockdep.c      |   28 +++++++++++++++++++++-------
 lib/debugobjects.c            |   15 +++++++++++++--
 4 files changed, 49 insertions(+), 9 deletions(-)

--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -339,6 +339,16 @@ extern void lock_unpin_lock(struct lockd
 #define lockdep_repin_lock(l,c)	lock_repin_lock(&(l)->dep_map, (c))
 #define lockdep_unpin_lock(l,c)	lock_unpin_lock(&(l)->dep_map, (c))
 
+/*
+ * Must use lock_map_acquire_try() with override maps to avoid
+ * lockdep thinking they participate in the block chain.
+ */
+#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)	\
+	struct lockdep_map _name = {			\
+		.name = #_name "-wait-type-override",	\
+		.wait_type_inner = _wait_type,		\
+		.lock_type = LD_LOCK_WAIT_OVERRIDE, }
+
 #else /* !CONFIG_LOCKDEP */
 
 static inline void lockdep_init_task(struct task_struct *task)
@@ -427,6 +437,9 @@ extern int lockdep_is_held(const void *)
 #define lockdep_repin_lock(l, c)		do { (void)(l); (void)(c); } while (0)
 #define lockdep_unpin_lock(l, c)		do { (void)(l); (void)(c); } while (0)
 
+#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)	\
+	struct lockdep_map __maybe_unused _name = {}
+
 #endif /* !LOCKDEP */
 
 enum xhlock_context_t {
@@ -551,6 +564,7 @@ do {									\
 #define rwsem_release(l, i)			lock_release(l, i)
 
 #define lock_map_acquire(l)			lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
+#define lock_map_acquire_try(l)			lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
 #define lock_map_acquire_read(l)		lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
 #define lock_map_acquire_tryread(l)		lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
 #define lock_map_release(l)			lock_release(l, _THIS_IP_)
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -33,6 +33,7 @@ enum lockdep_wait_type {
 enum lockdep_lock_type {
 	LD_LOCK_NORMAL = 0,	/* normal, catch all */
 	LD_LOCK_PERCPU,		/* percpu */
+	LD_LOCK_WAIT_OVERRIDE,	/* annotation */
 	LD_LOCK_MAX,
 };
 
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2253,6 +2253,9 @@ static inline bool usage_match(struct lo
 
 static inline bool usage_skip(struct lock_list *entry, void *mask)
 {
+	if (entry->class->lock_type == LD_LOCK_NORMAL)
+		return false;
+
 	/*
 	 * Skip local_lock() for irq inversion detection.
 	 *
@@ -2279,14 +2282,16 @@ static inline bool usage_skip(struct loc
 	 * As a result, we will skip local_lock(), when we search for irq
 	 * inversion bugs.
 	 */
-	if (entry->class->lock_type == LD_LOCK_PERCPU) {
-		if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
-			return false;
+	if (entry->class->lock_type == LD_LOCK_PERCPU &&
+	    DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+		return false;
 
-		return true;
-	}
+	/*
+	 * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+	 * a lock and only used to override the wait_type.
+	 */
 
-	return false;
+	return true;
 }
 
 /*
@@ -4752,7 +4757,8 @@ static int check_wait_context(struct tas
 
 	for (; depth < curr->lockdep_depth; depth++) {
 		struct held_lock *prev = curr->held_locks + depth;
-		u8 prev_inner = hlock_class(prev)->wait_type_inner;
+		struct lock_class *class = hlock_class(prev);
+		u8 prev_inner = class->wait_type_inner;
 
 		if (prev_inner) {
 			/*
@@ -4762,6 +4768,14 @@ static int check_wait_context(struct tas
 			 * Also due to trylocks.
 			 */
 			curr_inner = min(curr_inner, prev_inner);
+
+			/*
+			 * Allow override for annotations -- this is typically
+			 * only valid/needed for code that only exists when
+			 * CONFIG_PREEMPT_RT=n.
+			 */
+			if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+				curr_inner = prev_inner;
 		}
 	}
 
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -563,10 +563,21 @@ __debug_object_init(void *addr, const st
 
 	/*
 	 * On RT enabled kernels the pool refill must happen in preemptible
-	 * context:
+	 * context -- for !RT kernels we rely on the fact that spinlock_t and
+	 * raw_spinlock_t are basically the same type and this lock-type
+	 * inversion works just fine.
 	 */
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+		/*
+		 * Annotate away the spinlock_t inside raw_spinlock_t warning
+		 * by temporarily raising the wait-type to WAIT_SLEEP, matching
+		 * the preemptible() condition above.
+		 */
+		static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP);
+		lock_map_acquire_try(&fill_pool_map);
 		fill_pool();
+		lock_map_release(&fill_pool_map);
+	}
 
 	db = get_bucket((unsigned long) addr);
 

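For anyone who wants to reuse the annotation elsewhere, the pattern is
the one visible in the debugobjects hunk above; a minimal sketch with a
made-up map name:

	/* static storage: lockdep uses the map itself as the class key */
	static DEFINE_WAIT_OVERRIDE_MAP(my_fixup_map, LD_WAIT_SLEEP);

	lock_map_acquire_try(&my_fixup_map);	/* trylock: the map itself is not checked */
	/* code that intentionally breaks the RT wait-type rules on !RT only */
	lock_map_release(&my_fixup_map);

Note that it has to be lock_map_acquire_try(); a plain lock_map_acquire()
of an LD_WAIT_SLEEP map from a raw-spinlock context would itself trigger
the invalid wait context splat, which is what the first version of this
annotation ran into earlier in the thread.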

end of thread

Thread overview: 23+ messages
     [not found] <20230411130854.46795-1-zhengqi.arch@bytedance.com>
2023-04-11 13:40 ` [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock Vlastimil Babka
2023-04-11 14:08   ` Qi Zheng
2023-04-11 14:19     ` Vlastimil Babka
2023-04-11 14:25       ` Qi Zheng
2023-04-12  5:51         ` Boqun Feng
2023-04-12  6:44           ` Zhang, Qiang1
2023-04-12  6:50             ` Vlastimil Babka
2023-04-12  7:30               ` Qi Zheng
2023-04-12  8:32                 ` Qi Zheng
2023-04-12 13:09                   ` Waiman Long
2023-04-12 16:47                     ` Qi Zheng
2023-04-12 12:48                 ` Peter Zijlstra
2023-04-12 12:47               ` Peter Zijlstra
2023-04-12 16:44                 ` Qi Zheng
2023-04-13  7:40                   ` Peter Zijlstra
2023-04-25 15:03                     ` Peter Zijlstra
2023-04-25 15:51                       ` Qi Zheng
2023-04-29 10:06                         ` [PATCH] debugobjects,locking: Annotate __debug_object_init() wait type violation Peter Zijlstra
2023-04-13 14:49                   ` [PATCH] mm: slub: annotate kmem_cache_node->list_lock as raw_spinlock Qi Zheng
2023-04-12  6:57             ` Qi Zheng
2023-04-13  0:24               ` Joel Fernandes
2023-04-13  1:55                 ` Steven Rostedt
2023-04-12  6:45           ` Qi Zheng
