linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] debugobjects: Move printk out of db lock critical sections
@ 2018-12-12 22:28 Waiman Long
  2018-12-12 23:39 ` Andrew Morton
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Waiman Long @ 2018-12-12 22:28 UTC (permalink / raw)
  To: Thomas Gleixner, Andrew Morton
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Yang Shi,
	Arnd Bergmann, Sergey Senozhatsky, Dmitry Safonov, Waiman Long

The db->lock is a raw spinlock and so the lock hold time is supposed
to be short. This will not be the case when printk() is being involved
in some of the critical sections. In order to avoid the long hold time,
in case some messages need to be printed, the debug_object_is_on_stack()
and debug_print_object() calls are now moved out of those critical
sections.

Holding the db->lock while calling printk() may lead to deadlock if
printk() somehow requires the allocation/freeing of debug object that
happens to be in the same hash bucket or a circular lock dependency
warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.

[   87.209665] WARNING: possible circular locking dependency detected
[   87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G        W
[   87.211449] ------------------------------------------------------
[   87.212405] getty/519 is trying to acquire lock:
[   87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
[   87.214343]
[   87.214343] but task is already holding lock:
[   87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
[   87.216260]
[   87.216260] which lock already depends on the new lock.

This patch was also found to be able to fix a boot hanging problem
when the initramfs image was switched on after a debugobjects splat
from the EFI code.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 lib/debugobjects.c | 61 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 19 deletions(-)

diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 14afeeb..c30e786 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -375,6 +375,8 @@ static void debug_object_is_on_stack(void *addr, int onstack)
 	struct debug_bucket *db;
 	struct debug_obj *obj;
 	unsigned long flags;
+	bool debug_printobj = false;
+	bool debug_chkstack = false;
 
 	fill_pool();
 
@@ -391,7 +393,7 @@ static void debug_object_is_on_stack(void *addr, int onstack)
 			debug_objects_oom();
 			return;
 		}
-		debug_object_is_on_stack(addr, onstack);
+		debug_chkstack = true;
 	}
 
 	switch (obj->state) {
@@ -402,20 +404,25 @@ static void debug_object_is_on_stack(void *addr, int onstack)
 		break;
 
 	case ODEBUG_STATE_ACTIVE:
-		debug_print_object(obj, "init");
 		state = obj->state;
 		raw_spin_unlock_irqrestore(&db->lock, flags);
+		debug_print_object(obj, "init");
 		debug_object_fixup(descr->fixup_init, addr, state);
 		return;
 
 	case ODEBUG_STATE_DESTROYED:
-		debug_print_object(obj, "init");
+		debug_printobj = true;
 		break;
 	default:
 		break;
 	}
 
 	raw_spin_unlock_irqrestore(&db->lock, flags);
+	if (debug_chkstack)
+		debug_object_is_on_stack(addr, onstack);
+	if (debug_printobj)
+		debug_print_object(obj, "init");
+
 }
 
 /**
@@ -473,6 +480,8 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
 
 	obj = lookup_object(addr, db);
 	if (obj) {
+		bool debug_printobj = false;
+
 		switch (obj->state) {
 		case ODEBUG_STATE_INIT:
 		case ODEBUG_STATE_INACTIVE:
@@ -481,14 +490,14 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
 			break;
 
 		case ODEBUG_STATE_ACTIVE:
-			debug_print_object(obj, "activate");
 			state = obj->state;
 			raw_spin_unlock_irqrestore(&db->lock, flags);
+			debug_print_object(obj, "activate");
 			ret = debug_object_fixup(descr->fixup_activate, addr, state);
 			return ret ? 0 : -EINVAL;
 
 		case ODEBUG_STATE_DESTROYED:
-			debug_print_object(obj, "activate");
+			debug_printobj = true;
 			ret = -EINVAL;
 			break;
 		default:
@@ -496,10 +505,13 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
 			break;
 		}
 		raw_spin_unlock_irqrestore(&db->lock, flags);
+		if (debug_printobj)
+			debug_print_object(obj, "activate");
 		return ret;
 	}
 
 	raw_spin_unlock_irqrestore(&db->lock, flags);
+
 	/*
 	 * We are here when a static object is activated. We
 	 * let the type specific code confirm whether this is
@@ -531,6 +543,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
 	struct debug_bucket *db;
 	struct debug_obj *obj;
 	unsigned long flags;
+	bool debug_printobj = false;
 
 	if (!debug_objects_enabled)
 		return;
@@ -548,24 +561,27 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
 			if (!obj->astate)
 				obj->state = ODEBUG_STATE_INACTIVE;
 			else
-				debug_print_object(obj, "deactivate");
+				debug_printobj = true;
 			break;
 
 		case ODEBUG_STATE_DESTROYED:
-			debug_print_object(obj, "deactivate");
+			debug_printobj = true;
 			break;
 		default:
 			break;
 		}
-	} else {
+	}
+
+	raw_spin_unlock_irqrestore(&db->lock, flags);
+	if (!obj) {
 		struct debug_obj o = { .object = addr,
 				       .state = ODEBUG_STATE_NOTAVAILABLE,
 				       .descr = descr };
 
 		debug_print_object(&o, "deactivate");
+	} else if (debug_printobj) {
+		debug_print_object(obj, "deactivate");
 	}
-
-	raw_spin_unlock_irqrestore(&db->lock, flags);
 }
 EXPORT_SYMBOL_GPL(debug_object_deactivate);
 
@@ -580,6 +596,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
 	struct debug_bucket *db;
 	struct debug_obj *obj;
 	unsigned long flags;
+	bool debug_printobj = false;
 
 	if (!debug_objects_enabled)
 		return;
@@ -599,20 +616,22 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
 		obj->state = ODEBUG_STATE_DESTROYED;
 		break;
 	case ODEBUG_STATE_ACTIVE:
-		debug_print_object(obj, "destroy");
 		state = obj->state;
 		raw_spin_unlock_irqrestore(&db->lock, flags);
+		debug_print_object(obj, "destroy");
 		debug_object_fixup(descr->fixup_destroy, addr, state);
 		return;
 
 	case ODEBUG_STATE_DESTROYED:
-		debug_print_object(obj, "destroy");
+		debug_printobj = true;
 		break;
 	default:
 		break;
 	}
 out_unlock:
 	raw_spin_unlock_irqrestore(&db->lock, flags);
+	if (debug_printobj)
+		debug_print_object(obj, "destroy");
 }
 EXPORT_SYMBOL_GPL(debug_object_destroy);
 
@@ -641,9 +660,9 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr)
 
 	switch (obj->state) {
 	case ODEBUG_STATE_ACTIVE:
-		debug_print_object(obj, "free");
 		state = obj->state;
 		raw_spin_unlock_irqrestore(&db->lock, flags);
+		debug_print_object(obj, "free");
 		debug_object_fixup(descr->fixup_free, addr, state);
 		return;
 	default:
@@ -716,6 +735,7 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
 	struct debug_bucket *db;
 	struct debug_obj *obj;
 	unsigned long flags;
+	bool debug_printobj = false;
 
 	if (!debug_objects_enabled)
 		return;
@@ -731,22 +751,25 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
 			if (obj->astate == expect)
 				obj->astate = next;
 			else
-				debug_print_object(obj, "active_state");
+				debug_printobj = true;
 			break;
 
 		default:
-			debug_print_object(obj, "active_state");
+			debug_printobj = true;
 			break;
 		}
-	} else {
+	}
+
+	raw_spin_unlock_irqrestore(&db->lock, flags);
+	if (!obj) {
 		struct debug_obj o = { .object = addr,
 				       .state = ODEBUG_STATE_NOTAVAILABLE,
 				       .descr = descr };
 
 		debug_print_object(&o, "active_state");
+	} else if (debug_printobj) {
+		debug_print_object(obj, "active_state");
 	}
-
-	raw_spin_unlock_irqrestore(&db->lock, flags);
 }
 EXPORT_SYMBOL_GPL(debug_object_active_state);
 
@@ -782,10 +805,10 @@ static void __debug_check_no_obj_freed(const void *address, unsigned long size)
 
 			switch (obj->state) {
 			case ODEBUG_STATE_ACTIVE:
-				debug_print_object(obj, "free");
 				descr = obj->descr;
 				state = obj->state;
 				raw_spin_unlock_irqrestore(&db->lock, flags);
+				debug_print_object(obj, "free");
 				debug_object_fixup(descr->fixup_free,
 						   (void *) oaddr, state);
 				goto repeat;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-12 22:28 [PATCH] debugobjects: Move printk out of db lock critical sections Waiman Long
@ 2018-12-12 23:39 ` Andrew Morton
  2018-12-13 22:01   ` Waiman Long
  2018-12-13  2:03 ` Sergey Senozhatsky
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Andrew Morton @ 2018-12-12 23:39 UTC (permalink / raw)
  To: Waiman Long
  Cc: Thomas Gleixner, linux-kernel, Ingo Molnar, Peter Zijlstra,
	Yang Shi, Arnd Bergmann, Sergey Senozhatsky, Dmitry Safonov

On Wed, 12 Dec 2018 17:28:14 -0500 Waiman Long <longman@redhat.com> wrote:

> The db->lock is a raw spinlock and so the lock hold time is supposed
> to be short. This will not be the case when printk() is being involved
> in some of the critical sections. In order to avoid the long hold time,
> in case some messages need to be printed, the debug_object_is_on_stack()
> and debug_print_object() calls are now moved out of those critical
> sections.
> 
> Holding the db->lock while calling printk() may lead to deadlock if
> printk() somehow requires the allocation/freeing of debug object that
> happens to be in the same hash bucket or a circular lock dependency
> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
> 
> [   87.209665] WARNING: possible circular locking dependency detected
> [   87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G        W
> [   87.211449] ------------------------------------------------------
> [   87.212405] getty/519 is trying to acquire lock:
> [   87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
> [   87.214343]
> [   87.214343] but task is already holding lock:
> [   87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
> [   87.216260]
> [   87.216260] which lock already depends on the new lock.
> 
> This patch was also found to be able to fix a boot hanging problem
> when the initramfs image was switched on after a debugobjects splat
> from the EFI code.

Patch looks sensible, but I have a nit about the variable names.

> --- a/lib/debugobjects.c
> +++ b/lib/debugobjects.c
> @@ -375,6 +375,8 @@ static void debug_object_is_on_stack(void *addr, int onstack)
>  	struct debug_bucket *db;
>  	struct debug_obj *obj;
>  	unsigned long flags;
> +	bool debug_printobj = false;

"debug_printobject" would be better, but this code already intermingles
"obj" and "object".

> +	bool debug_chkstack = false;

Not so good.  Is it debug_chkstack or debug_checkstk or ...

This file uses "check" consistently so let's not depart from that? 
Linux style is to avoid these tricky little abbreviations and to use
full words.  

ie, debug_checkstack, please.  Better would be debug_check_stack.  Or
simply check_stack: the "debug" doesn't add anything useful.



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-12 22:28 [PATCH] debugobjects: Move printk out of db lock critical sections Waiman Long
  2018-12-12 23:39 ` Andrew Morton
@ 2018-12-13  2:03 ` Sergey Senozhatsky
  2018-12-13 19:59   ` Waiman Long
  2018-12-13  2:59 ` Sergey Senozhatsky
  2018-12-13  4:35 ` Dmitry Safonov
  3 siblings, 1 reply; 14+ messages in thread
From: Sergey Senozhatsky @ 2018-12-13  2:03 UTC (permalink / raw)
  To: Waiman Long
  Cc: Thomas Gleixner, Andrew Morton, linux-kernel, Ingo Molnar,
	Peter Zijlstra, Yang Shi, Arnd Bergmann, Sergey Senozhatsky,
	Dmitry Safonov

On (12/12/18 17:28), Waiman Long wrote:
> The db->lock is a raw spinlock and so the lock hold time is supposed
> to be short. This will not be the case when printk() is being involved
> in some of the critical sections. In order to avoid the long hold time,
> in case some messages need to be printed, the debug_object_is_on_stack()
> and debug_print_object() calls are now moved out of those critical
> sections.
> 
> Holding the db->lock while calling printk() may lead to deadlock if
> printk() somehow requires the allocation/freeing of debug object that
> happens to be in the same hash bucket or a circular lock dependency
> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
> 
> [   87.209665] WARNING: possible circular locking dependency detected
> [   87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G        W
> [   87.211449] ------------------------------------------------------
> [   87.212405] getty/519 is trying to acquire lock:
> [   87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
> [   87.214343]
> [   87.214343] but task is already holding lock:
> [   87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
> [   87.216260]
> [   87.216260] which lock already depends on the new lock.
> 
> This patch was also found to be able to fix a boot hanging problem
> when the initramfs image was switched on after a debugobjects splat
> from the EFI code.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>

The patch looks good to me. A bit curious if we need to also patch
the self-test part debugobjects - check_results(). That guy still
printk()-s under bucket ->lock.

	-ss

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-12 22:28 [PATCH] debugobjects: Move printk out of db lock critical sections Waiman Long
  2018-12-12 23:39 ` Andrew Morton
  2018-12-13  2:03 ` Sergey Senozhatsky
@ 2018-12-13  2:59 ` Sergey Senozhatsky
  2018-12-13  9:21   ` Peter Zijlstra
  2018-12-13  4:35 ` Dmitry Safonov
  3 siblings, 1 reply; 14+ messages in thread
From: Sergey Senozhatsky @ 2018-12-13  2:59 UTC (permalink / raw)
  To: Waiman Long
  Cc: Thomas Gleixner, Andrew Morton, linux-kernel, Ingo Molnar,
	Peter Zijlstra, Yang Shi, Arnd Bergmann, Sergey Senozhatsky,
	Dmitry Safonov

On (12/12/18 17:28), Waiman Long wrote:
> 
> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
> 

That link shows an empty page, lkml.org is quite unstable. Let's
use this one instead:

https://lore.kernel.org/lkml/20181211091154.GL23332@shao2-debian/T/#u

	-ss

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-12 22:28 [PATCH] debugobjects: Move printk out of db lock critical sections Waiman Long
                   ` (2 preceding siblings ...)
  2018-12-13  2:59 ` Sergey Senozhatsky
@ 2018-12-13  4:35 ` Dmitry Safonov
  2018-12-13  5:18   ` Sergey Senozhatsky
  2018-12-13 22:10   ` Waiman Long
  3 siblings, 2 replies; 14+ messages in thread
From: Dmitry Safonov @ 2018-12-13  4:35 UTC (permalink / raw)
  To: Waiman Long, Thomas Gleixner, Andrew Morton
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Yang Shi,
	Arnd Bergmann, Sergey Senozhatsky

Hi Waiman,

On 12/12/18 10:28 PM, Waiman Long wrote:
> The db->lock is a raw spinlock and so the lock hold time is supposed
> to be short. This will not be the case when printk() is being involved
> in some of the critical sections. In order to avoid the long hold time,
> in case some messages need to be printed, the debug_object_is_on_stack()
> and debug_print_object() calls are now moved out of those critical
> sections.
> 
> Holding the db->lock while calling printk() may lead to deadlock if
> printk() somehow requires the allocation/freeing of debug object that
> happens to be in the same hash bucket or a circular lock dependency
> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
> 
> [   87.209665] WARNING: possible circular locking dependency detected
> [   87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G        W
> [   87.211449] ------------------------------------------------------
> [   87.212405] getty/519 is trying to acquire lock:
> [   87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
> [   87.214343]
> [   87.214343] but task is already holding lock:
> [   87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
> [   87.216260]
> [   87.216260] which lock already depends on the new lock.
> 
> This patch was also found to be able to fix a boot hanging problem
> when the initramfs image was switched on after a debugobjects splat
> from the EFI code.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---

I've tried to review it and found minor issues like missed
debug_object_is_on_stack() for initializing already active object.

But than I come to opinion that it's just generally unsafe:
debug_obj life-time is protected by bucket's spin_lock.
Check the conditions when free_object() is being called.

So, I resulted by finishing my stack-copy version which I've started
yesterday on test robot report when I didn't know about your patch.

I've sent it now concurrently.

Thanks,
          Dima

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-13  4:35 ` Dmitry Safonov
@ 2018-12-13  5:18   ` Sergey Senozhatsky
  2018-12-13  5:54     ` Dmitry Safonov
  2018-12-13 22:10   ` Waiman Long
  1 sibling, 1 reply; 14+ messages in thread
From: Sergey Senozhatsky @ 2018-12-13  5:18 UTC (permalink / raw)
  To: Dmitry Safonov
  Cc: Waiman Long, Thomas Gleixner, Andrew Morton, linux-kernel,
	Ingo Molnar, Peter Zijlstra, Yang Shi, Arnd Bergmann,
	Sergey Senozhatsky

On (12/13/18 04:35), Dmitry Safonov wrote:
> I've tried to review it and found minor issues like missed
> debug_object_is_on_stack() for initializing already active object.
> 
> But than I come to opinion that it's just generally unsafe:
> debug_obj life-time is protected by bucket's spin_lock.
> Check the conditions when free_object() is being called.

I thought about it for a second, but couldn't figure out if this race
was real.

E.g. uart case - if there are two paths which concurrently free and
access debug object, then the same race condition should exist for
the xmit.buf page. debug object's lifespan should be the same as the
lifespan of, ummm, whatever the thing that debug object is attached
to? I'm surely missing something here.

	-ss

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-13  5:18   ` Sergey Senozhatsky
@ 2018-12-13  5:54     ` Dmitry Safonov
  2018-12-13  9:00       ` Sergey Senozhatsky
  0 siblings, 1 reply; 14+ messages in thread
From: Dmitry Safonov @ 2018-12-13  5:54 UTC (permalink / raw)
  To: sergey.senozhatsky.work
  Cc: Dmitry Safonov, longman, Thomas Gleixner, Andrew Morton,
	open list, Ingo Molnar, Peter Zijlstra, yang.shi, Arnd Bergmann

On Thu, 13 Dec 2018 at 05:19, Sergey Senozhatsky
<sergey.senozhatsky.work@gmail.com> wrote:
>
> On (12/13/18 04:35), Dmitry Safonov wrote:
> > I've tried to review it and found minor issues like missed
> > debug_object_is_on_stack() for initializing already active object.
> >
> > But than I come to opinion that it's just generally unsafe:
> > debug_obj life-time is protected by bucket's spin_lock.
> > Check the conditions when free_object() is being called.
>
> I thought about it for a second, but couldn't figure out if this race
> was real.
>
> E.g. uart case - if there are two paths which concurrently free and
> access debug object, then the same race condition should exist for
> the xmit.buf page. debug object's lifespan should be the same as the
> lifespan of, ummm, whatever the thing that debug object is attached
> to? I'm surely missing something here.

I think the point of debugobjects is to reduce the damage caused
by an improper life-time of objects.

IIUC:
1. alloc_page()
2. dynamically allocate a workqueue on a new page
3. schedule some work
4. until the work has run free_page()

So, wroqueue fixup upon freeing should cancel the work TBD.

But maybe I've misunderstood something..
And it's 6 am in my timezone, I'll catch some sleep and
will glance this with some morning coffee.

Thanks,
             Dmitry

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-13  5:54     ` Dmitry Safonov
@ 2018-12-13  9:00       ` Sergey Senozhatsky
  0 siblings, 0 replies; 14+ messages in thread
From: Sergey Senozhatsky @ 2018-12-13  9:00 UTC (permalink / raw)
  To: Dmitry Safonov
  Cc: sergey.senozhatsky.work, Dmitry Safonov, longman,
	Thomas Gleixner, Andrew Morton, open list, Ingo Molnar,
	Peter Zijlstra, yang.shi, Arnd Bergmann

On (12/13/18 05:54), Dmitry Safonov wrote:
> > I thought about it for a second, but couldn't figure out if this race
> > was real.
> >
> > E.g. uart case - if there are two paths which concurrently free and
> > access debug object, then the same race condition should exist for
> > the xmit.buf page. debug object's lifespan should be the same as the
> > lifespan of, ummm, whatever the thing that debug object is attached
> > to? I'm surely missing something here.
> 
> I think the point of debugobjects is to reduce the damage caused
> by an improper life-time of objects.
> 
> IIUC:
> 1. alloc_page()
> 2. dynamically allocate a workqueue on a new page
> 3. schedule some work
> 4. until the work has run free_page()

Hmm, if we free() that page then __debug_check_no_obj_freed() should ->fixup
all ODEBUG_STATE_ACTIVE objects first. Maybe we are talking about different
scenarios.

	-ss

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-13  2:59 ` Sergey Senozhatsky
@ 2018-12-13  9:21   ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2018-12-13  9:21 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Waiman Long, Thomas Gleixner, Andrew Morton, linux-kernel,
	Ingo Molnar, Yang Shi, Arnd Bergmann, Dmitry Safonov

On Thu, Dec 13, 2018 at 11:59:40AM +0900, Sergey Senozhatsky wrote:
> On (12/12/18 17:28), Waiman Long wrote:
> > 
> > warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
> > 
> 
> That link shows an empty page, lkml.org is quite unstable. Let's
> use this one instead:
> 
> https://lore.kernel.org/lkml/20181211091154.GL23332@shao2-debian/T/#u

Better yet:

  https://lkml.kernel.org/r/20181211091154.GL23332@shao2-debian

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-13  2:03 ` Sergey Senozhatsky
@ 2018-12-13 19:59   ` Waiman Long
  0 siblings, 0 replies; 14+ messages in thread
From: Waiman Long @ 2018-12-13 19:59 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Thomas Gleixner, Andrew Morton, linux-kernel, Ingo Molnar,
	Peter Zijlstra, Yang Shi, Arnd Bergmann, Dmitry Safonov

On 12/12/2018 09:03 PM, Sergey Senozhatsky wrote:
> On (12/12/18 17:28), Waiman Long wrote:
>> The db->lock is a raw spinlock and so the lock hold time is supposed
>> to be short. This will not be the case when printk() is being involved
>> in some of the critical sections. In order to avoid the long hold time,
>> in case some messages need to be printed, the debug_object_is_on_stack()
>> and debug_print_object() calls are now moved out of those critical
>> sections.
>>
>> Holding the db->lock while calling printk() may lead to deadlock if
>> printk() somehow requires the allocation/freeing of debug object that
>> happens to be in the same hash bucket or a circular lock dependency
>> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>>
>> [   87.209665] WARNING: possible circular locking dependency detected
>> [   87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G        W
>> [   87.211449] ------------------------------------------------------
>> [   87.212405] getty/519 is trying to acquire lock:
>> [   87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
>> [   87.214343]
>> [   87.214343] but task is already holding lock:
>> [   87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
>> [   87.216260]
>> [   87.216260] which lock already depends on the new lock.
>>
>> This patch was also found to be able to fix a boot hanging problem
>> when the initramfs image was switched on after a debugobjects splat
>> from the EFI code.
>>
>> Signed-off-by: Waiman Long <longman@redhat.com>
> The patch looks good to me. A bit curious if we need to also patch
> the self-test part debugobjects - check_results(). That guy still
> printk()-s under bucket ->lock.
>
> 	-ss

Yes, I should have changed those in the check_results() as well.

-Longman


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-12 23:39 ` Andrew Morton
@ 2018-12-13 22:01   ` Waiman Long
  0 siblings, 0 replies; 14+ messages in thread
From: Waiman Long @ 2018-12-13 22:01 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Thomas Gleixner, linux-kernel, Ingo Molnar, Peter Zijlstra,
	Yang Shi, Arnd Bergmann, Sergey Senozhatsky, Dmitry Safonov

On 12/12/2018 06:39 PM, Andrew Morton wrote:
> On Wed, 12 Dec 2018 17:28:14 -0500 Waiman Long <longman@redhat.com> wrote:
>
>> The db->lock is a raw spinlock and so the lock hold time is supposed
>> to be short. This will not be the case when printk() is being involved
>> in some of the critical sections. In order to avoid the long hold time,
>> in case some messages need to be printed, the debug_object_is_on_stack()
>> and debug_print_object() calls are now moved out of those critical
>> sections.
>>
>> Holding the db->lock while calling printk() may lead to deadlock if
>> printk() somehow requires the allocation/freeing of debug object that
>> happens to be in the same hash bucket or a circular lock dependency
>> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>>
>> [   87.209665] WARNING: possible circular locking dependency detected
>> [   87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G        W
>> [   87.211449] ------------------------------------------------------
>> [   87.212405] getty/519 is trying to acquire lock:
>> [   87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
>> [   87.214343]
>> [   87.214343] but task is already holding lock:
>> [   87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
>> [   87.216260]
>> [   87.216260] which lock already depends on the new lock.
>>
>> This patch was also found to be able to fix a boot hanging problem
>> when the initramfs image was switched on after a debugobjects splat
>> from the EFI code.
> Patch looks sensible, but I have a nit about the variable names.
>
>> --- a/lib/debugobjects.c
>> +++ b/lib/debugobjects.c
>> @@ -375,6 +375,8 @@ static void debug_object_is_on_stack(void *addr, int onstack)
>>  	struct debug_bucket *db;
>>  	struct debug_obj *obj;
>>  	unsigned long flags;
>> +	bool debug_printobj = false;
> "debug_printobject" would be better, but this code already intermingles
> "obj" and "object".
>
>> +	bool debug_chkstack = false;
> Not so good.  Is it debug_chkstack or debug_checkstk or ...
>
> This file uses "check" consistently so let's not depart from that? 
> Linux style is to avoid these tricky little abbreviations and to use
> full words.  
>
> ie, debug_checkstack, please.  Better would be debug_check_stack.  Or
> simply check_stack: the "debug" doesn't add anything useful.
>
>
Thanks for the review. I have eliminated debug_printobj in the new v2 patch.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-13  4:35 ` Dmitry Safonov
  2018-12-13  5:18   ` Sergey Senozhatsky
@ 2018-12-13 22:10   ` Waiman Long
  2018-12-14 18:06     ` Dmitry Safonov
  1 sibling, 1 reply; 14+ messages in thread
From: Waiman Long @ 2018-12-13 22:10 UTC (permalink / raw)
  To: Dmitry Safonov, Thomas Gleixner, Andrew Morton
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Yang Shi,
	Arnd Bergmann, Sergey Senozhatsky

On 12/12/2018 11:35 PM, Dmitry Safonov wrote:
> Hi Waiman,
>
> On 12/12/18 10:28 PM, Waiman Long wrote:
>> The db->lock is a raw spinlock and so the lock hold time is supposed
>> to be short. This will not be the case when printk() is being involved
>> in some of the critical sections. In order to avoid the long hold time,
>> in case some messages need to be printed, the debug_object_is_on_stack()
>> and debug_print_object() calls are now moved out of those critical
>> sections.
>>
>> Holding the db->lock while calling printk() may lead to deadlock if
>> printk() somehow requires the allocation/freeing of debug object that
>> happens to be in the same hash bucket or a circular lock dependency
>> warning from lockdep as reported in https://lkml.org/lkml/2018/12/11/143.
>>
>> [   87.209665] WARNING: possible circular locking dependency detected
>> [   87.210547] 4.20.0-rc4-00057-gc96cf92 #1 Tainted: G        W
>> [   87.211449] ------------------------------------------------------
>> [   87.212405] getty/519 is trying to acquire lock:
>> [   87.213074] (____ptrval____) (&obj_hash[i].lock){-.-.}, at: debug_check_no_obj_freed+0xb4/0x302
>> [   87.214343]
>> [   87.214343] but task is already holding lock:
>> [   87.215174] (____ptrval____) (&port_lock_key){-.-.}, at: uart_shutdown+0x3a3/0x4e2
>> [   87.216260]
>> [   87.216260] which lock already depends on the new lock.
>>
>> This patch was also found to be able to fix a boot hanging problem
>> when the initramfs image was switched on after a debugobjects splat
>> from the EFI code.
>>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> ---
> I've tried to review it and found minor issues like missed
> debug_object_is_on_stack() for initializing already active object.
>
> But than I come to opinion that it's just generally unsafe:
> debug_obj life-time is protected by bucket's spin_lock.
> Check the conditions when free_object() is being called.

The bucket lock is for protecting the insertion and deletion of
debug_obj to/from the bucket list as well as searching within the bucket
list. It has nothing to do with the life time of the debug_obj itself.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-13 22:10   ` Waiman Long
@ 2018-12-14 18:06     ` Dmitry Safonov
  2018-12-14 18:21       ` Waiman Long
  0 siblings, 1 reply; 14+ messages in thread
From: Dmitry Safonov @ 2018-12-14 18:06 UTC (permalink / raw)
  To: Waiman Long, Thomas Gleixner, Andrew Morton
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Yang Shi,
	Arnd Bergmann, Sergey Senozhatsky

On 12/13/18 10:10 PM, Waiman Long wrote:
> The bucket lock is for protecting the insertion and deletion of
> debug_obj to/from the bucket list as well as searching within the bucket
> list. It has nothing to do with the life time of the debug_obj itself.

The bucket lock also protects lookups on a bucket.

1. Imagine, you have object in ODEBUG_STATE_DESTROYED.
If you will try debug_object_activate() with this patch, it will debug
print object outside of bucket lock, which means that
debug_check_no_obj_freed() may concurrently fixup/free object and
meta-data on another CPU.

I don't see this state is being used in many places around the kernel,
except selftest and i915 driver.

2. If you try to deactivate already non-active object - you will have
debug print outside of bucket lock. The other CPU can fixup/free page
with this object concurrently.

(before your patch the lookup would fail and no printing)

So, I might be mistaken, I'm mostly worried about dereferencing the
descriptor inside of the object during printing.
And in my opinion, i.e., there was a reason to save `descr` before
releasing the bucket lock here:
https://elixir.bootlin.com/linux/latest/source/lib/debugobjects.c#L789

Anyway, I see that your patch is already in -mm tree and Thomas seems to
be fine with the change so whatever %)

Thanks,
          Dima

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] debugobjects: Move printk out of db lock critical sections
  2018-12-14 18:06     ` Dmitry Safonov
@ 2018-12-14 18:21       ` Waiman Long
  0 siblings, 0 replies; 14+ messages in thread
From: Waiman Long @ 2018-12-14 18:21 UTC (permalink / raw)
  To: Dmitry Safonov, Thomas Gleixner, Andrew Morton
  Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Yang Shi,
	Arnd Bergmann, Sergey Senozhatsky

On 12/14/2018 01:06 PM, Dmitry Safonov wrote:
> On 12/13/18 10:10 PM, Waiman Long wrote:
>> The bucket lock is for protecting the insertion and deletion of
>> debug_obj to/from the bucket list as well as searching within the bucket
>> list. It has nothing to do with the life time of the debug_obj itself.
> The bucket lock also protects lookups on a bucket.

Yes, that is what I meant by "searching".

>
> 1. Imagine, you have object in ODEBUG_STATE_DESTROYED.
> If you will try debug_object_activate() with this patch, it will debug
> print object outside of bucket lock, which means that
> debug_check_no_obj_freed() may concurrently fixup/free object and
> meta-data on another CPU.
>
> I don't see this state is being used in many places around the kernel,
> except selftest and i915 driver.
>
> 2. If you try to deactivate already non-active object - you will have
> debug print outside of bucket lock. The other CPU can fixup/free page
> with this object concurrently.
>
> (before your patch the lookup would fail and no printing)
>
> So, I might be mistaken, I'm mostly worried about dereferencing the
> descriptor inside of the object during printing.
> And in my opinion, i.e., there was a reason to save `descr` before
> releasing the bucket lock here:
> https://elixir.bootlin.com/linux/latest/source/lib/debugobjects.c#L789
>
> Anyway, I see that your patch is already in -mm tree and Thomas seems to
> be fine with the change so whatever %)

I also thought about whether we should save descr before unlock. Anyway,
we can always send a follow-up patch to make further change if it is the
right thing to do.

The debugobjects code is enabled in a debug kernel and never in a kernel
for production use. The printk should only be triggered if there is a
bug in the code which the debug object is tracking.

Thanks,
Longman



^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-12-14 18:21 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-12 22:28 [PATCH] debugobjects: Move printk out of db lock critical sections Waiman Long
2018-12-12 23:39 ` Andrew Morton
2018-12-13 22:01   ` Waiman Long
2018-12-13  2:03 ` Sergey Senozhatsky
2018-12-13 19:59   ` Waiman Long
2018-12-13  2:59 ` Sergey Senozhatsky
2018-12-13  9:21   ` Peter Zijlstra
2018-12-13  4:35 ` Dmitry Safonov
2018-12-13  5:18   ` Sergey Senozhatsky
2018-12-13  5:54     ` Dmitry Safonov
2018-12-13  9:00       ` Sergey Senozhatsky
2018-12-13 22:10   ` Waiman Long
2018-12-14 18:06     ` Dmitry Safonov
2018-12-14 18:21       ` Waiman Long

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).