linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH] x86, espfix: use spin_lock rather than mutex
@ 2015-05-14 11:37 Gu Zheng
  2015-05-14 12:26 ` Borislav Petkov
  0 siblings, 1 reply; 23+ messages in thread
From: Gu Zheng @ 2015-05-14 11:37 UTC (permalink / raw)
  To: linux-kernel; +Cc: x86, mingo, hpa, guz.fnst, Stable

The following lockdep warning occurrs when running with latest kernel:
[    3.178000] ------------[ cut here ]------------
[    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
[    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
[    3.199000] Modules linked in:

[    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
[    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
[    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
[    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
[    3.246000] Call Trace:
[    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
[    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
[    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
[    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
[    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
[    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
[    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
[    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
[    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
[    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
[    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
[    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
[    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---

This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
init_espfix_ap() which is called before enabled local irq, and the lockdep
sub-system considers this behaviour as allocating memory with GFP_FS with
local irq disabled, then trigger the warning as mentioned about.
Though here we use GFP_NOFS rather GFP_KERNEL to avoid the warning, but
you know, init_espfix_ap is called with preempt and local irq disabled,
it is not a good idea to use mutex (might sleep) here.
So we convert the initialization lock to spin_lock here to avoid the noise.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Cc: Stable <stable@vger.kernel.org>
---
 arch/x86/kernel/espfix_64.c |   13 +++++++------
 1 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730..ceb35a3 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -57,14 +57,14 @@
 # error "Need more than one PGD for the ESPFIX hack"
 #endif
 
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_ATOMIC | __GFP_NOTRACK | __GFP_ZERO)
 
 /* This contains the *bottom* address of the espfix stack */
 DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
 DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
 
-/* Initialization mutex - should this be a spinlock? */
-static DEFINE_MUTEX(espfix_init_mutex);
+/* Initialization lock */
+static DEFINE_SPINLOCK(espfix_init_lock);
 
 /* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
 #define ESPFIX_MAX_PAGES  DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
@@ -144,6 +144,7 @@ void init_espfix_ap(void)
 	int n;
 	void *stack_page;
 	pteval_t ptemask;
+	unsigned long flags;
 
 	/* We only have to do this once... */
 	if (likely(this_cpu_read(espfix_stack)))
@@ -158,7 +159,7 @@ void init_espfix_ap(void)
 	if (likely(stack_page))
 		goto done;
 
-	mutex_lock(&espfix_init_mutex);
+	spin_lock_irqsave(&espfix_init_lock, flags);
 
 	/* Did we race on the lock? */
 	stack_page = ACCESS_ONCE(espfix_pages[page]);
@@ -188,7 +189,7 @@ void init_espfix_ap(void)
 	}
 
 	pte_p = pte_offset_kernel(&pmd, addr);
-	stack_page = (void *)__get_free_page(GFP_KERNEL);
+	stack_page = (void *)__get_free_page(PGALLOC_GFP);
 	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
 	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
 		set_pte(&pte_p[n*PTE_STRIDE], pte);
@@ -197,7 +198,7 @@ void init_espfix_ap(void)
 	ACCESS_ONCE(espfix_pages[page]) = stack_page;
 
 unlock_done:
-	mutex_unlock(&espfix_init_mutex);
+	spin_unlock_irqrestore(&espfix_init_lock, flags);
 done:
 	this_cpu_write(espfix_stack, addr);
 	this_cpu_write(espfix_waddr, (unsigned long)stack_page
-- 
1.7.7


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex
  2015-05-14 11:37 [RFC PATCH] x86, espfix: use spin_lock rather than mutex Gu Zheng
@ 2015-05-14 12:26 ` Borislav Petkov
  2015-05-14 18:29   ` Ingo Molnar
  0 siblings, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2015-05-14 12:26 UTC (permalink / raw)
  To: Gu Zheng, H. Peter Anvin, Ingo Molnar, Thomas Gleixner
  Cc: linux-kernel, x86, Stable

On Thu, May 14, 2015 at 07:37:45PM +0800, Gu Zheng wrote:
> The following lockdep warning occurrs when running with latest kernel:
> [    3.178000] ------------[ cut here ]------------
> [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
> [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
> [    3.199000] Modules linked in:
> 
> [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
> [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
> [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
> [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
> [    3.246000] Call Trace:
> [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
> [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
> [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
> [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
> [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
> [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
> [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
> [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
> [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
> [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
> [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
> [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
> [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
>
> This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
> init_espfix_ap() which is called before enabled local irq, and the lockdep
> sub-system considers this behaviour as allocating memory with GFP_FS with
> local irq disabled, then trigger the warning as mentioned about.
> Though here we use GFP_NOFS rather GFP_KERNEL to avoid the warning, but
> you know, init_espfix_ap is called with preempt and local irq disabled,
> it is not a good idea to use mutex (might sleep) here.
> So we convert the initialization lock to spin_lock here to avoid the noise.
> 
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> Cc: Stable <stable@vger.kernel.org>
> ---
>  arch/x86/kernel/espfix_64.c |   13 +++++++------
>  1 files changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
> index f5d0730..ceb35a3 100644
> --- a/arch/x86/kernel/espfix_64.c
> +++ b/arch/x86/kernel/espfix_64.c
> @@ -57,14 +57,14 @@
>  # error "Need more than one PGD for the ESPFIX hack"
>  #endif
>  
> -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
> +#define PGALLOC_GFP (GFP_ATOMIC | __GFP_NOTRACK | __GFP_ZERO)

IINM, that's ESPFIX_MAX_PAGES with GFP_ATOMIC which for 8K CPUs are 128
pages.

That's a lotta waste in my book for espfix stack pages.

Enabling interrupts earlier in start_secondary() is probably out of the
question, maybe we should prealloc all those pages...

hpa?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex
  2015-05-14 12:26 ` Borislav Petkov
@ 2015-05-14 18:29   ` Ingo Molnar
  2015-05-14 21:27     ` Borislav Petkov
  0 siblings, 1 reply; 23+ messages in thread
From: Ingo Molnar @ 2015-05-14 18:29 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Gu Zheng, H. Peter Anvin, Thomas Gleixner, linux-kernel, x86, Stable


* Borislav Petkov <bp@alien8.de> wrote:

> On Thu, May 14, 2015 at 07:37:45PM +0800, Gu Zheng wrote:
> > The following lockdep warning occurrs when running with latest kernel:
> > [    3.178000] ------------[ cut here ]------------
> > [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
> > [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
> > [    3.199000] Modules linked in:
> > 
> > [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
> > [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
> > [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
> > [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
> > [    3.246000] Call Trace:
> > [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
> > [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
> > [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
> > [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
> > [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
> > [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
> > [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
> > [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
> > [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
> > [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
> > [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
> > [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
> > [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
> >
> > This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
> > init_espfix_ap() which is called before enabled local irq, and the lockdep
> > sub-system considers this behaviour as allocating memory with GFP_FS with
> > local irq disabled, then trigger the warning as mentioned about.
> > Though here we use GFP_NOFS rather GFP_KERNEL to avoid the warning, but
> > you know, init_espfix_ap is called with preempt and local irq disabled,
> > it is not a good idea to use mutex (might sleep) here.
> > So we convert the initialization lock to spin_lock here to avoid the noise.
> > 
> > Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> > Cc: Stable <stable@vger.kernel.org>
> > ---
> >  arch/x86/kernel/espfix_64.c |   13 +++++++------
> >  1 files changed, 7 insertions(+), 6 deletions(-)
> > 
> > diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
> > index f5d0730..ceb35a3 100644
> > --- a/arch/x86/kernel/espfix_64.c
> > +++ b/arch/x86/kernel/espfix_64.c
> > @@ -57,14 +57,14 @@
> >  # error "Need more than one PGD for the ESPFIX hack"
> >  #endif
> >  
> > -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
> > +#define PGALLOC_GFP (GFP_ATOMIC | __GFP_NOTRACK | __GFP_ZERO)
> 
> IINM, that's ESPFIX_MAX_PAGES with GFP_ATOMIC which for 8K CPUs are 128
> pages.
> 
> That's a lotta waste in my book for espfix stack pages.
> 
> Enabling interrupts earlier in start_secondary() is probably out of 
> the question, maybe we should prealloc all those pages...

We could allocate them on the boot CPU side and hand them over to the 
secondary CPU.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex
  2015-05-14 18:29   ` Ingo Molnar
@ 2015-05-14 21:27     ` Borislav Petkov
  2015-05-14 22:13       ` H. Peter Anvin
  0 siblings, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2015-05-14 21:27 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Gu Zheng, H. Peter Anvin, Thomas Gleixner, linux-kernel, x86

Remove stable@ from CC.

On Thu, May 14, 2015 at 08:29:55PM +0200, Ingo Molnar wrote:
> We could allocate them on the boot CPU side and hand them over to the 
> secondary CPU.

Yeah, something along those lines. I mean, they're allocated and in-use
during the complete system lifetime, we might just as well allocate them
all in one go. Btw, what's our allocator that early, memblock?

Still, what I find strange is why are we seeing this only now? Is it
because it had to be a big box (cpu >= 128) or something else changed...?

Hohumm.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex
  2015-05-14 21:27     ` Borislav Petkov
@ 2015-05-14 22:13       ` H. Peter Anvin
  2015-05-15  6:54         ` Ingo Molnar
  0 siblings, 1 reply; 23+ messages in thread
From: H. Peter Anvin @ 2015-05-14 22:13 UTC (permalink / raw)
  To: Borislav Petkov, Ingo Molnar; +Cc: Gu Zheng, Thomas Gleixner, linux-kernel, x86

On 05/14/2015 02:27 PM, Borislav Petkov wrote:
> Remove stable@ from CC.
> 
> On Thu, May 14, 2015 at 08:29:55PM +0200, Ingo Molnar wrote:
>> We could allocate them on the boot CPU side and hand them over to the 
>> secondary CPU.
> 
> Yeah, something along those lines. I mean, they're allocated and in-use
> during the complete system lifetime, we might just as well allocate them
> all in one go. Btw, what's our allocator that early, memblock?
> 
> Still, what I find strange is why are we seeing this only now? Is it
> because it had to be a big box (cpu >= 128) or something else changed...?
> 

Quite probable.  You don't really want to allocate them until you know
if a CPU at least exists, though.

I like Ingo's suggestion of allocating them before CPU bringup on the
initiating CPU.

	-hpa



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex
  2015-05-14 22:13       ` H. Peter Anvin
@ 2015-05-15  6:54         ` Ingo Molnar
  2015-05-15  7:27           ` H. Peter Anvin
  0 siblings, 1 reply; 23+ messages in thread
From: Ingo Molnar @ 2015-05-15  6:54 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Borislav Petkov, Gu Zheng, Thomas Gleixner, linux-kernel, x86


* H. Peter Anvin <hpa@zytor.com> wrote:

> On 05/14/2015 02:27 PM, Borislav Petkov wrote:
> > Remove stable@ from CC.
> > 
> > On Thu, May 14, 2015 at 08:29:55PM +0200, Ingo Molnar wrote:
> >> We could allocate them on the boot CPU side and hand them over to 
> >> the secondary CPU.
> > 
> > Yeah, something along those lines. I mean, they're allocated and 
> > in-use during the complete system lifetime, we might just as well 
> > allocate them all in one go. Btw, what's our allocator that early, 
> > memblock?
> > 
> > Still, what I find strange is why are we seeing this only now? Is 
> > it because it had to be a big box (cpu >= 128) or something else 
> > changed...?
> > 
> 
> Quite probable.  You don't really want to allocate them until you 
> know if a CPU at least exists, though.
> 
> I like Ingo's suggestion of allocating them before CPU bringup on 
> the initiating CPU.

The only slightly subtle detail with that is to use alloc_pages_node() 
with the secondary CPU's node, to make sure the espfix stack is 
NUMA-local to the CPU that is going to use it.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex
  2015-05-15  6:54         ` Ingo Molnar
@ 2015-05-15  7:27           ` H. Peter Anvin
  2015-05-18 19:43             ` Andy Lutomirski
  0 siblings, 1 reply; 23+ messages in thread
From: H. Peter Anvin @ 2015-05-15  7:27 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Borislav Petkov, Gu Zheng, Thomas Gleixner, linux-kernel, x86

On 05/14/2015 11:54 PM, Ingo Molnar wrote:
>
> The only slightly subtle detail with that is to use alloc_pages_node()
> with the secondary CPU's node, to make sure the espfix stack is
> NUMA-local to the CPU that is going to use it.
>

It doesn't hurt, although it isn't super critical as each page will be 
shared among 64 CPUs.  The whole espfix stack is only a single cacheline 
long.

	-hpa


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex
  2015-05-15  7:27           ` H. Peter Anvin
@ 2015-05-18 19:43             ` Andy Lutomirski
  2015-05-19 15:04               ` H. Peter Anvin
  0 siblings, 1 reply; 23+ messages in thread
From: Andy Lutomirski @ 2015-05-18 19:43 UTC (permalink / raw)
  To: H. Peter Anvin, Ingo Molnar
  Cc: Borislav Petkov, Gu Zheng, Thomas Gleixner, linux-kernel, x86

On 05/15/2015 12:27 AM, H. Peter Anvin wrote:
> On 05/14/2015 11:54 PM, Ingo Molnar wrote:
>>
>> The only slightly subtle detail with that is to use alloc_pages_node()
>> with the secondary CPU's node, to make sure the espfix stack is
>> NUMA-local to the CPU that is going to use it.
>>
>
> It doesn't hurt, although it isn't super critical as each page will be
> shared among 64 CPUs.  The whole espfix stack is only a single cacheline
> long.
>

I don't think we actually need these pages allocated until we try to run 
user code.  Can we move this very late in initialization instead?

--Andy


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex
  2015-05-18 19:43             ` Andy Lutomirski
@ 2015-05-19 15:04               ` H. Peter Anvin
  2015-05-22 10:13                 ` [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP Gu Zheng
  0 siblings, 1 reply; 23+ messages in thread
From: H. Peter Anvin @ 2015-05-19 15:04 UTC (permalink / raw)
  To: Andy Lutomirski, Ingo Molnar
  Cc: Borislav Petkov, Gu Zheng, Thomas Gleixner, linux-kernel, x86

On 05/18/2015 12:43 PM, Andy Lutomirski wrote:
> On 05/15/2015 12:27 AM, H. Peter Anvin wrote:
>> On 05/14/2015 11:54 PM, Ingo Molnar wrote:
>>>
>>> The only slightly subtle detail with that is to use alloc_pages_node()
>>> with the secondary CPU's node, to make sure the espfix stack is
>>> NUMA-local to the CPU that is going to use it.
>>>
>>
>> It doesn't hurt, although it isn't super critical as each page will be
>> shared among 64 CPUs.  The whole espfix stack is only a single cacheline
>> long.
>>
> 
> I don't think we actually need these pages allocated until we try to run
> user code.  Can we move this very late in initialization instead?
> 

Yes, we could, as long as it is run on each CPU before that CPU tries to
run user code.

	-hpa



^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP
  2015-05-19 15:04               ` H. Peter Anvin
@ 2015-05-22 10:13                 ` Gu Zheng
  2015-05-28  1:20                   ` Gu Zheng
  0 siblings, 1 reply; 23+ messages in thread
From: Gu Zheng @ 2015-05-22 10:13 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andy Lutomirski, Ingo Molnar, Borislav Petkov, Thomas Gleixner,
	linux-kernel, x86

The following lockdep warning occurs when running with 4.1.0-rc3:
[    3.178000] ------------[ cut here ]------------
[    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
[    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
[    3.199000] Modules linked in:

[    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
[    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
[    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
[    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
[    3.246000] Call Trace:
[    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
[    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
[    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
[    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
[    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
[    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
[    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
[    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
[    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
[    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
[    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
[    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
[    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---

This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
init_espfix_ap() which is called before enabled local irq, and the lockdep
sub-system considers this behaviour as allocating memory with GFP_FS with
local irq disabled, then trigger the warning as mentioned about.

Though we could allocate them on the boot CPU side and hand them over to
the secondary CPU, but it seems a waste if some of cpus are still offline.
As there is no need to these pages(espfix stack) until we try to run user
code, so we can postpone the initialization of espfix stack after cpu
booted to avoid the noise.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
---
 arch/x86/kernel/smpboot.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 50e547e..3ce05de 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
 	check_tsc_sync_target();
 
 	/*
-	 * Enable the espfix hack for this CPU
-	 */
-#ifdef CONFIG_X86_ESPFIX64
-	init_espfix_ap();
-#endif
-
-	/*
 	 * We need to hold vector_lock so there the set of online cpus
 	 * does not change while we are assigning vectors to cpus.  Holding
 	 * this lock ensures we don't half assign or remove an irq from a cpu.
@@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 		}
 	}
 
+	/*
+	 * Enable the espfix hack for this CPU
+	 */
+#ifdef CONFIG_X86_ESPFIX64
+	init_espfix_ap();
+#endif
+
 	/* mark "stuck" area as not stuck */
 	*trampoline_status = 0;
 
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP
  2015-05-22 10:13                 ` [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP Gu Zheng
@ 2015-05-28  1:20                   ` Gu Zheng
  2015-05-29  1:07                     ` Andy Lutomirski
  0 siblings, 1 reply; 23+ messages in thread
From: Gu Zheng @ 2015-05-28  1:20 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andy Lutomirski, Ingo Molnar, Borislav Petkov, Thomas Gleixner,
	linux-kernel, x86

ping...

On 05/22/2015 06:13 PM, Gu Zheng wrote:

> The following lockdep warning occurs when running with 4.1.0-rc3:
> [    3.178000] ------------[ cut here ]------------
> [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
> [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
> [    3.199000] Modules linked in:
> 
> [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
> [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
> [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
> [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
> [    3.246000] Call Trace:
> [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
> [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
> [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
> [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
> [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
> [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
> [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
> [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
> [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
> [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
> [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
> [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
> [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
> 
> This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
> init_espfix_ap() which is called before enabled local irq, and the lockdep
> sub-system considers this behaviour as allocating memory with GFP_FS with
> local irq disabled, then trigger the warning as mentioned about.
> 
> Though we could allocate them on the boot CPU side and hand them over to
> the secondary CPU, but it seems a waste if some of cpus are still offline.
> As there is no need to these pages(espfix stack) until we try to run user
> code, so we can postpone the initialization of espfix stack after cpu
> booted to avoid the noise.
> 
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> ---
>  arch/x86/kernel/smpboot.c | 14 +++++++-------
>  1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 50e547e..3ce05de 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
>  	check_tsc_sync_target();
>  
>  	/*
> -	 * Enable the espfix hack for this CPU
> -	 */
> -#ifdef CONFIG_X86_ESPFIX64
> -	init_espfix_ap();
> -#endif
> -
> -	/*
>  	 * We need to hold vector_lock so there the set of online cpus
>  	 * does not change while we are assigning vectors to cpus.  Holding
>  	 * this lock ensures we don't half assign or remove an irq from a cpu.
> @@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
>  		}
>  	}
>  
> +	/*
> +	 * Enable the espfix hack for this CPU
> +	 */
> +#ifdef CONFIG_X86_ESPFIX64
> +	init_espfix_ap();
> +#endif
> +
>  	/* mark "stuck" area as not stuck */
>  	*trampoline_status = 0;
>  



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP
  2015-05-29  1:07                     ` Andy Lutomirski
@ 2015-05-29  0:57                       ` Gu Zheng
  2015-06-02  9:23                       ` Gu Zheng
  2015-06-02  9:25                       ` [RFC PATCH V2] " Gu Zheng
  2 siblings, 0 replies; 23+ messages in thread
From: Gu Zheng @ 2015-05-29  0:57 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: H. Peter Anvin, Andy Lutomirski, Ingo Molnar, Borislav Petkov,
	Thomas Gleixner, linux-kernel, X86 ML

Hi Andy,

On 05/29/2015 09:07 AM, Andy Lutomirski wrote:

> On Wed, May 27, 2015 at 6:20 PM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>> ping...
>>
>> On 05/22/2015 06:13 PM, Gu Zheng wrote:
>>
>>> The following lockdep warning occurs when running with 4.1.0-rc3:
>>> [    3.178000] ------------[ cut here ]------------
>>> [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
>>> [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
>>> [    3.199000] Modules linked in:
>>>
>>> [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
>>> [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
>>> [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
>>> [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
>>> [    3.246000] Call Trace:
>>> [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
>>> [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
>>> [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
>>> [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
>>> [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
>>> [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
>>> [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
>>> [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
>>> [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
>>> [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
>>> [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
>>> [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
>>> [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
>>>
>>> This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
>>> init_espfix_ap() which is called before enabled local irq, and the lockdep
>>> sub-system considers this behaviour as allocating memory with GFP_FS with
>>> local irq disabled, then trigger the warning as mentioned about.
>>>
>>> Though we could allocate them on the boot CPU side and hand them over to
>>> the secondary CPU, but it seems a waste if some of cpus are still offline.
>>> As there is no need to these pages(espfix stack) until we try to run user
>>> code, so we can postpone the initialization of espfix stack after cpu
>>> booted to avoid the noise.
> 
> Does this pass the sigreturn_32 test on both 32-bit and 64-bit kernels
> and sigreturn_64 test on 64-bit kernels?  (The test is in
> tools/testing/selftests/x86.)  If so, looks good to me.

To be honest, I forgot this part, will do it soon.
Thanks for your reminder.

Regards,
Gu

> 
> --Andy
> 
>>>
>>> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
>>> ---
>>>  arch/x86/kernel/smpboot.c | 14 +++++++-------
>>>  1 file changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
>>> index 50e547e..3ce05de 100644
>>> --- a/arch/x86/kernel/smpboot.c
>>> +++ b/arch/x86/kernel/smpboot.c
>>> @@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
>>>       check_tsc_sync_target();
>>>
>>>       /*
>>> -      * Enable the espfix hack for this CPU
>>> -      */
>>> -#ifdef CONFIG_X86_ESPFIX64
>>> -     init_espfix_ap();
>>> -#endif
>>> -
>>> -     /*
>>>        * We need to hold vector_lock so there the set of online cpus
>>>        * does not change while we are assigning vectors to cpus.  Holding
>>>        * this lock ensures we don't half assign or remove an irq from a cpu.
>>> @@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
>>>               }
>>>       }
>>>
>>> +     /*
>>> +      * Enable the espfix hack for this CPU
>>> +      */
>>> +#ifdef CONFIG_X86_ESPFIX64
>>> +     init_espfix_ap();
>>> +#endif
>>> +
>>>       /* mark "stuck" area as not stuck */
>>>       *trampoline_status = 0;
>>>
>>
>>
> 
> 
> 



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP
  2015-05-28  1:20                   ` Gu Zheng
@ 2015-05-29  1:07                     ` Andy Lutomirski
  2015-05-29  0:57                       ` Gu Zheng
                                         ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Andy Lutomirski @ 2015-05-29  1:07 UTC (permalink / raw)
  To: Gu Zheng
  Cc: H. Peter Anvin, Andy Lutomirski, Ingo Molnar, Borislav Petkov,
	Thomas Gleixner, linux-kernel, X86 ML

On Wed, May 27, 2015 at 6:20 PM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> ping...
>
> On 05/22/2015 06:13 PM, Gu Zheng wrote:
>
>> The following lockdep warning occurs when running with 4.1.0-rc3:
>> [    3.178000] ------------[ cut here ]------------
>> [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
>> [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
>> [    3.199000] Modules linked in:
>>
>> [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
>> [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
>> [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
>> [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
>> [    3.246000] Call Trace:
>> [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
>> [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
>> [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
>> [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
>> [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
>> [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
>> [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
>> [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
>> [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
>> [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
>> [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
>> [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
>> [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
>>
>> This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
>> init_espfix_ap() which is called before enabled local irq, and the lockdep
>> sub-system considers this behaviour as allocating memory with GFP_FS with
>> local irq disabled, then trigger the warning as mentioned about.
>>
>> Though we could allocate them on the boot CPU side and hand them over to
>> the secondary CPU, but it seems a waste if some of cpus are still offline.
>> As there is no need to these pages(espfix stack) until we try to run user
>> code, so we can postpone the initialization of espfix stack after cpu
>> booted to avoid the noise.

Does this pass the sigreturn_32 test on both 32-bit and 64-bit kernels
and sigreturn_64 test on 64-bit kernels?  (The test is in
tools/testing/selftests/x86.)  If so, looks good to me.

--Andy

>>
>> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
>> ---
>>  arch/x86/kernel/smpboot.c | 14 +++++++-------
>>  1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
>> index 50e547e..3ce05de 100644
>> --- a/arch/x86/kernel/smpboot.c
>> +++ b/arch/x86/kernel/smpboot.c
>> @@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
>>       check_tsc_sync_target();
>>
>>       /*
>> -      * Enable the espfix hack for this CPU
>> -      */
>> -#ifdef CONFIG_X86_ESPFIX64
>> -     init_espfix_ap();
>> -#endif
>> -
>> -     /*
>>        * We need to hold vector_lock so there the set of online cpus
>>        * does not change while we are assigning vectors to cpus.  Holding
>>        * this lock ensures we don't half assign or remove an irq from a cpu.
>> @@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
>>               }
>>       }
>>
>> +     /*
>> +      * Enable the espfix hack for this CPU
>> +      */
>> +#ifdef CONFIG_X86_ESPFIX64
>> +     init_espfix_ap();
>> +#endif
>> +
>>       /* mark "stuck" area as not stuck */
>>       *trampoline_status = 0;
>>
>
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP
  2015-05-29  1:07                     ` Andy Lutomirski
  2015-05-29  0:57                       ` Gu Zheng
@ 2015-06-02  9:23                       ` Gu Zheng
  2015-06-02  9:25                       ` [RFC PATCH V2] " Gu Zheng
  2 siblings, 0 replies; 23+ messages in thread
From: Gu Zheng @ 2015-06-02  9:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: H. Peter Anvin, Andy Lutomirski, Ingo Molnar, Borislav Petkov,
	Thomas Gleixner, linux-kernel, X86 ML

Hi Andy,

Sorry for late reply. 
On 05/29/2015 09:07 AM, Andy Lutomirski wrote:

> On Wed, May 27, 2015 at 6:20 PM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>> ping...
>>
>> On 05/22/2015 06:13 PM, Gu Zheng wrote:
>>
>>> The following lockdep warning occurs when running with 4.1.0-rc3:
>>> [    3.178000] ------------[ cut here ]------------
>>> [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
>>> [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
>>> [    3.199000] Modules linked in:
>>>
>>> [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
>>> [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
>>> [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
>>> [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
>>> [    3.246000] Call Trace:
>>> [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
>>> [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
>>> [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
>>> [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
>>> [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
>>> [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
>>> [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
>>> [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
>>> [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
>>> [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
>>> [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
>>> [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
>>> [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
>>>
>>> This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
>>> init_espfix_ap() which is called before enabled local irq, and the lockdep
>>> sub-system considers this behaviour as allocating memory with GFP_FS with
>>> local irq disabled, then trigger the warning as mentioned about.
>>>
>>> Though we could allocate them on the boot CPU side and hand them over to
>>> the secondary CPU, but it seems a waste if some of cpus are still offline.
>>> As there is no need to these pages(espfix stack) until we try to run user
>>> code, so we can postpone the initialization of espfix stack after cpu
>>> booted to avoid the noise.
> 
> Does this pass the sigreturn_32 test on both 32-bit and 64-bit kernels
> and sigreturn_64 test on 64-bit kernels?  (The test is in
> tools/testing/selftests/x86.)  If so, looks good to me.

It failed the test.
There seems a bug in this patch, it alloc espfix stack in the do_boot_cpu
routine, not in the context of target cpu that we want to boot up, so the simple
change is wrong here.
I will send the v2 version soon, and it passed the tests you mentioned above.

Thanks again for your comments and suggestion.

Regards,
Gu
 

> 
> --Andy
> 
>>>
>>> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
>>> ---
>>>  arch/x86/kernel/smpboot.c | 14 +++++++-------
>>>  1 file changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
>>> index 50e547e..3ce05de 100644
>>> --- a/arch/x86/kernel/smpboot.c
>>> +++ b/arch/x86/kernel/smpboot.c
>>> @@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
>>>       check_tsc_sync_target();
>>>
>>>       /*
>>> -      * Enable the espfix hack for this CPU
>>> -      */
>>> -#ifdef CONFIG_X86_ESPFIX64
>>> -     init_espfix_ap();
>>> -#endif
>>> -
>>> -     /*
>>>        * We need to hold vector_lock so there the set of online cpus
>>>        * does not change while we are assigning vectors to cpus.  Holding
>>>        * this lock ensures we don't half assign or remove an irq from a cpu.
>>> @@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
>>>               }
>>>       }
>>>
>>> +     /*
>>> +      * Enable the espfix hack for this CPU
>>> +      */
>>> +#ifdef CONFIG_X86_ESPFIX64
>>> +     init_espfix_ap();
>>> +#endif
>>> +
>>>       /* mark "stuck" area as not stuck */
>>>       *trampoline_status = 0;
>>>
>>
>>
> 
> 
> 



^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH V2] x86, espfix: postpone the initialization of espfix stack for AP
  2015-05-29  1:07                     ` Andy Lutomirski
  2015-05-29  0:57                       ` Gu Zheng
  2015-06-02  9:23                       ` Gu Zheng
@ 2015-06-02  9:25                       ` Gu Zheng
  2015-06-02 11:59                         ` Ingo Molnar
  2015-06-04  9:45                         ` [PATCH V1] " Gu Zheng
  2 siblings, 2 replies; 23+ messages in thread
From: Gu Zheng @ 2015-06-02  9:25 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: H. Peter Anvin, Andy Lutomirski, Ingo Molnar, Borislav Petkov,
	Thomas Gleixner, linux-kernel, X86 ML

The following lockdep warning occurrs when running with latest kernel:
[    3.178000] ------------[ cut here ]------------
[    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
[    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
[    3.199000] Modules linked in:

[    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
[    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
[    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
[    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
[    3.246000] Call Trace:
[    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
[    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
[    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
[    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
[    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
[    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
[    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
[    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
[    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
[    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
[    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
[    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
[    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---

This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
init_espfix_ap() which is called before enabled local irq, and the lockdep
sub-system considers this behaviour as allocating memory with GFP_FS with
local irq disabled, then trigger the warning as mentioned about.

Though we could allocate them on the boot CPU side and hand them over to
the secondary CPU, but it seemes a bit waste if some of cpus are offline.
As thers is no need to these pages(espfix stack) until we try to run user
code, so we postpone the initialization of espfix stack after cpu booted
to avoid the noise.


Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
---
v2:
  Let the boot up routine init the espfix stack for the target cpu after it
  booted.
---
 arch/x86/include/asm/espfix.h |    2 +-
 arch/x86/kernel/espfix_64.c   |   15 +++++++--------
 arch/x86/kernel/smpboot.c     |   14 +++++++-------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index 99efebb..b074c4f 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
 DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
 
 extern void init_espfix_bsp(void);
-extern void init_espfix_ap(void);
+extern void init_espfix_ap(int cpu);
 
 #endif /* CONFIG_X86_64 */
 
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730..37a4404 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -131,12 +131,12 @@ void __init init_espfix_bsp(void)
 	init_espfix_random();
 
 	/* The rest is the same as for any other processor */
-	init_espfix_ap();
+	init_espfix_ap(0);
 }
 
-void init_espfix_ap(void)
+void init_espfix_ap(int cpu)
 {
-	unsigned int cpu, page;
+	unsigned int page;
 	unsigned long addr;
 	pud_t pud, *pud_p;
 	pmd_t pmd, *pmd_p;
@@ -146,10 +146,9 @@ void init_espfix_ap(void)
 	pteval_t ptemask;
 
 	/* We only have to do this once... */
-	if (likely(this_cpu_read(espfix_stack)))
+	if (likely(per_cpu(espfix_stack, cpu)))
 		return;		/* Already initialized */
 
-	cpu = smp_processor_id();
 	addr = espfix_base_addr(cpu);
 	page = cpu/ESPFIX_STACKS_PER_PAGE;
 
@@ -199,7 +198,7 @@ void init_espfix_ap(void)
 unlock_done:
 	mutex_unlock(&espfix_init_mutex);
 done:
-	this_cpu_write(espfix_stack, addr);
-	this_cpu_write(espfix_waddr, (unsigned long)stack_page
-		       + (addr & ~PAGE_MASK));
+	per_cpu(espfix_stack, cpu) = addr;
+	per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+		       + (addr & ~PAGE_MASK);
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 50e547e..e9fdd0e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
 	check_tsc_sync_target();
 
 	/*
-	 * Enable the espfix hack for this CPU
-	 */
-#ifdef CONFIG_X86_ESPFIX64
-	init_espfix_ap();
-#endif
-
-	/*
 	 * We need to hold vector_lock so there the set of online cpus
 	 * does not change while we are assigning vectors to cpus.  Holding
 	 * this lock ensures we don't half assign or remove an irq from a cpu.
@@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 		}
 	}
 
+	/*
+	 * Enable the espfix hack for this CPU
+	 */
+#ifdef CONFIG_X86_ESPFIX64
+	init_espfix_ap(cpu);
+#endif
+
 	/* mark "stuck" area as not stuck */
 	*trampoline_status = 0;
 
-- 
1.7.7



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH V2] x86, espfix: postpone the initialization of espfix stack for AP
  2015-06-02  9:25                       ` [RFC PATCH V2] " Gu Zheng
@ 2015-06-02 11:59                         ` Ingo Molnar
  2015-06-03  9:58                           ` Gu Zheng
  2015-06-04  9:45                         ` [PATCH V1] " Gu Zheng
  1 sibling, 1 reply; 23+ messages in thread
From: Ingo Molnar @ 2015-06-02 11:59 UTC (permalink / raw)
  To: Gu Zheng
  Cc: Andy Lutomirski, H. Peter Anvin, Andy Lutomirski,
	Borislav Petkov, Thomas Gleixner, linux-kernel, X86 ML


* Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:

> The following lockdep warning occurrs when running with latest kernel:
> [    3.178000] ------------[ cut here ]------------
> [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
> [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
> [    3.199000] Modules linked in:
> 
> [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
> [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
> [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
> [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
> [    3.246000] Call Trace:
> [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
> [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
> [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
> [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
> [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
> [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
> [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
> [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
> [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
> [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
> [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
> [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
> [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
> 
> This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in 
> init_espfix_ap() which is called before enabled local irq, and the lockdep 
> sub-system considers this behaviour as allocating memory with GFP_FS with local 
> irq disabled, then trigger the warning as mentioned about.

Why should this be a 'mis-warning'? If the GFP_KERNEL allocation sleeps then we'll 
sleep with irqs disabled => bad.

This looks like a real (albeit hard to trigger) bug.

> Though we could allocate them on the boot CPU side and hand them over to the 
> secondary CPU, but it seemes a bit waste if some of cpus are offline. As thers 
> is no need to these pages(espfix stack) until we try to run user code, so we 
> postpone the initialization of espfix stack after cpu booted to avoid the noise.

> -void init_espfix_ap(void)
> +void init_espfix_ap(int cpu)
>  {

So how about the concern I raised in a former thread, that the allocation should 
be done for the node the target CPU is on? The 'cpu' parameter should be 
propagated to the allocation as well, and turned into a node allocation or so.

Even though some CPUs will share the espfix stack, some won't.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH V2] x86, espfix: postpone the initialization of espfix stack for AP
  2015-06-02 11:59                         ` Ingo Molnar
@ 2015-06-03  9:58                           ` Gu Zheng
  0 siblings, 0 replies; 23+ messages in thread
From: Gu Zheng @ 2015-06-03  9:58 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andy Lutomirski, H. Peter Anvin, Andy Lutomirski,
	Borislav Petkov, Thomas Gleixner, linux-kernel, X86 ML

Hi Ingo,

On 06/02/2015 07:59 PM, Ingo Molnar wrote:

> 
> * Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> 
>> The following lockdep warning occurrs when running with latest kernel:
>> [    3.178000] ------------[ cut here ]------------
>> [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
>> [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
>> [    3.199000] Modules linked in:
>>
>> [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
>> [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
>> [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
>> [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
>> [    3.246000] Call Trace:
>> [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
>> [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
>> [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
>> [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
>> [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
>> [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
>> [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
>> [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
>> [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
>> [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
>> [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
>> [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
>> [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
>>
>> This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in 
>> init_espfix_ap() which is called before enabled local irq, and the lockdep 
>> sub-system considers this behaviour as allocating memory with GFP_FS with local 
>> irq disabled, then trigger the warning as mentioned about.
> 
> Why should this be a 'mis-warning'? If the GFP_KERNEL allocation sleeps then we'll 
> sleep with irqs disabled => bad.
> 
> This looks like a real (albeit hard to trigger) bug.


You are right.
Thanks for correct me, I misread the log.

> 
>> Though we could allocate them on the boot CPU side and hand them over to the 
>> secondary CPU, but it seemes a bit waste if some of cpus are offline. As thers 
>> is no need to these pages(espfix stack) until we try to run user code, so we 
>> postpone the initialization of espfix stack after cpu booted to avoid the noise.
> 
>> -void init_espfix_ap(void)
>> +void init_espfix_ap(int cpu)
>>  {
> 
> So how about the concern I raised in a former thread, that the allocation should 
> be done for the node the target CPU is on? The 'cpu' parameter should be 
> propagated to the allocation as well, and turned into a node allocation or so.
> 
> Even though some CPUs will share the espfix stack, some won't.


Hmm, sounds reasonable.

Regards,
Gu

> 
> Thanks,
> 
> 	Ingo
> .
> 





^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V1] x86, espfix: postpone the initialization of espfix stack for AP
  2015-06-02  9:25                       ` [RFC PATCH V2] " Gu Zheng
  2015-06-02 11:59                         ` Ingo Molnar
@ 2015-06-04  9:45                         ` Gu Zheng
  2015-06-17  5:53                           ` Zhu Guihua
  2015-06-17  7:27                           ` H. Peter Anvin
  1 sibling, 2 replies; 23+ messages in thread
From: Gu Zheng @ 2015-06-04  9:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andy Lutomirski, H. Peter Anvin, Andy Lutomirski,
	Borislav Petkov, Thomas Gleixner, linux-kernel, X86 ML

The following lockdep warning occurrs when running with latest kernel:
[    3.178000] ------------[ cut here ]------------
[    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
[    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
[    3.199000] Modules linked in:

[    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
[    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
[    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
[    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
[    3.246000] Call Trace:
[    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
[    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
[    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
[    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
[    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
[    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
[    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
[    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
[    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
[    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
[    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
[    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
[    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---

As we alloc pages with GFP_KERNEL in init_espfix_ap() which is called
before enabled local irq, and the lockdep sub-system considers this
behaviour as allocating memory with GFP_FS with local irq disabled,
then trigger the warning as mentioned about.

Though we could allocate them on the boot CPU side and hand them over to
the secondary CPU, but it seemes a bit waste if some of cpus are offline.
As thers is no need to these pages(espfix stack) until we try to run user
code, so we postpone the initialization of espfix stack, and let the boot
up routine init the espfix stack for the target cpu after it booted to
avoid the noise.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
---
v1:
  Alloc the page on the node the target CPU is on.
RFC:
  Let the boot up routine init the espfix stack for the target cpu after it
  booted.
---
---
 arch/x86/include/asm/espfix.h |    2 +-
 arch/x86/kernel/espfix_64.c   |   28 ++++++++++++++++------------
 arch/x86/kernel/smpboot.c     |   14 +++++++-------
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index 99efebb..ca3ce9a 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
 DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
 
 extern void init_espfix_bsp(void);
-extern void init_espfix_ap(void);
+extern void init_espfix_ap(int cpu);
 
 #endif /* CONFIG_X86_64 */
 
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730..e397583 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -131,25 +131,24 @@ void __init init_espfix_bsp(void)
 	init_espfix_random();
 
 	/* The rest is the same as for any other processor */
-	init_espfix_ap();
+	init_espfix_ap(0);
 }
 
-void init_espfix_ap(void)
+void init_espfix_ap(int cpu)
 {
-	unsigned int cpu, page;
+	unsigned int page;
 	unsigned long addr;
 	pud_t pud, *pud_p;
 	pmd_t pmd, *pmd_p;
 	pte_t pte, *pte_p;
-	int n;
+	int n, node;
 	void *stack_page;
 	pteval_t ptemask;
 
 	/* We only have to do this once... */
-	if (likely(this_cpu_read(espfix_stack)))
+	if (likely(per_cpu(espfix_stack, cpu)))
 		return;		/* Already initialized */
 
-	cpu = smp_processor_id();
 	addr = espfix_base_addr(cpu);
 	page = cpu/ESPFIX_STACKS_PER_PAGE;
 
@@ -165,12 +164,15 @@ void init_espfix_ap(void)
 	if (stack_page)
 		goto unlock_done;
 
+	node = cpu_to_node(cpu);
 	ptemask = __supported_pte_mask;
 
 	pud_p = &espfix_pud_page[pud_index(addr)];
 	pud = *pud_p;
 	if (!pud_present(pud)) {
-		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+		struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+		pmd_p = (pmd_t *)page_address(page);
 		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
 		paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
 		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
@@ -180,7 +182,9 @@ void init_espfix_ap(void)
 	pmd_p = pmd_offset(&pud, addr);
 	pmd = *pmd_p;
 	if (!pmd_present(pmd)) {
-		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+		struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+		pte_p = (pte_t *)page_address(page);
 		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
 		paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
 		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
@@ -188,7 +192,7 @@ void init_espfix_ap(void)
 	}
 
 	pte_p = pte_offset_kernel(&pmd, addr);
-	stack_page = (void *)__get_free_page(GFP_KERNEL);
+	stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
 	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
 	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
 		set_pte(&pte_p[n*PTE_STRIDE], pte);
@@ -199,7 +203,7 @@ void init_espfix_ap(void)
 unlock_done:
 	mutex_unlock(&espfix_init_mutex);
 done:
-	this_cpu_write(espfix_stack, addr);
-	this_cpu_write(espfix_waddr, (unsigned long)stack_page
-		       + (addr & ~PAGE_MASK));
+	per_cpu(espfix_stack, cpu) = addr;
+	per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+		       + (addr & ~PAGE_MASK);
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 50e547e..e9fdd0e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
 	check_tsc_sync_target();
 
 	/*
-	 * Enable the espfix hack for this CPU
-	 */
-#ifdef CONFIG_X86_ESPFIX64
-	init_espfix_ap();
-#endif
-
-	/*
 	 * We need to hold vector_lock so there the set of online cpus
 	 * does not change while we are assigning vectors to cpus.  Holding
 	 * this lock ensures we don't half assign or remove an irq from a cpu.
@@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 		}
 	}
 
+	/*
+	 * Enable the espfix hack for this CPU
+	 */
+#ifdef CONFIG_X86_ESPFIX64
+	init_espfix_ap(cpu);
+#endif
+
 	/* mark "stuck" area as not stuck */
 	*trampoline_status = 0;
 
-- 
1.7.7



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH V1] x86, espfix: postpone the initialization of espfix stack for AP
  2015-06-04  9:45                         ` [PATCH V1] " Gu Zheng
@ 2015-06-17  5:53                           ` Zhu Guihua
  2015-06-17  7:27                           ` H. Peter Anvin
  1 sibling, 0 replies; 23+ messages in thread
From: Zhu Guihua @ 2015-06-17  5:53 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andy Lutomirski, H. Peter Anvin, Andy Lutomirski,
	Borislav Petkov, Thomas Gleixner, linux-kernel, X86 ML

Any feedback about this?

On 06/04/2015 05:45 PM, Gu Zheng wrote:
> The following lockdep warning occurrs when running with latest kernel:
> [    3.178000] ------------[ cut here ]------------
> [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
> [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
> [    3.199000] Modules linked in:
>
> [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
> [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
> [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
> [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
> [    3.246000] Call Trace:
> [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
> [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
> [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
> [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
> [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
> [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
> [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
> [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
> [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
> [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
> [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
> [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
> [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
>
> As we alloc pages with GFP_KERNEL in init_espfix_ap() which is called
> before enabled local irq, and the lockdep sub-system considers this
> behaviour as allocating memory with GFP_FS with local irq disabled,
> then trigger the warning as mentioned about.
>
> Though we could allocate them on the boot CPU side and hand them over to
> the secondary CPU, but it seemes a bit waste if some of cpus are offline.
> As thers is no need to these pages(espfix stack) until we try to run user
> code, so we postpone the initialization of espfix stack, and let the boot
> up routine init the espfix stack for the target cpu after it booted to
> avoid the noise.
>
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> ---
> v1:
>    Alloc the page on the node the target CPU is on.
> RFC:
>    Let the boot up routine init the espfix stack for the target cpu after it
>    booted.
> ---
> ---
>   arch/x86/include/asm/espfix.h |    2 +-
>   arch/x86/kernel/espfix_64.c   |   28 ++++++++++++++++------------
>   arch/x86/kernel/smpboot.c     |   14 +++++++-------
>   3 files changed, 24 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
> index 99efebb..ca3ce9a 100644
> --- a/arch/x86/include/asm/espfix.h
> +++ b/arch/x86/include/asm/espfix.h
> @@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
>   DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
>   
>   extern void init_espfix_bsp(void);
> -extern void init_espfix_ap(void);
> +extern void init_espfix_ap(int cpu);
>   
>   #endif /* CONFIG_X86_64 */
>   
> diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
> index f5d0730..e397583 100644
> --- a/arch/x86/kernel/espfix_64.c
> +++ b/arch/x86/kernel/espfix_64.c
> @@ -131,25 +131,24 @@ void __init init_espfix_bsp(void)
>   	init_espfix_random();
>   
>   	/* The rest is the same as for any other processor */
> -	init_espfix_ap();
> +	init_espfix_ap(0);
>   }
>   
> -void init_espfix_ap(void)
> +void init_espfix_ap(int cpu)
>   {
> -	unsigned int cpu, page;
> +	unsigned int page;
>   	unsigned long addr;
>   	pud_t pud, *pud_p;
>   	pmd_t pmd, *pmd_p;
>   	pte_t pte, *pte_p;
> -	int n;
> +	int n, node;
>   	void *stack_page;
>   	pteval_t ptemask;
>   
>   	/* We only have to do this once... */
> -	if (likely(this_cpu_read(espfix_stack)))
> +	if (likely(per_cpu(espfix_stack, cpu)))
>   		return;		/* Already initialized */
>   
> -	cpu = smp_processor_id();
>   	addr = espfix_base_addr(cpu);
>   	page = cpu/ESPFIX_STACKS_PER_PAGE;
>   
> @@ -165,12 +164,15 @@ void init_espfix_ap(void)
>   	if (stack_page)
>   		goto unlock_done;
>   
> +	node = cpu_to_node(cpu);
>   	ptemask = __supported_pte_mask;
>   
>   	pud_p = &espfix_pud_page[pud_index(addr)];
>   	pud = *pud_p;
>   	if (!pud_present(pud)) {
> -		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
> +		struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
> +
> +		pmd_p = (pmd_t *)page_address(page);
>   		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
>   		paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
>   		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
> @@ -180,7 +182,9 @@ void init_espfix_ap(void)
>   	pmd_p = pmd_offset(&pud, addr);
>   	pmd = *pmd_p;
>   	if (!pmd_present(pmd)) {
> -		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
> +		struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
> +
> +		pte_p = (pte_t *)page_address(page);
>   		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
>   		paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
>   		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
> @@ -188,7 +192,7 @@ void init_espfix_ap(void)
>   	}
>   
>   	pte_p = pte_offset_kernel(&pmd, addr);
> -	stack_page = (void *)__get_free_page(GFP_KERNEL);
> +	stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
>   	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
>   	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
>   		set_pte(&pte_p[n*PTE_STRIDE], pte);
> @@ -199,7 +203,7 @@ void init_espfix_ap(void)
>   unlock_done:
>   	mutex_unlock(&espfix_init_mutex);
>   done:
> -	this_cpu_write(espfix_stack, addr);
> -	this_cpu_write(espfix_waddr, (unsigned long)stack_page
> -		       + (addr & ~PAGE_MASK));
> +	per_cpu(espfix_stack, cpu) = addr;
> +	per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
> +		       + (addr & ~PAGE_MASK);
>   }
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 50e547e..e9fdd0e 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
>   	check_tsc_sync_target();
>   
>   	/*
> -	 * Enable the espfix hack for this CPU
> -	 */
> -#ifdef CONFIG_X86_ESPFIX64
> -	init_espfix_ap();
> -#endif
> -
> -	/*
>   	 * We need to hold vector_lock so there the set of online cpus
>   	 * does not change while we are assigning vectors to cpus.  Holding
>   	 * this lock ensures we don't half assign or remove an irq from a cpu.
> @@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
>   		}
>   	}
>   
> +	/*
> +	 * Enable the espfix hack for this CPU
> +	 */
> +#ifdef CONFIG_X86_ESPFIX64
> +	init_espfix_ap(cpu);
> +#endif
> +
>   	/* mark "stuck" area as not stuck */
>   	*trampoline_status = 0;
>   


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V1] x86, espfix: postpone the initialization of espfix stack for AP
  2015-06-04  9:45                         ` [PATCH V1] " Gu Zheng
  2015-06-17  5:53                           ` Zhu Guihua
@ 2015-06-17  7:27                           ` H. Peter Anvin
  2015-06-17 21:04                             ` Borislav Petkov
  1 sibling, 1 reply; 23+ messages in thread
From: H. Peter Anvin @ 2015-06-17  7:27 UTC (permalink / raw)
  To: Gu Zheng, Ingo Molnar
  Cc: Andy Lutomirski, Andy Lutomirski, Borislav Petkov,
	Thomas Gleixner, linux-kernel, X86 ML

On 06/04/2015 02:45 AM, Gu Zheng wrote:
> The following lockdep warning occurrs when running with latest kernel:
> [    3.178000] ------------[ cut here ]------------
> [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
> [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
> [    3.199000] Modules linked in:
> 
> [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
> [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
> [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
> [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
> [    3.246000] Call Trace:
> [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
> [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
> [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
> [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
> [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
> [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
> [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
> [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
> [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
> [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
> [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
> [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
> [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
> 
> As we alloc pages with GFP_KERNEL in init_espfix_ap() which is called
> before enabled local irq, and the lockdep sub-system considers this
> behaviour as allocating memory with GFP_FS with local irq disabled,
> then trigger the warning as mentioned about.
> 
> Though we could allocate them on the boot CPU side and hand them over to
> the secondary CPU, but it seemes a bit waste if some of cpus are offline.
> As thers is no need to these pages(espfix stack) until we try to run user
> code, so we postpone the initialization of espfix stack, and let the boot
> up routine init the espfix stack for the target cpu after it booted to
> avoid the noise.
> 

It isn't *at all* obvious to me at least that if the GFP_KERNEL
allocation fails we may not get rescheduled on another CPU and/or get stuck.

I'm starting to think that the right thing to do is to allocate these on
the CPU that is bringing up the other CPU, at the same time we allocate
the percpu area.  This won't affect offline CPUs.

	-hpa


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V1] x86, espfix: postpone the initialization of espfix stack for AP
  2015-06-17  7:27                           ` H. Peter Anvin
@ 2015-06-17 21:04                             ` Borislav Petkov
  2015-06-17 21:11                               ` H. Peter Anvin
  0 siblings, 1 reply; 23+ messages in thread
From: Borislav Petkov @ 2015-06-17 21:04 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Gu Zheng, Ingo Molnar, Andy Lutomirski, Andy Lutomirski,
	Thomas Gleixner, linux-kernel, X86 ML

On Wed, Jun 17, 2015 at 12:27:05AM -0700, H. Peter Anvin wrote:
> On 06/04/2015 02:45 AM, Gu Zheng wrote:
> > The following lockdep warning occurrs when running with latest kernel:
> > [    3.178000] ------------[ cut here ]------------
> > [    3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
> > [    3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
> > [    3.199000] Modules linked in:
> > 
> > [    3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
> > [    3.221000]  0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
> > [    3.230000]  0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
> > [    3.238000]  ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
> > [    3.246000] Call Trace:
> > [    3.249000]  [<ffffffff81773f0a>] dump_stack+0x4c/0x65
> > [    3.255000]  [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
> > [    3.261000]  [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
> > [    3.268000]  [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
> > [    3.274000]  [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
> > [    3.281000]  [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
> > [    3.288000]  [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
> > [    3.295000]  [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
> > [    3.301000]  [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
> > [    3.308000]  [<ffffffff811c869e>] __get_free_pages+0xe/0x50
> > [    3.314000]  [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
> > [    3.320000]  [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
> > [    3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
> > 
> > As we alloc pages with GFP_KERNEL in init_espfix_ap() which is called
> > before enabled local irq, and the lockdep sub-system considers this
> > behaviour as allocating memory with GFP_FS with local irq disabled,
> > then trigger the warning as mentioned about.
> > 
> > Though we could allocate them on the boot CPU side and hand them over to
> > the secondary CPU, but it seemes a bit waste if some of cpus are offline.
> > As thers is no need to these pages(espfix stack) until we try to run user
> > code, so we postpone the initialization of espfix stack, and let the boot
> > up routine init the espfix stack for the target cpu after it booted to
> > avoid the noise.
> > 
> 
> It isn't *at all* obvious to me at least that if the GFP_KERNEL
> allocation fails we may not get rescheduled on another CPU and/or get stuck.
> 
> I'm starting to think that the right thing to do is to allocate these on
> the CPU that is bringing up the other CPU, at the same time we allocate
> the percpu area.  This won't affect offline CPUs.

Btw, as part of experimenting for something else, I was able to trigger
this even on a guest here. It is an insane guest though: 16 NUMA nodes,
with 8 cores each:

[    0.032000] ------------[ cut here ]------------
[    0.032000] WARNING: CPU: 64 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0x10c/0x120()
[    0.032000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
[    0.032000] Modules linked in:
[    0.032000] CPU: 64 PID: 0 Comm: swapper/64 Not tainted 4.1.0-rc3+ #4
[    0.032000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
[    0.032000]  ffffffff818dd1a1 ffff880006a1fca8 ffffffff816a9685 0000000000000000
[    0.032000]  ffff880006a1fcf8 ffff880006a1fce8 ffffffff81058585 00000000001d74c0
[    0.032000]  0000000000000080 0000000000000046 ffff880047ffcd00 ffff88003e804058
[    0.032000] Call Trace:
[    0.032000]  [<ffffffff816a9685>] dump_stack+0x4f/0x7b
[    0.032000]  [<ffffffff81058585>] warn_slowpath_common+0x95/0xe0
[    0.032000]  [<ffffffff81058616>] warn_slowpath_fmt+0x46/0x50
[    0.032000]  [<ffffffff810a662c>] lockdep_trace_alloc+0x10c/0x120
[    0.032000]  [<ffffffff8113d6ed>] __alloc_pages_nodemask+0xad/0xab0
[    0.032000]  [<ffffffff813442d7>] ? debug_smp_processor_id+0x17/0x20
[    0.032000]  [<ffffffff810a370e>] ? put_lock_stats.isra.19+0xe/0x30
[    0.032000]  [<ffffffff816ae288>] ? mutex_lock_nested+0x2e8/0x420
[    0.032000]  [<ffffffff8117e0cc>] alloc_page_interleave+0x3c/0x90
[    0.032000]  [<ffffffff8117e995>] alloc_pages_current+0xc5/0xd0
[    0.032000]  [<ffffffff81138734>] __get_free_pages+0x14/0x50
[    0.032000]  [<ffffffff8100a484>] init_espfix_ap.part.5+0x164/0x270
[    0.032000]  [<ffffffff8100a5b1>] init_espfix_ap+0x21/0x30
[    0.032000]  [<ffffffff8103cd28>] start_secondary+0xe8/0x180
[    0.032000] ---[ end trace 6a7abdb28fbb7667 ]---

Now I can test the future fix too. :)

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V1] x86, espfix: postpone the initialization of espfix stack for AP
  2015-06-17 21:04                             ` Borislav Petkov
@ 2015-06-17 21:11                               ` H. Peter Anvin
  2015-06-17 21:50                                 ` Borislav Petkov
  0 siblings, 1 reply; 23+ messages in thread
From: H. Peter Anvin @ 2015-06-17 21:11 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Gu Zheng, Ingo Molnar, Andy Lutomirski, Andy Lutomirski,
	Thomas Gleixner, linux-kernel, X86 ML

On 06/17/2015 02:04 PM, Borislav Petkov wrote:
>>
>> It isn't *at all* obvious to me at least that if the GFP_KERNEL
>> allocation fails we may not get rescheduled on another CPU and/or get stuck.
>>
>> I'm starting to think that the right thing to do is to allocate these on
>> the CPU that is bringing up the other CPU, at the same time we allocate
>> the percpu area.  This won't affect offline CPUs.
> 
> Btw, as part of experimenting for something else, I was able to trigger
> this even on a guest here. It is an insane guest though: 16 NUMA nodes,
> with 8 cores each:
> 

Is this reliable?

	-hpa


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V1] x86, espfix: postpone the initialization of espfix stack for AP
  2015-06-17 21:11                               ` H. Peter Anvin
@ 2015-06-17 21:50                                 ` Borislav Petkov
  0 siblings, 0 replies; 23+ messages in thread
From: Borislav Petkov @ 2015-06-17 21:50 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Gu Zheng, Ingo Molnar, Andy Lutomirski, Andy Lutomirski,
	Thomas Gleixner, linux-kernel, X86 ML

On Wed, Jun 17, 2015 at 02:11:27PM -0700, H. Peter Anvin wrote:
> Is this reliable?

10 out of 10 :)

I'm booting with:

...
-smp 128
-numa node,nodeid=0,cpus=0-7
-numa node,nodeid=1,cpus=8-15
-numa node,nodeid=2,cpus=16-23
-numa node,nodeid=3,cpus=24-31
-numa node,nodeid=4,cpus=32-39
-numa node,nodeid=5,cpus=40-47
-numa node,nodeid=6,cpus=48-55
-numa node,nodeid=7,cpus=56-63
-numa node,nodeid=8,cpus=64-71
-numa node,nodeid=9,cpus=72-79
-numa node,nodeid=10,cpus=80-87
-numa node,nodeid=11,cpus=88-95
-numa node,nodeid=12,cpus=96-103
-numa node,nodeid=13,cpus=104-111
-numa node,nodeid=14,cpus=112-119
-numa node,nodeid=15,cpus=120-127
...

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2015-06-17 21:50 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-14 11:37 [RFC PATCH] x86, espfix: use spin_lock rather than mutex Gu Zheng
2015-05-14 12:26 ` Borislav Petkov
2015-05-14 18:29   ` Ingo Molnar
2015-05-14 21:27     ` Borislav Petkov
2015-05-14 22:13       ` H. Peter Anvin
2015-05-15  6:54         ` Ingo Molnar
2015-05-15  7:27           ` H. Peter Anvin
2015-05-18 19:43             ` Andy Lutomirski
2015-05-19 15:04               ` H. Peter Anvin
2015-05-22 10:13                 ` [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP Gu Zheng
2015-05-28  1:20                   ` Gu Zheng
2015-05-29  1:07                     ` Andy Lutomirski
2015-05-29  0:57                       ` Gu Zheng
2015-06-02  9:23                       ` Gu Zheng
2015-06-02  9:25                       ` [RFC PATCH V2] " Gu Zheng
2015-06-02 11:59                         ` Ingo Molnar
2015-06-03  9:58                           ` Gu Zheng
2015-06-04  9:45                         ` [PATCH V1] " Gu Zheng
2015-06-17  5:53                           ` Zhu Guihua
2015-06-17  7:27                           ` H. Peter Anvin
2015-06-17 21:04                             ` Borislav Petkov
2015-06-17 21:11                               ` H. Peter Anvin
2015-06-17 21:50                                 ` Borislav Petkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).