All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-07 16:14 ` Kirill Tkhai
  0 siblings, 0 replies; 18+ messages in thread
From: Kirill Tkhai @ 2018-02-07 16:14 UTC (permalink / raw)
  To: tglx, mingo, hpa, aryabinin, glider, dvyukov, luto, bp, jpoimboe,
	dave.hansen, jgross, kirill.shutemov, keescook, minipli, gregkh,
	kstewart, linux-kernel, kasan-dev, linux-mm

Sometimes the irq stack gets corrupted while
an innocent callback function is being
executed. This may happen because of crappy
driver irq handlers that access the wrong
memory on the irq stack.

This patch aims to catch such situations
by adding checks for unauthorized stack access.

Every time we enter an interrupt, we check
irq_count and allow irq stack usage. After
the last nested irq handler exits, we prohibit
access again.

I made the x86_unpoison_irq_stack() and x86_poison_irq_stack()
calls unconditional, because making them conditional
would require changing the order of incl PER_CPU_VAR(irq_count)
and UNWIND_HINT_REGS(), and I'm not sure that is
legitimate. So, irq_count is checked inside
x86_unpoison_irq_stack() and x86_poison_irq_stack().

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 arch/x86/entry/entry_64.S        |    6 ++++++
 arch/x86/include/asm/processor.h |    6 ++++++
 arch/x86/kernel/irq_64.c         |   13 +++++++++++++
 include/linux/kasan.h            |    3 +++
 mm/kasan/kasan.c                 |   16 ++++++++++++++++
 5 files changed, 44 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 741d9877b357..1e9d69de2528 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -485,6 +485,9 @@ END(irq_entries_start)
  * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
  */
 .macro ENTER_IRQ_STACK regs=1 old_rsp
+#ifdef CONFIG_KASAN
+	call	x86_unpoison_irq_stack
+#endif
 	DEBUG_ENTRY_ASSERT_IRQS_OFF
 	movq	%rsp, \old_rsp
 
@@ -552,6 +555,9 @@ END(irq_entries_start)
 	 */
 
 	decl	PER_CPU_VAR(irq_count)
+#ifdef CONFIG_KASAN
+	call	x86_poison_irq_stack
+#endif
 .endm
 
 /*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 793bae7e7ce3..4353e3a85b0b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -404,6 +404,12 @@ union irq_stack_union {
 	};
 };
 
+#define KASAN_IRQ_STACK_SIZE \
+	(sizeof(union irq_stack_union) - \
+		(offsetof(union irq_stack_union, stack_canary) + 8))
+
+#define percpu_irq_stack_addr() this_cpu_ptr(irq_stack_union.irq_stack)
+
 DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
 DECLARE_INIT_PER_CPU(irq_stack_union);
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index d86e344f5b3d..ad78f4b3f0b5 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -77,3 +77,16 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 	generic_handle_irq_desc(desc);
 	return true;
 }
+
+#ifdef CONFIG_KASAN
+void __visible x86_poison_irq_stack(void)
+{
+	if (this_cpu_read(irq_count) == -1)
+		kasan_poison_irq_stack();
+}
+void __visible x86_unpoison_irq_stack(void)
+{
+	if (this_cpu_read(irq_count) == -1)
+		kasan_unpoison_irq_stack();
+}
+#endif
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index adc13474a53b..cb433f1bf178 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -40,6 +40,9 @@ void kasan_unpoison_shadow(const void *address, size_t size);
 void kasan_unpoison_task_stack(struct task_struct *task);
 void kasan_unpoison_stack_above_sp_to(const void *watermark);
 
+void kasan_poison_irq_stack(void);
+void kasan_unpoison_irq_stack(void);
+
 void kasan_alloc_pages(struct page *page, unsigned int order);
 void kasan_free_pages(struct page *page, unsigned int order);
 
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 0d9d9d268f32..9bc150c87205 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -412,6 +412,22 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
 			KASAN_KMALLOC_REDZONE);
 }
 
+#ifdef KASAN_IRQ_STACK_SIZE
+void kasan_poison_irq_stack(void)
+{
+	void *stack = percpu_irq_stack_addr();
+
+	kasan_poison_shadow(stack, KASAN_IRQ_STACK_SIZE, KASAN_GLOBAL_REDZONE);
+}
+
+void kasan_unpoison_irq_stack(void)
+{
+	void *stack = percpu_irq_stack_addr();
+
+	kasan_unpoison_shadow(stack, KASAN_IRQ_STACK_SIZE);
+}
+#endif /* KASAN_IRQ_STACK_SIZE */
+
 static inline int in_irqentry_text(unsigned long ptr)
 {
 	return (ptr >= (unsigned long)&__irqentry_text_start &&

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-07 16:14 ` Kirill Tkhai
  0 siblings, 0 replies; 18+ messages in thread
From: Kirill Tkhai @ 2018-02-07 16:14 UTC (permalink / raw)
  To: tglx, mingo, hpa, aryabinin, glider, dvyukov, luto, bp, jpoimboe,
	dave.hansen, jgross, kirill.shutemov, keescook, minipli, gregkh,
	kstewart, linux-kernel, kasan-dev, linux-mm

Sometimes the irq stack gets corrupted while
an innocent callback function is being
executed. This may happen because of crappy
driver irq handlers that access the wrong
memory on the irq stack.

This patch aims to catch such situations
by adding checks for unauthorized stack access.

Every time we enter an interrupt, we check
irq_count and allow irq stack usage. After
the last nested irq handler exits, we prohibit
access again.

I made the x86_unpoison_irq_stack() and x86_poison_irq_stack()
calls unconditional, because making them conditional
would require changing the order of incl PER_CPU_VAR(irq_count)
and UNWIND_HINT_REGS(), and I'm not sure that is
legitimate. So, irq_count is checked inside
x86_unpoison_irq_stack() and x86_poison_irq_stack().

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 arch/x86/entry/entry_64.S        |    6 ++++++
 arch/x86/include/asm/processor.h |    6 ++++++
 arch/x86/kernel/irq_64.c         |   13 +++++++++++++
 include/linux/kasan.h            |    3 +++
 mm/kasan/kasan.c                 |   16 ++++++++++++++++
 5 files changed, 44 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 741d9877b357..1e9d69de2528 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -485,6 +485,9 @@ END(irq_entries_start)
  * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
  */
 .macro ENTER_IRQ_STACK regs=1 old_rsp
+#ifdef CONFIG_KASAN
+	call	x86_unpoison_irq_stack
+#endif
 	DEBUG_ENTRY_ASSERT_IRQS_OFF
 	movq	%rsp, \old_rsp
 
@@ -552,6 +555,9 @@ END(irq_entries_start)
 	 */
 
 	decl	PER_CPU_VAR(irq_count)
+#ifdef CONFIG_KASAN
+	call	x86_poison_irq_stack
+#endif
 .endm
 
 /*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 793bae7e7ce3..4353e3a85b0b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -404,6 +404,12 @@ union irq_stack_union {
 	};
 };
 
+#define KASAN_IRQ_STACK_SIZE \
+	(sizeof(union irq_stack_union) - \
+		(offsetof(union irq_stack_union, stack_canary) + 8))
+
+#define percpu_irq_stack_addr() this_cpu_ptr(irq_stack_union.irq_stack)
+
 DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
 DECLARE_INIT_PER_CPU(irq_stack_union);
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index d86e344f5b3d..ad78f4b3f0b5 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -77,3 +77,16 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 	generic_handle_irq_desc(desc);
 	return true;
 }
+
+#ifdef CONFIG_KASAN
+void __visible x86_poison_irq_stack(void)
+{
+	if (this_cpu_read(irq_count) == -1)
+		kasan_poison_irq_stack();
+}
+void __visible x86_unpoison_irq_stack(void)
+{
+	if (this_cpu_read(irq_count) == -1)
+		kasan_unpoison_irq_stack();
+}
+#endif
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index adc13474a53b..cb433f1bf178 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -40,6 +40,9 @@ void kasan_unpoison_shadow(const void *address, size_t size);
 void kasan_unpoison_task_stack(struct task_struct *task);
 void kasan_unpoison_stack_above_sp_to(const void *watermark);
 
+void kasan_poison_irq_stack(void);
+void kasan_unpoison_irq_stack(void);
+
 void kasan_alloc_pages(struct page *page, unsigned int order);
 void kasan_free_pages(struct page *page, unsigned int order);
 
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 0d9d9d268f32..9bc150c87205 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -412,6 +412,22 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
 			KASAN_KMALLOC_REDZONE);
 }
 
+#ifdef KASAN_IRQ_STACK_SIZE
+void kasan_poison_irq_stack(void)
+{
+	void *stack = percpu_irq_stack_addr();
+
+	kasan_poison_shadow(stack, KASAN_IRQ_STACK_SIZE, KASAN_GLOBAL_REDZONE);
+}
+
+void kasan_unpoison_irq_stack(void)
+{
+	void *stack = percpu_irq_stack_addr();
+
+	kasan_unpoison_shadow(stack, KASAN_IRQ_STACK_SIZE);
+}
+#endif /* KASAN_IRQ_STACK_SIZE */
+
 static inline int in_irqentry_text(unsigned long ptr)
 {
 	return (ptr >= (unsigned long)&__irqentry_text_start &&

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
  2018-02-07 16:14 ` Kirill Tkhai
@ 2018-02-07 18:38   ` Dave Hansen
  -1 siblings, 0 replies; 18+ messages in thread
From: Dave Hansen @ 2018-02-07 18:38 UTC (permalink / raw)
  To: Kirill Tkhai, tglx, mingo, hpa, aryabinin, glider, dvyukov, luto,
	bp, jpoimboe, jgross, kirill.shutemov, keescook, minipli, gregkh,
	kstewart, linux-kernel, kasan-dev, linux-mm

On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
> Sometimes it is possible to meet a situation,
> when irq stack is corrupted, while innocent
> callback function is being executed. This may
> happen because of crappy drivers irq handlers,
> when they access wrong memory on the irq stack.

Can you be more clear about the actual issue?  Which drivers do this?
How do they even find an IRQ stack pointer?

> This patch aims to catch such the situations
> and adds checks of unauthorized stack access.

I think I forgot how KASAN did this.  KASAN has metadata that says which
areas of memory are good or bad to access, right?  So, this just tags
IRQ stacks as bad when we are not _in_ an interrupt?

> +#define KASAN_IRQ_STACK_SIZE \
> +	(sizeof(union irq_stack_union) - \
> +		(offsetof(union irq_stack_union, stack_canary) + 8))

Just curious, but why leave out the canary?  It shouldn't be accessed
either.

> +#ifdef CONFIG_KASAN
> +void __visible x86_poison_irq_stack(void)
> +{
> +	if (this_cpu_read(irq_count) == -1)
> +		kasan_poison_irq_stack();
> +}
> +void __visible x86_unpoison_irq_stack(void)
> +{
> +	if (this_cpu_read(irq_count) == -1)
> +		kasan_unpoison_irq_stack();
> +}
> +#endif

It might be handy to point out here that -1 means "not in an interrupt"
and >=0 means "in an interrupt".

Otherwise, this looks pretty straightforward.  Would it be something to
extend to the other stacks like the NMI or double-fault stacks?  Or are
those just not worth it?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-07 18:38   ` Dave Hansen
  0 siblings, 0 replies; 18+ messages in thread
From: Dave Hansen @ 2018-02-07 18:38 UTC (permalink / raw)
  To: Kirill Tkhai, tglx, mingo, hpa, aryabinin, glider, dvyukov, luto,
	bp, jpoimboe, jgross, kirill.shutemov, keescook, minipli, gregkh,
	kstewart, linux-kernel, kasan-dev, linux-mm

On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
> Sometimes it is possible to meet a situation,
> when irq stack is corrupted, while innocent
> callback function is being executed. This may
> happen because of crappy drivers irq handlers,
> when they access wrong memory on the irq stack.

Can you be more clear about the actual issue?  Which drivers do this?
How do they even find an IRQ stack pointer?

> This patch aims to catch such the situations
> and adds checks of unauthorized stack access.

I think I forgot how KASAN did this.  KASAN has metadata that says which
areas of memory are good or bad to access, right?  So, this just tags
IRQ stacks as bad when we are not _in_ an interrupt?

> +#define KASAN_IRQ_STACK_SIZE \
> +	(sizeof(union irq_stack_union) - \
> +		(offsetof(union irq_stack_union, stack_canary) + 8))

Just curious, but why leave out the canary?  It shouldn't be accessed
either.

> +#ifdef CONFIG_KASAN
> +void __visible x86_poison_irq_stack(void)
> +{
> +	if (this_cpu_read(irq_count) == -1)
> +		kasan_poison_irq_stack();
> +}
> +void __visible x86_unpoison_irq_stack(void)
> +{
> +	if (this_cpu_read(irq_count) == -1)
> +		kasan_unpoison_irq_stack();
> +}
> +#endif

It might be handy to point out here that -1 means "not in an interrupt"
and >=0 means "in an interrupt".

Otherwise, this looks pretty straightforward.  Would it be something to
extend to the other stacks like the NMI or double-fault stacks?  Or are
those just not worth it?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
  2018-02-07 18:38   ` Dave Hansen
@ 2018-02-07 19:31     ` Dmitry Vyukov
  -1 siblings, 0 replies; 18+ messages in thread
From: Dmitry Vyukov @ 2018-02-07 19:31 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Kirill Tkhai, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Andrey Ryabinin, Alexander Potapenko, Andy Lutomirski,
	Borislav Petkov, Josh Poimboeuf, Juergen Gross,
	Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On Wed, Feb 7, 2018 at 7:38 PM, Dave Hansen <dave.hansen@linux.intel.com> wrote:
> On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
>> Sometimes it is possible to meet a situation,
>> when irq stack is corrupted, while innocent
>> callback function is being executed. This may
>> happen because of crappy drivers irq handlers,
>> when they access wrong memory on the irq stack.
>
> Can you be more clear about the actual issue?  Which drivers do this?
> How do they even find an IRQ stack pointer?
>
>> This patch aims to catch such the situations
>> and adds checks of unauthorized stack access.
>
> I think I forgot how KASAN did this.  KASAN has metadata that says which
> areas of memory are good or bad to access, right?  So, this just tags
> IRQ stacks as bad when we are not _in_ an interrupt?

Correct.
kasan_poison/unpoison_shadow effectively memset separate "shadow"
memory range, which is then checked by memory accesses to understand
if it's OK to access corresponding memory.


>> +#define KASAN_IRQ_STACK_SIZE \
>> +     (sizeof(union irq_stack_union) - \
>> +             (offsetof(union irq_stack_union, stack_canary) + 8))
>
> Just curious, but why leave out the canary?  It shouldn't be accessed
> either.
>
>> +#ifdef CONFIG_KASAN
>> +void __visible x86_poison_irq_stack(void)
>> +{
>> +     if (this_cpu_read(irq_count) == -1)
>> +             kasan_poison_irq_stack();
>> +}
>> +void __visible x86_unpoison_irq_stack(void)
>> +{
>> +     if (this_cpu_read(irq_count) == -1)
>> +             kasan_unpoison_irq_stack();
>> +}
>> +#endif
>
> It might be handy to point out here that -1 means "not in an interrupt"
> and >=0 means "in an interrupt".
>
> Otherwise, this looks pretty straightforward.  Would it be something to
> extend to the other stacks like the NMI or double-fault stacks?  Or are
> those just not worth it?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-07 19:31     ` Dmitry Vyukov
  0 siblings, 0 replies; 18+ messages in thread
From: Dmitry Vyukov @ 2018-02-07 19:31 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Kirill Tkhai, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Andrey Ryabinin, Alexander Potapenko, Andy Lutomirski,
	Borislav Petkov, Josh Poimboeuf, Juergen Gross,
	Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On Wed, Feb 7, 2018 at 7:38 PM, Dave Hansen <dave.hansen@linux.intel.com> wrote:
> On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
>> Sometimes it is possible to meet a situation,
>> when irq stack is corrupted, while innocent
>> callback function is being executed. This may
>> happen because of crappy drivers irq handlers,
>> when they access wrong memory on the irq stack.
>
> Can you be more clear about the actual issue?  Which drivers do this?
> How do they even find an IRQ stack pointer?
>
>> This patch aims to catch such the situations
>> and adds checks of unauthorized stack access.
>
> I think I forgot how KASAN did this.  KASAN has metadata that says which
> areas of memory are good or bad to access, right?  So, this just tags
> IRQ stacks as bad when we are not _in_ an interrupt?

Correct.
kasan_poison/unpoison_shadow effectively memset separate "shadow"
memory range, which is then checked by memory accesses to understand
if it's OK to access corresponding memory.


>> +#define KASAN_IRQ_STACK_SIZE \
>> +     (sizeof(union irq_stack_union) - \
>> +             (offsetof(union irq_stack_union, stack_canary) + 8))
>
> Just curious, but why leave out the canary?  It shouldn't be accessed
> either.
>
>> +#ifdef CONFIG_KASAN
>> +void __visible x86_poison_irq_stack(void)
>> +{
>> +     if (this_cpu_read(irq_count) == -1)
>> +             kasan_poison_irq_stack();
>> +}
>> +void __visible x86_unpoison_irq_stack(void)
>> +{
>> +     if (this_cpu_read(irq_count) == -1)
>> +             kasan_unpoison_irq_stack();
>> +}
>> +#endif
>
> It might be handy to point out here that -1 means "not in an interrupt"
> and >=0 means "in an interrupt".
>
> Otherwise, this looks pretty straightforward.  Would it be something to
> extend to the other stacks like the NMI or double-fault stacks?  Or are
> those just not worth it?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
  2018-02-07 18:38   ` Dave Hansen
@ 2018-02-08 10:03     ` Kirill Tkhai
  -1 siblings, 0 replies; 18+ messages in thread
From: Kirill Tkhai @ 2018-02-08 10:03 UTC (permalink / raw)
  To: Dave Hansen, tglx, mingo, hpa, aryabinin, glider, dvyukov, luto,
	bp, jpoimboe, jgross, kirill.shutemov, keescook, minipli, gregkh,
	kstewart, linux-kernel, kasan-dev, linux-mm

On 07.02.2018 21:38, Dave Hansen wrote:
> On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
>> Sometimes it is possible to meet a situation,
>> when irq stack is corrupted, while innocent
>> callback function is being executed. This may
>> happen because of crappy drivers irq handlers,
>> when they access wrong memory on the irq stack.
> 
> Can you be more clear about the actual issue?  Which drivers do this?
> How do they even find an IRQ stack pointer?

I can't name the actual driver doing this, because I'm still investigating which one is guilty.
But I have a couple of crash dumps with the crash inside the update_sd_lb_stats() function,
where the stack variable sg becomes corrupted. At that time all scheduler-related non-stack
variables are in an ideal state. And update_sd_lb_stats() is a function which can't
corrupt its own stack. So, I thought this functionality may be useful for something else,
especially because the irq stack is one of the last stacks which are not sanitized.
Task stacks are already covered, as far as I know.

[1595450.678971] Call Trace:
[1595450.683991]  <IRQ>
[1595450.684038]
[1595450.688926]  [<ffffffff81320005>] cpumask_next_and+0x35/0x50
[1595450.693984]  [<ffffffff810d91d3>] find_busiest_group+0x143/0x950
[1595450.699088]  [<ffffffff810d9b7a>] load_balance+0x19a/0xc20
[1595450.704289]  [<ffffffff810cde55>] ? sched_clock_cpu+0x85/0xc0
[1595450.709457]  [<ffffffff810c29aa>] ? update_rq_clock.part.88+0x1a/0x150
[1595450.714711]  [<ffffffff810da770>] rebalance_domains+0x170/0x2b0
[1595450.719997]  [<ffffffff810da9d2>] run_rebalance_domains+0x122/0x1e0
[1595450.725321]  [<ffffffff816bb10f>] __do_softirq+0x10f/0x2aa
[1595450.730746]  [<ffffffff816b62ac>] call_softirq+0x1c/0x30
[1595450.736169]  [<ffffffff8102d325>] do_softirq+0x65/0xa0
[1595450.741754]  [<ffffffff81093ec5>] irq_exit+0x105/0x110
[1595450.747279]  [<ffffffff816baad2>] smp_apic_timer_interrupt+0x42/0x50
[1595450.752905]  [<ffffffff816b7a62>] apic_timer_interrupt+0x232/0x240
[1595450.758519]  <EOI>
[1595450.758569]
[1595450.764100]  [<ffffffff8152f282>] ? cpuidle_enter_state+0x52/0xc0
[1595450.769652]  [<ffffffff8152f3c8>] cpuidle_idle_call+0xd8/0x210
[1595450.775198]  [<ffffffff8103540e>] arch_cpu_idle+0xe/0x30
[1595450.780813]  [<ffffffff810effba>] cpu_startup_entry+0x14a/0x1c0
[1595450.786286]  [<ffffffff810523e6>] start_secondary+0x1d6/0x250

>> This patch aims to catch such the situations
>> and adds checks of unauthorized stack access.
> 
> I think I forgot how KASAN did this.  KASAN has metadata that says which
> areas of memory are good or bad to access, right?  So, this just tags
> IRQ stacks as bad when we are not _in_ an interrupt?
> 
>> +#define KASAN_IRQ_STACK_SIZE \
>> +	(sizeof(union irq_stack_union) - \
>> +		(offsetof(union irq_stack_union, stack_canary) + 8))
> 
> Just curious, but why leave out the canary?  It shouldn't be accessed
> either.

It's touched in several more places (e.g., in __switch_to_asm()), and I'm not
sure KASAN is OK with this. Is it?

Also gs_base is touched from load_percpu_segment(), which could be called from
a different cpu, and it seems this would require some synchronization between
the handlers and this primitive.

>> +#ifdef CONFIG_KASAN
>> +void __visible x86_poison_irq_stack(void)
>> +{
>> +	if (this_cpu_read(irq_count) == -1)
>> +		kasan_poison_irq_stack();
>> +}
>> +void __visible x86_unpoison_irq_stack(void)
>> +{
>> +	if (this_cpu_read(irq_count) == -1)
>> +		kasan_unpoison_irq_stack();
>> +}
>> +#endif
> 
> It might be handy to point out here that -1 means "not in an interrupt"
> and >=0 means "in an interrupt".
> 
> Otherwise, this looks pretty straightforward.  Would it be something to
> extend to the other stacks like the NMI or double-fault stacks?  Or are
> those just not worth it
I haven't seen the NMI stack get corrupted, so I don't have any ideas about this. If
we need to check those stacks too, one more patch should be introduced on top of
this.

Kirill

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-08 10:03     ` Kirill Tkhai
  0 siblings, 0 replies; 18+ messages in thread
From: Kirill Tkhai @ 2018-02-08 10:03 UTC (permalink / raw)
  To: Dave Hansen, tglx, mingo, hpa, aryabinin, glider, dvyukov, luto,
	bp, jpoimboe, jgross, kirill.shutemov, keescook, minipli, gregkh,
	kstewart, linux-kernel, kasan-dev, linux-mm

On 07.02.2018 21:38, Dave Hansen wrote:
> On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
>> Sometimes it is possible to meet a situation,
>> when irq stack is corrupted, while innocent
>> callback function is being executed. This may
>> happen because of crappy drivers irq handlers,
>> when they access wrong memory on the irq stack.
> 
> Can you be more clear about the actual issue?  Which drivers do this?
> How do they even find an IRQ stack pointer?

I can't name the actual driver doing this, because I'm still investigating which one is guilty.
But I have a couple of crash dumps with the crash inside the update_sd_lb_stats() function,
where the stack variable sg becomes corrupted. At that time all scheduler-related non-stack
variables are in an ideal state. And update_sd_lb_stats() is a function which can't
corrupt its own stack. So, I thought this functionality may be useful for something else,
especially because the irq stack is one of the last stacks which are not sanitized.
Task stacks are already covered, as far as I know.

[1595450.678971] Call Trace:
[1595450.683991]  <IRQ>
[1595450.684038]
[1595450.688926]  [<ffffffff81320005>] cpumask_next_and+0x35/0x50
[1595450.693984]  [<ffffffff810d91d3>] find_busiest_group+0x143/0x950
[1595450.699088]  [<ffffffff810d9b7a>] load_balance+0x19a/0xc20
[1595450.704289]  [<ffffffff810cde55>] ? sched_clock_cpu+0x85/0xc0
[1595450.709457]  [<ffffffff810c29aa>] ? update_rq_clock.part.88+0x1a/0x150
[1595450.714711]  [<ffffffff810da770>] rebalance_domains+0x170/0x2b0
[1595450.719997]  [<ffffffff810da9d2>] run_rebalance_domains+0x122/0x1e0
[1595450.725321]  [<ffffffff816bb10f>] __do_softirq+0x10f/0x2aa
[1595450.730746]  [<ffffffff816b62ac>] call_softirq+0x1c/0x30
[1595450.736169]  [<ffffffff8102d325>] do_softirq+0x65/0xa0
[1595450.741754]  [<ffffffff81093ec5>] irq_exit+0x105/0x110
[1595450.747279]  [<ffffffff816baad2>] smp_apic_timer_interrupt+0x42/0x50
[1595450.752905]  [<ffffffff816b7a62>] apic_timer_interrupt+0x232/0x240
[1595450.758519]  <EOI>
[1595450.758569]
[1595450.764100]  [<ffffffff8152f282>] ? cpuidle_enter_state+0x52/0xc0
[1595450.769652]  [<ffffffff8152f3c8>] cpuidle_idle_call+0xd8/0x210
[1595450.775198]  [<ffffffff8103540e>] arch_cpu_idle+0xe/0x30
[1595450.780813]  [<ffffffff810effba>] cpu_startup_entry+0x14a/0x1c0
[1595450.786286]  [<ffffffff810523e6>] start_secondary+0x1d6/0x250

>> This patch aims to catch such the situations
>> and adds checks of unauthorized stack access.
> 
> I think I forgot how KASAN did this.  KASAN has metadata that says which
> areas of memory are good or bad to access, right?  So, this just tags
> IRQ stacks as bad when we are not _in_ an interrupt?
> 
>> +#define KASAN_IRQ_STACK_SIZE \
>> +	(sizeof(union irq_stack_union) - \
>> +		(offsetof(union irq_stack_union, stack_canary) + 8))
> 
> Just curious, but why leave out the canary?  It shouldn't be accessed
> either.

It's touched in several more places (e.g., in __switch_to_asm()), and I'm not
sure KASAN is OK with this. Is it?

Also gs_base is touched from load_percpu_segment(), which could be called from
a different cpu, and it seems this would require some synchronization between
the handlers and this primitive.

>> +#ifdef CONFIG_KASAN
>> +void __visible x86_poison_irq_stack(void)
>> +{
>> +	if (this_cpu_read(irq_count) == -1)
>> +		kasan_poison_irq_stack();
>> +}
>> +void __visible x86_unpoison_irq_stack(void)
>> +{
>> +	if (this_cpu_read(irq_count) == -1)
>> +		kasan_unpoison_irq_stack();
>> +}
>> +#endif
> 
> It might be handy to point out here that -1 means "not in an interrupt"
> and >=0 means "in an interrupt".
> 
> Otherwise, this looks pretty straightforward.  Would it be something to
> extend to the other stacks like the NMI or double-fault stacks?  Or are
> those just not worth it
I haven't seen the NMI stack get corrupted, so I don't have any ideas about this. If
we need to check those stacks too, one more patch should be introduced on top of
this.

Kirill

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
  2018-02-08 10:03     ` Kirill Tkhai
@ 2018-02-08 16:30       ` Josh Poimboeuf
  -1 siblings, 0 replies; 18+ messages in thread
From: Josh Poimboeuf @ 2018-02-08 16:30 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: Dave Hansen, tglx, mingo, hpa, aryabinin, glider, dvyukov, luto,
	bp, jgross, kirill.shutemov, keescook, minipli, gregkh, kstewart,
	linux-kernel, kasan-dev, linux-mm

On Thu, Feb 08, 2018 at 01:03:49PM +0300, Kirill Tkhai wrote:
> On 07.02.2018 21:38, Dave Hansen wrote:
> > On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
> >> Sometimes it is possible to meet a situation,
> >> when irq stack is corrupted, while innocent
> >> callback function is being executed. This may
> >> happen because of crappy drivers irq handlers,
> >> when they access wrong memory on the irq stack.
> > 
> > Can you be more clear about the actual issue?  Which drivers do this?
> > How do they even find an IRQ stack pointer?
> 
> I can't say actual driver making this, because I'm still investigating the guilty one.
> But I have couple of crash dumps with the crash inside update_sd_lb_stats() function,
> where stack variable sg becomes corrupted. This time all scheduler-related not-stack
> variables are in ideal state. And update_sd_lb_stats() is the function, which can't
> corrupt its own stack. So, I thought this functionality may be useful for something else,
> especially because of irq stack is one of the last stacks, which are not sanitized.
> Task's stacks are already covered, as I know
> 
> [1595450.678971] Call Trace:
> [1595450.683991]  <IRQ>
> [1595450.684038]
> [1595450.688926]  [<ffffffff81320005>] cpumask_next_and+0x35/0x50
> [1595450.693984]  [<ffffffff810d91d3>] find_busiest_group+0x143/0x950
> [1595450.699088]  [<ffffffff810d9b7a>] load_balance+0x19a/0xc20
> [1595450.704289]  [<ffffffff810cde55>] ? sched_clock_cpu+0x85/0xc0
> [1595450.709457]  [<ffffffff810c29aa>] ? update_rq_clock.part.88+0x1a/0x150
> [1595450.714711]  [<ffffffff810da770>] rebalance_domains+0x170/0x2b0
> [1595450.719997]  [<ffffffff810da9d2>] run_rebalance_domains+0x122/0x1e0
> [1595450.725321]  [<ffffffff816bb10f>] __do_softirq+0x10f/0x2aa
> [1595450.730746]  [<ffffffff816b62ac>] call_softirq+0x1c/0x30
> [1595450.736169]  [<ffffffff8102d325>] do_softirq+0x65/0xa0
> [1595450.741754]  [<ffffffff81093ec5>] irq_exit+0x105/0x110
> [1595450.747279]  [<ffffffff816baad2>] smp_apic_timer_interrupt+0x42/0x50
> [1595450.752905]  [<ffffffff816b7a62>] apic_timer_interrupt+0x232/0x240
> [1595450.758519]  <EOI>
> [1595450.758569]
> [1595450.764100]  [<ffffffff8152f282>] ? cpuidle_enter_state+0x52/0xc0
> [1595450.769652]  [<ffffffff8152f3c8>] cpuidle_idle_call+0xd8/0x210
> [1595450.775198]  [<ffffffff8103540e>] arch_cpu_idle+0xe/0x30
> [1595450.780813]  [<ffffffff810effba>] cpu_startup_entry+0x14a/0x1c0
> [1595450.786286]  [<ffffffff810523e6>] start_secondary+0x1d6/0x250

I'm not seeing how this patch would help.  If you're running on the irq
stack, the *entire* irq stack would be unpoisoned.  So there's still no
KASAN protection.  Or am I missing something?

Seems like it would be more useful for KASAN to detect redzone accesses
on the irq stack (if it's not doing that already).

-- 
Josh

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-08 16:30       ` Josh Poimboeuf
  0 siblings, 0 replies; 18+ messages in thread
From: Josh Poimboeuf @ 2018-02-08 16:30 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: Dave Hansen, tglx, mingo, hpa, aryabinin, glider, dvyukov, luto,
	bp, jgross, kirill.shutemov, keescook, minipli, gregkh, kstewart,
	linux-kernel, kasan-dev, linux-mm

On Thu, Feb 08, 2018 at 01:03:49PM +0300, Kirill Tkhai wrote:
> On 07.02.2018 21:38, Dave Hansen wrote:
> > On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
> >> Sometimes it is possible to meet a situation,
> >> when irq stack is corrupted, while innocent
> >> callback function is being executed. This may
> >> happen because of crappy drivers irq handlers,
> >> when they access wrong memory on the irq stack.
> > 
> > Can you be more clear about the actual issue?  Which drivers do this?
> > How do they even find an IRQ stack pointer?
> 
> I can't say actual driver making this, because I'm still investigating the guilty one.
> But I have couple of crash dumps with the crash inside update_sd_lb_stats() function,
> where stack variable sg becomes corrupted. This time all scheduler-related not-stack
> variables are in ideal state. And update_sd_lb_stats() is the function, which can't
> corrupt its own stack. So, I thought this functionality may be useful for something else,
> especially because of irq stack is one of the last stacks, which are not sanitized.
> Task's stacks are already covered, as I know
> 
> [1595450.678971] Call Trace:
> [1595450.683991]  <IRQ>
> [1595450.684038]
> [1595450.688926]  [<ffffffff81320005>] cpumask_next_and+0x35/0x50
> [1595450.693984]  [<ffffffff810d91d3>] find_busiest_group+0x143/0x950
> [1595450.699088]  [<ffffffff810d9b7a>] load_balance+0x19a/0xc20
> [1595450.704289]  [<ffffffff810cde55>] ? sched_clock_cpu+0x85/0xc0
> [1595450.709457]  [<ffffffff810c29aa>] ? update_rq_clock.part.88+0x1a/0x150
> [1595450.714711]  [<ffffffff810da770>] rebalance_domains+0x170/0x2b0
> [1595450.719997]  [<ffffffff810da9d2>] run_rebalance_domains+0x122/0x1e0
> [1595450.725321]  [<ffffffff816bb10f>] __do_softirq+0x10f/0x2aa
> [1595450.730746]  [<ffffffff816b62ac>] call_softirq+0x1c/0x30
> [1595450.736169]  [<ffffffff8102d325>] do_softirq+0x65/0xa0
> [1595450.741754]  [<ffffffff81093ec5>] irq_exit+0x105/0x110
> [1595450.747279]  [<ffffffff816baad2>] smp_apic_timer_interrupt+0x42/0x50
> [1595450.752905]  [<ffffffff816b7a62>] apic_timer_interrupt+0x232/0x240
> [1595450.758519]  <EOI>
> [1595450.758569]
> [1595450.764100]  [<ffffffff8152f282>] ? cpuidle_enter_state+0x52/0xc0
> [1595450.769652]  [<ffffffff8152f3c8>] cpuidle_idle_call+0xd8/0x210
> [1595450.775198]  [<ffffffff8103540e>] arch_cpu_idle+0xe/0x30
> [1595450.780813]  [<ffffffff810effba>] cpu_startup_entry+0x14a/0x1c0
> [1595450.786286]  [<ffffffff810523e6>] start_secondary+0x1d6/0x250

I'm not seeing how this patch would help.  If you're running on the irq
stack, the *entire* irq stack would be unpoisoned.  So there's still no
KASAN protection.  Or am I missing something?

Seems like it would be more useful for KASAN to detect redzone accesses
on the irq stack (if it's not doing that already).

-- 
Josh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
  2018-02-08 16:30       ` Josh Poimboeuf
@ 2018-02-08 16:41         ` Dmitry Vyukov
  -1 siblings, 0 replies; 18+ messages in thread
From: Dmitry Vyukov @ 2018-02-08 16:41 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Kirill Tkhai, Dave Hansen, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Andrey Ryabinin, Alexander Potapenko,
	Andy Lutomirski, Borislav Petkov, Juergen Gross,
	Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On Thu, Feb 8, 2018 at 5:30 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Thu, Feb 08, 2018 at 01:03:49PM +0300, Kirill Tkhai wrote:
>> On 07.02.2018 21:38, Dave Hansen wrote:
>> > On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
>> >> Sometimes it is possible to meet a situation,
>> >> when irq stack is corrupted, while innocent
>> >> callback function is being executed. This may
>> >> happen because of crappy drivers irq handlers,
>> >> when they access wrong memory on the irq stack.
>> >
>> > Can you be more clear about the actual issue?  Which drivers do this?
>> > How do they even find an IRQ stack pointer?
>>
>> I can't say actual driver making this, because I'm still investigating the guilty one.
>> But I have couple of crash dumps with the crash inside update_sd_lb_stats() function,
>> where stack variable sg becomes corrupted. This time all scheduler-related not-stack
>> variables are in ideal state. And update_sd_lb_stats() is the function, which can't
>> corrupt its own stack. So, I thought this functionality may be useful for something else,
>> especially because of irq stack is one of the last stacks, which are not sanitized.
>> Task's stacks are already covered, as I know
>>
>> [1595450.678971] Call Trace:
>> [1595450.683991]  <IRQ>
>> [1595450.684038]
>> [1595450.688926]  [<ffffffff81320005>] cpumask_next_and+0x35/0x50
>> [1595450.693984]  [<ffffffff810d91d3>] find_busiest_group+0x143/0x950
>> [1595450.699088]  [<ffffffff810d9b7a>] load_balance+0x19a/0xc20
>> [1595450.704289]  [<ffffffff810cde55>] ? sched_clock_cpu+0x85/0xc0
>> [1595450.709457]  [<ffffffff810c29aa>] ? update_rq_clock.part.88+0x1a/0x150
>> [1595450.714711]  [<ffffffff810da770>] rebalance_domains+0x170/0x2b0
>> [1595450.719997]  [<ffffffff810da9d2>] run_rebalance_domains+0x122/0x1e0
>> [1595450.725321]  [<ffffffff816bb10f>] __do_softirq+0x10f/0x2aa
>> [1595450.730746]  [<ffffffff816b62ac>] call_softirq+0x1c/0x30
>> [1595450.736169]  [<ffffffff8102d325>] do_softirq+0x65/0xa0
>> [1595450.741754]  [<ffffffff81093ec5>] irq_exit+0x105/0x110
>> [1595450.747279]  [<ffffffff816baad2>] smp_apic_timer_interrupt+0x42/0x50
>> [1595450.752905]  [<ffffffff816b7a62>] apic_timer_interrupt+0x232/0x240
>> [1595450.758519]  <EOI>
>> [1595450.758569]
>> [1595450.764100]  [<ffffffff8152f282>] ? cpuidle_enter_state+0x52/0xc0
>> [1595450.769652]  [<ffffffff8152f3c8>] cpuidle_idle_call+0xd8/0x210
>> [1595450.775198]  [<ffffffff8103540e>] arch_cpu_idle+0xe/0x30
>> [1595450.780813]  [<ffffffff810effba>] cpu_startup_entry+0x14a/0x1c0
>> [1595450.786286]  [<ffffffff810523e6>] start_secondary+0x1d6/0x250
>
> I'm not seeing how this patch would help.  If you're running on the irq
> stack, the *entire* irq stack would be unpoisoned.  So there's still no
> KASAN protection.  Or am I missing something?
>
> Seems like it would be more useful for KASAN to detect redzone accesses
> on the irq stack (if it's not doing that already).

KASAN should do this already (unless there is something terribly
broken). Compiler instrumentation adds redzones around all stack
variables and injects code to poison/unpoison these redzones on
function entry/exit.
KASAN can also detect use-after-scope bugs for stack variables, but
this requires a more recent gcc (6 or 7, don't remember exactly now)
and CONFIG_KASAN_EXTRA since recently.
User-space ASAN can also detect so called use-after-return bugs
(dangling references to stack variables), but this requires manual
management of stack frames and quarantine for stack frames. This is
more tricky to do inside of kernel, so this was never implemented in
KASAN. KASAN can still detect some of these, if the dangling reference
happens to point to a redzone in a new frame.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-08 16:41         ` Dmitry Vyukov
  0 siblings, 0 replies; 18+ messages in thread
From: Dmitry Vyukov @ 2018-02-08 16:41 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Kirill Tkhai, Dave Hansen, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Andrey Ryabinin, Alexander Potapenko,
	Andy Lutomirski, Borislav Petkov, Juergen Gross,
	Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On Thu, Feb 8, 2018 at 5:30 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Thu, Feb 08, 2018 at 01:03:49PM +0300, Kirill Tkhai wrote:
>> On 07.02.2018 21:38, Dave Hansen wrote:
>> > On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
>> >> Sometimes it is possible to meet a situation,
>> >> when irq stack is corrupted, while innocent
>> >> callback function is being executed. This may
>> >> happen because of crappy drivers irq handlers,
>> >> when they access wrong memory on the irq stack.
>> >
>> > Can you be more clear about the actual issue?  Which drivers do this?
>> > How do they even find an IRQ stack pointer?
>>
>> I can't say actual driver making this, because I'm still investigating the guilty one.
>> But I have couple of crash dumps with the crash inside update_sd_lb_stats() function,
>> where stack variable sg becomes corrupted. This time all scheduler-related not-stack
>> variables are in ideal state. And update_sd_lb_stats() is the function, which can't
>> corrupt its own stack. So, I thought this functionality may be useful for something else,
>> especially because of irq stack is one of the last stacks, which are not sanitized.
>> Task's stacks are already covered, as I know
>>
>> [1595450.678971] Call Trace:
>> [1595450.683991]  <IRQ>
>> [1595450.684038]
>> [1595450.688926]  [<ffffffff81320005>] cpumask_next_and+0x35/0x50
>> [1595450.693984]  [<ffffffff810d91d3>] find_busiest_group+0x143/0x950
>> [1595450.699088]  [<ffffffff810d9b7a>] load_balance+0x19a/0xc20
>> [1595450.704289]  [<ffffffff810cde55>] ? sched_clock_cpu+0x85/0xc0
>> [1595450.709457]  [<ffffffff810c29aa>] ? update_rq_clock.part.88+0x1a/0x150
>> [1595450.714711]  [<ffffffff810da770>] rebalance_domains+0x170/0x2b0
>> [1595450.719997]  [<ffffffff810da9d2>] run_rebalance_domains+0x122/0x1e0
>> [1595450.725321]  [<ffffffff816bb10f>] __do_softirq+0x10f/0x2aa
>> [1595450.730746]  [<ffffffff816b62ac>] call_softirq+0x1c/0x30
>> [1595450.736169]  [<ffffffff8102d325>] do_softirq+0x65/0xa0
>> [1595450.741754]  [<ffffffff81093ec5>] irq_exit+0x105/0x110
>> [1595450.747279]  [<ffffffff816baad2>] smp_apic_timer_interrupt+0x42/0x50
>> [1595450.752905]  [<ffffffff816b7a62>] apic_timer_interrupt+0x232/0x240
>> [1595450.758519]  <EOI>
>> [1595450.758569]
>> [1595450.764100]  [<ffffffff8152f282>] ? cpuidle_enter_state+0x52/0xc0
>> [1595450.769652]  [<ffffffff8152f3c8>] cpuidle_idle_call+0xd8/0x210
>> [1595450.775198]  [<ffffffff8103540e>] arch_cpu_idle+0xe/0x30
>> [1595450.780813]  [<ffffffff810effba>] cpu_startup_entry+0x14a/0x1c0
>> [1595450.786286]  [<ffffffff810523e6>] start_secondary+0x1d6/0x250
>
> I'm not seeing how this patch would help.  If you're running on the irq
> stack, the *entire* irq stack would be unpoisoned.  So there's still no
> KASAN protection.  Or am I missing something?
>
> Seems like it would be more useful for KASAN to detect redzone accesses
> on the irq stack (if it's not doing that already).

KASAN should do this already (unless there is something terribly
broken). Compiler instrumentation adds redzones around all stack
variables and injects code to poison/unpoison these redzones on
function entry/exit.
KASAN can also detect use-after-scope bugs for stack variables, but
this requires a more recent gcc (6 or 7, don't remember exactly now)
and CONFIG_KASAN_EXTRA since recently.
User-space ASAN can also detect so called use-after-return bugs
(dangling references to stack variables), but this requires manual
management of stack frames and quarantine for stack frames. This is
more tricky to do inside of kernel, so this was never implemented in
KASAN. KASAN can still detect some of these, if the dangling reference
happens to point to a redzone in a new frame.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
  2018-02-08 16:41         ` Dmitry Vyukov
@ 2018-02-08 17:20           ` Josh Poimboeuf
  -1 siblings, 0 replies; 18+ messages in thread
From: Josh Poimboeuf @ 2018-02-08 17:20 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Kirill Tkhai, Dave Hansen, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Andrey Ryabinin, Alexander Potapenko,
	Andy Lutomirski, Borislav Petkov, Juergen Gross,
	Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On Thu, Feb 08, 2018 at 05:41:19PM +0100, Dmitry Vyukov wrote:
> On Thu, Feb 8, 2018 at 5:30 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > On Thu, Feb 08, 2018 at 01:03:49PM +0300, Kirill Tkhai wrote:
> >> On 07.02.2018 21:38, Dave Hansen wrote:
> >> > On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
> >> >> Sometimes it is possible to meet a situation,
> >> >> when irq stack is corrupted, while innocent
> >> >> callback function is being executed. This may
> >> >> happen because of crappy drivers irq handlers,
> >> >> when they access wrong memory on the irq stack.
> >> >
> >> > Can you be more clear about the actual issue?  Which drivers do this?
> >> > How do they even find an IRQ stack pointer?
> >>
> >> I can't say actual driver making this, because I'm still investigating the guilty one.
> >> But I have couple of crash dumps with the crash inside update_sd_lb_stats() function,
> >> where stack variable sg becomes corrupted. This time all scheduler-related not-stack
> >> variables are in ideal state. And update_sd_lb_stats() is the function, which can't
> >> corrupt its own stack. So, I thought this functionality may be useful for something else,
> >> especially because of irq stack is one of the last stacks, which are not sanitized.
> >> Task's stacks are already covered, as I know
> >>
> >> [1595450.678971] Call Trace:
> >> [1595450.683991]  <IRQ>
> >> [1595450.684038]
> >> [1595450.688926]  [<ffffffff81320005>] cpumask_next_and+0x35/0x50
> >> [1595450.693984]  [<ffffffff810d91d3>] find_busiest_group+0x143/0x950
> >> [1595450.699088]  [<ffffffff810d9b7a>] load_balance+0x19a/0xc20
> >> [1595450.704289]  [<ffffffff810cde55>] ? sched_clock_cpu+0x85/0xc0
> >> [1595450.709457]  [<ffffffff810c29aa>] ? update_rq_clock.part.88+0x1a/0x150
> >> [1595450.714711]  [<ffffffff810da770>] rebalance_domains+0x170/0x2b0
> >> [1595450.719997]  [<ffffffff810da9d2>] run_rebalance_domains+0x122/0x1e0
> >> [1595450.725321]  [<ffffffff816bb10f>] __do_softirq+0x10f/0x2aa
> >> [1595450.730746]  [<ffffffff816b62ac>] call_softirq+0x1c/0x30
> >> [1595450.736169]  [<ffffffff8102d325>] do_softirq+0x65/0xa0
> >> [1595450.741754]  [<ffffffff81093ec5>] irq_exit+0x105/0x110
> >> [1595450.747279]  [<ffffffff816baad2>] smp_apic_timer_interrupt+0x42/0x50
> >> [1595450.752905]  [<ffffffff816b7a62>] apic_timer_interrupt+0x232/0x240
> >> [1595450.758519]  <EOI>
> >> [1595450.758569]
> >> [1595450.764100]  [<ffffffff8152f282>] ? cpuidle_enter_state+0x52/0xc0
> >> [1595450.769652]  [<ffffffff8152f3c8>] cpuidle_idle_call+0xd8/0x210
> >> [1595450.775198]  [<ffffffff8103540e>] arch_cpu_idle+0xe/0x30
> >> [1595450.780813]  [<ffffffff810effba>] cpu_startup_entry+0x14a/0x1c0
> >> [1595450.786286]  [<ffffffff810523e6>] start_secondary+0x1d6/0x250
> >
> > I'm not seeing how this patch would help.  If you're running on the irq
> > stack, the *entire* irq stack would be unpoisoned.  So there's still no
> > KASAN protection.  Or am I missing something?
> >
> > Seems like it would be more useful for KASAN to detect redzone accesses
> > on the irq stack (if it's not doing that already).
> 
> KASAN should do this already (unless there is something terribly
> broken). Compiler instrumentation adds redzones around all stack
> variables and injects code to poison/unpoison these redzones on
> function entry/exit.
> KASAN can also detect use-after-scope bugs for stack variables, but
> this requires a more recent gcc (6 or 7, don't remember exactly now)
> and CONFIG_KASAN_EXTRA since recently.
> User-space ASAN can also detect so called use-after-return bugs
> (dangling references to stack variables), but this requires manual
> management of stack frames and quarantine for stack frames. This is
> more tricky to do inside of kernel, so this was never implemented in
> KASAN. KASAN still can detect some of these, if it will happen so that
> the dangling reference happen to point to a redzone in a new frame.

Ok, that's good.  And it seems this patch doesn't change that.

So it looks like the purpose of the patch is to protect the irq stack
from code which is *not* running on the irq stack.  Which seems a bit
far-fetched and theoretical.  Though I don't see any harm in it.

The patch description is confusing.  It talks about "crappy drivers irq
handlers when they access wrong memory on the stack".  But if I
understand correctly, the patch doesn't actually protect against that
case, because irq handlers run on the irq stack, and this patch only
affects code which *isn't* running on the irq stack.

-- 
Josh

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-08 17:20           ` Josh Poimboeuf
  0 siblings, 0 replies; 18+ messages in thread
From: Josh Poimboeuf @ 2018-02-08 17:20 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Kirill Tkhai, Dave Hansen, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Andrey Ryabinin, Alexander Potapenko,
	Andy Lutomirski, Borislav Petkov, Juergen Gross,
	Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On Thu, Feb 08, 2018 at 05:41:19PM +0100, Dmitry Vyukov wrote:
> On Thu, Feb 8, 2018 at 5:30 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > On Thu, Feb 08, 2018 at 01:03:49PM +0300, Kirill Tkhai wrote:
> >> On 07.02.2018 21:38, Dave Hansen wrote:
> >> > On 02/07/2018 08:14 AM, Kirill Tkhai wrote:
> >> >> Sometimes it is possible to meet a situation,
> >> >> when irq stack is corrupted, while innocent
> >> >> callback function is being executed. This may
> >> >> happen because of crappy drivers irq handlers,
> >> >> when they access wrong memory on the irq stack.
> >> >
> >> > Can you be more clear about the actual issue?  Which drivers do this?
> >> > How do they even find an IRQ stack pointer?
> >>
> >> I can't say actual driver making this, because I'm still investigating the guilty one.
> >> But I have couple of crash dumps with the crash inside update_sd_lb_stats() function,
> >> where stack variable sg becomes corrupted. This time all scheduler-related not-stack
> >> variables are in ideal state. And update_sd_lb_stats() is the function, which can't
> >> corrupt its own stack. So, I thought this functionality may be useful for something else,
> >> especially because of irq stack is one of the last stacks, which are not sanitized.
> >> Task's stacks are already covered, as I know
> >>
> >> [1595450.678971] Call Trace:
> >> [1595450.683991]  <IRQ>
> >> [1595450.684038]
> >> [1595450.688926]  [<ffffffff81320005>] cpumask_next_and+0x35/0x50
> >> [1595450.693984]  [<ffffffff810d91d3>] find_busiest_group+0x143/0x950
> >> [1595450.699088]  [<ffffffff810d9b7a>] load_balance+0x19a/0xc20
> >> [1595450.704289]  [<ffffffff810cde55>] ? sched_clock_cpu+0x85/0xc0
> >> [1595450.709457]  [<ffffffff810c29aa>] ? update_rq_clock.part.88+0x1a/0x150
> >> [1595450.714711]  [<ffffffff810da770>] rebalance_domains+0x170/0x2b0
> >> [1595450.719997]  [<ffffffff810da9d2>] run_rebalance_domains+0x122/0x1e0
> >> [1595450.725321]  [<ffffffff816bb10f>] __do_softirq+0x10f/0x2aa
> >> [1595450.730746]  [<ffffffff816b62ac>] call_softirq+0x1c/0x30
> >> [1595450.736169]  [<ffffffff8102d325>] do_softirq+0x65/0xa0
> >> [1595450.741754]  [<ffffffff81093ec5>] irq_exit+0x105/0x110
> >> [1595450.747279]  [<ffffffff816baad2>] smp_apic_timer_interrupt+0x42/0x50
> >> [1595450.752905]  [<ffffffff816b7a62>] apic_timer_interrupt+0x232/0x240
> >> [1595450.758519]  <EOI>
> >> [1595450.758569]
> >> [1595450.764100]  [<ffffffff8152f282>] ? cpuidle_enter_state+0x52/0xc0
> >> [1595450.769652]  [<ffffffff8152f3c8>] cpuidle_idle_call+0xd8/0x210
> >> [1595450.775198]  [<ffffffff8103540e>] arch_cpu_idle+0xe/0x30
> >> [1595450.780813]  [<ffffffff810effba>] cpu_startup_entry+0x14a/0x1c0
> >> [1595450.786286]  [<ffffffff810523e6>] start_secondary+0x1d6/0x250
> >
> > I'm not seeing how this patch would help.  If you're running on the irq
> > stack, the *entire* irq stack would be unpoisoned.  So there's still no
> > KASAN protection.  Or am I missing something?
> >
> > Seems like it would be more useful for KASAN to detect redzone accesses
> > on the irq stack (if it's not doing that already).
> 
> KASAN should do this already (unless there is something terribly
> broken). Compiler instrumentation adds redzones around all stack
> variables and injects code to poison/unpoison these redzones on
> function entry/exit.
> KASAN can also detect use-after-scope bugs for stack variables, but
> this requires a more recent gcc (6 or 7, don't remember exactly now)
> and CONFIG_KASAN_EXTRA since recently.
> User-space ASAN can also detect so called use-after-return bugs
> (dangling references to stack variables), but this requires manual
> management of stack frames and quarantine for stack frames. This is
> more tricky to do inside of kernel, so this was never implemented in
> KASAN. KASAN still can detect some of these, if it will happen so that
> the dangling reference happen to point to a redzone in a new frame.

Ok, that's good.  And it seems this patch doesn't change that.

So it looks like the purpose of the patch is to protect the irq stack
from code which is *not* running on the irq stack.  Which seems a bit
far-fetched and theoretical.  Though I don't see any harm in it.

The patch description is confusing.  It talks about "crappy drivers irq
handlers when they access wrong memory on the stack".  But if I
understand correctly, the patch doesn't actually protect against that
case, because irq handlers run on the irq stack, and this patch only
affects code which *isn't* running on the irq stack.

-- 
Josh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
  2018-02-08 17:20           ` Josh Poimboeuf
@ 2018-02-08 19:00             ` Matthew Wilcox
  -1 siblings, 0 replies; 18+ messages in thread
From: Matthew Wilcox @ 2018-02-08 19:00 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Dmitry Vyukov, Kirill Tkhai, Dave Hansen, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, Andrey Ryabinin,
	Alexander Potapenko, Andy Lutomirski, Borislav Petkov,
	Juergen Gross, Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On Thu, Feb 08, 2018 at 11:20:26AM -0600, Josh Poimboeuf wrote:
> The patch description is confusing.  It talks about "crappy drivers irq
> handlers when they access wrong memory on the stack".  But if I
> understand correctly, the patch doesn't actually protect against that
> case, because irq handlers run on the irq stack, and this patch only
> affects code which *isn't* running on the irq stack.

This would catch a crappy driver which allocates some memory on the
irq stack, squirrels the pointer to it away in a data structure, then
returns to process (or softirq) context and dereferences the pointer.

I have no idea if that's the case that Kirill is tracking down, but it's
something I can imagine someone doing.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-08 19:00             ` Matthew Wilcox
  0 siblings, 0 replies; 18+ messages in thread
From: Matthew Wilcox @ 2018-02-08 19:00 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Dmitry Vyukov, Kirill Tkhai, Dave Hansen, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, Andrey Ryabinin,
	Alexander Potapenko, Andy Lutomirski, Borislav Petkov,
	Juergen Gross, Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On Thu, Feb 08, 2018 at 11:20:26AM -0600, Josh Poimboeuf wrote:
> The patch description is confusing.  It talks about "crappy drivers irq
> handlers when they access wrong memory on the stack".  But if I
> understand correctly, the patch doesn't actually protect against that
> case, because irq handlers run on the irq stack, and this patch only
> affects code which *isn't* running on the irq stack.

This would catch a crappy driver which allocates some memory on the
irq stack, squirrels the pointer to it away in a data structure, then
returns to process (or softirq) context and dereferences the pointer.

I have no idea if that's the case that Kirill is tracking down, but it's
something I can imagine someone doing.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
  2018-02-08 19:00             ` Matthew Wilcox
@ 2018-02-09  8:53               ` Kirill Tkhai
  -1 siblings, 0 replies; 18+ messages in thread
From: Kirill Tkhai @ 2018-02-09  8:53 UTC (permalink / raw)
  To: Matthew Wilcox, Josh Poimboeuf
  Cc: Dmitry Vyukov, Dave Hansen, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Andrey Ryabinin, Alexander Potapenko,
	Andy Lutomirski, Borislav Petkov, Juergen Gross,
	Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On 08.02.2018 22:00, Matthew Wilcox wrote:
> On Thu, Feb 08, 2018 at 11:20:26AM -0600, Josh Poimboeuf wrote:
>> The patch description is confusing.  It talks about "crappy drivers irq
>> handlers when they access wrong memory on the stack".  But if I
>> understand correctly, the patch doesn't actually protect against that
>> case, because irq handlers run on the irq stack, and this patch only
>> affects code which *isn't* running on the irq stack.
> 
> This would catch a crappy driver which allocates some memory on the
> irq stack, squirrels the pointer to it away in a data structure, then
> returns to process (or softirq) context and dereferences the pointer.

Yes, this is exactly what I mean. The patch allows stack modifications
during interrupt time, and catches wrong accesses from other contexts/cpus
(when there is no interrupt executing in parallel).

It's possible to catch wrong accesses in interrupt time also, but we need
to unmap irq stacks on other cpus to do that, which is not a KASAN thing.

But, I hope we may be lucky and catch such situations even if we only check
for accesses that happen outside of interrupt time.

> I have no idea if that's the case that Kirill is tracking down, but it's
> something I can imagine someone doing.

Kirill

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access
@ 2018-02-09  8:53               ` Kirill Tkhai
  0 siblings, 0 replies; 18+ messages in thread
From: Kirill Tkhai @ 2018-02-09  8:53 UTC (permalink / raw)
  To: Matthew Wilcox, Josh Poimboeuf
  Cc: Dmitry Vyukov, Dave Hansen, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Andrey Ryabinin, Alexander Potapenko,
	Andy Lutomirski, Borislav Petkov, Juergen Gross,
	Kirill A. Shutemov, Kees Cook, Mathias Krause,
	Greg Kroah-Hartman, Kate Stewart, LKML, kasan-dev, Linux-MM

On 08.02.2018 22:00, Matthew Wilcox wrote:
> On Thu, Feb 08, 2018 at 11:20:26AM -0600, Josh Poimboeuf wrote:
>> The patch description is confusing.  It talks about "crappy drivers irq
>> handlers when they access wrong memory on the stack".  But if I
>> understand correctly, the patch doesn't actually protect against that
>> case, because irq handlers run on the irq stack, and this patch only
>> affects code which *isn't* running on the irq stack.
> 
> This would catch a crappy driver which allocates some memory on the
> irq stack, squirrels the pointer to it away in a data structure, then
> returns to process (or softirq) context and dereferences the pointer.

Yes, this is exactly what I mean. The patch allows stack modifications
during interrupt time, and catches wrong accesses from other contexts/cpus
(when there is no interrupt executing in parallel).

It's possible to catch wrong accesses in interrupt time also, but we need
to unmap irq stacks on other cpus to do that, which is not a KASAN thing.

But, I hope we may be lucky and catch such situations even if we only check
for accesses that happen outside of interrupt time.

> I have no idea if that's the case that Kirill is tracking down, but it's
> something I can imagine someone doing.

Kirill

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2018-02-09  8:53 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-07 16:14 [PATCH RFC] x86: KASAN: Sanitize unauthorized irq stack access Kirill Tkhai
2018-02-07 16:14 ` Kirill Tkhai
2018-02-07 18:38 ` Dave Hansen
2018-02-07 18:38   ` Dave Hansen
2018-02-07 19:31   ` Dmitry Vyukov
2018-02-07 19:31     ` Dmitry Vyukov
2018-02-08 10:03   ` Kirill Tkhai
2018-02-08 10:03     ` Kirill Tkhai
2018-02-08 16:30     ` Josh Poimboeuf
2018-02-08 16:30       ` Josh Poimboeuf
2018-02-08 16:41       ` Dmitry Vyukov
2018-02-08 16:41         ` Dmitry Vyukov
2018-02-08 17:20         ` Josh Poimboeuf
2018-02-08 17:20           ` Josh Poimboeuf
2018-02-08 19:00           ` Matthew Wilcox
2018-02-08 19:00             ` Matthew Wilcox
2018-02-09  8:53             ` Kirill Tkhai
2018-02-09  8:53               ` Kirill Tkhai

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.