* [PATCH] cobalt/intr: fix interrupt stat iterator
From: Philippe Gerum @ 2019-04-24 10:29 UTC
  To: xenomai

The implementation does not iterate fully over all CPU slots for every
handler, leaving some of them uncollected. In addition, if maxcpus= is
used to restrict the number of available CPUs to a subset of the
present ones, with the highest-numbered CPU marked offline, the
iterator rescans the first valid IRQ slot indefinitely, eventually
leading to memory corruption due to out-of-bounds writes.

Rewrite the iterator in order to implement the following loop as
expected:

for_each_irq(irq)
    for_each_handler_of_irq(handler, irq)
        for_each_online_cpu(cpu)
            collect_stats(irq, handler, cpu);
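
As an illustration, a minimal sketch of how that nested loop can be
made resumable across calls, one (irq, handler, cpu) triple per step.
All types and helpers below (first_handler_of(), next_handler(),
collect_stats()) are hypothetical stand-ins, not the Cobalt API:

struct iter_state {
	struct handler *prev;	/* current handler, NULL before first step */
	int cpu;		/* last CPU visited, -1 before first scan */
};

/* Collect stats for the next (handler, online cpu) pair of this IRQ,
 * saving enough state to resume from there on the next call. */
int query_next(int irq, struct iter_state *it)
{
	int cpu, nr_cpus = num_present_cpus();
	struct handler *h = it->prev;

	if (h == NULL) {	/* first step for this IRQ */
		h = first_handler_of(irq);
		if (h == NULL)
			return -ENODEV;
		it->prev = h;
		it->cpu = -1;
	}

	for (;;) {
		/* scan the remaining CPU slots of the current handler */
		for (cpu = it->cpu + 1; cpu < nr_cpus; ++cpu) {
			if (cpu_online(cpu)) {
				collect_stats(irq, h, cpu);
				it->cpu = cpu;	/* resume after this CPU */
				return 0;
			}
		}
		/* all CPUs visited, move to the next handler in the chain */
		h = it->prev = next_handler(h);
		if (h == NULL)
			return -ENODEV;	/* end of chain for this IRQ */
		it->cpu = -1;
	}
}
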
---
 kernel/cobalt/intr.c  | 48 +++++++++++++++++++------------------------
 kernel/cobalt/sched.c |  2 +-
 2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/kernel/cobalt/intr.c b/kernel/cobalt/intr.c
index ba6409ebb..9c3f55508 100644
--- a/kernel/cobalt/intr.c
+++ b/kernel/cobalt/intr.c
@@ -1015,7 +1015,6 @@ void xnintr_put_query_lock(void)
 
 int xnintr_query_init(struct xnintr_iterator *iterator)
 {
-	iterator->cpu = -1;
 	iterator->prev = NULL;
 
 	/* The order is important here: first xnintr_list_rev then
@@ -1039,43 +1038,38 @@ int xnintr_query_next(int irq, struct xnintr_iterator *iterator,
 	int cpu, nr_cpus = num_present_cpus();
 	struct xnintr *intr;
 
-	for (cpu = iterator->cpu + 1; cpu < nr_cpus; ++cpu) {
-		if (cpu_online(cpu))
-			break;
-	}
-	if (cpu == nr_cpus)
-		cpu = 0;
-	iterator->cpu = cpu;
-
 	if (iterator->list_rev != xnintr_list_rev)
 		return -EAGAIN;
 
-	if (!iterator->prev) {
+	intr = iterator->prev;
+	if (intr == NULL) {
 		if (xnintr_is_timer_irq(irq))
 			intr = &nktimer;
 		else
 			intr = xnintr_vec_first(irq);
-	} else
-		intr = xnintr_vec_next(iterator->prev);
-
-	if (intr == NULL) {
-		cpu = -1;
-		iterator->prev = NULL;
-		return -ENODEV;
+		if (intr == NULL)
+			return -ENODEV;
+		iterator->prev = intr;
+		iterator->cpu = -1;
 	}
 
-	ksformat(name_buf, XNOBJECT_NAME_LEN, "IRQ%d: %s", irq, intr->name);
+	for (;;) {
+		for (cpu = iterator->cpu + 1; cpu < nr_cpus; ++cpu) {
+			if (cpu_online(cpu)) {
+				ksformat(name_buf, XNOBJECT_NAME_LEN, "IRQ%d: %s",
+					irq, intr->name);
+				query_irqstats(intr, cpu, iterator);
+				iterator->cpu = cpu;
+				return 0;
+			}
+		}
 
-	query_irqstats(intr, cpu, iterator);
+		iterator->prev = xnintr_vec_next(intr);
+		if (iterator->prev == NULL)
+			return -ENODEV;
 
-	/*
-	 * Proceed to next entry in shared IRQ chain when all CPUs
-	 * have been visited for this one.
-	 */
-	if (cpu + 1 == nr_cpus)
-		iterator->prev = intr;
-
-	return 0;
+		iterator->cpu = -1;
+	}
 }
 
 #endif /* CONFIG_XENO_OPT_STATS */
diff --git a/kernel/cobalt/sched.c b/kernel/cobalt/sched.c
index f7d16fa37..9c7a729b8 100644
--- a/kernel/cobalt/sched.c
+++ b/kernel/cobalt/sched.c
@@ -1265,7 +1265,7 @@ static int vfile_schedstat_rewind(struct xnvfile_snapshot_iterator *it)
 	 */
 	priv->curr = list_first_entry(&nkthreadq, struct xnthread, glink);
 	priv->irq = 0;
-	irqnr = xnintr_query_init(&priv->intr_it) * NR_CPUS;
+	irqnr = xnintr_query_init(&priv->intr_it) * num_online_cpus();
 
 	return irqnr + cobalt_nrthreads;
 }
-- 
2.20.1




* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Jan Kiszka @ 2019-04-25 14:53 UTC
  To: Philippe Gerum, xenomai

On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
> The implementation does not iterate fully over all CPU slots for every
> handler, leaving some of them uncollected. In addition, if maxcpus= is
> used to restrict the number of available CPUs to a subset of the
> present ones, with the highest numbered CPU marked offline, the
> iterator rescans the first valid IRQ slot indefinitely, eventually
> leading to memory corruption due to out-of-bound writes.

The patch looks good and works fine for the common case, so I've applied it to next.

But I cannot reproduce your maxcpus setup because - for reasons yet to be
debugged - we are already crashing on 4.14 and 4.4 on x86 during boot. Did
you test on ARM?

Jan

> 
> Rewrite the iterator in order to implement the following loop as
> expected:
> 
> for_each_irq(irq)
>      for_each_handler_of_irq(handler, irq)
>      	for_each_online_cpu(cpu)
> 	    collect_stats(irq, handler, cpu);
> ---
>   kernel/cobalt/intr.c  | 48 +++++++++++++++++++------------------------
>   kernel/cobalt/sched.c |  2 +-
>   2 files changed, 22 insertions(+), 28 deletions(-)
> 
> diff --git a/kernel/cobalt/intr.c b/kernel/cobalt/intr.c
> index ba6409ebb..9c3f55508 100644
> --- a/kernel/cobalt/intr.c
> +++ b/kernel/cobalt/intr.c
> @@ -1015,7 +1015,6 @@ void xnintr_put_query_lock(void)
>   
>   int xnintr_query_init(struct xnintr_iterator *iterator)
>   {
> -	iterator->cpu = -1;
>   	iterator->prev = NULL;
>   
>   	/* The order is important here: first xnintr_list_rev then
> @@ -1039,43 +1038,38 @@ int xnintr_query_next(int irq, struct xnintr_iterator *iterator,
>   	int cpu, nr_cpus = num_present_cpus();
>   	struct xnintr *intr;
>   
> -	for (cpu = iterator->cpu + 1; cpu < nr_cpus; ++cpu) {
> -		if (cpu_online(cpu))
> -			break;
> -	}
> -	if (cpu == nr_cpus)
> -		cpu = 0;
> -	iterator->cpu = cpu;
> -
>   	if (iterator->list_rev != xnintr_list_rev)
>   		return -EAGAIN;
>   
> -	if (!iterator->prev) {
> +	intr = iterator->prev;
> +	if (intr == NULL) {
>   		if (xnintr_is_timer_irq(irq))
>   			intr = &nktimer;
>   		else
>   			intr = xnintr_vec_first(irq);
> -	} else
> -		intr = xnintr_vec_next(iterator->prev);
> -
> -	if (intr == NULL) {
> -		cpu = -1;
> -		iterator->prev = NULL;
> -		return -ENODEV;
> +		if (intr == NULL)
> +			return -ENODEV;
> +		iterator->prev = intr;
> +		iterator->cpu = -1;
>   	}
>   
> -	ksformat(name_buf, XNOBJECT_NAME_LEN, "IRQ%d: %s", irq, intr->name);
> +	for (;;) {
> +		for (cpu = iterator->cpu + 1; cpu < nr_cpus; ++cpu) {
> +			if (cpu_online(cpu)) {
> +				ksformat(name_buf, XNOBJECT_NAME_LEN, "IRQ%d: %s",
> +					irq, intr->name);
> +				query_irqstats(intr, cpu, iterator);
> +				iterator->cpu = cpu;
> +				return 0;
> +			}
> +		}
>   
> -	query_irqstats(intr, cpu, iterator);
> +		iterator->prev = xnintr_vec_next(intr);
> +		if (iterator->prev == NULL)
> +			return -ENODEV;
>   
> -	/*
> -	 * Proceed to next entry in shared IRQ chain when all CPUs
> -	 * have been visited for this one.
> -	 */
> -	if (cpu + 1 == nr_cpus)
> -		iterator->prev = intr;
> -
> -	return 0;
> +		iterator->cpu = -1;
> +	}
>   }
>   
>   #endif /* CONFIG_XENO_OPT_STATS */
> diff --git a/kernel/cobalt/sched.c b/kernel/cobalt/sched.c
> index f7d16fa37..9c7a729b8 100644
> --- a/kernel/cobalt/sched.c
> +++ b/kernel/cobalt/sched.c
> @@ -1265,7 +1265,7 @@ static int vfile_schedstat_rewind(struct xnvfile_snapshot_iterator *it)
>   	 */
>   	priv->curr = list_first_entry(&nkthreadq, struct xnthread, glink);
>   	priv->irq = 0;
> -	irqnr = xnintr_query_init(&priv->intr_it) * NR_CPUS;
> +	irqnr = xnintr_query_init(&priv->intr_it) * num_online_cpus();
>   
>   	return irqnr + cobalt_nrthreads;
>   }
> 


-- 
Siemens AG, Corporate Technology, CT RDA IOT SES-DE
Corporate Competence Center Embedded Linux



* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Jan Kiszka @ 2019-04-25 15:10 UTC
  To: Philippe Gerum, xenomai

On 25.04.19 16:53, Jan Kiszka wrote:
> On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
>> The implementation does not iterate fully over all CPU slots for every
>> handler, leaving some of them uncollected. In addition, if maxcpus= is
>> used to restrict the number of available CPUs to a subset of the
>> present ones, with the highest numbered CPU marked offline, the
>> iterator rescans the first valid IRQ slot indefinitely, eventually
>> leading to memory corruption due to out-of-bound writes.
> 
> The patch looks good and works fine for the common case, so I've applied it to 
> next.
> 
> But I cannot reproduce your maxcpus setup as - for to-be-debugged reasons - we 
> are already crashing on 4.14 and 4.4 on x86 during boot. Did you test on ARM?

OK, understood the trigger: systemd-udevd decides to bring up the remaining
CPUs later on, and that crashes us on the first timer tick. Maybe it's an
x86-only issue, or maybe you tested without CPU hotplug support.

Jan

-- 
Siemens AG, Corporate Technology, CT RDA IOT SES-DE
Corporate Competence Center Embedded Linux



* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Philippe Gerum @ 2019-04-25 15:11 UTC
  To: Jan Kiszka, xenomai

On 4/25/19 4:53 PM, Jan Kiszka wrote:
> On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
>> The implementation does not iterate fully over all CPU slots for every
>> handler, leaving some of them uncollected. In addition, if maxcpus= is
>> used to restrict the number of available CPUs to a subset of the
>> present ones, with the highest numbered CPU marked offline, the
>> iterator rescans the first valid IRQ slot indefinitely, eventually
>> leading to memory corruption due to out-of-bound writes.
> 
> The patch looks good and works fine for the common case, so I've applied
> it to next.
> 
> But I cannot reproduce your maxcpus setup as - for to-be-debugged
> reasons - we are already crashing on 4.14 and 4.4 on x86 during boot.
> Did you test on ARM?

Yes, i.MX6. Quad-core restricted with maxcpus=2.

-- 
Philippe.



* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Philippe Gerum @ 2019-04-25 15:12 UTC
  To: Jan Kiszka, xenomai

On 4/25/19 5:10 PM, Jan Kiszka wrote:
> On 25.04.19 16:53, Jan Kiszka wrote:
>> On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
>>> The implementation does not iterate fully over all CPU slots for every
>>> handler, leaving some of them uncollected. In addition, if maxcpus= is
>>> used to restrict the number of available CPUs to a subset of the
>>> present ones, with the highest numbered CPU marked offline, the
>>> iterator rescans the first valid IRQ slot indefinitely, eventually
>>> leading to memory corruption due to out-of-bound writes.
>>
>> The patch looks good and works fine for the common case, so I've
>> applied it to next.
>>
>> But I cannot reproduce your maxcpus setup as - for to-be-debugged
>> reasons - we are already crashing on 4.14 and 4.4 on x86 during boot.
>> Did you test on ARM?
> 
> OK, understood the trigger: systemd-udevd decides to boot the remaining
> CPUs later on, and that will crash us on the first timer tick. Maybe
> x86-only issue, maybe you tested without CPU hotplug support.
> 

I did, but on an ARM kernel.

-- 
Philippe.



* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Jan Kiszka @ 2019-04-25 15:12 UTC
  To: Philippe Gerum, xenomai

On 25.04.19 17:11, Philippe Gerum wrote:
> On 4/25/19 4:53 PM, Jan Kiszka wrote:
>> On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
>>> The implementation does not iterate fully over all CPU slots for every
>>> handler, leaving some of them uncollected. In addition, if maxcpus= is
>>> used to restrict the number of available CPUs to a subset of the
>>> present ones, with the highest numbered CPU marked offline, the
>>> iterator rescans the first valid IRQ slot indefinitely, eventually
>>> leading to memory corruption due to out-of-bound writes.
>>
>> The patch looks good and works fine for the common case, so I've applied
>> it to next.
>>
>> But I cannot reproduce your maxcpus setup as - for to-be-debugged
>> reasons - we are already crashing on 4.14 and 4.4 on x86 during boot.
>> Did you test on ARM?
> 
> Yes, i.MX6. Quad-core restricted with maxcpus=2.
> 

CONFIG_HOTPLUG_CPU? Does it survive manual onlining later on?

Jan

-- 
Siemens AG, Corporate Technology, CT RDA IOT SES-DE
Corporate Competence Center Embedded Linux



* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Philippe Gerum @ 2019-04-25 15:16 UTC
  To: Jan Kiszka, xenomai

On 4/25/19 5:10 PM, Jan Kiszka wrote:
> On 25.04.19 16:53, Jan Kiszka wrote:
>> On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
>>> The implementation does not iterate fully over all CPU slots for every
>>> handler, leaving some of them uncollected. In addition, if maxcpus= is
>>> used to restrict the number of available CPUs to a subset of the
>>> present ones, with the highest numbered CPU marked offline, the
>>> iterator rescans the first valid IRQ slot indefinitely, eventually
>>> leading to memory corruption due to out-of-bound writes.
>>
>> The patch looks good and works fine for the common case, so I've
>> applied it to next.
>>
>> But I cannot reproduce your maxcpus setup as - for to-be-debugged
>> reasons - we are already crashing on 4.14 and 4.4 on x86 during boot.
>> Did you test on ARM?
> 
> OK, understood the trigger: systemd-udevd decides to boot the remaining
> CPUs later on, and that will crash us on the first timer tick. Maybe
> x86-only issue, maybe you tested without CPU hotplug support.
> 

The crash is triggered by xnintr_query_next() returning the first
collected bulk of stat data indefinitely, causing vfile_snapshot_open()
to exceed the maximum item count returned by ->rewind(), which ends up
trashing kernel memory beyond all recognition.
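
For the record, a hedged sketch of that failure mode (hypothetical
shapes, not the actual vfile code): the snapshot open path sizes its
buffer from the ->rewind() result and relies on ->next() to signal
end-of-data eventually, so an iterator which never stops keeps writing
past the allocation:

	/* buf holds at most nr records, as promised by ->rewind() */
	nr = ops->rewind(it);
	buf = kmalloc(nr * sizeof(*rec), GFP_KERNEL);

	for (rec = buf; ; rec++) {	/* no bound check against nr */
		ret = ops->next(it, rec);
		if (ret <= 0)		/* broken iterator never ends, */
			break;		/* so rec runs off the end of buf */
	}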

-- 
Philippe.



* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Philippe Gerum @ 2019-04-25 15:17 UTC
  To: Jan Kiszka, xenomai

On 4/25/19 5:12 PM, Jan Kiszka wrote:
> On 25.04.19 17:11, Philippe Gerum wrote:
>> On 4/25/19 4:53 PM, Jan Kiszka wrote:
>>> On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
>>>> The implementation does not iterate fully over all CPU slots for every
>>>> handler, leaving some of them uncollected. In addition, if maxcpus= is
>>>> used to restrict the number of available CPUs to a subset of the
>>>> present ones, with the highest numbered CPU marked offline, the
>>>> iterator rescans the first valid IRQ slot indefinitely, eventually
>>>> leading to memory corruption due to out-of-bound writes.
>>>
>>> The patch looks good and works fine for the common case, so I've applied
>>> it to next.
>>>
>>> But I cannot reproduce your maxcpus setup as - for to-be-debugged
>>> reasons - we are already crashing on 4.14 and 4.4 on x86 during boot.
>>> Did you test on ARM?
>>
>> Yes, i.MX6. Quad-core restricted with maxcpus=2.
>>
> 
> CONFIG_HOTPLUG_CPU? Does it survive manual onlining later on?
> 

HOTPLUG is off in my setup.


-- 
Philippe.



* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Philippe Gerum @ 2019-04-25 15:18 UTC
  To: Jan Kiszka, xenomai

On 4/25/19 5:16 PM, Philippe Gerum via Xenomai wrote:
> On 4/25/19 5:10 PM, Jan Kiszka wrote:
>> On 25.04.19 16:53, Jan Kiszka wrote:
>>> On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
>>>> The implementation does not iterate fully over all CPU slots for every
>>>> handler, leaving some of them uncollected. In addition, if maxcpus= is
>>>> used to restrict the number of available CPUs to a subset of the
>>>> present ones, with the highest numbered CPU marked offline, the
>>>> iterator rescans the first valid IRQ slot indefinitely, eventually
>>>> leading to memory corruption due to out-of-bound writes.
>>>
>>> The patch looks good and works fine for the common case, so I've
>>> applied it to next.
>>>
>>> But I cannot reproduce your maxcpus setup as - for to-be-debugged
>>> reasons - we are already crashing on 4.14 and 4.4 on x86 during boot.
>>> Did you test on ARM?
>>
>> OK, understood the trigger: systemd-udevd decides to boot the remaining
>> CPUs later on, and that will crash us on the first timer tick. Maybe
>> x86-only issue, maybe you tested without CPU hotplug support.
>>
> 
> The crash is triggered by xnintr_query_next() returning the first
> collected bulk of stat data indefinitely, causing vfile_snapshot_open()
> to exceed the max. count of items returned by ->rewind()

i.e. upon return from the ->next() handler.

-- 
Philippe.



* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Jan Kiszka @ 2019-04-25 15:21 UTC
  To: Philippe Gerum, xenomai

On 25.04.19 17:17, Philippe Gerum wrote:
> On 4/25/19 5:12 PM, Jan Kiszka wrote:
>> On 25.04.19 17:11, Philippe Gerum wrote:
>>> On 4/25/19 4:53 PM, Jan Kiszka wrote:
>>>> On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
>>>>> The implementation does not iterate fully over all CPU slots for every
>>>>> handler, leaving some of them uncollected. In addition, if maxcpus= is
>>>>> used to restrict the number of available CPUs to a subset of the
>>>>> present ones, with the highest numbered CPU marked offline, the
>>>>> iterator rescans the first valid IRQ slot indefinitely, eventually
>>>>> leading to memory corruption due to out-of-bound writes.
>>>>
>>>> The patch looks good and works fine for the common case, so I've applied
>>>> it to next.
>>>>
>>>> But I cannot reproduce your maxcpus setup as - for to-be-debugged
>>>> reasons - we are already crashing on 4.14 and 4.4 on x86 during boot.
>>>> Did you test on ARM?
>>>
>>> Yes, i.MX6. Quad-core restricted with maxcpus=2.
>>>
>>
>> CONFIG_HOTPLUG_CPU? Does it survive manual onlining later on?
>>
> 
> HOTPLUG is off in my setup.
> 

OK, then the pattern may also be found on ARM. It seems we can only do
offline/online when the CPU was online at the time ipipe took over the
timers, or maybe even when ipipe was hijacking interrupts. If it wasn't,
we seem to lack some initialization when the CPU comes up later, and the
timer tick on that new core hits a NULL pointer.

[    7.629979] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
[    7.631456] IP: __ipipe_ack_hrtimer_irq+0x47/0x60
[    7.632557] PGD 0 P4D 0 
[    7.633504] Oops: 0000 [#1] PREEMPT SMP PTI
[    7.634647] Modules linked in: rt_e1000e rtnet
[    7.635668] CPU: 3 PID: 401 Comm: cpuhp/3 Not tainted 4.14.111+ #89
[    7.636878] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
[    7.638560] I-pipe domain: Linux
[    7.639513] task: ffff888039526340 task.stack: ffffc9000073c000
[    7.640847] RIP: 0010:__ipipe_ack_hrtimer_irq+0x47/0x60
[    7.641965] RSP: 0000:ffff88803e203f80 EFLAGS: 00010046
[    7.643144] RAX: 000000000002e620 RBX: 0000000000001103 RCX: ffffffff83740880
[    7.645724] RDX: ffff88803e200000 RSI: ffffffff836f6d80 RDI: 0000000000000000
[    7.647163] RBP: 0000000000000000 R08: ffff88803cc03909 R09: 0000000000000112
[    7.648479] R10: 0000000000000115 R11: 0000000000000003 R12: 0000000000000000
[    7.650493] R13: 0000000000000000 R14: 000000000002e540 R15: 0000000000000000
[    7.651922] FS:  0000000000000000(0000) GS:ffff88803e200000(0000) knlGS:0000000000000000
[    7.653493] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    7.654826] CR2: 0000000000000018 CR3: 000000000221e001 CR4: 00000000003606e0
[    7.656383] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    7.657859] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[    7.659355] Call Trace:
[    7.660379]  <IRQ>
[    7.661261]  __ipipe_dispatch_irq+0x87/0x1d0
[    7.662310]  __ipipe_handle_irq+0x76/0x1e0
[    7.663360]  apic_timer_interrupt+0x7f/0xb0
[    7.664401]  </IRQ>
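
If the missing piece really is some per-CPU timer initialization, the
usual shape of a fix would be an online callback in the hotplug state
machine - roughly like this sketch, with made-up names (the real fix
would have to live in the ipipe patch itself):

#include <linux/cpuhotplug.h>

static int ipipe_timer_cpu_online(unsigned int cpu)
{
	/* hypothetical helper doing the per-CPU timer takeover */
	return ipipe_timer_setup_this_cpu();
}

static int __init ipipe_timer_hotplug_init(void)
{
	int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
				    "ipipe/timer:online",
				    ipipe_timer_cpu_online, NULL);
	return ret < 0 ? ret : 0; /* DYN states return a positive id */
}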

Jan

-- 
Siemens AG, Corporate Technology, CT RDA IOT SES-DE
Corporate Competence Center Embedded Linux



* Re: [PATCH] cobalt/intr: fix interrupt stat iterator
From: Jan Kiszka @ 2019-04-26 17:29 UTC
  To: Philippe Gerum, xenomai

On 25.04.19 17:21, Jan Kiszka wrote:
> On 25.04.19 17:17, Philippe Gerum wrote:
>> On 4/25/19 5:12 PM, Jan Kiszka wrote:
>>> On 25.04.19 17:11, Philippe Gerum wrote:
>>>> On 4/25/19 4:53 PM, Jan Kiszka wrote:
>>>>> On 24.04.19 12:29, Philippe Gerum via Xenomai wrote:
>>>>>> The implementation does not iterate fully over all CPU slots for every
>>>>>> handler, leaving some of them uncollected. In addition, if maxcpus= is
>>>>>> used to restrict the number of available CPUs to a subset of the
>>>>>> present ones, with the highest numbered CPU marked offline, the
>>>>>> iterator rescans the first valid IRQ slot indefinitely, eventually
>>>>>> leading to memory corruption due to out-of-bound writes.
>>>>>
>>>>> The patch looks good and works fine for the common case, so I've applied
>>>>> it to next.
>>>>>
>>>>> But I cannot reproduce your maxcpus setup as - for to-be-debugged
>>>>> reasons - we are already crashing on 4.14 and 4.4 on x86 during boot.
>>>>> Did you test on ARM?
>>>>
>>>> Yes, i.MX6. Quad-core restricted with maxcpus=2.
>>>>
>>>
>>> CONFIG_HOTPLUG_CPU? Does it survive manual onlining later on?
>>>
>>
>> HOTPLUG is off in my setup.
>>
> 
> OK, then the pattern may also be found on ARM. It seems we can only do
> offline/online when the cpu was online at the time ipipe took over the
> timers, or maybe even when ipipe was hijacking interrupts. If it wasn't
> we see to lack some initialization when the cpu comes up later, and the
> timer tick on that new core hits a NULL pointer.
> 
> [    7.629979] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
> [    7.631456] IP: __ipipe_ack_hrtimer_irq+0x47/0x60
> [    7.632557] PGD 0 P4D 0
> [    7.633504] Oops: 0000 [#1] PREEMPT SMP PTI
> [    7.634647] Modules linked in: rt_e1000e rtnet
> [    7.635668] CPU: 3 PID: 401 Comm: cpuhp/3 Not tainted 4.14.111+ #89
> [    7.636878] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
> [    7.638560] I-pipe domain: Linux
> [    7.639513] task: ffff888039526340 task.stack: ffffc9000073c000
> [    7.640847] RIP: 0010:__ipipe_ack_hrtimer_irq+0x47/0x60
> [    7.641965] RSP: 0000:ffff88803e203f80 EFLAGS: 00010046
> [    7.643144] RAX: 000000000002e620 RBX: 0000000000001103 RCX: ffffffff83740880
> [    7.645724] RDX: ffff88803e200000 RSI: ffffffff836f6d80 RDI: 0000000000000000
> [    7.647163] RBP: 0000000000000000 R08: ffff88803cc03909 R09: 0000000000000112
> [    7.648479] R10: 0000000000000115 R11: 0000000000000003 R12: 0000000000000000
> [    7.650493] R13: 0000000000000000 R14: 000000000002e540 R15: 0000000000000000
> [    7.651922] FS:  0000000000000000(0000) GS:ffff88803e200000(0000) knlGS:0000000000000000
> [    7.653493] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    7.654826] CR2: 0000000000000018 CR3: 000000000221e001 CR4: 00000000003606e0
> [    7.656383] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [    7.657859] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [    7.659355] Call Trace:
> [    7.660379]  <IRQ>
> [    7.661261]  __ipipe_dispatch_irq+0x87/0x1d0
> [    7.662310]  __ipipe_handle_irq+0x76/0x1e0
> [    7.663360]  apic_timer_interrupt+0x7f/0xb0
> [    7.664401]  </IRQ>

OK, I opened a can of worms this way: killed one in Xenomai already and two
or so in ipipe (core). Another one in Xenomai, related but also reproducible
without hotplug, is still pending: rtdm_task_create on unsupported CPUs.

Jan

-- 
Siemens AG, Corporate Technology, CT RDA IOT SES-DE
Corporate Competence Center Embedded Linux

