* [Xenomai-core] [RFC] Break out of endless user space loops
@ 2010-06-02 17:19 Jan Kiszka
  2010-06-02 17:30 ` Gilles Chanteperdrix
                   ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Jan Kiszka @ 2010-06-02 17:19 UTC (permalink / raw)
  To: Philippe Gerum, Gilles Chanteperdrix; +Cc: xenomai-core, Tschaeche IT-Services


Hi all,

here is the first apparently working prototype for getting hold of
endless user space loops in RT threads. A simple test case of mine now
receives a SIGDEBUG even if it does "while (1);".

The design follows Gilles' suggestion to force a SEGV on the victim thread
but to restore the patched PC before migrating the thread after this fault.
The only drawback of this approach: we need to keep track of the
preempted register set at the I-pipe level. I basically replicated what
Linux does these days and exported it as ipipe_get_irq_regs()
(the second patch).

This is an x86-64-only draft which clearly needs more love. I'm open to
suggestions for different abstractions wherever you see a need.
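
For reference, here is a minimal sketch of the kind of test case mentioned
above (not part of the patches; it assumes the native skin, and that the
SIGDEBUG notification reaches user space as SIGXCPU):

#include <signal.h>
#include <unistd.h>
#include <sys/mman.h>
#include <native/task.h>

static void sigdebug_handler(int sig)
{
	/* Only reached once the watchdog managed to relax the spinning task. */
	write(2, "SIGDEBUG received\n", 18);
}

static void runaway(void *cookie)
{
	(void)cookie;
	for (;;)
		;	/* spins in primary mode, never issues a syscall */
}

int main(void)
{
	RT_TASK task;

	mlockall(MCL_CURRENT | MCL_FUTURE);
	signal(SIGXCPU, sigdebug_handler);
	rt_task_create(&task, "runaway", 0, 99, T_JOINABLE);
	rt_task_start(&task, runaway, NULL);
	rt_task_join(&task);
	return 0;
}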

Jan

---
 include/asm-generic/hal.h        |    2 ++
 include/asm-x86/bits/thread_64.h |    1 +
 include/asm-x86/system_64.h      |   26 ++++++++++++++++++++++++++
 ksrc/nucleus/pod.c               |    5 +++++
 ksrc/nucleus/sched.c             |    1 +
 5 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/include/asm-generic/hal.h b/include/asm-generic/hal.h
index 84c1a4d..be6abf0 100644
--- a/include/asm-generic/hal.h
+++ b/include/asm-generic/hal.h
@@ -96,6 +96,8 @@ typedef spinlock_t rthal_spinlock_t;
 #define rthal_irq_cookie(ipd,irq)	__ipipe_irq_cookie(ipd,irq)
 #define rthal_irq_handler(ipd,irq)	__ipipe_irq_handler(ipd,irq)
 
+#define rthal_get_irq_regs()		ipipe_get_irq_regs()
+
 #define rthal_cpudata_irq_hits(ipd,cpu,irq)	__ipipe_cpudata_irq_hits(ipd,cpu,irq)
 
 #ifndef local_irq_save_hw_smp
diff --git a/include/asm-x86/bits/thread_64.h b/include/asm-x86/bits/thread_64.h
index 91b71ed..d163c9e 100644
--- a/include/asm-x86/bits/thread_64.h
+++ b/include/asm-x86/bits/thread_64.h
@@ -33,6 +33,7 @@ static inline void xnarch_init_tcb(xnarchtcb_t * tcb)
 	tcb->ripp = &tcb->rip;
 	tcb->fpup = &tcb->i387;
 	tcb->is_root = 0;
+	tcb->forced_um_exit = 0;
 	/* Must be followed by xnarch_init_thread(). */
 }
 
diff --git a/include/asm-x86/system_64.h b/include/asm-x86/system_64.h
index 4de8693..f023dab 100644
--- a/include/asm-x86/system_64.h
+++ b/include/asm-x86/system_64.h
@@ -60,6 +60,8 @@ typedef struct xnarchtcb {      /* Per-thread arch-dependent block */
 	unsigned long ts_usedfpu: 1;
 	unsigned long cr0_ts: 1;
 
+	unsigned long forced_um_exit: 1;
+
 	unsigned stacksize;         /* Aligned size of stack (bytes) */
 	unsigned long *stackbase;   /* Stack space */
 
@@ -122,6 +124,30 @@ static inline void xnarch_free_stack_mem(void *chunk, u_long bytes)
 	kfree(chunk);
 }
 
+static inline void xnarch_force_userspace_exit(xnarchtcb_t *tcb)
+{
+	struct pt_regs *regs = rthal_get_irq_regs();
+
+	if (user_mode(regs)) {
+		tcb->rip = regs->x86reg_ip;
+		tcb->forced_um_exit = 1;
+		regs->x86reg_ip = 0;
+	}
+}
+
+static inline int
+xnarch_fixup_userspace_exit(xnarchtcb_t *tcb, xnarch_fltinfo_t *fi)
+{
+#ifdef CONFIG_XENO_OPT_PERVASIVE
+	if (tcb->forced_um_exit) {
+		fi->regs->x86reg_ip = tcb->rip;
+		tcb->forced_um_exit = 0;
+		return 1;
+	}
+#endif /* CONFIG_XENO_OPT_PERVASIVE */
+	return 0;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ksrc/nucleus/pod.c b/ksrc/nucleus/pod.c
index 7002a73..bdb5758 100644
--- a/ksrc/nucleus/pod.c
+++ b/ksrc/nucleus/pod.c
@@ -2547,6 +2547,11 @@ int xnpod_trap_fault(xnarch_fltinfo_t *fltinfo)
 
 	thread = xnpod_current_thread();
 
+	if (xnarch_fixup_userspace_exit(xnthread_archtcb(thread), fltinfo)) {
+		xnshadow_relax(0, 0);
+		return 1;
+	}
+
 	trace_mark(xn_nucleus, thread_fault,
 		   "thread %p thread_name %s ip %p type %d",
 		   thread, xnthread_name(thread),
diff --git a/ksrc/nucleus/sched.c b/ksrc/nucleus/sched.c
index 0b737a3..64fa0e0 100644
--- a/ksrc/nucleus/sched.c
+++ b/ksrc/nucleus/sched.c
@@ -100,6 +100,7 @@ static void xnsched_watchdog_handler(struct xntimer *timer)
 			 "'%s'\n", xnthread_name(thread));
 		xnthread_set_info(thread, XNAMOK | XNKICKED);
 		xnshadow_send_sig(thread, SIGDEBUG, SIGDEBUG_WATCHDOG, 1);
+		xnarch_force_userspace_exit(xnthread_archtcb(thread));
 	} else
 #endif /* CONFIG_XENO_OPT_PERVASIVE */
 	{
-- 
1.6.0.2

[-- Attachment #2: ipipe_irq_regs.patch --]
[-- Type: text/x-patch, Size: 2833 bytes --]

---
 arch/x86/kernel/ipipe.c      |    4 ++++
 include/linux/ipipe.h        |    5 +++++
 include/linux/ipipe_percpu.h |    3 +++
 kernel/ipipe/core.c          |    3 +++
 4 files changed, 15 insertions(+)

Index: b/arch/x86/kernel/ipipe.c
===================================================================
--- a/arch/x86/kernel/ipipe.c
+++ b/arch/x86/kernel/ipipe.c
@@ -900,11 +900,14 @@ int __ipipe_syscall_root(struct pt_regs
  */
 int __ipipe_handle_irq(struct pt_regs *regs)
 {
+	struct pt_regs *old_regs = __ipipe_get_cpu_var(ipipe_irq_regs);
 	struct ipipe_domain *this_domain, *next_domain;
 	unsigned int vector = regs->orig_ax, irq;
 	struct list_head *head, *pos;
 	int m_ack;
 
+	__ipipe_get_cpu_var(ipipe_irq_regs) = regs;
+
 	if ((long)regs->orig_ax < 0) {
 		vector = ~vector;
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -976,6 +979,7 @@ int __ipipe_handle_irq(struct pt_regs *r
 	__ipipe_walk_pipeline(head);
 
 finalize_nosync:
+	__ipipe_get_cpu_var(ipipe_irq_regs) = old_regs;
 
 	/*
 	 * Given our deferred dispatching model for regular IRQs, we
Index: b/include/linux/ipipe.h
===================================================================
--- a/include/linux/ipipe.h
+++ b/include/linux/ipipe.h
@@ -245,6 +245,11 @@ static inline void ipipe_irq_unlock(unsi
 	__ipipe_unlock_irq(__ipipe_current_domain, irq);
 }
 
+static inline struct pt_regs *ipipe_get_irq_regs(void)
+{
+	return __ipipe_get_cpu_var(ipipe_irq_regs);
+}
+
 #ifndef __ipipe_sync_pipeline
 #define __ipipe_sync_pipeline(dovirt) __ipipe_sync_stage(dovirt)
 #endif
Index: b/include/linux/ipipe_percpu.h
===================================================================
--- a/include/linux/ipipe_percpu.h
+++ b/include/linux/ipipe_percpu.h
@@ -68,6 +68,9 @@ DECLARE_PER_CPU(struct ipipe_domain *, i
 
 DECLARE_PER_CPU(unsigned long, ipipe_nmi_saved_root);
 
+struct pt_regs;
+DECLARE_PER_CPU(struct pt_regs *, ipipe_irq_regs);
+
 #ifdef CONFIG_IPIPE_DEBUG_CONTEXT
 DECLARE_PER_CPU(int, ipipe_percpu_context_check);
 DECLARE_PER_CPU(int, ipipe_saved_context_check_state);
Index: b/kernel/ipipe/core.c
===================================================================
--- a/kernel/ipipe/core.c
+++ b/kernel/ipipe/core.c
@@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct ipipe_domain *, ip
 
 DEFINE_PER_CPU(unsigned long, ipipe_nmi_saved_root); /* Copy of root status during NMI */
 
+DEFINE_PER_CPU(struct pt_regs *, ipipe_irq_regs);
+
 static IPIPE_DEFINE_SPINLOCK(__ipipe_pipelock);
 
 LIST_HEAD(__ipipe_pipeline);
@@ -1940,6 +1942,7 @@ EXPORT_SYMBOL(ipipe_suspend_domain);
 EXPORT_SYMBOL(ipipe_alloc_virq);
 EXPORT_PER_CPU_SYMBOL(ipipe_percpu_domain);
 EXPORT_PER_CPU_SYMBOL(ipipe_percpu_darray);
+EXPORT_PER_CPU_SYMBOL(ipipe_irq_regs);
 EXPORT_SYMBOL(ipipe_root);
 EXPORT_SYMBOL(ipipe_stall_pipeline_from);
 EXPORT_SYMBOL(ipipe_test_and_stall_pipeline_from);


* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-02 17:19 [Xenomai-core] [RFC] Break out of endless user space loops Jan Kiszka
@ 2010-06-02 17:30 ` Gilles Chanteperdrix
  2010-06-03  6:55   ` Jan Kiszka
  2010-06-02 20:58 ` Philippe Gerum
  2010-06-09 10:41 ` [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops) Philippe Gerum
  2 siblings, 1 reply; 32+ messages in thread
From: Gilles Chanteperdrix @ 2010-06-02 17:30 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai-core, Tschaeche IT-Services

Jan Kiszka wrote:
> Hi all,
> 
> here is the first apparently working prototype for getting hold of
> endless user space loops in RT threads. A simple test case of mine now
> receive a SIGDEBUG even if it does "while (1);".
> 
> The design follows Gilles' suggestion to force a SEGV on victim thread
> but restore the patched PC before migrating the thread after this fault.
> The only drawback of this approach: We need to keep track of the
> preempted register set at I-pipe level. I basically replicated what
> Linux does these days as well and exported it as ipipe_get_irq_regs()
> (the second patch).

You already have the regs in xnarch_fault_info.

-- 
					    Gilles.



* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-02 17:19 [Xenomai-core] [RFC] Break out of endless user space loops Jan Kiszka
  2010-06-02 17:30 ` Gilles Chanteperdrix
@ 2010-06-02 20:58 ` Philippe Gerum
  2010-06-03  6:56   ` Jan Kiszka
  2010-06-09 10:41 ` [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops) Philippe Gerum
  2 siblings, 1 reply; 32+ messages in thread
From: Philippe Gerum @ 2010-06-02 20:58 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai-core, Tschaeche IT-Services

On Wed, 2010-06-02 at 19:19 +0200, Jan Kiszka wrote:
> Hi all,
> 
> here is the first apparently working prototype for getting hold of
> endless user space loops in RT threads. A simple test case of mine now
> receive a SIGDEBUG even if it does "while (1);".
> 
> The design follows Gilles' suggestion to force a SEGV on victim thread
> but restore the patched PC before migrating the thread after this fault.
> The only drawback of this approach: We need to keep track of the
> preempted register set at I-pipe level. I basically replicated what
> Linux does these days as well and exported it as ipipe_get_irq_regs()
> (the second patch).
> 
> This is an x86-64-only draft which clearly needs more love. I'm open for
> suggestions of different abstractions wherever you see a need.

What if you have no MMU?

> 
> Jan
> 
> ---
>  include/asm-generic/hal.h        |    2 ++
>  include/asm-x86/bits/thread_64.h |    1 +
>  include/asm-x86/system_64.h      |   26 ++++++++++++++++++++++++++
>  ksrc/nucleus/pod.c               |    5 +++++
>  ksrc/nucleus/sched.c             |    1 +
>  5 files changed, 35 insertions(+), 0 deletions(-)
> 
> diff --git a/include/asm-generic/hal.h b/include/asm-generic/hal.h
> index 84c1a4d..be6abf0 100644
> --- a/include/asm-generic/hal.h
> +++ b/include/asm-generic/hal.h
> @@ -96,6 +96,8 @@ typedef spinlock_t rthal_spinlock_t;
>  #define rthal_irq_cookie(ipd,irq)	__ipipe_irq_cookie(ipd,irq)
>  #define rthal_irq_handler(ipd,irq)	__ipipe_irq_handler(ipd,irq)
>  
> +#define rthal_get_irq_regs()		ipipe_get_irq_regs()
> +
>  #define rthal_cpudata_irq_hits(ipd,cpu,irq)	__ipipe_cpudata_irq_hits(ipd,cpu,irq)
>  
>  #ifndef local_irq_save_hw_smp
> diff --git a/include/asm-x86/bits/thread_64.h b/include/asm-x86/bits/thread_64.h
> index 91b71ed..d163c9e 100644
> --- a/include/asm-x86/bits/thread_64.h
> +++ b/include/asm-x86/bits/thread_64.h
> @@ -33,6 +33,7 @@ static inline void xnarch_init_tcb(xnarchtcb_t * tcb)
>  	tcb->ripp = &tcb->rip;
>  	tcb->fpup = &tcb->i387;
>  	tcb->is_root = 0;
> +	tcb->forced_um_exit = 0;
>  	/* Must be followed by xnarch_init_thread(). */
>  }
>  
> diff --git a/include/asm-x86/system_64.h b/include/asm-x86/system_64.h
> index 4de8693..f023dab 100644
> --- a/include/asm-x86/system_64.h
> +++ b/include/asm-x86/system_64.h
> @@ -60,6 +60,8 @@ typedef struct xnarchtcb {      /* Per-thread arch-dependent block */
>  	unsigned long ts_usedfpu: 1;
>  	unsigned long cr0_ts: 1;
>  
> +	unsigned long forced_um_exit: 1;
> +
>  	unsigned stacksize;         /* Aligned size of stack (bytes) */
>  	unsigned long *stackbase;   /* Stack space */
>  
> @@ -122,6 +124,30 @@ static inline void xnarch_free_stack_mem(void *chunk, u_long bytes)
>  	kfree(chunk);
>  }
>  
> +static inline void xnarch_force_userspace_exit(xnarchtcb_t *tcb)
> +{
> +	struct pt_regs *regs = rthal_get_irq_regs();
> +
> +	if (user_mode(regs)) {
> +		tcb->rip = regs->x86reg_ip;
> +		tcb->forced_um_exit = 1;
> +		regs->x86reg_ip = 0;
> +	}
> +}
> +
> +static inline int
> +xnarch_fixup_userspace_exit(xnarchtcb_t *tcb, xnarch_fltinfo_t *fi)
> +{
> +#ifdef CONFIG_XENO_OPT_PERVASIVE
> +	if (tcb->forced_um_exit) {
> +		fi->regs->x86reg_ip = tcb->rip;
> +		tcb->forced_um_exit = 0;
> +		return 1;
> +	}
> +#endif /* CONFIG_XENO_OPT_PERVASIVE */
> +	return 0;
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/ksrc/nucleus/pod.c b/ksrc/nucleus/pod.c
> index 7002a73..bdb5758 100644
> --- a/ksrc/nucleus/pod.c
> +++ b/ksrc/nucleus/pod.c
> @@ -2547,6 +2547,11 @@ int xnpod_trap_fault(xnarch_fltinfo_t *fltinfo)
>  
>  	thread = xnpod_current_thread();
>  
> +	if (xnarch_fixup_userspace_exit(xnthread_archtcb(thread), fltinfo)) {
> +		xnshadow_relax(0, 0);
> +		return 1;
> +	}
> +
>  	trace_mark(xn_nucleus, thread_fault,
>  		   "thread %p thread_name %s ip %p type %d",
>  		   thread, xnthread_name(thread),
> diff --git a/ksrc/nucleus/sched.c b/ksrc/nucleus/sched.c
> index 0b737a3..64fa0e0 100644
> --- a/ksrc/nucleus/sched.c
> +++ b/ksrc/nucleus/sched.c
> @@ -100,6 +100,7 @@ static void xnsched_watchdog_handler(struct xntimer *timer)
>  			 "'%s'\n", xnthread_name(thread));
>  		xnthread_set_info(thread, XNAMOK | XNKICKED);
>  		xnshadow_send_sig(thread, SIGDEBUG, SIGDEBUG_WATCHDOG, 1);
> +		xnarch_force_userspace_exit(xnthread_archtcb(thread));
>  	} else
>  #endif /* CONFIG_XENO_OPT_PERVASIVE */
>  	{


-- 
Philippe.





* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-02 17:30 ` Gilles Chanteperdrix
@ 2010-06-03  6:55   ` Jan Kiszka
  2010-06-03  8:27     ` Philippe Gerum
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kiszka @ 2010-06-03  6:55 UTC (permalink / raw)
  To: Gilles Chanteperdrix; +Cc: xenomai-core, Tschaeche IT-Services


Gilles Chanteperdrix wrote:
> Jan Kiszka wrote:
>> Hi all,
>>
>> here is the first apparently working prototype for getting hold of
>> endless user space loops in RT threads. A simple test case of mine now
>> receive a SIGDEBUG even if it does "while (1);".
>>
>> The design follows Gilles' suggestion to force a SEGV on victim thread
>> but restore the patched PC before migrating the thread after this fault.
>> The only drawback of this approach: We need to keep track of the
>> preempted register set at I-pipe level. I basically replicated what
>> Linux does these days as well and exported it as ipipe_get_irq_regs()
>> (the second patch).
> 
> You already have the regs in xnarch_fault_info.
> 

We only pass this around for exceptions.

Jan



* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-02 20:58 ` Philippe Gerum
@ 2010-06-03  6:56   ` Jan Kiszka
  0 siblings, 0 replies; 32+ messages in thread
From: Jan Kiszka @ 2010-06-03  6:56 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai-core, Tschaeche IT-Services


Philippe Gerum wrote:
> On Wed, 2010-06-02 at 19:19 +0200, Jan Kiszka wrote:
>> Hi all,
>>
>> here is the first apparently working prototype for getting hold of
>> endless user space loops in RT threads. A simple test case of mine now
>> receive a SIGDEBUG even if it does "while (1);".
>>
>> The design follows Gilles' suggestion to force a SEGV on victim thread
>> but restore the patched PC before migrating the thread after this fault.
>> The only drawback of this approach: We need to keep track of the
>> preempted register set at I-pipe level. I basically replicated what
>> Linux does these days as well and exported it as ipipe_get_irq_regs()
>> (the second patch).
>>
>> This is an x86-64-only draft which clearly needs more love. I'm open for
>> suggestions of different abstractions wherever you see a need.
> 
> What if you have no MMU ?

Then you either have an MPU that detects NULL pointer accesses, or you
can redirect the PC to some kernel function containing some other
invalid, exception-raising instruction.
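
Purely as an illustration (hypothetical helper, not in the patch), such a
stub could be as dumb as parking the PC on a function that raises a trap
on purpose, using x86's ud2 here just as an example of an always-invalid
opcode:

static void xnarch_runaway_trap_stub(void)
{
	/* ud2 unconditionally raises an invalid-opcode exception, so the
	   fault handler gets a chance to run the same fixup/relax path. */
	asm volatile("ud2");
}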

Jan



* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-03  6:55   ` Jan Kiszka
@ 2010-06-03  8:27     ` Philippe Gerum
  2010-06-03  8:47       ` Jan Kiszka
  0 siblings, 1 reply; 32+ messages in thread
From: Philippe Gerum @ 2010-06-03  8:27 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai-core, Tschaeche IT-Services

On Thu, 2010-06-03 at 08:55 +0200, Jan Kiszka wrote:
> Gilles Chanteperdrix wrote:
> > Jan Kiszka wrote:
> >> Hi all,
> >>
> >> here is the first apparently working prototype for getting hold of
> >> endless user space loops in RT threads. A simple test case of mine now
> >> receive a SIGDEBUG even if it does "while (1);".
> >>
> >> The design follows Gilles' suggestion to force a SEGV on victim thread
> >> but restore the patched PC before migrating the thread after this fault.
> >> The only drawback of this approach: We need to keep track of the
> >> preempted register set at I-pipe level. I basically replicated what
> >> Linux does these days as well and exported it as ipipe_get_irq_regs()
> >> (the second patch).
> > 
> > You already have the regs in xnarch_fault_info.
> > 
> 
> We only pass this around for exceptions.

And for a good reason: exceptions are always delivered synchronously
upon receipt, unlike IRQs, given the deferred dispatching scheme. Your
ipipe_get_irq_regs interface is inherently broken for anything which is
not a wired-mode timer IRQ, since you could pass the caller a reference
to an unwound stack frame.

You have to resort to __ipipe_tick_regs, and obviously only use this in
the context of timer-triggered code, like the watchdog handler, which
saves your day.

> 
> Jan
> 
> _______________________________________________
> Xenomai-core mailing list
> Xenomai-core@domain.hid
> https://mail.gna.org/listinfo/xenomai-core


-- 
Philippe.





* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-03  8:27     ` Philippe Gerum
@ 2010-06-03  8:47       ` Jan Kiszka
  2010-06-03  9:56         ` Philippe Gerum
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kiszka @ 2010-06-03  8:47 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai-core, Tschaeche IT-Services


Philippe Gerum wrote:
> On Thu, 2010-06-03 at 08:55 +0200, Jan Kiszka wrote:
>> Gilles Chanteperdrix wrote:
>>> Jan Kiszka wrote:
>>>> Hi all,
>>>>
>>>> here is the first apparently working prototype for getting hold of
>>>> endless user space loops in RT threads. A simple test case of mine now
>>>> receive a SIGDEBUG even if it does "while (1);".
>>>>
>>>> The design follows Gilles' suggestion to force a SEGV on victim thread
>>>> but restore the patched PC before migrating the thread after this fault.
>>>> The only drawback of this approach: We need to keep track of the
>>>> preempted register set at I-pipe level. I basically replicated what
>>>> Linux does these days as well and exported it as ipipe_get_irq_regs()
>>>> (the second patch).
>>> You already have the regs in xnarch_fault_info.
>>>
>> We only pass this around for exceptions.
> 
> And for a good reason, exceptions are always delivered synchronously
> upon receipt, not IRQs, given the deferred dispatching scheme. Your
> ipipe_get_irq_regs interface is inherently broken for anything which is
> not a wired-mode timer IRQ, since you could pass the caller a reference
> to an unwound stack frame.

It may not work for certain deferred IRQs, true, but then it will return
NULL. The user of ipipe_get_irq_regs has to take this into account. And
most consumers will be wired IRQ handlers anyway.
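
To illustrate what I mean (hypothetical consumer, handler prototype per
the usual I-pipe convention, names made up):

static void my_wired_irq_handler(unsigned irq, void *cookie)
{
	struct pt_regs *regs = ipipe_get_irq_regs();

	if (regs == NULL)
		return;	/* not called over the interrupting stack frame */

	if (user_mode(regs)) {
		/* the preempted user context may be inspected or patched here */
	}
}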

> 
> You have to resort to __ipipe_tick_regs, and obviously only use this in
> the context of a timer-triggered code, like the watchdog handler, which
> saves your day.

That doesn't work if the timer IRQ is not the host tick, AND it doesn't
help us modify the return path.

Granted, the former scenario is already broken in I-pipe (try using an
x86 host with an MSI-capable HPET...), but the latter is definitely a no-go.

Jan



* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-03  8:47       ` Jan Kiszka
@ 2010-06-03  9:56         ` Philippe Gerum
  2010-06-03 10:18           ` Jan Kiszka
  0 siblings, 1 reply; 32+ messages in thread
From: Philippe Gerum @ 2010-06-03  9:56 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai-core, Tschaeche IT-Services

On Thu, 2010-06-03 at 10:47 +0200, Jan Kiszka wrote:
> Philippe Gerum wrote:
> > On Thu, 2010-06-03 at 08:55 +0200, Jan Kiszka wrote:
> >> Gilles Chanteperdrix wrote:
> >>> Jan Kiszka wrote:
> >>>> Hi all,
> >>>>
> >>>> here is the first apparently working prototype for getting hold of
> >>>> endless user space loops in RT threads. A simple test case of mine now
> >>>> receive a SIGDEBUG even if it does "while (1);".
> >>>>
> >>>> The design follows Gilles' suggestion to force a SEGV on victim thread
> >>>> but restore the patched PC before migrating the thread after this fault.
> >>>> The only drawback of this approach: We need to keep track of the
> >>>> preempted register set at I-pipe level. I basically replicated what
> >>>> Linux does these days as well and exported it as ipipe_get_irq_regs()
> >>>> (the second patch).
> >>> You already have the regs in xnarch_fault_info.
> >>>
> >> We only pass this around for exceptions.
> > 
> > And for a good reason, exceptions are always delivered synchronously
> > upon receipt, not IRQs, given the deferred dispatching scheme. Your
> > ipipe_get_irq_regs interface is inherently broken for anything which is
> > not a wired-mode timer IRQ, since you could pass the caller a reference
> > to an unwound stack frame.
> 
> It may not work for certain deferred IRQs, true, but then it will return
> NULL. The user of ipipe_get_irq_regs has to take this into account. And
> most consumers will be wired IRQ handler anyway.
> 
> > 
> > You have to resort to __ipipe_tick_regs, and obviously only use this in
> > the context of a timer-triggered code, like the watchdog handler, which
> > saves your day.
> 
> Doesn't work if the timer IRQ is not the host tick AND doesn't help us
> modifying the return path.

That is not the basic issue; copying back regs->ip to the actual frame
before yielding to the IRQ trampoline code would be trivial, and your
patch already requires a deeper change in the I-pipe anyway. The issue is:
do not provide a service which is not 100% trustworthy in this area.

> Granted, the former scenario is already broken in I-pipe (try using an
> x86 host with an MSI-capable HPET...), but the latter is definitely a no-go.
> 

I'm arguing that your ipipe_get_irq_regs interface is broken by design
pipeline-wise; piling up more crap in the pipeline core that is wrong
already for some x86 timer sources won't help. The point is: you have to
explicitly address that case only considering the timer interrupt, in
wired-mode, because this won't fly in any other cases.

> Jan
> 


-- 
Philippe.





* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-03  9:56         ` Philippe Gerum
@ 2010-06-03 10:18           ` Jan Kiszka
  2010-06-03 10:47             ` Philippe Gerum
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kiszka @ 2010-06-03 10:18 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai-core, Tschaeche IT-Services


Philippe Gerum wrote:
> On Thu, 2010-06-03 at 10:47 +0200, Jan Kiszka wrote:
>> Philippe Gerum wrote:
>>> On Thu, 2010-06-03 at 08:55 +0200, Jan Kiszka wrote:
>>>> Gilles Chanteperdrix wrote:
>>>>> Jan Kiszka wrote:
>>>>>> Hi all,
>>>>>>
>>>>>> here is the first apparently working prototype for getting hold of
>>>>>> endless user space loops in RT threads. A simple test case of mine now
>>>>>> receive a SIGDEBUG even if it does "while (1);".
>>>>>>
>>>>>> The design follows Gilles' suggestion to force a SEGV on victim thread
>>>>>> but restore the patched PC before migrating the thread after this fault.
>>>>>> The only drawback of this approach: We need to keep track of the
>>>>>> preempted register set at I-pipe level. I basically replicated what
>>>>>> Linux does these days as well and exported it as ipipe_get_irq_regs()
>>>>>> (the second patch).
>>>>> You already have the regs in xnarch_fault_info.
>>>>>
>>>> We only pass this around for exceptions.
>>> And for a good reason, exceptions are always delivered synchronously
>>> upon receipt, not IRQs, given the deferred dispatching scheme. Your
>>> ipipe_get_irq_regs interface is inherently broken for anything which is
>>> not a wired-mode timer IRQ, since you could pass the caller a reference
>>> to an unwound stack frame.
>> It may not work for certain deferred IRQs, true, but then it will return
>> NULL. The user of ipipe_get_irq_regs has to take this into account. And
>> most consumers will be wired IRQ handler anyway.
>>
>>> You have to resort to __ipipe_tick_regs, and obviously only use this in
>>> the context of a timer-triggered code, like the watchdog handler, which
>>> saves your day.
>> Doesn't work if the timer IRQ is not the host tick AND doesn't help us
>> modifying the return path.
> 
> That is not the basic issue, copying back regs->ip to the actual frame
> before yielding to the IRQ trampoline code would be trivial and your
> patch does require a deeper change in the ipipe already. The issue is:
> do not provide a service which is not 100% trustable in this area.

There is no use for ipipe_get_irq_regs in our case outside the call
stack of the triggering IRQ. If you have nested IRQs inside this stack,
ipipe_get_irq_regs accounts for this; if you leave the stack, it returns
NULL. This is 100% reliable.

If you want read-only access to the preempted register set, then we need
some other mechanism, something like the tick regs. But those already
exist, and we have no other users beyond the host tick so far.

Jan



* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-03 10:18           ` Jan Kiszka
@ 2010-06-03 10:47             ` Philippe Gerum
  2010-06-03 10:52               ` Philippe Gerum
  2010-06-03 10:59               ` Jan Kiszka
  0 siblings, 2 replies; 32+ messages in thread
From: Philippe Gerum @ 2010-06-03 10:47 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai-core, Tschaeche IT-Services

On Thu, 2010-06-03 at 12:18 +0200, Jan Kiszka wrote:
> Philippe Gerum wrote:
> > On Thu, 2010-06-03 at 10:47 +0200, Jan Kiszka wrote:
> >> Philippe Gerum wrote:
> >>> On Thu, 2010-06-03 at 08:55 +0200, Jan Kiszka wrote:
> >>>> Gilles Chanteperdrix wrote:
> >>>>> Jan Kiszka wrote:
> >>>>>> Hi all,
> >>>>>>
> >>>>>> here is the first apparently working prototype for getting hold of
> >>>>>> endless user space loops in RT threads. A simple test case of mine now
> >>>>>> receive a SIGDEBUG even if it does "while (1);".
> >>>>>>
> >>>>>> The design follows Gilles' suggestion to force a SEGV on victim thread
> >>>>>> but restore the patched PC before migrating the thread after this fault.
> >>>>>> The only drawback of this approach: We need to keep track of the
> >>>>>> preempted register set at I-pipe level. I basically replicated what
> >>>>>> Linux does these days as well and exported it as ipipe_get_irq_regs()
> >>>>>> (the second patch).
> >>>>> You already have the regs in xnarch_fault_info.
> >>>>>
> >>>> We only pass this around for exceptions.
> >>> And for a good reason, exceptions are always delivered synchronously
> >>> upon receipt, not IRQs, given the deferred dispatching scheme. Your
> >>> ipipe_get_irq_regs interface is inherently broken for anything which is
> >>> not a wired-mode timer IRQ, since you could pass the caller a reference
> >>> to an unwound stack frame.
> >> It may not work for certain deferred IRQs, true, but then it will return
> >> NULL. The user of ipipe_get_irq_regs has to take this into account. And
> >> most consumers will be wired IRQ handler anyway.
> >>
> >>> You have to resort to __ipipe_tick_regs, and obviously only use this in
> >>> the context of a timer-triggered code, like the watchdog handler, which
> >>> saves your day.
> >> Doesn't work if the timer IRQ is not the host tick AND doesn't help us
> >> modifying the return path.
> > 
> > That is not the basic issue, copying back regs->ip to the actual frame
> > before yielding to the IRQ trampoline code would be trivial and your
> > patch does require a deeper change in the ipipe already. The issue is:
> > do not provide a service which is not 100% trustable in this area.
> 
> There is no use for ipipe_get_irq_regs in our case outside the call
> stack of the triggering IRQ. If you have nested IRQs inside this stack,
> ipipe_get_irq_regs account for this, if you leave the stack, it returns
> NULL. This is 100% reliable.

Try calling ipipe_get_irq_regs within a root domain IRQ handler, then
we'll resume this discussion right after - you may have another
perception of the situation. You will get NULL once in a while, even
though you are running in an IRQ context, from a Linux POV.

100% reliable for a published ipipe interface means that it ought to
work when called from _all_ domains, unless its semantics specifically
dictate a particular context for use. By no means does ipipe_get_irq_regs
tell anyone that it may only be used reliably on behalf of an unlocked,
wired, directly dispatched IRQ.

The only IRQ that fits this description is the pipelined hrtimer irq
(not even the host one, the host one simply inherits this property when
it happens that hrtimer == host timer for the underlying architecture),
and the only domain which may assume this safely is the invariant head,
which certainly restricts the valid contexts for using those services
quite a bit.

> 
> If you want read-only access to the preempted register set, then we need
> some other mechanism, something like the tick regs. But those already
> exits, and we have no other users beyond the host tick so far.

I agree, we do need something to ALLOWS US fixup the frame for the
return address to be correct. I'm just asking that we do provide a clean
interface for this, since it will be there to stay. 

> 
> Jan
> 


-- 
Philippe.





* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-03 10:47             ` Philippe Gerum
@ 2010-06-03 10:52               ` Philippe Gerum
  2010-06-03 10:59               ` Jan Kiszka
  1 sibling, 0 replies; 32+ messages in thread
From: Philippe Gerum @ 2010-06-03 10:52 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai-core, Tschaeche IT-Services

On Thu, 2010-06-03 at 12:47 +0200, Philippe Gerum wrote:
> On Thu, 2010-06-03 at 12:18 +0200, Jan Kiszka wrote:
> > Philippe Gerum wrote:
> > > On Thu, 2010-06-03 at 10:47 +0200, Jan Kiszka wrote:
> > >> Philippe Gerum wrote:
> > >>> On Thu, 2010-06-03 at 08:55 +0200, Jan Kiszka wrote:
> > >>>> Gilles Chanteperdrix wrote:
> > >>>>> Jan Kiszka wrote:
> > >>>>>> Hi all,
> > >>>>>>
> > >>>>>> here is the first apparently working prototype for getting hold of
> > >>>>>> endless user space loops in RT threads. A simple test case of mine now
> > >>>>>> receive a SIGDEBUG even if it does "while (1);".
> > >>>>>>
> > >>>>>> The design follows Gilles' suggestion to force a SEGV on victim thread
> > >>>>>> but restore the patched PC before migrating the thread after this fault.
> > >>>>>> The only drawback of this approach: We need to keep track of the
> > >>>>>> preempted register set at I-pipe level. I basically replicated what
> > >>>>>> Linux does these days as well and exported it as ipipe_get_irq_regs()
> > >>>>>> (the second patch).
> > >>>>> You already have the regs in xnarch_fault_info.
> > >>>>>
> > >>>> We only pass this around for exceptions.
> > >>> And for a good reason, exceptions are always delivered synchronously
> > >>> upon receipt, not IRQs, given the deferred dispatching scheme. Your
> > >>> ipipe_get_irq_regs interface is inherently broken for anything which is
> > >>> not a wired-mode timer IRQ, since you could pass the caller a reference
> > >>> to an unwound stack frame.
> > >> It may not work for certain deferred IRQs, true, but then it will return
> > >> NULL. The user of ipipe_get_irq_regs has to take this into account. And
> > >> most consumers will be wired IRQ handler anyway.
> > >>
> > >>> You have to resort to __ipipe_tick_regs, and obviously only use this in
> > >>> the context of a timer-triggered code, like the watchdog handler, which
> > >>> saves your day.
> > >> Doesn't work if the timer IRQ is not the host tick AND doesn't help us
> > >> modifying the return path.
> > > 
> > > That is not the basic issue, copying back regs->ip to the actual frame
> > > before yielding to the IRQ trampoline code would be trivial and your
> > > patch does require a deeper change in the ipipe already. The issue is:
> > > do not provide a service which is not 100% trustable in this area.
> > 
> > There is no use for ipipe_get_irq_regs in our case outside the call
> > stack of the triggering IRQ. If you have nested IRQs inside this stack,
> > ipipe_get_irq_regs account for this, if you leave the stack, it returns
> > NULL. This is 100% reliable.
> 
> Try calling ipipe_get_irq_regs within a root domain IRQ handler, then,
> we'll resume this discussion right after - you may have another
> perception of the situation. You will get NULL once in a while, albeit
> you are running over an IRQ context, from a Linux POV.
> 
> 100% reliable for a published ipipe interface means that it ought to
> work when called from _all_ domains, unless its semantics specifically
> dictates a particular context for use. By no mean ipipe_get_irq_regs
> tells anyone that it may only be used reliably on behalf of an unlocked,
> wired, directly dispatched IRQ.
> 
> The only IRQ that fits this description is the pipelined hrtimer irq
> (not even the host one, the host one simply inherits this property when
> it happens that hrtimer == host timer for the underlying architecture),
> and the only domain which may assume this safely is the invariant head,
> which certainly restricts quite a bit the valid context for using those
> services.
> 
> > 
> > If you want read-only access to the preempted register set, then we need
> > some other mechanism, something like the tick regs. But those already
> > exits, and we have no other users beyond the host tick so far.
> 
> I agree, we do need something to ALLOWS US

sorry, no shouting intended. I'm still learning how to deal with this
strange key with the "capslock" sticker on it...

>  fixup the frame for the
> return address to be correct. I'm just asking that we do provide a clean
> interface for this, since it will be there to stay. 
> 
> > 
> > Jan
> > 
> 
> 


-- 
Philippe.





* Re: [Xenomai-core] [RFC] Break out of endless user space loops
  2010-06-03 10:47             ` Philippe Gerum
  2010-06-03 10:52               ` Philippe Gerum
@ 2010-06-03 10:59               ` Jan Kiszka
  1 sibling, 0 replies; 32+ messages in thread
From: Jan Kiszka @ 2010-06-03 10:59 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai-core, Tschaeche IT-Services


Philippe Gerum wrote:
> On Thu, 2010-06-03 at 12:18 +0200, Jan Kiszka wrote:
>> Philippe Gerum wrote:
>>> On Thu, 2010-06-03 at 10:47 +0200, Jan Kiszka wrote:
>>>> Philippe Gerum wrote:
>>>>> On Thu, 2010-06-03 at 08:55 +0200, Jan Kiszka wrote:
>>>>>> Gilles Chanteperdrix wrote:
>>>>>>> Jan Kiszka wrote:
>>>>>>>> Hi all,
>>>>>>>>
>>>>>>>> here is the first apparently working prototype for getting hold of
>>>>>>>> endless user space loops in RT threads. A simple test case of mine now
>>>>>>>> receive a SIGDEBUG even if it does "while (1);".
>>>>>>>>
>>>>>>>> The design follows Gilles' suggestion to force a SEGV on victim thread
>>>>>>>> but restore the patched PC before migrating the thread after this fault.
>>>>>>>> The only drawback of this approach: We need to keep track of the
>>>>>>>> preempted register set at I-pipe level. I basically replicated what
>>>>>>>> Linux does these days as well and exported it as ipipe_get_irq_regs()
>>>>>>>> (the second patch).
>>>>>>> You already have the regs in xnarch_fault_info.
>>>>>>>
>>>>>> We only pass this around for exceptions.
>>>>> And for a good reason, exceptions are always delivered synchronously
>>>>> upon receipt, not IRQs, given the deferred dispatching scheme. Your
>>>>> ipipe_get_irq_regs interface is inherently broken for anything which is
>>>>> not a wired-mode timer IRQ, since you could pass the caller a reference
>>>>> to an unwound stack frame.
>>>> It may not work for certain deferred IRQs, true, but then it will return
>>>> NULL. The user of ipipe_get_irq_regs has to take this into account. And
>>>> most consumers will be wired IRQ handler anyway.
>>>>
>>>>> You have to resort to __ipipe_tick_regs, and obviously only use this in
>>>>> the context of a timer-triggered code, like the watchdog handler, which
>>>>> saves your day.
>>>> Doesn't work if the timer IRQ is not the host tick AND doesn't help us
>>>> modifying the return path.
>>> That is not the basic issue, copying back regs->ip to the actual frame
>>> before yielding to the IRQ trampoline code would be trivial and your
>>> patch does require a deeper change in the ipipe already. The issue is:
>>> do not provide a service which is not 100% trustable in this area.
>> There is no use for ipipe_get_irq_regs in our case outside the call
>> stack of the triggering IRQ. If you have nested IRQs inside this stack,
>> ipipe_get_irq_regs account for this, if you leave the stack, it returns
>> NULL. This is 100% reliable.
> 
> Try calling ipipe_get_irq_regs within a root domain IRQ handler, then,
> we'll resume this discussion right after - you may have another
> perception of the situation. You will get NULL once in a while, albeit
> you are running over an IRQ context, from a Linux POV.
> 
> 100% reliable for a published ipipe interface means that it ought to
> work when called from _all_ domains, unless its semantics specifically
> dictates a particular context for use. By no mean ipipe_get_irq_regs
> tells anyone that it may only be used reliably on behalf of an unlocked,
> wired, directly dispatched IRQ.
> 
> The only IRQ that fits this description is the pipelined hrtimer irq
> (not even the host one, the host one simply inherits this property when
> it happens that hrtimer == host timer for the underlying architecture),
> and the only domain which may assume this safely is the invariant head,
> which certainly restricts quite a bit the valid context for using those
> services.
> 
>> If you want read-only access to the preempted register set, then we need
>> some other mechanism, something like the tick regs. But those already
>> exits, and we have no other users beyond the host tick so far.
> 
> I agree, we do need something to ALLOWS US fixup the frame for the
> return address to be correct. I'm just asking that we do provide a clean
> interface for this, since it will be there to stay. 

Ack. I'm already looking into some way to pimp up ipipe_get_irq_regs so
that it can replace the x86 tick_regs and will never be NULL inside an
IRQ handler. The only thing still missing is filling in the proper
context for the case where unblocking a pipeline stage triggers an IRQ
replay. Should be fairly simple.

Jan



* [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops)
  2010-06-02 17:19 [Xenomai-core] [RFC] Break out of endless user space loops Jan Kiszka
  2010-06-02 17:30 ` Gilles Chanteperdrix
  2010-06-02 20:58 ` Philippe Gerum
@ 2010-06-09 10:41 ` Philippe Gerum
  2010-06-09 13:38   ` [Xenomai-help] " Tschaeche IT-Services
                     ` (2 more replies)
  2 siblings, 3 replies; 32+ messages in thread
From: Philippe Gerum @ 2010-06-09 10:41 UTC (permalink / raw)
  To: xenomai; +Cc: Jan Kiszka, Tschaeche IT-Services


I've toyed a bit to find a generic approach for the nucleus to regain
complete control over a userland application running in a syscall-less
loop.

The original issue was about recovering gracefully from a runaway
situation detected by the nucleus watchdog, where a thread would spin in
primary mode without issuing any syscall, but this would also apply to
real-time signals pending for such a thread. Currently, Xenomai rt
signals cannot preempt syscall-less code running in primary mode either.

The major difference between the previous approaches we discussed
and this one is that we now force the runaway thread to run a
piece of valid code that calls into the nucleus. We no longer force the
thread to run faulty code or to run at a faulty address. Therefore, we
can reuse this feature to improve the rt signal management, without
having to forge yet another signal stack frame for this.

The code introduced here only fixes the watchdog-related issue, but it
also does some groundwork for enhancing the rt signal support later. The
implementation details can be found here:
http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c

The current mayday support is only available for powerpc and x86 for
now; more architectures will come in the next few days. To have it
enabled, you have to
upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
new interface available from those latest patches.

The current implementation does not break the 2.5.x ABI on purpose, so
we could merge it into the stable branch.

We definitely need user feedback on this. Typically, does arming the
nucleus watchdog, with that patch support in, properly recover from your
favorite "get me out of here" situation? TIA,

You can pull this stuff from
git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.


-- 
Philippe.





* Re: [Xenomai-help] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops)
  2010-06-09 10:41 ` [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops) Philippe Gerum
@ 2010-06-09 13:38   ` Tschaeche IT-Services
  2010-06-09 14:01     ` Philippe Gerum
  2010-06-09 18:11   ` Tschaeche IT-Services
  2010-06-24 12:05   ` [Xenomai-core] [PATCH] Mayday support Jan Kiszka
  2 siblings, 1 reply; 32+ messages in thread
From: Tschaeche IT-Services @ 2010-06-09 13:38 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai

Not quite sure if I understood correctly. Trying to summarize:

Triggering the syscall code (to get out of the loop) is initiated
by the nucleus watchdog (available with CONFIG_XENO_OPT_WATCHDOG)
and can't be forced manually yet. Right?

Your (Philippe's) commit message says:
> ...
> runaway thread situation detected by the nucleus watchdog. Instead of
> killing the runaway thread bluntly, this feature allows to force a
> relax on it, despite the syscall-less nature of the code it was
> ...

How do I choose between "killing" and "force a relax"?
Or does the watchdog already set a Linux signal,
which is handled when the task gets relaxed?

Then, for testing, I have to do the following:
- apply the patches ;-)
- set up the watchdog with an appropriate timeout
	(wd_timeout_arg/CONFIG_XENO_OPT_WATCHDOG_TIMEOUT)
- no changes in our user space application
- just wait until the watchdog forces the task to relax,
	which will initiate Linux signal handling

Correct?

Thanks for your support,

	Olli

On Wed, Jun 09, 2010 at 12:41:23PM +0200, Philippe Gerum wrote:
> 
> I've toyed a bit to find a generic approach for the nucleus to regain
> complete control over a userland application running in a syscall-less
> loop.
> 
> The original issue was about recovering gracefully from a runaway
> situation detected by the nucleus watchdog, where a thread would spin in
> primary mode without issuing any syscall, but this would also apply for
> real-time signals pending for such a thread. Currently, Xenomai rt
> signals cannot preempt syscall-less code running in primary mode either.
> 
> The major difference between the previous approaches we discussed about
> and this one, is the fact that we now force the runaway thread to run a
> piece of valid code that calls into the nucleus. We do not force the
> thread to run faulty code or at a faulty address anymore. Therefore, we
> can reuse this feature to improve the rt signal management, without
> having to forge yet-another signal stack frame for this.
> 
> The code introduced only fixes the watchdog related issue, but also does
> some groundwork for enhancing the rt signal support later. The
> implementation details can be found here:
> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
> 
> The current mayday support is only available for powerpc and x86 for
> now, more will come in the next days. To have it enabled, you have to
> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
> new interface available from those latest patches.
> 
> The current implementation does not break the 2.5.x ABI on purpose, so
> we could merge it into the stable branch.
> 
> We definitely need user feedback on this. Typically, does arming the
> nucleus watchdog with that patch support in, properly recovers from your
> favorite "get me out of here" situation? TIA,
> 
> You can pull this stuff from
> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> 
> 
> -- 
> Philippe.
> 
> 



* Re: [Xenomai-help] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops)
  2010-06-09 13:38   ` [Xenomai-help] " Tschaeche IT-Services
@ 2010-06-09 14:01     ` Philippe Gerum
  0 siblings, 0 replies; 32+ messages in thread
From: Philippe Gerum @ 2010-06-09 14:01 UTC (permalink / raw)
  To: Tschaeche IT-Services; +Cc: xenomai

On Wed, 2010-06-09 at 15:38 +0200, Tschaeche IT-Services wrote:
> Not quite sure if i understood correctly. Trying to summarize:
> 
> Triggering the syscall code (to get out of the loop) is initiated
> by the nucleus watchdog (available with CONFIG_XENO_OPT_WATCHDOG)
> and can't be forced manually yet. Right?

Yes.

> 
> Your (Philippe's) commit message says:
> > ...
> > runaway thread situation detected by the nucleus watchdog. Instead of
> > killing the runaway thread bluntly, this feature allows to force a
> > relax on it, despite the syscall-less nature of the code it was
> > ...
> 
> How do i choose between "killing" and "force a relax"?
> Or does the watchdog already set a Linux signal,
> which is handled when the task gets relaxed?
> 

There is no more killing of the runaway task: it always gets a SIGXCPU
after the nucleus has forced it to relax, so that you can decide what's
next, and the system does not lock up anymore.
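
For illustration only, a sketch of a possible handler on the application
side; it assumes the reason code passed to xnshadow_send_sig(thread,
SIGDEBUG, SIGDEBUG_WATCHDOG, 1) in the watchdog handler shows up in
si_value.sival_int, and that the SIGDEBUG_* constants are exported by the
Xenomai user-space headers:

#include <signal.h>
#include <string.h>
#include <unistd.h>

static void sigdebug_handler(int sig, siginfo_t *si, void *context)
{
	if (si->si_value.sival_int == SIGDEBUG_WATCHDOG)
		write(2, "relaxed by the nucleus watchdog\n", 32);
	/* decide what's next here: log, stop the offending task, exit... */
}

static void install_sigdebug_handler(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigdebug_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGXCPU, &sa, NULL);
}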

> Then, for testing, i have to do the following:
> - apply patches ;-)

Including the new I-pipe patch.

> - setup the watchdog with an appropriate timeout
> 	(wd_timeout_arg/CONFIG_XENO_OPT_WATCHDOG_TIMEOUT)
> - no changes in our user space application
> - just wait until watchdog forces relaxing the task
> 	which will initiate linux signal handling
> 
> correct?

Correct.

> 
> Thanks for your support,
> 
> 	Olli
> 
> On Wed, Jun 09, 2010 at 12:41:23PM +0200, Philippe Gerum wrote:
> > 
> > I've toyed a bit to find a generic approach for the nucleus to regain
> > complete control over a userland application running in a syscall-less
> > loop.
> > 
> > The original issue was about recovering gracefully from a runaway
> > situation detected by the nucleus watchdog, where a thread would spin in
> > primary mode without issuing any syscall, but this would also apply for
> > real-time signals pending for such a thread. Currently, Xenomai rt
> > signals cannot preempt syscall-less code running in primary mode either.
> > 
> > The major difference between the previous approaches we discussed about
> > and this one, is the fact that we now force the runaway thread to run a
> > piece of valid code that calls into the nucleus. We do not force the
> > thread to run faulty code or at a faulty address anymore. Therefore, we
> > can reuse this feature to improve the rt signal management, without
> > having to forge yet-another signal stack frame for this.
> > 
> > The code introduced only fixes the watchdog related issue, but also does
> > some groundwork for enhancing the rt signal support later. The
> > implementation details can be found here:
> > http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
> > 
> > The current mayday support is only available for powerpc and x86 for
> > now, more will come in the next days. To have it enabled, you have to
> > upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
> > 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
> > new interface available from those latest patches.
> > 
> > The current implementation does not break the 2.5.x ABI on purpose, so
> > we could merge it into the stable branch.
> > 
> > We definitely need user feedback on this. Typically, does arming the
> > nucleus watchdog with that patch support in, properly recovers from your
> > favorite "get me out of here" situation? TIA,
> > 
> > You can pull this stuff from
> > git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> > 
> > 
> > -- 
> > Philippe.
> > 
> > 


-- 
Philippe.





* Re: [Xenomai-help] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops)
  2010-06-09 10:41 ` [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops) Philippe Gerum
  2010-06-09 13:38   ` [Xenomai-help] " Tschaeche IT-Services
@ 2010-06-09 18:11   ` Tschaeche IT-Services
  2010-06-18 23:11     ` [Xenomai-core] " Philippe Gerum
  2010-06-24 12:05   ` [Xenomai-core] [PATCH] Mayday support Jan Kiszka
  2 siblings, 1 reply; 32+ messages in thread
From: Tschaeche IT-Services @ 2010-06-09 18:11 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: Jan Kiszka, xenomai, xenomai

On Wed, Jun 09, 2010 at 12:41:23PM +0200, Philippe Gerum wrote:
> We definitely need user feedback on this. Typically, does arming the
> nucleus watchdog with that patch support in, properly recovers from your
> favorite "get me out of here" situation? TIA,
> 
> You can pull this stuff from
> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.

I manually built a kernel (timeout 1s) with your patches;
user space is linked to the 2.5.3 libraries without any patches.
Looks fine: the amok task is switched to the secondary domain
(we caught the SIGXCPU) and keeps running the loop there.
Then, on a SIGTRAP, the task leaves the loop.

Also, if the SIGTRAP arrives before the SIGXCPU it looks good,
apart from the latency of 1s.

I did not check the ucontext within the exception handler yet;
I would like to set up a reproducible kernel build first...
We will go into deeper testing in 2 weeks.

Maybe we need a finer granularity than 1s for the watchdog timeout.
Is there a chance?

Will your patches be merged into an official 2.5.x version?

thanks for your great support,

	Olli



* Re: [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops)
  2010-06-09 18:11   ` Tschaeche IT-Services
@ 2010-06-18 23:11     ` Philippe Gerum
  2010-06-24  9:22       ` [Xenomai-help] " Tschaeche IT-Services
  0 siblings, 1 reply; 32+ messages in thread
From: Philippe Gerum @ 2010-06-18 23:11 UTC (permalink / raw)
  To: Tschaeche IT-Services; +Cc: Jan Kiszka, xenomai, xenomai

On Wed, 2010-06-09 at 20:11 +0200, Tschaeche IT-Services wrote:
> On Wed, Jun 09, 2010 at 12:41:23PM +0200, Philippe Gerum wrote:
> > We definitely need user feedback on this. Typically, does arming the
> > nucleus watchdog with that patch support in, properly recovers from your
> > favorite "get me out of here" situation? TIA,
> > 
> > You can pull this stuff from
> > git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> 
> manually build a kernel (timeout 1s) with your patches.
> user space linked to 2.5.3 libraries without any patches.
> Looks fine: the amok task is switched to secondary domain
> (we catched the SIGXCPU) running the loop in secondary domain.
> then, on a SIGTRAP the task leaves the loop.
> 
> also, if SIGTRAP arives before SIGXCPU it looks good,
> apart from the latency of 1s.
> 
> did not check the ucontext within the exception handler, yet.
> would like to setup a reproducible kernel build first...
> we will go into deeper testing in 2 weeks.
> 
> maybe we need a finer granularity than 1s for the watchdog timeout.
> is there a chance?

The watchdog is not meant to be used for implementing application-level
health monitors, which is what you seem to be looking for. The
watchdog is really about pulling the brake while debugging, as a means
not to brick your board when things start to hit the crapper, without
knowing anything about the error source. For that purpose, the current 1s
granularity is just fine. It makes the nucleus watchdog as tactful as a
lumberjack, which is what we want in those circumstances: we want it to
point the finger at the problem we did not know about yet and keep the
board afloat; it is neither meant to monitor a specific piece of code we
know in advance might misbehave, nor to provide any kind of smart
contingency plan.

I would rather think that you may need something like an RTDM driver
actually implementing smarter health monitoring features that you could
use along with your app. That driver would expose a normalized socket
interface for observing how things go app-wise, by collecting data about
the current health status. It would have to tap into the mayday routines
for recovering from runaway situations it may detect via its own,
fine-grained watchdog service for instance.

ATM, you can still hack the nucleus watchdog threshold by changing the
periodic setup for its timer in xnpod_enable_timesource(). This said,
increasing the frequency too much would also induce much more overhead,
so YMMV.

> 
> will your patches be merged in an official 2.5.x version?
> 

2.5.4.

> thanks for your great support,
> 
> 	Olli


-- 
Philippe.





* Re: [Xenomai-help] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops)
  2010-06-18 23:11     ` [Xenomai-core] " Philippe Gerum
@ 2010-06-24  9:22       ` Tschaeche IT-Services
  2010-06-24  9:34         ` [Xenomai-core] [PATCH] Mayday support Jan Kiszka
  2010-06-24 10:28         ` [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops) Philippe Gerum
  0 siblings, 2 replies; 32+ messages in thread
From: Tschaeche IT-Services @ 2010-06-24  9:22 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: Jan Kiszka, xenomai, xenomai

On Sat, Jun 19, 2010 at 01:11:17AM +0200, Philippe Gerum wrote:
> On Wed, 2010-06-09 at 20:11 +0200, Tschaeche IT-Services wrote:
> > On Wed, Jun 09, 2010 at 12:41:23PM +0200, Philippe Gerum wrote:
> > > We definitely need user feedback on this. Typically, does arming the
> > > nucleus watchdog with that patch support in, properly recovers from your
> > > favorite "get me out of here" situation? TIA,
> > > 
> > > You can pull this stuff from
> > > git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> > 
> > manually build a kernel (timeout 1s) with your patches.
> > user space linked to 2.5.3 libraries without any patches.
> > Looks fine: the amok task is switched to secondary domain
> > (we catched the SIGXCPU) running the loop in secondary domain.
> > then, on a SIGTRAP the task leaves the loop.
> > 
> > also, if SIGTRAP arives before SIGXCPU it looks good,
> > apart from the latency of 1s.
> > 
> > did not check the ucontext within the exception handler, yet.
> > would like to setup a reproducible kernel build first...
> > we will go into deeper testing in 2 weeks.
> > 
> > maybe we need a finer granularity than 1s for the watchdog timeout.
> > is there a chance?
> 
> The watchdog is not meant to be used for implementing application-level
> health monitors, which is what you seem to be looking after. The
> watchdog is really about pulling the break while debugging, as a mean
> not to brick your board when things start to hit the crapper, without
> knowing anything from the error source. For that purpose, the current 1s
> granularity is just fine. It makes the nucleus watchdog as tactful as a
> lumberjack, which is what we want in those circumstances: we want it to
> point the finger at the problem we did not know about yet and keep the
> board afloat; it is neither meant to monitor a specific code we know in
> advance that might misbehave, nor provide any kind of smart contingency
> plan.
> 
> I would rather think that you may need something like a RTDM driver
> actually implementing smarter health monitoring features that you could
> use along with your app. That driver would expose a normalized socket
> interface for observing how things go app-wise, by collecting data about
> the current health status. It would have to tap into the mayday routines
> for recovering from runaway situations it may detect via its own,
> fine-grained watchdog service for instance.

Perfect, that's exactly what we want (and have already implemented).
How can I tap into the mayday routines from my driver?
Is there an rt_mayday(RT_TASK)?

Cheers,

	Olli

-- 
Tschaeche IT-Services       Tel.:  +49/9134/9089850
Dr.-Ing. Oliver Tschäche    Mobil: +49/176/20435601
Welluckenweg 4              Email: services@domain.hid
91077 Neunkirchen



* Re: [Xenomai-core] [PATCH] Mayday support
  2010-06-24  9:22       ` [Xenomai-help] " Tschaeche IT-Services
@ 2010-06-24  9:34         ` Jan Kiszka
  2010-06-24 10:28         ` [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops) Philippe Gerum
  1 sibling, 0 replies; 32+ messages in thread
From: Jan Kiszka @ 2010-06-24  9:34 UTC (permalink / raw)
  To: Tschaeche IT-Services; +Cc: xenomai, xenomai

Tschaeche IT-Services wrote:
> On Sat, Jun 19, 2010 at 01:11:17AM +0200, Philippe Gerum wrote:
>> On Wed, 2010-06-09 at 20:11 +0200, Tschaeche IT-Services wrote:
>>> On Wed, Jun 09, 2010 at 12:41:23PM +0200, Philippe Gerum wrote:
>>>> We definitely need user feedback on this. Typically, does arming the
>>>> nucleus watchdog with that patch support in, properly recovers from your
>>>> favorite "get me out of here" situation? TIA,
>>>>
>>>> You can pull this stuff from
>>>> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
>>> manually build a kernel (timeout 1s) with your patches.
>>> user space linked to 2.5.3 libraries without any patches.
>>> Looks fine: the amok task is switched to secondary domain
>>> (we catched the SIGXCPU) running the loop in secondary domain.
>>> then, on a SIGTRAP the task leaves the loop.
>>>
>>> also, if SIGTRAP arives before SIGXCPU it looks good,
>>> apart from the latency of 1s.
>>>
>>> did not check the ucontext within the exception handler, yet.
>>> would like to setup a reproducible kernel build first...
>>> we will go into deeper testing in 2 weeks.
>>>
>>> maybe we need a finer granularity than 1s for the watchdog timeout.
>>> is there a chance?
>> The watchdog is not meant to be used for implementing application-level
>> health monitors, which is what you seem to be looking after. The
>> watchdog is really about pulling the break while debugging, as a mean
>> not to brick your board when things start to hit the crapper, without
>> knowing anything from the error source. For that purpose, the current 1s
>> granularity is just fine. It makes the nucleus watchdog as tactful as a
>> lumberjack, which is what we want in those circumstances: we want it to
>> point the finger at the problem we did not know about yet and keep the
>> board afloat; it is neither meant to monitor a specific code we know in
>> advance that might misbehave, nor provide any kind of smart contingency
>> plan.
>>
>> I would rather think that you may need something like a RTDM driver
>> actually implementing smarter health monitoring features that you could
>> use along with your app. That driver would expose a normalized socket
>> interface for observing how things go app-wise, by collecting data about
>> the current health status. It would have to tap into the mayday routines
>> for recovering from runaway situations it may detect via its own,
>> fine-grained watchdog service for instance.
> 
> Perfect, that's exactly what we want (and already have implemented).
> How can i tap into the MayDay routines from my driver?
> Is there a rt_mayday(RT_TASK)?

I think you will simply have to call the nucleus services directly,
which indicates that there is something conceptually wrong with this.

An RTDM driver is just another workaround. A better solution will
eventually come with RT signals: a user space(!) high-prio watchdog
thread will be able to send a signal to the spinning thread, and the
signal handler can then report the error and/or kick the thread out of
primary mode.

Alternatively, the nucleus could export a user space interface for
sending SIGDEBUG from an RT thread to some other thread. That would
allow pushing the watchdog policy into user space, freeing the kernel
(or some workaround driver) from any customization burden.
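
Just to sketch that second option (completely hypothetical: the
rt_task_send_sigdebug() service below is an invented placeholder for
whatever interface the nucleus would export, and SIGDEBUG_WATCHDOG is
assumed to be visible to user space):

#include <native/task.h>

static volatile unsigned long heartbeat;	/* bumped by the monitored loop */

/* Invented placeholder for the future nucleus-exported service. */
int rt_task_send_sigdebug(RT_TASK *task, int reason);

/* Body of a high-prio user space watchdog task. */
void watchdog(void *arg)
{
	RT_TASK *victim = arg;
	unsigned long last = heartbeat;

	for (;;) {
		rt_task_sleep(100000000ULL);		/* 100 ms granularity */
		if (heartbeat == last)
			/* no progress: have SIGDEBUG delivered to the hog */
			rt_task_send_sigdebug(victim, SIGDEBUG_WATCHDOG);
		last = heartbeat;
	}
}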

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux



* Re: [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops)
  2010-06-24  9:22       ` [Xenomai-help] " Tschaeche IT-Services
  2010-06-24  9:34         ` [Xenomai-core] [PATCH] Mayday support Jan Kiszka
@ 2010-06-24 10:28         ` Philippe Gerum
  1 sibling, 0 replies; 32+ messages in thread
From: Philippe Gerum @ 2010-06-24 10:28 UTC (permalink / raw)
  To: Tschaeche IT-Services; +Cc: Jan Kiszka, xenomai, xenomai

On Thu, 2010-06-24 at 11:22 +0200, Tschaeche IT-Services wrote:
> On Sat, Jun 19, 2010 at 01:11:17AM +0200, Philippe Gerum wrote:
> > On Wed, 2010-06-09 at 20:11 +0200, Tschaeche IT-Services wrote:
> > > On Wed, Jun 09, 2010 at 12:41:23PM +0200, Philippe Gerum wrote:
> > > > We definitely need user feedback on this. Typically, does arming the
> > > > nucleus watchdog with that patch support in, properly recovers from your
> > > > favorite "get me out of here" situation? TIA,
> > > > 
> > > > You can pull this stuff from
> > > > git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> > > 
> > > manually build a kernel (timeout 1s) with your patches.
> > > user space linked to 2.5.3 libraries without any patches.
> > > Looks fine: the amok task is switched to secondary domain
> > > (we catched the SIGXCPU) running the loop in secondary domain.
> > > then, on a SIGTRAP the task leaves the loop.
> > > 
> > > also, if SIGTRAP arives before SIGXCPU it looks good,
> > > apart from the latency of 1s.
> > > 
> > > did not check the ucontext within the exception handler, yet.
> > > would like to setup a reproducible kernel build first...
> > > we will go into deeper testing in 2 weeks.
> > > 
> > > maybe we need a finer granularity than 1s for the watchdog timeout.
> > > is there a chance?
> > 
> > The watchdog is not meant to be used for implementing application-level
> > health monitors, which is what you seem to be looking after. The
> > watchdog is really about pulling the break while debugging, as a mean
> > not to brick your board when things start to hit the crapper, without
> > knowing anything from the error source. For that purpose, the current 1s
> > granularity is just fine. It makes the nucleus watchdog as tactful as a
> > lumberjack, which is what we want in those circumstances: we want it to
> > point the finger at the problem we did not know about yet and keep the
> > board afloat; it is neither meant to monitor a specific code we know in
> > advance that might misbehave, nor provide any kind of smart contingency
> > plan.
> > 
> > I would rather think that you may need something like a RTDM driver
> > actually implementing smarter health monitoring features that you could
> > use along with your app. That driver would expose a normalized socket
> > interface for observing how things go app-wise, by collecting data about
> > the current health status. It would have to tap into the mayday routines
> > for recovering from runaway situations it may detect via its own,
> > fine-grained watchdog service for instance.
> 
> Perfect, that's exactly what we want (and already have implemented).
> How can i tap into the MayDay routines from my driver?
> Is there a rt_mayday(RT_TASK)?

You will need this patch (totally untested, but it has a good chance to
work given the implementation of the mayday support underneath):
http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=2205a8f2a7aa8fdc7b7d7f5a96f8064a771382ec

Should be used like this:

void foo(RT_TASK *task)
{
	xnshadow_call_mayday(&task->thread_base, SIGDEBUG_WATCHDOG);
}

We are obviously bypassing all the layers happily; this should only be
used in contexts where 'thread' is guaranteed to be valid, but it
should work until 2.6 provides better support that won't expose the
innards this way.
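
Just as a rough sketch of how a health-monitoring driver could sit on
top of this (untested; the include locations are guesses, and how the
driver obtains and validates the kernel-side RT_TASK handle is left out
entirely):

#include <linux/module.h>
#include <rtdm/rtdm_driver.h>
#include <nucleus/shadow.h>	/* assumed to declare xnshadow_call_mayday() */
#include <native/task.h>

static rtdm_timer_t hmon_timer;
static RT_TASK *supervised;	/* registered by the app; lookup omitted */
static unsigned long heartbeat, last_seen;	/* heartbeat fed via the driver */

static void hmon_timer_handler(rtdm_timer_t *timer)
{
	if (supervised && heartbeat == last_seen)
		/* no progress since the last check: force the runaway task
		   through the nucleus, as in foo() above */
		xnshadow_call_mayday(&supervised->thread_base,
				     SIGDEBUG_WATCHDOG);
	last_seen = heartbeat;
}

static int __init hmon_init(void)
{
	int err = rtdm_timer_init(&hmon_timer, hmon_timer_handler, "hmon-wdog");

	if (err)
		return err;
	/* check every 100 ms instead of the 1s nucleus granularity */
	return rtdm_timer_start(&hmon_timer, 100000000ULL, 100000000ULL,
				RTDM_TIMERMODE_RELATIVE);
}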

NOTE: the method above is of course absolutely discouraged. Make sure
it is not disclosed outside of the Internet.

HTH,

> 
> Cheers,
> 
> 	Olli
> 


-- 
Philippe.





* Re: [Xenomai-core] [PATCH] Mayday support
  2010-06-09 10:41 ` [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops) Philippe Gerum
  2010-06-09 13:38   ` [Xenomai-help] " Tschaeche IT-Services
  2010-06-09 18:11   ` Tschaeche IT-Services
@ 2010-06-24 12:05   ` Jan Kiszka
  2010-06-27 16:01     ` Philippe Gerum
  2010-08-20 12:32     ` Jan Kiszka
  2 siblings, 2 replies; 32+ messages in thread
From: Jan Kiszka @ 2010-06-24 12:05 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai, Tschaeche IT-Services

Philippe Gerum wrote:
> I've toyed a bit to find a generic approach for the nucleus to regain
> complete control over a userland application running in a syscall-less
> loop.
> 
> The original issue was about recovering gracefully from a runaway
> situation detected by the nucleus watchdog, where a thread would spin in
> primary mode without issuing any syscall, but this would also apply for
> real-time signals pending for such a thread. Currently, Xenomai rt
> signals cannot preempt syscall-less code running in primary mode either.
> 
> The major difference between the previous approaches we discussed about
> and this one, is the fact that we now force the runaway thread to run a
> piece of valid code that calls into the nucleus. We do not force the
> thread to run faulty code or at a faulty address anymore. Therefore, we
> can reuse this feature to improve the rt signal management, without
> having to forge yet-another signal stack frame for this.
> 
> The code introduced only fixes the watchdog related issue, but also does
> some groundwork for enhancing the rt signal support later. The
> implementation details can be found here:
> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
> 
> The current mayday support is only available for powerpc and x86 for
> now, more will come in the next days. To have it enabled, you have to
> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
> new interface available from those latest patches.
> 
> The current implementation does not break the 2.5.x ABI on purpose, so
> we could merge it into the stable branch.
> 
> We definitely need user feedback on this. Typically, does arming the
> nucleus watchdog with that patch support in, properly recovers from your
> favorite "get me out of here" situation? TIA,
> 
> You can pull this stuff from
> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> 

I've retested the feature as it's now in master, and it has one
remaining problem: if you run the cpu hog under gdb control and try to
break out of the while(1) loop, this doesn't work before the watchdog
has expired - of course. But if you send the break before the expiry
(or hit a breakpoint), something goes wrong. The Xenomai task continues
to spin, and there is no way to kill its process (only gdb).

# cat /proc/xenomai/sched
CPU  PID    CLASS  PRI      TIMEOUT   TIMEBASE   STAT       NAME
  0  0      idle    -1      -         master     RR         ROOT/0
  1  0      idle    -1      -         master     R          ROOT/1
  0  6120   rt      99      -         master     Tt         cpu-hog
# cat /proc/xenomai/stat
CPU  PID    MSW        CSW        PF    STAT       %CPU  NAME
  0  0      0          0          0     00500088    0.0  ROOT/0
  1  0      0          0          0     00500080   99.7  ROOT/1
  0  6120   0          1          0     00342180  100.0  cpu-hog
  0  0      0          21005      0     00000000    0.0  IRQ3340: [timer]
  1  0      0          35887      0     00000000    0.3  IRQ3340: [timer]

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux



* Re: [Xenomai-core] [PATCH] Mayday support
  2010-06-24 12:05   ` [Xenomai-core] [PATCH] Mayday support Jan Kiszka
@ 2010-06-27 16:01     ` Philippe Gerum
  2010-06-28 14:06       ` Jan Kiszka
  2010-08-20 12:32     ` Jan Kiszka
  1 sibling, 1 reply; 32+ messages in thread
From: Philippe Gerum @ 2010-06-27 16:01 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai, Tschaeche IT-Services

On Thu, 2010-06-24 at 14:05 +0200, Jan Kiszka wrote:
> Philippe Gerum wrote:
> > I've toyed a bit to find a generic approach for the nucleus to regain
> > complete control over a userland application running in a syscall-less
> > loop.
> > 
> > The original issue was about recovering gracefully from a runaway
> > situation detected by the nucleus watchdog, where a thread would spin in
> > primary mode without issuing any syscall, but this would also apply for
> > real-time signals pending for such a thread. Currently, Xenomai rt
> > signals cannot preempt syscall-less code running in primary mode either.
> > 
> > The major difference between the previous approaches we discussed about
> > and this one, is the fact that we now force the runaway thread to run a
> > piece of valid code that calls into the nucleus. We do not force the
> > thread to run faulty code or at a faulty address anymore. Therefore, we
> > can reuse this feature to improve the rt signal management, without
> > having to forge yet-another signal stack frame for this.
> > 
> > The code introduced only fixes the watchdog related issue, but also does
> > some groundwork for enhancing the rt signal support later. The
> > implementation details can be found here:
> > http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
> > 
> > The current mayday support is only available for powerpc and x86 for
> > now, more will come in the next days. To have it enabled, you have to
> > upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
> > 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
> > new interface available from those latest patches.
> > 
> > The current implementation does not break the 2.5.x ABI on purpose, so
> > we could merge it into the stable branch.
> > 
> > We definitely need user feedback on this. Typically, does arming the
> > nucleus watchdog with that patch support in, properly recovers from your
> > favorite "get me out of here" situation? TIA,
> > 
> > You can pull this stuff from
> > git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> > 
> 
> I've retested the feature as it's now in master, and it has one
> remaining problem: If you run the cpu hog under gdb control and try to
> break out of the while(1) loop, this doesn't work before the watchdog
> expired - of course. But if you send the break before the expiry (or hit
> a breakpoint), something goes wrong. The Xenomai task continues to spin,
> and there is no chance to kill its process (only gdb).

I can't reproduce this easily here; it happened only once on a lite52xx
and then disappeared; there is no way to reproduce it even once on a
dual-core Atom in 64-bit mode, or on an x86_32 single-core platform
either. But I still saw it once on a powerpc target, so this looks like
a generic, time-dependent issue.

Do you have the same behavior on a single core config, and/or without
WARNSW enabled?

Also, could you post your hog test code? Maybe there is a difference in
the way I'm testing.

> 
> # cat /proc/xenomai/sched
> CPU  PID    CLASS  PRI      TIMEOUT   TIMEBASE   STAT       NAME
>   0  0      idle    -1      -         master     RR         ROOT/0

Eeek. This symbolic stat mode label looks weird.

>   1  0      idle    -1      -         master     R          ROOT/1
>   0  6120   rt      99      -         master     Tt         cpu-hog
> # cat /proc/xenomai/stat
> CPU  PID    MSW        CSW        PF    STAT       %CPU  NAME
>   0  0      0          0          0     00500088    0.0  ROOT/0
>   1  0      0          0          0     00500080   99.7  ROOT/1
>   0  6120   0          1          0     00342180  100.0  cpu-hog
>   0  0      0          21005      0     00000000    0.0  IRQ3340: [timer]
>   1  0      0          35887      0     00000000    0.3  IRQ3340: [timer]
> 
> Jan
> 


-- 
Philippe.





* Re: [Xenomai-core] [PATCH] Mayday support
  2010-06-27 16:01     ` Philippe Gerum
@ 2010-06-28 14:06       ` Jan Kiszka
  2010-06-28 14:12         ` Philippe Gerum
  2010-07-06 15:44         ` Philippe Gerum
  0 siblings, 2 replies; 32+ messages in thread
From: Jan Kiszka @ 2010-06-28 14:06 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai, Tschaeche IT-Services

Philippe Gerum wrote:
> On Thu, 2010-06-24 at 14:05 +0200, Jan Kiszka wrote:
>> Philippe Gerum wrote:
>>> I've toyed a bit to find a generic approach for the nucleus to regain
>>> complete control over a userland application running in a syscall-less
>>> loop.
>>>
>>> The original issue was about recovering gracefully from a runaway
>>> situation detected by the nucleus watchdog, where a thread would spin in
>>> primary mode without issuing any syscall, but this would also apply for
>>> real-time signals pending for such a thread. Currently, Xenomai rt
>>> signals cannot preempt syscall-less code running in primary mode either.
>>>
>>> The major difference between the previous approaches we discussed about
>>> and this one, is the fact that we now force the runaway thread to run a
>>> piece of valid code that calls into the nucleus. We do not force the
>>> thread to run faulty code or at a faulty address anymore. Therefore, we
>>> can reuse this feature to improve the rt signal management, without
>>> having to forge yet-another signal stack frame for this.
>>>
>>> The code introduced only fixes the watchdog related issue, but also does
>>> some groundwork for enhancing the rt signal support later. The
>>> implementation details can be found here:
>>> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
>>>
>>> The current mayday support is only available for powerpc and x86 for
>>> now, more will come in the next days. To have it enabled, you have to
>>> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
>>> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
>>> new interface available from those latest patches.
>>>
>>> The current implementation does not break the 2.5.x ABI on purpose, so
>>> we could merge it into the stable branch.
>>>
>>> We definitely need user feedback on this. Typically, does arming the
>>> nucleus watchdog with that patch support in, properly recovers from your
>>> favorite "get me out of here" situation? TIA,
>>>
>>> You can pull this stuff from
>>> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
>>>
>> I've retested the feature as it's now in master, and it has one
>> remaining problem: If you run the cpu hog under gdb control and try to
>> break out of the while(1) loop, this doesn't work before the watchdog
>> expired - of course. But if you send the break before the expiry (or hit
>> a breakpoint), something goes wrong. The Xenomai task continues to spin,
>> and there is no chance to kill its process (only gdb).
> 
> I can't reproduce this easily here; it happened only once on a lite52xx,
> and then disappeared; no way to reproduce this once on a dual core atom
> in 64bit mode, or on a x86_32 single core platform either. But I still
> saw it once on a powerpc target, so this looks like a generic
> time-dependent issue.
> 
> Do you have the same behavior on a single core config,

You cannot reproduce it on a single core as the CPU hog will occupy that
core and gdb cannot be operated.

> and/or without
> WARNSW enabled?

Just tried and disabled WARNSW in the test below: no difference.

> 
> Also, could you post your hog test code? maybe there is a difference
> with the way I'm testing.

#include <signal.h>
#include <stdio.h>	/* needed for printf() */
#include <string.h>	/* needed for strcmp() */
#include <native/task.h>
#include <sys/mman.h>
#include <stdlib.h>

void sighandler(int sig, siginfo_t *si, void *context)
{
	printf("SIGDEBUG: reason=%d\n", si->si_value.sival_int);
	exit(1);
}

void loop(void *arg)
{
	RT_TASK_INFO info;

	while (1)
		if (!arg)
			rt_task_inquire(NULL, &info);
}

int main(int argc, const char *argv[])
{
	struct sigaction sa;
	RT_TASK task;

	sigemptyset(&sa.sa_mask);
	sa.sa_sigaction = sighandler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGDEBUG, &sa, NULL);

	mlockall(MCL_CURRENT|MCL_FUTURE);
	rt_task_spawn(&task, "cpu-hog", 0, 99, T_JOINABLE|T_WARNSW, loop,
		(void *)(long)((argc > 1) && strcmp(argv[1], "--lethal") == 0));
	rt_task_join(&task);

	return 0;
}

> 
>> # cat /proc/xenomai/sched
>> CPU  PID    CLASS  PRI      TIMEOUT   TIMEBASE   STAT       NAME
>>   0  0      idle    -1      -         master     RR         ROOT/0
> 
> Eeek. This symbolic stat mode label looks weird.

Hmm, haven't noticed this yet. I'm running a kind of all-yes config,
namely:

...
CONFIG_XENOMAI=y
CONFIG_XENO_GENERIC_STACKPOOL=y
CONFIG_XENO_FASTSYNCH=y
CONFIG_XENO_OPT_NUCLEUS=y
CONFIG_XENO_OPT_PERVASIVE=y
CONFIG_XENO_OPT_PRIOCPL=y
CONFIG_XENO_OPT_PIPELINE_HEAD=y
CONFIG_XENO_OPT_SCHED_CLASSES=y
CONFIG_XENO_OPT_SCHED_TP=y
CONFIG_XENO_OPT_SCHED_TP_NRPART=4
CONFIG_XENO_OPT_SCHED_SPORADIC=y
CONFIG_XENO_OPT_SCHED_SPORADIC_MAXREPL=8
CONFIG_XENO_OPT_PIPE=y
CONFIG_XENO_OPT_MAP=y
CONFIG_XENO_OPT_PIPE_NRDEV=32
CONFIG_XENO_OPT_REGISTRY_NRSLOTS=512
CONFIG_XENO_OPT_SYS_HEAPSZ=256
CONFIG_XENO_OPT_SYS_STACKPOOLSZ=128
CONFIG_XENO_OPT_SEM_HEAPSZ=12
CONFIG_XENO_OPT_GLOBAL_SEM_HEAPSZ=12
CONFIG_XENO_OPT_STATS=y
CONFIG_XENO_OPT_DEBUG=y
# CONFIG_XENO_OPT_DEBUG_NUCLEUS is not set
# CONFIG_XENO_OPT_DEBUG_XNLOCK is not set
# CONFIG_XENO_OPT_DEBUG_QUEUES is not set
# CONFIG_XENO_OPT_DEBUG_REGISTRY is not set
# CONFIG_XENO_OPT_DEBUG_TIMERS is not set
CONFIG_XENO_OPT_DEBUG_SYNCH_RELAX=y
CONFIG_XENO_OPT_WATCHDOG=y
CONFIG_XENO_OPT_WATCHDOG_TIMEOUT=60
CONFIG_XENO_OPT_SHIRQ=y
CONFIG_XENO_OPT_SELECT=y

#
# Timing
#
CONFIG_XENO_OPT_TIMING_PERIODIC=y
CONFIG_XENO_OPT_TIMING_VIRTICK=1000
CONFIG_XENO_OPT_TIMING_SCHEDLAT=0

#
# Scalability
#
CONFIG_XENO_OPT_SCALABLE_SCHED=y
# CONFIG_XENO_OPT_TIMER_LIST is not set
CONFIG_XENO_OPT_TIMER_HEAP=y
# CONFIG_XENO_OPT_TIMER_WHEEL is not set
CONFIG_XENO_OPT_TIMER_HEAP_CAPACITY=256
...

Maybe this has some influence as well. The 'RR' correlates with starting
the hog, with or without gdb.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux



* Re: [Xenomai-core] [PATCH] Mayday support
  2010-06-28 14:06       ` Jan Kiszka
@ 2010-06-28 14:12         ` Philippe Gerum
  2010-07-06 15:44         ` Philippe Gerum
  1 sibling, 0 replies; 32+ messages in thread
From: Philippe Gerum @ 2010-06-28 14:12 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai, Tschaeche IT-Services

On Mon, 2010-06-28 at 16:06 +0200, Jan Kiszka wrote:
> Philippe Gerum wrote:
> > On Thu, 2010-06-24 at 14:05 +0200, Jan Kiszka wrote:
> >> Philippe Gerum wrote:
> >>> I've toyed a bit to find a generic approach for the nucleus to regain
> >>> complete control over a userland application running in a syscall-less
> >>> loop.
> >>>
> >>> The original issue was about recovering gracefully from a runaway
> >>> situation detected by the nucleus watchdog, where a thread would spin in
> >>> primary mode without issuing any syscall, but this would also apply for
> >>> real-time signals pending for such a thread. Currently, Xenomai rt
> >>> signals cannot preempt syscall-less code running in primary mode either.
> >>>
> >>> The major difference between the previous approaches we discussed about
> >>> and this one, is the fact that we now force the runaway thread to run a
> >>> piece of valid code that calls into the nucleus. We do not force the
> >>> thread to run faulty code or at a faulty address anymore. Therefore, we
> >>> can reuse this feature to improve the rt signal management, without
> >>> having to forge yet-another signal stack frame for this.
> >>>
> >>> The code introduced only fixes the watchdog related issue, but also does
> >>> some groundwork for enhancing the rt signal support later. The
> >>> implementation details can be found here:
> >>> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
> >>>
> >>> The current mayday support is only available for powerpc and x86 for
> >>> now, more will come in the next days. To have it enabled, you have to
> >>> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
> >>> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
> >>> new interface available from those latest patches.
> >>>
> >>> The current implementation does not break the 2.5.x ABI on purpose, so
> >>> we could merge it into the stable branch.
> >>>
> >>> We definitely need user feedback on this. Typically, does arming the
> >>> nucleus watchdog with that patch support in, properly recovers from your
> >>> favorite "get me out of here" situation? TIA,
> >>>
> >>> You can pull this stuff from
> >>> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> >>>
> >> I've retested the feature as it's now in master, and it has one
> >> remaining problem: If you run the cpu hog under gdb control and try to
> >> break out of the while(1) loop, this doesn't work before the watchdog
> >> expired - of course. But if you send the break before the expiry (or hit
> >> a breakpoint), something goes wrong. The Xenomai task continues to spin,
> >> and there is no chance to kill its process (only gdb).
> > 
> > I can't reproduce this easily here; it happened only once on a lite52xx,
> > and then disappeared; no way to reproduce this once on a dual core atom
> > in 64bit mode, or on a x86_32 single core platform either. But I still
> > saw it once on a powerpc target, so this looks like a generic
> > time-dependent issue.
> > 
> > Do you have the same behavior on a single core config,
> 
> You cannot reproduce it on a single core as the CPU hog will occupy that
> core and gdb cannot be operated.

What I want is the lockup to happen; I'll start working from this point
using other means.

> 
> > and/or without
> > WARNSW enabled?
> 
> Just tried and disabled WARNSW in the test below: no difference.
> 

Ok.

> > 
> > Also, could you post your hog test code? maybe there is a difference
> > with the way I'm testing.
> 
> #include <signal.h>
> #include <native/task.h>
> #include <sys/mman.h>
> #include <stdlib.h>
> 
> void sighandler(int sig, siginfo_t *si, void *context)
> {
> 	printf("SIGDEBUG: reason=%d\n", si->si_value.sival_int);
> 	exit(1);
> }
> 
> void loop(void *arg)
> {
> 	RT_TASK_INFO info;
> 
> 	while (1)
> 		if (!arg)
> 			rt_task_inquire(NULL, &info);
> }
> 
> int main(int argc, const char *argv[])
> {
> 	struct sigaction sa;
> 	RT_TASK task;
> 
> 	sigemptyset(&sa.sa_mask);
> 	sa.sa_sigaction = sighandler;
> 	sa.sa_flags = SA_SIGINFO;
> 	sigaction(SIGDEBUG, &sa, NULL);
> 
> 	mlockall(MCL_CURRENT|MCL_FUTURE);
> 	rt_task_spawn(&task, "cpu-hog", 0, 99, T_JOINABLE|T_WARNSW, loop,
> 		(void *)(long)((argc > 1) && strcmp(argv[1], "--lethal") == 0));
> 	rt_task_join(&task);
> 
> 	return 0;
> }

Ok, will rebase on this code. Thanks.

> 
> > 
> >> # cat /proc/xenomai/sched
> >> CPU  PID    CLASS  PRI      TIMEOUT   TIMEBASE   STAT       NAME
> >>   0  0      idle    -1      -         master     RR         ROOT/0
> > 
> > Eeek. This symbolic stat mode label looks weird.
> 
> Hmm, haven't noticed this yet. I'm running a kind of all-yes config,
> namely:
> 
> ...
> CONFIG_XENOMAI=y
> CONFIG_XENO_GENERIC_STACKPOOL=y
> CONFIG_XENO_FASTSYNCH=y
> CONFIG_XENO_OPT_NUCLEUS=y
> CONFIG_XENO_OPT_PERVASIVE=y
> CONFIG_XENO_OPT_PRIOCPL=y
> CONFIG_XENO_OPT_PIPELINE_HEAD=y
> CONFIG_XENO_OPT_SCHED_CLASSES=y
> CONFIG_XENO_OPT_SCHED_TP=y
> CONFIG_XENO_OPT_SCHED_TP_NRPART=4
> CONFIG_XENO_OPT_SCHED_SPORADIC=y
> CONFIG_XENO_OPT_SCHED_SPORADIC_MAXREPL=8
> CONFIG_XENO_OPT_PIPE=y
> CONFIG_XENO_OPT_MAP=y
> CONFIG_XENO_OPT_PIPE_NRDEV=32
> CONFIG_XENO_OPT_REGISTRY_NRSLOTS=512
> CONFIG_XENO_OPT_SYS_HEAPSZ=256
> CONFIG_XENO_OPT_SYS_STACKPOOLSZ=128
> CONFIG_XENO_OPT_SEM_HEAPSZ=12
> CONFIG_XENO_OPT_GLOBAL_SEM_HEAPSZ=12
> CONFIG_XENO_OPT_STATS=y
> CONFIG_XENO_OPT_DEBUG=y
> # CONFIG_XENO_OPT_DEBUG_NUCLEUS is not set
> # CONFIG_XENO_OPT_DEBUG_XNLOCK is not set
> # CONFIG_XENO_OPT_DEBUG_QUEUES is not set
> # CONFIG_XENO_OPT_DEBUG_REGISTRY is not set
> # CONFIG_XENO_OPT_DEBUG_TIMERS is not set
> CONFIG_XENO_OPT_DEBUG_SYNCH_RELAX=y
> CONFIG_XENO_OPT_WATCHDOG=y
> CONFIG_XENO_OPT_WATCHDOG_TIMEOUT=60
> CONFIG_XENO_OPT_SHIRQ=y
> CONFIG_XENO_OPT_SELECT=y
> 
> #
> # Timing
> #
> CONFIG_XENO_OPT_TIMING_PERIODIC=y
> CONFIG_XENO_OPT_TIMING_VIRTICK=1000
> CONFIG_XENO_OPT_TIMING_SCHEDLAT=0
> 
> #
> # Scalability
> #
> CONFIG_XENO_OPT_SCALABLE_SCHED=y
> # CONFIG_XENO_OPT_TIMER_LIST is not set
> CONFIG_XENO_OPT_TIMER_HEAP=y
> # CONFIG_XENO_OPT_TIMER_WHEEL is not set
> CONFIG_XENO_OPT_TIMER_HEAP_CAPACITY=256
> ...
> 
> Maybe this has some influence as well. The 'RR' correlates with starting
> the hog, with or without gdb.
> 

It looks like the status mask is misinterpreted; it could be some
harmless position-to-label mismatch (it happened already when the state
labels were not properly reordered after a change in the status bits),
or something worse. I'll work from your config as well. Thanks. Again.

> Jan
> 


-- 
Philippe.





* Re: [Xenomai-core] [PATCH] Mayday support
  2010-06-28 14:06       ` Jan Kiszka
  2010-06-28 14:12         ` Philippe Gerum
@ 2010-07-06 15:44         ` Philippe Gerum
  2010-07-06 15:54           ` Jan Kiszka
  1 sibling, 1 reply; 32+ messages in thread
From: Philippe Gerum @ 2010-07-06 15:44 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai, Tschaeche IT-Services

On Mon, 2010-06-28 at 16:06 +0200, Jan Kiszka wrote:
> Philippe Gerum wrote:
> > On Thu, 2010-06-24 at 14:05 +0200, Jan Kiszka wrote:
> >> Philippe Gerum wrote:
> >>> I've toyed a bit to find a generic approach for the nucleus to regain
> >>> complete control over a userland application running in a syscall-less
> >>> loop.
> >>>
> >>> The original issue was about recovering gracefully from a runaway
> >>> situation detected by the nucleus watchdog, where a thread would spin in
> >>> primary mode without issuing any syscall, but this would also apply for
> >>> real-time signals pending for such a thread. Currently, Xenomai rt
> >>> signals cannot preempt syscall-less code running in primary mode either.
> >>>
> >>> The major difference between the previous approaches we discussed about
> >>> and this one, is the fact that we now force the runaway thread to run a
> >>> piece of valid code that calls into the nucleus. We do not force the
> >>> thread to run faulty code or at a faulty address anymore. Therefore, we
> >>> can reuse this feature to improve the rt signal management, without
> >>> having to forge yet-another signal stack frame for this.
> >>>
> >>> The code introduced only fixes the watchdog related issue, but also does
> >>> some groundwork for enhancing the rt signal support later. The
> >>> implementation details can be found here:
> >>> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
> >>>
> >>> The current mayday support is only available for powerpc and x86 for
> >>> now, more will come in the next days. To have it enabled, you have to
> >>> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
> >>> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
> >>> new interface available from those latest patches.
> >>>
> >>> The current implementation does not break the 2.5.x ABI on purpose, so
> >>> we could merge it into the stable branch.
> >>>
> >>> We definitely need user feedback on this. Typically, does arming the
> >>> nucleus watchdog with that patch support in, properly recovers from your
> >>> favorite "get me out of here" situation? TIA,
> >>>
> >>> You can pull this stuff from
> >>> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> >>>
> >> I've retested the feature as it's now in master, and it has one
> >> remaining problem: If you run the cpu hog under gdb control and try to
> >> break out of the while(1) loop, this doesn't work before the watchdog
> >> expired - of course. But if you send the break before the expiry (or hit
> >> a breakpoint), something goes wrong. The Xenomai task continues to spin,
> >> and there is no chance to kill its process (only gdb).
> > 
> > I can't reproduce this easily here; it happened only once on a lite52xx,
> > and then disappeared; no way to reproduce this once on a dual core atom
> > in 64bit mode, or on a x86_32 single core platform either. But I still
> > saw it once on a powerpc target, so this looks like a generic
> > time-dependent issue.
> > 
> > Do you have the same behavior on a single core config,
> 
> You cannot reproduce it on a single core as the CPU hog will occupy that
> core and gdb cannot be operated.
> 
> > and/or without
> > WARNSW enabled?
> 
> Just tried and disabled WARNSW in the test below: no difference.
> 
> > 
> > Also, could you post your hog test code? maybe there is a difference
> > with the way I'm testing.
> 
> #include <signal.h>
> #include <native/task.h>
> #include <sys/mman.h>
> #include <stdlib.h>
> 
> void sighandler(int sig, siginfo_t *si, void *context)
> {
> 	printf("SIGDEBUG: reason=%d\n", si->si_value.sival_int);
> 	exit(1);
> }
> 
> void loop(void *arg)
> {
> 	RT_TASK_INFO info;
> 
> 	while (1)
> 		if (!arg)
> 			rt_task_inquire(NULL, &info);
> }
> 
> int main(int argc, const char *argv[])
> {
> 	struct sigaction sa;
> 	RT_TASK task;
> 
> 	sigemptyset(&sa.sa_mask);
> 	sa.sa_sigaction = sighandler;
> 	sa.sa_flags = SA_SIGINFO;
> 	sigaction(SIGDEBUG, &sa, NULL);
> 
> 	mlockall(MCL_CURRENT|MCL_FUTURE);
> 	rt_task_spawn(&task, "cpu-hog", 0, 99, T_JOINABLE|T_WARNSW, loop,
> 		(void *)(long)((argc > 1) && strcmp(argv[1], "--lethal") == 0));
> 	rt_task_join(&task);
> 
> 	return 0;
> }

I can't reproduce this issue when leaving the watchdog threshold at the
default value (4s).

> CONFIG_XENO_OPT_WATCHDOG=y
> CONFIG_XENO_OPT_WATCHDOG_TIMEOUT=60

60s seems way too long to have a chance of recovering from a runaway
loop to a reasonably sane state. Do you still see the issue with shorter
timeouts?


> CONFIG_XENO_OPT_SHIRQ=y
> CONFIG_XENO_OPT_SELECT=y
> 
> #
> # Timing
> #
> CONFIG_XENO_OPT_TIMING_PERIODIC=y
> CONFIG_XENO_OPT_TIMING_VIRTICK=1000
> CONFIG_XENO_OPT_TIMING_SCHEDLAT=0
> 
> #
> # Scalability
> #
> CONFIG_XENO_OPT_SCALABLE_SCHED=y
> # CONFIG_XENO_OPT_TIMER_LIST is not set
> CONFIG_XENO_OPT_TIMER_HEAP=y
> # CONFIG_XENO_OPT_TIMER_WHEEL is not set
> CONFIG_XENO_OPT_TIMER_HEAP_CAPACITY=256
> ...
> 
> Maybe this has some influence as well. The 'RR' correlates with starting
> the hog, with or without gdb.
> 
> Jan
> 

-- 
Philippe.





* Re: [Xenomai-core] [PATCH] Mayday support
  2010-07-06 15:44         ` Philippe Gerum
@ 2010-07-06 15:54           ` Jan Kiszka
  2010-07-06 16:41             ` Philippe Gerum
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kiszka @ 2010-07-06 15:54 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai, Tschaeche IT-Services

Philippe Gerum wrote:
> On Mon, 2010-06-28 at 16:06 +0200, Jan Kiszka wrote:
>> Philippe Gerum wrote:
>>> On Thu, 2010-06-24 at 14:05 +0200, Jan Kiszka wrote:
>>>> Philippe Gerum wrote:
>>>>> I've toyed a bit to find a generic approach for the nucleus to regain
>>>>> complete control over a userland application running in a syscall-less
>>>>> loop.
>>>>>
>>>>> The original issue was about recovering gracefully from a runaway
>>>>> situation detected by the nucleus watchdog, where a thread would spin in
>>>>> primary mode without issuing any syscall, but this would also apply for
>>>>> real-time signals pending for such a thread. Currently, Xenomai rt
>>>>> signals cannot preempt syscall-less code running in primary mode either.
>>>>>
>>>>> The major difference between the previous approaches we discussed about
>>>>> and this one, is the fact that we now force the runaway thread to run a
>>>>> piece of valid code that calls into the nucleus. We do not force the
>>>>> thread to run faulty code or at a faulty address anymore. Therefore, we
>>>>> can reuse this feature to improve the rt signal management, without
>>>>> having to forge yet-another signal stack frame for this.
>>>>>
>>>>> The code introduced only fixes the watchdog related issue, but also does
>>>>> some groundwork for enhancing the rt signal support later. The
>>>>> implementation details can be found here:
>>>>> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
>>>>>
>>>>> The current mayday support is only available for powerpc and x86 for
>>>>> now, more will come in the next days. To have it enabled, you have to
>>>>> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
>>>>> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
>>>>> new interface available from those latest patches.
>>>>>
>>>>> The current implementation does not break the 2.5.x ABI on purpose, so
>>>>> we could merge it into the stable branch.
>>>>>
>>>>> We definitely need user feedback on this. Typically, does arming the
>>>>> nucleus watchdog with that patch support in, properly recovers from your
>>>>> favorite "get me out of here" situation? TIA,
>>>>>
>>>>> You can pull this stuff from
>>>>> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
>>>>>
>>>> I've retested the feature as it's now in master, and it has one
>>>> remaining problem: If you run the cpu hog under gdb control and try to
>>>> break out of the while(1) loop, this doesn't work before the watchdog
>>>> expired - of course. But if you send the break before the expiry (or hit
>>>> a breakpoint), something goes wrong. The Xenomai task continues to spin,
>>>> and there is no chance to kill its process (only gdb).
>>> I can't reproduce this easily here; it happened only once on a lite52xx,
>>> and then disappeared; no way to reproduce this once on a dual core atom
>>> in 64bit mode, or on a x86_32 single core platform either. But I still
>>> saw it once on a powerpc target, so this looks like a generic
>>> time-dependent issue.
>>>
>>> Do you have the same behavior on a single core config,
>> You cannot reproduce it on a single core as the CPU hog will occupy that
>> core and gdb cannot be operated.
>>
>>> and/or without
>>> WARNSW enabled?
>> Just tried and disabled WARNSW in the test below: no difference.
>>
>>> Also, could you post your hog test code? maybe there is a difference
>>> with the way I'm testing.
>> #include <signal.h>
>> #include <native/task.h>
>> #include <sys/mman.h>
>> #include <stdlib.h>
>>
>> void sighandler(int sig, siginfo_t *si, void *context)
>> {
>> 	printf("SIGDEBUG: reason=%d\n", si->si_value.sival_int);
>> 	exit(1);
>> }
>>
>> void loop(void *arg)
>> {
>> 	RT_TASK_INFO info;
>>
>> 	while (1)
>> 		if (!arg)
>> 			rt_task_inquire(NULL, &info);
>> }
>>
>> int main(int argc, const char *argv[])
>> {
>> 	struct sigaction sa;
>> 	RT_TASK task;
>>
>> 	sigemptyset(&sa.sa_mask);
>> 	sa.sa_sigaction = sighandler;
>> 	sa.sa_flags = SA_SIGINFO;
>> 	sigaction(SIGDEBUG, &sa, NULL);
>>
>> 	mlockall(MCL_CURRENT|MCL_FUTURE);
>> 	rt_task_spawn(&task, "cpu-hog", 0, 99, T_JOINABLE|T_WARNSW, loop,
>> 		(void *)(long)((argc > 1) && strcmp(argv[1], "--lethal") == 0));
>> 	rt_task_join(&task);
>>
>> 	return 0;
>> }
> 
> I can't reproduce this issue, leaving the watchdog threshold to the
> default value (4s).
> 
>> CONFIG_XENO_OPT_WATCHDOG=y
>> CONFIG_XENO_OPT_WATCHDOG_TIMEOUT=60
> 
> 60s seems way too long to have a chance of recovering from a runaway
> loop to a reasonably sane state.

That's required for debugging the kernel.

> Do you still see the issue with shorter
> timeouts?

Yes, I usually lower the timeout before triggering the issue.

OK, I will try to find some time to look closer at this.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux



* Re: [Xenomai-core] [PATCH] Mayday support
  2010-07-06 15:54           ` Jan Kiszka
@ 2010-07-06 16:41             ` Philippe Gerum
  2010-07-06 17:10               ` Jan Kiszka
  0 siblings, 1 reply; 32+ messages in thread
From: Philippe Gerum @ 2010-07-06 16:41 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai, Tschaeche IT-Services

On Tue, 2010-07-06 at 17:54 +0200, Jan Kiszka wrote:
> >> CONFIG_XENO_OPT_WATCHDOG=y
> >> CONFIG_XENO_OPT_WATCHDOG_TIMEOUT=60
> > 
> > 60s seems way too long to have a chance of recovering from a runaway
> > loop to a reasonably sane state.
> 
> That's required for debugging the kernel.
> 

I don't understand this requirement. Any insight?

> > Do you still see the issue with shorter
> > timeouts?
> 
> Yes, I usually lower the timeout before triggering the issue.
> 
> OK, I will try to find some time to look closer at this.
> 
> Jan
> 

-- 
Philippe.





* Re: [Xenomai-core] [PATCH] Mayday support
  2010-07-06 16:41             ` Philippe Gerum
@ 2010-07-06 17:10               ` Jan Kiszka
  0 siblings, 0 replies; 32+ messages in thread
From: Jan Kiszka @ 2010-07-06 17:10 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai, Tschaeche IT-Services

Philippe Gerum wrote:
> On Tue, 2010-07-06 at 17:54 +0200, Jan Kiszka wrote:
>>>> CONFIG_XENO_OPT_WATCHDOG=y
>>>> CONFIG_XENO_OPT_WATCHDOG_TIMEOUT=60
>>> 60s seems way too long to have a chance of recovering from a runaway
>>> loop to a reasonably sane state.
>> That's required for debugging the kernel.
>>
> 
> I don't understand this requirement. Any insight?

While you step through a Xenomai task context, timers continue to tick.
So the time spent in that context gets huge, and soon the task will be
shot by the watchdog. Likely a limitation of kvm (interrupts should be
blockable in single-step mode). I haven't looked at all the details yet,
just picked the lazy workaround.

Of course, we don't use this value on real HW.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux



* Re: [Xenomai-core] [PATCH] Mayday support
  2010-06-24 12:05   ` [Xenomai-core] [PATCH] Mayday support Jan Kiszka
  2010-06-27 16:01     ` Philippe Gerum
@ 2010-08-20 12:32     ` Jan Kiszka
  2010-08-20 14:00       ` Philippe Gerum
  1 sibling, 1 reply; 32+ messages in thread
From: Jan Kiszka @ 2010-08-20 12:32 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai, Tschaeche IT-Services

Jan Kiszka wrote:
> Philippe Gerum wrote:
>> I've toyed a bit to find a generic approach for the nucleus to regain
>> complete control over a userland application running in a syscall-less
>> loop.
>>
>> The original issue was about recovering gracefully from a runaway
>> situation detected by the nucleus watchdog, where a thread would spin in
>> primary mode without issuing any syscall, but this would also apply for
>> real-time signals pending for such a thread. Currently, Xenomai rt
>> signals cannot preempt syscall-less code running in primary mode either.
>>
>> The major difference between the previous approaches we discussed about
>> and this one, is the fact that we now force the runaway thread to run a
>> piece of valid code that calls into the nucleus. We do not force the
>> thread to run faulty code or at a faulty address anymore. Therefore, we
>> can reuse this feature to improve the rt signal management, without
>> having to forge yet-another signal stack frame for this.
>>
>> The code introduced only fixes the watchdog related issue, but also does
>> some groundwork for enhancing the rt signal support later. The
>> implementation details can be found here:
>> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
>>
>> The current mayday support is only available for powerpc and x86 for
>> now, more will come in the next days. To have it enabled, you have to
>> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
>> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
>> new interface available from those latest patches.
>>
>> The current implementation does not break the 2.5.x ABI on purpose, so
>> we could merge it into the stable branch.
>>
>> We definitely need user feedback on this. Typically, does arming the
>> nucleus watchdog with that patch support in, properly recovers from your
>> favorite "get me out of here" situation? TIA,
>>
>> You can pull this stuff from
>> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
>>
> 
> I've retested the feature as it's now in master, and it has one
> remaining problem: If you run the cpu hog under gdb control and try to
> break out of the while(1) loop, this doesn't work before the watchdog
> expired - of course. But if you send the break before the expiry (or hit
> a breakpoint), something goes wrong. The Xenomai task continues to spin,
> and there is no chance to kill its process (only gdb).
> 
> # cat /proc/xenomai/sched
> CPU  PID    CLASS  PRI      TIMEOUT   TIMEBASE   STAT       NAME
>   0  0      idle    -1      -         master     RR         ROOT/0
>   1  0      idle    -1      -         master     R          ROOT/1
>   0  6120   rt      99      -         master     Tt         cpu-hog
> # cat /proc/xenomai/stat
> CPU  PID    MSW        CSW        PF    STAT       %CPU  NAME
>   0  0      0          0          0     00500088    0.0  ROOT/0
>   1  0      0          0          0     00500080   99.7  ROOT/1
>   0  6120   0          1          0     00342180  100.0  cpu-hog
>   0  0      0          21005      0     00000000    0.0  IRQ3340: [timer]
>   1  0      0          35887      0     00000000    0.3  IRQ3340: [timer]
> 

Fixable by this tiny change:

diff --git a/ksrc/nucleus/sched.c b/ksrc/nucleus/sched.c
index 5242d9f..04a344e 100644
--- a/ksrc/nucleus/sched.c
+++ b/ksrc/nucleus/sched.c
@@ -175,7 +175,8 @@ void xnsched_init(struct xnsched *sched, int cpu)
 			     xnthread_name(&sched->rootcb));
 
 #ifdef CONFIG_XENO_OPT_WATCHDOG
-	xntimer_init(&sched->wdtimer, &nktbase, xnsched_watchdog_handler);
+	xntimer_init_noblock(&sched->wdtimer, &nktbase,
+			     xnsched_watchdog_handler);
 	xntimer_set_name(&sched->wdtimer, "[watchdog]");
 	xntimer_set_priority(&sched->wdtimer, XNTIMER_LOPRIO);
 	xntimer_set_sched(&sched->wdtimer, sched);


I.e. the watchdog timer should not be stopped by any ongoing debug
session of a Xenomai app. Will queue this for upstream.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux



* Re: [Xenomai-core] [PATCH] Mayday support
  2010-08-20 12:32     ` Jan Kiszka
@ 2010-08-20 14:00       ` Philippe Gerum
  2010-08-20 14:06         ` Jan Kiszka
  0 siblings, 1 reply; 32+ messages in thread
From: Philippe Gerum @ 2010-08-20 14:00 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai, Tschaeche IT-Services

On Fri, 2010-08-20 at 14:32 +0200, Jan Kiszka wrote:
> Jan Kiszka wrote:
> > Philippe Gerum wrote:
> >> I've toyed a bit to find a generic approach for the nucleus to regain
> >> complete control over a userland application running in a syscall-less
> >> loop.
> >>
> >> The original issue was about recovering gracefully from a runaway
> >> situation detected by the nucleus watchdog, where a thread would spin in
> >> primary mode without issuing any syscall, but this would also apply for
> >> real-time signals pending for such a thread. Currently, Xenomai rt
> >> signals cannot preempt syscall-less code running in primary mode either.
> >>
> >> The major difference between the previous approaches we discussed about
> >> and this one, is the fact that we now force the runaway thread to run a
> >> piece of valid code that calls into the nucleus. We do not force the
> >> thread to run faulty code or at a faulty address anymore. Therefore, we
> >> can reuse this feature to improve the rt signal management, without
> >> having to forge yet-another signal stack frame for this.
> >>
> >> The code introduced only fixes the watchdog related issue, but also does
> >> some groundwork for enhancing the rt signal support later. The
> >> implementation details can be found here:
> >> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
> >>
> >> The current mayday support is only available for powerpc and x86 for
> >> now, more will come in the next days. To have it enabled, you have to
> >> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
> >> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
> >> new interface available from those latest patches.
> >>
> >> The current implementation does not break the 2.5.x ABI on purpose, so
> >> we could merge it into the stable branch.
> >>
> >> We definitely need user feedback on this. Typically, does arming the
> >> nucleus watchdog with that patch support in, properly recovers from your
> >> favorite "get me out of here" situation? TIA,
> >>
> >> You can pull this stuff from
> >> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> >>
> > 
> > I've retested the feature as it's now in master, and it has one
> > remaining problem: If you run the cpu hog under gdb control and try to
> > break out of the while(1) loop, this doesn't work before the watchdog
> > expired - of course. But if you send the break before the expiry (or hit
> > a breakpoint), something goes wrong. The Xenomai task continues to spin,
> > and there is no chance to kill its process (only gdb).
> > 
> > # cat /proc/xenomai/sched
> > CPU  PID    CLASS  PRI      TIMEOUT   TIMEBASE   STAT       NAME
> >   0  0      idle    -1      -         master     RR         ROOT/0

Eeek, we really need to have a look at this funky STAT output.

> >   1  0      idle    -1      -         master     R          ROOT/1
> >   0  6120   rt      99      -         master     Tt         cpu-hog
> > # cat /proc/xenomai/stat
> > CPU  PID    MSW        CSW        PF    STAT       %CPU  NAME
> >   0  0      0          0          0     00500088    0.0  ROOT/0
> >   1  0      0          0          0     00500080   99.7  ROOT/1
> >   0  6120   0          1          0     00342180  100.0  cpu-hog
> >   0  0      0          21005      0     00000000    0.0  IRQ3340: [timer]
> >   1  0      0          35887      0     00000000    0.3  IRQ3340: [timer]
> > 
> 
> Fixable by this tiny change:
> 
> diff --git a/ksrc/nucleus/sched.c b/ksrc/nucleus/sched.c
> index 5242d9f..04a344e 100644
> --- a/ksrc/nucleus/sched.c
> +++ b/ksrc/nucleus/sched.c
> @@ -175,7 +175,8 @@ void xnsched_init(struct xnsched *sched, int cpu)
>  			     xnthread_name(&sched->rootcb));
>  
>  #ifdef CONFIG_XENO_OPT_WATCHDOG
> -	xntimer_init(&sched->wdtimer, &nktbase, xnsched_watchdog_handler);
> +	xntimer_init_noblock(&sched->wdtimer, &nktbase,
> +			     xnsched_watchdog_handler);
>  	xntimer_set_name(&sched->wdtimer, "[watchdog]");
>  	xntimer_set_priority(&sched->wdtimer, XNTIMER_LOPRIO);
>  	xntimer_set_sched(&sched->wdtimer, sched);
> 
> 
> I.e. the watchdog timer should not be stopped by any ongoing debug
> session of a Xenomai app. Will queue this for upstream.

Yes, that makes a lot of sense now. The watchdog would not fire if the
task was single-stepped anyway, since the latter would have been moved
to secondary mode first.

Did you see this bug happening in a uniprocessor context as well?

> 
> Jan
> 

-- 
Philippe.





* Re: [Xenomai-core] [PATCH] Mayday support
  2010-08-20 14:00       ` Philippe Gerum
@ 2010-08-20 14:06         ` Jan Kiszka
  2010-08-20 14:20           ` Philippe Gerum
  0 siblings, 1 reply; 32+ messages in thread
From: Jan Kiszka @ 2010-08-20 14:06 UTC (permalink / raw)
  To: Philippe Gerum; +Cc: xenomai, Tschaeche IT-Services

Philippe Gerum wrote:
> On Fri, 2010-08-20 at 14:32 +0200, Jan Kiszka wrote:
>> Jan Kiszka wrote:
>>> Philippe Gerum wrote:
>>>> I've toyed a bit to find a generic approach for the nucleus to regain
>>>> complete control over a userland application running in a syscall-less
>>>> loop.
>>>>
>>>> The original issue was about recovering gracefully from a runaway
>>>> situation detected by the nucleus watchdog, where a thread would spin in
>>>> primary mode without issuing any syscall, but this would also apply for
>>>> real-time signals pending for such a thread. Currently, Xenomai rt
>>>> signals cannot preempt syscall-less code running in primary mode either.
>>>>
>>>> The major difference between the previous approaches we discussed about
>>>> and this one, is the fact that we now force the runaway thread to run a
>>>> piece of valid code that calls into the nucleus. We do not force the
>>>> thread to run faulty code or at a faulty address anymore. Therefore, we
>>>> can reuse this feature to improve the rt signal management, without
>>>> having to forge yet-another signal stack frame for this.
>>>>
>>>> The code introduced only fixes the watchdog related issue, but also does
>>>> some groundwork for enhancing the rt signal support later. The
>>>> implementation details can be found here:
>>>> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
>>>>
>>>> The current mayday support is only available for powerpc and x86 for
>>>> now, more will come in the next days. To have it enabled, you have to
>>>> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
>>>> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
>>>> new interface available from those latest patches.
>>>>
>>>> The current implementation does not break the 2.5.x ABI on purpose, so
>>>> we could merge it into the stable branch.
>>>>
>>>> We definitely need user feedback on this. Typically, does arming the
>>>> nucleus watchdog with that patch support in, properly recovers from your
>>>> favorite "get me out of here" situation? TIA,
>>>>
>>>> You can pull this stuff from
>>>> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
>>>>
>>> I've retested the feature as it's now in master, and it has one
>>> remaining problem: If you run the cpu hog under gdb control and try to
>>> break out of the while(1) loop, this doesn't work before the watchdog
>>> expired - of course. But if you send the break before the expiry (or hit
>>> a breakpoint), something goes wrong. The Xenomai task continues to spin,
>>> and there is no chance to kill its process (only gdb).
>>>
>>> # cat /proc/xenomai/sched
>>> CPU  PID    CLASS  PRI      TIMEOUT   TIMEBASE   STAT       NAME
>>>   0  0      idle    -1      -         master     RR         ROOT/0
> 
> Eeek, we really need to have a look at this funky STAT output.

I've a patch for this queued as well. Was only a cosmetic thing.

> 
>>>   1  0      idle    -1      -         master     R          ROOT/1
>>>   0  6120   rt      99      -         master     Tt         cpu-hog
>>> # cat /proc/xenomai/stat
>>> CPU  PID    MSW        CSW        PF    STAT       %CPU  NAME
>>>   0  0      0          0          0     00500088    0.0  ROOT/0
>>>   1  0      0          0          0     00500080   99.7  ROOT/1
>>>   0  6120   0          1          0     00342180  100.0  cpu-hog
>>>   0  0      0          21005      0     00000000    0.0  IRQ3340: [timer]
>>>   1  0      0          35887      0     00000000    0.3  IRQ3340: [timer]
>>>
>> Fixable by this tiny change:
>>
>> diff --git a/ksrc/nucleus/sched.c b/ksrc/nucleus/sched.c
>> index 5242d9f..04a344e 100644
>> --- a/ksrc/nucleus/sched.c
>> +++ b/ksrc/nucleus/sched.c
>> @@ -175,7 +175,8 @@ void xnsched_init(struct xnsched *sched, int cpu)
>>  			     xnthread_name(&sched->rootcb));
>>  
>>  #ifdef CONFIG_XENO_OPT_WATCHDOG
>> -	xntimer_init(&sched->wdtimer, &nktbase, xnsched_watchdog_handler);
>> +	xntimer_init_noblock(&sched->wdtimer, &nktbase,
>> +			     xnsched_watchdog_handler);
>>  	xntimer_set_name(&sched->wdtimer, "[watchdog]");
>>  	xntimer_set_priority(&sched->wdtimer, XNTIMER_LOPRIO);
>>  	xntimer_set_sched(&sched->wdtimer, sched);
>>
>>
>> I.e. the watchdog timer should not be stopped by any ongoing debug
>> session of a Xenomai app. Will queue this for upstream.
> 
> Yes, that makes a lot of sense now. The watchdog would not fire if the
> task was single-stepped anyway, since the latter would have been moved
> to secondary mode first.

Yep.

> 
> Did you see this bug happening in a uniprocessor context as well?

No, as it is impossible on a uniprocessor to interact with gdb while a cpu
hog is running - the only existing CPU is simply not available. :)
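
For reference, such a syscall-less hog can be reproduced with a few lines
over the native skin - a minimal sketch, task name and priority arbitrary,
not necessarily my exact test file:

#include <sys/mman.h>
#include <unistd.h>
#include <native/task.h>

static void hog(void *arg)
{
	/* Entered in primary mode; never issues a syscall again. */
	for (;;)
		;
}

int main(void)
{
	RT_TASK t;

	mlockall(MCL_CURRENT | MCL_FUTURE);
	rt_task_spawn(&t, "cpu-hog", 0, 99, 0, hog, NULL);
	pause();	/* keep the process alive */
	return 0;
}

Once it spins, only the watchdog/mayday path can bring it back under
control.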

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [Xenomai-core] [PATCH] Mayday support
  2010-08-20 14:06         ` Jan Kiszka
@ 2010-08-20 14:20           ` Philippe Gerum
  0 siblings, 0 replies; 32+ messages in thread
From: Philippe Gerum @ 2010-08-20 14:20 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: xenomai, Tschaeche IT-Services

On Fri, 2010-08-20 at 16:06 +0200, Jan Kiszka wrote:
> Philippe Gerum wrote:
> > On Fri, 2010-08-20 at 14:32 +0200, Jan Kiszka wrote:
> >> Jan Kiszka wrote:
> >>> Philippe Gerum wrote:
> >>>> I've toyed a bit to find a generic approach for the nucleus to regain
> >>>> complete control over a userland application running in a syscall-less
> >>>> loop.
> >>>>
> >>>> The original issue was about recovering gracefully from a runaway
> >>>> situation detected by the nucleus watchdog, where a thread would spin in
> >>>> primary mode without issuing any syscall, but this would also apply for
> >>>> real-time signals pending for such a thread. Currently, Xenomai rt
> >>>> signals cannot preempt syscall-less code running in primary mode either.
> >>>>
> >>>> The major difference between the previous approaches we discussed about
> >>>> and this one, is the fact that we now force the runaway thread to run a
> >>>> piece of valid code that calls into the nucleus. We do not force the
> >>>> thread to run faulty code or at a faulty address anymore. Therefore, we
> >>>> can reuse this feature to improve the rt signal management, without
> >>>> having to forge yet-another signal stack frame for this.
> >>>>
> >>>> The code introduced only fixes the watchdog related issue, but also does
> >>>> some groundwork for enhancing the rt signal support later. The
> >>>> implementation details can be found here:
> >>>> http://git.xenomai.org/?p=xenomai-rpm.git;a=commit;h=4cf21a2ae58354819da6475ae869b96c2defda0c
> >>>>
> >>>> The current mayday support is only available for powerpc and x86 for
> >>>> now, more will come in the next days. To have it enabled, you have to
> >>>> upgrade your I-pipe patch to 2.6.32.15-2.7-00 or 2.6.34-2.7-00 for x86,
> >>>> 2.6.33.5-2.10-01 or 2.6.34-2.10-00 for powerpc. That feature relies on a
> >>>> new interface available from those latest patches.
> >>>>
> >>>> The current implementation does not break the 2.5.x ABI on purpose, so
> >>>> we could merge it into the stable branch.
> >>>>
> >>>> We definitely need user feedback on this. Typically, does arming the
> >>>> nucleus watchdog with that patch support in, properly recovers from your
> >>>> favorite "get me out of here" situation? TIA,
> >>>>
> >>>> You can pull this stuff from
> >>>> git://git.xenomai.org/xenomai-rpm.git, queue/mayday branch.
> >>>>
> >>> I've retested the feature as it's now in master, and it has one
> >>> remaining problem: If you run the cpu hog under gdb control and try to
> >>> break out of the while(1) loop, this doesn't work before the watchdog
> >>> expired - of course. But if you send the break before the expiry (or hit
> >>> a breakpoint), something goes wrong. The Xenomai task continues to spin,
> >>> and there is no chance to kill its process (only gdb).
> >>>
> >>> # cat /proc/xenomai/sched
> >>> CPU  PID    CLASS  PRI      TIMEOUT   TIMEBASE   STAT       NAME
> >>>   0  0      idle    -1      -         master     RR         ROOT/0
> > 
> > Eeek, we really need to have a look at this funky STAT output.
> 
> I've a patch for this queued as well. Was only a cosmetic thing.
> 
> > 
> >>>   1  0      idle    -1      -         master     R          ROOT/1
> >>>   0  6120   rt      99      -         master     Tt         cpu-hog
> >>> # cat /proc/xenomai/stat
> >>> CPU  PID    MSW        CSW        PF    STAT       %CPU  NAME
> >>>   0  0      0          0          0     00500088    0.0  ROOT/0
> >>>   1  0      0          0          0     00500080   99.7  ROOT/1
> >>>   0  6120   0          1          0     00342180  100.0  cpu-hog
> >>>   0  0      0          21005      0     00000000    0.0  IRQ3340: [timer]
> >>>   1  0      0          35887      0     00000000    0.3  IRQ3340: [timer]
> >>>
> >> Fixable by this tiny change:
> >>
> >> diff --git a/ksrc/nucleus/sched.c b/ksrc/nucleus/sched.c
> >> index 5242d9f..04a344e 100644
> >> --- a/ksrc/nucleus/sched.c
> >> +++ b/ksrc/nucleus/sched.c
> >> @@ -175,7 +175,8 @@ void xnsched_init(struct xnsched *sched, int cpu)
> >>  			     xnthread_name(&sched->rootcb));
> >>  
> >>  #ifdef CONFIG_XENO_OPT_WATCHDOG
> >> -	xntimer_init(&sched->wdtimer, &nktbase, xnsched_watchdog_handler);
> >> +	xntimer_init_noblock(&sched->wdtimer, &nktbase,
> >> +			     xnsched_watchdog_handler);
> >>  	xntimer_set_name(&sched->wdtimer, "[watchdog]");
> >>  	xntimer_set_priority(&sched->wdtimer, XNTIMER_LOPRIO);
> >>  	xntimer_set_sched(&sched->wdtimer, sched);
> >>
> >>
> >> I.e. the watchdog timer should not be stopped by any ongoing debug
> >> session of a Xenomai app. Will queue this for upstream.
> > 
> > Yes, that makes a lot of sense now. The watchdog would not fire if the
> > task was single-stepped anyway, since the latter would have been moved
> > to secondary mode first.
> 
> Yep.
> 
> > 
> > Did you see this bug happening in a uniprocessor context as well?
> 
> No, as it is impossible on a uniprocessor to interact with gdb while a cpu
> hog is running - the only existing CPU is simply not available. :)

I was rather thinking of your hit-a-breakpoint-or-^C-early scenario... I
thought you did see this on UP as well, and scratched my head to
understand how this would have been possible. Ok, so let's merge this.

> 
> Jan
> 

-- 
Philippe.




^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2010-08-20 14:20 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-06-02 17:19 [Xenomai-core] [RFC] Break out of endless user space loops Jan Kiszka
2010-06-02 17:30 ` Gilles Chanteperdrix
2010-06-03  6:55   ` Jan Kiszka
2010-06-03  8:27     ` Philippe Gerum
2010-06-03  8:47       ` Jan Kiszka
2010-06-03  9:56         ` Philippe Gerum
2010-06-03 10:18           ` Jan Kiszka
2010-06-03 10:47             ` Philippe Gerum
2010-06-03 10:52               ` Philippe Gerum
2010-06-03 10:59               ` Jan Kiszka
2010-06-02 20:58 ` Philippe Gerum
2010-06-03  6:56   ` Jan Kiszka
2010-06-09 10:41 ` [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops) Philippe Gerum
2010-06-09 13:38   ` [Xenomai-help] " Tschaeche IT-Services
2010-06-09 14:01     ` Philippe Gerum
2010-06-09 18:11   ` Tschaeche IT-Services
2010-06-18 23:11     ` [Xenomai-core] " Philippe Gerum
2010-06-24  9:22       ` [Xenomai-help] " Tschaeche IT-Services
2010-06-24  9:34         ` [Xenomai-core] [PATCH] Mayday support Jan Kiszka
2010-06-24 10:28         ` [Xenomai-core] [PATCH] Mayday support (was: Re: [RFC] Break out of endless user space loops) Philippe Gerum
2010-06-24 12:05   ` [Xenomai-core] [PATCH] Mayday support Jan Kiszka
2010-06-27 16:01     ` Philippe Gerum
2010-06-28 14:06       ` Jan Kiszka
2010-06-28 14:12         ` Philippe Gerum
2010-07-06 15:44         ` Philippe Gerum
2010-07-06 15:54           ` Jan Kiszka
2010-07-06 16:41             ` Philippe Gerum
2010-07-06 17:10               ` Jan Kiszka
2010-08-20 12:32     ` Jan Kiszka
2010-08-20 14:00       ` Philippe Gerum
2010-08-20 14:06         ` Jan Kiszka
2010-08-20 14:20           ` Philippe Gerum
