All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] arm64: Expose original FAR_EL1 value in sigcontext
@ 2020-03-12 17:17 Peter Collingbourne
  2020-03-25 13:10 ` Catalin Marinas
  2020-03-25 17:40 ` [PATCH v2] " Peter Collingbourne
  0 siblings, 2 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-03-12 17:17 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany, Vincenzo Frascino
  Cc: Peter Collingbourne, Andrey Konovalov, Kevin Brodsky,
	Will Deacon, Linux ARM, Richard Henderson

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
tools such as HWASan [1], or future tools based on the Memory Tagging
Extension (MTE), may need the tag bits in order to accurately diagnose
memory errors.

We should not stop clearing these bits in the existing fault address
fields, because there may be existing userspace applications that are
expecting the tag bits to be cleared. Instead, create a far_context in
sigcontext (similar to the existing esr_context), and store the original
value of FAR_EL1 (including the tag bits) there.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
 arch/arm64/include/asm/exception.h       |  2 +-
 arch/arm64/include/asm/processor.h       |  2 +-
 arch/arm64/include/uapi/asm/sigcontext.h |  9 +++++
 arch/arm64/kernel/entry-common.c         |  2 --
 arch/arm64/kernel/hw_breakpoint.c        |  3 +-
 arch/arm64/kernel/signal.c               | 20 ++++++++++-
 arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
 7 files changed, 59 insertions(+), 24 deletions(-)

diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8e..90e772d9b2cd8 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
 }
 
 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 5ba63204d078a..77d916c075319 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -142,7 +142,7 @@ struct thread_struct {
 	void			*sve_state;	/* SVE registers, if any */
 	unsigned int		sve_vl;		/* SVE vector length */
 	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
-	unsigned long		fault_address;	/* fault info */
+	unsigned long		fault_address;	/* FAR_EL1 value */
 	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
 #ifdef CONFIG_ARM64_PTR_AUTH
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8b0ebce92427b..f532a2505d5e8 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -44,6 +44,7 @@ struct sigcontext {
  *
  *	0x210		fpsimd_context
  *	 0x10		esr_context
+ *	 0x10		far_context
  *	0x8a0		sve_context (vl <= 64) (optional)
  *	 0x20		extra_context (optional)
  *	 0x10		terminator (null _aarch64_ctx)
@@ -94,6 +95,14 @@ struct esr_context {
 	__u64 esr;
 };
 
+/* FAR_EL1 context */
+#define FAR_MAGIC	0x46415201
+
+struct far_context {
+	struct _aarch64_ctx head;
+	__u64 far;
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index fde59981445ca..290ea59c68b85 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 
 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
 
 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 0b727edf41046..985cd44decf62 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -730,7 +730,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
 		return 0;
 }
 
-static int watchpoint_handler(unsigned long addr, unsigned int esr,
+static int watchpoint_handler(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
 	int i, step = 0, *kernel_step, access, closest_match = 0;
@@ -741,6 +741,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
 	struct debug_info *debug_info;
 	struct arch_hw_breakpoint *info;
 	struct arch_hw_breakpoint_ctrl ctrl;
+	unsigned long addr = untagged_addr(far);
 
 	slots = this_cpu_ptr(wp_on_reg);
 	debug_info = &current->thread.debug;
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 339882db5a915..48e8b6c7b5369 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
 
 	unsigned long fpsimd_offset;
 	unsigned long esr_offset;
+	unsigned long far_offset;
 	unsigned long sve_offset;
 	unsigned long extra_offset;
 	unsigned long end_offset;
@@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
 			break;
 
 		case ESR_MAGIC:
+		case FAR_MAGIC:
 			/* ignore */
 			break;
 
@@ -581,6 +583,11 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 				     sizeof(struct esr_context));
 		if (err)
 			return err;
+
+		err = sigframe_alloc(user, &user->far_offset,
+				     sizeof(struct far_context));
+		if (err)
+			return err;
 	}
 
 	if (system_supports_sve()) {
@@ -621,7 +628,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
 	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
 
-	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
+	__put_user_error(untagged_addr(current->thread.fault_address),
+			 &sf->uc.uc_mcontext.fault_address, err);
 
 	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
@@ -641,6 +649,16 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
 	}
 
+	if (err == 0 && user->far_offset) {
+		struct far_context __user *far_ctx =
+			apply_user_offset(user, user->far_offset);
+
+		__put_user_error(FAR_MAGIC, &far_ctx->head.magic, err);
+		__put_user_error(sizeof(*far_ctx), &far_ctx->head.size, err);
+		__put_user_error(current->thread.fault_address, &far_ctx->far,
+				 err);
+	}
+
 	/* Scalable Vector Extension state, if present */
 	if (system_supports_sve() && err == 0 && user->sve_offset) {
 		struct sve_context __user *sve_ctx =
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 85566d32958f5..2ca2de1ff43be 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -41,7 +41,7 @@
 #include <asm/traps.h>
 
 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void set_thread_esr(unsigned long address, unsigned int esr)
+static void set_thread_esr(unsigned long far, unsigned int esr)
 {
-	current->thread.fault_address = address;
+	unsigned long addr = untagged_addr(far);
+
+	current->thread.fault_address = far;
 
 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -336,7 +338,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	 * type", so we ignore this wrinkle and just return the translation
 	 * fault.)
 	 */
-	if (!is_ttbr0_addr(current->thread.fault_address)) {
+	if (!is_ttbr0_addr(addr)) {
 		switch (ESR_ELx_EC(esr)) {
 		case ESR_ELx_EC_DABT_LOW:
 			/*
@@ -377,8 +379,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }
 
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
@@ -386,7 +391,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
 
-		set_thread_esr(addr, esr);
+		set_thread_esr(far, esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
 				      inf->name);
 	} else {
@@ -439,7 +444,7 @@ static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -447,6 +452,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	unsigned long addr = untagged_addr(far);
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -580,7 +586,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	}
 
 	inf = esr_to_fault_info(esr);
-	set_thread_esr(addr, esr);
+	set_thread_esr(far, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
@@ -615,30 +621,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);
 
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	void __user *siaddr;
@@ -654,7 +662,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	if (esr & ESR_ELx_FnV)
 		siaddr = NULL;
 	else
-		siaddr  = (void __user *)addr;
+		siaddr  = (void __user *)untagged_addr(far);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -727,11 +735,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {
-- 
2.25.1.481.gfbce0eb801-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-12 17:17 [PATCH] arm64: Expose original FAR_EL1 value in sigcontext Peter Collingbourne
@ 2020-03-25 13:10 ` Catalin Marinas
  2020-03-25 17:41   ` Peter Collingbourne
  2020-03-25 17:40 ` [PATCH v2] " Peter Collingbourne
  1 sibling, 1 reply; 64+ messages in thread
From: Catalin Marinas @ 2020-03-25 13:10 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Will Deacon, Linux ARM,
	Richard Henderson

Hi Peter,

On Thu, Mar 12, 2020 at 10:17:55AM -0700, Peter Collingbourne wrote:
> diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> index fde59981445ca..290ea59c68b85 100644
> --- a/arch/arm64/kernel/entry-common.c
> +++ b/arch/arm64/kernel/entry-common.c
> @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
>  	unsigned long far = read_sysreg(far_el1);
>  
>  	local_daif_inherit(regs);
> -	far = untagged_addr(far);
>  	do_mem_abort(far, esr, regs);
>  }
>  NOKPROBE_SYMBOL(el1_abort);

Would we get a signal on faults triggered by the kernel? Anyway, I'm
fine with this change for consistency, and it may help with the fault
information printed by the kernel with khwasan or (later) MTE.

> @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
>  
>  	user_exit_irqoff();
>  	local_daif_restore(DAIF_PROCCTX);
> -	far = untagged_addr(far);
>  	do_mem_abort(far, esr, regs);
>  }
>  NOKPROBE_SYMBOL(el0_da);
> diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
> index 0b727edf41046..985cd44decf62 100644
> --- a/arch/arm64/kernel/hw_breakpoint.c
> +++ b/arch/arm64/kernel/hw_breakpoint.c
> @@ -730,7 +730,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
>  		return 0;
>  }
>  
> -static int watchpoint_handler(unsigned long addr, unsigned int esr,
> +static int watchpoint_handler(unsigned long far, unsigned int esr,
>  			      struct pt_regs *regs)
>  {
>  	int i, step = 0, *kernel_step, access, closest_match = 0;
> @@ -741,6 +741,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
>  	struct debug_info *debug_info;
>  	struct arch_hw_breakpoint *info;
>  	struct arch_hw_breakpoint_ctrl ctrl;
> +	unsigned long addr = untagged_addr(far);
>  
>  	slots = this_cpu_ptr(wp_on_reg);
>  	debug_info = &current->thread.debug;

Why do we need to untag this here? Have you hit any bug? This function
gets the original FAR_EL1 value, untagged (via elX_dbg()), and we clear
the tag further down in get_distance_from_watchpoint().

> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> index 339882db5a915..48e8b6c7b5369 100644
> --- a/arch/arm64/kernel/signal.c
> +++ b/arch/arm64/kernel/signal.c
> @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
>  
>  	unsigned long fpsimd_offset;
>  	unsigned long esr_offset;
> +	unsigned long far_offset;
>  	unsigned long sve_offset;
>  	unsigned long extra_offset;
>  	unsigned long end_offset;
> @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
>  			break;
>  
>  		case ESR_MAGIC:
> +		case FAR_MAGIC:
>  			/* ignore */
>  			break;
>  
> @@ -581,6 +583,11 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
>  				     sizeof(struct esr_context));
>  		if (err)
>  			return err;
> +
> +		err = sigframe_alloc(user, &user->far_offset,
> +				     sizeof(struct far_context));
> +		if (err)
> +			return err;

It looks fine, I think it makes sense to only expose the raw FAR_EL1
when we also expose the ESR_EL1 (via set_thread_esr()).

> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index 85566d32958f5..2ca2de1ff43be 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -41,7 +41,7 @@
>  #include <asm/traps.h>
>  
>  struct fault_info {
> -	int	(*fn)(unsigned long addr, unsigned int esr,
> +	int	(*fn)(unsigned long far, unsigned int esr,
>  		      struct pt_regs *regs);
>  	int	sig;
>  	int	code;
> @@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
>  	die_kernel_fault(msg, addr, esr, regs);
>  }
>  
> -static void set_thread_esr(unsigned long address, unsigned int esr)
> +static void set_thread_esr(unsigned long far, unsigned int esr)

We might as well rename this to set_thread_far_esr().

Thanks.

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* [PATCH v2] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-12 17:17 [PATCH] arm64: Expose original FAR_EL1 value in sigcontext Peter Collingbourne
  2020-03-25 13:10 ` Catalin Marinas
@ 2020-03-25 17:40 ` Peter Collingbourne
  2020-03-26 16:45   ` Catalin Marinas
  2020-03-27 19:19   ` [PATCH v3] " Peter Collingbourne
  1 sibling, 2 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-03-25 17:40 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany, Vincenzo Frascino
  Cc: Peter Collingbourne, Andrey Konovalov, Kevin Brodsky,
	Will Deacon, Linux ARM, Richard Henderson

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
tools such as HWASan [1], or future tools based on the Memory Tagging
Extension (MTE), may need the tag bits in order to accurately diagnose
memory errors.

We should not stop clearing these bits in the existing fault address
fields, because there may be existing userspace applications that are
expecting the tag bits to be cleared. Instead, create a far_context in
sigcontext (similar to the existing esr_context), and store the original
value of FAR_EL1 (including the tag bits) there.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
v2:
- revert changes to hw_breakpoint.c
- rename set_thread_esr to set_thread_far_esr

 arch/arm64/include/asm/exception.h       |  2 +-
 arch/arm64/include/asm/processor.h       |  2 +-
 arch/arm64/include/uapi/asm/sigcontext.h |  9 +++++
 arch/arm64/kernel/entry-common.c         |  2 --
 arch/arm64/kernel/signal.c               | 20 ++++++++++-
 arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
 6 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..90e772d9b2cd 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
 }
 
 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 5ba63204d078..77d916c07531 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -142,7 +142,7 @@ struct thread_struct {
 	void			*sve_state;	/* SVE registers, if any */
 	unsigned int		sve_vl;		/* SVE vector length */
 	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
-	unsigned long		fault_address;	/* fault info */
+	unsigned long		fault_address;	/* FAR_EL1 value */
 	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
 #ifdef CONFIG_ARM64_PTR_AUTH
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8b0ebce92427..f532a2505d5e 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -44,6 +44,7 @@ struct sigcontext {
  *
  *	0x210		fpsimd_context
  *	 0x10		esr_context
+ *	 0x10		far_context
  *	0x8a0		sve_context (vl <= 64) (optional)
  *	 0x20		extra_context (optional)
  *	 0x10		terminator (null _aarch64_ctx)
@@ -94,6 +95,14 @@ struct esr_context {
 	__u64 esr;
 };
 
+/* FAR_EL1 context */
+#define FAR_MAGIC	0x46415201
+
+struct far_context {
+	struct _aarch64_ctx head;
+	__u64 far;
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index fde59981445c..290ea59c68b8 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 
 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
 
 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 339882db5a91..48e8b6c7b536 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
 
 	unsigned long fpsimd_offset;
 	unsigned long esr_offset;
+	unsigned long far_offset;
 	unsigned long sve_offset;
 	unsigned long extra_offset;
 	unsigned long end_offset;
@@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
 			break;
 
 		case ESR_MAGIC:
+		case FAR_MAGIC:
 			/* ignore */
 			break;
 
@@ -581,6 +583,11 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 				     sizeof(struct esr_context));
 		if (err)
 			return err;
+
+		err = sigframe_alloc(user, &user->far_offset,
+				     sizeof(struct far_context));
+		if (err)
+			return err;
 	}
 
 	if (system_supports_sve()) {
@@ -621,7 +628,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
 	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
 
-	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
+	__put_user_error(untagged_addr(current->thread.fault_address),
+			 &sf->uc.uc_mcontext.fault_address, err);
 
 	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
@@ -641,6 +649,16 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
 	}
 
+	if (err == 0 && user->far_offset) {
+		struct far_context __user *far_ctx =
+			apply_user_offset(user, user->far_offset);
+
+		__put_user_error(FAR_MAGIC, &far_ctx->head.magic, err);
+		__put_user_error(sizeof(*far_ctx), &far_ctx->head.size, err);
+		__put_user_error(current->thread.fault_address, &far_ctx->far,
+				 err);
+	}
+
 	/* Scalable Vector Extension state, if present */
 	if (system_supports_sve() && err == 0 && user->sve_offset) {
 		struct sve_context __user *sve_ctx =
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 85566d32958f..738adc950012 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -41,7 +41,7 @@
 #include <asm/traps.h>
 
 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void set_thread_esr(unsigned long address, unsigned int esr)
+static void set_thread_far_esr(unsigned long far, unsigned int esr)
 {
-	current->thread.fault_address = address;
+	unsigned long addr = untagged_addr(far);
+
+	current->thread.fault_address = far;
 
 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -336,7 +338,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	 * type", so we ignore this wrinkle and just return the translation
 	 * fault.)
 	 */
-	if (!is_ttbr0_addr(current->thread.fault_address)) {
+	if (!is_ttbr0_addr(addr)) {
 		switch (ESR_ELx_EC(esr)) {
 		case ESR_ELx_EC_DABT_LOW:
 			/*
@@ -377,8 +379,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }
 
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
@@ -386,7 +391,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
 
-		set_thread_esr(addr, esr);
+		set_thread_far_esr(far, esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
 				      inf->name);
 	} else {
@@ -439,7 +444,7 @@ static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -447,6 +452,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	unsigned long addr = untagged_addr(far);
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -580,7 +586,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	}
 
 	inf = esr_to_fault_info(esr);
-	set_thread_esr(addr, esr);
+	set_thread_far_esr(far, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
@@ -615,30 +621,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);
 
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	void __user *siaddr;
@@ -654,7 +662,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	if (esr & ESR_ELx_FnV)
 		siaddr = NULL;
 	else
-		siaddr  = (void __user *)addr;
+		siaddr  = (void __user *)untagged_addr(far);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -727,11 +735,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {
-- 
2.25.1.696.g5e7596f4ac-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-25 13:10 ` Catalin Marinas
@ 2020-03-25 17:41   ` Peter Collingbourne
  0 siblings, 0 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-03-25 17:41 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Will Deacon, Linux ARM,
	Richard Henderson

On Wed, Mar 25, 2020 at 6:10 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> Hi Peter,
>
> On Thu, Mar 12, 2020 at 10:17:55AM -0700, Peter Collingbourne wrote:
> > diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> > index fde59981445ca..290ea59c68b85 100644
> > --- a/arch/arm64/kernel/entry-common.c
> > +++ b/arch/arm64/kernel/entry-common.c
> > @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
> >       unsigned long far = read_sysreg(far_el1);
> >
> >       local_daif_inherit(regs);
> > -     far = untagged_addr(far);
> >       do_mem_abort(far, esr, regs);
> >  }
> >  NOKPROBE_SYMBOL(el1_abort);
>
> Would we get a signal on faults triggered by the kernel? Anyway, I'm
> fine with this change for consistency and may help with the fault
> information printed by the kernel with khwasan or (later) MTE.

It doesn't look like we would. As far as I can tell, all of the signal
injection paths are guarded with if (user_mode(regs)) or similar checks.
Agreed with the consistency argument.

> > @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
> >
> >       user_exit_irqoff();
> >       local_daif_restore(DAIF_PROCCTX);
> > -     far = untagged_addr(far);
> >       do_mem_abort(far, esr, regs);
> >  }
> >  NOKPROBE_SYMBOL(el0_da);
> > diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
> > index 0b727edf41046..985cd44decf62 100644
> > --- a/arch/arm64/kernel/hw_breakpoint.c
> > +++ b/arch/arm64/kernel/hw_breakpoint.c
> > @@ -730,7 +730,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
> >               return 0;
> >  }
> >
> > -static int watchpoint_handler(unsigned long addr, unsigned int esr,
> > +static int watchpoint_handler(unsigned long far, unsigned int esr,
> >                             struct pt_regs *regs)
> >  {
> >       int i, step = 0, *kernel_step, access, closest_match = 0;
> > @@ -741,6 +741,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
> >       struct debug_info *debug_info;
> >       struct arch_hw_breakpoint *info;
> >       struct arch_hw_breakpoint_ctrl ctrl;
> > +     unsigned long addr = untagged_addr(far);
> >
> >       slots = this_cpu_ptr(wp_on_reg);
> >       debug_info = &current->thread.debug;
>
> Why do we need to untag this here? Have you hit any bug? This function
> gets the original FAR_EL1 value, untagged (via elX_dbg()), and we clear
> the tag further down in get_distance_from_watchpoint().

You're right, I missed that this was going via elX_dbg() rather than
an abort handler. In fact, this would seem to be a potential userspace
break because the now-untagged address is also stored in
counter_arch_bp(wp)->trigger, which is exposed to userspace via
ptrace_hbptriggered in arch/arm64/kernel/ptrace.c. I've reverted this
part in v2.

> > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > index 339882db5a915..48e8b6c7b5369 100644
> > --- a/arch/arm64/kernel/signal.c
> > +++ b/arch/arm64/kernel/signal.c
> > @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
> >
> >       unsigned long fpsimd_offset;
> >       unsigned long esr_offset;
> > +     unsigned long far_offset;
> >       unsigned long sve_offset;
> >       unsigned long extra_offset;
> >       unsigned long end_offset;
> > @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
> >                       break;
> >
> >               case ESR_MAGIC:
> > +             case FAR_MAGIC:
> >                       /* ignore */
> >                       break;
> >
> > @@ -581,6 +583,11 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> >                                    sizeof(struct esr_context));
> >               if (err)
> >                       return err;
> > +
> > +             err = sigframe_alloc(user, &user->far_offset,
> > +                                  sizeof(struct far_context));
> > +             if (err)
> > +                     return err;
>
> It looks fine, I think it makes sense to only expose the raw FAR_EL1
> when we also expose the ESR_EL1 (via set_thread_esr()).
>
> > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> > index 85566d32958f5..2ca2de1ff43be 100644
> > --- a/arch/arm64/mm/fault.c
> > +++ b/arch/arm64/mm/fault.c
> > @@ -41,7 +41,7 @@
> >  #include <asm/traps.h>
> >
> >  struct fault_info {
> > -     int     (*fn)(unsigned long addr, unsigned int esr,
> > +     int     (*fn)(unsigned long far, unsigned int esr,
> >                     struct pt_regs *regs);
> >       int     sig;
> >       int     code;
> > @@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
> >       die_kernel_fault(msg, addr, esr, regs);
> >  }
> >
> > -static void set_thread_esr(unsigned long address, unsigned int esr)
> > +static void set_thread_esr(unsigned long far, unsigned int esr)
>
> We might as well rename this to set_thread_far_esr().

Done in v2.


Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v2] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-25 17:40 ` [PATCH v2] " Peter Collingbourne
@ 2020-03-26 16:45   ` Catalin Marinas
  2020-03-27  7:56     ` Will Deacon
  2020-03-27 19:19   ` [PATCH v3] " Peter Collingbourne
  1 sibling, 1 reply; 64+ messages in thread
From: Catalin Marinas @ 2020-03-26 16:45 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Will Deacon, Linux ARM,
	Richard Henderson

On Wed, Mar 25, 2020 at 10:40:01AM -0700, Peter Collingbourne wrote:
> The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> the tag bits may be needed by tools in order to accurately diagnose
> memory errors, such as HWASan [1] or future tools based on the Memory
> Tagging Extension (MTE).
> 
> We should not stop clearing these bits in the existing fault address
> fields, because there may be existing userspace applications that are
> expecting the tag bits to be cleared. Instead, create a far_context in
> sigcontext (similar to the existing esr_context), and store the original
> value of FAR_EL1 (including the tag bits) there.
> 
> [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> 
> Signed-off-by: Peter Collingbourne <pcc@google.com>

The patch looks fine. However, I wouldn't queue it for 5.7, it's too
close to the merging window and I'd like it to sit in linux-next for a
bit. Unless there are other comments, it looks fine to me for -rc8.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Thanks.

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v2] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-26 16:45   ` Catalin Marinas
@ 2020-03-27  7:56     ` Will Deacon
  2020-03-27 11:39       ` Catalin Marinas
  0 siblings, 1 reply; 64+ messages in thread
From: Will Deacon @ 2020-03-27  7:56 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Peter Collingbourne,
	Linux ARM, Richard Henderson

On Thu, Mar 26, 2020 at 04:45:39PM +0000, Catalin Marinas wrote:
> On Wed, Mar 25, 2020 at 10:40:01AM -0700, Peter Collingbourne wrote:
> > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > the tag bits may be needed by tools in order to accurately diagnose
> > memory errors, such as HWASan [1] or future tools based on the Memory
> > Tagging Extension (MTE).
> > 
> > We should not stop clearing these bits in the existing fault address
> > fields, because there may be existing userspace applications that are
> > expecting the tag bits to be cleared. Instead, create a far_context in
> > sigcontext (similar to the existing esr_context), and store the original
> > value of FAR_EL1 (including the tag bits) there.
> > 
> > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > 
> > Signed-off-by: Peter Collingbourne <pcc@google.com>
> 
> The patch looks fine. However, I wouldn't queue it for 5.7, it's too
> close to the merging window and I'd like it to sit in linux-next for a
> bit. Unless there are other comments, it looks fine to me for -rc8.

You mean 5.8? I'm also a bit surprised not to see a docs update, given that
we talk about the general lack of tags in siginfo_t towards the end of
Documentation/arm64/tagged-pointers.rst

Will

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v2] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-27  7:56     ` Will Deacon
@ 2020-03-27 11:39       ` Catalin Marinas
  2020-03-27 19:26         ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Catalin Marinas @ 2020-03-27 11:39 UTC (permalink / raw)
  To: Will Deacon
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Peter Collingbourne,
	Linux ARM, Richard Henderson

On Fri, Mar 27, 2020 at 07:56:56AM +0000, Will Deacon wrote:
> On Thu, Mar 26, 2020 at 04:45:39PM +0000, Catalin Marinas wrote:
> > On Wed, Mar 25, 2020 at 10:40:01AM -0700, Peter Collingbourne wrote:
> > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > the tag bits may be needed by tools in order to accurately diagnose
> > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > Tagging Extension (MTE).
> > > 
> > > We should not stop clearing these bits in the existing fault address
> > > fields, because there may be existing userspace applications that are
> > > expecting the tag bits to be cleared. Instead, create a far_context in
> > > sigcontext (similar to the existing esr_context), and store the original
> > > value of FAR_EL1 (including the tag bits) there.
> > > 
> > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > > 
> > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > 
> > The patch looks fine. However, I wouldn't queue it for 5.7, it's too
> > close to the merging window and I'd like it to sit in linux-next for a
> > bit. Unless there are other comments, it looks fine to me for -rc8.
> 
> You mean 5.8?

Yes.

> I'm also a bit surprised not to see a docs update, given that
> we talk about the general lack of tags in siginfo_t towards the end of
> Documentation/arm64/tagged-pointers.rst

Good point. It's worth adding this to the tagged-pointers.rst document
since the only use of the raw FAR_EL1 is for tagged pointers.

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-25 17:40 ` [PATCH v2] " Peter Collingbourne
  2020-03-26 16:45   ` Catalin Marinas
@ 2020-03-27 19:19   ` Peter Collingbourne
  2020-04-22 14:25     ` Catalin Marinas
                       ` (2 more replies)
  1 sibling, 3 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-03-27 19:19 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany, Vincenzo Frascino
  Cc: Peter Collingbourne, Andrey Konovalov, Kevin Brodsky,
	Will Deacon, Linux ARM, Richard Henderson

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
the tag bits may be needed by tools in order to accurately diagnose
memory errors, such as HWASan [1] or future tools based on the Memory
Tagging Extension (MTE).

We should not stop clearing these bits in the existing fault address
fields, because there may be existing userspace applications that are
expecting the tag bits to be cleared. Instead, create a far_context in
sigcontext (similar to the existing esr_context), and store the original
value of FAR_EL1 (including the tag bits) there.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
v3:
- add documentation to tagged-pointers.rst
- update comments in sigcontext.h

v2:
- revert changes to hw_breakpoint.c
- rename set_thread_esr to set_thread_far_esr

 Documentation/arm64/tagged-pointers.rst  | 17 +++++----
 arch/arm64/include/asm/exception.h       |  2 +-
 arch/arm64/include/asm/processor.h       |  2 +-
 arch/arm64/include/uapi/asm/sigcontext.h | 21 +++++++----
 arch/arm64/kernel/entry-common.c         |  2 --
 arch/arm64/kernel/signal.c               | 20 ++++++++++-
 arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
 7 files changed, 74 insertions(+), 35 deletions(-)

diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
index eab4323609b9..9da7f6262fad 100644
--- a/Documentation/arm64/tagged-pointers.rst
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -53,12 +53,17 @@ visibility.
 Preserving tags
 ---------------
 
-Non-zero tags are not preserved when delivering signals. This means that
-signal handlers in applications making use of tags cannot rely on the
-tag information for user virtual addresses being maintained for fields
-inside siginfo_t. One exception to this rule is for signals raised in
-response to watchpoint debug exceptions, where the tag information will
-be preserved.
+Non-zero tags are not preserved in the fault address fields
+siginfo.si_addr or sigcontext.fault_address when delivering
+signals. This means that signal handlers in applications making use
+of tags cannot rely on the tag information for user virtual addresses
+being maintained in these fields. One exception to this rule is for
+signals raised in response to watchpoint debug exceptions, where the
+tag information will be preserved.
+
+The fault address tag is preserved in the far field of the signal
+frame record far_context, which is present for signals raised in
+response to data aborts and instruction aborts.
 
 The architecture prevents the use of a tagged PC, so the upper byte will
 be set to a sign-extension of bit 55 on exception return.
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..90e772d9b2cd 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
 }
 
 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 5ba63204d078..77d916c07531 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -142,7 +142,7 @@ struct thread_struct {
 	void			*sve_state;	/* SVE registers, if any */
 	unsigned int		sve_vl;		/* SVE vector length */
 	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
-	unsigned long		fault_address;	/* fault info */
+	unsigned long		fault_address;	/* FAR_EL1 value */
 	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
 #ifdef CONFIG_ARM64_PTR_AUTH
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8b0ebce92427..6782394633cb 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -44,11 +44,12 @@ struct sigcontext {
  *
  *	0x210		fpsimd_context
  *	 0x10		esr_context
+ *	 0x10		far_context
  *	0x8a0		sve_context (vl <= 64) (optional)
  *	 0x20		extra_context (optional)
  *	 0x10		terminator (null _aarch64_ctx)
  *
- *	0x510		(reserved for future allocation)
+ *	0x500		(reserved for future allocation)
  *
  * New records that can exceed this space need to be opt-in for userspace, so
  * that an expanded signal frame is not generated unexpectedly.  The mechanism
@@ -94,17 +95,25 @@ struct esr_context {
 	__u64 esr;
 };
 
+/* FAR_EL1 context */
+#define FAR_MAGIC	0x46415201
+
+struct far_context {
+	struct _aarch64_ctx head;
+	__u64 far;
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].
  *
  * Note:
  *
- * 1) fpsimd_context, esr_context and extra_context must be placed in
- * sigcontext.__reserved[] if present.  They cannot be placed in the
- * extra space.  Any other record can be placed either in the extra
- * space or in sigcontext.__reserved[], unless otherwise specified in
- * this file.
+ * 1) fpsimd_context, esr_context, far_context and extra_context must be
+ * placed in sigcontext.__reserved[] if present.  They cannot be placed
+ * in the extra space.  Any other record can be placed either in the
+ * extra space or in sigcontext.__reserved[], unless otherwise specified
+ * in this file.
  *
  * 2) There must not be more than one extra_context.
  *
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index fde59981445c..290ea59c68b8 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 
 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
 
 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 339882db5a91..48e8b6c7b536 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
 
 	unsigned long fpsimd_offset;
 	unsigned long esr_offset;
+	unsigned long far_offset;
 	unsigned long sve_offset;
 	unsigned long extra_offset;
 	unsigned long end_offset;
@@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
 			break;
 
 		case ESR_MAGIC:
+		case FAR_MAGIC:
 			/* ignore */
 			break;
 
@@ -581,6 +583,11 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 				     sizeof(struct esr_context));
 		if (err)
 			return err;
+
+		err = sigframe_alloc(user, &user->far_offset,
+				     sizeof(struct far_context));
+		if (err)
+			return err;
 	}
 
 	if (system_supports_sve()) {
@@ -621,7 +628,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
 	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
 
-	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
+	__put_user_error(untagged_addr(current->thread.fault_address),
+			 &sf->uc.uc_mcontext.fault_address, err);
 
 	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
@@ -641,6 +649,16 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
 	}
 
+	if (err == 0 && user->far_offset) {
+		struct far_context __user *far_ctx =
+			apply_user_offset(user, user->far_offset);
+
+		__put_user_error(FAR_MAGIC, &far_ctx->head.magic, err);
+		__put_user_error(sizeof(*far_ctx), &far_ctx->head.size, err);
+		__put_user_error(current->thread.fault_address, &far_ctx->far,
+				 err);
+	}
+
 	/* Scalable Vector Extension state, if present */
 	if (system_supports_sve() && err == 0 && user->sve_offset) {
 		struct sve_context __user *sve_ctx =
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 85566d32958f..738adc950012 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -41,7 +41,7 @@
 #include <asm/traps.h>
 
 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void set_thread_esr(unsigned long address, unsigned int esr)
+static void set_thread_far_esr(unsigned long far, unsigned int esr)
 {
-	current->thread.fault_address = address;
+	unsigned long addr = untagged_addr(far);
+
+	current->thread.fault_address = far;
 
 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -336,7 +338,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	 * type", so we ignore this wrinkle and just return the translation
 	 * fault.)
 	 */
-	if (!is_ttbr0_addr(current->thread.fault_address)) {
+	if (!is_ttbr0_addr(addr)) {
 		switch (ESR_ELx_EC(esr)) {
 		case ESR_ELx_EC_DABT_LOW:
 			/*
@@ -377,8 +379,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }
 
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
@@ -386,7 +391,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
 
-		set_thread_esr(addr, esr);
+		set_thread_far_esr(far, esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
 				      inf->name);
 	} else {
@@ -439,7 +444,7 @@ static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -447,6 +452,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	unsigned long addr = untagged_addr(far);
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -580,7 +586,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	}
 
 	inf = esr_to_fault_info(esr);
-	set_thread_esr(addr, esr);
+	set_thread_far_esr(far, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
@@ -615,30 +621,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);
 
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	void __user *siaddr;
@@ -654,7 +662,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	if (esr & ESR_ELx_FnV)
 		siaddr = NULL;
 	else
-		siaddr  = (void __user *)addr;
+		siaddr  = (void __user *)untagged_addr(far);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -727,11 +735,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {
-- 
2.26.0.rc2.310.g2932bb562d-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH v2] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-27 11:39       ` Catalin Marinas
@ 2020-03-27 19:26         ` Peter Collingbourne
  0 siblings, 0 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-03-27 19:26 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Will Deacon, Linux ARM,
	Richard Henderson

On Fri, Mar 27, 2020 at 4:39 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Fri, Mar 27, 2020 at 07:56:56AM +0000, Will Deacon wrote:
> > On Thu, Mar 26, 2020 at 04:45:39PM +0000, Catalin Marinas wrote:
> > > On Wed, Mar 25, 2020 at 10:40:01AM -0700, Peter Collingbourne wrote:
> > > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > > the tag bits may be needed by tools in order to accurately diagnose
> > > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > > Tagging Extension (MTE).
> > > >
> > > > We should not stop clearing these bits in the existing fault address
> > > > fields, because there may be existing userspace applications that are
> > > > expecting the tag bits to be cleared. Instead, create a far_context in
> > > > sigcontext (similar to the existing esr_context), and store the original
> > > > value of FAR_EL1 (including the tag bits) there.
> > > >
> > > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > > >
> > > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > >
> > > The patch looks fine. However, I wouldn't queue it for 5.7, it's too
> > > close to the merging window and I'd like it to sit in linux-next for a
> > > bit. Unless there are other comments, it looks fine to me for -rc8.
> >
> > You mean 5.8?
>
> Yes.
>
> > I'm also a bit surprised not to see a docs update, given that
> > we talk about the general lack of tags in siginfo_t towards the end of
> > Documentation/arm64/tagged-pointers.rst
>
> Good point. It's worth adding this to the tagged-pointers.rst document
> since the only use of the raw FAR_EL1 is for tagged pointers.

In v3 I've added a paragraph about far_context after the paragraph
that talks about siginfo.

Unless I'm mistaken, that paragraph is only really talking about the
fault address (as far as I can tell, there aren't any other user address
fields in siginfo on arm64), so I reworded it so that my new paragraph
follows on from it (and clarified that sigcontext.fault_address doesn't
have the tag either).

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-27 19:19   ` [PATCH v3] " Peter Collingbourne
@ 2020-04-22 14:25     ` Catalin Marinas
  2020-04-29 21:08     ` Will Deacon
  2020-05-04 10:19     ` Dave Martin
  2 siblings, 0 replies; 64+ messages in thread
From: Catalin Marinas @ 2020-04-22 14:25 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Will Deacon, Linux ARM,
	Richard Henderson

On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> the tag bits may be needed by tools in order to accurately diagnose
> memory errors, such as HWASan [1] or future tools based on the Memory
> Tagging Extension (MTE).
> 
> We should not stop clearing these bits in the existing fault address
> fields, because there may be existing userspace applications that are
> expecting the tag bits to be cleared. Instead, create a far_context in
> sigcontext (similar to the existing esr_context), and store the original
> value of FAR_EL1 (including the tag bits) there.
> 
> [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> 
> Signed-off-by: Peter Collingbourne <pcc@google.com>

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-27 19:19   ` [PATCH v3] " Peter Collingbourne
  2020-04-22 14:25     ` Catalin Marinas
@ 2020-04-29 21:08     ` Will Deacon
  2020-04-29 21:42       ` Peter Collingbourne
  2020-04-30  9:50       ` [PATCH v3] arm64: Expose original FAR_EL1 value " Catalin Marinas
  2020-05-04 10:19     ` Dave Martin
  2 siblings, 2 replies; 64+ messages in thread
From: Will Deacon @ 2020-04-29 21:08 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Catalin Marinas, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Andrey Konovalov, Vincenzo Frascino, Linux ARM,
	Richard Henderson

On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> the tag bits may be needed by tools in order to accurately diagnose
> memory errors, such as HWASan [1] or future tools based on the Memory
> Tagging Extension (MTE).
> 
> We should not stop clearing these bits in the existing fault address
> fields, because there may be existing userspace applications that are
> expecting the tag bits to be cleared. Instead, create a far_context in
> sigcontext (similar to the existing esr_context), and store the original
> value of FAR_EL1 (including the tag bits) there.
> 
> [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> 
> Signed-off-by: Peter Collingbourne <pcc@google.com>
> ---
> v3:
> - add documentation to tagged-pointers.rst
> - update comments in sigcontext.h

Hmm, although the code looks fine, why don't we just expose the tag in the
new field, rather than duplicate the address information? I'm nervous about
exposing privileged registers directly to userspace.

Also, Catalin, could you elaborate on the MTE use-case please? The
architecture says that FAR_EL1[63:60] are UNKNOWN on a synchronous tag
check fault, so we'd have to *avoid* exposing them in that case!

Will

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-04-29 21:08     ` Will Deacon
@ 2020-04-29 21:42       ` Peter Collingbourne
  2020-05-04 17:03         ` Will Deacon
  2020-04-30  9:50       ` [PATCH v3] arm64: Expose original FAR_EL1 value " Catalin Marinas
  1 sibling, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-04-29 21:42 UTC (permalink / raw)
  To: Will Deacon
  Cc: Catalin Marinas, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Andrey Konovalov, Vincenzo Frascino, Linux ARM,
	Richard Henderson

On Wed, Apr 29, 2020 at 2:08 PM Will Deacon <will@kernel.org> wrote:
>
> On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > the tag bits may be needed by tools in order to accurately diagnose
> > memory errors, such as HWASan [1] or future tools based on the Memory
> > Tagging Extension (MTE).
> >
> > We should not stop clearing these bits in the existing fault address
> > fields, because there may be existing userspace applications that are
> > expecting the tag bits to be cleared. Instead, create a far_context in
> > sigcontext (similar to the existing esr_context), and store the original
> > value of FAR_EL1 (including the tag bits) there.
> >
> > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> >
> > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > ---
> > v3:
> > - add documentation to tagged-pointers.rst
> > - update comments in sigcontext.h
>
> Hmm, although the code looks fine, why don't we just expose the tag in the
> new field, rather than duplicate the address information? I'm nervous about
> exposing privileged registers directly to userspace.

I have no strong opinion on whether this should just contain the tag or not.

> Also, Catalin, could you elaborate on the MTE use-case please? The
> architecture says that FAR_EL1[63:60] are UNKNOWN on a synchronous tag
> check fault, so we'd have to *avoid* exposing them in that case!

The basic use case is to allow a signal handler to identify which
allocation was accessed improperly in order to provide better
diagnostics. For example, if you have granules tagged 1,2,3
consecutively and see an access with pointer tag 1 on the granule
tagged 2, you can tell that it was probably a buffer overflow from the
1 granule and you can report that to the user.

It seems unfortunate that bits 63:60 are now UNKNOWN on synchronous
tag check faults. It seems to be a recent change to the specification.
I can think of a number of use cases for bits 63:60 after a
synchronous tag check fault -- for example, an allocator could store
an additional set of random bits there, and later use them to
determine with more accuracy which allocation was used after free or
out of bounds. That being said, we aren't planning to do this in the
initial version of the MTE-aware scudo allocator.

If there is no chance of changing the architecture at this point, I
will send a v4 with the bits masked out when handling a tag check
fault.

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-04-29 21:08     ` Will Deacon
  2020-04-29 21:42       ` Peter Collingbourne
@ 2020-04-30  9:50       ` Catalin Marinas
  2020-04-30  9:59         ` Will Deacon
  1 sibling, 1 reply; 64+ messages in thread
From: Catalin Marinas @ 2020-04-30  9:50 UTC (permalink / raw)
  To: Will Deacon
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Peter Collingbourne,
	Linux ARM, Richard Henderson

On Wed, Apr 29, 2020 at 10:08:26PM +0100, Will Deacon wrote:
> On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > the tag bits may be needed by tools in order to accurately diagnose
> > memory errors, such as HWASan [1] or future tools based on the Memory
> > Tagging Extension (MTE).
> > 
> > We should not stop clearing these bits in the existing fault address
> > fields, because there may be existing userspace applications that are
> > expecting the tag bits to be cleared. Instead, create a far_context in
> > sigcontext (similar to the existing esr_context), and store the original
> > value of FAR_EL1 (including the tag bits) there.
> > 
> > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > 
> > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > ---
> > v3:
> > - add documentation to tagged-pointers.rst
> > - update comments in sigcontext.h
> 
> Hmm, although the code looks fine, why don't we just expose the tag in the
> new field, rather than duplicate the address information? I'm nervous about
> exposing privileged registers directly to userspace.

That's for consistency with ESR_EL1 which we expose in a similar way,
though with bits of it not relevant to user masked out. For FAR_EL1, all
the bits are relevant, even if some of them are duplicated in the
si_addr field.

> Also, Catalin, could you elaborate on the MTE use-case please? The
> architecture says that FAR_EL1[63:60] are UNKNOWN on a synchronous tag
> check fault, so we'd have to *avoid* exposing them in that case!

With MTE, FAR_EL1[63:60] will be cleared on sync tag check faults (not
currently done as I don't have this patch in my MTE series).

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-04-30  9:50       ` [PATCH v3] arm64: Expose original FAR_EL1 value " Catalin Marinas
@ 2020-04-30  9:59         ` Will Deacon
  2020-04-30 13:34           ` Catalin Marinas
  0 siblings, 1 reply; 64+ messages in thread
From: Will Deacon @ 2020-04-30  9:59 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Peter Collingbourne,
	Linux ARM, Richard Henderson

On Thu, Apr 30, 2020 at 10:50:01AM +0100, Catalin Marinas wrote:
> On Wed, Apr 29, 2020 at 10:08:26PM +0100, Will Deacon wrote:
> > On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > the tag bits may be needed by tools in order to accurately diagnose
> > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > Tagging Extension (MTE).
> > > 
> > > We should not stop clearing these bits in the existing fault address
> > > fields, because there may be existing userspace applications that are
> > > expecting the tag bits to be cleared. Instead, create a far_context in
> > > sigcontext (similar to the existing esr_context), and store the original
> > > value of FAR_EL1 (including the tag bits) there.
> > > 
> > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > > 
> > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > > ---
> > > v3:
> > > - add documentation to tagged-pointers.rst
> > > - update comments in sigcontext.h
> > 
> > Hmm, although the code looks fine, why don't we just expose the tag in the
> > new field, rather than duplicate the address information? I'm nervous about
> > exposing privileged registers directly to userspace.
> 
> That's for consistency with ESR_EL1 which we expose in a similar way,
> though with bits of it not relevant to user masked out. For FAR_EL1, all
> the bits are relevant, even if some of them are duplicated in the
> si_addr field.

It may be consistent, but I would argue that exposing ESR_EL1 was a mistake,
as illustrated by cc19846079a7 ("arm64: fault: Don't leak data in ESR
context for user fault on kernel VA"). We have to live with that, but we
should try to do better for new fields in the sigcontext.

> > Also, Catalin, could you elaborate on the MTE use-case please? The
> > architecture says that FAR_EL1[63:60] are UNKNOWN on a synchronous tag
> > check fault, so we'd have to *avoid* exposing them in that case!
> 
> With MTE, FAR_EL1[63:60] will be cleared on sync tag check faults (not
> currently done as I don't have this patch in my MTE series).

Ok, but in [1] you said "I'm fine with this change for consistency and
may help with the fault information printed by the kernel with khwasan
or (later) MTE."

But I don't think consistency is necessarily a good thing here and I don't
see how it helps with MTE if we zap the bits to 0! We'd be better off not
exposing the information at all in this situation.

Will

[1] https://lore.kernel.org/r/20200325131023.GN3901@mbp

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-04-30  9:59         ` Will Deacon
@ 2020-04-30 13:34           ` Catalin Marinas
  0 siblings, 0 replies; 64+ messages in thread
From: Catalin Marinas @ 2020-04-30 13:34 UTC (permalink / raw)
  To: Will Deacon
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Peter Collingbourne,
	Linux ARM, Richard Henderson

On Thu, Apr 30, 2020 at 10:59:19AM +0100, Will Deacon wrote:
> On Thu, Apr 30, 2020 at 10:50:01AM +0100, Catalin Marinas wrote:
> > On Wed, Apr 29, 2020 at 10:08:26PM +0100, Will Deacon wrote:
> > > On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> > > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > > the tag bits may be needed by tools in order to accurately diagnose
> > > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > > Tagging Extension (MTE).
> > > > 
> > > > We should not stop clearing these bits in the existing fault address
> > > > fields, because there may be existing userspace applications that are
> > > > expecting the tag bits to be cleared. Instead, create a far_context in
> > > > sigcontext (similar to the existing esr_context), and store the original
> > > > value of FAR_EL1 (including the tag bits) there.
> > > > 
> > > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > > > 
> > > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > > > ---
> > > > v3:
> > > > - add documentation to tagged-pointers.rst
> > > > - update comments in sigcontext.h
> > > 
> > > Hmm, although the code looks fine, why don't we just expose the tag in the
> > > new field, rather than duplicate the address information? I'm nervous about
> > > exposing privileged registers directly to userspace.
> > 
> > That's for consistency with ESR_EL1 which we expose in a similar way,
> > though with bits of it not relevant to user masked out. For FAR_EL1, all
> > the bits are relevant, even if some of them are duplicated in the
> > si_addr field.
> 
> It may be consistent, but I would argue that exposing ESR_EL1 was a mistake,
> as illustrated by cc19846079a7 ("arm64: fault: Don't leak data in ESR
> context for user fault on kernel VA"). We have to live with that, but we
> should try to do better for new fields in the sigcontext.

The alternative would be to only expose the tag of the faulting address
if you are worried of leaking some kernel address from FAR_EL1 (and I
agree, there is a risk).

> > > Also, Catalin, could you elaborate on the MTE use-case please? The
> > > architecture says that FAR_EL1[63:60] are UNKNOWN on a synchronous tag
> > > check fault, so we'd have to *avoid* exposing them in that case!
> > 
> > With MTE, FAR_EL1[63:60] will be cleared on sync tag check faults (not
> > currently done as I don't have this patch in my MTE series).
> 
> Ok, but in [1] you said "I'm fine with this change for consistency and
> may help with the fault information printed by the kernel with khwasan
> or (later) MTE."
> 
> But I don't think consistency is necessarily a good thing here and I don't
> see how it helps with MTE if we zap the bits to 0! We'd be better off not
> exposing the information at all in this situation.

The plan for MTE is to only zap bits 63:60 which are unknown on tag
check fault. The actual MTE tag in 59:56 would be preserved. There is an
inconsistency with the TBI feature but that only happens for tag check
faults which is a new thing. The behaviour on any other fault will be
consistent with the non-MTE case (page faults etc.).

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-03-27 19:19   ` [PATCH v3] " Peter Collingbourne
  2020-04-22 14:25     ` Catalin Marinas
  2020-04-29 21:08     ` Will Deacon
@ 2020-05-04 10:19     ` Dave Martin
  2020-05-07 17:55       ` Peter Collingbourne
  2 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-05-04 10:19 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany, Linux ARM,
	Catalin Marinas, Vincenzo Frascino, Will Deacon,
	Evgenii Stepanov, Richard Henderson

On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> the tag bits may be needed by tools in order to accurately diagnose
> memory errors, such as HWASan [1] or future tools based on the Memory
> Tagging Extension (MTE).
> 
> We should not stop clearing these bits in the existing fault address
> fields, because there may be existing userspace applications that are
> expecting the tag bits to be cleared. Instead, create a far_context in
> sigcontext (similar to the existing esr_context), and store the original
> value of FAR_EL1 (including the tag bits) there.
> 
> [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> 
> Signed-off-by: Peter Collingbourne <pcc@google.com>
> ---
> v3:
> - add documentation to tagged-pointers.rst
> - update comments in sigcontext.h
> 
> v2:
> - revert changes to hw_breakpoint.c
> - rename set_thread_esr to set_thread_far_esr
> 
>  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
>  arch/arm64/include/asm/exception.h       |  2 +-
>  arch/arm64/include/asm/processor.h       |  2 +-
>  arch/arm64/include/uapi/asm/sigcontext.h | 21 +++++++----
>  arch/arm64/kernel/entry-common.c         |  2 --
>  arch/arm64/kernel/signal.c               | 20 ++++++++++-
>  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
>  7 files changed, 74 insertions(+), 35 deletions(-)

[...]

> diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> index 8b0ebce92427..6782394633cb 100644
> --- a/arch/arm64/include/uapi/asm/sigcontext.h
> +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> @@ -44,11 +44,12 @@ struct sigcontext {
>   *
>   *	0x210		fpsimd_context
>   *	 0x10		esr_context
> + *	 0x10		far_context
>   *	0x8a0		sve_context (vl <= 64) (optional)
>   *	 0x20		extra_context (optional)
>   *	 0x10		terminator (null _aarch64_ctx)
>   *
> - *	0x510		(reserved for future allocation)
> + *	0x500		(reserved for future allocation)
>   *
>   * New records that can exceed this space need to be opt-in for userspace, so
>   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> @@ -94,17 +95,25 @@ struct esr_context {
>  	__u64 esr;
>  };
>  
> +/* FAR_EL1 context */
> +#define FAR_MAGIC	0x46415201
> +
> +struct far_context {
> +	struct _aarch64_ctx head;
> +	__u64 far;
> +};
> +
>  /*
>   * extra_context: describes extra space in the signal frame for
>   * additional structures that don't fit in sigcontext.__reserved[].
>   *
>   * Note:
>   *
> - * 1) fpsimd_context, esr_context and extra_context must be placed in
> - * sigcontext.__reserved[] if present.  They cannot be placed in the
> - * extra space.  Any other record can be placed either in the extra
> - * space or in sigcontext.__reserved[], unless otherwise specified in
> - * this file.
> + * 1) fpsimd_context, esr_context, far_context and extra_context must be
> + * placed in sigcontext.__reserved[] if present.  They cannot be placed
> + * in the extra space.  Any other record can be placed either in the
> + * extra space or in sigcontext.__reserved[], unless otherwise specified
> + * in this file.

This is for backwards compatibility only.  We don't need this constraint
for any new field, so you can probably leave the paragraph as-is.

Removing this constraint would mean that userspace must be
prepared to traverse extra_context when looking for far_context.  But
really we want modern userspace to do this anyway, since it reduces
backwards compatibility worries when adding more new records in the
future.


The nasty loop in parse_user_sigframe() allows some flexibility
regarding the order of records, but prior to this patch there is no
record that can actually be moved, due to other backwards
compatibility constraints -- so the flexibility isn't used today.  I'd
like to avoid reorderability creeping in, so that we can get rid of the
loop.

So, mandating that records must appear in an order consistent with
sigcontext.h could be helpful.  Inserting new records in the middle
should be fine, so long as there is no shuffling.

I'm not sure this patch needs to do anything extra for that: perhaps we
can leave this no-shuffling rule implicit for now (?)

People already get shouted at for needlessly noisy diffs, so there is a
strong disincentive to shuffle existing headers in any case...

[...]

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-04-29 21:42       ` Peter Collingbourne
@ 2020-05-04 17:03         ` Will Deacon
  2020-05-07 17:57           ` [PATCH v4] arm64: Expose FAR_EL1 tag bits " Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Will Deacon @ 2020-05-04 17:03 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Catalin Marinas, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Andrey Konovalov, Vincenzo Frascino, Linux ARM,
	Richard Henderson

Hi Peter,

On Wed, Apr 29, 2020 at 02:42:01PM -0700, Peter Collingbourne wrote:
> On Wed, Apr 29, 2020 at 2:08 PM Will Deacon <will@kernel.org> wrote:
> > On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > the tag bits may be needed by tools in order to accurately diagnose
> > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > Tagging Extension (MTE).
> > >
> > > We should not stop clearing these bits in the existing fault address
> > > fields, because there may be existing userspace applications that are
> > > expecting the tag bits to be cleared. Instead, create a far_context in
> > > sigcontext (similar to the existing esr_context), and store the original
> > > value of FAR_EL1 (including the tag bits) there.
> > >
> > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > >
> > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > > ---
> > > v3:
> > > - add documentation to tagged-pointers.rst
> > > - update comments in sigcontext.h
> >
> > Hmm, although the code looks fine, why don't we just expose the tag in the
> > new field, rather than duplicate the address information? I'm nervous about
> > exposing privileged registers directly to userspace.
> 
> I have no strong opinion on whether this should just contain the tag or not.

A few of us chatted about this today. Please could you spin a v4 where only
the top byte is exposed in the new sigcontext record as a __u8? You'll need
to think of a better name than "FAR"; perhaps something like 'si_addr_top_byte',
'si_addr_63_56' or whatever you fancy. Naming is hard.

For MTE we can add a separate record later on, so as not to overload this
(e.g. si_addr_mte_tag).

Ta,

Will

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-05-04 10:19     ` Dave Martin
@ 2020-05-07 17:55       ` Peter Collingbourne
  2020-05-13 17:27         ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-07 17:55 UTC (permalink / raw)
  To: Dave Martin
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany, Linux ARM,
	Catalin Marinas, Vincenzo Frascino, Will Deacon,
	Evgenii Stepanov, Richard Henderson

On Mon, May 4, 2020 at 3:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > the tag bits may be needed by tools in order to accurately diagnose
> > memory errors, such as HWASan [1] or future tools based on the Memory
> > Tagging Extension (MTE).
> >
> > We should not stop clearing these bits in the existing fault address
> > fields, because there may be existing userspace applications that are
> > expecting the tag bits to be cleared. Instead, create a far_context in
> > sigcontext (similar to the existing esr_context), and store the original
> > value of FAR_EL1 (including the tag bits) there.
> >
> > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> >
> > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > ---
> > v3:
> > - add documentation to tagged-pointers.rst
> > - update comments in sigcontext.h
> >
> > v2:
> > - revert changes to hw_breakpoint.c
> > - rename set_thread_esr to set_thread_far_esr
> >
> >  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
> >  arch/arm64/include/asm/exception.h       |  2 +-
> >  arch/arm64/include/asm/processor.h       |  2 +-
> >  arch/arm64/include/uapi/asm/sigcontext.h | 21 +++++++----
> >  arch/arm64/kernel/entry-common.c         |  2 --
> >  arch/arm64/kernel/signal.c               | 20 ++++++++++-
> >  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
> >  7 files changed, 74 insertions(+), 35 deletions(-)
>
> [...]
>
> > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > index 8b0ebce92427..6782394633cb 100644
> > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > @@ -44,11 +44,12 @@ struct sigcontext {
> >   *
> >   *   0x210           fpsimd_context
> >   *    0x10           esr_context
> > + *    0x10           far_context
> >   *   0x8a0           sve_context (vl <= 64) (optional)
> >   *    0x20           extra_context (optional)
> >   *    0x10           terminator (null _aarch64_ctx)
> >   *
> > - *   0x510           (reserved for future allocation)
> > + *   0x500           (reserved for future allocation)
> >   *
> >   * New records that can exceed this space need to be opt-in for userspace, so
> >   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> > @@ -94,17 +95,25 @@ struct esr_context {
> >       __u64 esr;
> >  };
> >
> > +/* FAR_EL1 context */
> > +#define FAR_MAGIC    0x46415201
> > +
> > +struct far_context {
> > +     struct _aarch64_ctx head;
> > +     __u64 far;
> > +};
> > +
> >  /*
> >   * extra_context: describes extra space in the signal frame for
> >   * additional structures that don't fit in sigcontext.__reserved[].
> >   *
> >   * Note:
> >   *
> > - * 1) fpsimd_context, esr_context and extra_context must be placed in
> > - * sigcontext.__reserved[] if present.  They cannot be placed in the
> > - * extra space.  Any other record can be placed either in the extra
> > - * space or in sigcontext.__reserved[], unless otherwise specified in
> > - * this file.
> > + * 1) fpsimd_context, esr_context, far_context and extra_context must be
> > + * placed in sigcontext.__reserved[] if present.  They cannot be placed
> > + * in the extra space.  Any other record can be placed either in the
> > + * extra space or in sigcontext.__reserved[], unless otherwise specified
> > + * in this file.
>
> This is for backwards compatibility only.  We don't need this constraint
> for any new field, so you can probably leave the paragraph as-is.
>
> Removing this constraint would mean that userspace must be
> prepared to traverse extra_context when looking for far_context.  But
> really we want modern userspace to do this anyway, since it reduces
> backwards compatibility worries when adding more new records in the
> future.

My original reason for updating this comment was that I figured that
this record was small enough that we could just always include it in
__reserved.

But thinking about this a bit more, it doesn't seem that just wanting
userspace to read extra_context will guarantee that it will do so. In
practice, it would be easy to write userspace code that works right
now but doesn't read extra_context correctly (either because
extra_context wasn't considered at all, or because the code purporting
to read the record from extra_context contains a latent bug because it
wasn't exercised). Since we may be practically constrained from moving
the record anyway, we might as well document it and allow the
userspace code to be a little simpler.

I guess one alternative is that we always place this record in
extra_context, which would force userspace to read it correctly. That
has something of the opposite problem (userspace code could be written
to only expect the record in extra_context), but at least we're less
constrained there, and it's more likely that the code would be parsing
__reserved correctly since it would need to do so in order to find
extra_context.

Anyway, I've reverted the comment change for now in v4, but let me
know what you think.


Peter

>
>
> The nasty loop in parse_user_sigframe() allows some flexibility
> regarding the order of records, but prior to this patch there is no
> record that can actually be moved, due to other backwards
> compatibility constraints -- so the flexibility isn't used today.  I'd
> like to avoid reorderability creeping in, so that we can get rid of the
> loop.
>
> So, mandating that records must appear in an order consistent with
> sigcontext.h could be helpful.  Inserting new records in the middle
> should be fine, so long as there is no shuffling.
>
> I'm not sure this patch needs to do anything extra for that: perhaps we
> can leave this no-shuffling rule implicit for now (?)
>
> People already get shouted at for needlessly noisy diffs, so there is a
> strong disincentive to shuffle existing headers in any case...
>
> [...]
>
> Cheers
> ---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* [PATCH v4] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-04 17:03         ` Will Deacon
@ 2020-05-07 17:57           ` Peter Collingbourne
  2020-05-08  2:01             ` [PATCH v5] " Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-07 17:57 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany,
	Vincenzo Frascino, Dave Martin, Will Deacon
  Cc: Andrey Konovalov, Kevin Brodsky, Peter Collingbourne, Linux ARM,
	Richard Henderson

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
the tag bits may be needed by tools in order to accurately diagnose
memory errors, such as HWASan [1] or future tools based on the Memory
Tagging Extension (MTE).

We should not stop clearing these bits in the existing fault address fields,
because there may be existing userspace applications that are expecting the tag
bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
(similar to the existing esr_context), and store the tag bits of FAR_EL1 there.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
v4:
- expose only the tag bits in the context instead of the entire FAR_EL1
- remove mention of the new context from the sigcontext.__reserved[] note

v3:
- add documentation to tagged-pointers.rst
- update comments in sigcontext.h

v2:
- revert changes to hw_breakpoint.c
- rename set_thread_esr to set_thread_far_esr

 Documentation/arm64/tagged-pointers.rst  | 17 +++++----
 arch/arm64/include/asm/exception.h       |  2 +-
 arch/arm64/include/asm/processor.h       |  2 +-
 arch/arm64/include/uapi/asm/sigcontext.h | 11 +++++-
 arch/arm64/kernel/entry-common.c         |  2 --
 arch/arm64/kernel/signal.c               | 22 +++++++++++-
 arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
 7 files changed, 71 insertions(+), 30 deletions(-)

diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
index eab4323609b9..c6e9592a9dea 100644
--- a/Documentation/arm64/tagged-pointers.rst
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -53,12 +53,17 @@ visibility.
 Preserving tags
 ---------------
 
-Non-zero tags are not preserved when delivering signals. This means that
-signal handlers in applications making use of tags cannot rely on the
-tag information for user virtual addresses being maintained for fields
-inside siginfo_t. One exception to this rule is for signals raised in
-response to watchpoint debug exceptions, where the tag information will
-be preserved.
+Non-zero tags are not preserved in the fault address fields
+siginfo.si_addr or sigcontext.fault_address when delivering
+signals. This means that signal handlers in applications making use
+of tags cannot rely on the tag information for user virtual addresses
+being maintained in these fields. One exception to this rule is for
+signals raised in response to watchpoint debug exceptions, where the
+tag information will be preserved.
+
+The fault address tag is preserved in the fault_addr_top_byte field of
+the signal frame record fault_addr_top_byte_context, which is present
+for signals raised in response to data aborts and instruction aborts.
 
 The architecture prevents the use of a tagged PC, so the upper byte will
 be set to a sign-extension of bit 55 on exception return.
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..90e772d9b2cd 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
 }
 
 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 240fe5e5b720..63185be29ff9 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -144,7 +144,7 @@ struct thread_struct {
 	void			*sve_state;	/* SVE registers, if any */
 	unsigned int		sve_vl;		/* SVE vector length */
 	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
-	unsigned long		fault_address;	/* fault info */
+	unsigned long		fault_address;	/* FAR_EL1 value */
 	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
 #ifdef CONFIG_ARM64_PTR_AUTH
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8b0ebce92427..736d6e845b66 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -44,11 +44,12 @@ struct sigcontext {
  *
  *	0x210		fpsimd_context
  *	 0x10		esr_context
+ *	 0x10		fault_addr_top_byte_context
  *	0x8a0		sve_context (vl <= 64) (optional)
  *	 0x20		extra_context (optional)
  *	 0x10		terminator (null _aarch64_ctx)
  *
- *	0x510		(reserved for future allocation)
+ *	0x500		(reserved for future allocation)
  *
  * New records that can exceed this space need to be opt-in for userspace, so
  * that an expanded signal frame is not generated unexpectedly.  The mechanism
@@ -94,6 +95,14 @@ struct esr_context {
 	__u64 esr;
 };
 
+/* Top byte of fault address (normally not exposed via si_addr) */
+#define FAULT_ADDR_TOP_BYTE_MAGIC	0x46544201
+
+struct fault_addr_top_byte_context {
+	struct _aarch64_ctx head;
+	__u8 fault_addr_top_byte;
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index c839b5bf1904..045b4f518836 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 
 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
 
 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 339882db5a91..baa88dc02e5c 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
 
 	unsigned long fpsimd_offset;
 	unsigned long esr_offset;
+	unsigned long ftb_offset;
 	unsigned long sve_offset;
 	unsigned long extra_offset;
 	unsigned long end_offset;
@@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
 			break;
 
 		case ESR_MAGIC:
+		case FAULT_ADDR_TOP_BYTE_MAGIC:
 			/* ignore */
 			break;
 
@@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 				     sizeof(struct esr_context));
 		if (err)
 			return err;
+
+		err = sigframe_alloc(
+			user, &user->ftb_offset,
+			sizeof(struct fault_addr_top_byte_context));
+		if (err)
+			return err;
 	}
 
 	if (system_supports_sve()) {
@@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
 	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
 
-	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
+	__put_user_error(untagged_addr(current->thread.fault_address),
+			 &sf->uc.uc_mcontext.fault_address, err);
 
 	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
@@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
 	}
 
+	if (err == 0 && user->ftb_offset) {
+		struct fault_addr_top_byte_context __user *ftb_ctx =
+			apply_user_offset(user, user->ftb_offset);
+
+		__put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
+				 &ftb_ctx->head.magic, err);
+		__put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
+		__put_user_error(current->thread.fault_address >> 56,
+				 &ftb_ctx->fault_addr_top_byte, err);
+	}
+
 	/* Scalable Vector Extension state, if present */
 	if (system_supports_sve() && err == 0 && user->sve_offset) {
 		struct sve_context __user *sve_ctx =
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c9cedc0432d2..39bbaa05f162 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -41,7 +41,7 @@
 #include <asm/traps.h>
 
 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void set_thread_esr(unsigned long address, unsigned int esr)
+static void set_thread_far_esr(unsigned long far, unsigned int esr)
 {
-	current->thread.fault_address = address;
+	unsigned long addr = untagged_addr(far);
+
+	current->thread.fault_address = far;
 
 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -336,7 +338,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	 * type", so we ignore this wrinkle and just return the translation
 	 * fault.)
 	 */
-	if (!is_ttbr0_addr(current->thread.fault_address)) {
+	if (!is_ttbr0_addr(addr)) {
 		switch (ESR_ELx_EC(esr)) {
 		case ESR_ELx_EC_DABT_LOW:
 			/*
@@ -377,8 +379,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }
 
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
@@ -386,7 +391,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
 
-		set_thread_esr(addr, esr);
+		set_thread_far_esr(far, esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
 				      inf->name);
 	} else {
@@ -439,7 +444,7 @@ static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -447,6 +452,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_ACCESS_FLAGS;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+	unsigned long addr = untagged_addr(far);
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -570,7 +576,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	}
 
 	inf = esr_to_fault_info(esr);
-	set_thread_esr(addr, esr);
+	set_thread_far_esr(far, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
@@ -605,30 +611,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);
 
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	void __user *siaddr;
@@ -644,7 +652,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	if (esr & ESR_ELx_FnV)
 		siaddr = NULL;
 	else
-		siaddr  = (void __user *)addr;
+		siaddr  = (void __user *)untagged_addr(far);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -717,11 +725,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {
-- 
2.26.2.526.g744177e7f7-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* [PATCH v5] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-07 17:57           ` [PATCH v4] arm64: Expose FAR_EL1 tag bits " Peter Collingbourne
@ 2020-05-08  2:01             ` Peter Collingbourne
  2020-05-12 16:25               ` Catalin Marinas
  2020-05-13 18:09               ` [PATCH v6] " Peter Collingbourne
  0 siblings, 2 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-08  2:01 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany,
	Vincenzo Frascino, Dave Martin, Will Deacon
  Cc: Andrey Konovalov, Kevin Brodsky, Peter Collingbourne, Linux ARM,
	Richard Henderson

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
the tag bits may be needed by tools in order to accurately diagnose
memory errors, such as HWASan [1] or future tools based on the Memory
Tagging Extension (MTE).

We should not stop clearing these bits in the existing fault address fields,
because there may be existing userspace applications that are expecting the tag
bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
(similar to the existing esr_context), and store the tag bits of FAR_EL1 there.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
v5:
- add padding to fault_addr_top_byte_context in order to ensure the correct
  size and preserve sp alignment

v4:
- expose only the tag bits in the context instead of the entire FAR_EL1
- remove mention of the new context from the sigcontext.__reserved[] note

v3:
- add documentation to tagged-pointers.rst
- update comments in sigcontext.h

v2:
- revert changes to hw_breakpoint.c
- rename set_thread_esr to set_thread_far_esr

 Documentation/arm64/tagged-pointers.rst  | 17 +++++----
 arch/arm64/include/asm/exception.h       |  2 +-
 arch/arm64/include/asm/processor.h       |  2 +-
 arch/arm64/include/uapi/asm/sigcontext.h | 12 ++++++-
 arch/arm64/kernel/entry-common.c         |  2 --
 arch/arm64/kernel/signal.c               | 22 +++++++++++-
 arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
 7 files changed, 72 insertions(+), 30 deletions(-)

diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
index eab4323609b9..c6e9592a9dea 100644
--- a/Documentation/arm64/tagged-pointers.rst
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -53,12 +53,17 @@ visibility.
 Preserving tags
 ---------------
 
-Non-zero tags are not preserved when delivering signals. This means that
-signal handlers in applications making use of tags cannot rely on the
-tag information for user virtual addresses being maintained for fields
-inside siginfo_t. One exception to this rule is for signals raised in
-response to watchpoint debug exceptions, where the tag information will
-be preserved.
+Non-zero tags are not preserved in the fault address fields
+siginfo.si_addr or sigcontext.fault_address when delivering
+signals. This means that signal handlers in applications making use
+of tags cannot rely on the tag information for user virtual addresses
+being maintained in these fields. One exception to this rule is for
+signals raised in response to watchpoint debug exceptions, where the
+tag information will be preserved.
+
+The fault address tag is preserved in the fault_addr_top_byte field of
+the signal frame record fault_addr_top_byte_context, which is present
+for signals raised in response to data aborts and instruction aborts.
 
 The architecture prevents the use of a tagged PC, so the upper byte will
 be set to a sign-extension of bit 55 on exception return.
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..90e772d9b2cd 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
 }
 
 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 240fe5e5b720..63185be29ff9 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -144,7 +144,7 @@ struct thread_struct {
 	void			*sve_state;	/* SVE registers, if any */
 	unsigned int		sve_vl;		/* SVE vector length */
 	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
-	unsigned long		fault_address;	/* fault info */
+	unsigned long		fault_address;	/* FAR_EL1 value */
 	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
 #ifdef CONFIG_ARM64_PTR_AUTH
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8b0ebce92427..1a2d23092d8f 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -44,11 +44,12 @@ struct sigcontext {
  *
  *	0x210		fpsimd_context
  *	 0x10		esr_context
+ *	 0x10		fault_addr_top_byte_context
  *	0x8a0		sve_context (vl <= 64) (optional)
  *	 0x20		extra_context (optional)
  *	 0x10		terminator (null _aarch64_ctx)
  *
- *	0x510		(reserved for future allocation)
+ *	0x500		(reserved for future allocation)
  *
  * New records that can exceed this space need to be opt-in for userspace, so
  * that an expanded signal frame is not generated unexpectedly.  The mechanism
@@ -94,6 +95,15 @@ struct esr_context {
 	__u64 esr;
 };
 
+/* Top byte of fault address (normally not exposed via si_addr) */
+#define FAULT_ADDR_TOP_BYTE_MAGIC	0x46544201
+
+struct fault_addr_top_byte_context {
+	struct _aarch64_ctx head;
+	__u8 fault_addr_top_byte;
+	__u8 __reserved[7];
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index c839b5bf1904..045b4f518836 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 
 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
 
 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 339882db5a91..baa88dc02e5c 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
 
 	unsigned long fpsimd_offset;
 	unsigned long esr_offset;
+	unsigned long ftb_offset;
 	unsigned long sve_offset;
 	unsigned long extra_offset;
 	unsigned long end_offset;
@@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
 			break;
 
 		case ESR_MAGIC:
+		case FAULT_ADDR_TOP_BYTE_MAGIC:
 			/* ignore */
 			break;
 
@@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 				     sizeof(struct esr_context));
 		if (err)
 			return err;
+
+		err = sigframe_alloc(
+			user, &user->ftb_offset,
+			sizeof(struct fault_addr_top_byte_context));
+		if (err)
+			return err;
 	}
 
 	if (system_supports_sve()) {
@@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
 	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
 
-	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
+	__put_user_error(untagged_addr(current->thread.fault_address),
+			 &sf->uc.uc_mcontext.fault_address, err);
 
 	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
@@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
 	}
 
+	if (err == 0 && user->ftb_offset) {
+		struct fault_addr_top_byte_context __user *ftb_ctx =
+			apply_user_offset(user, user->ftb_offset);
+
+		__put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
+				 &ftb_ctx->head.magic, err);
+		__put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
+		__put_user_error(current->thread.fault_address >> 56,
+				 &ftb_ctx->fault_addr_top_byte, err);
+	}
+
 	/* Scalable Vector Extension state, if present */
 	if (system_supports_sve() && err == 0 && user->sve_offset) {
 		struct sve_context __user *sve_ctx =
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c9cedc0432d2..39bbaa05f162 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -41,7 +41,7 @@
 #include <asm/traps.h>
 
 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void set_thread_esr(unsigned long address, unsigned int esr)
+static void set_thread_far_esr(unsigned long far, unsigned int esr)
 {
-	current->thread.fault_address = address;
+	unsigned long addr = untagged_addr(far);
+
+	current->thread.fault_address = far;
 
 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -336,7 +338,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	 * type", so we ignore this wrinkle and just return the translation
 	 * fault.)
 	 */
-	if (!is_ttbr0_addr(current->thread.fault_address)) {
+	if (!is_ttbr0_addr(addr)) {
 		switch (ESR_ELx_EC(esr)) {
 		case ESR_ELx_EC_DABT_LOW:
 			/*
@@ -377,8 +379,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }
 
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
@@ -386,7 +391,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
 
-		set_thread_esr(addr, esr);
+		set_thread_far_esr(far, esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
 				      inf->name);
 	} else {
@@ -439,7 +444,7 @@ static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -447,6 +452,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_ACCESS_FLAGS;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+	unsigned long addr = untagged_addr(far);
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -570,7 +576,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	}
 
 	inf = esr_to_fault_info(esr);
-	set_thread_esr(addr, esr);
+	set_thread_far_esr(far, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
@@ -605,30 +611,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);
 
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	void __user *siaddr;
@@ -644,7 +652,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	if (esr & ESR_ELx_FnV)
 		siaddr = NULL;
 	else
-		siaddr  = (void __user *)addr;
+		siaddr  = (void __user *)untagged_addr(far);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -717,11 +725,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {
-- 
2.26.2.645.ge9eca65c58-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH v5] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-08  2:01             ` [PATCH v5] " Peter Collingbourne
@ 2020-05-12 16:25               ` Catalin Marinas
  2020-05-13 18:09               ` [PATCH v6] " Peter Collingbourne
  1 sibling, 0 replies; 64+ messages in thread
From: Catalin Marinas @ 2020-05-12 16:25 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Vincenzo Frascino, Will Deacon, Dave Martin,
	Linux ARM, Richard Henderson

On Thu, May 07, 2020 at 07:01:06PM -0700, Peter Collingbourne wrote:
> The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> the tag bits may be needed by tools in order to accurately diagnose
> memory errors, such as HWASan [1] or future tools based on the Memory
> Tagging Extension (MTE).
> 
> We should not stop clearing these bits in the existing fault address fields,
> because there may be existing userspace applications that are expecting the tag
> bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
> (similar to the existing esr_context), and store the tag bits of FAR_EL1 there.
> 
> [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> 
> Signed-off-by: Peter Collingbourne <pcc@google.com>

It looks alright to me.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-05-07 17:55       ` Peter Collingbourne
@ 2020-05-13 17:27         ` Dave Martin
  2020-05-13 18:00           ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-05-13 17:27 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Catalin Marinas, Vincenzo Frascino,
	Will Deacon, Linux ARM, Richard Henderson

On Thu, May 07, 2020 at 10:55:02AM -0700, Peter Collingbourne wrote:
> On Mon, May 4, 2020 at 3:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > the tag bits may be needed by tools in order to accurately diagnose
> > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > Tagging Extension (MTE).
> > >
> > > We should not stop clearing these bits in the existing fault address
> > > fields, because there may be existing userspace applications that are
> > > expecting the tag bits to be cleared. Instead, create a far_context in
> > > sigcontext (similar to the existing esr_context), and store the original
> > > value of FAR_EL1 (including the tag bits) there.
> > >
> > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > >
> > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > > ---
> > > v3:
> > > - add documentation to tagged-pointers.rst
> > > - update comments in sigcontext.h
> > >
> > > v2:
> > > - revert changes to hw_breakpoint.c
> > > - rename set_thread_esr to set_thread_far_esr
> > >
> > >  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
> > >  arch/arm64/include/asm/exception.h       |  2 +-
> > >  arch/arm64/include/asm/processor.h       |  2 +-
> > >  arch/arm64/include/uapi/asm/sigcontext.h | 21 +++++++----
> > >  arch/arm64/kernel/entry-common.c         |  2 --
> > >  arch/arm64/kernel/signal.c               | 20 ++++++++++-
> > >  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
> > >  7 files changed, 74 insertions(+), 35 deletions(-)
> >
> > [...]
> >
> > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > index 8b0ebce92427..6782394633cb 100644
> > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > > @@ -44,11 +44,12 @@ struct sigcontext {
> > >   *
> > >   *   0x210           fpsimd_context
> > >   *    0x10           esr_context
> > > + *    0x10           far_context
> > >   *   0x8a0           sve_context (vl <= 64) (optional)
> > >   *    0x20           extra_context (optional)
> > >   *    0x10           terminator (null _aarch64_ctx)
> > >   *
> > > - *   0x510           (reserved for future allocation)
> > > + *   0x500           (reserved for future allocation)
> > >   *
> > >   * New records that can exceed this space need to be opt-in for userspace, so
> > >   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> > > @@ -94,17 +95,25 @@ struct esr_context {
> > >       __u64 esr;
> > >  };
> > >
> > > +/* FAR_EL1 context */
> > > +#define FAR_MAGIC    0x46415201
> > > +
> > > +struct far_context {
> > > +     struct _aarch64_ctx head;
> > > +     __u64 far;
> > > +};
> > > +
> > >  /*
> > >   * extra_context: describes extra space in the signal frame for
> > >   * additional structures that don't fit in sigcontext.__reserved[].
> > >   *
> > >   * Note:
> > >   *
> > > - * 1) fpsimd_context, esr_context and extra_context must be placed in
> > > - * sigcontext.__reserved[] if present.  They cannot be placed in the
> > > - * extra space.  Any other record can be placed either in the extra
> > > - * space or in sigcontext.__reserved[], unless otherwise specified in
> > > - * this file.
> > > + * 1) fpsimd_context, esr_context, far_context and extra_context must be
> > > + * placed in sigcontext.__reserved[] if present.  They cannot be placed
> > > + * in the extra space.  Any other record can be placed either in the
> > > + * extra space or in sigcontext.__reserved[], unless otherwise specified
> > > + * in this file.
> >
> > This is for backwards compatibility only.  We don't need this constraint
> > for any new field, so you can probably leave the paragraph as-is.
> >
> > Removing this constraint would mean that userspace must be
> > prepared to traverse extra_context when looking for far_context.  But
> > really we want modern userspace to do this anyway, since it reduces
> > backwards compatibility worries when adding more new records in the
> > future.
> 
> My original reason for updating this comment was that I figured that
> this record was small enough that we could just always include it in
> __reserved.
> 
> But thinking about this a bit more, it doesn't seem that just wanting
> userspace to read extra_context will guarantee that it will do so. In
> practice, it would be easy to write userspace code that works right
> now but doesn't read extra_context correctly (either because
> extra_context wasn't considered at all, or because the code purporting
> to read the record from extra_context contains a latent bug because it
> wasn't exercised). Since we may be practically constrained from moving
> the record anyway, we might as well document it and allow the
> userspace code to be a little simpler.
> 
> I guess one alternative is that we always place this record in
> extra_context, which would force userspace to read it correctly. That
> has something of the opposite problem (userspace code could be written
> to only expect the record in extra_context), but at least we're less
> constrained there, and it's more likely that the code would be parsing
> __reserved correctly since it would need to do so in order to find
> extra_context.
> 
> Anyway, I've reverted the comment change for now in v4, but let me
> know what you think.

Apologies for the delay in responding -- I think it does make sense to
reserve space in __reserved[] for the new record, and the location you
suggested for it is sensible.

__reserved[] is a scarce resource, and should only be burned on "small"
records, but far_context is small.


There's another reason too, which is that we don't want to needlessly
block new software from using this field without allocating larger
stacks -- not least because they just won't, and the problem won't
bite them until much later.


Hope that helps clarify things.

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v3] arm64: Expose original FAR_EL1 value in sigcontext
  2020-05-13 17:27         ` Dave Martin
@ 2020-05-13 18:00           ` Peter Collingbourne
  0 siblings, 0 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-13 18:00 UTC (permalink / raw)
  To: Dave Martin
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Catalin Marinas, Vincenzo Frascino,
	Will Deacon, Linux ARM, Richard Henderson

On Wed, May 13, 2020 at 10:27 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Thu, May 07, 2020 at 10:55:02AM -0700, Peter Collingbourne wrote:
> > On Mon, May 4, 2020 at 3:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >
> > > On Fri, Mar 27, 2020 at 12:19:15PM -0700, Peter Collingbourne wrote:
> > > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > > the tag bits may be needed by tools in order to accurately diagnose
> > > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > > Tagging Extension (MTE).
> > > >
> > > > We should not stop clearing these bits in the existing fault address
> > > > fields, because there may be existing userspace applications that are
> > > > expecting the tag bits to be cleared. Instead, create a far_context in
> > > > sigcontext (similar to the existing esr_context), and store the original
> > > > value of FAR_EL1 (including the tag bits) there.
> > > >
> > > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > > >
> > > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > > > ---
> > > > v3:
> > > > - add documentation to tagged-pointers.rst
> > > > - update comments in sigcontext.h
> > > >
> > > > v2:
> > > > - revert changes to hw_breakpoint.c
> > > > - rename set_thread_esr to set_thread_far_esr
> > > >
> > > >  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
> > > >  arch/arm64/include/asm/exception.h       |  2 +-
> > > >  arch/arm64/include/asm/processor.h       |  2 +-
> > > >  arch/arm64/include/uapi/asm/sigcontext.h | 21 +++++++----
> > > >  arch/arm64/kernel/entry-common.c         |  2 --
> > > >  arch/arm64/kernel/signal.c               | 20 ++++++++++-
> > > >  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
> > > >  7 files changed, 74 insertions(+), 35 deletions(-)
> > >
> > > [...]
> > >
> > > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > index 8b0ebce92427..6782394633cb 100644
> > > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > @@ -44,11 +44,12 @@ struct sigcontext {
> > > >   *
> > > >   *   0x210           fpsimd_context
> > > >   *    0x10           esr_context
> > > > + *    0x10           far_context
> > > >   *   0x8a0           sve_context (vl <= 64) (optional)
> > > >   *    0x20           extra_context (optional)
> > > >   *    0x10           terminator (null _aarch64_ctx)
> > > >   *
> > > > - *   0x510           (reserved for future allocation)
> > > > + *   0x500           (reserved for future allocation)
> > > >   *
> > > >   * New records that can exceed this space need to be opt-in for userspace, so
> > > >   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> > > > @@ -94,17 +95,25 @@ struct esr_context {
> > > >       __u64 esr;
> > > >  };
> > > >
> > > > +/* FAR_EL1 context */
> > > > +#define FAR_MAGIC    0x46415201
> > > > +
> > > > +struct far_context {
> > > > +     struct _aarch64_ctx head;
> > > > +     __u64 far;
> > > > +};
> > > > +
> > > >  /*
> > > >   * extra_context: describes extra space in the signal frame for
> > > >   * additional structures that don't fit in sigcontext.__reserved[].
> > > >   *
> > > >   * Note:
> > > >   *
> > > > - * 1) fpsimd_context, esr_context and extra_context must be placed in
> > > > - * sigcontext.__reserved[] if present.  They cannot be placed in the
> > > > - * extra space.  Any other record can be placed either in the extra
> > > > - * space or in sigcontext.__reserved[], unless otherwise specified in
> > > > - * this file.
> > > > + * 1) fpsimd_context, esr_context, far_context and extra_context must be
> > > > + * placed in sigcontext.__reserved[] if present.  They cannot be placed
> > > > + * in the extra space.  Any other record can be placed either in the
> > > > + * extra space or in sigcontext.__reserved[], unless otherwise specified
> > > > + * in this file.
> > >
> > > This is for backwards compatibility only.  We don't need this constraint
> > > for any new field, so you can probably leave the paragraph as-is.
> > >
> > > Removing this would mean constraint would mean that userspace must be
> > > prepared to traverse extra_context when looking for far_context.  But
> > > really we want modern userspace to do this anyway, since it reduces
> > > backwards compatibilty worries when adding more new records in the
> > > future.
> >
> > My original reason for updating this comment was that I figured that
> > this record was small enough that we could just always include it in
> > __reserved.
> >
> > But thinking about this a bit more, it doesn't seem that just wanting
> > userspace to read extra_context will guarantee that it will do so. In
> > practice, it would be easy to write userspace code that works right
> > now but doesn't read extra_context correctly (either because
> > extra_context wasn't considered at all, or because the code purporting
> > to read the record from extra_context contains a latent bug because it
> > wasn't exercised). Since we may be practically constrained from moving
> > the record anyway, we might as well document it and allow the
> > userspace code to be a little simpler.
> >
> > I guess one alternative is that we always place this record in
> > extra_context, which would force userspace to read it correctly. That
> > has something of the opposite problem (userspace code could be written
> > to only expect the record in extra_context), but at least we're less
> > constrained there, and it's more likely that the code would be parsing
> > __reserved correctly since it would need to do so in order to find
> > extra_context.
> >
> > Anyway, I've reverted the comment change for now in v4, but let me
> > know what you think.
>
> Apologies for the delay in responding -- I think it does make sense to
> reserve space in __reserved[] for the new record, the the location you
> suggested for it is sensible.
>
> __reserved[] is a scarce resource, and should only be burned on "small"
> records, but far_context is small.
>
>
> here's another reason too, which is that we don't want to needlessly
> block new software from using this field without allocating larger
> stacks -- not least because they just won't, and the problem won't
> bite them until much later.
>
>
> Hope that helps clarify things.

Thanks, that makes sense. I will send a v6 with the comment brought back.

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-08  2:01             ` [PATCH v5] " Peter Collingbourne
  2020-05-12 16:25               ` Catalin Marinas
@ 2020-05-13 18:09               ` Peter Collingbourne
  2020-05-13 20:28                 ` Dave Martin
  1 sibling, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-13 18:09 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany,
	Vincenzo Frascino, Dave Martin, Will Deacon
  Cc: Andrey Konovalov, Kevin Brodsky, Peter Collingbourne, Linux ARM,
	Richard Henderson

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
the tag bits may be needed by tools in order to accurately diagnose
memory errors, such as HWASan [1] or future tools based on the Memory
Tagging Extension (MTE).

We should not stop clearing these bits in the existing fault address fields,
because there may be existing userspace applications that are expecting the tag
bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
(similar to the existing esr_context), and store the tag bits of FAR_EL1 there.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
v6:
- bring back comment about __reserved[]

v5:
- add padding to fault_addr_top_byte_context in order to ensure the correct
  size and preserve sp alignment

v4:
- expose only the tag bits in the context instead of the entire FAR_EL1
- remove mention of the new context from the sigcontext.__reserved[] note

v3:
- add documentation to tagged-pointers.rst
- update comments in sigcontext.h

v2:
- revert changes to hw_breakpoint.c
- rename set_thread_esr to set_thread_far_esr

 Documentation/arm64/tagged-pointers.rst  | 17 +++++----
 arch/arm64/include/asm/exception.h       |  2 +-
 arch/arm64/include/asm/processor.h       |  2 +-
 arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
 arch/arm64/kernel/entry-common.c         |  2 --
 arch/arm64/kernel/signal.c               | 22 +++++++++++-
 arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
 7 files changed, 77 insertions(+), 35 deletions(-)

diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
index eab4323609b9..c6e9592a9dea 100644
--- a/Documentation/arm64/tagged-pointers.rst
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -53,12 +53,17 @@ visibility.
 Preserving tags
 ---------------
 
-Non-zero tags are not preserved when delivering signals. This means that
-signal handlers in applications making use of tags cannot rely on the
-tag information for user virtual addresses being maintained for fields
-inside siginfo_t. One exception to this rule is for signals raised in
-response to watchpoint debug exceptions, where the tag information will
-be preserved.
+Non-zero tags are not preserved in the fault address fields
+siginfo.si_addr or sigcontext.fault_address when delivering
+signals. This means that signal handlers in applications making use
+of tags cannot rely on the tag information for user virtual addresses
+being maintained in these fields. One exception to this rule is for
+signals raised in response to watchpoint debug exceptions, where the
+tag information will be preserved.
+
+The fault address tag is preserved in the fault_addr_top_byte field of
+the signal frame record fault_addr_top_byte_context, which is present
+for signals raised in response to data aborts and instruction aborts.
 
 The architecture prevents the use of a tagged PC, so the upper byte will
 be set to a sign-extension of bit 55 on exception return.
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..90e772d9b2cd 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
 }
 
 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 240fe5e5b720..63185be29ff9 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -144,7 +144,7 @@ struct thread_struct {
 	void			*sve_state;	/* SVE registers, if any */
 	unsigned int		sve_vl;		/* SVE vector length */
 	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
-	unsigned long		fault_address;	/* fault info */
+	unsigned long		fault_address;	/* FAR_EL1 value */
 	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
 #ifdef CONFIG_ARM64_PTR_AUTH
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8b0ebce92427..2a3fe3de899d 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -44,11 +44,12 @@ struct sigcontext {
  *
  *	0x210		fpsimd_context
  *	 0x10		esr_context
+ *	 0x10		fault_addr_top_byte_context
  *	0x8a0		sve_context (vl <= 64) (optional)
  *	 0x20		extra_context (optional)
  *	 0x10		terminator (null _aarch64_ctx)
  *
- *	0x510		(reserved for future allocation)
+ *	0x500		(reserved for future allocation)
  *
  * New records that can exceed this space need to be opt-in for userspace, so
  * that an expanded signal frame is not generated unexpectedly.  The mechanism
@@ -94,17 +95,26 @@ struct esr_context {
 	__u64 esr;
 };
 
+/* Top byte of fault address (normally not exposed via si_addr) */
+#define FAULT_ADDR_TOP_BYTE_MAGIC	0x46544201
+
+struct fault_addr_top_byte_context {
+	struct _aarch64_ctx head;
+	__u8 fault_addr_top_byte;
+	__u8 __reserved[7];
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].
  *
  * Note:
  *
- * 1) fpsimd_context, esr_context and extra_context must be placed in
- * sigcontext.__reserved[] if present.  They cannot be placed in the
- * extra space.  Any other record can be placed either in the extra
- * space or in sigcontext.__reserved[], unless otherwise specified in
- * this file.
+ * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
+ * extra_context must be placed in sigcontext.__reserved[] if present.
+ * They cannot be placed in the extra space.  Any other record can be
+ * placed either in the extra space or in sigcontext.__reserved[],
+ * unless otherwise specified in this file.
  *
  * 2) There must not be more than one extra_context.
  *
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index c839b5bf1904..045b4f518836 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 
 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
 
 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 339882db5a91..baa88dc02e5c 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
 
 	unsigned long fpsimd_offset;
 	unsigned long esr_offset;
+	unsigned long ftb_offset;
 	unsigned long sve_offset;
 	unsigned long extra_offset;
 	unsigned long end_offset;
@@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
 			break;
 
 		case ESR_MAGIC:
+		case FAULT_ADDR_TOP_BYTE_MAGIC:
 			/* ignore */
 			break;
 
@@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 				     sizeof(struct esr_context));
 		if (err)
 			return err;
+
+		err = sigframe_alloc(
+			user, &user->ftb_offset,
+			sizeof(struct fault_addr_top_byte_context));
+		if (err)
+			return err;
 	}
 
 	if (system_supports_sve()) {
@@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
 	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
 
-	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
+	__put_user_error(untagged_addr(current->thread.fault_address),
+			 &sf->uc.uc_mcontext.fault_address, err);
 
 	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
@@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
 	}
 
+	if (err == 0 && user->ftb_offset) {
+		struct fault_addr_top_byte_context __user *ftb_ctx =
+			apply_user_offset(user, user->ftb_offset);
+
+		__put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
+				 &ftb_ctx->head.magic, err);
+		__put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
+		__put_user_error(current->thread.fault_address >> 56,
+				 &ftb_ctx->fault_addr_top_byte, err);
+	}
+
 	/* Scalable Vector Extension state, if present */
 	if (system_supports_sve() && err == 0 && user->sve_offset) {
 		struct sve_context __user *sve_ctx =
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c9cedc0432d2..39bbaa05f162 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -41,7 +41,7 @@
 #include <asm/traps.h>
 
 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void set_thread_esr(unsigned long address, unsigned int esr)
+static void set_thread_far_esr(unsigned long far, unsigned int esr)
 {
-	current->thread.fault_address = address;
+	unsigned long addr = untagged_addr(far);
+
+	current->thread.fault_address = far;
 
 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -336,7 +338,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	 * type", so we ignore this wrinkle and just return the translation
 	 * fault.)
 	 */
-	if (!is_ttbr0_addr(current->thread.fault_address)) {
+	if (!is_ttbr0_addr(addr)) {
 		switch (ESR_ELx_EC(esr)) {
 		case ESR_ELx_EC_DABT_LOW:
 			/*
@@ -377,8 +379,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }
 
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
@@ -386,7 +391,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
 
-		set_thread_esr(addr, esr);
+		set_thread_far_esr(far, esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
 				      inf->name);
 	} else {
@@ -439,7 +444,7 @@ static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -447,6 +452,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_ACCESS_FLAGS;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+	unsigned long addr = untagged_addr(far);
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -570,7 +576,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	}
 
 	inf = esr_to_fault_info(esr);
-	set_thread_esr(addr, esr);
+	set_thread_far_esr(far, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
@@ -605,30 +611,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);
 
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	void __user *siaddr;
@@ -644,7 +652,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	if (esr & ESR_ELx_FnV)
 		siaddr = NULL;
 	else
-		siaddr  = (void __user *)addr;
+		siaddr  = (void __user *)untagged_addr(far);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -717,11 +725,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {
-- 
2.26.2.645.ge9eca65c58-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-13 18:09               ` [PATCH v6] " Peter Collingbourne
@ 2020-05-13 20:28                 ` Dave Martin
  2020-05-15  0:58                   ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-05-13 20:28 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany, Linux ARM,
	Catalin Marinas, Vincenzo Frascino, Will Deacon,
	Evgenii Stepanov, Richard Henderson

On Wed, May 13, 2020 at 11:09:14AM -0700, Peter Collingbourne wrote:
> The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> the tag bits may be needed by tools in order to accurately diagnose
> memory errors, such as HWASan [1] or future tools based on the Memory
> Tagging Extension (MTE).
> 
> We should not stop clearing these bits in the existing fault address fields,
> because there may be existing userspace applications that are expecting the tag
> bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
> (similar to the existing esr_context), and store the tag bits of FAR_EL1 there.
> 
> [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> 
> Signed-off-by: Peter Collingbourne <pcc@google.com>
> ---
> v6:
> - bring back comment about __reserved[]
> 
> v5:
> - add padding to fault_addr_top_byte_context in order to ensure the correct
>   size and preserve sp alignment
> 
> v4:
> - expose only the tag bits in the context instead of the entire FAR_EL1
> - remove mention of the new context from the sigcontext.__reserved[] note
> 
> v3:
> - add documentation to tagged-pointers.rst
> - update comments in sigcontext.h
> 
> v2:
> - revert changes to hw_breakpoint.c
> - rename set_thread_esr to set_thread_far_esr
> 
>  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
>  arch/arm64/include/asm/exception.h       |  2 +-
>  arch/arm64/include/asm/processor.h       |  2 +-
>  arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
>  arch/arm64/kernel/entry-common.c         |  2 --
>  arch/arm64/kernel/signal.c               | 22 +++++++++++-
>  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
>  7 files changed, 77 insertions(+), 35 deletions(-)
> 
> diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
> index eab4323609b9..c6e9592a9dea 100644
> --- a/Documentation/arm64/tagged-pointers.rst
> +++ b/Documentation/arm64/tagged-pointers.rst
> @@ -53,12 +53,17 @@ visibility.
>  Preserving tags
>  ---------------
>  
> -Non-zero tags are not preserved when delivering signals. This means that
> -signal handlers in applications making use of tags cannot rely on the
> -tag information for user virtual addresses being maintained for fields
> -inside siginfo_t. One exception to this rule is for signals raised in
> -response to watchpoint debug exceptions, where the tag information will
> -be preserved.
> +Non-zero tags are not preserved in the fault address fields
> +siginfo.si_addr or sigcontext.fault_address when delivering
> +signals. This means that signal handlers in applications making use
> +of tags cannot rely on the tag information for user virtual addresses
> +being maintained in these fields. One exception to this rule is for
> +signals raised in response to watchpoint debug exceptions, where the
> +tag information will be preserved.
> +
> +The fault address tag is preserved in the fault_addr_top_byte field of
> +the signal frame record fault_addr_top_byte_context, which is present
> +for signals raised in response to data aborts and instruction aborts.
>  
>  The architecture prevents the use of a tagged PC, so the upper byte will
>  be set to a sign-extension of bit 55 on exception return.
> diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> index 7a6e81ca23a8..90e772d9b2cd 100644
> --- a/arch/arm64/include/asm/exception.h
> +++ b/arch/arm64/include/asm/exception.h
> @@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
>  }
>  
>  asmlinkage void enter_from_user_mode(void);
> -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
> +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
>  void do_undefinstr(struct pt_regs *regs);
>  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
>  void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> index 240fe5e5b720..63185be29ff9 100644
> --- a/arch/arm64/include/asm/processor.h
> +++ b/arch/arm64/include/asm/processor.h
> @@ -144,7 +144,7 @@ struct thread_struct {
>  	void			*sve_state;	/* SVE registers, if any */
>  	unsigned int		sve_vl;		/* SVE vector length */
>  	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
> -	unsigned long		fault_address;	/* fault info */
> +	unsigned long		fault_address;	/* FAR_EL1 value */
>  	unsigned long		fault_code;	/* ESR_EL1 value */
>  	struct debug_info	debug;		/* debugging */
>  #ifdef CONFIG_ARM64_PTR_AUTH
> diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> index 8b0ebce92427..2a3fe3de899d 100644
> --- a/arch/arm64/include/uapi/asm/sigcontext.h
> +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> @@ -44,11 +44,12 @@ struct sigcontext {
>   *
>   *	0x210		fpsimd_context
>   *	 0x10		esr_context
> + *	 0x10		fault_addr_top_byte_context
>   *	0x8a0		sve_context (vl <= 64) (optional)
>   *	 0x20		extra_context (optional)
>   *	 0x10		terminator (null _aarch64_ctx)
>   *
> - *	0x510		(reserved for future allocation)
> + *	0x500		(reserved for future allocation)
>   *
>   * New records that can exceed this space need to be opt-in for userspace, so
>   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> @@ -94,17 +95,26 @@ struct esr_context {
>  	__u64 esr;
>  };
>  
> +/* Top byte of fault address (normally not exposed via si_addr) */
> +#define FAULT_ADDR_TOP_BYTE_MAGIC	0x46544201
> +
> +struct fault_addr_top_byte_context {
> +	struct _aarch64_ctx head;
> +	__u8 fault_addr_top_byte;
> +	__u8 __reserved[7];
> +};
> +

Nit: the name here is a bit cumbersome (obviously bikeshedding...)


For the rest, some of my comments may be bogus -- I haven't dug into
this stuff for a little while!


Anyway:

Do we really get the whole top byte of the address in the FAR?  If so,
fine, but I'm having trouble finding a clear statement in the
architecture one way or the other.  (I didn't attempt to dive into the
pseudocode.)


Also, since we're burning 16 bytes here, I'd prefer if we make this
extensible.  At present the __reserved[7] is unusable because
userspace has no way to know whether it's valid or not.

Options include an additional flag byte (0 for now), or just making
the whole thing a __u64.  In that case we can leave the top byte bits
in their original positions if we want, but it would be a good idea to
include a flag to say that field is valid at all.  (See comments below
on Synchronous external abort.)

So, say, foo_context->fault_info = (far & (~0ULL << 56)) | TOP_BYTE_VALID.
(with #defines for the bits/fields as appropriate).


>  /*
>   * extra_context: describes extra space in the signal frame for
>   * additional structures that don't fit in sigcontext.__reserved[].
>   *
>   * Note:
>   *
> - * 1) fpsimd_context, esr_context and extra_context must be placed in
> - * sigcontext.__reserved[] if present.  They cannot be placed in the
> - * extra space.  Any other record can be placed either in the extra
> - * space or in sigcontext.__reserved[], unless otherwise specified in
> - * this file.
> + * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
> + * extra_context must be placed in sigcontext.__reserved[] if present.
> + * They cannot be placed in the extra space.  Any other record can be
> + * placed either in the extra space or in sigcontext.__reserved[],
> + * unless otherwise specified in this file.
>   *
>   * 2) There must not be more than one extra_context.
>   *
> diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> index c839b5bf1904..045b4f518836 100644
> --- a/arch/arm64/kernel/entry-common.c
> +++ b/arch/arm64/kernel/entry-common.c
> @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
>  	unsigned long far = read_sysreg(far_el1);
>  
>  	local_daif_inherit(regs);
> -	far = untagged_addr(far);
>  	do_mem_abort(far, esr, regs);
>  }
>  NOKPROBE_SYMBOL(el1_abort);
> @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
>  
>  	user_exit_irqoff();
>  	local_daif_restore(DAIF_PROCCTX);
> -	far = untagged_addr(far);
>  	do_mem_abort(far, esr, regs);
>  }
>  NOKPROBE_SYMBOL(el0_da);
> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> index 339882db5a91..baa88dc02e5c 100644
> --- a/arch/arm64/kernel/signal.c
> +++ b/arch/arm64/kernel/signal.c
> @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
>  
>  	unsigned long fpsimd_offset;
>  	unsigned long esr_offset;
> +	unsigned long ftb_offset;
>  	unsigned long sve_offset;
>  	unsigned long extra_offset;
>  	unsigned long end_offset;
> @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
>  			break;
>  
>  		case ESR_MAGIC:
> +		case FAULT_ADDR_TOP_BYTE_MAGIC:
>  			/* ignore */
>  			break;
>  
> @@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
>  				     sizeof(struct esr_context));
>  		if (err)
>  			return err;
> +
> +		err = sigframe_alloc(
> +			user, &user->ftb_offset,
> +			sizeof(struct fault_addr_top_byte_context));

Nit: inconsistent indentation?

(Mostly just because it makes the change look odd against the hunk
context, but not a big deal.)

> +		if (err)
> +			return err;
>  	}
>  
>  	if (system_supports_sve()) {
> @@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
>  	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
>  	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
>  
> -	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
> +	__put_user_error(untagged_addr(current->thread.fault_address),
> +			 &sf->uc.uc_mcontext.fault_address, err);
>  
>  	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
>  
> @@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
>  		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
>  	}
>  
> +	if (err == 0 && user->ftb_offset) {
> +		struct fault_addr_top_byte_context __user *ftb_ctx =
> +			apply_user_offset(user, user->ftb_offset);
> +
> +		__put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
> +				 &ftb_ctx->head.magic, err);
> +		__put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
> +		__put_user_error(current->thread.fault_address >> 56,
> +				 &ftb_ctx->fault_addr_top_byte, err);
> +	}
> +

How do we handle the fact that the top byte of FAR is sometimes UNKNOWN?

For Synchronous external aborts in particular, those bits are documented
as UNKNOWN, but I don't see any special handling.  There may be other
cases I haven't spotted.

For preference we can omit this record entirely if we don't have any
information we can report, but certainly we shouldn't expose UNKNOWN
bits.


[ Aside:

Also, what if we're not reporting a memory abort at all?  Does
thread.fault_address just contain junk from the last fault?  I see
nothing anywhere that cleans this up.  (This is historical and not
your fault, but it would be good to close this down while we're about
it.)


Hmmm, looking at the code I think we probably leak fault_address etc.
across execve() too, so it may even be stale junk from an old process
:/

Maybe I just confused myself. 

End aside. ]


Apart from these issues, the actual code looks reasonable.

Cheers
---Dave


>  	/* Scalable Vector Extension state, if present */
>  	if (system_supports_sve() && err == 0 && user->sve_offset) {
>  		struct sve_context __user *sve_ctx =
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index c9cedc0432d2..39bbaa05f162 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -41,7 +41,7 @@
>  #include <asm/traps.h>
>  
>  struct fault_info {
> -	int	(*fn)(unsigned long addr, unsigned int esr,
> +	int	(*fn)(unsigned long far, unsigned int esr,
>  		      struct pt_regs *regs);
>  	int	sig;
>  	int	code;
> @@ -320,9 +320,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
>  	die_kernel_fault(msg, addr, esr, regs);
>  }
>  
> -static void set_thread_esr(unsigned long address, unsigned int esr)
> +static void set_thread_far_esr(unsigned long far, unsigned int esr)
>  {
> -	current->thread.fault_address = address;
> +	unsigned long addr = untagged_addr(far);
> +
> +	current->thread.fault_address = far;
>  
>  	/*
>  	 * If the faulting address is in the kernel, we must sanitize the ESR.
> @@ -336,7 +338,7 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
>  	 * type", so we ignore this wrinkle and just return the translation
>  	 * fault.)
>  	 */
> -	if (!is_ttbr0_addr(current->thread.fault_address)) {
> +	if (!is_ttbr0_addr(addr)) {
>  		switch (ESR_ELx_EC(esr)) {
>  		case ESR_ELx_EC_DABT_LOW:
>  			/*
> @@ -377,8 +379,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
>  	current->thread.fault_code = esr;
>  }
>  
> -static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
> +static void do_bad_area(unsigned long far, unsigned int esr,
> +			struct pt_regs *regs)
>  {
> +	unsigned long addr = untagged_addr(far);
> +
>  	/*
>  	 * If we are in kernel mode at this point, we have no context to
>  	 * handle this fault with.
> @@ -386,7 +391,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
>  	if (user_mode(regs)) {
>  		const struct fault_info *inf = esr_to_fault_info(esr);
>  
> -		set_thread_esr(addr, esr);
> +		set_thread_far_esr(far, esr);
>  		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
>  				      inf->name);
>  	} else {
> @@ -439,7 +444,7 @@ static bool is_write_abort(unsigned int esr)
>  	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
>  }
>  
> -static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
> +static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
>  				   struct pt_regs *regs)
>  {
>  	const struct fault_info *inf;
> @@ -447,6 +452,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
>  	vm_fault_t fault, major = 0;
>  	unsigned long vm_flags = VM_ACCESS_FLAGS;
>  	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
> +	unsigned long addr = untagged_addr(far);
>  
>  	if (kprobe_page_fault(regs, esr))
>  		return 0;
> @@ -570,7 +576,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
>  	}
>  
>  	inf = esr_to_fault_info(esr);
> -	set_thread_esr(addr, esr);
> +	set_thread_far_esr(far, esr);
>  	if (fault & VM_FAULT_SIGBUS) {
>  		/*
>  		 * We had some memory, but were unable to successfully fix up
> @@ -605,30 +611,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
>  	return 0;
>  }
>  
> -static int __kprobes do_translation_fault(unsigned long addr,
> +static int __kprobes do_translation_fault(unsigned long far,
>  					  unsigned int esr,
>  					  struct pt_regs *regs)
>  {
> +	unsigned long addr = untagged_addr(far);
> +
>  	if (is_ttbr0_addr(addr))
> -		return do_page_fault(addr, esr, regs);
> +		return do_page_fault(far, esr, regs);
>  
> -	do_bad_area(addr, esr, regs);
> +	do_bad_area(far, esr, regs);
>  	return 0;
>  }
>  
> -static int do_alignment_fault(unsigned long addr, unsigned int esr,
> +static int do_alignment_fault(unsigned long far, unsigned int esr,
>  			      struct pt_regs *regs)
>  {
> -	do_bad_area(addr, esr, regs);
> +	do_bad_area(far, esr, regs);
>  	return 0;
>  }
>  
> -static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
> +static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
>  {
>  	return 1; /* "fault" */
>  }
>  
> -static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
> +static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
>  {
>  	const struct fault_info *inf;
>  	void __user *siaddr;
> @@ -644,7 +652,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
>  	if (esr & ESR_ELx_FnV)
>  		siaddr = NULL;
>  	else
> -		siaddr  = (void __user *)addr;
> +		siaddr  = (void __user *)untagged_addr(far);
>  	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
>  
>  	return 0;
> @@ -717,11 +725,12 @@ static const struct fault_info fault_info[] = {
>  	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
>  };
>  
> -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
> +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
>  {
>  	const struct fault_info *inf = esr_to_fault_info(esr);
> +	unsigned long addr = untagged_addr(far);
>  
> -	if (!inf->fn(addr, esr, regs))
> +	if (!inf->fn(far, esr, regs))
>  		return;
>  
>  	if (!user_mode(regs)) {
> -- 
> 2.26.2.645.ge9eca65c58-goog
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-13 20:28                 ` Dave Martin
@ 2020-05-15  0:58                   ` Peter Collingbourne
  2020-05-18  9:53                     ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-15  0:58 UTC (permalink / raw)
  To: Dave Martin
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany, Linux ARM,
	Catalin Marinas, Vincenzo Frascino, Will Deacon,
	Evgenii Stepanov, Richard Henderson

On Wed, May 13, 2020 at 1:28 PM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Wed, May 13, 2020 at 11:09:14AM -0700, Peter Collingbourne wrote:
> > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > the tag bits may be needed by tools in order to accurately diagnose
> > memory errors, such as HWASan [1] or future tools based on the Memory
> > Tagging Extension (MTE).
> >
> > We should not stop clearing these bits in the existing fault address fields,
> > because there may be existing userspace applications that are expecting the tag
> > bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
> > (similar to the existing esr_context), and store the tag bits of FAR_EL1 there.
> >
> > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> >
> > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > ---
> > v6:
> > - bring back comment about __reserved[]
> >
> > v5:
> > - add padding to fault_addr_top_byte_context in order to ensure the correct
> >   size and preserve sp alignment
> >
> > v4:
> > - expose only the tag bits in the context instead of the entire FAR_EL1
> > - remove mention of the new context from the sigcontext.__reserved[] note
> >
> > v3:
> > - add documentation to tagged-pointers.rst
> > - update comments in sigcontext.h
> >
> > v2:
> > - revert changes to hw_breakpoint.c
> > - rename set_thread_esr to set_thread_far_esr
> >
> >  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
> >  arch/arm64/include/asm/exception.h       |  2 +-
> >  arch/arm64/include/asm/processor.h       |  2 +-
> >  arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
> >  arch/arm64/kernel/entry-common.c         |  2 --
> >  arch/arm64/kernel/signal.c               | 22 +++++++++++-
> >  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
> >  7 files changed, 77 insertions(+), 35 deletions(-)
> >
> > diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
> > index eab4323609b9..c6e9592a9dea 100644
> > --- a/Documentation/arm64/tagged-pointers.rst
> > +++ b/Documentation/arm64/tagged-pointers.rst
> > @@ -53,12 +53,17 @@ visibility.
> >  Preserving tags
> >  ---------------
> >
> > -Non-zero tags are not preserved when delivering signals. This means that
> > -signal handlers in applications making use of tags cannot rely on the
> > -tag information for user virtual addresses being maintained for fields
> > -inside siginfo_t. One exception to this rule is for signals raised in
> > -response to watchpoint debug exceptions, where the tag information will
> > -be preserved.
> > +Non-zero tags are not preserved in the fault address fields
> > +siginfo.si_addr or sigcontext.fault_address when delivering
> > +signals. This means that signal handlers in applications making use
> > +of tags cannot rely on the tag information for user virtual addresses
> > +being maintained in these fields. One exception to this rule is for
> > +signals raised in response to watchpoint debug exceptions, where the
> > +tag information will be preserved.
> > +
> > +The fault address tag is preserved in the fault_addr_top_byte field of
> > +the signal frame record fault_addr_top_byte_context, which is present
> > +for signals raised in response to data aborts and instruction aborts.
> >
> >  The architecture prevents the use of a tagged PC, so the upper byte will
> >  be set to a sign-extension of bit 55 on exception return.
> > diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> > index 7a6e81ca23a8..90e772d9b2cd 100644
> > --- a/arch/arm64/include/asm/exception.h
> > +++ b/arch/arm64/include/asm/exception.h
> > @@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
> >  }
> >
> >  asmlinkage void enter_from_user_mode(void);
> > -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
> > +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
> >  void do_undefinstr(struct pt_regs *regs);
> >  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
> >  void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
> > diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> > index 240fe5e5b720..63185be29ff9 100644
> > --- a/arch/arm64/include/asm/processor.h
> > +++ b/arch/arm64/include/asm/processor.h
> > @@ -144,7 +144,7 @@ struct thread_struct {
> >       void                    *sve_state;     /* SVE registers, if any */
> >       unsigned int            sve_vl;         /* SVE vector length */
> >       unsigned int            sve_vl_onexec;  /* SVE vl after next exec */
> > -     unsigned long           fault_address;  /* fault info */
> > +     unsigned long           fault_address;  /* FAR_EL1 value */
> >       unsigned long           fault_code;     /* ESR_EL1 value */
> >       struct debug_info       debug;          /* debugging */
> >  #ifdef CONFIG_ARM64_PTR_AUTH
> > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > index 8b0ebce92427..2a3fe3de899d 100644
> > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > @@ -44,11 +44,12 @@ struct sigcontext {
> >   *
> >   *   0x210           fpsimd_context
> >   *    0x10           esr_context
> > + *    0x10           fault_addr_top_byte_context
> >   *   0x8a0           sve_context (vl <= 64) (optional)
> >   *    0x20           extra_context (optional)
> >   *    0x10           terminator (null _aarch64_ctx)
> >   *
> > - *   0x510           (reserved for future allocation)
> > + *   0x500           (reserved for future allocation)
> >   *
> >   * New records that can exceed this space need to be opt-in for userspace, so
> >   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> > @@ -94,17 +95,26 @@ struct esr_context {
> >       __u64 esr;
> >  };
> >
> > +/* Top byte of fault address (normally not exposed via si_addr) */
> > +#define FAULT_ADDR_TOP_BYTE_MAGIC    0x46544201
> > +
> > +struct fault_addr_top_byte_context {
> > +     struct _aarch64_ctx head;
> > +     __u8 fault_addr_top_byte;
> > +     __u8 __reserved[7];
> > +};
> > +
>
> Nit: the name here is a bit cumbersome (obviously bikeshedding...)
>
>
> For the rest, some of my comments may be bogus -- I haven't dug into
> this stuff for a little while!
>
>
> Anyway:
>
> Do we really get the whole top byte of the address in the FAR?  If so,
> fine, but I'm having trouble finding a clear statement in the
> architecture one way or the other.  (I didn't attempt to dive into the
> pseudocode.)

I rely on this statement in the ARM:

https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/far_el1
"For a Data Abort or Watchpoint exception, if address tagging is
enabled for the address accessed by the data access that caused the
exception, then this field includes the tag."

And note that address tagging here essentially means TBI (which is
always enabled on Linux), and not memory tagging.

> Also, since we're burning 16 bytes here, I'd prefer if we make this
> extensible.  At present the __reserved[7] is unusable because
> userspace has no way to know whether it's valid or not.
>
> Options include an additional flag byte (0 for now), or just making
> the whole thing a __u64.  In that case we can leave the top byte bits
> in their original positions if we want, but it would be a good idea to
> include a flag to say that field is valid at all.  (See comments below
> on Synchronous external abort.)
>
> So, say, foo_context->fault_info = (esr & (~0ULL << 56)) | TOP_BYTE_VALID.
> (with #defines for the bits/fields as appropriate).

The flag bits seem like a good idea. Thinking ahead to the MTE sync
tag fault (which might not provide us with bits 60-63), we may
consider having separate bits to indicate "bits 56-59 valid" and "bits
60-63 valid". We would set both bits for regular data aborts and only
the former for sync tag faults, which would avoid the need to define a
separate context for these faults. And if a future architecture
revision provides us with bits 60-63 for tag faults, we could start
setting both flag bits even for tag faults.

> >  /*
> >   * extra_context: describes extra space in the signal frame for
> >   * additional structures that don't fit in sigcontext.__reserved[].
> >   *
> >   * Note:
> >   *
> > - * 1) fpsimd_context, esr_context and extra_context must be placed in
> > - * sigcontext.__reserved[] if present.  They cannot be placed in the
> > - * extra space.  Any other record can be placed either in the extra
> > - * space or in sigcontext.__reserved[], unless otherwise specified in
> > - * this file.
> > + * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
> > + * extra_context must be placed in sigcontext.__reserved[] if present.
> > + * They cannot be placed in the extra space.  Any other record can be
> > + * placed either in the extra space or in sigcontext.__reserved[],
> > + * unless otherwise specified in this file.
> >   *
> >   * 2) There must not be more than one extra_context.
> >   *
> > diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> > index c839b5bf1904..045b4f518836 100644
> > --- a/arch/arm64/kernel/entry-common.c
> > +++ b/arch/arm64/kernel/entry-common.c
> > @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
> >       unsigned long far = read_sysreg(far_el1);
> >
> >       local_daif_inherit(regs);
> > -     far = untagged_addr(far);
> >       do_mem_abort(far, esr, regs);
> >  }
> >  NOKPROBE_SYMBOL(el1_abort);
> > @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
> >
> >       user_exit_irqoff();
> >       local_daif_restore(DAIF_PROCCTX);
> > -     far = untagged_addr(far);
> >       do_mem_abort(far, esr, regs);
> >  }
> >  NOKPROBE_SYMBOL(el0_da);
> > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > index 339882db5a91..baa88dc02e5c 100644
> > --- a/arch/arm64/kernel/signal.c
> > +++ b/arch/arm64/kernel/signal.c
> > @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
> >
> >       unsigned long fpsimd_offset;
> >       unsigned long esr_offset;
> > +     unsigned long ftb_offset;
> >       unsigned long sve_offset;
> >       unsigned long extra_offset;
> >       unsigned long end_offset;
> > @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
> >                       break;
> >
> >               case ESR_MAGIC:
> > +             case FAULT_ADDR_TOP_BYTE_MAGIC:
> >                       /* ignore */
> >                       break;
> >
> > @@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> >                                    sizeof(struct esr_context));
> >               if (err)
> >                       return err;
> > +
> > +             err = sigframe_alloc(
> > +                     user, &user->ftb_offset,
> > +                     sizeof(struct fault_addr_top_byte_context));
>
> Nit: inconsistent indentation?
>
> (Mostly just because it makes the change look odd against the hunk
> context, but not a big deal.)

With consistent indentation we violate 80 cols due to the extra long
struct name. The indentation is what clang-format is giving me.

> > +             if (err)
> > +                     return err;
> >       }
> >
> >       if (system_supports_sve()) {
> > @@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> >       __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
> >       __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
> >
> > -     __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
> > +     __put_user_error(untagged_addr(current->thread.fault_address),
> > +                      &sf->uc.uc_mcontext.fault_address, err);
> >
> >       err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
> >
> > @@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> >               __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> >       }
> >
> > +     if (err == 0 && user->ftb_offset) {
> > +             struct fault_addr_top_byte_context __user *ftb_ctx =
> > +                     apply_user_offset(user, user->ftb_offset);
> > +
> > +             __put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
> > +                              &ftb_ctx->head.magic, err);
> > +             __put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
> > +             __put_user_error(current->thread.fault_address >> 56,
> > +                              &ftb_ctx->fault_addr_top_byte, err);
> > +     }
> > +
>
> How do we handle the fact that the top byte of FAR is sometimes UNKNOWN?
>
> For Synchronous external aborts in particular, those bits are documented
> as UNKNOWN, but I don't see any special handling,  There may be other
> cases I haven't spotted.
>
> For preference we can omit this record entirely if we don't have any
> information we can report, but certainly we shouldn't expose UNKNOWN
> bits.

In this case we mask out the top byte in do_sea before passing the
address to arm64_notify_die (which clears fault_address and passes the
address argument on to arm64_force_sig_fault to be exposed via
si_addr). So the record would always contain a 0 byte. It seems
reasonable to omit the record in this case instead.

> [ Aside:
>
> Also, what if we're not reporting a memory abort at all?  Does
> thread.fault_address just contain junk from the last fault?  I see
> nothing anywhere that cleans this up.  (This is historical and not
> your fault, but it would be good to close this down while we're about
> it.)
>
>
> Hmmm, looking at the code I think we probably leak fault_address etc.
> across execve() too, so it may even be stale junk from an old process
> :/
>
> Maybe I just confused myself.
>
> End aside. ]

Yes, it's unclear whether we always manage to not expose a fault
address if we're not reporting a data or instruction abort. The code
would need to arrange for fault_code to be set to 0 in order to avoid
exposing previous fault_address values via future signals. I don't see
anywhere that we reset these fields after delivering a signal, so
such a leak seems possible by calling arm64_force_sig_fault without
first setting fault_code (most callers do set it, but the calls in
arch/arm64/kernel/debug-monitors.c and arch/arm64/kernel/ptrace.c seem
not to), or simply by calling force_sig_fault (which happens in many
places throughout the kernel).

Maybe something like this would do the trick? (Untested, and forgive
spaces instead of tabs, grumble grumble gmail):

diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index baa88dc02e5c..5867f2fdbe64 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -648,6 +648,7 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
                __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
                __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
                __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
+               current->thread.fault_code = 0;
        }

        if (err == 0 && user->ftb_offset) {

> Apart from these issues, the actual code looks reasonable.

Thanks for the review.

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-15  0:58                   ` Peter Collingbourne
@ 2020-05-18  9:53                     ` Dave Martin
  2020-05-19 22:00                       ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-05-18  9:53 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Catalin Marinas, Vincenzo Frascino,
	Will Deacon, Linux ARM, Richard Henderson

On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> On Wed, May 13, 2020 at 1:28 PM Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Wed, May 13, 2020 at 11:09:14AM -0700, Peter Collingbourne wrote:
> > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > the tag bits may be needed by tools in order to accurately diagnose
> > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > Tagging Extension (MTE).
> > >
> > > We should not stop clearing these bits in the existing fault address fields,
> > > because there may be existing userspace applications that are expecting the tag
> > > bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
> > > (similar to the existing esr_context), and store the tag bits of FAR_EL1 there.
> > >
> > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > >
> > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > > ---
> > > v6:
> > > - bring back comment about __reserved[]
> > >
> > > v5:
> > > - add padding to fault_addr_top_byte_context in order to ensure the correct
> > >   size and preserve sp alignment
> > >
> > > v4:
> > > - expose only the tag bits in the context instead of the entire FAR_EL1
> > > - remove mention of the new context from the sigcontext.__reserved[] note
> > >
> > > v3:
> > > - add documentation to tagged-pointers.rst
> > > - update comments in sigcontext.h
> > >
> > > v2:
> > > - revert changes to hw_breakpoint.c
> > > - rename set_thread_esr to set_thread_far_esr
> > >
> > >  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
> > >  arch/arm64/include/asm/exception.h       |  2 +-
> > >  arch/arm64/include/asm/processor.h       |  2 +-
> > >  arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
> > >  arch/arm64/kernel/entry-common.c         |  2 --
> > >  arch/arm64/kernel/signal.c               | 22 +++++++++++-
> > >  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
> > >  7 files changed, 77 insertions(+), 35 deletions(-)
> > >
> > > diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
> > > index eab4323609b9..c6e9592a9dea 100644
> > > --- a/Documentation/arm64/tagged-pointers.rst
> > > +++ b/Documentation/arm64/tagged-pointers.rst
> > > @@ -53,12 +53,17 @@ visibility.
> > >  Preserving tags
> > >  ---------------
> > >
> > > -Non-zero tags are not preserved when delivering signals. This means that
> > > -signal handlers in applications making use of tags cannot rely on the
> > > -tag information for user virtual addresses being maintained for fields
> > > -inside siginfo_t. One exception to this rule is for signals raised in
> > > -response to watchpoint debug exceptions, where the tag information will
> > > -be preserved.
> > > +Non-zero tags are not preserved in the fault address fields
> > > +siginfo.si_addr or sigcontext.fault_address when delivering
> > > +signals. This means that signal handlers in applications making use
> > > +of tags cannot rely on the tag information for user virtual addresses
> > > +being maintained in these fields. One exception to this rule is for
> > > +signals raised in response to watchpoint debug exceptions, where the
> > > +tag information will be preserved.
> > > +
> > > +The fault address tag is preserved in the fault_addr_top_byte field of
> > > +the signal frame record fault_addr_top_byte_context, which is present
> > > +for signals raised in response to data aborts and instruction aborts.
> > >
> > >  The architecture prevents the use of a tagged PC, so the upper byte will
> > >  be set to a sign-extension of bit 55 on exception return.
> > > diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> > > index 7a6e81ca23a8..90e772d9b2cd 100644
> > > --- a/arch/arm64/include/asm/exception.h
> > > +++ b/arch/arm64/include/asm/exception.h
> > > @@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
> > >  }
> > >
> > >  asmlinkage void enter_from_user_mode(void);
> > > -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
> > > +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
> > >  void do_undefinstr(struct pt_regs *regs);
> > >  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
> > >  void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
> > > diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> > > index 240fe5e5b720..63185be29ff9 100644
> > > --- a/arch/arm64/include/asm/processor.h
> > > +++ b/arch/arm64/include/asm/processor.h
> > > @@ -144,7 +144,7 @@ struct thread_struct {
> > >       void                    *sve_state;     /* SVE registers, if any */
> > >       unsigned int            sve_vl;         /* SVE vector length */
> > >       unsigned int            sve_vl_onexec;  /* SVE vl after next exec */
> > > -     unsigned long           fault_address;  /* fault info */
> > > +     unsigned long           fault_address;  /* FAR_EL1 value */
> > >       unsigned long           fault_code;     /* ESR_EL1 value */
> > >       struct debug_info       debug;          /* debugging */
> > >  #ifdef CONFIG_ARM64_PTR_AUTH
> > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > index 8b0ebce92427..2a3fe3de899d 100644
> > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > > @@ -44,11 +44,12 @@ struct sigcontext {
> > >   *
> > >   *   0x210           fpsimd_context
> > >   *    0x10           esr_context
> > > + *    0x10           fault_addr_top_byte_context
> > >   *   0x8a0           sve_context (vl <= 64) (optional)
> > >   *    0x20           extra_context (optional)
> > >   *    0x10           terminator (null _aarch64_ctx)
> > >   *
> > > - *   0x510           (reserved for future allocation)
> > > + *   0x500           (reserved for future allocation)
> > >   *
> > >   * New records that can exceed this space need to be opt-in for userspace, so
> > >   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> > > @@ -94,17 +95,26 @@ struct esr_context {
> > >       __u64 esr;
> > >  };
> > >
> > > +/* Top byte of fault address (normally not exposed via si_addr) */
> > > +#define FAULT_ADDR_TOP_BYTE_MAGIC    0x46544201
> > > +
> > > +struct fault_addr_top_byte_context {
> > > +     struct _aarch64_ctx head;
> > > +     __u8 fault_addr_top_byte;
> > > +     __u8 __reserved[7];
> > > +};
> > > +
> >
> > Nit: the name here is a bit cumbersome (obviously bikeshedding...)
> >
> >
> > For the rest, some of my comments may be bogus -- I haven't dug into
> > this stuff for a little while!
> >
> >
> > Anyway:
> >
> > Do we really get the whole top byte of the address in the FAR?  If so,
> > fine, but I'm having trouble finding a clear statement in the
> > architecture one way or the other.  (I didn't attempt to dive into the
> > pseudocode.)
> 
> I rely on this statement in the ARM:
> 
> https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/far_el1
> "For a Data Abort or Watchpoint exception, if address tagging is
> enabled for the address accessed by the data access that caused the
> exception, then this field includes the tag."

Yes, I think that covers it.  I hadn't found a clear definition of
"tag", but I think the TBI mechanism makes it "reasonably obvious" the
non-address (i.e., tag) bits are [63:56].

> And note that address tagging here essentially means TBI (which is
> always enabled on Linux), and not memory tagging.
> 
> > Also, since we're burning 16 bytes here, I'd prefer if we make this
> > extensible.  At present the __reserved[7] is unusable because
> > userspace has no way to know whether it's valid or not.
> >
> > Options include an additional flag byte (0 for now), or just making
> > the whole thing a __u64.  In that case we can leave the top byte bits
> > in their original positions if we want, but it would be a good idea to
> > include a flag to say that field is valid at all.  (See comments below
> > on Synchronous external abort.)
> >
> > > So, say, foo_context->fault_info = (far & (~0ULL << 56)) | TOP_BYTE_VALID.
> > (with #defines for the bits/fields as appropriate).
> 
> The flag bits seem like a good idea. Thinking ahead to the MTE sync
> tag fault (which might not provide us with bits 60-63), we may
> consider having separate bits to indicate "bits 56-59 valid" and "bits
> 60-63 valid", set both bits for regular data aborts and only the
> former for sync tag faults, which would avoid the need to define a
> separate context for these faults. And if a future architecture
> revision provides us with bits 60-63 for tag faults, we could start
> setting both flag bits even for tag faults.

Seems reasonable, but a "tag mask" field of some sort might be
preferable to hard-wiring, just in case a future update to MTE supports
more than 4 bits.

> > >  /*
> > >   * extra_context: describes extra space in the signal frame for
> > >   * additional structures that don't fit in sigcontext.__reserved[].
> > >   *
> > >   * Note:
> > >   *
> > > - * 1) fpsimd_context, esr_context and extra_context must be placed in
> > > - * sigcontext.__reserved[] if present.  They cannot be placed in the
> > > - * extra space.  Any other record can be placed either in the extra
> > > - * space or in sigcontext.__reserved[], unless otherwise specified in
> > > - * this file.
> > > + * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
> > > + * extra_context must be placed in sigcontext.__reserved[] if present.
> > > + * They cannot be placed in the extra space.  Any other record can be
> > > + * placed either in the extra space or in sigcontext.__reserved[],
> > > + * unless otherwise specified in this file.
> > >   *
> > >   * 2) There must not be more than one extra_context.
> > >   *
> > > diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> > > index c839b5bf1904..045b4f518836 100644
> > > --- a/arch/arm64/kernel/entry-common.c
> > > +++ b/arch/arm64/kernel/entry-common.c
> > > @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
> > >       unsigned long far = read_sysreg(far_el1);
> > >
> > >       local_daif_inherit(regs);
> > > -     far = untagged_addr(far);
> > >       do_mem_abort(far, esr, regs);
> > >  }
> > >  NOKPROBE_SYMBOL(el1_abort);
> > > @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
> > >
> > >       user_exit_irqoff();
> > >       local_daif_restore(DAIF_PROCCTX);
> > > -     far = untagged_addr(far);
> > >       do_mem_abort(far, esr, regs);
> > >  }
> > >  NOKPROBE_SYMBOL(el0_da);
> > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > index 339882db5a91..baa88dc02e5c 100644
> > > --- a/arch/arm64/kernel/signal.c
> > > +++ b/arch/arm64/kernel/signal.c
> > > @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
> > >
> > >       unsigned long fpsimd_offset;
> > >       unsigned long esr_offset;
> > > +     unsigned long ftb_offset;
> > >       unsigned long sve_offset;
> > >       unsigned long extra_offset;
> > >       unsigned long end_offset;
> > > @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
> > >                       break;
> > >
> > >               case ESR_MAGIC:
> > > +             case FAULT_ADDR_TOP_BYTE_MAGIC:
> > >                       /* ignore */
> > >                       break;
> > >
> > > @@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> > >                                    sizeof(struct esr_context));
> > >               if (err)
> > >                       return err;
> > > +
> > > +             err = sigframe_alloc(
> > > +                     user, &user->ftb_offset,
> > > +                     sizeof(struct fault_addr_top_byte_context));
> >
> > Nit: inconsistent indentation?
> >
> > (Mostly just because it makes the change look odd against the hunk
> > context, but not a big deal.)
> 
> With consistent indentation we violate 80 cols due to the extra long
> struct name. The indentation is what clang-format is giving me.

I suspected that might be why.  Fair enough (though a shorter name would
be no bad thing, it's not worth changing that just for nicer indentation).

> 
> > > +             if (err)
> > > +                     return err;
> > >       }
> > >
> > >       if (system_supports_sve()) {
> > > @@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > >       __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
> > >       __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
> > >
> > > -     __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
> > > +     __put_user_error(untagged_addr(current->thread.fault_address),
> > > +                      &sf->uc.uc_mcontext.fault_address, err);
> > >
> > >       err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
> > >
> > > @@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > >               __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> > >       }
> > >
> > > +     if (err == 0 && user->ftb_offset) {
> > > +             struct fault_addr_top_byte_context __user *ftb_ctx =
> > > +                     apply_user_offset(user, user->ftb_offset);
> > > +
> > > +             __put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
> > > +                              &ftb_ctx->head.magic, err);
> > > +             __put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
> > > +             __put_user_error(current->thread.fault_address >> 56,
> > > +                              &ftb_ctx->fault_addr_top_byte, err);
> > > +     }
> > > +
> >
> > How do we handle the fact that the top byte of FAR is sometimes UNKNOWN?
> >
> > For Synchronous external aborts in particular, those bits are documented
> > > as UNKNOWN, but I don't see any special handling.  There may be other
> > cases I haven't spotted.
> >
> > For preference we can omit this record entirely if we don't have any
> > information we can report, but certainly we shouldn't expose UNKNOWN
> > bits.
> 
> In this case we mask out the top byte in do_sea before passing the
> address to arm64_notify_die (which clears fault_address and passes the
> address argument on to arm64_force_sig_fault to be exposed via
> si_addr). So the record would always contain a 0 byte. It seems
> reasonable to omit the record in this case instead.

Ah, right.  Missed that.

The record is already omitted when fault_code == 0 IIUC, so perhaps
we're already doing the right thing for synchronous external aborts.

> > [ Aside:
> >
> > Also, what if we're not reporting a memory abort at all?  Does
> > thread.fault_address just contain junk from the last fault?  I see
> > nothing anywhere that cleans this up.  (This is historical and not
> > your fault, but it would be good to close this down while we're about
> > it.)
> >
> >
> > Hmmm, looking at the code I think we probably leak fault_address etc.
> > across execve() too, so it may even be stale junk from an old process
> > :/
> >
> > Maybe I just confused myself.
> >
> > End aside. ]
> 
> Yes, it's unclear whether we always manage to not expose a fault
> address if we're not reporting a data or instruction abort. The code
> would need to arrange for fault_code to be set to 0 in order to avoid
> exposing previous fault_address values via future signals. I don't see
> anywhere where we're resetting these fields after delivering a signal,
> so it seems possible by calling arm64_force_sig_fault without first
> setting fault_code (most callers do this, but the calls in
> arch/arm64/kernel/debug-monitors.c and arch/arm64/kernel/ptrace.c seem
> not to), or simply by calling force_sig_fault (which happens in many
> places throughout the kernel).
> 
> Maybe something like this would do the trick? (Untested, and forgive
> spaces instead of tabs, grumble grumble gmail):
> 
> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> index baa88dc02e5c..5867f2fdbe64 100644
> --- a/arch/arm64/kernel/signal.c
> +++ b/arch/arm64/kernel/signal.c
> @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> rt_sigframe_user_layout *user,
>                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
>                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
>                 __put_user_error(current->thread.fault_code,
> &esr_ctx->esr, err);
> +               current->thread.fault_code = 0;

Perhaps, but we'd need to be careful.  For example, can we run out of
user stack before this and deliver a SIGSEGV, but with the old
fault_code still set?  Then we'd emit the old fault code with the
new "can't deliver signal" signal, which doesn't make sense.

Stuff may also go wrong with signal prioritisation.

If a higher-priority signal (say SIGINT) comes in after a data abort
enters the kernel but before the resulting SIGSEGV is dequeued for
delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
With your change we'd then have cleared the fault code by the time we
deliver the SIGSEGV it actually relates to, if I've understood right.

Today, I think we just attach that fault code to every signal that's
delivered until something overwrites or resets it, which means that
a signal that needs fault_code gets it, at the expense of attaching
it to a bunch of other random signals too.


Checking the signal number and si_code might help us to know what we
should be doing with fault_code.  We need to make sure userspace can't
trick us with a non-kernel-generated signal here.  It would also be
necessary to check how PTRACE_SETSIGINFO interacts with this.


Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-18  9:53                     ` Dave Martin
@ 2020-05-19 22:00                       ` Peter Collingbourne
  2020-05-20  8:55                         ` Will Deacon
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-19 22:00 UTC (permalink / raw)
  To: Dave Martin
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Catalin Marinas, Vincenzo Frascino,
	Will Deacon, Linux ARM, Richard Henderson

On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > On Wed, May 13, 2020 at 1:28 PM Dave Martin <Dave.Martin@arm.com> wrote:
> > >
> > > On Wed, May 13, 2020 at 11:09:14AM -0700, Peter Collingbourne wrote:
> > > > The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> > > > address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> > > > the tag bits may be needed by tools in order to accurately diagnose
> > > > memory errors, such as HWASan [1] or future tools based on the Memory
> > > > Tagging Extension (MTE).
> > > >
> > > > We should not stop clearing these bits in the existing fault address fields,
> > > > because there may be existing userspace applications that are expecting the tag
> > > > bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
> > > > (similar to the existing esr_context), and store the tag bits of FAR_EL1 there.
> > > >
> > > > [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > > >
> > > > Signed-off-by: Peter Collingbourne <pcc@google.com>
> > > > ---
> > > > v6:
> > > > - bring back comment about __reserved[]
> > > >
> > > > v5:
> > > > - add padding to fault_addr_top_byte_context in order to ensure the correct
> > > >   size and preserve sp alignment
> > > >
> > > > v4:
> > > > - expose only the tag bits in the context instead of the entire FAR_EL1
> > > > - remove mention of the new context from the sigcontext.__reserved[] note
> > > >
> > > > v3:
> > > > - add documentation to tagged-pointers.rst
> > > > - update comments in sigcontext.h
> > > >
> > > > v2:
> > > > - revert changes to hw_breakpoint.c
> > > > - rename set_thread_esr to set_thread_far_esr
> > > >
> > > >  Documentation/arm64/tagged-pointers.rst  | 17 +++++----
> > > >  arch/arm64/include/asm/exception.h       |  2 +-
> > > >  arch/arm64/include/asm/processor.h       |  2 +-
> > > >  arch/arm64/include/uapi/asm/sigcontext.h | 22 ++++++++----
> > > >  arch/arm64/kernel/entry-common.c         |  2 --
> > > >  arch/arm64/kernel/signal.c               | 22 +++++++++++-
> > > >  arch/arm64/mm/fault.c                    | 45 ++++++++++++++----------
> > > >  7 files changed, 77 insertions(+), 35 deletions(-)
> > > >
> > > > diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
> > > > index eab4323609b9..c6e9592a9dea 100644
> > > > --- a/Documentation/arm64/tagged-pointers.rst
> > > > +++ b/Documentation/arm64/tagged-pointers.rst
> > > > @@ -53,12 +53,17 @@ visibility.
> > > >  Preserving tags
> > > >  ---------------
> > > >
> > > > -Non-zero tags are not preserved when delivering signals. This means that
> > > > -signal handlers in applications making use of tags cannot rely on the
> > > > -tag information for user virtual addresses being maintained for fields
> > > > -inside siginfo_t. One exception to this rule is for signals raised in
> > > > -response to watchpoint debug exceptions, where the tag information will
> > > > -be preserved.
> > > > +Non-zero tags are not preserved in the fault address fields
> > > > +siginfo.si_addr or sigcontext.fault_address when delivering
> > > > +signals. This means that signal handlers in applications making use
> > > > +of tags cannot rely on the tag information for user virtual addresses
> > > > +being maintained in these fields. One exception to this rule is for
> > > > +signals raised in response to watchpoint debug exceptions, where the
> > > > +tag information will be preserved.
> > > > +
> > > > +The fault address tag is preserved in the fault_addr_top_byte field of
> > > > +the signal frame record fault_addr_top_byte_context, which is present
> > > > +for signals raised in response to data aborts and instruction aborts.
> > > >
> > > >  The architecture prevents the use of a tagged PC, so the upper byte will
> > > >  be set to a sign-extension of bit 55 on exception return.
> > > > diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
> > > > index 7a6e81ca23a8..90e772d9b2cd 100644
> > > > --- a/arch/arm64/include/asm/exception.h
> > > > +++ b/arch/arm64/include/asm/exception.h
> > > > @@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
> > > >  }
> > > >
> > > >  asmlinkage void enter_from_user_mode(void);
> > > > -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
> > > > +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
> > > >  void do_undefinstr(struct pt_regs *regs);
> > > >  asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
> > > >  void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
> > > > diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> > > > index 240fe5e5b720..63185be29ff9 100644
> > > > --- a/arch/arm64/include/asm/processor.h
> > > > +++ b/arch/arm64/include/asm/processor.h
> > > > @@ -144,7 +144,7 @@ struct thread_struct {
> > > >       void                    *sve_state;     /* SVE registers, if any */
> > > >       unsigned int            sve_vl;         /* SVE vector length */
> > > >       unsigned int            sve_vl_onexec;  /* SVE vl after next exec */
> > > > -     unsigned long           fault_address;  /* fault info */
> > > > +     unsigned long           fault_address;  /* FAR_EL1 value */
> > > >       unsigned long           fault_code;     /* ESR_EL1 value */
> > > >       struct debug_info       debug;          /* debugging */
> > > >  #ifdef CONFIG_ARM64_PTR_AUTH
> > > > diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > index 8b0ebce92427..2a3fe3de899d 100644
> > > > --- a/arch/arm64/include/uapi/asm/sigcontext.h
> > > > +++ b/arch/arm64/include/uapi/asm/sigcontext.h
> > > > @@ -44,11 +44,12 @@ struct sigcontext {
> > > >   *
> > > >   *   0x210           fpsimd_context
> > > >   *    0x10           esr_context
> > > > + *    0x10           fault_addr_top_byte_context
> > > >   *   0x8a0           sve_context (vl <= 64) (optional)
> > > >   *    0x20           extra_context (optional)
> > > >   *    0x10           terminator (null _aarch64_ctx)
> > > >   *
> > > > - *   0x510           (reserved for future allocation)
> > > > + *   0x500           (reserved for future allocation)
> > > >   *
> > > >   * New records that can exceed this space need to be opt-in for userspace, so
> > > >   * that an expanded signal frame is not generated unexpectedly.  The mechanism
> > > > @@ -94,17 +95,26 @@ struct esr_context {
> > > >       __u64 esr;
> > > >  };
> > > >
> > > > +/* Top byte of fault address (normally not exposed via si_addr) */
> > > > +#define FAULT_ADDR_TOP_BYTE_MAGIC    0x46544201
> > > > +
> > > > +struct fault_addr_top_byte_context {
> > > > +     struct _aarch64_ctx head;
> > > > +     __u8 fault_addr_top_byte;
> > > > +     __u8 __reserved[7];
> > > > +};
> > > > +
> > >
> > > Nit: the name here is a bit cumbersome (obviously bikeshedding...)
> > >
> > >
> > > For the rest, some of my comments may be bogus -- I haven't dug into
> > > this stuff for a little while!
> > >
> > >
> > > Anyway:
> > >
> > > Do we really get the whole top byte of the address in the FAR?  If so,
> > > fine, but I'm having trouble finding a clear statement in the
> > > architecture one way or the other.  (I didn't attempt to dive into the
> > > pseudocode.)
> >
> > I rely on this statement in the ARM:
> >
> > https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/far_el1
> > "For a Data Abort or Watchpoint exception, if address tagging is
> > enabled for the address accessed by the data access that caused the
> > exception, then this field includes the tag."
>
> Yes, I think that covers it.  I hadn't found a clear definition of
> "tag", but I think the TBI mechanism makes it "reasonably obvious" the
> non-address (i.e., tag) bits are [63:56].
>
> > And note that address tagging here essentially means TBI (which is
> > always enabled on Linux), and not memory tagging.
> >
> > > Also, since we're burning 16 bytes here, I'd prefer if we make this
> > > extensible.  At present the __reserved[7] is unusable because
> > > userspace has no way to know whether it's valid or not.
> > >
> > > Options include an additional flag byte (0 for now), or just making
> > > the whole thing a __u64.  In that case we can leave the top byte bits
> > > in their original positions if we want, but it would be a good idea to
> > > include a flag to say that field is valid at all.  (See comments below
> > > on Synchronous external abort.)
> > >
> > > > So, say, foo_context->fault_info = (far & (~0ULL << 56)) | TOP_BYTE_VALID.
> > > (with #defines for the bits/fields as appropriate).
> >
> > The flag bits seem like a good idea. Thinking ahead to the MTE sync
> > tag fault (which might not provide us with bits 60-63), we may
> > consider having separate bits to indicate "bits 56-59 valid" and "bits
> > 60-63 valid", set both bits for regular data aborts and only the
> > former for sync tag faults, which would avoid the need to define a
> > separate context for these faults. And if a future architecture
> > revision provides us with bits 60-63 for tag faults, we could start
> > setting both flag bits even for tag faults.
>
> Seems reasonable, but a "tag mask" field of some sort might be
> preferable to hard-wiring, just in case a future update to MTE supports
> more than 4 bits.

That's fine with me.

> > > >  /*
> > > >   * extra_context: describes extra space in the signal frame for
> > > >   * additional structures that don't fit in sigcontext.__reserved[].
> > > >   *
> > > >   * Note:
> > > >   *
> > > > - * 1) fpsimd_context, esr_context and extra_context must be placed in
> > > > - * sigcontext.__reserved[] if present.  They cannot be placed in the
> > > > - * extra space.  Any other record can be placed either in the extra
> > > > - * space or in sigcontext.__reserved[], unless otherwise specified in
> > > > - * this file.
> > > > + * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
> > > > + * extra_context must be placed in sigcontext.__reserved[] if present.
> > > > + * They cannot be placed in the extra space.  Any other record can be
> > > > + * placed either in the extra space or in sigcontext.__reserved[],
> > > > + * unless otherwise specified in this file.
> > > >   *
> > > >   * 2) There must not be more than one extra_context.
> > > >   *
> > > > diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
> > > > index c839b5bf1904..045b4f518836 100644
> > > > --- a/arch/arm64/kernel/entry-common.c
> > > > +++ b/arch/arm64/kernel/entry-common.c
> > > > @@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
> > > >       unsigned long far = read_sysreg(far_el1);
> > > >
> > > >       local_daif_inherit(regs);
> > > > -     far = untagged_addr(far);
> > > >       do_mem_abort(far, esr, regs);
> > > >  }
> > > >  NOKPROBE_SYMBOL(el1_abort);
> > > > @@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
> > > >
> > > >       user_exit_irqoff();
> > > >       local_daif_restore(DAIF_PROCCTX);
> > > > -     far = untagged_addr(far);
> > > >       do_mem_abort(far, esr, regs);
> > > >  }
> > > >  NOKPROBE_SYMBOL(el0_da);
> > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > > index 339882db5a91..baa88dc02e5c 100644
> > > > --- a/arch/arm64/kernel/signal.c
> > > > +++ b/arch/arm64/kernel/signal.c
> > > > @@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
> > > >
> > > >       unsigned long fpsimd_offset;
> > > >       unsigned long esr_offset;
> > > > +     unsigned long ftb_offset;
> > > >       unsigned long sve_offset;
> > > >       unsigned long extra_offset;
> > > >       unsigned long end_offset;
> > > > @@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
> > > >                       break;
> > > >
> > > >               case ESR_MAGIC:
> > > > +             case FAULT_ADDR_TOP_BYTE_MAGIC:
> > > >                       /* ignore */
> > > >                       break;
> > > >
> > > > @@ -581,6 +583,12 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
> > > >                                    sizeof(struct esr_context));
> > > >               if (err)
> > > >                       return err;
> > > > +
> > > > +             err = sigframe_alloc(
> > > > +                     user, &user->ftb_offset,
> > > > +                     sizeof(struct fault_addr_top_byte_context));
> > >
> > > Nit: inconsistent indentation?
> > >
> > > (Mostly just because it makes the change look odd against the hunk
> > > context, but not a big deal.)
> >
> > With consistent indentation we violate 80 cols due to the extra long
> > struct name. The indentation is what clang-format is giving me.
>
> I suspected that might be why.  Fair enough (though a shorter name would
> be no bad thing, it's not worth changing that just for nicer indentation).
>
> >
> > > > +             if (err)
> > > > +                     return err;
> > > >       }
> > > >
> > > >       if (system_supports_sve()) {
> > > > @@ -621,7 +629,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > > >       __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
> > > >       __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
> > > >
> > > > -     __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
> > > > +     __put_user_error(untagged_addr(current->thread.fault_address),
> > > > +                      &sf->uc.uc_mcontext.fault_address, err);
> > > >
> > > >       err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
> > > >
> > > > @@ -641,6 +650,17 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
> > > >               __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
> > > >       }
> > > >
> > > > +     if (err == 0 && user->ftb_offset) {
> > > > +             struct fault_addr_top_byte_context __user *ftb_ctx =
> > > > +                     apply_user_offset(user, user->ftb_offset);
> > > > +
> > > > +             __put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
> > > > +                              &ftb_ctx->head.magic, err);
> > > > +             __put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
> > > > +             __put_user_error(current->thread.fault_address >> 56,
> > > > +                              &ftb_ctx->fault_addr_top_byte, err);
> > > > +     }
> > > > +
> > >
> > > How do we handle the fact that the top byte of FAR is sometimes UNKNOWN?
> > >
> > > For Synchronous external aborts in particular, those bits are documented
> > > > as UNKNOWN, but I don't see any special handling.  There may be other
> > > cases I haven't spotted.
> > >
> > > For preference we can omit this record entirely if we don't have any
> > > information we can report, but certainly we shouldn't expose UNKNOWN
> > > bits.
> >
> > In this case we mask out the top byte in do_sea before passing the
> > address to arm64_notify_die (which clears fault_address and passes the
> > address argument on to arm64_force_sig_fault to be exposed via
> > si_addr). So the record would always contain a 0 byte. It seems
> > reasonable to omit the record in this case instead.
>
> Ah, right.  Missed that.
>
> The record is already omitted when fault_code == 0 IIUC, so perhaps
> we're already doing the right thing for synchronous external aborts.
>
> > > [ Aside:
> > >
> > > Also, what if we're not reporting a memory abort at all?  Does
> > > thread.fault_address just contain junk from the last fault?  I see
> > > nothing anywhere that cleans this up.  (This is historical and not
> > > your fault, but it would be good to close this down while we're about
> > > it.)
> > >
> > >
> > > Hmmm, looking at the code I think we probably leak fault_address etc.
> > > across execve() too, so it may even be stale junk from an old process
> > > :/
> > >
> > > Maybe I just confused myself.
> > >
> > > End aside. ]
> >
> > Yes, it's unclear whether we always manage to not expose a fault
> > address if we're not reporting a data or instruction abort. The code
> > would need to arrange for fault_code to be set to 0 in order to avoid
> > exposing previous fault_address values via future signals. I don't see
> > anywhere where we're resetting these fields after delivering a signal,
> > so it seems possible by calling arm64_force_sig_fault without first
> > setting fault_code (most callers do this, but the calls in
> > arch/arm64/kernel/debug-monitors.c and arch/arm64/kernel/ptrace.c seem
> > not to), or simply by calling force_sig_fault (which happens in many
> > places throughout the kernel).
> >
> > Maybe something like this would do the trick? (Untested, and forgive
> > spaces instead of tabs, grumble grumble gmail):
> >
> > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > index baa88dc02e5c..5867f2fdbe64 100644
> > --- a/arch/arm64/kernel/signal.c
> > +++ b/arch/arm64/kernel/signal.c
> > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > rt_sigframe_user_layout *user,
> >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> >                 __put_user_error(current->thread.fault_code,
> > &esr_ctx->esr, err);
> > +               current->thread.fault_code = 0;
>
> Perhaps, but we'd need to be careful.  For example, can we run out of
> user stack before this and deliver a SIGSEGV, but with the old
> fault_code still set?  Then we'd emit the old fault code with the
> new "can't deliver signal" signal, which doesn't make sense.
>
> Stuff may also go wrong with signal prioritisation.
>
> If a higher-priority signal (say SIGINT) comes in after a data abort
> enters the kernel but before the resulting SIGSEGV is dequeued for
> delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> With your change we'd then have cleared the fault code by the time we
> deliver the SIGSEGV it actually relates to, if I've understood right.
>
> Today, I think we just attach that fault code to every signal that's
> delivered until something overwrites or resets it, which means that
> a signal that needs fault_code gets it, at the expense of attaching
> it to a bunch of other random signals too.
>
>
> Checking the signal number and si_code might help us to know what we
> > should be doing with fault_code.  We need to make sure userspace can't
> > trick us with a non-kernel-generated signal here.  It would also be
> necessary to check how PTRACE_SETSIGINFO interacts with this.

With these possible interactions in mind I think we should store the
fault code and fault address in kernel_siginfo instead of
thread_struct (and clear these fields when we receive a siginfo from
userspace, i.e. in copy_siginfo_from_user which is used by
ptrace(PTRACE_SETSIGINFO) among other places). That way, the
information is clearly associated with the signal itself and not the
thread, so we don't need to worry about our signal being delivered out
of order.

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-19 22:00                       ` Peter Collingbourne
@ 2020-05-20  8:55                         ` Will Deacon
  2020-05-20  9:26                           ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Will Deacon @ 2020-05-20  8:55 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany,
	Evgenii Stepanov, Catalin Marinas, Vincenzo Frascino,
	Dave Martin, Linux ARM, Richard Henderson

On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > index baa88dc02e5c..5867f2fdbe64 100644
> > > --- a/arch/arm64/kernel/signal.c
> > > +++ b/arch/arm64/kernel/signal.c
> > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > > rt_sigframe_user_layout *user,
> > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> > >                 __put_user_error(current->thread.fault_code,
> > > &esr_ctx->esr, err);
> > > +               current->thread.fault_code = 0;
> >
> > Perhaps, but we'd need to be careful.  For example, can we run out of
> > user stack before this and deliver a SIGSEGV, but with the old
> > fault_code still set?  Then we'd emit the old fault code with the
> > new "can't deliver signal" signal, which doesn't make sense.
> >
> > Stuff may also go wrong with signal prioritisation.
> >
> > If a higher-priority signal (say SIGINT) comes in after a data abort
> > enters the kernel but before the resulting SIGSEGV is dequeued for
> > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> > With your change we'd then have cleared the fault code by the time we
> > deliver the SIGSEGV it actually relates to, if I've understood right.
> >
> > Today, I think we just attach that fault code to every signal that's
> > delivered until something overwrites or resets it, which means that
> > a signal that needs fault_code gets it, at the expense of attaching
> > it to a bunch of other random signals too.
> >
> >
> > Checking the signal number and si_code might help us to know what we
> > should be doing with fault_code.  We need to have sure userspace can't
> > trick us with a non kernel generated signal here.  It would also be
> > necessary to check how PTRACE_SETSIGINFO interacts with this.
> 
> With these possible interactions in mind I think we should store the
> fault code and fault address in kernel_siginfo instead of
> thread_struct (and clear these fields when we receive a siginfo from
> userspace, i.e. in copy_siginfo_from_user which is used by
> ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> information is clearly associated with the signal itself and not the
> thread, so we don't need to worry about our signal being delivered out
> of order.

Hmm, I can't see a way to do that that isn't horribly invasive in the core
signal code. Can you?

But generally, I agree: the per-thread handling of fault_address and
fault_code appears to be quite broken in the face of signal prioritisation
and signals that don't correspond directly to a hardware trap. It would be
nice to have some tests for this...

If we want to pile on more bodges, perhaps we could stash the signal number
to which the fault_{address,code} relate, and then check that at delivery
and clear on a match. I hate it.

Will

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-20  8:55                         ` Will Deacon
@ 2020-05-20  9:26                           ` Dave Martin
  2020-05-21  2:28                             ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-05-20  9:26 UTC (permalink / raw)
  To: Will Deacon
  Cc: Andrey Konovalov, Kevin Brodsky, Kostya Serebryany, Linux ARM,
	Catalin Marinas, Vincenzo Frascino, Peter Collingbourne,
	Evgenii Stepanov, Richard Henderson

On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > > index baa88dc02e5c..5867f2fdbe64 100644
> > > > --- a/arch/arm64/kernel/signal.c
> > > > +++ b/arch/arm64/kernel/signal.c
> > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > > > rt_sigframe_user_layout *user,
> > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> > > >                 __put_user_error(current->thread.fault_code,
> > > > &esr_ctx->esr, err);
> > > > +               current->thread.fault_code = 0;
> > >
> > > Perhaps, but we'd need to be careful.  For example, can we run out of
> > > user stack before this and deliver a SIGSEGV, but with the old
> > > fault_code still set?  Then we'd emit the old fault code with the
> > > new "can't deliver signal" signal, which doesn't make sense.
> > >
> > > Stuff may also go wrong with signal prioritisation.
> > >
> > > If a higher-priority signal (say SIGINT) comes in after a data abort
> > > enters the kernel but before the resulting SIGSEGV is dequeued for
> > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> > > With your change we'd then have cleared the fault code by the time we
> > > deliver the SIGSEGV it actually relates to, if I've understood right.
> > >
> > > Today, I think we just attach that fault code to every signal that's
> > > delivered until something overwrites or resets it, which means that
> > > a signal that needs fault_code gets it, at the expense of attaching
> > > it to a bunch of other random signals too.
> > >
> > >
> > > Checking the signal number and si_code might help us to know what we
> > > should be doing with fault_code.  We need to have sure userspace can't
> > > trick us with a non kernel generated signal here.  It would also be
> > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> > 
> > With these possible interactions in mind I think we should store the
> > fault code and fault address in kernel_siginfo instead of
> > thread_struct (and clear these fields when we receive a siginfo from
> > userspace, i.e. in copy_siginfo_from_user which is used by
> > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> > information is clearly associated with the signal itself and not the
> > thread, so we don't need to worry about our signal being delivered out
> > of order.
> 
> Hmm, I can't see a way to do that that isn't horribly invasive in the core
> signal code. Can you?
> 
> But generally, I agree: the per-thread handling of fault_address and
> fault_code appears to be quite broken in the face of signal prioritisation
> and signals that don't correspond directly to hardware trap. It would be
> nice to have some tests for this...
> 
> If we want to pile on more bodges, perhaps we could stash the signal number
> to which the fault_{address,code} relate, and then check that at delivery
> and clear on a match. I hate it.

I agree with Peter's suggestion in principle, but I was also concerned
about whether it would be too invasive elsewhere.

Question though: does the core code take special care to make sure that
a force_sig cannot be outprioritised by a regular signal?  If so,
perhaps we get away with it.  I ask this because the same issue
may be hitting other arches otherwise.

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-20  9:26                           ` Dave Martin
@ 2020-05-21  2:28                             ` Peter Collingbourne
  2020-05-21  2:29                               ` [PATCH v6 0/3] " Peter Collingbourne
  2020-05-21 12:35                               ` [PATCH v6] " Eric W. Biederman
  0 siblings, 2 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-21  2:28 UTC (permalink / raw)
  To: Dave Martin
  Cc: Eric W. Biederman, Andrey Konovalov, Kevin Brodsky,
	Oleg Nesterov, Kostya Serebryany, Linux ARM, Catalin Marinas,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> > > > > --- a/arch/arm64/kernel/signal.c
> > > > > +++ b/arch/arm64/kernel/signal.c
> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > > > > rt_sigframe_user_layout *user,
> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> > > > >                 __put_user_error(current->thread.fault_code,
> > > > > &esr_ctx->esr, err);
> > > > > +               current->thread.fault_code = 0;
> > > >
> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> > > > user stack before this and deliver a SIGSEGV, but with the old
> > > > fault_code still set?  Then we'd emit the old fault code with the
> > > > new "can't deliver signal" signal, which doesn't make sense.
> > > >
> > > > Stuff may also go wrong with signal prioritisation.
> > > >
> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> > > > With your change we'd then have cleared the fault code by the time we
> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> > > >
> > > > Today, I think we just attach that fault code to every signal that's
> > > > delivered until something overwrites or resets it, which means that
> > > > a signal that needs fault_code gets it, at the expense of attaching
> > > > it to a bunch of other random signals too.
> > > >
> > > >
> > > > Checking the signal number and si_code might help us to know what we
> > > > should be doing with fault_code.  We need to have sure userspace can't
> > > > trick us with a non kernel generated signal here.  It would also be
> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> > >
> > > With these possible interactions in mind I think we should store the
> > > fault code and fault address in kernel_siginfo instead of
> > > thread_struct (and clear these fields when we receive a siginfo from
> > > userspace, i.e. in copy_siginfo_from_user which is used by
> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> > > information is clearly associated with the signal itself and not the
> > > thread, so we don't need to worry about our signal being delivered out
> > > of order.
> >
> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> > signal code. Can you?

I think I've come up with a way that doesn't seem to be too invasive.
See patch #1 of the series that I'm about to send out.

> > But generally, I agree: the per-thread handling of fault_address and
> > fault_code appears to be quite broken in the face of signal prioritisation
> > and signals that don't correspond directly to hardware trap. It would be
> > nice to have some tests for this...
> >
> > If we want to pile on more bodges, perhaps we could stash the signal number
> > to which the fault_{address,code} relate, and then check that at delivery
> > and clear on a match. I hate it.
>
> I agree with Daniel's suggestion in principle, but I was also concerned
> about whether it would be too invasive elsewhere.
>
> Question though: does the core code take special care to make sure that
> a force_sig cannot be outprioritised by a regular signal?  If so,
> perhaps we get away with it.  I ask this, because the same same issue
> may be hitting other arches otherwise.

Not as far as I can tell. There does appear to be prioritisation for
synchronous signals [1] but as far as I can tell nothing to
distinguish one of these signals from one with the same signal number
sent from userspace (e.g. via kill(2)).

Peter

[1] https://github.com/torvalds/linux/blob/b85051e755b0e9d6dd8f17ef1da083851b83287d/kernel/signal.c#L222

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* [PATCH v6 0/3] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-21  2:28                             ` Peter Collingbourne
@ 2020-05-21  2:29                               ` Peter Collingbourne
  2020-05-21  2:29                                 ` [PATCH v6 1/3] signal: Allow architectures to store arch-specific data in kernel_siginfo Peter Collingbourne
                                                   ` (2 more replies)
  2020-05-21 12:35                               ` [PATCH v6] " Eric W. Biederman
  1 sibling, 3 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-21  2:29 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany,
	Vincenzo Frascino, Dave Martin, Will Deacon, Oleg Nesterov,
	Eric W. Biederman
  Cc: Andrey Konovalov, Kevin Brodsky, Peter Collingbourne, Linux ARM,
	Richard Henderson

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
the tag bits may be needed by tools in order to accurately diagnose
memory errors, such as HWASan [1] or future tools based on the Memory
Tagging Extension (MTE).

We should not stop clearing these bits in the existing fault address fields,
because there may be existing userspace applications that are expecting the tag
bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
(similar to the existing esr_context), and store the tag bits of FAR_EL1 there.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Peter Collingbourne (3):
  signal: Allow architectures to store arch-specific data in
    kernel_siginfo
  arm64: Move fault address and fault code into kernel_siginfo
  arm64: Expose FAR_EL1 tag bits in sigcontext

 Documentation/arm64/tagged-pointers.rst  |  17 ++--
 arch/arm64/include/asm/exception.h       |   2 +-
 arch/arm64/include/asm/processor.h       |   2 -
 arch/arm64/include/asm/signal.h          |  19 ++++
 arch/arm64/include/asm/traps.h           |   8 +-
 arch/arm64/include/uapi/asm/sigcontext.h |  24 +++--
 arch/arm64/kernel/debug-monitors.c       |   4 +-
 arch/arm64/kernel/entry-common.c         |   2 -
 arch/arm64/kernel/probes/uprobes.c       |  18 ++--
 arch/arm64/kernel/ptrace.c               |   2 +-
 arch/arm64/kernel/signal.c               |  42 ++++++--
 arch/arm64/kernel/signal32.c             |  15 ++-
 arch/arm64/kernel/sys_compat.c           |   9 +-
 arch/arm64/kernel/traps.c                | 121 +++++++++++++++++++----
 arch/arm64/mm/fault.c                    | 111 ++++++---------------
 include/linux/signal.h                   |   2 -
 include/linux/signal_types.h             |   3 +
 kernel/signal.c                          |  17 +++-
 18 files changed, 259 insertions(+), 159 deletions(-)
 create mode 100644 arch/arm64/include/asm/signal.h

-- 
2.26.2.761.g0e0b3e54be-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* [PATCH v6 1/3] signal: Allow architectures to store arch-specific data in kernel_siginfo
  2020-05-21  2:29                               ` [PATCH v6 0/3] " Peter Collingbourne
@ 2020-05-21  2:29                                 ` Peter Collingbourne
  2020-05-21  2:29                                 ` [PATCH v6 2/3] arm64: Move fault address and fault code into kernel_siginfo Peter Collingbourne
  2020-05-21  2:29                                 ` [PATCH v6 3/3] arm64: Expose FAR_EL1 tag bits in sigcontext Peter Collingbourne
  2 siblings, 0 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-21  2:29 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany,
	Vincenzo Frascino, Dave Martin, Will Deacon, Oleg Nesterov,
	Eric W. Biederman
  Cc: Andrey Konovalov, Kevin Brodsky, Peter Collingbourne, Linux ARM,
	Richard Henderson

In some cases we would like to store architecture-specific data in
the kernel's siginfo but not in the userspace one. This is generally
data that is conceptually part of siginfo but is not stored there for one
reason or another. For example, on arm64, the arch-specific fault
code register ESR_EL1 is exposed to signal handlers, but since it is
associated with many different types of signals it does not fit well
into siginfo and appears in sigcontext instead.

Currently this data is stored in thread_struct, which is error-prone
because the data is associated with the signal itself and not the task,
and as a result it could get out of sync with the signal that is
currently being delivered.

To help avoid these types of errors, introduce a way for architectures
to store architecture-specific data in the kernel_siginfo. This part
of the kernel_siginfo is not exposed to userspace so the architecture
can use it in any way that it likes without ABI concerns. A follow-up
change will start using this mechanism on arm64 to store the fault
code and fault address.

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
 include/linux/signal.h       |  2 --
 include/linux/signal_types.h |  3 +++
 kernel/signal.c              | 17 +++++++++++++----
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 05bacd2ab135..34bf18932e9b 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -22,8 +22,6 @@ static inline void clear_siginfo(kernel_siginfo_t *info)
 	memset(info, 0, sizeof(*info));
 }
 
-#define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo))
-
 int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from);
 int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from);
 
diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h
index f8a90ae9c6ec..a8635eeb890b 100644
--- a/include/linux/signal_types.h
+++ b/include/linux/signal_types.h
@@ -11,6 +11,9 @@
 
 typedef struct kernel_siginfo {
 	__SIGINFO;
+#ifdef __ARCH_HAS_PRIVATE_SIGINFO
+	struct arch_private_siginfo arch;
+#endif
 } kernel_siginfo_t;
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index 284fc1600063..a33df2280ed5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3180,15 +3180,21 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code)
 	return layout;
 }
 
+struct shared_siginfo {
+	__SIGINFO;
+};
+
+#define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct shared_siginfo))
+
 static inline char __user *si_expansion(const siginfo_t __user *info)
 {
-	return ((char __user *)info) + sizeof(struct kernel_siginfo);
+	return ((char __user *)info) + sizeof(struct shared_siginfo);
 }
 
 int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from)
 {
 	char __user *expansion = si_expansion(to);
-	if (copy_to_user(to, from , sizeof(struct kernel_siginfo)))
+	if (copy_to_user(to, from , sizeof(struct shared_siginfo)))
 		return -EFAULT;
 	if (clear_user(expansion, SI_EXPANSION_SIZE))
 		return -EFAULT;
@@ -3198,6 +3204,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from)
 static int post_copy_siginfo_from_user(kernel_siginfo_t *info,
 				       const siginfo_t __user *from)
 {
+#ifdef __ARCH_HAS_PRIVATE_SIGINFO
+	memset(&info->arch, 0, sizeof(info->arch));
+#endif
 	if (unlikely(!known_siginfo_layout(info->si_signo, info->si_code))) {
 		char __user *expansion = si_expansion(from);
 		char buf[SI_EXPANSION_SIZE];
@@ -3221,7 +3230,7 @@ static int post_copy_siginfo_from_user(kernel_siginfo_t *info,
 static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to,
 				    const siginfo_t __user *from)
 {
-	if (copy_from_user(to, from, sizeof(struct kernel_siginfo)))
+	if (copy_from_user(to, from, sizeof(struct shared_siginfo)))
 		return -EFAULT;
 	to->si_signo = signo;
 	return post_copy_siginfo_from_user(to, from);
@@ -3229,7 +3238,7 @@ static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to,
 
 int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from)
 {
-	if (copy_from_user(to, from, sizeof(struct kernel_siginfo)))
+	if (copy_from_user(to, from, sizeof(struct shared_siginfo)))
 		return -EFAULT;
 	return post_copy_siginfo_from_user(to, from);
 }
-- 
2.26.2.761.g0e0b3e54be-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* [PATCH v6 2/3] arm64: Move fault address and fault code into kernel_siginfo
  2020-05-21  2:29                               ` [PATCH v6 0/3] " Peter Collingbourne
  2020-05-21  2:29                                 ` [PATCH v6 1/3] signal: Allow architectures to store arch-specific data in kernel_siginfo Peter Collingbourne
@ 2020-05-21  2:29                                 ` Peter Collingbourne
  2020-05-21 13:34                                     ` kbuild test robot
  2020-05-21  2:29                                 ` [PATCH v6 3/3] arm64: Expose FAR_EL1 tag bits in sigcontext Peter Collingbourne
  2 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-21  2:29 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany,
	Vincenzo Frascino, Dave Martin, Will Deacon, Oleg Nesterov,
	Eric W. Biederman
  Cc: Andrey Konovalov, Kevin Brodsky, Peter Collingbourne, Linux ARM,
	Richard Henderson

Previously this data was being stored in thread_struct, which is
error-prone because the data is associated with the signal itself
and not the task, and as a result it could get out of sync with the
signal that is currently being delivered. To avoid this problem,
move the fields to kernel_siginfo using the newly-introduced generic
support for doing so.

The new fields store the raw FAR_EL1 and ESR_EL1 values instead of
the cooked ones as we were doing before. For FAR_EL1 this will be
necessary in order to expose the high bits of FAR_EL1 in sigcontext
in an upcoming change. Do the same for ESR_EL1 for consistency and
to make the code less error-prone.

While here, fix an apparent compat bug where, when delivering a
SIGILL signal in response to an invalid syscall, the syscall number
was being interpreted as an ESR_EL1 value and translated into an
FSR before being stored in sigcontext.error_code, rather than being
stored in error_code directly, as was the intention (and the behaviour
of the code in arch/arm). This is achieved by moving the error_code
translation early so that the syscall number can avoid it.

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
 arch/arm64/include/asm/exception.h |   2 +-
 arch/arm64/include/asm/processor.h |   2 -
 arch/arm64/include/asm/signal.h    |  16 ++++
 arch/arm64/include/asm/traps.h     |   8 +-
 arch/arm64/kernel/debug-monitors.c |   4 +-
 arch/arm64/kernel/entry-common.c   |   2 -
 arch/arm64/kernel/probes/uprobes.c |  18 ++---
 arch/arm64/kernel/ptrace.c         |   2 +-
 arch/arm64/kernel/signal.c         |  17 +++--
 arch/arm64/kernel/signal32.c       |  15 ++--
 arch/arm64/kernel/sys_compat.c     |   9 ++-
 arch/arm64/kernel/traps.c          | 117 ++++++++++++++++++++++++-----
 arch/arm64/mm/fault.c              | 111 +++++++--------------------
 13 files changed, 183 insertions(+), 140 deletions(-)
 create mode 100644 arch/arm64/include/asm/signal.h

diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..90e772d9b2cd 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
 }
 
 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 240fe5e5b720..b326bfbcea62 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -144,8 +144,6 @@ struct thread_struct {
 	void			*sve_state;	/* SVE registers, if any */
 	unsigned int		sve_vl;		/* SVE vector length */
 	unsigned int		sve_vl_onexec;	/* SVE vl after next exec */
-	unsigned long		fault_address;	/* fault info */
-	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
 #ifdef CONFIG_ARM64_PTR_AUTH
 	struct ptrauth_keys_user	keys_user;
diff --git a/arch/arm64/include/asm/signal.h b/arch/arm64/include/asm/signal.h
new file mode 100644
index 000000000000..f5c001b0a125
--- /dev/null
+++ b/arch/arm64/include/asm/signal.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ARM64_SIGNAL_H
+#define _ASM_ARM64_SIGNAL_H
+
+#include <uapi/asm/signal.h>
+
+#define __ARCH_HAS_PRIVATE_SIGINFO
+struct arch_private_siginfo {
+	/* FAR_EL1 value */
+	unsigned long fault_address;
+
+	/* Sanitized ESR_EL1 value, or FSR/syscall number in compat mode */
+	unsigned long error_code;
+};
+
+#endif
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index cee5928e1b7d..5ed5be5347e6 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -26,8 +26,12 @@ void register_undef_hook(struct undef_hook *hook);
 void unregister_undef_hook(struct undef_hook *hook);
 void force_signal_inject(int signal, int code, unsigned long address);
 void arm64_notify_segfault(unsigned long addr);
-void arm64_force_sig_fault(int signo, int code, void __user *addr, const char *str);
-void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, const char *str);
+void arm64_force_sig_fault(int signo, int code, void __user *addr,
+			   unsigned long far, unsigned long esr,
+			   const char *str);
+void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
+			    unsigned long far, unsigned long esr,
+			    const char *str);
 void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, const char *str);
 
 /*
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 48222a4760c2..498e6393b2ca 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -232,8 +232,8 @@ static void send_user_sigtrap(int si_code)
 		local_irq_enable();
 
 	arm64_force_sig_fault(SIGTRAP, si_code,
-			     (void __user *)instruction_pointer(regs),
-			     "User debug trap");
+			      (void __user *)instruction_pointer(regs), 0, 0,
+			      "User debug trap");
 }
 
 static int single_step_handler(unsigned long unused, unsigned int esr,
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index c839b5bf1904..045b4f518836 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 
 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
 
 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index a412d8edbcd2..5bbcd2e813f0 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -9,8 +9,6 @@
 
 #include "decode-insn.h"
 
-#define UPROBE_INV_FAULT_CODE	UINT_MAX
-
 void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 		void *src, unsigned long len)
 {
@@ -63,9 +61,6 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
 	struct uprobe_task *utask = current->utask;
 
-	/* Initialize with an invalid fault code to detect if ol insn trapped */
-	current->thread.fault_code = UPROBE_INV_FAULT_CODE;
-
 	/* Instruction points to execute ol */
 	instruction_pointer_set(regs, utask->xol_vaddr);
 
@@ -78,7 +73,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
 	struct uprobe_task *utask = current->utask;
 
-	WARN_ON_ONCE(current->thread.fault_code != UPROBE_INV_FAULT_CODE);
+	WARN_ON_ONCE(arch_uprobe_xol_was_trapped(current));
 
 	/* Instruction points to execute next to breakpoint address */
 	instruction_pointer_set(regs, utask->vaddr + 4);
@@ -89,13 +84,16 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 }
 bool arch_uprobe_xol_was_trapped(struct task_struct *t)
 {
+	struct sigqueue *q;
+
 	/*
 	 * Between arch_uprobe_pre_xol and arch_uprobe_post_xol, if an xol
-	 * insn itself is trapped, then detect the case with the help of
-	 * invalid fault code which is being set in arch_uprobe_pre_xol
+	 * insn itself is trapped, then detect the case by checking for
+	 * non-zero esr_el1 in the task's pending signals.
 	 */
-	if (t->thread.fault_code != UPROBE_INV_FAULT_CODE)
-		return true;
+	list_for_each_entry (q, &t->pending.list, list)
+		if (q->info.arch.error_code)
+			return true;
 
 	return false;
 }
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index b3d3005d9515..51bb8bcaf24b 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -198,7 +198,7 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 	}
 #endif
 	arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT,
-			      (void __user *)(bkpt->trigger),
+			      (void __user *)(bkpt->trigger), 0, 0,
 			      desc);
 }
 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 339882db5a91..10d7e9832a89 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -566,6 +566,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
  *	of the task.
  */
 static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
+				 struct kernel_siginfo *info,
 				 bool add_all)
 {
 	int err;
@@ -576,7 +577,7 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 		return err;
 
 	/* fault information, if valid */
-	if (add_all || current->thread.fault_code) {
+	if (add_all || info->arch.error_code) {
 		err = sigframe_alloc(user, &user->esr_offset,
 				     sizeof(struct esr_context));
 		if (err)
@@ -605,7 +606,8 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 }
 
 static int setup_sigframe(struct rt_sigframe_user_layout *user,
-			  struct pt_regs *regs, sigset_t *set)
+			  struct pt_regs *regs, sigset_t *set,
+			  struct kernel_siginfo *info)
 {
 	int i, err = 0;
 	struct rt_sigframe __user *sf = user->sigframe;
@@ -621,7 +623,8 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 	__put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
 	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);
 
-	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
+	__put_user_error(untagged_addr(info->arch.fault_address),
+			 &sf->uc.uc_mcontext.fault_address, err);
 
 	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
 
@@ -638,7 +641,7 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 
 		__put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
 		__put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
-		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
+		__put_user_error(info->arch.error_code, &esr_ctx->esr, err);
 	}
 
 	/* Scalable Vector Extension state, if present */
@@ -701,7 +704,7 @@ static int get_sigframe(struct rt_sigframe_user_layout *user,
 	int err;
 
 	init_user_layout(user);
-	err = setup_sigframe_layout(user, false);
+	err = setup_sigframe_layout(user, &ksig->info, false);
 	if (err)
 		return err;
 
@@ -758,7 +761,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
 	__put_user_error(NULL, &frame->uc.uc_link, err);
 
 	err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
-	err |= setup_sigframe(&user, regs, set);
+	err |= setup_sigframe(&user, regs, set, &ksig->info);
 	if (err == 0) {
 		setup_return(regs, &ksig->ka, &user, usig);
 		if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
@@ -958,7 +961,7 @@ void __init minsigstksz_setup(void)
 	 * If this fails, SIGFRAME_MAXSZ needs to be enlarged.  It won't
 	 * be big enough, but it's our best guess:
 	 */
-	if (WARN_ON(setup_sigframe_layout(&user, true)))
+	if (WARN_ON(setup_sigframe_layout(&user, 0, true)))
 		return;
 
 	signal_minsigstksz = sigframe_size(&user) +
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index 82feca6f7052..b302689b6651 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -37,8 +37,6 @@ struct compat_vfp_sigframe {
 #define VFP_MAGIC		0x56465001
 #define VFP_STORAGE_SIZE	sizeof(struct compat_vfp_sigframe)
 
-#define FSR_WRITE_SHIFT		(11)
-
 struct compat_aux_sigframe {
 	struct compat_vfp_sigframe	vfp;
 
@@ -384,7 +382,8 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka,
 }
 
 static int compat_setup_sigframe(struct compat_sigframe __user *sf,
-				 struct pt_regs *regs, sigset_t *set)
+				 struct pt_regs *regs, sigset_t *set,
+				 struct kernel_siginfo *info)
 {
 	struct compat_aux_sigframe __user *aux;
 	unsigned long psr = pstate_to_compat_psr(regs->pstate);
@@ -409,10 +408,8 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf,
 	__put_user_error(psr, &sf->uc.uc_mcontext.arm_cpsr, err);
 
 	__put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err);
-	/* set the compat FSR WnR */
-	__put_user_error(!!(current->thread.fault_code & ESR_ELx_WNR) <<
-			 FSR_WRITE_SHIFT, &sf->uc.uc_mcontext.error_code, err);
-	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
+	__put_user_error(info->arch.error_code, &sf->uc.uc_mcontext.error_code, err);
+	__put_user_error(info->arch.fault_address, &sf->uc.uc_mcontext.fault_address, err);
 	__put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err);
 
 	err |= put_sigset_t(&sf->uc.uc_sigmask, set);
@@ -447,7 +444,7 @@ int compat_setup_rt_frame(int usig, struct ksignal *ksig,
 
 	err |= __compat_save_altstack(&frame->sig.uc.uc_stack, regs->compat_sp);
 
-	err |= compat_setup_sigframe(&frame->sig, regs, set);
+	err |= compat_setup_sigframe(&frame->sig, regs, set, &ksig->info);
 
 	if (err == 0) {
 		compat_setup_return(regs, &ksig->ka, frame->sig.retcode, frame, usig);
@@ -471,7 +468,7 @@ int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set,
 
 	__put_user_error(0x5ac3c35a, &frame->uc.uc_flags, err);
 
-	err |= compat_setup_sigframe(frame, regs, set);
+	err |= compat_setup_sigframe(frame, regs, set, &ksig->info);
 	if (err == 0)
 		compat_setup_return(regs, &ksig->ka, frame->retcode, frame, usig);
 
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 3c18c2454089..d7a0b93a8d9f 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -69,6 +69,7 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags)
 long compat_arm_syscall(struct pt_regs *regs, int scno)
 {
 	void __user *addr;
+	struct kernel_siginfo info;
 
 	switch (scno) {
 	/*
@@ -114,7 +115,11 @@ long compat_arm_syscall(struct pt_regs *regs, int scno)
 	addr  = (void __user *)instruction_pointer(regs) -
 		(compat_thumb_mode(regs) ? 2 : 4);
 
-	arm64_notify_die("Oops - bad compat syscall(2)", regs,
-			 SIGILL, ILL_ILLTRP, addr, scno);
+	clear_siginfo(&info);
+	info.si_signo = SIGILL;
+	info.si_code = ILL_ILLTRP;
+	info.si_addr = addr;
+	info.arch.error_code = scno;
+	force_sig_info(&info);
 	return 0;
 }
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index cf402be5c573..4545fe067ea9 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -44,6 +44,8 @@
 #include <asm/system_misc.h>
 #include <asm/sysreg.h>
 
+#define FSR_WRITE_SHIFT		(11)
+
 static const char *handler[]= {
 	"Synchronous Abort",
 	"IRQ",
@@ -209,12 +211,11 @@ void die(const char *str, struct pt_regs *regs, int err)
 		do_exit(SIGSEGV);
 }
 
-static void arm64_show_signal(int signo, const char *str)
+static void arm64_show_signal(int signo, unsigned long esr, const char *str)
 {
 	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 	struct task_struct *tsk = current;
-	unsigned int esr = tsk->thread.fault_code;
 	struct pt_regs *regs = task_pt_regs(tsk);
 
 	/* Leave if the signal won't be shown */
@@ -225,7 +226,7 @@ static void arm64_show_signal(int signo, const char *str)
 
 	pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk));
 	if (esr)
-		pr_cont("%s, ESR 0x%08x, ", esr_get_class_string(esr), esr);
+		pr_cont("%s, ESR 0x%08lx, ", esr_get_class_string(esr), esr);
 
 	pr_cont("%s", str);
 	print_vma_addr(KERN_CONT " in ", regs->pc);
@@ -233,42 +234,121 @@ static void arm64_show_signal(int signo, const char *str)
 	__show_regs(regs);
 }
 
+static unsigned long esr_to_error_code(unsigned long esr, unsigned long far)
+{
+	/*
+	 * If the faulting address is in the kernel, we must sanitize the ESR.
+	 * From userspace's point of view, kernel-only mappings don't exist
+	 * at all, so we report them as level 0 translation faults.
+	 * (This is not quite the way that "no mapping there at all" behaves:
+	 * an alignment fault not caused by the memory type would take
+	 * precedence over translation fault for a real access to empty
+	 * space. Unfortunately we can't easily distinguish "alignment fault
+	 * not caused by memory type" from "alignment fault caused by memory
+	 * type", so we ignore this wrinkle and just return the translation
+	 * fault.)
+	 */
+	if (!is_ttbr0_addr(untagged_addr(far))) {
+		switch (ESR_ELx_EC(esr)) {
+		case ESR_ELx_EC_DABT_LOW:
+			/*
+			 * These bits provide only information about the
+			 * faulting instruction, which userspace knows already.
+			 * We explicitly clear bits which are architecturally
+			 * RES0 in case they are given meanings in future.
+			 * We always report the ESR as if the fault was taken
+			 * to EL1 and so ISV and the bits in ISS[23:14] are
+			 * clear. (In fact it always will be a fault to EL1.)
+			 */
+			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
+				ESR_ELx_CM | ESR_ELx_WNR;
+			esr |= ESR_ELx_FSC_FAULT;
+			break;
+		case ESR_ELx_EC_IABT_LOW:
+			/*
+			 * Claim a level 0 translation fault.
+			 * All other bits are architecturally RES0 for faults
+			 * reported with that IFSC value, so we clear them.
+			 */
+			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
+			esr |= ESR_ELx_FSC_FAULT;
+			break;
+		default:
+			/*
+			 * This should never happen (entry.S only brings us
+			 * into this code for insn and data aborts from a lower
+			 * exception level). Fail safe by not providing an ESR
+			 * context record at all.
+			 */
+			WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr);
+			esr = 0;
+			break;
+		}
+	}
+
+	if (is_compat_task()) {
+		/* Use the compat FSR WnR */
+		return !!(esr & ESR_ELx_WNR) << FSR_WRITE_SHIFT;
+	}
+
+	return esr;
+}
+
 void arm64_force_sig_fault(int signo, int code, void __user *addr,
+			   unsigned long far, unsigned long esr,
 			   const char *str)
 {
-	arm64_show_signal(signo, str);
-	if (signo == SIGKILL)
+	arm64_show_signal(signo, esr, str);
+	if (signo == SIGKILL) {
 		force_sig(SIGKILL);
-	else
-		force_sig_fault(signo, code, addr);
+	} else {
+		struct kernel_siginfo info;
+		clear_siginfo(&info);
+		info.si_signo = signo;
+		info.si_errno = 0;
+		info.si_code = code;
+		info.si_addr = addr;
+		info.arch.fault_address = far;
+		info.arch.error_code = esr_to_error_code(esr, far);
+		force_sig_info(&info);
+	}
 }
 
 void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
+			    unsigned long far, unsigned long esr,
 			    const char *str)
 {
-	arm64_show_signal(SIGBUS, str);
-	force_sig_mceerr(code, addr, lsb);
+	struct kernel_siginfo info;
+
+	arm64_show_signal(SIGBUS, esr, str);
+
+	clear_siginfo(&info);
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = code;
+	info.si_addr = addr;
+	info.si_addr_lsb = lsb;
+	info.arch.fault_address = far;
+	info.arch.error_code = esr_to_error_code(esr, far);
+	force_sig_info(&info);
 }
 
 void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr,
 				       const char *str)
 {
-	arm64_show_signal(SIGTRAP, str);
+	arm64_show_signal(SIGTRAP, 0, str);
 	force_sig_ptrace_errno_trap(errno, addr);
 }
 
 void arm64_notify_die(const char *str, struct pt_regs *regs,
 		      int signo, int sicode, void __user *addr,
-		      int err)
+		      int esr)
 {
 	if (user_mode(regs)) {
 		WARN_ON(regs != current_pt_regs());
-		current->thread.fault_address = 0;
-		current->thread.fault_code = err;
-
-		arm64_force_sig_fault(signo, sicode, addr, str);
+		arm64_force_sig_fault(signo, sicode, addr, 0, esr, str);
 	} else {
-		die(str, regs, err);
+		die(str, regs, esr);
 	}
 }
 
@@ -813,10 +893,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 {
 	void __user *pc = (void __user *)instruction_pointer(regs);
 
-	current->thread.fault_address = 0;
-	current->thread.fault_code = esr;
-
-	arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc,
+	arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc, 0, esr,
 			      "Bad EL0 synchronous exception");
 }
 
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c9cedc0432d2..a7bada1392b3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -41,7 +41,7 @@
 #include <asm/traps.h>
 
 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -320,75 +320,19 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void set_thread_esr(unsigned long address, unsigned int esr)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
-	current->thread.fault_address = address;
+	unsigned long addr = untagged_addr(far);
 
-	/*
-	 * If the faulting address is in the kernel, we must sanitize the ESR.
-	 * From userspace's point of view, kernel-only mappings don't exist
-	 * at all, so we report them as level 0 translation faults.
-	 * (This is not quite the way that "no mapping there at all" behaves:
-	 * an alignment fault not caused by the memory type would take
-	 * precedence over translation fault for a real access to empty
-	 * space. Unfortunately we can't easily distinguish "alignment fault
-	 * not caused by memory type" from "alignment fault caused by memory
-	 * type", so we ignore this wrinkle and just return the translation
-	 * fault.)
-	 */
-	if (!is_ttbr0_addr(current->thread.fault_address)) {
-		switch (ESR_ELx_EC(esr)) {
-		case ESR_ELx_EC_DABT_LOW:
-			/*
-			 * These bits provide only information about the
-			 * faulting instruction, which userspace knows already.
-			 * We explicitly clear bits which are architecturally
-			 * RES0 in case they are given meanings in future.
-			 * We always report the ESR as if the fault was taken
-			 * to EL1 and so ISV and the bits in ISS[23:14] are
-			 * clear. (In fact it always will be a fault to EL1.)
-			 */
-			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
-				ESR_ELx_CM | ESR_ELx_WNR;
-			esr |= ESR_ELx_FSC_FAULT;
-			break;
-		case ESR_ELx_EC_IABT_LOW:
-			/*
-			 * Claim a level 0 translation fault.
-			 * All other bits are architecturally RES0 for faults
-			 * reported with that DFSC value, so we clear them.
-			 */
-			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
-			esr |= ESR_ELx_FSC_FAULT;
-			break;
-		default:
-			/*
-			 * This should never happen (entry.S only brings us
-			 * into this code for insn and data aborts from a lower
-			 * exception level). Fail safe by not providing an ESR
-			 * context record at all.
-			 */
-			WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr);
-			esr = 0;
-			break;
-		}
-	}
-
-	current->thread.fault_code = esr;
-}
-
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
-{
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
 	 */
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
-
-		set_thread_esr(addr, esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
-				      inf->name);
+				      far, esr, inf->name);
 	} else {
 		__do_kernel_fault(addr, esr, regs);
 	}
@@ -439,7 +383,7 @@ static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -447,6 +391,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_ACCESS_FLAGS;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+	unsigned long addr = untagged_addr(far);
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -570,13 +515,12 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	}
 
 	inf = esr_to_fault_info(esr);
-	set_thread_esr(addr, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
 		 * this page fault.
 		 */
-		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr,
+		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr, far, esr,
 				      inf->name);
 	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
 		unsigned int lsb;
@@ -586,7 +530,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
 
 		arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb,
-				       inf->name);
+				       far, esr, inf->name);
 	} else {
 		/*
 		 * Something tried to access memory that isn't in our memory
@@ -594,7 +538,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		 */
 		arm64_force_sig_fault(SIGSEGV,
 				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
-				      (void __user *)addr,
+				      (void __user *)addr, far, esr,
 				      inf->name);
 	}
 
@@ -605,30 +549,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);
 
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	void __user *siaddr;
@@ -644,7 +590,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	if (esr & ESR_ELx_FnV)
 		siaddr = NULL;
 	else
-		siaddr  = (void __user *)addr;
+		siaddr  = (void __user *)untagged_addr(far);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -717,11 +663,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {
@@ -730,8 +677,8 @@ void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 		show_pte(addr);
 	}
 
-	arm64_notify_die(inf->name, regs,
-			 inf->sig, inf->code, (void __user *)addr, esr);
+	arm64_notify_die(inf->name, regs, inf->sig, inf->code,
+			 (void __user *)addr, esr);
 }
 NOKPROBE_SYMBOL(do_mem_abort);
 
@@ -744,8 +691,8 @@ NOKPROBE_SYMBOL(do_el0_irq_bp_hardening);
 
 void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
-	arm64_notify_die("SP/PC alignment exception", regs,
-			 SIGBUS, BUS_ADRALN, (void __user *)addr, esr);
+	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
+			 (void __user *)addr, esr);
 }
 NOKPROBE_SYMBOL(do_sp_pc_abort);
 
@@ -871,8 +818,8 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
 		arm64_apply_bp_hardening();
 
 	if (inf->fn(addr_if_watchpoint, esr, regs)) {
-		arm64_notify_die(inf->name, regs,
-				 inf->sig, inf->code, (void __user *)pc, esr);
+		arm64_notify_die(inf->name, regs, inf->sig, inf->code,
+				 (void __user *)pc, esr);
 	}
 
 	debug_exception_exit(regs);
-- 
2.26.2.761.g0e0b3e54be-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* [PATCH v6 3/3] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-21  2:29                               ` [PATCH v6 0/3] " Peter Collingbourne
  2020-05-21  2:29                                 ` [PATCH v6 1/3] signal: Allow architectures to store arch-specific data in kernel_siginfo Peter Collingbourne
  2020-05-21  2:29                                 ` [PATCH v6 2/3] arm64: Move fault address and fault code into kernel_siginfo Peter Collingbourne
@ 2020-05-21  2:29                                 ` Peter Collingbourne
  2 siblings, 0 replies; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-21  2:29 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany,
	Vincenzo Frascino, Dave Martin, Will Deacon, Oleg Nesterov,
	Eric W. Biederman
  Cc: Andrey Konovalov, Kevin Brodsky, Peter Collingbourne, Linux ARM,
	Richard Henderson

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
the tag bits may be needed by tools in order to accurately diagnose
memory errors, such as HWASan [1] or future tools based on the Memory
Tagging Extension (MTE).

We should not stop clearing these bits in the existing fault address fields,
because there may be existing userspace applications that are expecting the tag
bits to be cleared. Instead, create a fault_addr_top_byte_context in sigcontext
(similar to the existing esr_context), and store the tag bits of FAR_EL1 there.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
v6:
- move fault address and fault code into the kernel_siginfo data structure
- split the patch in three since it was getting large and now has
  generic and arch-specific parts

v5:
- add padding to fault_addr_top_byte_context in order to ensure the correct
  size and preserve sp alignment

v4:
- expose only the tag bits in the context instead of the entire FAR_EL1
- remove mention of the new context from the sigcontext.__reserved[] note

v3:
- add documentation to tagged-pointers.rst
- update comments in sigcontext.h

v2:
- revert changes to hw_breakpoint.c
- rename set_thread_esr to set_thread_far_esr

 Documentation/arm64/tagged-pointers.rst  | 17 ++++++++++------
 arch/arm64/include/asm/signal.h          |  3 +++
 arch/arm64/include/asm/traps.h           |  4 ++--
 arch/arm64/include/uapi/asm/sigcontext.h | 24 +++++++++++++++++------
 arch/arm64/kernel/debug-monitors.c       |  2 +-
 arch/arm64/kernel/ptrace.c               |  2 +-
 arch/arm64/kernel/signal.c               | 25 ++++++++++++++++++++++++
 arch/arm64/kernel/traps.c                | 10 ++++++----
 arch/arm64/mm/fault.c                    |  8 ++++----
 9 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
index eab4323609b9..c6e9592a9dea 100644
--- a/Documentation/arm64/tagged-pointers.rst
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -53,12 +53,17 @@ visibility.
 Preserving tags
 ---------------
 
-Non-zero tags are not preserved when delivering signals. This means that
-signal handlers in applications making use of tags cannot rely on the
-tag information for user virtual addresses being maintained for fields
-inside siginfo_t. One exception to this rule is for signals raised in
-response to watchpoint debug exceptions, where the tag information will
-be preserved.
+Non-zero tags are not preserved in the fault address fields
+siginfo.si_addr or sigcontext.fault_address when delivering
+signals. This means that signal handlers in applications making use
+of tags cannot rely on the tag information for user virtual addresses
+being maintained in these fields. One exception to this rule is for
+signals raised in response to watchpoint debug exceptions, where the
+tag information will be preserved.
+
+The fault address tag is preserved in the fault_addr_top_byte field of
+the signal frame record fault_addr_top_byte_context, which is present
+for signals raised in response to data aborts and instruction aborts.
 
 The architecture prevents the use of a tagged PC, so the upper byte will
 be set to a sign-extension of bit 55 on exception return.
diff --git a/arch/arm64/include/asm/signal.h b/arch/arm64/include/asm/signal.h
index f5c001b0a125..c80eb3b3ea40 100644
--- a/arch/arm64/include/asm/signal.h
+++ b/arch/arm64/include/asm/signal.h
@@ -9,6 +9,9 @@ struct arch_private_siginfo {
 	/* FAR_EL1 value */
 	unsigned long fault_address;
 
+	/* Mask of defined bits in the top byte of FAR_EL1 */
+	unsigned char fault_address_top_byte_mask;
+
 	/* Sanitized ESR_EL1 value, or FSR/syscall number in compat mode */
 	unsigned long error_code;
 };
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index 5ed5be5347e6..981b930b1e6f 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -27,8 +27,8 @@ void unregister_undef_hook(struct undef_hook *hook);
 void force_signal_inject(int signal, int code, unsigned long address);
 void arm64_notify_segfault(unsigned long addr);
 void arm64_force_sig_fault(int signo, int code, void __user *addr,
-			   unsigned long far, unsigned long esr,
-			   const char *str);
+			   unsigned long far, unsigned char far_tb_mask,
+			   unsigned long esr, const char *str);
 void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
 			    unsigned long far, unsigned long esr,
 			    const char *str);
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8b0ebce92427..6ce5e1bb7efd 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -44,11 +44,12 @@ struct sigcontext {
  *
  *	0x210		fpsimd_context
  *	 0x10		esr_context
+ *	 0x10		fault_addr_top_byte_context
  *	0x8a0		sve_context (vl <= 64) (optional)
  *	 0x20		extra_context (optional)
  *	 0x10		terminator (null _aarch64_ctx)
  *
- *	0x510		(reserved for future allocation)
+ *	0x500		(reserved for future allocation)
  *
  * New records that can exceed this space need to be opt-in for userspace, so
  * that an expanded signal frame is not generated unexpectedly.  The mechanism
@@ -94,17 +95,28 @@ struct esr_context {
 	__u64 esr;
 };
 
+/* Top byte of fault address (normally not exposed via si_addr) */
+#define FAULT_ADDR_TOP_BYTE_MAGIC	0x46544201
+
+struct fault_addr_top_byte_context {
+	struct _aarch64_ctx head;
+	__u8 flags;
+	__u8 fault_addr_top_byte;
+	__u8 fault_addr_top_byte_mask;
+	__u8 __reserved[5];
+};
+
 /*
  * extra_context: describes extra space in the signal frame for
  * additional structures that don't fit in sigcontext.__reserved[].
  *
  * Note:
  *
- * 1) fpsimd_context, esr_context and extra_context must be placed in
- * sigcontext.__reserved[] if present.  They cannot be placed in the
- * extra space.  Any other record can be placed either in the extra
- * space or in sigcontext.__reserved[], unless otherwise specified in
- * this file.
+ * 1) fpsimd_context, esr_context, fault_addr_top_byte_context and
+ * extra_context must be placed in sigcontext.__reserved[] if present.
+ * They cannot be placed in the extra space.  Any other record can be
+ * placed either in the extra space or in sigcontext.__reserved[],
+ * unless otherwise specified in this file.
  *
  * 2) There must not be more than one extra_context.
  *
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 498e6393b2ca..575a907cd4fc 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -232,7 +232,7 @@ static void send_user_sigtrap(int si_code)
 		local_irq_enable();
 
 	arm64_force_sig_fault(SIGTRAP, si_code,
-			      (void __user *)instruction_pointer(regs), 0, 0,
+			      (void __user *)instruction_pointer(regs), 0, 0, 0,
 			      "User debug trap");
 }
 
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 51bb8bcaf24b..9b20284df88b 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -198,7 +198,7 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 	}
 #endif
 	arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT,
-			      (void __user *)(bkpt->trigger), 0, 0,
+			      (void __user *)(bkpt->trigger), 0, 0, 0,
 			      desc);
 }
 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 10d7e9832a89..f6783de54412 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -55,6 +55,7 @@ struct rt_sigframe_user_layout {
 
 	unsigned long fpsimd_offset;
 	unsigned long esr_offset;
+	unsigned long ftb_offset;
 	unsigned long sve_offset;
 	unsigned long extra_offset;
 	unsigned long end_offset;
@@ -383,6 +384,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
 			break;
 
 		case ESR_MAGIC:
+		case FAULT_ADDR_TOP_BYTE_MAGIC:
 			/* ignore */
 			break;
 
@@ -582,6 +584,14 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 				     sizeof(struct esr_context));
 		if (err)
 			return err;
+        }
+
+        if (add_all || info->arch.fault_address_top_byte_mask) {
+		err = sigframe_alloc(
+			user, &user->ftb_offset,
+			sizeof(struct fault_addr_top_byte_context));
+		if (err)
+			return err;
 	}
 
 	if (system_supports_sve()) {
@@ -644,6 +654,21 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		__put_user_error(info->arch.error_code, &esr_ctx->esr, err);
 	}
 
+	if (err == 0 && user->ftb_offset) {
+		struct fault_addr_top_byte_context __user *ftb_ctx =
+			apply_user_offset(user, user->ftb_offset);
+
+		__put_user_error(FAULT_ADDR_TOP_BYTE_MAGIC,
+				 &ftb_ctx->head.magic, err);
+		__put_user_error(sizeof(*ftb_ctx), &ftb_ctx->head.size, err);
+		__put_user_error(0, &ftb_ctx->flags, err);
+		__put_user_error((info->arch.fault_address >> 56) &
+					 info->arch.fault_address_top_byte_mask,
+				 &ftb_ctx->fault_addr_top_byte, err);
+		__put_user_error(info->arch.fault_address_top_byte_mask,
+				 &ftb_ctx->fault_addr_top_byte_mask, err);
+	}
+
 	/* Scalable Vector Extension state, if present */
 	if (system_supports_sve() && err == 0 && user->sve_offset) {
 		struct sve_context __user *sve_ctx =
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 4545fe067ea9..8154f2562f74 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -295,8 +295,8 @@ static unsigned long esr_to_error_code(unsigned long esr, unsigned long far)
 }
 
 void arm64_force_sig_fault(int signo, int code, void __user *addr,
-			   unsigned long far, unsigned long esr,
-			   const char *str)
+			   unsigned long far, unsigned char far_tb_mask,
+			   unsigned long esr, const char *str)
 {
 	arm64_show_signal(signo, esr, str);
 	if (signo == SIGKILL) {
@@ -309,6 +309,7 @@ void arm64_force_sig_fault(int signo, int code, void __user *addr,
 		info.si_code = code;
 		info.si_addr = addr;
 		info.arch.fault_address = far;
+		info.arch.fault_address_top_byte_mask = far_tb_mask;
 		info.arch.error_code = esr_to_error_code(esr, far);
 		force_sig_info(&info);
 	}
@@ -329,6 +330,7 @@ void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
 	info.si_addr = addr;
 	info.si_addr_lsb = lsb;
 	info.arch.fault_address = far;
+	info.arch.fault_address_top_byte_mask = 0xff;
 	info.arch.error_code = esr_to_error_code(esr, far);
 	force_sig_info(&info);
 }
@@ -346,7 +348,7 @@ void arm64_notify_die(const char *str, struct pt_regs *regs,
 {
 	if (user_mode(regs)) {
 		WARN_ON(regs != current_pt_regs());
-		arm64_force_sig_fault(signo, sicode, addr, 0, esr, str);
+		arm64_force_sig_fault(signo, sicode, addr, 0, 0, esr, str);
 	} else {
 		die(str, regs, esr);
 	}
@@ -893,7 +895,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 {
 	void __user *pc = (void __user *)instruction_pointer(regs);
 
-	arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc, 0, esr,
+	arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc, 0, 0, esr,
 			      "Bad EL0 synchronous exception");
 }
 
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index a7bada1392b3..1ba95f308c10 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -332,7 +332,7 @@ static void do_bad_area(unsigned long far, unsigned int esr,
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
-				      far, esr, inf->name);
+				      far, 0xff, esr, inf->name);
 	} else {
 		__do_kernel_fault(addr, esr, regs);
 	}
@@ -520,8 +520,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 		 * We had some memory, but were unable to successfully fix up
 		 * this page fault.
 		 */
-		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr, far, esr,
-				      inf->name);
+		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr,
+				      far, 0xff, esr, inf->name);
 	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
 		unsigned int lsb;
 
@@ -538,7 +538,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 		 */
 		arm64_force_sig_fault(SIGSEGV,
 				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
-				      (void __user *)addr, far, esr,
+				      (void __user *)addr, far, 0xff, esr,
 				      inf->name);
 	}
 
-- 
2.26.2.761.g0e0b3e54be-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-21  2:28                             ` Peter Collingbourne
  2020-05-21  2:29                               ` [PATCH v6 0/3] " Peter Collingbourne
@ 2020-05-21 12:35                               ` Eric W. Biederman
  2020-05-21 18:03                                 ` Peter Collingbourne
  1 sibling, 1 reply; 64+ messages in thread
From: Eric W. Biederman @ 2020-05-21 12:35 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Linux ARM, Catalin Marinas, Vincenzo Frascino,
	Will Deacon, Dave Martin, Evgenii Stepanov, Richard Henderson

Peter Collingbourne <pcc@google.com> writes:

> On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
>>
>> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
>> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
>> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
>> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
>> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
>> > > > > index baa88dc02e5c..5867f2fdbe64 100644
>> > > > > --- a/arch/arm64/kernel/signal.c
>> > > > > +++ b/arch/arm64/kernel/signal.c
>> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
>> > > > > rt_sigframe_user_layout *user,
>> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
>> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
>> > > > >                 __put_user_error(current->thread.fault_code,
>> > > > > &esr_ctx->esr, err);
>> > > > > +               current->thread.fault_code = 0;
>> > > >
>> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
>> > > > user stack before this and deliver a SIGSEGV, but with the old
>> > > > fault_code still set?  Then we'd emit the old fault code with the
>> > > > new "can't deliver signal" signal, which doesn't make sense.
>> > > >
>> > > > Stuff may also go wrong with signal prioritisation.
>> > > >
>> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
>> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
>> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
>> > > > With your change we'd then have cleared the fault code by the time we
>> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
>> > > >
>> > > > Today, I think we just attach that fault code to every signal that's
>> > > > delivered until something overwrites or resets it, which means that
>> > > > a signal that needs fault_code gets it, at the expense of attaching
>> > > > it to a bunch of other random signals too.
>> > > >
>> > > >
>> > > > Checking the signal number and si_code might help us to know what we
>> > > > should be doing with fault_code.  We need to make sure userspace can't
>> > > > trick us with a non kernel generated signal here.  It would also be
>> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
>> > >
>> > > With these possible interactions in mind I think we should store the
>> > > fault code and fault address in kernel_siginfo instead of
>> > > thread_struct (and clear these fields when we receive a siginfo from
>> > > userspace, i.e. in copy_siginfo_from_user which is used by
>> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
>> > > information is clearly associated with the signal itself and not the
>> > > thread, so we don't need to worry about our signal being delivered out
>> > > of order.
>> >
>> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
>> > signal code. Can you?
>
> I think I've come up with a way that doesn't seem to be too invasive.
> See patch #1 of the series that I'm about to send out.
>
>> > But generally, I agree: the per-thread handling of fault_address and
>> > fault_code appears to be quite broken in the face of signal prioritisation
>> > and signals that don't correspond directly to hardware trap. It would be
>> > nice to have some tests for this...
>> >
>> > If we want to pile on more bodges, perhaps we could stash the signal number
>> > to which the fault_{address,code} relate, and then check that at delivery
>> > and clear on a match. I hate it.
>>
>> I agree with Daniel's suggestion in principle, but I was also concerned
>> about whether it would be too invasive elsewhere.
>>
>> Question though: does the core code take special care to make sure that
>> a force_sig cannot be outprioritised by a regular signal?  If so,
>> perhaps we get away with it.  I ask this, because the same issue
>> may be hitting other arches otherwise.
>
> Not as far as I can tell. There does appear to be prioritisation for
> synchronous signals [1] but as far as I can tell nothing to
> distinguish one of these signals from one with the same signal number
> sent from userspace (e.g. via kill(2)).

The si_code will differ between signals generated by userspace
and signals generated by the kernel.

We do allow a little bit of ptrace and sending to yourself to spoof
kernel generated signals, for reasons of debugging and process migration
where an existing process needs to be reconstructed.  But the defenses
should be strong enough you can assume that we reliably distinguish
between a signal from userspace and a signal from the kernel.

I don't fully follow what you are doing but this feels like the
kind of case where a new si_code has been defined as well as additional
fields in siginfo.

In your patchset I really hate that you were going back to
force_sig_info, and filling out struct siginfo by hand.  That is an
error prone pattern, and I have fixed enough bugs in the kernel to prove
that.

I take exception to the idea that including the full address might break
userspace.  That typically means someone has been too lazy to look
and see what userspace is doing.  When that userspace that might break
is the same userspace you are changing the kernel to serve that makes me
nervous.  AKA the userspace that cares about this signal and how it is
represented in siginfo.

A fix of one instance of SIGILL should not be included with a patch that
does something else, and really should come before everything else if
possible.

If this information really belongs in struct siginfo (as it sounds like)
please actually put the information in siginfo, and let userspace look
in siginfo to find it.  struct siginfo is a union with plenty of space,
and plenty of si_codes.

If this applies to multiple cases then it might be trickier but please
dig into the details, don't toss things into sigcontext just because
you can't figure out a clean design for reporting this.

Eric


> Peter
>
> [1] https://github.com/torvalds/linux/blob/b85051e755b0e9d6dd8f17ef1da083851b83287d/kernel/signal.c#L222

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6 2/3] arm64: Move fault address and fault code into kernel_siginfo
  2020-05-21  2:29                                 ` [PATCH v6 2/3] arm64: Move fault address and fault code into kernel_siginfo Peter Collingbourne
@ 2020-05-21 13:34                                     ` kbuild test robot
  0 siblings, 0 replies; 64+ messages in thread
From: kbuild test robot @ 2020-05-21 13:34 UTC (permalink / raw)
  To: Peter Collingbourne, Catalin Marinas, Evgenii Stepanov,
	Kostya Serebryany, Vincenzo Frascino, Dave Martin, Will Deacon,
	Oleg Nesterov, Eric W. Biederman
  Cc: kbuild-all, Andrey Konovalov, Kevin Brodsky, clang-built-linux,
	Peter Collingbourne, Linux ARM, Richard Henderson

[-- Attachment #1: Type: text/plain, Size: 9832 bytes --]

Hi Peter,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on arm-perf/for-next/perf]
[also build test WARNING on linus/master v5.7-rc6]
[cannot apply to arm64/for-next/core next-20200519]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Peter-Collingbourne/arm64-Expose-FAR_EL1-tag-bits-in-sigcontext/20200521-103345
base:   https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git for-next/perf
config: arm64-randconfig-r005-20200520 (attached as .config)
compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 3393cc4cebf9969db94dc424b7a2b6195589c33b)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm64 cross compiling tool for clang build
        # apt-get install binutils-aarch64-linux-gnu
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=arm64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>, old ones prefixed by <<):

>> arch/arm64/kernel/traps.c:283:55: warning: format specifies type 'unsigned int' but the argument has type 'unsigned long' [-Wformat]
WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr);
~~                                  ^~~
%lx
include/asm-generic/bug.h:124:29: note: expanded from macro 'WARN'
__WARN_printf(TAINT_WARN, format);                                                                   ^~~~~~
include/asm-generic/bug.h:92:17: note: expanded from macro '__WARN_printf'
__warn_printk(arg);                                                                      ^~~
arch/arm64/kernel/traps.c:826:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_UNKNOWN]            = "Unknown/Uncategorized",
^~~~~~~~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:827:22: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_WFx]                = "WFI/WFE",
^~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:828:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP15_32]            = "CP15 MCR/MRC",
^~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:829:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP15_64]            = "CP15 MCRR/MRRC",
^~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:830:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP14_MR]            = "CP14 MCR/MRC",
^~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:831:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP14_LS]            = "CP14 LDC/STC",
^~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:832:27: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_FP_ASIMD]           = "ASIMD",
^~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:833:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP10_ID]            = "CP10 MRC/VMRS",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:834:22: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_PAC]                = "PAC",
^~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:835:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP14_64]            = "CP14 MCRR/MRRC",
^~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:836:22: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_ILL]                = "PSTATE.IL",
^~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:837:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_SVC32]              = "SVC (AArch32)",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:838:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_HVC32]              = "HVC (AArch32)",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:839:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_SMC32]              = "SMC (AArch32)",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:840:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_SVC64]              = "SVC (AArch64)",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:841:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]

vim +283 arch/arm64/kernel/traps.c

   236	
   237	static unsigned long esr_to_error_code(unsigned long esr, unsigned long far)
   238	{
   239		/*
   240		 * If the faulting address is in the kernel, we must sanitize the ESR.
   241		 * From userspace's point of view, kernel-only mappings don't exist
   242		 * at all, so we report them as level 0 translation faults.
   243		 * (This is not quite the way that "no mapping there at all" behaves:
   244		 * an alignment fault not caused by the memory type would take
   245		 * precedence over translation fault for a real access to empty
   246		 * space. Unfortunately we can't easily distinguish "alignment fault
   247		 * not caused by memory type" from "alignment fault caused by memory
   248		 * type", so we ignore this wrinkle and just return the translation
   249		 * fault.)
   250		 */
   251		if (!is_ttbr0_addr(untagged_addr(far))) {
   252			switch (ESR_ELx_EC(esr)) {
   253			case ESR_ELx_EC_DABT_LOW:
   254				/*
   255				 * These bits provide only information about the
   256				 * faulting instruction, which userspace knows already.
   257				 * We explicitly clear bits which are architecturally
   258				 * RES0 in case they are given meanings in future.
   259				 * We always report the ESR as if the fault was taken
   260				 * to EL1 and so ISV and the bits in ISS[23:14] are
   261				 * clear. (In fact it always will be a fault to EL1.)
   262				 */
   263				esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
   264					ESR_ELx_CM | ESR_ELx_WNR;
   265				esr |= ESR_ELx_FSC_FAULT;
   266				break;
   267			case ESR_ELx_EC_IABT_LOW:
   268				/*
   269				 * Claim a level 0 translation fault.
   270				 * All other bits are architecturally RES0 for faults
   271				 * reported with that DFSC value, so we clear them.
   272				 */
   273				esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
   274				esr |= ESR_ELx_FSC_FAULT;
   275				break;
   276			default:
   277				/*
   278				 * This should never happen (entry.S only brings us
   279				 * into this code for insn and data aborts from a lower
   280				 * exception level). Fail safe by not providing an ESR
   281				 * context record at all.
   282				 */
 > 283				WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr);
   284				esr = 0;
   285				break;
   286			}
   287		}
   288	
   289		if (is_compat_task()) {
   290			/* Use the compat FSR WnR */
   291			return !!(esr & ESR_ELx_WNR) << FSR_WRITE_SHIFT;
   292		}
   293	
   294		return esr;
   295	}
   296	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 47550 bytes --]

[-- Attachment #3: Type: text/plain, Size: 176 bytes --]

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6 2/3] arm64: Move fault address and fault code into kernel_siginfo
@ 2020-05-21 13:34                                     ` kbuild test robot
  0 siblings, 0 replies; 64+ messages in thread
From: kbuild test robot @ 2020-05-21 13:34 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 10027 bytes --]

Hi Peter,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on arm-perf/for-next/perf]
[also build test WARNING on linus/master v5.7-rc6]
[cannot apply to arm64/for-next/core next-20200519]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Peter-Collingbourne/arm64-Expose-FAR_EL1-tag-bits-in-sigcontext/20200521-103345
base:   https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git for-next/perf
config: arm64-randconfig-r005-20200520 (attached as .config)
compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 3393cc4cebf9969db94dc424b7a2b6195589c33b)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm64 cross compiling tool for clang build
        # apt-get install binutils-aarch64-linux-gnu
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=arm64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>, old ones prefixed by <<):

>> arch/arm64/kernel/traps.c:283:55: warning: format specifies type 'unsigned int' but the argument has type 'unsigned long' [-Wformat]
WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr);
~~                                  ^~~
%lx
include/asm-generic/bug.h:124:29: note: expanded from macro 'WARN'
__WARN_printf(TAINT_WARN, format);                                                                   ^~~~~~
include/asm-generic/bug.h:92:17: note: expanded from macro '__WARN_printf'
__warn_printk(arg);                                                                      ^~~
arch/arm64/kernel/traps.c:826:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_UNKNOWN]            = "Unknown/Uncategorized",
^~~~~~~~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:827:22: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_WFx]                = "WFI/WFE",
^~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:828:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP15_32]            = "CP15 MCR/MRC",
^~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:829:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP15_64]            = "CP15 MCRR/MRRC",
^~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:830:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP14_MR]            = "CP14 MCR/MRC",
^~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:831:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP14_LS]            = "CP14 LDC/STC",
^~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:832:27: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_FP_ASIMD]           = "ASIMD",
^~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:833:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP10_ID]            = "CP10 MRC/VMRS",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:834:22: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_PAC]                = "PAC",
^~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:835:26: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_CP14_64]            = "CP14 MCRR/MRRC",
^~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:836:22: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_ILL]                = "PSTATE.IL",
^~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:837:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_SVC32]              = "SVC (AArch32)",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:838:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_HVC32]              = "HVC (AArch32)",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:839:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_SMC32]              = "SMC (AArch32)",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:840:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]
[ESR_ELx_EC_SVC64]              = "SVC (AArch64)",
^~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:825:28: note: previous initialization is here
[0 ... ESR_ELx_EC_MAX]          = "UNRECOGNIZED EC",
^~~~~~~~~~~~~~~~~
arch/arm64/kernel/traps.c:841:24: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides]

vim +283 arch/arm64/kernel/traps.c

   236	
   237	static unsigned long esr_to_error_code(unsigned long esr, unsigned long far)
   238	{
   239		/*
   240		 * If the faulting address is in the kernel, we must sanitize the ESR.
   241		 * From userspace's point of view, kernel-only mappings don't exist
   242		 * at all, so we report them as level 0 translation faults.
   243		 * (This is not quite the way that "no mapping there at all" behaves:
   244		 * an alignment fault not caused by the memory type would take
   245		 * precedence over translation fault for a real access to empty
   246		 * space. Unfortunately we can't easily distinguish "alignment fault
   247		 * not caused by memory type" from "alignment fault caused by memory
   248		 * type", so we ignore this wrinkle and just return the translation
   249		 * fault.)
   250		 */
   251		if (!is_ttbr0_addr(untagged_addr(far))) {
   252			switch (ESR_ELx_EC(esr)) {
   253			case ESR_ELx_EC_DABT_LOW:
   254				/*
   255				 * These bits provide only information about the
   256				 * faulting instruction, which userspace knows already.
   257				 * We explicitly clear bits which are architecturally
   258				 * RES0 in case they are given meanings in future.
   259				 * We always report the ESR as if the fault was taken
   260				 * to EL1 and so ISV and the bits in ISS[23:14] are
   261				 * clear. (In fact it always will be a fault to EL1.)
   262				 */
   263				esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
   264					ESR_ELx_CM | ESR_ELx_WNR;
   265				esr |= ESR_ELx_FSC_FAULT;
   266				break;
   267			case ESR_ELx_EC_IABT_LOW:
   268				/*
   269				 * Claim a level 0 translation fault.
   270				 * All other bits are architecturally RES0 for faults
   271				 * reported with that DFSC value, so we clear them.
   272				 */
   273				esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
   274				esr |= ESR_ELx_FSC_FAULT;
   275				break;
   276			default:
   277				/*
   278				 * This should never happen (entry.S only brings us
   279				 * into this code for insn and data aborts from a lower
   280				 * exception level). Fail safe by not providing an ESR
   281				 * context record at all.
   282				 */
 > 283				WARN(1, "ESR 0x%x is not DABT or IABT from EL0\n", esr);
   284				esr = 0;
   285				break;
   286			}
   287		}
   288	
   289		if (is_compat_task()) {
   290			/* Use the compat FSR WnR */
   291			return !!(esr & ESR_ELx_WNR) << FSR_WRITE_SHIFT;
   292		}
   293	
   294		return esr;
   295	}
   296	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 47550 bytes --]

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-21 12:35                               ` [PATCH v6] " Eric W. Biederman
@ 2020-05-21 18:03                                 ` Peter Collingbourne
  2020-05-21 19:24                                   ` Eric W. Biederman
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-21 18:03 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrey Konovalov, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Linux ARM, Catalin Marinas, Vincenzo Frascino,
	Will Deacon, Dave Martin, Evgenii Stepanov, Richard Henderson

On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Peter Collingbourne <pcc@google.com> writes:
>
> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >>
> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> >> > > > > --- a/arch/arm64/kernel/signal.c
> >> > > > > +++ b/arch/arm64/kernel/signal.c
> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> >> > > > > rt_sigframe_user_layout *user,
> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> >> > > > >                 __put_user_error(current->thread.fault_code,
> >> > > > > &esr_ctx->esr, err);
> >> > > > > +               current->thread.fault_code = 0;
> >> > > >
> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> >> > > > user stack before this and deliver a SIGSEGV, but with the old
> >> > > > fault_code still set?  Then we'd emit the old fault code with the
> >> > > > new "can't deliver signal" signal, which doesn't make sense.
> >> > > >
> >> > > > Stuff may also go wrong with signal prioritisation.
> >> > > >
> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> >> > > > With your change we'd then have cleared the fault code by the time we
> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> >> > > >
> >> > > > Today, I think we just attach that fault code to every signal that's
> >> > > > delivered until something overwrites or resets it, which means that
> >> > > > a signal that needs fault_code gets it, at the expense of attaching
> >> > > > it to a bunch of other random signals too.
> >> > > >
> >> > > >
> >> > > > Checking the signal number and si_code might help us to know what we
> >> > > > should be doing with fault_code.  We need to have sure userspace can't
> >> > > > trick us with a non kernel generated signal here.  It would also be
> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> >> > >
> >> > > With these possible interactions in mind I think we should store the
> >> > > fault code and fault address in kernel_siginfo instead of
> >> > > thread_struct (and clear these fields when we receive a siginfo from
> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> >> > > information is clearly associated with the signal itself and not the
> >> > > thread, so we don't need to worry about our signal being delivered out
> >> > > of order.
> >> >
> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> >> > signal code. Can you?
> >
> > I think I've come up with a way that doesn't seem to be too invasive.
> > See patch #1 of the series that I'm about to send out.
> >
> >> > But generally, I agree: the per-thread handling of fault_address and
> >> > fault_code appears to be quite broken in the face of signal prioritisation
> >> > and signals that don't correspond directly to hardware trap. It would be
> >> > nice to have some tests for this...
> >> >
> >> > If we want to pile on more bodges, perhaps we could stash the signal number
> >> > to which the fault_{address,code} relate, and then check that at delivery
> >> > and clear on a match. I hate it.
> >>
> >> I agree with Daniel's suggestion in principle, but I was also concerned
> >> about whether it would be too invasive elsewhere.
> >>
> >> Question though: does the core code take special care to make sure that
> >> a force_sig cannot be outprioritised by a regular signal?  If so,
> >> perhaps we get away with it.  I ask this, because the same same issue
> >> may be hitting other arches otherwise.
> >
> > Not as far as I can tell. There does appear to be prioritisation for
> > synchronous signals [1] but as far as I can tell nothing to
> > distinguish one of these signals from one with the same signal number
> > sent from userspace (e.g. via kill(2)).
>
> The si_code will differ between signals generated between userspace
> and signals generated by the kernel.
>
> We do allow a little bit of ptrace and sending to yourself to spoof
> kernel generated signals, for reasons of debugging and process migration
> where an existing process needs to be reconstructed.  But the defenses
> should be strong enough you can assume that we reliably distinguish
> between a signal from userspace and a signal from the kernel.

So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
add the context in that case? Seems fragile to me, but I suppose I
could live with it.

> I don't fully follow what you are doing but this feels like the
> kind of case where a new si_code has been defined as well as additional
> fields in siginfo.

There is no new si_code for this, the information will be exposed for
several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
(particularly SEGV_MTESERR, which is part of the proposed MTE patch
set). Note that we already have a union field for BUS_MCEERR_AR, and
we may want to expose it for the other si_codes that already have
union fields as well.

That being said, taking a closer look at siginfo, I think we are in
luck and we might be able to make this work in a reasonable way by
reusing padding (see below).

> In your patchset I really hate that you were going back to
> force_sig_info, and filling out struct siginfo by hand.  That is an
> error prone pattern, and I have fixed enough bugs in the kernel to prove
> that.

To be fair, most of the callers are in helper functions that take
explicit parameters similar to force_sig_fault et al, and the SIGILL
one could easily be made that way as well.

> I take exception to the idea that including the full address might break
> userspace.  That means typically means someone has been too lazy to look
> and see what userspace is doing.  When that userspace that might break
> is the same userspace you are changing the kernel to serve that makes me
> nervous.  AKA the userspace that cares about this signal and how it is
> represented in siginfo.

It's not a matter of being lazy. This behaviour isn't just an accident
but has been explicitly documented for years (see the
tagged-pointers.rst file that I changed: "Non-zero tags are not
preserved when delivering signals."), so users can reasonably rely on
it. Furthermore we simply don't have visibility into the majority of
userspace. For example, there are a lot of closed source Android apps
out there, and who knows what signal handlers they're installing and
how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
can't just change the documented semantics under their feet.

It's also not the same userspace either. The userspace that's
initially going to be consuming the new fields is in a part of the
Android system that handles and reports crashes, and that's something
that we control unlike all the apps.

Finally, the userspace may need to know whether the tag bits were
actually zero or whether they were just unavailable, otherwise
userspace could for example produce a misleading crash report. Simply
having the kernel set the top bits of si_addr wouldn't accomplish that
due to the kernel's previous behaviour, hence the mask to let
userspace know which bits are accurate.

> A fix of one instance of SIGILL should not be included with a patch that
> does something else, and really should come before everything else if
> possible.

Fair point. I can see if I can split that part out.

> If this information really belongs in struct siginfo (as it sounds like)
> please actually put the information in siginfo, and let userspace look
> in siginfo to find it.  struct siginfo is a union with plenty of space,
> and plenty of si_codes.
>
> If this applies to multiple cases then it might be trickier but please
> dig into the details, don't toss things into sigcontext just because
> you can't figure out a clean design for reporting this.

If we wanted this in siginfo, one idea that I had was to revert commit
b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
_addr_top_byte and _addr_top_byte_mask in the padding between
_addr_lsb and the union (with comments on all the fields of course to
say when they are filled in). I think that would work since we are
already clearing padding in siginfo. One nice property of the new
fields is that the zero values are correct in the case where the
information isn't being exposed (so old kernels would already have the
correct behaviour). That would only work on certain architectures
(i.e. at least alignof(void*) >= 4) so I suppose it could have an
#ifdef __aarch64__ around it.

Peter





Peter
>
> Eric
>
>
> > Peter
> >
> > [1] https://github.com/torvalds/linux/blob/b85051e755b0e9d6dd8f17ef1da083851b83287d/kernel/signal.c#L222

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-21 18:03                                 ` Peter Collingbourne
@ 2020-05-21 19:24                                   ` Eric W. Biederman
  2020-05-21 20:48                                     ` Peter Collingbourne
  2020-05-26 13:03                                     ` [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext Dave Martin
  0 siblings, 2 replies; 64+ messages in thread
From: Eric W. Biederman @ 2020-05-21 19:24 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Andrey Konovalov, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Linux ARM, Catalin Marinas, Vincenzo Frascino,
	Will Deacon, Dave Martin, Evgenii Stepanov, Richard Henderson

Peter Collingbourne <pcc@google.com> writes:

> On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> Peter Collingbourne <pcc@google.com> writes:
>>
>> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
>> >>
>> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
>> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
>> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
>> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
>> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
>> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
>> >> > > > > --- a/arch/arm64/kernel/signal.c
>> >> > > > > +++ b/arch/arm64/kernel/signal.c
>> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
>> >> > > > > rt_sigframe_user_layout *user,
>> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
>> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
>> >> > > > >                 __put_user_error(current->thread.fault_code,
>> >> > > > > &esr_ctx->esr, err);
>> >> > > > > +               current->thread.fault_code = 0;
>> >> > > >
>> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
>> >> > > > user stack before this and deliver a SIGSEGV, but with the old
>> >> > > > fault_code still set?  Then we'd emit the old fault code with the
>> >> > > > new "can't deliver signal" signal, which doesn't make sense.
>> >> > > >
>> >> > > > Stuff may also go wrong with signal prioritisation.
>> >> > > >
>> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
>> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
>> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
>> >> > > > With your change we'd then have cleared the fault code by the time we
>> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
>> >> > > >
>> >> > > > Today, I think we just attach that fault code to every signal that's
>> >> > > > delivered until something overwrites or resets it, which means that
>> >> > > > a signal that needs fault_code gets it, at the expense of attaching
>> >> > > > it to a bunch of other random signals too.
>> >> > > >
>> >> > > >
>> >> > > > Checking the signal number and si_code might help us to know what we
>> >> > > > should be doing with fault_code.  We need to have sure userspace can't
>> >> > > > trick us with a non kernel generated signal here.  It would also be
>> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
>> >> > >
>> >> > > With these possible interactions in mind I think we should store the
>> >> > > fault code and fault address in kernel_siginfo instead of
>> >> > > thread_struct (and clear these fields when we receive a siginfo from
>> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
>> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
>> >> > > information is clearly associated with the signal itself and not the
>> >> > > thread, so we don't need to worry about our signal being delivered out
>> >> > > of order.
>> >> >
>> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
>> >> > signal code. Can you?
>> >
>> > I think I've come up with a way that doesn't seem to be too invasive.
>> > See patch #1 of the series that I'm about to send out.
>> >
>> >> > But generally, I agree: the per-thread handling of fault_address and
>> >> > fault_code appears to be quite broken in the face of signal prioritisation
>> >> > and signals that don't correspond directly to hardware trap. It would be
>> >> > nice to have some tests for this...
>> >> >
>> >> > If we want to pile on more bodges, perhaps we could stash the signal number
>> >> > to which the fault_{address,code} relate, and then check that at delivery
>> >> > and clear on a match. I hate it.
>> >>
>> >> I agree with Daniel's suggestion in principle, but I was also concerned
>> >> about whether it would be too invasive elsewhere.
>> >>
>> >> Question though: does the core code take special care to make sure that
>> >> a force_sig cannot be outprioritised by a regular signal?  If so,
>> >> perhaps we get away with it.  I ask this, because the same same issue
>> >> may be hitting other arches otherwise.
>> >
>> > Not as far as I can tell. There does appear to be prioritisation for
>> > synchronous signals [1] but as far as I can tell nothing to
>> > distinguish one of these signals from one with the same signal number
>> > sent from userspace (e.g. via kill(2)).
>>
>> The si_code will differ between signals generated between userspace
>> and signals generated by the kernel.
>>
>> We do allow a little bit of ptrace and sending to yourself to spoof
>> kernel generated signals, for reasons of debugging and process migration
>> where an existing process needs to be reconstructed.  But the defenses
>> should be strong enough you can assume that we reliably distinguish
>> between a signal from userspace and a signal from the kernel.
>
> So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
> add the context in that case? Seems fragile to me, but I suppose I
> could live with it.
>
>> I don't fully follow what you are doing but this feels like the
>> kind of case where a new si_code has been defined as well as additional
>> fields in siginfo.
>
> There is no new si_code for this, the information will be exposed for
> several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
> SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
> (particularly SEGV_MTESERR, which is part of the proposed MTE patch
> set). Note that we already have a union field for BUS_MCEERR_AR, and
> we may want to expose it for the other si_codes that already have
> union fields as well.
>
> That being said, taking a closer look at siginfo, I think we are in
> luck and we might be able to make this work in a reasonable way by
> reusing padding (see below).
>
>> In your patchset I really hate that you were going back to
>> force_sig_info, and filling out struct siginfo by hand.  That is an
>> error prone pattern, and I have fixed enough bugs in the kernel to prove
>> that.
>
> To be fair, most of the callers are in helper functions that take
> explicit parameters similar to force_sig_fault et al, and the SIGILL
> one could easily be made that way as well.
>
>> I take exception to the idea that including the full address might break
>> userspace.  That means typically means someone has been too lazy to look
>> and see what userspace is doing.  When that userspace that might break
>> is the same userspace you are changing the kernel to serve that makes me
>> nervous.  AKA the userspace that cares about this signal and how it is
>> represented in siginfo.
>
> It's not a matter of being lazy. This behaviour isn't just an accident
> but has been explicitly documented for years (see the
> tagged-pointers.rst file that I changed: "Non-zero tags are not
> preserved when delivering signals."), so users can reasonably rely on
> it. Furthermore we simply don't have visibility into the majority of
> userspace. For example, there are a lot of closed source Android apps
> out there, and who knows what signal handlers they're installing and
> how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
> can't just change the documented semantics under their feet.
>
> It's also not the same userspace either. The userspace that's
> initially going to be consuming the new fields is in a part of the
> Android system that handles and reports crashes, and that's something
> that we control unlike all the apps.
>
> Finally, the userspace may need to know whether the tag bits were
> actually zero or whether they were just unavailable, otherwise
> userspace could for example produce a misleading crash report. Simply
> having the kernel set the top bits of si_addr wouldn't accomplish that
> due to the kernel's previous behaviour, hence the mask to let
> userspace know which bits are accurate.
>
>> A fix of one instance of SIGILL should not be included with a patch that
>> does something else, and really should come before everything else if
>> possible.
>
> Fair point. I can see if I can split that part out.
>
>> If this information really belongs in struct siginfo (as it sounds like)
>> please actually put the information in siginfo, and let userspace look
>> in siginfo to find it.  struct siginfo is a union with plenty of space,
>> and plenty of si_codes.
>>
>> If this applies to multiple cases then it might be trickier but please
>> dig into the details, don't toss things into sigcontext just because
>> you can't figure out a clean design for reporting this.
>
> If we wanted this in siginfo, one idea that I had was to revert commit
> b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
> _addr_top_byte and _addr_top_byte_mask in the padding between
> _addr_lsb and the union (with comments on all the fields of course to
> say when they are filled in). I think that would work since we are
> already clearing padding in siginfo, one nice property of the new
> fields is that the zero values are correct in the case where the
> information isn't being exposed (so old kernels would already have the
> correct behaviour). That would only work on certain architectures
> (i.e. at least alignof(void*) >= 4) so I suppose it could have an
> #ifdef __aarch64__ around it.

Perhaps add a 4th padding member to the union inside of _sigfault, that
adds something like 4 unsigned long's worth of data, and then have your
fields after the union.

Is it quite a bit of work to gather that information from the
instructions that faulted?  I am just checking that this work really
makes sense.

What I really don't understand is how well this problem generalizes to
other architectures to tell if this is something other people need to
solve at some point as well.

Eric

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-21 19:24                                   ` Eric W. Biederman
@ 2020-05-21 20:48                                     ` Peter Collingbourne
  2020-06-08 18:12                                       ` Peter Collingbourne
  2020-05-26 13:03                                     ` [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext Dave Martin
  1 sibling, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-05-21 20:48 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrey Konovalov, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Linux ARM, Catalin Marinas, Vincenzo Frascino,
	Will Deacon, Dave Martin, Evgenii Stepanov, Richard Henderson

On Thu, May 21, 2020 at 12:28 PM Eric W. Biederman
<ebiederm@xmission.com> wrote:
>
> Peter Collingbourne <pcc@google.com> writes:
>
> > On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >> Peter Collingbourne <pcc@google.com> writes:
> >>
> >> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> >>
> >> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> >> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> >> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> >> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> >> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> >> >> > > > > --- a/arch/arm64/kernel/signal.c
> >> >> > > > > +++ b/arch/arm64/kernel/signal.c
> >> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> >> >> > > > > rt_sigframe_user_layout *user,
> >> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> >> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> >> >> > > > >                 __put_user_error(current->thread.fault_code,
> >> >> > > > > &esr_ctx->esr, err);
> >> >> > > > > +               current->thread.fault_code = 0;
> >> >> > > >
> >> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> >> >> > > > user stack before this and deliver a SIGSEGV, but with the old
> >> >> > > > fault_code still set?  Then we'd emit the old fault code with the
> >> >> > > > new "can't deliver signal" signal, which doesn't make sense.
> >> >> > > >
> >> >> > > > Stuff may also go wrong with signal prioritisation.
> >> >> > > >
> >> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> >> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> >> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> >> >> > > > With your change we'd then have cleared the fault code by the time we
> >> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> >> >> > > >
> >> >> > > > Today, I think we just attach that fault code to every signal that's
> >> >> > > > delivered until something overwrites or resets it, which means that
> >> >> > > > a signal that needs fault_code gets it, at the expense of attaching
> >> >> > > > it to a bunch of other random signals too.
> >> >> > > >
> >> >> > > >
> >> >> > > > Checking the signal number and si_code might help us to know what we
> >> >> > > > should be doing with fault_code.  We need to have sure userspace can't
> >> >> > > > trick us with a non kernel generated signal here.  It would also be
> >> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> >> >> > >
> >> >> > > With these possible interactions in mind I think we should store the
> >> >> > > fault code and fault address in kernel_siginfo instead of
> >> >> > > thread_struct (and clear these fields when we receive a siginfo from
> >> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
> >> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> >> >> > > information is clearly associated with the signal itself and not the
> >> >> > > thread, so we don't need to worry about our signal being delivered out
> >> >> > > of order.
> >> >> >
> >> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> >> >> > signal code. Can you?
> >> >
> >> > I think I've come up with a way that doesn't seem to be too invasive.
> >> > See patch #1 of the series that I'm about to send out.
> >> >
> >> >> > But generally, I agree: the per-thread handling of fault_address and
> >> >> > fault_code appears to be quite broken in the face of signal prioritisation
> >> >> > and signals that don't correspond directly to hardware trap. It would be
> >> >> > nice to have some tests for this...
> >> >> >
> >> >> > If we want to pile on more bodges, perhaps we could stash the signal number
> >> >> > to which the fault_{address,code} relate, and then check that at delivery
> >> >> > and clear on a match. I hate it.
> >> >>
> >> >> I agree with Daniel's suggestion in principle, but I was also concerned
> >> >> about whether it would be too invasive elsewhere.
> >> >>
> >> >> Question though: does the core code take special care to make sure that
> >> >> a force_sig cannot be outprioritised by a regular signal?  If so,
> >> >> perhaps we get away with it.  I ask this, because the same same issue
> >> >> may be hitting other arches otherwise.
> >> >
> >> > Not as far as I can tell. There does appear to be prioritisation for
> >> > synchronous signals [1] but as far as I can tell nothing to
> >> > distinguish one of these signals from one with the same signal number
> >> > sent from userspace (e.g. via kill(2)).
> >>
> >> The si_code will differ between signals generated between userspace
> >> and signals generated by the kernel.
> >>
> >> We do allow a little bit of ptrace and sending to yourself to spoof
> >> kernel generated signals, for reasons of debugging and process migration
> >> where an existing process needs to be reconstructed.  But the defenses
> >> should be strong enough you can assume that we reliably distinguish
> >> between a signal from userspace and a signal from the kernel.
> >
> > So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
> > add the context in that case? Seems fragile to me, but I suppose I
> > could live with it.
> >
> >> I don't fully follow what you are doing but this feels like the
> >> kind of case where a new si_code has been defined as well as additional
> >> fields in siginfo.
> >
> > There is no new si_code for this, the information will be exposed for
> > several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
> > SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
> > (particularly SEGV_MTESERR, which is part of the proposed MTE patch
> > set). Note that we already have a union field for BUS_MCEERR_AR, and
> > we may want to expose it for the other si_codes that already have
> > union fields as well.
> >
> > That being said, taking a closer look at siginfo, I think we are in
> > luck and we might be able to make this work in a reasonable way by
> > reusing padding (see below).
> >
> >> In your patchset I really hate that you were going back to
> >> force_sig_info, and filling out struct siginfo by hand.  That is an
> >> error prone pattern, and I have fixed enough bugs in the kernel to prove
> >> that.
> >
> > To be fair, most of the callers are in helper functions that take
> > explicit parameters similar to force_sig_fault et al, and the SIGILL
> > one could easily be made that way as well.
> >
> >> I take exception to the idea that including the full address might break
> >> userspace.  That means typically means someone has been too lazy to look
> >> and see what userspace is doing.  When that userspace that might break
> >> is the same userspace you are changing the kernel to serve that makes me
> >> nervous.  AKA the userspace that cares about this signal and how it is
> >> represented in siginfo.
> >
> > It's not a matter of being lazy. This behaviour isn't just an accident
> > but has been explicitly documented for years (see the
> > tagged-pointers.rst file that I changed: "Non-zero tags are not
> > preserved when delivering signals."), so users can reasonably rely on
> > it. Furthermore we simply don't have visibility into the majority of
> > userspace. For example, there are a lot of closed source Android apps
> > out there, and who knows what signal handlers they're installing and
> > how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
> > can't just change the documented semantics under their feet.
> >
> > It's also not the same userspace either. The userspace that's
> > initially going to be consuming the new fields is in a part of the
> > Android system that handles and reports crashes, and that's something
> > that we control unlike all the apps.
> >
> > Finally, the userspace may need to know whether the tag bits were
> > actually zero or whether they were just unavailable, otherwise
> > userspace could for example produce a misleading crash report. Simply
> > having the kernel set the top bits of si_addr wouldn't accomplish that
> > due to the kernel's previous behaviour, hence the mask to let
> > userspace know which bits are accurate.
> >
> >> A fix of one instance of SIGILL should not be included with a patch that
> >> does something else, and really should come before everything else if
> >> possible.
> >
> > Fair point. I can see if I can split that part out.
> >
> >> If this information really belongs in struct siginfo (as it sounds like)
> >> please actually put the information in siginfo, and let userspace look
> >> in siginfo to find it.  struct siginfo is a union with plenty of space,
> >> and plenty of si_codes.
> >>
> >> If this applies to multiple cases then it might be trickier but please
> >> dig into the details, don't toss things into sigcontext just because
> >> you can't figure out a clean design for reporting this.
> >
> > If we wanted this in siginfo, one idea that I had was to revert commit
> > b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
> > _addr_top_byte and _addr_top_byte_mask in the padding between
> > _addr_lsb and the union (with comments on all the fields of course to
> > say when they are filled in). I think that would work since we are
> > already clearing padding in siginfo, one nice property of the new
> > fields is that the zero values are correct in the case where the
> > information isn't being exposed (so old kernels would already have the
> > correct behaviour). That would only work on certain architectures
> > (i.e. at least alignof(void*) >= 4) so I suppose it could have an
> > #ifdef __aarch64__ around it.
>
> Perhaps add a 4th padding member to the union inside of _sigfault, that
> adds something like 4 unsigned long's worth of data, and then have your
> fields after the union.

Maybe. I guess we could always add another union after my fields if we
end up needing another union member that is larger than the 4 unsigned
longs, which would be ugly but at least it would work. Reusing the
padding would avoid that but maybe it's not that likely that we'll
need that much.

> Is it quite a bit of work to gather that information from the
> instructions that faulted?  I am just checking that this work is really
> makes sense.

I think so. At a glance there are hundreds of load and store
instructions on arm64 and we would need to know how to disassemble all
of them and recompute the si_addr from scratch (since the tag bits
could come from any of the registers used to compute the address). And
we really don't want to be doing this tricky stuff in a signal handler
where we've just crashed.

> What I really don't understand is how well this problem generalizes to
> other architectures to tell if this is something other people need to
> solve at some point as well.

An architecture with a feature similar to ARM's TBI or MTE may need
something like this as well, depending on whether they decide to
expose the tag bits in si_addr from the start (and if the feature is
similar to TBI it certainly seems like a reasonable choice to follow
arm64 for compatibility reasons). I would imagine that the main thing
that could vary between architectures is the number of bits involved,
which suggests making the fields arch-specific (or making them larger,
but that may be wasteful).

The only other architecture that I'm aware of with such a feature is
SPARC (whose ADI is similar to MTE). The documentation [1] seems to
suggest that the tag bits are available in si_addr but isn't very
specific.

Peter

[1] https://www.kernel.org/doc/Documentation/sparc/adi.rst

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-21 19:24                                   ` Eric W. Biederman
  2020-05-21 20:48                                     ` Peter Collingbourne
@ 2020-05-26 13:03                                     ` Dave Martin
  1 sibling, 0 replies; 64+ messages in thread
From: Dave Martin @ 2020-05-26 13:03 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Will Deacon, Andrey Konovalov, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Evgenii Stepanov, Catalin Marinas,
	Vincenzo Frascino, Peter Collingbourne, Linux ARM,
	Richard Henderson

On Thu, May 21, 2020 at 02:24:45PM -0500, Eric W. Biederman wrote:
> Peter Collingbourne <pcc@google.com> writes:
> 
> > On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >> Peter Collingbourne <pcc@google.com> writes:
> >>
> >> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> >>
> >> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> >> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> >> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> >> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> >> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> >> >> > > > > --- a/arch/arm64/kernel/signal.c
> >> >> > > > > +++ b/arch/arm64/kernel/signal.c
> >> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> >> >> > > > > rt_sigframe_user_layout *user,
> >> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> >> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> >> >> > > > >                 __put_user_error(current->thread.fault_code,
> >> >> > > > > &esr_ctx->esr, err);
> >> >> > > > > +               current->thread.fault_code = 0;
> >> >> > > >
> >> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> >> >> > > > user stack before this and deliver a SIGSEGV, but with the old
> >> >> > > > fault_code still set?  Then we'd emit the old fault code with the
> >> >> > > > new "can't deliver signal" signal, which doesn't make sense.
> >> >> > > >
> >> >> > > > Stuff may also go wrong with signal prioritisation.
> >> >> > > >
> >> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> >> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> >> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> >> >> > > > With your change we'd then have cleared the fault code by the time we
> >> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> >> >> > > >
> >> >> > > > Today, I think we just attach that fault code to every signal that's
> >> >> > > > delivered until something overwrites or resets it, which means that
> >> >> > > > a signal that needs fault_code gets it, at the expense of attaching
> >> >> > > > it to a bunch of other random signals too.
> >> >> > > >
> >> >> > > >
> >> >> > > > Checking the signal number and si_code might help us to know what we
> >> >> > > > should be doing with fault_code.  We need to have sure userspace can't
> >> >> > > > trick us with a non kernel generated signal here.  It would also be
> >> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> >> >> > >
> >> >> > > With these possible interactions in mind I think we should store the
> >> >> > > fault code and fault address in kernel_siginfo instead of
> >> >> > > thread_struct (and clear these fields when we receive a siginfo from
> >> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
> >> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> >> >> > > information is clearly associated with the signal itself and not the
> >> >> > > thread, so we don't need to worry about our signal being delivered out
> >> >> > > of order.
> >> >> >
> >> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> >> >> > signal code. Can you?
> >> >
> >> > I think I've come up with a way that doesn't seem to be too invasive.
> >> > See patch #1 of the series that I'm about to send out.
> >> >
> >> >> > But generally, I agree: the per-thread handling of fault_address and
> >> >> > fault_code appears to be quite broken in the face of signal prioritisation
> >> >> > and signals that don't correspond directly to hardware trap. It would be
> >> >> > nice to have some tests for this...
> >> >> >
> >> >> > If we want to pile on more bodges, perhaps we could stash the signal number
> >> >> > to which the fault_{address,code} relate, and then check that at delivery
> >> >> > and clear on a match. I hate it.
> >> >>
> >> >> I agree with Daniel's suggestion in principle, but I was also concerned
> >> >> about whether it would be too invasive elsewhere.
> >> >>
> >> >> Question though: does the core code take special care to make sure that
> >> >> a force_sig cannot be outprioritised by a regular signal?  If so,
> >> >> perhaps we get away with it.  I ask this, because the same same issue
> >> >> may be hitting other arches otherwise.
> >> >
> >> > Not as far as I can tell. There does appear to be prioritisation for
> >> > synchronous signals [1] but as far as I can tell nothing to
> >> > distinguish one of these signals from one with the same signal number
> >> > sent from userspace (e.g. via kill(2)).
> >>
> >> The si_code will differ between signals generated between userspace
> >> and signals generated by the kernel.
> >>
> >> We do allow a little bit of ptrace and sending to yourself to spoof
> >> kernel generated signals, for reasons of debugging and process migration
> >> where an existing process needs to be reconstructed.  But the defenses
> >> should be strong enough you can assume that we reliably distinguish
> >> between a signal from userspace and a signal from the kernel.
> >
> > So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
> > add the context in that case? Seems fragile to me, but I suppose I
> > could live with it.
> >
> >> I don't fully follow what you are doing but this feels like the
> >> kind of case where a new si_code has been defined as well as additional
> >> fields in siginfo.
> >
> > There is no new si_code for this, the information will be exposed for
> > several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
> > SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
> > (particularly SEGV_MTESERR, which is part of the proposed MTE patch
> > set). Note that we already have a union field for BUS_MCEERR_AR, and
> > we may want to expose it for the other si_codes that already have
> > union fields as well.
> >
> > That being said, taking a closer look at siginfo, I think we are in
> > luck and we might be able to make this work in a reasonable way by
> > reusing padding (see below).
> >
> >> In your patchset I really hate that you were going back to
> >> force_sig_info, and filling out struct siginfo by hand.  That is an
> >> error prone pattern, and I have fixed enough bugs in the kernel to prove
> >> that.
> >
> > To be fair, most of the callers are in helper functions that take
> > explicit parameters similar to force_sig_fault et al, and the SIGILL
> > one could easily be made that way as well.
> >
> >> I take exception to the idea that including the full address might break
> >> userspace.  That means typically means someone has been too lazy to look
> >> and see what userspace is doing.  When that userspace that might break
> >> is the same userspace you are changing the kernel to serve that makes me
> >> nervous.  AKA the userspace that cares about this signal and how it is
> >> represented in siginfo.
> >
> > It's not a matter of being lazy. This behaviour isn't just an accident
> > but has been explicitly documented for years (see the
> > tagged-pointers.rst file that I changed: "Non-zero tags are not
> > preserved when delivering signals."), so users can reasonably rely on
> > it. Furthermore we simply don't have visibility into the majority of
> > userspace. For example, there are a lot of closed source Android apps
> > out there, and who knows what signal handlers they're installing and
> > how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
> > can't just change the documented semantics under their feet.
> >
> > It's also not the same userspace either. The userspace that's
> > initially going to be consuming the new fields is in a part of the
> > Android system that handles and reports crashes, and that's something
> > that we control unlike all the apps.
> >
> > Finally, the userspace may need to know whether the tag bits were
> > actually zero or whether they were just unavailable, otherwise
> > userspace could for example produce a misleading crash report. Simply
> > having the kernel set the top bits of si_addr wouldn't accomplish that
> > due to the kernel's previous behaviour, hence the mask to let
> > userspace know which bits are accurate.
> >
> >> A fix of one instance of SIGILL should not be included with a patch that
> >> does something else, and really should come before everything else if
> >> possible.
> >
> > Fair point. I can see if I can split that part out.
> >
> >> If this information really belongs in struct siginfo (as it sounds like)
> >> please actually put the information in siginfo, and let userspace look
> >> in siginfo to find it.  struct siginfo is a union with plenty of space,
> >> and plenty of si_codes.
> >>
> >> If this applies to multiple cases then it might be trickier but please
> >> dig into the details, don't toss things into sigcontext just because
> >> you can't figure out a clean design for reporting this.
> >
> > If we wanted this in siginfo, one idea that I had was to revert commit
> > b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
> > _addr_top_byte and _addr_top_byte_mask in the padding between
> > _addr_lsb and the union (with comments on all the fields of course to
> > say when they are filled in). I think that would work since we are
> > already clearing padding in siginfo, one nice property of the new
> > fields is that the zero values are correct in the case where the
> > information isn't being exposed (so old kernels would already have the
> > correct behaviour). That would only work on certain architectures
> > (i.e. at least alignof(void*) >= 4) so I suppose it could have an
> > #ifdef __aarch64__ around it.
> 
> Perhaps add a 4th padding member to the union inside of _sigfault, that
> adds something like 4 unsigned long's worth of data, and then have your
> fields after the union.
> 
> Is it quite a bit of work to gather that information from the
> instructions that faulted?  I am just checking that this work is really
> makes sense.
> 
> What I really don't understand is how well this problem generalizes to
> other architectures to tell if this is something other people need to
> solve at some point as well.

The broad issue here is how arch-specific fault diagnostics make it into
the signal frame, and whether this is needed at all.

The address tag bits are one case, but the same basic mechanism is also
used to report the type of failed access (read versus write) for
SIGSEGV on arm64.  (IIRC qemu relies on this for tracking page use /
dirtiness in userspace.)

Having a way to associate arch metadata of this sort with the
specific signal it relates to seems a good idea.  That way, we're not
relying on internal details of the signal common code such as the
precise order signals get delivered in.

This concept is certainly applicable to other arches, but I don't know
the extent to which they actually depend on it.


Ideally, there would be a si_flags field to add simple arch-specific
attributes in, but there seems to be no backwards-compatible way to add
such a thing for existing signals.  (Or is there?)

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext
  2020-05-21 20:48                                     ` Peter Collingbourne
@ 2020-06-08 18:12                                       ` Peter Collingbourne
  2020-06-08 18:14                                         ` [PATCH v7] arm64: Expose FAR_EL1 tag bits in siginfo Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-06-08 18:12 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrey Konovalov, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Linux ARM, Catalin Marinas, Vincenzo Frascino,
	Will Deacon, Dave Martin, Evgenii Stepanov, Richard Henderson

On Thu, May 21, 2020 at 1:48 PM Peter Collingbourne <pcc@google.com> wrote:
>
> On Thu, May 21, 2020 at 12:28 PM Eric W. Biederman
> <ebiederm@xmission.com> wrote:
> >
> > Peter Collingbourne <pcc@google.com> writes:
> >
> > > On Thu, May 21, 2020 at 5:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> > >>
> > >> Peter Collingbourne <pcc@google.com> writes:
> > >>
> > >> > On Wed, May 20, 2020 at 2:26 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >> >>
> > >> >> On Wed, May 20, 2020 at 09:55:03AM +0100, Will Deacon wrote:
> > >> >> > On Tue, May 19, 2020 at 03:00:12PM -0700, Peter Collingbourne wrote:
> > >> >> > > On Mon, May 18, 2020 at 2:53 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >> >> > > > On Thu, May 14, 2020 at 05:58:21PM -0700, Peter Collingbourne wrote:
> > >> >> > > > > diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> > >> >> > > > > index baa88dc02e5c..5867f2fdbe64 100644
> > >> >> > > > > --- a/arch/arm64/kernel/signal.c
> > >> >> > > > > +++ b/arch/arm64/kernel/signal.c
> > >> >> > > > > @@ -648,6 +648,7 @@ static int setup_sigframe(struct
> > >> >> > > > > rt_sigframe_user_layout *user,
> > >> >> > > > >                 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
> > >> >> > > > >                 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
> > >> >> > > > >                 __put_user_error(current->thread.fault_code,
> > >> >> > > > > &esr_ctx->esr, err);
> > >> >> > > > > +               current->thread.fault_code = 0;
> > >> >> > > >
> > >> >> > > > Perhaps, but we'd need to be careful.  For example, can we run out of
> > >> >> > > > user stack before this and deliver a SIGSEGV, but with the old
> > >> >> > > > fault_code still set?  Then we'd emit the old fault code with the
> > >> >> > > > new "can't deliver signal" signal, which doesn't make sense.
> > >> >> > > >
> > >> >> > > > Stuff may also go wrong with signal prioritisation.
> > >> >> > > >
> > >> >> > > > If a higher-priority signal (say SIGINT) comes in after a data abort
> > >> >> > > > enters the kernel but before the resulting SIGSEGV is dequeued for
> > >> >> > > > delivery, wouldn't we deliver SIGINT first, with the bogus fault code?
> > >> >> > > > With your change we'd then have cleared the fault code by the time we
> > >> >> > > > deliver the SIGSEGV it actually relates to, if I've understood right.
> > >> >> > > >
> > >> >> > > > Today, I think we just attach that fault code to every signal that's
> > >> >> > > > delivered until something overwrites or resets it, which means that
> > >> >> > > > a signal that needs fault_code gets it, at the expense of attaching
> > >> >> > > > it to a bunch of other random signals too.
> > >> >> > > >
> > >> >> > > >
> > >> >> > > > Checking the signal number and si_code might help us to know what we
> > >> >> > > > should be doing with fault_code.  We need to have sure userspace can't
> > >> >> > > > trick us with a non kernel generated signal here.  It would also be
> > >> >> > > > necessary to check how PTRACE_SETSIGINFO interacts with this.
> > >> >> > >
> > >> >> > > With these possible interactions in mind I think we should store the
> > >> >> > > fault code and fault address in kernel_siginfo instead of
> > >> >> > > thread_struct (and clear these fields when we receive a siginfo from
> > >> >> > > userspace, i.e. in copy_siginfo_from_user which is used by
> > >> >> > > ptrace(PTRACE_SETSIGINFO) among other places). That way, the
> > >> >> > > information is clearly associated with the signal itself and not the
> > >> >> > > thread, so we don't need to worry about our signal being delivered out
> > >> >> > > of order.
> > >> >> >
> > >> >> > Hmm, I can't see a way to do that that isn't horribly invasive in the core
> > >> >> > signal code. Can you?
> > >> >
> > >> > I think I've come up with a way that doesn't seem to be too invasive.
> > >> > See patch #1 of the series that I'm about to send out.
> > >> >
> > >> >> > But generally, I agree: the per-thread handling of fault_address and
> > >> >> > fault_code appears to be quite broken in the face of signal prioritisation
> > >> >> > and signals that don't correspond directly to hardware trap. It would be
> > >> >> > nice to have some tests for this...
> > >> >> >
> > >> >> > If we want to pile on more bodges, perhaps we could stash the signal number
> > >> >> > to which the fault_{address,code} relate, and then check that at delivery
> > >> >> > and clear on a match. I hate it.
> > >> >>
> > >> >> I agree with Daniel's suggestion in principle, but I was also concerned
> > >> >> about whether it would be too invasive elsewhere.
> > >> >>
> > >> >> Question though: does the core code take special care to make sure that
> > >> >> a force_sig cannot be outprioritised by a regular signal?  If so,
> > >> >> perhaps we get away with it.  I ask this, because the same same issue
> > >> >> may be hitting other arches otherwise.
> > >> >
> > >> > Not as far as I can tell. There does appear to be prioritisation for
> > >> > synchronous signals [1] but as far as I can tell nothing to
> > >> > distinguish one of these signals from one with the same signal number
> > >> > sent from userspace (e.g. via kill(2)).
> > >>
> > >> The si_code will differ between signals generated between userspace
> > >> and signals generated by the kernel.
> > >>
> > >> We do allow a little bit of ptrace and sending to yourself to spoof
> > >> kernel generated signals, for reasons of debugging and process migration
> > >> where an existing process needs to be reconstructed.  But the defenses
> > >> should be strong enough you can assume that we reliably distinguish
> > >> between a signal from userspace and a signal from the kernel.
> > >
> > > So check for SIGBUS || SIGSEGV and one of the below si_codes, and only
> > > add the context in that case? Seems fragile to me, but I suppose I
> > > could live with it.
> > >
> > >> I don't fully follow what you are doing but this feels like the
> > >> kind of case where a new si_code has been defined as well as additional
> > >> fields in siginfo.
> > >
> > > There is no new si_code for this, the information will be exposed for
> > > several existing si_code types (BUS_ADRERR, BUS_ADRALN, BUS_MCEERR_AR,
> > > SEGV_ACCERR, SEGV_MAPERR), and possibly others in the future
> > > (particularly SEGV_MTESERR, which is part of the proposed MTE patch
> > > set). Note that we already have a union field for BUS_MCEERR_AR, and
> > > we may want to expose it for the other si_codes that already have
> > > union fields as well.
> > >
> > > That being said, taking a closer look at siginfo, I think we are in
> > > luck and we might be able to make this work in a reasonable way by
> > > reusing padding (see below).
> > >
> > >> In your patchset I really hate that you were going back to
> > >> force_sig_info, and filling out struct siginfo by hand.  That is an
> > >> error prone pattern, and I have fixed enough bugs in the kernel to prove
> > >> that.
> > >
> > > To be fair, most of the callers are in helper functions that take
> > > explicit parameters similar to force_sig_fault et al, and the SIGILL
> > > one could easily be made that way as well.
> > >
> > >> I take exception to the idea that including the full address might break
> > >> userspace.  That means typically means someone has been too lazy to look
> > >> and see what userspace is doing.  When that userspace that might break
> > >> is the same userspace you are changing the kernel to serve that makes me
> > >> nervous.  AKA the userspace that cares about this signal and how it is
> > >> represented in siginfo.
> > >
> > > It's not a matter of being lazy. This behaviour isn't just an accident
> > > but has been explicitly documented for years (see the
> > > tagged-pointers.rst file that I changed: "Non-zero tags are not
> > > preserved when delivering signals."), so users can reasonably rely on
> > > it. Furthermore we simply don't have visibility into the majority of
> > > userspace. For example, there are a lot of closed source Android apps
> > > out there, and who knows what signal handlers they're installing and
> > > how they're making use of the si_addr field on e.g. SEGV_MAPERR. We
> > > can't just change the documented semantics under their feet.
> > >
> > > It's also not the same userspace either. The userspace that's
> > > initially going to be consuming the new fields is in a part of the
> > > Android system that handles and reports crashes, and that's something
> > > that we control unlike all the apps.
> > >
> > > Finally, the userspace may need to know whether the tag bits were
> > > actually zero or whether they were just unavailable, otherwise
> > > userspace could for example produce a misleading crash report. Simply
> > > having the kernel set the top bits of si_addr wouldn't accomplish that
> > > due to the kernel's previous behaviour, hence the mask to let
> > > userspace know which bits are accurate.
> > >
> > >> A fix of one instance of SIGILL should not be included with a patch that
> > >> does something else, and really should come before everything else if
> > >> possible.
> > >
> > > Fair point. I can see if I can split that part out.
> > >
> > >> If this information really belongs in struct siginfo (as it sounds like)
> > >> please actually put the information in siginfo, and let userspace look
> > >> in siginfo to find it.  struct siginfo is a union with plenty of space,
> > >> and plenty of si_codes.
> > >>
> > >> If this applies to multiple cases then it might be trickier but please
> > >> dig into the details, don't toss things into sigcontext just because
> > >> you can't figure out a clean design for reporting this.
> > >
> > > If we wanted this in siginfo, one idea that I had was to revert commit
> > > b68a68d3dcc15ebbf23cbe91af1abf57591bd96b and add unsigned char fields
> > > _addr_top_byte and _addr_top_byte_mask in the padding between
> > > _addr_lsb and the union (with comments on all the fields of course to
> > > say when they are filled in). I think that would work since we are
> > > already clearing padding in siginfo, one nice property of the new
> > > fields is that the zero values are correct in the case where the
> > > information isn't being exposed (so old kernels would already have the
> > > correct behaviour). That would only work on certain architectures
> > > (i.e. at least alignof(void*) >= 4) so I suppose it could have an
> > > #ifdef __aarch64__ around it.
> >
> > Perhaps add a 4th padding member to the union inside of _sigfault, that
> > adds something like 4 unsigned long's worth of data, and then have your
> > fields after the union.
>
> Maybe. I guess we could always add another union after my fields if we
> end up needing another union member that is larger than the 4 unsigned
> longs, which would be ugly but at least it would work. Reusing the
> padding would avoid that but maybe it's not that likely that we'll
> need that much.

In the interests of getting the discussion on this started again I'm
sending a v7 which moves the fields into the padding bytes after
si_addr_lsb. It should be easy to switch to another location in
siginfo if you don't like this one.

> > Is it quite a bit of work to gather that information from the
> > instructions that faulted?  I am just checking that this work is really
> > makes sense.
>
> I think so. At a glance there are hundreds of load and store
> instructions on arm64 and we would need to know how to disassemble all
> of them and recompute the si_addr from scratch (since the tag bits
> could come from any of the registers used to compute the address). And
> we really don't want to be doing this tricky stuff in a signal handler
> where we've just crashed.
>
> > What I really don't understand is how well this problem generalizes to
> > other architectures to tell if this is something other people need to
> > solve at some point as well.
>
> An architecture with a feature similar to ARM's TBI or MTE may need
> something like this as well, depending on whether they decide to
> expose the tag bits in si_addr from the start (and if the feature is
> similar to TBI it certainly seems like a reasonable choice to follow
> arm64 for compatibility reasons). I would imagine that the main thing
> that could vary between architectures is the number of bits involved,
> which suggests making the fields arch-specific (or making them larger,
> but that may be wasteful).

I made the new fields arch-specific given the points that I made above.

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* [PATCH v7] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-06-08 18:12                                       ` Peter Collingbourne
@ 2020-06-08 18:14                                         ` Peter Collingbourne
       [not found]                                           ` <20200623020134.16655-1-pcc@google.com>
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-06-08 18:14 UTC (permalink / raw)
  To: Catalin Marinas, Evgenii Stepanov, Kostya Serebryany,
	Vincenzo Frascino, Dave Martin, Will Deacon, Oleg Nesterov,
	Eric W. Biederman
  Cc: Andrey Konovalov, Kevin Brodsky, Peter Collingbourne, Linux ARM,
	Richard Henderson

The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
address exposed via siginfo.si_addr and sigcontext.fault_address. However,
the tag bits may be needed by tools in order to accurately diagnose
memory errors, such as HWASan [1] or future tools based on the Memory
Tagging Extension (MTE).

We should not stop clearing these bits in the existing fault address
fields, because there may be existing userspace applications that
are expecting the tag bits to be cleared. Instead, create a new
aarch64-specific union field in siginfo, and store the tag bits of
FAR_EL1 there, together with a mask specifying which bits are valid.

The new fields are laid out in a part of siginfo that is currently
unused: until now it has served only as padding between si_addr_lsb
and the union. Existing kernels will zero-initialize the padding,
setting both fields to 0, which is a valid value for the fields.

[1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html

Signed-off-by: Peter Collingbourne <pcc@google.com>
---
v7:
- switch to a new siginfo field instead of using sigcontext
- merge the patch back into one since the other patches are now
  unnecessary

v6:
- move fault address and fault code into the kernel_siginfo data structure
- split the patch in three since it was getting large and now has
  generic and arch-specific parts

v5:
- add padding to fault_addr_top_byte_context in order to ensure the correct
  size and preserve sp alignment

v4:
- expose only the tag bits in the context instead of the entire FAR_EL1
- remove mention of the new context from the sigcontext.__reserved[] note

v3:
- add documentation to tagged-pointers.rst
- update comments in sigcontext.h

v2:
- revert changes to hw_breakpoint.c
- rename set_thread_esr to set_thread_far_esr
 Documentation/arm64/tagged-pointers.rst | 17 +++++---
 arch/arm64/include/asm/exception.h      |  2 +-
 arch/arm64/include/asm/traps.h          |  7 +++-
 arch/arm64/kernel/debug-monitors.c      |  4 +-
 arch/arm64/kernel/entry-common.c        |  2 -
 arch/arm64/kernel/ptrace.c              |  2 +-
 arch/arm64/kernel/traps.c               | 35 ++++++++++++----
 arch/arm64/mm/fault.c                   | 54 ++++++++++++++-----------
 include/uapi/asm-generic/siginfo.h      | 12 ++++++
 9 files changed, 90 insertions(+), 45 deletions(-)

diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
index eab4323609b9..f1880ed5cdf2 100644
--- a/Documentation/arm64/tagged-pointers.rst
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -53,12 +53,17 @@ visibility.
 Preserving tags
 ---------------
 
-Non-zero tags are not preserved when delivering signals. This means that
-signal handlers in applications making use of tags cannot rely on the
-tag information for user virtual addresses being maintained for fields
-inside siginfo_t. One exception to this rule is for signals raised in
-response to watchpoint debug exceptions, where the tag information will
-be preserved.
+Non-zero tags are not preserved in the fault address fields
+siginfo.si_addr or sigcontext.fault_address when delivering
+signals. This means that signal handlers in applications making use
+of tags cannot rely on the tag information for user virtual addresses
+being maintained in these fields. One exception to this rule is for
+signals raised in response to watchpoint debug exceptions, where the
+tag information will be preserved.
+
+The fault address tag is preserved in the si_addr_top_byte field of
+siginfo, which is set for signals raised in response to data aborts
+and instruction aborts.
 
 The architecture prevents the use of a tagged PC, so the upper byte will
 be set to a sign-extension of bit 55 on exception return.
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..90e772d9b2cd 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -32,7 +32,7 @@ static inline u32 disr_to_esr(u64 disr)
 }
 
 asmlinkage void enter_from_user_mode(void);
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index cee5928e1b7d..8e4f6c5b97af 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -26,8 +26,11 @@ void register_undef_hook(struct undef_hook *hook);
 void unregister_undef_hook(struct undef_hook *hook);
 void force_signal_inject(int signal, int code, unsigned long address);
 void arm64_notify_segfault(unsigned long addr);
-void arm64_force_sig_fault(int signo, int code, void __user *addr, const char *str);
-void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, const char *str);
+void arm64_force_sig_fault(int signo, int code, void __user *addr,
+			   unsigned long far, unsigned char far_tb_mask,
+			   const char *str);
+void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
+			    unsigned long far, const char *str);
 void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, const char *str);
 
 /*
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 48222a4760c2..498e6393b2ca 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -232,8 +232,8 @@ static void send_user_sigtrap(int si_code)
 		local_irq_enable();
 
 	arm64_force_sig_fault(SIGTRAP, si_code,
-			     (void __user *)instruction_pointer(regs),
-			     "User debug trap");
+			      (void __user *)instruction_pointer(regs), 0, 0,
+			      "User debug trap");
 }
 
 static int single_step_handler(unsigned long unused, unsigned int esr,
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index c839b5bf1904..045b4f518836 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -22,7 +22,6 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 
 	local_daif_inherit(regs);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el1_abort);
@@ -104,7 +103,6 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr)
 
 	user_exit_irqoff();
 	local_daif_restore(DAIF_PROCCTX);
-	far = untagged_addr(far);
 	do_mem_abort(far, esr, regs);
 }
 NOKPROBE_SYMBOL(el0_da);
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index b3d3005d9515..51bb8bcaf24b 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -198,7 +198,7 @@ static void ptrace_hbptriggered(struct perf_event *bp,
 	}
 #endif
 	arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT,
-			      (void __user *)(bkpt->trigger),
+			      (void __user *)(bkpt->trigger), 0, 0,
 			      desc);
 }
 
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index cf402be5c573..53ddeb8bde0b 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -234,20 +234,41 @@ static void arm64_show_signal(int signo, const char *str)
 }
 
 void arm64_force_sig_fault(int signo, int code, void __user *addr,
+			   unsigned long far, unsigned char far_tb_mask,
 			   const char *str)
 {
 	arm64_show_signal(signo, str);
-	if (signo == SIGKILL)
+	if (signo == SIGKILL) {
 		force_sig(SIGKILL);
-	else
-		force_sig_fault(signo, code, addr);
+	} else {
+		struct kernel_siginfo info;
+		clear_siginfo(&info);
+		info.si_signo = signo;
+		info.si_errno = 0;
+		info.si_code = code;
+		info.si_addr = addr;
+		info.si_addr_top_byte = (far >> 56) & far_tb_mask;
+		info.si_addr_top_byte_mask = far_tb_mask;
+		force_sig_info(&info);
+	}
 }
 
 void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
-			    const char *str)
+			    unsigned long far, const char *str)
 {
+	struct kernel_siginfo info;
+
 	arm64_show_signal(SIGBUS, str);
-	force_sig_mceerr(code, addr, lsb);
+
+	clear_siginfo(&info);
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = code;
+	info.si_addr = addr;
+	info.si_addr_lsb = lsb;
+	info.si_addr_top_byte = far >> 56;
+	info.si_addr_top_byte_mask = 0xff;
+	force_sig_info(&info);
 }
 
 void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr,
@@ -266,7 +287,7 @@ void arm64_notify_die(const char *str, struct pt_regs *regs,
 		current->thread.fault_address = 0;
 		current->thread.fault_code = err;
 
-		arm64_force_sig_fault(signo, sicode, addr, str);
+		arm64_force_sig_fault(signo, sicode, addr, 0, 0, str);
 	} else {
 		die(str, regs, err);
 	}
@@ -816,7 +837,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 	current->thread.fault_address = 0;
 	current->thread.fault_code = esr;
 
-	arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc,
+	arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc, 0, 0,
 			      "Bad EL0 synchronous exception");
 }
 
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c9cedc0432d2..724e896674e6 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -41,7 +41,7 @@
 #include <asm/traps.h>
 
 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -377,8 +377,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr)
 	current->thread.fault_code = esr;
 }
 
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
@@ -388,7 +391,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 
 		set_thread_esr(addr, esr);
 		arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr,
-				      inf->name);
+				      far, 0xff, inf->name);
 	} else {
 		__do_kernel_fault(addr, esr, regs);
 	}
@@ -439,7 +442,7 @@ static bool is_write_abort(unsigned int esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
 	const struct fault_info *inf;
@@ -447,6 +450,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	vm_fault_t fault, major = 0;
 	unsigned long vm_flags = VM_ACCESS_FLAGS;
 	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+	unsigned long addr = untagged_addr(far);
 
 	if (kprobe_page_fault(regs, esr))
 		return 0;
@@ -577,7 +581,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		 * this page fault.
 		 */
 		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr,
-				      inf->name);
+				      far, 0xff, inf->name);
 	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
 		unsigned int lsb;
 
@@ -586,7 +590,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
 
 		arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb,
-				       inf->name);
+				       far, inf->name);
 	} else {
 		/*
 		 * Something tried to access memory that isn't in our memory
@@ -594,8 +598,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		 */
 		arm64_force_sig_fault(SIGSEGV,
 				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
-				      (void __user *)addr,
-				      inf->name);
+				      (void __user *)addr, far, 0xff, inf->name);
 	}
 
 	return 0;
@@ -605,30 +608,32 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	return 0;
 }
 
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+		return do_page_fault(far, esr, regs);
 
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	return 1; /* "fault" */
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf;
 	void __user *siaddr;
@@ -644,7 +649,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 	if (esr & ESR_ELx_FnV)
 		siaddr = NULL;
 	else
-		siaddr  = (void __user *)addr;
+		siaddr  = (void __user *)untagged_addr(far);
 	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
 	return 0;
@@ -717,11 +722,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 };
 
-void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {
@@ -730,8 +736,8 @@ void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 		show_pte(addr);
 	}
 
-	arm64_notify_die(inf->name, regs,
-			 inf->sig, inf->code, (void __user *)addr, esr);
+	arm64_notify_die(inf->name, regs, inf->sig, inf->code,
+			 (void __user *)addr, esr);
 }
 NOKPROBE_SYMBOL(do_mem_abort);
 
@@ -744,8 +750,8 @@ NOKPROBE_SYMBOL(do_el0_irq_bp_hardening);
 
 void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
-	arm64_notify_die("SP/PC alignment exception", regs,
-			 SIGBUS, BUS_ADRALN, (void __user *)addr, esr);
+	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
+			 (void __user *)addr, esr);
 }
 NOKPROBE_SYMBOL(do_sp_pc_abort);
 
@@ -871,8 +877,8 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
 		arm64_apply_bp_hardening();
 
 	if (inf->fn(addr_if_watchpoint, esr, regs)) {
-		arm64_notify_die(inf->name, regs,
-				 inf->sig, inf->code, (void __user *)pc, esr);
+		arm64_notify_die(inf->name, regs, inf->sig, inf->code,
+				 (void __user *)pc, esr);
 	}
 
 	debug_exception_exit(regs);
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index cb3d6c267181..6dd82373eb2d 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -91,6 +91,14 @@ union __sifields {
 				char _dummy_pkey[__ADDR_BND_PKEY_PAD];
 				__u32 _pkey;
 			} _addr_pkey;
+#ifdef __aarch64__
+			/* used with all si_codes */
+			struct {
+				short _dummy_top_byte;
+				unsigned char _top_byte;
+				unsigned char _top_byte_mask;
+			} _addr_top_byte;
+#endif
 		};
 	} _sigfault;
 
@@ -148,6 +156,10 @@ typedef struct siginfo {
 #define si_int		_sifields._rt._sigval.sival_int
 #define si_ptr		_sifields._rt._sigval.sival_ptr
 #define si_addr		_sifields._sigfault._addr
+#ifdef __aarch64__
+#define si_addr_top_byte	_sifields._sigfault._addr_top_byte._top_byte
+#define si_addr_top_byte_mask	_sifields._sigfault._addr_top_byte._top_byte_mask
+#endif
 #ifdef __ARCH_SI_TRAPNO
 #define si_trapno	_sifields._sigfault._trapno
 #endif
-- 
2.27.0.278.ge193c7cf3a9-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
       [not found]                                             ` <87sgemrlgc.fsf@x220.int.ebiederm.org>
@ 2020-06-23 14:38                                               ` Dave Martin
  2020-06-23 17:47                                                 ` Eric W. Biederman
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-06-23 14:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Will Deacon, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Evgenii Stepanov, Andrey Konovalov,
	Vincenzo Frascino, Peter Collingbourne, Linux ARM,
	Richard Henderson

On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> Peter Collingbourne <pcc@google.com> writes:
> 
> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > index 47f651df781c..a8380a2b6361 100644
> > --- a/arch/arm64/kernel/traps.c
> > +++ b/arch/arm64/kernel/traps.c
> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> >  }
> >  
> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > +			   unsigned long far, unsigned char far_tb_mask,
> >  			   const char *str)
> >  {
> >  	arm64_show_signal(signo, str);
> > -	if (signo == SIGKILL)
> > +	if (signo == SIGKILL) {
> >  		force_sig(SIGKILL);
> > -	else
> > -		force_sig_fault(signo, code, addr);
> > +	} else {
> > +		struct kernel_siginfo info;
> > +		clear_siginfo(&info);
> > +		info.si_signo = signo;
> > +		info.si_errno = 0;
> > +		info.si_code = code;
> > +		info.si_addr = addr;
> > +		info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > +		info.si_addr_top_byte_mask = far_tb_mask;
> > +		force_sig_info(&info);
> > +	}
> >  }
> >  
> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > -			    const char *str)
> > +			    unsigned long far, const char *str)
> >  {
> > +	struct kernel_siginfo info;
> > +
> >  	arm64_show_signal(SIGBUS, str);
> > -	force_sig_mceerr(code, addr, lsb);
> > +
> > +	clear_siginfo(&info);
> > +	info.si_signo = SIGBUS;
> > +	info.si_errno = 0;
> > +	info.si_code = code;
> > +	info.si_addr = addr;
> > +	info.si_addr_lsb = lsb;
> > +	info.si_addr_top_byte = far >> 56;
> > +	info.si_addr_top_byte_mask = 0xff;
> > +	force_sig_info(&info);
> >  }
> 
> I have a real problem with this construction.  force_sig_info is not an
> interface that should be used for anything except to define a wrapper
> that takes it's parameters.

Can you elaborate?  How would you do this kind of thing?

AIUI we absolutely need a forced signal here, we need to supply
metadata, and we don't have to open-code all that at every relevant
signal generation site...

> It is not clear to me that if you have adapted siginfo_layout.

Garbled sentence?

> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > index cb3d6c267181..6dd82373eb2d 100644
> > --- a/include/uapi/asm-generic/siginfo.h
> > +++ b/include/uapi/asm-generic/siginfo.h
> > @@ -91,6 +91,14 @@ union __sifields {
> >  				char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> >  				__u32 _pkey;
> >  			} _addr_pkey;
> > +#ifdef __aarch64__
> > +			/* used with all si_codes */
> > +			struct {
> > +				short _dummy_top_byte;

^ What's this for?  I don't have Eric's insight here.

> > +				unsigned char _top_byte;
> > +				unsigned char _top_byte_mask;
> > +			} _addr_top_byte;
> > +#endif
> >  		};
> >  	} _sigfault;
> >
> 
> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> 
> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> design this so any other architecture who has this challenge can use the
> code.  The kind of code does not get enough attention/maintenance if it
> is built for a single architecture.

Does this belong in the user-facing siginfo?  It seems a bit strange,
when other closely-related information such as esr_context is in the
arch-specific signal frame.


If trying to make this reusable, I wonder if we should have some sort of
"address attributes" field.

An alternative approach would be to add some opaque "arch_data" field,
that the arch code can go look at when delivering the signal.


I think that's all we were trying to achieve here: tack some arch
private data onto the signal, to avoid having to stash the same info in
thread_info and pray that it doesn't get clobbered in between signal
generation and delivery.

At signal delivery time, the arch signal delivery code could inspect
this data and emit it into the signal frame as appropriate for the arch.

[...]

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
       [not found]                                           ` <20200623020134.16655-1-pcc@google.com>
       [not found]                                             ` <87sgemrlgc.fsf@x220.int.ebiederm.org>
@ 2020-06-23 14:57                                             ` Dave Martin
  1 sibling, 0 replies; 64+ messages in thread
From: Dave Martin @ 2020-06-23 14:57 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Linux ARM, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Mon, Jun 22, 2020 at 07:01:34PM -0700, Peter Collingbourne wrote:
> The kernel currently clears the tag bits (i.e. bits 56-63) in the fault
> address exposed via siginfo.si_addr and sigcontext.fault_address. However,
> the tag bits may be needed by tools in order to accurately diagnose
> memory errors, such as HWASan [1] or future tools based on the Memory
> Tagging Extension (MTE).

[...]

Minor nit: Can you stop making each version of this series in-reply-to
the previous series please?  Most people don't do that, and it's giving
me weird threading in my mailbox...

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-06-23 14:38                                               ` [PATCH v8] " Dave Martin
@ 2020-06-23 17:47                                                 ` Eric W. Biederman
  2020-06-24  0:40                                                   ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Eric W. Biederman @ 2020-06-23 17:47 UTC (permalink / raw)
  To: Dave Martin
  Cc: Will Deacon, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Evgenii Stepanov, Andrey Konovalov,
	Vincenzo Frascino, Peter Collingbourne, Linux ARM,
	Richard Henderson

Dave Martin <Dave.Martin@arm.com> writes:

> On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
>> Peter Collingbourne <pcc@google.com> writes:
>> 
>> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
>> > index 47f651df781c..a8380a2b6361 100644
>> > --- a/arch/arm64/kernel/traps.c
>> > +++ b/arch/arm64/kernel/traps.c
>> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
>> >  }
>> >  
>> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
>> > +			   unsigned long far, unsigned char far_tb_mask,
>> >  			   const char *str)
>> >  {
>> >  	arm64_show_signal(signo, str);
>> > -	if (signo == SIGKILL)
>> > +	if (signo == SIGKILL) {
>> >  		force_sig(SIGKILL);
>> > -	else
>> > -		force_sig_fault(signo, code, addr);
>> > +	} else {
>> > +		struct kernel_siginfo info;
>> > +		clear_siginfo(&info);
>> > +		info.si_signo = signo;
>> > +		info.si_errno = 0;
>> > +		info.si_code = code;
>> > +		info.si_addr = addr;
>> > +		info.si_addr_top_byte = (far >> 56) & far_tb_mask;
>> > +		info.si_addr_top_byte_mask = far_tb_mask;
>> > +		force_sig_info(&info);
>> > +	}
>> >  }
>> >  
>> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
>> > -			    const char *str)
>> > +			    unsigned long far, const char *str)
>> >  {
>> > +	struct kernel_siginfo info;
>> > +
>> >  	arm64_show_signal(SIGBUS, str);
>> > -	force_sig_mceerr(code, addr, lsb);
>> > +
>> > +	clear_siginfo(&info);
>> > +	info.si_signo = SIGBUS;
>> > +	info.si_errno = 0;
>> > +	info.si_code = code;
>> > +	info.si_addr = addr;
>> > +	info.si_addr_lsb = lsb;
>> > +	info.si_addr_top_byte = far >> 56;
>> > +	info.si_addr_top_byte_mask = 0xff;
>> > +	force_sig_info(&info);
>> >  }
>> 
>> I have a real problem with this construction.  force_sig_info is not an
>> interface that should be used for anything except to define a wrapper
>> that takes it's parameters.
>
> Can you elaborate?  How would you do this king of thing.

There are no other uses of force_sig_info in architecture code.

I just removed them _all_ because they were almost all broken.
In fact your mceerr case is broken because it uses two different
union members simultaneously.

So I am looking for something like force_sig_mceerr or force_sig_fault
that includes your new information that then calls force_sig_info.

I know of no other way to safely use the siginfo struct.

> AIUI we absolutely need a forced signal here, we need to supply
> metadata, and we don't have to open-code all that at every relevant
> signal generation site...
>
>> It is not clear to me that if you have adapted siginfo_layout.
>
> Garbled sentence?

Looks like.  One of the pieces of code that needs to change
when siginfo gets updated is siginfo_layout so that the structure
can be properly decoded and made sense of.

I am not seeing anything like that.

>> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
>> > index cb3d6c267181..6dd82373eb2d 100644
>> > --- a/include/uapi/asm-generic/siginfo.h
>> > +++ b/include/uapi/asm-generic/siginfo.h
>> > @@ -91,6 +91,14 @@ union __sifields {
>> >  				char _dummy_pkey[__ADDR_BND_PKEY_PAD];
>> >  				__u32 _pkey;
>> >  			} _addr_pkey;
>> > +#ifdef __aarch64__
>> > +			/* used with all si_codes */
>> > +			struct {
>> > +				short _dummy_top_byte;
>
> ^ What's this for?  I don't have Eric's insight here.
>
>> > +				unsigned char _top_byte;
>> > +				unsigned char _top_byte_mask;
>> > +			} _addr_top_byte;
>> > +#endif
>> >  		};
>> >  	} _sigfault;
>> >
>> 
>> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
>> 
>> Please remove the "#ifdef __aarch64__".  If at all possible we want to
>> design this so any other architecture who has this challenge can use the
>> code.  The kind of code does not get enough attention/maintenance if it
>> is built for a single architecture.
>
> Does this belong in the user-facing siginfo?  It seems a bit strange,
> when other closely-related information such as esr_context is in the
> arch-specific signal frame.
>
>
> If trying to make this reusable, I wonder if we should have some sort of
> "address attributes" field.
>
> An alternative approach would be to add some opaque "arch_data" field,
> that the arch code can go look at when delivering the signal.

My point is arch specific hacks don't get looked at, and wind up being
broken.  So I am not encouraging anything that doesn't get looked at,
and winds up being broken.

> I think that's all we were trying to achieve here: tack some arch
> private data onto the signal, to avoid having to stash the same info in
> thread_info and pray that it doesn't get clobbered in between signal
> generation and delivery.

What makes it arch private data?  Why isn't it just data that your arch
happens to have that other architectures don't yet?

> At signal delivery time, the arch signal delivery code could inspect
> this data and emit it into the signal frame as appropriate for the
> arch.

Sorry this probably isn't what you mean but when I read that description
I get the feeling that you are asking for code that won't be reviewed or
looked at by anyone else.  So inevitably that code will be broken.
Frankly it is bad enough finding people to review and maintain the
generic code of the kernel.


With that said, and your desire for this data to go into the sigframe
(despite it sounding a lot like generic data that only aarch64 has
implemented yet) can you remind me why siginfo comes into the equation
at all?

Last I remember the discussion there were some issues and the plan was
to simply solve the problem generically and use siginfo, and there would
not need to be any sigframe changes.

But if you want to deliver via sigframe, force_sig_info and all its
variants will be delivered when the kernel returns back to userspace.
So there should be no need to touch siginfo or anything else in that
scenario.

Eric

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-06-23 17:47                                                 ` Eric W. Biederman
@ 2020-06-24  0:40                                                   ` Peter Collingbourne
  2020-06-24  9:28                                                     ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-06-24  0:40 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Catalin Marinas, Kevin Brodsky, Oleg Nesterov, Kostya Serebryany,
	Evgenii Stepanov, Andrey Konovalov, Vincenzo Frascino,
	Will Deacon, Dave Martin, Linux ARM, Richard Henderson

On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
<ebiederm@xmission.com> wrote:
>
> Dave Martin <Dave.Martin@arm.com> writes:
>
> > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> >> Peter Collingbourne <pcc@google.com> writes:
> >>
> >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> >> > index 47f651df781c..a8380a2b6361 100644
> >> > --- a/arch/arm64/kernel/traps.c
> >> > +++ b/arch/arm64/kernel/traps.c
> >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> >> >  }
> >> >
> >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> >> > +                     unsigned long far, unsigned char far_tb_mask,
> >> >                       const char *str)
> >> >  {
> >> >    arm64_show_signal(signo, str);
> >> > -  if (signo == SIGKILL)
> >> > +  if (signo == SIGKILL) {
> >> >            force_sig(SIGKILL);
> >> > -  else
> >> > -          force_sig_fault(signo, code, addr);
> >> > +  } else {
> >> > +          struct kernel_siginfo info;
> >> > +          clear_siginfo(&info);
> >> > +          info.si_signo = signo;
> >> > +          info.si_errno = 0;
> >> > +          info.si_code = code;
> >> > +          info.si_addr = addr;
> >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> >> > +          force_sig_info(&info);
> >> > +  }
> >> >  }
> >> >
> >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> >> > -                      const char *str)
> >> > +                      unsigned long far, const char *str)
> >> >  {
> >> > +  struct kernel_siginfo info;
> >> > +
> >> >    arm64_show_signal(SIGBUS, str);
> >> > -  force_sig_mceerr(code, addr, lsb);
> >> > +
> >> > +  clear_siginfo(&info);
> >> > +  info.si_signo = SIGBUS;
> >> > +  info.si_errno = 0;
> >> > +  info.si_code = code;
> >> > +  info.si_addr = addr;
> >> > +  info.si_addr_lsb = lsb;
> >> > +  info.si_addr_top_byte = far >> 56;
> >> > +  info.si_addr_top_byte_mask = 0xff;
> >> > +  force_sig_info(&info);
> >> >  }
> >>
> >> I have a real problem with this construction.  force_sig_info is not an
> >> interface that should be used for anything except to define a wrapper
> >> that takes it's parameters.
> >
> > Can you elaborate?  How would you do this king of thing.
>
> There are no other uses of force_sig_info in architecture code.
>
> I just removed them _all_ because they were almost all broken.
> In fact your mcerr case is broken because it uses two different
> union members simultantiously.

Is that really broken? I thought that the Linux kernel deliberately
didn't care about strict aliasing rules (the top-level Makefile passes
-fno-strict-aliasing) so I thought that it was valid in "Linux kernel
C" even though from a standards point of view it is invalid. (That
being said, this is probably moot with my proposed changes below
though.)

> So I am looking for something like force_sig_mcerr or force_sig_fault
> that includes your new information that then calls force_sig_info.
>
> I know of no other way to safely use the siginfo struct.

So you want something like:

int force_sig_fault_with_ignored_bits(int signo, int code, void __user
*addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);

in kernel/signal.c and the code in arch/arm64 would call that?

> > AIUI we absolutely need a forced signal here, we need to supply
> > metadata, and we don't have to open-code all that at every relevant
> > signal generation site...
> >
> >> It is not clear to me that if you have adapted siginfo_layout.
> >
> > Garbled sentence?
>
> Looks like.  One of the pieces of code that needs to change
> when siginfo gets updated is siginfo_layout so that the structure
> can be properly decoded and made sense of.
>
> I am not seeing anything like that.

Okay, this has to do with copying between the compat and non-compat
versions of the struct? Sure, I can update that, although the code
would be basically non-functional on arm64 because TBI isn't supported
on 32-bit ARM.

> >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> >> > index cb3d6c267181..6dd82373eb2d 100644
> >> > --- a/include/uapi/asm-generic/siginfo.h
> >> > +++ b/include/uapi/asm-generic/siginfo.h
> >> > @@ -91,6 +91,14 @@ union __sifields {
> >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> >> >                            __u32 _pkey;
> >> >                    } _addr_pkey;
> >> > +#ifdef __aarch64__
> >> > +                  /* used with all si_codes */
> >> > +                  struct {
> >> > +                          short _dummy_top_byte;
> >
> > ^ What's this for?  I don't have Eric's insight here.

We would need a short's worth of padding in order to prevent the
fields from occupying the same address as si_addr_lsb.

> >
> >> > +                          unsigned char _top_byte;
> >> > +                          unsigned char _top_byte_mask;
> >> > +                  } _addr_top_byte;
> >> > +#endif
> >> >            };
> >> >    } _sigfault;
> >> >
> >>
> >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> >>
> >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> >> design this so any other architecture who has this challenge can use the
> >> code.  The kind of code does not get enough attention/maintenance if it
> >> is built for a single architecture.

Seems reasonable. I was recently made aware that RISC-V was
considering a similar feature:
https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
I would have opted to expand this to other architectures on an
as-needed basis, but I'd also be fine with having it on all
architectures from the start.

If we make this arch-independent, we have an additional concern, which
is "what if some future architecture wants more than one byte here?"
For example, an architecture may have a "top-two-bytes-ignore"
feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
"si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
implies many more ignored bits (see slide 13 of the presentation). The
maximum size that these fields can possibly be is the size of a
pointer, and with that there wouldn't be enough room in the padding at
this point to accommodate the new fields.

That basically implies your earlier suggestion of adding a union
member here to accommodate future expansion of the union, and adding
the new fields after the union. I'm happy to make that change, with
the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".

> >
> > Does this belong in the user-facing siginfo?  It seems a bit strange,
> > when other closely-related information such as esr_context is in the
> > arch-specific signal frame.
> >
> >
> > If trying to make this reusable, I wonder if we should have some sort of
> > "address attributes" field.
> >
> > An alternative approach would be to add some opaque "arch_data" field,
> > that the arch code can go look at when delivering the signal.
>
> My point is arch specific hacks don't get looked at, and wind up being
> broken.  So I am not encouraging anything that doesn't get looked at,
> and winds up being broken.
>
> > I think that's all we were trying to achieve here: tack some arch
> > private data onto the signal, to avoid having to stash the same info in
> > thread_info and pray that it doesn't get clobbered in between signal
> > generation and delivery.
>
> What makes it arch private data?  Why isn't it just data that your arch
> happens to have that other architectures don't yet.
>
> > At signal delivery time, the arch signal delivery code could inspect
> > this data and emit it into the signal frame as appropriate for the
> > arch.
>
> Sorry this probably isn't what you mean but when I read that description
> I get the feeling that you are asking for code that won't be reviewed or
> looked at by anyone else.  So inevitably that code will be broken.
> Frankly it is bad enough finding people to review and maintain the
> generic code of the kernel.
>
>
> With that said, and your desire for this data to go into the sigframe
> (despite it sounding a lot like generic data that only aarch64 has
> implemented yet) can you remind me why siginfo comes into the equation
> at all?
>
> Last I remember the discussion there were some issues and the plan was
> to simply solve the problem generically and use siginfo, and there would
> not need to be any sigframe changes.
>
> But if you want to deliver via sigframe force_sig_info and all it's
> variants will be delivered when the kernel returns back to userspace.
> So there should be no need to touch siginfo or anything else in that
> scenario.

My understanding is that siginfo should contain information about the
signal itself, while sigcontext should contain any information about
the machine state at the point when the signal was delivered that is
needed in order to restore the state after returning from a signal
handler. The fault address isn't really part of the restorable machine
state (despite the existence of a "fault_address" field in
sigcontext), so any information relating to it belongs (at least
morally) in siginfo.

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-06-24  0:40                                                   ` Peter Collingbourne
@ 2020-06-24  9:28                                                     ` Dave Martin
  2020-06-24 16:51                                                       ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-06-24  9:28 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Linux ARM, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> <ebiederm@xmission.com> wrote:
> >
> > Dave Martin <Dave.Martin@arm.com> writes:
> >
> > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > >> Peter Collingbourne <pcc@google.com> writes:
> > >>
> > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > >> > index 47f651df781c..a8380a2b6361 100644
> > >> > --- a/arch/arm64/kernel/traps.c
> > >> > +++ b/arch/arm64/kernel/traps.c
> > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > >> >  }
> > >> >
> > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > >> >                       const char *str)
> > >> >  {
> > >> >    arm64_show_signal(signo, str);
> > >> > -  if (signo == SIGKILL)
> > >> > +  if (signo == SIGKILL) {
> > >> >            force_sig(SIGKILL);
> > >> > -  else
> > >> > -          force_sig_fault(signo, code, addr);
> > >> > +  } else {
> > >> > +          struct kernel_siginfo info;
> > >> > +          clear_siginfo(&info);
> > >> > +          info.si_signo = signo;
> > >> > +          info.si_errno = 0;
> > >> > +          info.si_code = code;
> > >> > +          info.si_addr = addr;
> > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > >> > +          force_sig_info(&info);
> > >> > +  }
> > >> >  }
> > >> >
> > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > >> > -                      const char *str)
> > >> > +                      unsigned long far, const char *str)
> > >> >  {
> > >> > +  struct kernel_siginfo info;
> > >> > +
> > >> >    arm64_show_signal(SIGBUS, str);
> > >> > -  force_sig_mceerr(code, addr, lsb);
> > >> > +
> > >> > +  clear_siginfo(&info);
> > >> > +  info.si_signo = SIGBUS;
> > >> > +  info.si_errno = 0;
> > >> > +  info.si_code = code;
> > >> > +  info.si_addr = addr;
> > >> > +  info.si_addr_lsb = lsb;
> > >> > +  info.si_addr_top_byte = far >> 56;
> > >> > +  info.si_addr_top_byte_mask = 0xff;
> > >> > +  force_sig_info(&info);
> > >> >  }
> > >>
> > >> I have a real problem with this construction.  force_sig_info is not an
> > >> interface that should be used for anything except to define a wrapper
> > >> that takes its parameters.
> > >
> > > Can you elaborate?  How would you do this kind of thing?
> >
> > There are no other uses of force_sig_info in architecture code.
> >
> > I just removed them _all_ because they were almost all broken.
> > In fact your mcerr case is broken because it uses two different
> > union members simultaneously.
> 
> Is that really broken? I thought that the Linux kernel deliberately
> didn't care about strict aliasing rules (the top-level Makefile passes
> -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> C" even though from a standards point of view it is invalid. (That
> being said, this is probably moot with my proposed changes below
> though.)

I have a feeling that -fno-strict-aliasing only allows you to _read_ a
different union member from the one previously written.

Writing a different member from the last one written can still splatter
on the other members IIUC.

It would be better to keep things separate rather than risk
incorrectness just to save a few bytes.

IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.

> > So I am looking for something like force_sig_mcerr or force_sig_fault
> > that includes your new information that then calls force_sig_info.
> >
> > I know of no other way to safely use the siginfo struct.
> 
> So you want something like:
> 
> int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> 
> in kernel/signal.c and the code in arch/arm64 would call that?
> 
> > > AIUI we absolutely need a forced signal here, we need to supply
> > > metadata, and we don't have to open-code all that at every relevant
> > > signal generation site...
> > >
> > >> It is not clear to me that if you have adapted siginfo_layout.
> > >
> > > Garbled sentence?
> >
> > Looks like.  One of the pieces of code that needs to change
> > when siginfo gets updated is siginfo_layout so that the structure
> > can be properly decoded and made sense of.
> >
> > I am not seeing anything like that.
> 
> Okay, this has to do with copying between the compat and non-compat
> versions of the struct? Sure, I can update that, although the code
> would be basically non-functional on arm64 because TBI isn't supported
> on 32-bit ARM.
> 
> > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > >> > index cb3d6c267181..6dd82373eb2d 100644
> > >> > --- a/include/uapi/asm-generic/siginfo.h
> > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > >> > @@ -91,6 +91,14 @@ union __sifields {
> > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > >> >                            __u32 _pkey;
> > >> >                    } _addr_pkey;
> > >> > +#ifdef __aarch64__
> > >> > +                  /* used with all si_codes */
> > >> > +                  struct {
> > >> > +                          short _dummy_top_byte;
> > >
> > > ^ What's this for?  I don't have Eric's insight here.
> 
> We would need a short's worth of padding in order to prevent the
> fields from occupying the same address as si_addr_lsb.
> 
> > >
> > >> > +                          unsigned char _top_byte;
> > >> > +                          unsigned char _top_byte_mask;
> > >> > +                  } _addr_top_byte;
> > >> > +#endif
> > >> >            };
> > >> >    } _sigfault;
> > >> >
> > >>
> > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > >>
> > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > >> design this so any other architecture who has this challenge can use the
> > >> code.  The kind of code does not get enough attention/maintenance if it
> > >> is built for a single architecture.
> 
> Seems reasonable. I was recently made aware that RISC-V was
> considering a similar feature:
> https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> I would have opted to expand this to other architectures on an
> as-needed basis, but I'd also be fine with having it on all
> architectures from the start.
> 
> If we make this arch-independent, we have an additional concern, which
> is "what if some future architecture wants more than one byte here?"
> For example, an architecture may have a "top-two-bytes-ignore"
> feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> implies many more ignored bits (see slide 13 of the presentation). The
> maximum size that these fields can possibly be is the size of a
> pointer, and with that there wouldn't be enough room in the padding at
> this point to accommodate the new fields.
> 
> That basically implies your earlier suggestion of adding a union
> member here to accommodate future expansion of the union, and adding
> the new fields after the union. I'm happy to make that change, with
> the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".

I think what we need here is basically a flags word.

So long as we keep a flag spare to indicate the existence of a further
flags word, we can extend as needed.

How the existence of the first flags word is detected is another
problem.  If it only applies for newly-defined si_code values, then
I guess si_code may be sufficient.

> > >
> > > Does this belong in the user-facing siginfo?  It seems a bit strange,
> > > when other closely-related information such as esr_context is in the
> > > arch-specific signal frame.
> > >
> > >
> > > If trying to make this reusable, I wonder if we should have some sort of
> > > "address attributes" field.
> > >
> > > An alternative approach would be to add some opaque "arch_data" field,
> > > that the arch code can go look at when delivering the signal.
> >
> > My point is arch specific hacks don't get looked at, and wind up being
> > broken.  So I am not encouraging anything that doesn't get looked at,
> > and winds up being broken.

Arch code will get looked at, and is not automatically inherently broken.
Nor is the core code always perfect...

I agree that generic is best, both for getting more eyes on it and for
coming up with a clean design, but there's also a risk of pointless
over-abstraction for things that just aren't generic enough.

Part of the issue is that each arch necessarily has its own way of
dumping its register state, while siginfo contains abstract diagnostic
information.  The boundary between these two is not clear-cut: for
example, arm64 dumps its exception syndrome register which contains
(among other things) information about whether a faulted access was a
read or write.  Is this generic information, or arch-specific
information?


A side problem is that siginfo_t as originally designed is quite hard to
extend.

AFAICT, any extension needs a new si_code, otherwise there is no way
to detect that the extension fields are present.  This is fine for
defining entirely new signal types, but seems to make it hard to add
supplementary information for existing signals. Have I missed something
here?

Say we wanted to add extra data for SIGSEGV to indicate the size of
access and whether it was a read or write.  If we try to add a new
si_code for this, then all software that inspects si_code at all for
SIGSEGV now has no idea what to do with this new si_code.

Reading between the lines, I wonder whether this is part of the reason
arches tend to go their own way:  such information can't be added
generically precisely because it _is_ generic -- too generic to justify
a new si_code.  If so, this problem is going to crop up again and
again...

> > > I think that's all we were trying to achieve here: tack some arch
> > > private data onto the signal, to avoid having to stash the same info in
> > > thread_info and pray that it doesn't get clobbered in between signal
> > > generation and delivery.
> >
> > What makes it arch private data?  Why isn't it just data that your arch
> > happens to have that other architectures don't yet.

I didn't mean it must be private, just that it can be.

> > > At signal delivery time, the arch signal delivery code could inspect
> > > this data and emit it into the signal frame as appropriate for the
> > > arch.
> >
> > Sorry this probably isn't what you mean but when I read that description
> > I get the feeling that you are asking for code that won't be reviewed or
> > looked at by anyone else.  So inevitably that code will be broken.
> > Frankly it is bad enough finding people to review and maintain the
> > generic code of the kernel.

Does this need flagging up to the arch maintainers?  Signal code has been
heavily arch-specific for ages, and that's where the force of gravity
seems to point.  I know a lot of work has gone into cleaning this up, but it
sounds like arch maintainers might need to push back harder on anything
that _could_ be done in the common code.

> > With that said, and your desire for this data to go into the sigframe
> > (despite it sounding a lot like generic data that only aarch64 has
> > implemented yet) can you remind me why siginfo comes into the equation
> > at all?
> >
> > Last I remember the discussion there were some issues and the plan was
> > to simply solve the problem generically and use siginfo, and there would
> > not need to be any sigframe changes.
> >
> > But if you want to deliver via sigframe force_sig_info and all it's
> > variants will be delivered when the kernel returns back to userspace.
> > So there should be no need to touch siginfo or anything else in that
> > scenario.
> 
> My understanding is that siginfo should contain information about the
> signal itself, while sigcontext should contain any information about
> the machine state at the point when the signal was delivered that is
> needed in order to restore the state after returning from a signal
> handler. The fault address isn't really part of the restorable machine
> state (despite the existence of a "fault_address" field in
> sigcontext), so any information relating to it belongs (at least
> morally) in siginfo.

I think this is more than just a principle.

Diagnostic information that is supposed to accompany a signal needs to
be captured at the time the signal is generated, otherwise it may have
been overwritten by another signal-generating event by the time the
first signal is actually delivered.

Currently this isn't handled properly in the arm64 code, so it looks
like some diagnostic fields in the arm64 signal frame can be wrong in
some situations.  (I know, that's your "non-generic code that hardly
anyone relies on will be broken" argument.  But the need to keep
diagnostic information with the signal instance it relates to feels like
a generic problem.)

I have no objection to finding a generic way to report the address tag
information, but "address tag" is not the most generic concept in the
world, even if there are a few arches with something analogous.

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-06-24  9:28                                                     ` Dave Martin
@ 2020-06-24 16:51                                                       ` Peter Collingbourne
  2020-06-24 17:12                                                         ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-06-24 16:51 UTC (permalink / raw)
  To: Dave Martin
  Cc: Linux ARM, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > <ebiederm@xmission.com> wrote:
> > >
> > > Dave Martin <Dave.Martin@arm.com> writes:
> > >
> > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > >> Peter Collingbourne <pcc@google.com> writes:
> > > >>
> > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > >> > index 47f651df781c..a8380a2b6361 100644
> > > >> > --- a/arch/arm64/kernel/traps.c
> > > >> > +++ b/arch/arm64/kernel/traps.c
> > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > >> >  }
> > > >> >
> > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > >> >                       const char *str)
> > > >> >  {
> > > >> >    arm64_show_signal(signo, str);
> > > >> > -  if (signo == SIGKILL)
> > > >> > +  if (signo == SIGKILL) {
> > > >> >            force_sig(SIGKILL);
> > > >> > -  else
> > > >> > -          force_sig_fault(signo, code, addr);
> > > >> > +  } else {
> > > >> > +          struct kernel_siginfo info;
> > > >> > +          clear_siginfo(&info);
> > > >> > +          info.si_signo = signo;
> > > >> > +          info.si_errno = 0;
> > > >> > +          info.si_code = code;
> > > >> > +          info.si_addr = addr;
> > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > >> > +          force_sig_info(&info);
> > > >> > +  }
> > > >> >  }
> > > >> >
> > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > >> > -                      const char *str)
> > > >> > +                      unsigned long far, const char *str)
> > > >> >  {
> > > >> > +  struct kernel_siginfo info;
> > > >> > +
> > > >> >    arm64_show_signal(SIGBUS, str);
> > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > >> > +
> > > >> > +  clear_siginfo(&info);
> > > >> > +  info.si_signo = SIGBUS;
> > > >> > +  info.si_errno = 0;
> > > >> > +  info.si_code = code;
> > > >> > +  info.si_addr = addr;
> > > >> > +  info.si_addr_lsb = lsb;
> > > >> > +  info.si_addr_top_byte = far >> 56;
> > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > >> > +  force_sig_info(&info);
> > > >> >  }
> > > >>
> > > >> I have a real problem with this construction.  force_sig_info is not an
> > > >> interface that should be used for anything except to define a wrapper
> > > >> that takes its parameters.
> > > >
> > > > Can you elaborate?  How would you do this kind of thing?
> > >
> > > There are no other uses of force_sig_info in architecture code.
> > >
> > > I just removed them _all_ because they were almost all broken.
> > > In fact your mcerr case is broken because it uses two different
> > > union members simultaneously.
> >
> > Is that really broken? I thought that the Linux kernel deliberately
> > didn't care about strict aliasing rules (the top-level Makefile passes
> > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > C" even though from a standards point of view it is invalid. (That
> > being said, this is probably moot with my proposed changes below
> > though.)
>
> I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> different union member from the one previously written.
>
> Writing a different member from the last one written can still splatter
> on the other members IIUC.
>
> It would be better to keep things separate rather than risk
> incorrectness just to save a few bytes.
>
> IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
>
> > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > that includes your new information that then calls force_sig_info.
> > >
> > > I know of no other way to safely use the siginfo struct.
> >
> > So you want something like:
> >
> > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> >
> > in kernel/signal.c and the code in arch/arm64 would call that?
> >
> > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > metadata, and we don't have to open-code all that at every relevant
> > > > signal generation site...
> > > >
> > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > >
> > > > Garbled sentence?
> > >
> > > Looks like.  One of the pieces of code that needs to change
> > > when siginfo gets updated is siginfo_layout so that the structure
> > > can be properly decoded and made sense of.
> > >
> > > I am not seeing anything like that.
> >
> > Okay, this has to do with copying between the compat and non-compat
> > versions of the struct? Sure, I can update that, although the code
> > would be basically non-functional on arm64 because TBI isn't supported
> > on 32-bit ARM.
> >
> > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > >> >                            __u32 _pkey;
> > > >> >                    } _addr_pkey;
> > > >> > +#ifdef __aarch64__
> > > >> > +                  /* used with all si_codes */
> > > >> > +                  struct {
> > > >> > +                          short _dummy_top_byte;
> > > >
> > > > ^ What's this for?  I don't have Eric's insight here.
> >
> > We would need a short's worth of padding in order to prevent the
> > fields from occupying the same address as si_addr_lsb.
> >
> > > >
> > > >> > +                          unsigned char _top_byte;
> > > >> > +                          unsigned char _top_byte_mask;
> > > >> > +                  } _addr_top_byte;
> > > >> > +#endif
> > > >> >            };
> > > >> >    } _sigfault;
> > > >> >
> > > >>
> > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > >>
> > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > >> design this so any other architecture who has this challenge can use the
> > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > >> is built for a single architecture.
> >
> > Seems reasonable. I was recently made aware that RISC-V was
> > considering a similar feature:
> > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > I would have opted to expand this to other architectures on an
> > as-needed basis, but I'd also be fine with having it on all
> > architectures from the start.
> >
> > If we make this arch-independent, we have an additional concern, which
> > is "what if some future architecture wants more than one byte here?"
> > For example, an architecture may have a "top-two-bytes-ignore"
> > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > implies many more ignored bits (see slide 13 of the presentation). The
> > maximum size that these fields can possibly be is the size of a
> > pointer, and with that there wouldn't be enough room in the padding at
> > this point to accommodate the new fields.
> >
> > That basically implies your earlier suggestion of adding a union
> > member here to accommodate future expansion of the union, and adding
> > the new fields after the union. I'm happy to make that change, with
> > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
>
> I think what we need here is basically a flags word.
>
> So long as we keep a flag spare to indicate the existence of a further
> flags word, we can extend as needed.
>
> How the existence of the first flags word is detected is another
> problem.  If it only applies for newly-defined si_code values, then
> I guess si_code may be sufficient.

Existing kernels will zero-initialize unused regions of the siginfo
data structure. The zero-initialization of the padding at the end of
the struct is done by the clear_user call here:
https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193

and the zero-initialization of the padding between fields and unused
union members is done by the clear_siginfo function which the kernel
calls when initializing the data structure:
https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20

Therefore, a flag word value of 0 may be used to detect a lack of
support for flagged fields.

That being said, in this particular case, we do not need a flag word.
We can just take advantage of this zero-initialization behavior in
existing kernels to set si_addr_ignored_mask to 0, which indicates
that none of the bits in si_addr_ignored are valid.

Peter

> > > >
> > > > Does this belong in the user-facing siginfo?  It seems a bit strange,
> > > > when other closely-related information such as esr_context is in the
> > > > arch-specific signal frame.
> > > >
> > > >
> > > > If trying to make this reusable, I wonder if we should have some sort of
> > > > "address attributes" field.
> > > >
> > > > An alternative approach would be to add some opaque "arch_data" field,
> > > > that the arch code can go look at when delivering the signal.
> > >
> > > My point is arch specific hacks don't get looked at, and wind up being
> > > broken.  So I am not encouraging anything that doesn't get looked at,
> > > and winds up being broken.
>
> Arch code will get looked at, and is automatically inherently broken.
> Nor is the core code always perfect...
>
> I agree that generic is best, both for getting more eyes on it and for
> coming up with a clean design, but there's also a risk of pointless
> over-abstraction for things that just aren't generic enough.
>
> Part of the issue is that each arch necessarily has its own way of
> dumping its register state, while siginfo contains abstract diagnostic
> information.  The boundary between these two is not clear-cut: for
> example, arm64 dumps its exception syndrome register which contains
> (among other things) information about whether a faulted access was a
> read or write.  Is this generic information, or arch-specific
> information?
>
>
> A side problem is that siginfo_t as originally designed is quite hard to
> extend.
>
> AFAICT, any extension needs a new si_code, otherwise there is no way
> to detect that the extension fields are present.  This is fine for
> defining entirely new signal types, but seems to make it hard to add
> supplementary information for existing signals. Have I missed something
> here?
>
> Say we wanted to add extra data for SIGSEGV to indicate the size of
> access and whether it was a read or write.  If we try to add a new
> si_code for this, then all software that inspects si_code at all for
> SIGSEGV now has no idea what to do with this new si_code.
>
> Reading between the lines, I wonder whether this is part of the reason
> arches tend to go their own way:  such information can't be added
> generically precisely because it _is_ generic -- too generic to justify
> a new si_code.  If so, this problem is going to crop up again and
> again...
>
> > > > I think that's all we were trying to achieve here: tack some arch
> > > > private data onto the signal, to avoid having to stash the same info in
> > > > thread_info and pray that it doesn't get clobbered in between signal
> > > > generation and delivery.
> > >
> > > What makes it arch private data?  Why isn't it just data that your arch
> > > happens to have that other architectures don't yet.
>
> I didn't mean it must be private, just that it can be.
>
> > > > At signal delivery time, the arch signal delivery code could inspect
> > > > this data and emit it into the signal frame as appropriate for the
> > > > arch.
> > >
> > > Sorry this probably isn't what you mean but when I read that description
> > > I get the feeling that you are asking for code that won't be reviewed or
> > > looked at by anyone else.  So inevitably that code will be broken.
> > > Frankly it is bad enough finding people to review and maintain the
> > > generic code of the kernel.
>
> Does this need flagging up to the arch maintainers?  Signal code has been
> heavily arch-specific for ages, and that's where the force of gravity
> seems to point.  I know a lot of work has gone into cleaning this up, but it
> sounds like arch maintainers might need to push back harder on anything
> that _could_ be done in the common code.
>
> > > With that said, and your desire for this data to go into the sigframe
> > > (despite it sounding a lot like generic data that only aarch64 has
> > > implemented yet) can you remind me why siginfo comes into the equation
> > > at all?
> > >
> > > Last I remember the discussion there were some issues and the plan was
> > > to simply solve the problem generically and use siginfo, and there would
> > > not need to be any sigframe changes.
> > >
> > > But if you want to deliver via sigframe force_sig_info and all it's
> > > variants will be delivered when the kernel returns back to userspace.
> > > So there should be no need to touch siginfo or anything else in that
> > > scenario.
> >
> > My understanding is that siginfo should contain information about the
> > signal itself, while sigcontext should contain any information about
> > the machine state at the point when the signal was delivered that is
> > needed in order to restore the state after returning from a signal
> > handler. The fault address isn't really part of the restorable machine
> > state (despite the existence of a "fault_address" field in
> > sigcontext), so any information relating to it belongs (at least
> > morally) in siginfo.
>
> I think this is more than just a principle.
>
> Diagnostic information that is supposed to accompany a signal needs to
> be captured at the time the signal is generated, otherwise it may have
> been overwritten by another signal-generating event by the time the
> first signal is actually delivered.
>
> Currently this isn't handled properly in the arm64 code, so it looks
> like some diagnostic fields in the arm64 signal frame can be wrong in
> some situations.  (I know, that's your "non-generic code that hardly
> anyone relies on will be broken" argument.  But the need to keep
> diagnostic information with the signal instance it relates to feels like
> a generic problem.)
>
> I have no objection to finding a generic way to report the address tag
> information, but "address tag" is not the most generic concept in the
> world, even if there are a few arches with something analogous.
>
> Cheers
> ---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-06-24 16:51                                                       ` Peter Collingbourne
@ 2020-06-24 17:12                                                         ` Dave Martin
  2020-06-24 19:51                                                           ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-06-24 17:12 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Catalin Marinas, Kevin Brodsky, Oleg Nesterov, Evgenii Stepanov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Linux ARM, Richard Henderson

On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > <ebiederm@xmission.com> wrote:
> > > >
> > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > >
> > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > >>
> > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > >> >  }
> > > > >> >
> > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > >> >                       const char *str)
> > > > >> >  {
> > > > >> >    arm64_show_signal(signo, str);
> > > > >> > -  if (signo == SIGKILL)
> > > > >> > +  if (signo == SIGKILL) {
> > > > >> >            force_sig(SIGKILL);
> > > > >> > -  else
> > > > >> > -          force_sig_fault(signo, code, addr);
> > > > >> > +  } else {
> > > > >> > +          struct kernel_siginfo info;
> > > > >> > +          clear_siginfo(&info);
> > > > >> > +          info.si_signo = signo;
> > > > >> > +          info.si_errno = 0;
> > > > >> > +          info.si_code = code;
> > > > >> > +          info.si_addr = addr;
> > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > >> > +          force_sig_info(&info);
> > > > >> > +  }
> > > > >> >  }
> > > > >> >
> > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > >> > -                      const char *str)
> > > > >> > +                      unsigned long far, const char *str)
> > > > >> >  {
> > > > >> > +  struct kernel_siginfo info;
> > > > >> > +
> > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > >> > +
> > > > >> > +  clear_siginfo(&info);
> > > > >> > +  info.si_signo = SIGBUS;
> > > > >> > +  info.si_errno = 0;
> > > > >> > +  info.si_code = code;
> > > > >> > +  info.si_addr = addr;
> > > > >> > +  info.si_addr_lsb = lsb;
> > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > >> > +  force_sig_info(&info);
> > > > >> >  }
> > > > >>
> > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > >> interface that should be used for anything except to define a wrapper
> > > > > >> that takes its parameters.
> > > > >
> > > > > > Can you elaborate?  How would you do this kind of thing.
> > > >
> > > > There are no other uses of force_sig_info in architecture code.
> > > >
> > > > I just removed them _all_ because they were almost all broken.
> > > > In fact your mcerr case is broken because it uses two different
> > > > > union members simultaneously.
> > >
> > > Is that really broken? I thought that the Linux kernel deliberately
> > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > C" even though from a standards point of view it is invalid. (That
> > > being said, this is probably moot with my proposed changes below
> > > though.)
> >
> > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > different union member from the one previously written.
> >
> > Writing a different member from the last one written can still splatter
> > on the other members IIUC.
> >
> > It would be better to keep things separate rather than risk
> > incorrectness just to save a few bytes.
> >
> > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> >
> > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > that includes your new information that then calls force_sig_info.
> > > >
> > > > I know of no other way to safely use the siginfo struct.
> > >
> > > So you want something like:
> > >
> > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > >
> > > in kernel/signal.c and the code in arch/arm64 would call that?
> > >
> > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > signal generation site...
> > > > >
> > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > >
> > > > > Garbled sentence?
> > > >
> > > > Looks like.  One of the pieces of code that needs to change
> > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > can be properly decoded and made sense of.
> > > >
> > > > I am not seeing anything like that.
> > >
> > > Okay, this has to do with copying between the compat and non-compat
> > > versions of the struct? Sure, I can update that, although the code
> > > would be basically non-functional on arm64 because TBI isn't supported
> > > on 32-bit ARM.
> > >
> > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > >> >                            __u32 _pkey;
> > > > >> >                    } _addr_pkey;
> > > > >> > +#ifdef __aarch64__
> > > > >> > +                  /* used with all si_codes */
> > > > >> > +                  struct {
> > > > >> > +                          short _dummy_top_byte;
> > > > >
> > > > > ^ What's this for?  I don't have Eric's insight here.
> > >
> > > We would need a short's worth of padding in order to prevent the
> > > fields from occupying the same address as si_addr_lsb.
> > >
> > > > >
> > > > >> > +                          unsigned char _top_byte;
> > > > >> > +                          unsigned char _top_byte_mask;
> > > > >> > +                  } _addr_top_byte;
> > > > >> > +#endif
> > > > >> >            };
> > > > >> >    } _sigfault;
> > > > >> >
> > > > >>
> > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > >>
> > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > >> design this so any other architecture who has this challenge can use the
> > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > >> is built for a single architecture.
> > >
> > > Seems reasonable. I was recently made aware that RISC-V was
> > > considering a similar feature:
> > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > I would have opted to expand this to other architectures on an
> > > as-needed basis, but I'd also be fine with having it on all
> > > architectures from the start.
> > >
> > > If we make this arch-independent, we have an additional concern, which
> > > is "what if some future architecture wants more than one byte here?"
> > > For example, an architecture may have a "top-two-bytes-ignore"
> > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > implies many more ignored bits (see slide 13 of the presentation). The
> > > maximum size that these fields can possibly be is the size of a
> > > pointer, and with that there wouldn't be enough room in the padding at
> > > this point to accommodate the new fields.
> > >
> > > That basically implies your earlier suggestion of adding a union
> > > member here to accommodate future expansion of the union, and adding
> > > the new fields after the union. I'm happy to make that change, with
> > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> >
> > I think what we need here is basically a flags word.
> >
> > So long as we keep a flag spare to indicate the existence of a further
> > flags word, we can extend as needed.
> >
> > How the existence of the first flags words is detected is another
> > problem.  If it only applies for newly-defined si_code values, then
> > I guess si_code may be sufficient.
> 
> Existing kernels will zero-initialize unused regions of the siginfo
> data structure. The zero-initialization of the padding at the end of
> the struct is done by the clear_user call here:
> https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> 
> and the zero-initialization of the padding between fields and unused
> union members is done by the clear_siginfo function which the kernel
> calls when initializing the data structure:
> https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> 
> Therefore, a flag word value of 0 may be used to detect a lack of
> support for flagged fields.

It's not enough that we do this today.  We would have had to do it back
to the dawn of time (though in the arm64 case I guess we just need to go
back to when the arch/arm64 was merged).

v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
always the case, so unused parts of siginfo could be full of old junk
from the user stack, if the kernel is sufficiently old.

If we're trying to do something generic that makes sense on all arches,
this matters.  I may have misunderstood something about the code though.

> That being said, in this particular case, we do not need a flag word.
> We can just take advantage of this zero-initialization behavior in
> existing kernels to set si_addr_ignored_mask to 0, which indicates
> that none of the bits in si_addr_ignored are valid.

[...]

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-06-24 17:12                                                         ` Dave Martin
@ 2020-06-24 19:51                                                           ` Peter Collingbourne
  2020-07-06 16:41                                                             ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-06-24 19:51 UTC (permalink / raw)
  To: Dave Martin
  Cc: Catalin Marinas, Kevin Brodsky, Oleg Nesterov, Evgenii Stepanov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Linux ARM, Richard Henderson

On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >
> > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > <ebiederm@xmission.com> wrote:
> > > > >
> > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > >
> > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > >>
> > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > >> >  }
> > > > > >> >
> > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > >> >                       const char *str)
> > > > > >> >  {
> > > > > >> >    arm64_show_signal(signo, str);
> > > > > >> > -  if (signo == SIGKILL)
> > > > > >> > +  if (signo == SIGKILL) {
> > > > > >> >            force_sig(SIGKILL);
> > > > > >> > -  else
> > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > >> > +  } else {
> > > > > >> > +          struct kernel_siginfo info;
> > > > > >> > +          clear_siginfo(&info);
> > > > > >> > +          info.si_signo = signo;
> > > > > >> > +          info.si_errno = 0;
> > > > > >> > +          info.si_code = code;
> > > > > >> > +          info.si_addr = addr;
> > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > >> > +          force_sig_info(&info);
> > > > > >> > +  }
> > > > > >> >  }
> > > > > >> >
> > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > >> > -                      const char *str)
> > > > > >> > +                      unsigned long far, const char *str)
> > > > > >> >  {
> > > > > >> > +  struct kernel_siginfo info;
> > > > > >> > +
> > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > >> > +
> > > > > >> > +  clear_siginfo(&info);
> > > > > >> > +  info.si_signo = SIGBUS;
> > > > > >> > +  info.si_errno = 0;
> > > > > >> > +  info.si_code = code;
> > > > > >> > +  info.si_addr = addr;
> > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > >> > +  force_sig_info(&info);
> > > > > >> >  }
> > > > > >>
> > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > >> that takes its parameters.
> > > > > >
> > > > > > > Can you elaborate?  How would you do this kind of thing.
> > > > >
> > > > > There are no other uses of force_sig_info in architecture code.
> > > > >
> > > > > I just removed them _all_ because they were almost all broken.
> > > > > In fact your mcerr case is broken because it uses two different
> > > > > > union members simultaneously.
> > > >
> > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > C" even though from a standards point of view it is invalid. (That
> > > > being said, this is probably moot with my proposed changes below
> > > > though.)
> > >
> > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > different union member from the one previously written.
> > >
> > > Writing a different member from the last one written can still splatter
> > > on the other members IIUC.
> > >
> > > It would be better to keep things separate rather than risk
> > > incorrectness just to save a few bytes.
> > >
> > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > >
> > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > that includes your new information that then calls force_sig_info.
> > > > >
> > > > > I know of no other way to safely use the siginfo struct.
> > > >
> > > > So you want something like:
> > > >
> > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > >
> > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > >
> > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > signal generation site...
> > > > > >
> > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > >
> > > > > > Garbled sentence?
> > > > >
> > > > > Looks like.  One of the pieces of code that needs to change
> > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > can be properly decoded and made sense of.
> > > > >
> > > > > I am not seeing anything like that.
> > > >
> > > > Okay, this has to do with copying between the compat and non-compat
> > > > versions of the struct? Sure, I can update that, although the code
> > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > on 32-bit ARM.
> > > >
> > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > >> >                            __u32 _pkey;
> > > > > >> >                    } _addr_pkey;
> > > > > >> > +#ifdef __aarch64__
> > > > > >> > +                  /* used with all si_codes */
> > > > > >> > +                  struct {
> > > > > >> > +                          short _dummy_top_byte;
> > > > > >
> > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > >
> > > > We would need a short's worth of padding in order to prevent the
> > > > fields from occupying the same address as si_addr_lsb.
> > > >
> > > > > >
> > > > > >> > +                          unsigned char _top_byte;
> > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > >> > +                  } _addr_top_byte;
> > > > > >> > +#endif
> > > > > >> >            };
> > > > > >> >    } _sigfault;
> > > > > >> >
> > > > > >>
> > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > >>
> > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > >> design this so any other architecture who has this challenge can use the
> > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > >> is built for a single architecture.
> > > >
> > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > considering a similar feature:
> > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > I would have opted to expand this to other architectures on an
> > > > as-needed basis, but I'd also be fine with having it on all
> > > > architectures from the start.
> > > >
> > > > If we make this arch-independent, we have an additional concern, which
> > > > is "what if some future architecture wants more than one byte here?"
> > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > maximum size that these fields can possibly be is the size of a
> > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > this point to accommodate the new fields.
> > > >
> > > > That basically implies your earlier suggestion of adding a union
> > > > member here to accommodate future expansion of the union, and adding
> > > > the new fields after the union. I'm happy to make that change, with
> > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > >
> > > I think what we need here is basically a flags word.
> > >
> > > So long as we keep a flag spare to indicate the existence of a further
> > > flags word, we can extend as needed.
> > >
> > > How the existence of the first flags words is detected is another
> > > problem.  If it only applies for newly-defined si_code values, then
> > > I guess si_code may be sufficient.
> >
> > Existing kernels will zero-initialize unused regions of the siginfo
> > data structure. The zero-initialization of the padding at the end of
> > the struct is done by the clear_user call here:
> > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> >
> > and the zero-initialization of the padding between fields and unused
> > union members is done by the clear_siginfo function which the kernel
> > calls when initializing the data structure:
> > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> >
> > Therefore, a flag word value of 0 may be used to detect a lack of
> > support for flagged fields.
>
> It's not enough that we do this today.  We would have had to do it back
> to the dawn of time (though in the arm64 case I guess we just need to go
> back to when the arch/arm64 was merged).
>
> v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> always the case, so unused parts of siginfo could be full of old junk
> from the user stack, if the kernel is sufficiently old.
>
> If we're trying to do something generic that makes sense on all arches,
> this matters.  I may have misunderstood something about the code though.

Hmm, I think you're right. The current behavior was introduced by
commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
released in 4.18. So if an application wants to be compatible with
pre-4.18 kernels then there would need to be some other way to
indicate that the fields are valid. Probably the simplest way would be
to have the application issue a uname(2) syscall and check the kernel
version before reading these fields. I have a couple of other ideas
that don't rely on version detection, if we'd prefer to avoid that.
(They are somewhat ugly, but our hand is forced by backwards
compatibility.)

One idea is to re-purpose the si_errno field as a flags field for
certain signal numbers. I checked a few kernel releases going back to
2.6.18 and it looks like the field is set to 0 except in the following
circumstances:
- sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
- seccomp failures (SIGSYS/SYS_SECCOMP)
- user-defined signal via kill_pid_usb_asyncio
- SIGSWI in 3.18 and before (code since removed)

It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
is currently unreleased. So if we wanted to go this route for SIGSEGV
we would need to stop the kernel from setting si_errno to EFAULT for
this signal before the 5.8 release.

Another idea was to have userspace set a flag in sa_flags when
registering a signal handler meaning "this signal handler requires
unknown siginfo fields to be zeroed", and have existing kernels reject
the syscall due to an unknown flag being set, but unfortunately this
won't work because existing kernels do not reject sigaction syscalls
with unknown flags set in sa_flags. A perhaps more radical idea in
this vein would be to claim some of the upper bits of the signal
number as flags that will cause the syscall to be rejected if set and
unknown to the kernel. Existing kernels (going back to at least
2.6.18) contain this code in do_sigaction:

        if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
                return -EINVAL;

and valid_signal is defined as:

static inline int valid_signal(unsigned long sig)
{
        return sig <= _NSIG ? 1 : 0;
}

All architectures define _NSIG as a value <= 128, so they will reject
a signal number with any of bits 8-31 set. This means that we can use
any of those bits for mandatory flags. Most likely we could use bit 30
(expanding down as necessary), as it keeps the signal number positive
and permits future expansion of the signal number range.

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-06-24 19:51                                                           ` Peter Collingbourne
@ 2020-07-06 16:41                                                             ` Dave Martin
  2020-07-06 19:20                                                               ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-07-06 16:41 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Linux ARM, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > >
> > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > <ebiederm@xmission.com> wrote:
> > > > > >
> > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > >
> > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > >>
> > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > >> >  }
> > > > > > >> >
> > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > >> >                       const char *str)
> > > > > > >> >  {
> > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > >> > -  if (signo == SIGKILL)
> > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > >> >            force_sig(SIGKILL);
> > > > > > >> > -  else
> > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > >> > +  } else {
> > > > > > >> > +          struct kernel_siginfo info;
> > > > > > >> > +          clear_siginfo(&info);
> > > > > > >> > +          info.si_signo = signo;
> > > > > > >> > +          info.si_errno = 0;
> > > > > > >> > +          info.si_code = code;
> > > > > > >> > +          info.si_addr = addr;
> > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > >> > +          force_sig_info(&info);
> > > > > > >> > +  }
> > > > > > >> >  }
> > > > > > >> >
> > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > >> > -                      const char *str)
> > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > >> >  {
> > > > > > >> > +  struct kernel_siginfo info;
> > > > > > >> > +
> > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > >> > +
> > > > > > >> > +  clear_siginfo(&info);
> > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > >> > +  info.si_errno = 0;
> > > > > > >> > +  info.si_code = code;
> > > > > > >> > +  info.si_addr = addr;
> > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > >> > +  force_sig_info(&info);
> > > > > > >> >  }
> > > > > > >>
> > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > >> that takes its parameters.
> > > > > > >
> > > > > > > > Can you elaborate?  How would you do this kind of thing.
> > > > > >
> > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > >
> > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > union members simultaneously.
> > > > >
> > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > C" even though from a standards point of view it is invalid. (That
> > > > > being said, this is probably moot with my proposed changes below
> > > > > though.)
> > > >
> > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > different union member from the one previously written.
> > > >
> > > > Writing a different member from the last one written can still splatter
> > > > on the other members IIUC.
> > > >
> > > > It would be better to keep things separate rather than risk
> > > > incorrectness just to save a few bytes.
> > > >
> > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > >
> > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > that includes your new information that then calls force_sig_info.
> > > > > >
> > > > > > I know of no other way to safely use the siginfo struct.
> > > > >
> > > > > So you want something like:
> > > > >
> > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > >
> > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > >
> > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > signal generation site...
> > > > > > >
> > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > >
> > > > > > > Garbled sentence?
> > > > > >
> > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > can be properly decoded and made sense of.
> > > > > >
> > > > > > I am not seeing anything like that.
> > > > >
> > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > versions of the struct? Sure, I can update that, although the code
> > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > on 32-bit ARM.
> > > > >
> > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > >> >                            __u32 _pkey;
> > > > > > >> >                    } _addr_pkey;
> > > > > > >> > +#ifdef __aarch64__
> > > > > > >> > +                  /* used with all si_codes */
> > > > > > >> > +                  struct {
> > > > > > >> > +                          short _dummy_top_byte;
> > > > > > >
> > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > >
> > > > > We would need a short's worth of padding in order to prevent the
> > > > > fields from occupying the same address as si_addr_lsb.
> > > > >
> > > > > > >
> > > > > > >> > +                          unsigned char _top_byte;
> > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > >> > +                  } _addr_top_byte;
> > > > > > >> > +#endif
> > > > > > >> >            };
> > > > > > >> >    } _sigfault;
> > > > > > >> >
> > > > > > >>
> > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > >>
> > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > > >> is built for a single architecture.
> > > > >
> > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > considering a similar feature:
> > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > I would have opted to expand this to other architectures on an
> > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > architectures from the start.
> > > > >
> > > > > If we make this arch-independent, we have an additional concern, which
> > > > > is "what if some future architecture wants more than one byte here?"
> > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > maximum size that these fields can possibly be is the size of a
> > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > this point to accommodate the new fields.
> > > > >
> > > > > That basically implies your earlier suggestion of adding a union
> > > > > member here to accommodate future expansion of the union, and adding
> > > > > the new fields after the union. I'm happy to make that change, with
> > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > >
> > > > I think what we need here is basically a flags word.
> > > >
> > > > So long as we keep a flag spare to indicate the existence of a further
> > > > flags word, we can extend as needed.
> > > >
> > > > How the existence of the first flags words is detected is another
> > > > problem.  If it only applies for newly-defined si_code values, then
> > > > I guess si_code may be sufficient.
> > >
> > > Existing kernels will zero-initialize unused regions of the siginfo
> > > data structure. The zero-initialization of the padding at the end of
> > > the struct is done by the clear_user call here:
> > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > >
> > > and the zero-initialization of the padding between fields and unused
> > > union members is done by the clear_siginfo function which the kernel
> > > calls when initializing the data structure:
> > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > >
> > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > support for flagged fields.
> >
> > It's not enough that we do this today.  We would have had to do it back
> > to the dawn of time (though in the arm64 case I guess we just need to go
> > back to when the arch/arm64 was merged).
> >
> > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > always the case, so unused parts of siginfo could be full of old junk
> > from the user stack, if the kernel is sufficiently old.
> >
> > If we're trying to do something generic that makes sense on all arches,
> > this matters.  I may have misunderstood something about the code though.
> 
> Hmm, I think you're right. The current behavior was introduced by
> commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> released in 4.18. So if an application wants to be compatible with
> pre-4.18 kernels then there would need to be some other way to
> indicate that the fields are valid. Probably the simplest way would be
> to have the application issue a uname(2) syscall and check the kernel
> version before reading these fields. I have a couple of other ideas
> that don't rely on version detection, if we'd prefer to avoid that.
> (They are somewhat ugly, but our hand is forced by backwards
> compatibility.)
> 
> One idea is to re-purpose the si_errno field as a flags field for
> certain signal numbers. I checked a few kernel releases going back to
> 2.6.18 and it looks like the field is set to 0 except in the following
> circumstances:
> - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> - seccomp failures (SIGSYS/SYS_SECCOMP)
> - user-defined signal via kill_pid_usb_asyncio
> - SIGSWI in 3.18 and before (code since removed)
> 
> It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> is currently unreleased. So if we wanted to go this route for SIGSEGV
> we would need to stop the kernel from setting si_errno to EFAULT for
> this signal before the 5.8 release.
> 
> Another idea was to have userspace set a flag in sa_flags when
> registering a signal handler meaning "this signal handler requires
> unknown siginfo fields to be zeroed", and have existing kernels reject
> the syscall due to an unknown flag being set, but unfortunately this
> won't work because existing kernels do not reject sigaction syscalls
> with unknown flags set in sa_flags. A perhaps more radical idea in
> this vein would be to claim some of the upper bits of the signal
> number as flags that will cause the syscall to be rejected if set and
> unknown to the kernel. Existing kernels (going back to at least
> 2.6.18) contain this code in do_sigaction:
> 
>         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
>                 return -EINVAL;
> 
> and vald_signal is defined as:
> 
> static inline int valid_signal(unsigned long sig)
> {
>         return sig <= _NSIG ? 1 : 0;
> }
> 
> All architectures define _NSIG as a value <= 128, so they will reject
> a signal number with any of bits 8-31 set. This means that we can use
> any of those bits for mandatory flags. Most likely we could use bit 30
> (expanding down as necessary), as it keeps the signal number positive
> and permits future expansion of the signal number range.

Does the signal core code actually guarantee to zero the unused fields?
Unless the fields are poked in by hand this is fraught with subtleties,
especially when unions are involved.  (I'm sure the code tries to do it,
but I've not eyeballed it in detail...)


Using unused bits in the signal number to turn on new functionality
feels risky.  As currently specified, this is just a number.  Since
today a successful sigaction(n ...) guarantees that n is a valid signal
number, reasonable code like the following would trigger a buffer
overrun if we start trying to encode anything else in there:

struct sigaction actions[NSIG];

int do_something( ... )
{
	...

	if (!sigaction(n, sa, ...)) {
		actions[n] = *sa;
		return 0;
	}

	...
}


I think it would be cleaner to add a single flag field that can be
used for detecting other extensions, and request it via a new sa_flags
bit.  This removes the need for semantically useless zeroing of unused
fields (though for hygiene and backwards compatibility reasons we would
probably want to carry on zeroing them anyway).

I can see no simpler way to add supplementary siginfo fields for
existing si_codes.  For si_codes that didn't exist before the zeroing
came in we could still detect optional si_code-specific fields via
zeroing, but it seems messy to have two ways of detecting extensions.

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-06 16:41                                                             ` Dave Martin
@ 2020-07-06 19:20                                                               ` Peter Collingbourne
  2020-07-07 14:19                                                                 ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-07-06 19:20 UTC (permalink / raw)
  To: Dave Martin
  Cc: Linux ARM, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >
> > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > >
> > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > <ebiederm@xmission.com> wrote:
> > > > > > >
> > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > >
> > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > >>
> > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > >> >  }
> > > > > > > >> >
> > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > >> >                       const char *str)
> > > > > > > >> >  {
> > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > >> > -  else
> > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > >> > +  } else {
> > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > >> > +          info.si_signo = signo;
> > > > > > > >> > +          info.si_errno = 0;
> > > > > > > >> > +          info.si_code = code;
> > > > > > > >> > +          info.si_addr = addr;
> > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > >> > +          force_sig_info(&info);
> > > > > > > >> > +  }
> > > > > > > >> >  }
> > > > > > > >> >
> > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > >> > -                      const char *str)
> > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > >> >  {
> > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > >> > +
> > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > >> > +
> > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > >> > +  info.si_errno = 0;
> > > > > > > >> > +  info.si_code = code;
> > > > > > > >> > +  info.si_addr = addr;
> > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > >> > +  force_sig_info(&info);
> > > > > > > >> >  }
> > > > > > > >>
> > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > >> that takes it's parameters.
> > > > > > > >
> > > > > > > > Can you elaborate?  How would you do this king of thing.
> > > > > > >
> > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > >
> > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > union members simultantiously.
> > > > > >
> > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > being said, this is probably moot with my proposed changes below
> > > > > > though.)
> > > > >
> > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > different union member from the one previously written.
> > > > >
> > > > > Writing a different member from the last one written can still splatter
> > > > > on the other members IIUC.
> > > > >
> > > > > It would be better to keep things separate rather than risk
> > > > > incorrectness just to save a few bytes.
> > > > >
> > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > >
> > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > >
> > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > >
> > > > > > So you want something like:
> > > > > >
> > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > >
> > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > >
> > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > signal generation site...
> > > > > > > >
> > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > >
> > > > > > > > Garbled sentence?
> > > > > > >
> > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > can be properly decoded and made sense of.
> > > > > > >
> > > > > > > I am not seeing anything like that.
> > > > > >
> > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > on 32-bit ARM.
> > > > > >
> > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > >> >                            __u32 _pkey;
> > > > > > > >> >                    } _addr_pkey;
> > > > > > > >> > +#ifdef __aarch64__
> > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > >> > +                  struct {
> > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > >
> > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > >
> > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > >
> > > > > > > >
> > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > >> > +#endif
> > > > > > > >> >            };
> > > > > > > >> >    } _sigfault;
> > > > > > > >> >
> > > > > > > >>
> > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > >>
> > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > > > >> is built for a single architecture.
> > > > > >
> > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > considering a similar feature:
> > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > I would have opted to expand this to other architectures on an
> > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > architectures from the start.
> > > > > >
> > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > this point to accommodate the new fields.
> > > > > >
> > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > >
> > > > > I think what we need here is basically a flags word.
> > > > >
> > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > flags word, we can extend as needed.
> > > > >
> > > > > How the existence of the first flags words is detected is another
> > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > I guess si_code may be sufficient.
> > > >
> > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > data structure. The zero-initialization of the padding at the end of
> > > > the struct is done by the clear_user call here:
> > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > >
> > > > and the zero-initialization of the padding between fields and unused
> > > > union members is done by the clear_siginfo function which the kernel
> > > > calls when initializing the data structure:
> > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > >
> > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > support for flagged fields.
> > >
> > > It's not enough that we do this today.  We would have had to do it back
> > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > back to when the arch/arm64 was merged).
> > >
> > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > always the case, so unused parts of siginfo could be full of old junk
> > > from the user stack, if the kernel is sufficiently old.
> > >
> > > If we're trying to do something generic that makes sense on all arches,
> > > this matters.  I may have misunderstood something about the code though.
> >
> > Hmm, I think you're right. The current behavior was introduced by
> > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > released in 4.18. So if an application wants to be compatible with
> > pre-4.18 kernels then there would need to be some other way to
> > indicate that the fields are valid. Probably the simplest way would be
> > to have the application issue a uname(2) syscall and check the kernel
> > version before reading these fields. I have a couple of other ideas
> > that don't rely on version detection, if we'd prefer to avoid that.
> > (They are somewhat ugly, but our hand is forced by backwards
> > compatibility.)
> >
> > One idea is to re-purpose the si_errno field as a flags field for
> > certain signal numbers. I checked a few kernel releases going back to
> > 2.6.18 and it looks like the field is set to 0 except in the following
> > circumstances:
> > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > - user-defined signal via kill_pid_usb_asyncio
> > - SIGSWI in 3.18 and before (code since removed)
> >
> > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > we would need to stop the kernel from setting si_errno to EFAULT for
> > this signal before the 5.8 release.
> >
> > Another idea was to have userspace set a flag in sa_flags when
> > registering a signal handler meaning "this signal handler requires
> > unknown siginfo fields to be zeroed", and have existing kernels reject
> > the syscall due to an unknown flag being set, but unfortunately this
> > won't work because existing kernels do not reject sigaction syscalls
> > with unknown flags set in sa_flags. A perhaps more radical idea in
> > this vein would be to claim some of the upper bits of the signal
> > number as flags that will cause the syscall to be rejected if set and
> > unknown to the kernel. Existing kernels (going back to at least
> > 2.6.18) contain this code in do_sigaction:
> >
> >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> >                 return -EINVAL;
> >
> > and vald_signal is defined as:
> >
> > static inline int valid_signal(unsigned long sig)
> > {
> >         return sig <= _NSIG ? 1 : 0;
> > }
> >
> > All architectures define _NSIG as a value <= 128, so they will reject
> > a signal number with any of bits 8-31 set. This means that we can use
> > any of those bits for mandatory flags. Most likely we could use bit 30
> > (expanding down as necessary), as it keeps the signal number positive
> > and permits future expansion of the signal number range.
>
> Does the signal core code actually gurantee to zero the unused fields?
> Unless the fields are poked in by hand this is fraught with subtlelies,
> especially when unions are involved.  (I'm sure the code tries to do it,
> but I've not eyeballed it in detail...)

It memsets the siginfo structure before setting the fields and sending
the signal (grep for clear_siginfo which is just a memset; you should
find a call before all callers of force_sig_info). Memset is the right
approach here since, unlike setting fields by hand, it also clears padding,
which could otherwise leak information from the kernel. IIUC this is
the reason why Eric wants all of the signals to be raised via wrappers
in kernel/signal.c instead of via force_sig_info directly (to make
this aspect easier to audit).

> Using unused bits in the signal number to turn on new functionality
> feels risky.  As currently specified, this is just a number.  Since
> today a successful sigaction(n ...) guarantees that n is a valid signal
> number, reasonable code like the following would trigger a buffer
> overrun if we start trying to encode anything else in there:
>
> struct sigaction actions[NSIG];
>
> int do_something( ... )
> {
>         ...
>
>         if (!sigaction(n, sa, ...)) {
>                 actions[n] = *sa;
>                 return 0;
>         }
>
>         ...
> }

I imagine the bit in the signal number being set by the direct caller
to sigaction, and we could specifically recommend that calling
pattern. In that case, your "n" wouldn't have the bit set in it. It
could only appear in newly-written code that doesn't follow our
recommendations, and there are already plenty of much more likely ways
to cause buffer overflows in C code that doesn't follow
recommendations anyway. (And even if such a buffer overflow occurred,
it would very likely be caught early in development by the MMU due to
the magnitude of the number 1<<30.)

> I think it would be cleaner for to add a single flag field that can be
> used for detecting other extensions, and request it via a new sa_flags
> bit.  This removes the need for sematically useless zeroing of unused
> fields (though for hygiene and backwards compatibility reasons we would
> probably want to carry on zeroing them anyway).
>
> I can see no simpler way to add supplementary siginfo fields for
> existing si_codes.  For si_codes that didn't exist before the zeroing
> came in we could still detect optional si_code-specific fields via
> zeroing, but it seems messary to have two ways of detecting extensions.

That would certainly be cleaner if it worked, but that would only be
the case if old kernels rejected unknown bits in sa_flags, and
unfortunately they don't. With the bit in the signal number, the "old
kernels reject" behavior admits relatively straightforward usage code:

void set_segv_handler(void) {
  struct sigaction sa;
  sa.sa_sigaction = handle_segv;
  sa.sa_flags = SA_SIGINFO;
  if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
succeeds in new kernels, fails in old kernels
    sa.sa_sigaction = clear_fields_and_handle_segv;
    if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
      perror("sigaction");
  }
}

void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
  sa->si_future_field = 0;
  handle_segv(signum, sa, ctx);
}

void handle_segv(int signum, siginfo_t *sa, void *ctx) {
  // At this point, si_future_field will have the value 0 in old
kernels and the kernel-supplied value in new kernels.
}

Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
number to sa_flags. In that case, the first sigaction would succeed in
old kernels so handle_segv wouldn't know whether it can safely read
from si_future_field. With the sa_flags approach, you would need
kernel version number checking via uname before setting the flag in
sa_flags, and at that point why even have the flag in sa_flags at all
since you could just have the signal handler conditionally read from
si_future_field based on the uname?

Note that the same applies to a flag indicating the availability of a
si_flags field in sigaction (just
s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
s/si_future_field/si_flags/ in the usage code above). In terms of
SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.

Another thought that occurred to me is that we may consider
generalizing this a step further and introducing a single flag bit in
the signal number that means "reject unknown flags in sa_flags". This
would mean that we wouldn't need to add any more flag bits to the
signal number in the future, thus limiting this signal number hack to
a single bit; all future mandatory behavior changes could just be put
behind a flag in sa_flags and userspace code would easily be able to
detect missing support for a flag and fall back if necessary. In our
case, this would imply usage code like this:

void set_segv_handler(void) {
  struct sigaction sa;
  sa.sa_sigaction = handle_segv;
  sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
  // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
  // Fails in kernels with SF_CHECK_SA_FLAGS support but no
SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
  // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
the bounds check on the signal number).
  if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
    sa.sa_sigaction = clear_fields_and_handle_segv;
    sa.sa_flags = SA_SIGINFO;
    // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
we're using sa_flags from the beginning of time.
    if (sigaction(SIGSEGV, &sa, 0) < 0)
      perror("sigaction");
  }
}

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-06 19:20                                                               ` Peter Collingbourne
@ 2020-07-07 14:19                                                                 ` Dave Martin
  2020-07-07 19:07                                                                   ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-07-07 14:19 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Catalin Marinas, Kevin Brodsky, Oleg Nesterov, Evgenii Stepanov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Linux ARM, Richard Henderson

On Mon, Jul 06, 2020 at 12:20:33PM -0700, Peter Collingbourne wrote:
> On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > >
> > > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > >
> > > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > > <ebiederm@xmission.com> wrote:
> > > > > > > >
> > > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > > >
> > > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > > >>
> > > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > > >> >  }
> > > > > > > > >> >
> > > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > > >> >                       const char *str)
> > > > > > > > >> >  {
> > > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > > >> > -  else
> > > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > > >> > +  } else {
> > > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > > >> > +          info.si_signo = signo;
> > > > > > > > >> > +          info.si_errno = 0;
> > > > > > > > >> > +          info.si_code = code;
> > > > > > > > >> > +          info.si_addr = addr;
> > > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > > >> > +          force_sig_info(&info);
> > > > > > > > >> > +  }
> > > > > > > > >> >  }
> > > > > > > > >> >
> > > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > > >> > -                      const char *str)
> > > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > > >> >  {
> > > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > > >> > +
> > > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > > >> > +
> > > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > > >> > +  info.si_errno = 0;
> > > > > > > > >> > +  info.si_code = code;
> > > > > > > > >> > +  info.si_addr = addr;
> > > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > > >> > +  force_sig_info(&info);
> > > > > > > > >> >  }
> > > > > > > > >>
> > > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > > >> that takes it's parameters.
> > > > > > > > >
> > > > > > > > > Can you elaborate?  How would you do this king of thing.
> > > > > > > >
> > > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > > >
> > > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > > union members simultantiously.
> > > > > > >
> > > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > > being said, this is probably moot with my proposed changes below
> > > > > > > though.)
> > > > > >
> > > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > > different union member from the one previously written.
> > > > > >
> > > > > > Writing a different member from the last one written can still splatter
> > > > > > on the other members IIUC.
> > > > > >
> > > > > > It would be better to keep things separate rather than risk
> > > > > > incorrectness just to save a few bytes.
> > > > > >
> > > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > > >
> > > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > > >
> > > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > > >
> > > > > > > So you want something like:
> > > > > > >
> > > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > >
> > > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > > >
> > > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > > signal generation site...
> > > > > > > > >
> > > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > > >
> > > > > > > > > Garbled sentence?
> > > > > > > >
> > > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > > can be properly decoded and made sense of.
> > > > > > > >
> > > > > > > > I am not seeing anything like that.
> > > > > > >
> > > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > > on 32-bit ARM.
> > > > > > >
> > > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > > >> >                            __u32 _pkey;
> > > > > > > > >> >                    } _addr_pkey;
> > > > > > > > >> > +#ifdef __aarch64__
> > > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > > >> > +                  struct {
> > > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > > >
> > > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > > >
> > > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > > >
> > > > > > > > >
> > > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > > >> > +#endif
> > > > > > > > >> >            };
> > > > > > > > >> >    } _sigfault;
> > > > > > > > >> >
> > > > > > > > >>
> > > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > > >>
> > > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > > > > >> is built for a single architecture.
> > > > > > >
> > > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > > considering a similar feature:
> > > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > > I would have opted to expand this to other architectures on an
> > > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > > architectures from the start.
> > > > > > >
> > > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > > this point to accommodate the new fields.
> > > > > > >
> > > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > > >
> > > > > > I think what we need here is basically a flags word.
> > > > > >
> > > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > > flags word, we can extend as needed.
> > > > > >
> > > > > > How the existence of the first flags words is detected is another
> > > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > > I guess si_code may be sufficient.
> > > > >
> > > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > > data structure. The zero-initialization of the padding at the end of
> > > > > the struct is done by the clear_user call here:
> > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > > >
> > > > > and the zero-initialization of the padding between fields and unused
> > > > > union members is done by the clear_siginfo function which the kernel
> > > > > calls when initializing the data structure:
> > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > > >
> > > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > > support for flagged fields.
> > > >
> > > > It's not enough that we do this today.  We would have had to do it back
> > > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > > back to when the arch/arm64 was merged).
> > > >
> > > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > > always the case, so unused parts of siginfo could be full of old junk
> > > > from the user stack, if the kernel is sufficiently old.
> > > >
> > > > If we're trying to do something generic that makes sense on all arches,
> > > > this matters.  I may have misunderstood something about the code though.
> > >
> > > Hmm, I think you're right. The current behavior was introduced by
> > > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > > released in 4.18. So if an application wants to be compatible with
> > > pre-4.18 kernels then there would need to be some other way to
> > > indicate that the fields are valid. Probably the simplest way would be
> > > to have the application issue a uname(2) syscall and check the kernel
> > > version before reading these fields. I have a couple of other ideas
> > > that don't rely on version detection, if we'd prefer to avoid that.
> > > (They are somewhat ugly, but our hand is forced by backwards
> > > compatibility.)
> > >
> > > One idea is to re-purpose the si_errno field as a flags field for
> > > certain signal numbers. I checked a few kernel releases going back to
> > > 2.6.18 and it looks like the field is set to 0 except in the following
> > > circumstances:
> > > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > > - user-defined signal via kill_pid_usb_asyncio
> > > - SIGSWI in 3.18 and before (code since removed)
> > >
> > > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > > we would need to stop the kernel from setting si_errno to EFAULT for
> > > this signal before the 5.8 release.
> > >
> > > Another idea was to have userspace set a flag in sa_flags when
> > > registering a signal handler meaning "this signal handler requires
> > > unknown siginfo fields to be zeroed", and have existing kernels reject
> > > the syscall due to an unknown flag being set, but unfortunately this
> > > won't work because existing kernels do not reject sigaction syscalls
> > > with unknown flags set in sa_flags. A perhaps more radical idea in
> > > this vein would be to claim some of the upper bits of the signal
> > > number as flags that will cause the syscall to be rejected if set and
> > > unknown to the kernel. Existing kernels (going back to at least
> > > 2.6.18) contain this code in do_sigaction:
> > >
> > >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> > >                 return -EINVAL;
> > >
> > > and vald_signal is defined as:
> > >
> > > static inline int valid_signal(unsigned long sig)
> > > {
> > >         return sig <= _NSIG ? 1 : 0;
> > > }
> > >
> > > All architectures define _NSIG as a value <= 128, so they will reject
> > > a signal number with any of bits 8-31 set. This means that we can use
> > > any of those bits for mandatory flags. Most likely we could use bit 30
> > > (expanding down as necessary), as it keeps the signal number positive
> > > and permits future expansion of the signal number range.
> >
> > Does the signal core code actually gurantee to zero the unused fields?
> > Unless the fields are poked in by hand this is fraught with subtlelies,
> > especially when unions are involved.  (I'm sure the code tries to do it,
> > but I've not eyeballed it in detail...)
> 
> It memsets the siginfo structure before setting the fields and sending
> the signal (grep for clear_siginfo which is just a memset; you should
> find a call before all callers of force_sig_info). Memset is the right
> approach here since unlike setting fields by hand it clears padding
> which could lead to information leaks from the kernel. IIUC this is
> the reason why Eric wants all of the signals to be raised via wrappers
> in kernel/signal.c instead of via force_sig_info directly (to make
> this aspect easier to audit).

My impression was that the reason for this model is partly to ensure
that siginfo fields are populated more consistently.  When it was all
down to the individual callers, inconsistencies crept in.

With regard to memset(), this is not a complete defence against data
leakage.  Assigning to a struct member can set any or all padding in
the struct to random garbage (consider write-combining of neighboring
member writes into a single larger access in asm for example).  The
only way to avoid this is to ensure that the struct is 100%
padding-free, and that each member of a union is the same size.  A
quick glance at <uapi/asm-generic/siginfo.h> confirms that this is not
the case.

This might need to be looked at separately.

But it does mean, strictly speaking, that we can't reliably add new
fields anywhere that there was previously padding: assigning to
neighboring members can still fill those with garbage after the
memset().

> > Using unused bits in the signal number to turn on new functionality
> > feels risky.  As currently specified, this is just a number.  Since
> > today a successful sigaction(n ...) guarantees that n is a valid signal
> > number, reasonable code like the following would trigger a buffer
> > overrun if we start trying to encode anything else in there:
> >
> > struct sigaction actions[NSIG];
> >
> > int do_something( ... )
> > {
> >         ...
> >
> >         if (!sigaction(n, sa, ...)) {
> >                 actions[n] = *sa;
> >                 return 0;
> >         }
> >
> >         ...
> > }
> 
> I imagine the bit in the signal number being set by the direct caller
> to sigaction, and we could specifically recommend that calling
> pattern. In that case, your "n" wouldn't have the bit set in it. It

I can imagine this too, but that doesn't mean that software does it.

If the above kind of thing exists in a framework or library somewhere,
we could get problems.  Similarly, a pre-existing LD_PRELOAD framework
that provides a wrapper for sigaction may now go wrong even if your
pattern is followed -- i.e., the caller thinks it's calling sigaction
directly but in fact it isn't.

> could only appear in newly-written code that doesn't follow our
> recommendations, and there are already plenty of much more likely ways
> to cause buffer overflows in C code that doesn't follow
> recommendations anyway. (And even if such a buffer overflow occurred,
> it would very likely be caught early in development by the MMU due to
> the magnitude of the number 1<<30.)

Choosing the bit value is hard.  If shifting it overflows, this can
trigger random undefined behaviour in the compiler in addition to (or
perhaps instead of) an out-of-bounds access or segfault.

If shifting it doesn't overflow, we might still fall into a valid
mapping, though I'd agree a segfault is more likely.

> 
> > I think it would be cleaner for to add a single flag field that can be
> > used for detecting other extensions, and request it via a new sa_flags
> > bit.  This removes the need for sematically useless zeroing of unused
> > fields (though for hygiene and backwards compatibility reasons we would
> > probably want to carry on zeroing them anyway).
> >
> > I can see no simpler way to add supplementary siginfo fields for
> > existing si_codes.  For si_codes that didn't exist before the zeroing
> > came in we could still detect optional si_code-specific fields via
> > zeroing, but it seems messary to have two ways of detecting extensions.
> 
> That would certainly be cleaner if it worked, but that would only be
> the case if old kernels rejected unknown bits in sa_flags, and
> unfortunately they don't. With the bit in the signal number, the "old

Hmm, that is a problem I wasn't aware of.

> kernels reject" behavior admits relatively straightforward usage code:
> 
> void set_segv_handler(void) {
>   struct sigaction sa;
>   sa.sa_sigaction = handle_segv;
>   sa.sa_flags = SA_SIGINFO;
>   if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
> succeeds in new kernels, fails in old kernels
>     sa.sa_sigaction = clear_fields_and_handle_segv;
>     if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
>       perror("sigaction");
>   }
> }
> 
> void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
>   sa->si_future_field = 0;
>   handle_segv(signum, sa, ctx);
> }
> 
> void handle_segv(int signum, siginfo_t *sa, void *ctx) {
>   // At this point, si_future_field will have the value 0 in old
> kernels and the kernel-supplied value in new kernels.
> }
> 
> Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
> number to sa_flags. In that case, the first sigaction would succeed in
> old kernels so handle_segv wouldn't know whether it can safely read
> from si_future_field. With the sa_flags approach, you would need
> kernel version number checking via uname before setting the flag in
> sa_flags, and at that point why even have the flag in sa_flags at all
> since you could just have the signal handler conditionally read from
> si_future_field based on the uname?

Software setting SA_SIFLAGS (or whatever) is new by definition, since
it would be using a new #define.  So it might be reasonable to put the
burden on that software to verify that the flag was really accepted by
the kernel, by reading it back.

Unfortunately, even relatively recent kernels blindly store sa_flags
in the kernel without validating it, and so it looks like duff flags
can be read back out via a sigaction() call.  Dang.


Perhaps a new frontend syscall could be added.  A new libc that knows
about this "sigaction2" could use it and mask off problem bits from
sa_flags in its sigaction() wrapper before calling sigaction2.  An old
libc would call the old sigaction syscall, where we would ignore these
new sa_flags bits as before.

This may not be a popular approach though, and software wouldn't be able
to use our new features until libc is updated to match.

If we go down this route, it may provide additional opportunities to fix
annoying defects in the old interface.


> Note that the same applies to a flag indicating the availability of a
> si_flags field in sigaction (just
> s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
> s/si_future_field/si_flags/ in the usage code above). In terms of
> SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.
> 
> Another thought that occurred to me is that we may consider
> generalizing this a step further and introducing a single flag bit in
> the signal number that means "reject unknown flags in sa_flags". This
> would mean that we wouldn't need to add any more flag bits to the
> signal number in the future, thus limiting this signal number hack to
> a single bit; all future mandatory behavior changes could just be put
> behind a flag in sa_flags and userspace code would easily be able to
> detect missing support for a flag and fall back if necessary. In our
> case, this would imply usage code like this:
> 
> void set_segv_handler(void) {
>   struct sigaction sa;
>   sa.sa_sigaction = handle_segv;
>   sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
>   // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
>   // Fails in kernels with SF_CHECK_SA_FLAGS support but no
> SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
>   // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
> the bounds check on the signal number).
>   if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
>     sa.sa_sigaction = clear_fields_and_handle_segv;
>     sa.sa_flags = SA_SIGINFO;
>     // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
> we're using sa_flags from the beginning of time.
>     if (sigaction(SIGSEGV, &sa, 0) < 0)
>       perror("sigaction");
>   }
> }

As with the other options this could work, but looks like it could
break the ABI due to violating the original semantics for the signal
number argument.  Perhaps I'm being too paranoid.

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-07 14:19                                                                 ` Dave Martin
@ 2020-07-07 19:07                                                                   ` Peter Collingbourne
  2020-07-08 11:00                                                                     ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-07-07 19:07 UTC (permalink / raw)
  To: Dave Martin
  Cc: Catalin Marinas, Kevin Brodsky, Oleg Nesterov, Evgenii Stepanov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Linux ARM, Richard Henderson

On Tue, Jul 7, 2020 at 7:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Mon, Jul 06, 2020 at 12:20:33PM -0700, Peter Collingbourne wrote:
> > On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >
> > > On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > > > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > >
> > > > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > > > <ebiederm@xmission.com> wrote:
> > > > > > > > >
> > > > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > > > >
> > > > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > > > >>
> > > > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > > > >> >  }
> > > > > > > > > >> >
> > > > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > > > >> >                       const char *str)
> > > > > > > > > >> >  {
> > > > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > > > >> > -  else
> > > > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > > > >> > +  } else {
> > > > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > > > >> > +          info.si_signo = signo;
> > > > > > > > > >> > +          info.si_errno = 0;
> > > > > > > > > >> > +          info.si_code = code;
> > > > > > > > > >> > +          info.si_addr = addr;
> > > > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > > > >> > +          force_sig_info(&info);
> > > > > > > > > >> > +  }
> > > > > > > > > >> >  }
> > > > > > > > > >> >
> > > > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > > > >> > -                      const char *str)
> > > > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > > > >> >  {
> > > > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > > > >> > +
> > > > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > > > >> > +
> > > > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > > > >> > +  info.si_errno = 0;
> > > > > > > > > >> > +  info.si_code = code;
> > > > > > > > > >> > +  info.si_addr = addr;
> > > > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > > > >> > +  force_sig_info(&info);
> > > > > > > > > >> >  }
> > > > > > > > > >>
> > > > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > > > > >> that takes its parameters.
> > > > > > > > > >
> > > > > > > > > > > Can you elaborate?  How would you do this kind of thing?
> > > > > > > > >
> > > > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > > > >
> > > > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > > > union members simultaneously.
> > > > > > > >
> > > > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > > > being said, this is probably moot with my proposed changes below
> > > > > > > > though.)
> > > > > > >
> > > > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > > > different union member from the one previously written.
> > > > > > >
> > > > > > > Writing a different member from the last one written can still splatter
> > > > > > > on the other members IIUC.
> > > > > > >
> > > > > > > It would be better to keep things separate rather than risk
> > > > > > > incorrectness just to save a few bytes.
> > > > > > >
> > > > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > > > >
> > > > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > > > >
> > > > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > > > >
> > > > > > > > So you want something like:
> > > > > > > >
> > > > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > >
> > > > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > > > >
> > > > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > > > signal generation site...
> > > > > > > > > >
> > > > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > > > >
> > > > > > > > > > Garbled sentence?
> > > > > > > > >
> > > > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > > > can be properly decoded and made sense of.
> > > > > > > > >
> > > > > > > > > I am not seeing anything like that.
> > > > > > > >
> > > > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > > > on 32-bit ARM.
> > > > > > > >
> > > > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > > > >> >                            __u32 _pkey;
> > > > > > > > > >> >                    } _addr_pkey;
> > > > > > > > > >> > +#ifdef __aarch64__
> > > > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > > > >> > +                  struct {
> > > > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > > > >
> > > > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > > > >
> > > > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > > > >> > +#endif
> > > > > > > > > >> >            };
> > > > > > > > > >> >    } _sigfault;
> > > > > > > > > >> >
> > > > > > > > > >>
> > > > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > > > >>
> > > > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > > > > > >> is built for a single architecture.
> > > > > > > >
> > > > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > > > considering a similar feature:
> > > > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > > > I would have opted to expand this to other architectures on an
> > > > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > > > architectures from the start.
> > > > > > > >
> > > > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > > > this point to accommodate the new fields.
> > > > > > > >
> > > > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > > > >
> > > > > > > I think what we need here is basically a flags word.
> > > > > > >
> > > > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > > > flags word, we can extend as needed.
> > > > > > >
> > > > > > > How the existence of the first flags words is detected is another
> > > > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > > > I guess si_code may be sufficient.
> > > > > >
> > > > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > > > data structure. The zero-initialization of the padding at the end of
> > > > > > the struct is done by the clear_user call here:
> > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > > > >
> > > > > > and the zero-initialization of the padding between fields and unused
> > > > > > union members is done by the clear_siginfo function which the kernel
> > > > > > calls when initializing the data structure:
> > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > > > >
> > > > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > > > support for flagged fields.
> > > > >
> > > > > It's not enough that we do this today.  We would have had to do it back
> > > > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > > > back to when the arch/arm64 was merged).
> > > > >
> > > > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > > > always the case, so unused parts of siginfo could be full of old junk
> > > > > from the user stack, if the kernel is sufficiently old.
> > > > >
> > > > > If we're trying to do something generic that makes sense on all arches,
> > > > > this matters.  I may have misunderstood something about the code though.
> > > >
> > > > Hmm, I think you're right. The current behavior was introduced by
> > > > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > > > released in 4.18. So if an application wants to be compatible with
> > > > pre-4.18 kernels then there would need to be some other way to
> > > > indicate that the fields are valid. Probably the simplest way would be
> > > > to have the application issue a uname(2) syscall and check the kernel
> > > > version before reading these fields. I have a couple of other ideas
> > > > that don't rely on version detection, if we'd prefer to avoid that.
> > > > (They are somewhat ugly, but our hand is forced by backwards
> > > > compatibility.)
> > > >
> > > > One idea is to re-purpose the si_errno field as a flags field for
> > > > certain signal numbers. I checked a few kernel releases going back to
> > > > 2.6.18 and it looks like the field is set to 0 except in the following
> > > > circumstances:
> > > > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > > > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > > > - user-defined signal via kill_pid_usb_asyncio
> > > > - SIGSWI in 3.18 and before (code since removed)
> > > >
> > > > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > > > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > > > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > > > we would need to stop the kernel from setting si_errno to EFAULT for
> > > > this signal before the 5.8 release.
> > > >
> > > > Another idea was to have userspace set a flag in sa_flags when
> > > > registering a signal handler meaning "this signal handler requires
> > > > unknown siginfo fields to be zeroed", and have existing kernels reject
> > > > the syscall due to an unknown flag being set, but unfortunately this
> > > > won't work because existing kernels do not reject sigaction syscalls
> > > > with unknown flags set in sa_flags. A perhaps more radical idea in
> > > > this vein would be to claim some of the upper bits of the signal
> > > > number as flags that will cause the syscall to be rejected if set and
> > > > unknown to the kernel. Existing kernels (going back to at least
> > > > 2.6.18) contain this code in do_sigaction:
> > > >
> > > >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> > > >                 return -EINVAL;
> > > >
> > > > and valid_signal is defined as:
> > > >
> > > > static inline int valid_signal(unsigned long sig)
> > > > {
> > > >         return sig <= _NSIG ? 1 : 0;
> > > > }
> > > >
> > > > All architectures define _NSIG as a value <= 128, so they will reject
> > > > a signal number with any of bits 8-31 set. This means that we can use
> > > > any of those bits for mandatory flags. Most likely we could use bit 30
> > > > (expanding down as necessary), as it keeps the signal number positive
> > > > and permits future expansion of the signal number range.
> > >
> > > Does the signal core code actually guarantee to zero the unused fields?
> > > Unless the fields are poked in by hand this is fraught with subtleties,
> > > especially when unions are involved.  (I'm sure the code tries to do it,
> > > but I've not eyeballed it in detail...)
> >
> > It memsets the siginfo structure before setting the fields and sending
> > the signal (grep for clear_siginfo which is just a memset; you should
> > find a call before all callers of force_sig_info). Memset is the right
> > approach here since unlike setting fields by hand it clears padding
> > which could lead to information leaks from the kernel. IIUC this is
> > the reason why Eric wants all of the signals to be raised via wrappers
> > in kernel/signal.c instead of via force_sig_info directly (to make
> > this aspect easier to audit).
>
> My impression was that the reason for this model is partly to ensure
> that siginfo fields are populated more consistently.  When it was all
> down to the individual callers, inconsistencies crept in.
>
> With regard to memset(), this is not a complete defence against data
> leakage.  Assigning to a struct member can set any or all padding in
> the struct to random garbage (consider write-combining of neighboring
> member writes into a single larger access in asm for example).  The

I don't believe that LLVM will store to padding like this. I don't
know about GCC, but I wouldn't be surprised if this is
something that the kernel would want to turn off in "kernel C" (like
it turns off strict aliasing) specifically because of the information
leak issue.

> only way to avoid this is to ensure that the struct is 100%
> padding-free, and that each member of a union is the same size.  A
> quick glance at <uapi/asm-generic/siginfo.h> confirms that this is not
> the case.
>
> This might need to be looked at separately.
>
> But it does mean, strictly speaking, that we can't reliably add new
> fields anywhere that there was previously padding: assigning to
> neighboring members can still fill those with garbage after the
> memset().

...but this is largely moot because I'm not proposing to add new
fields in the padding any more (because the fields needed to become
larger in order to accommodate future hypothetical architectures which
might want to use the fields, and thus they wouldn't fit in the
padding). The siginfo.h diff would be something like:

diff --git a/include/uapi/asm-generic/siginfo.h
b/include/uapi/asm-generic/siginfo.h
index cb3d6c267181..4a2fe257415d 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -91,7 +91,10 @@ union __sifields {
                                char _dummy_pkey[__ADDR_BND_PKEY_PAD];
                                __u32 _pkey;
                        } _addr_pkey;
+                       void *_pad[6];
                };
+               uintptr_t _ignored_bits;
+               uintptr_t _ignored_bits_mask;
        } _sigfault;

        /* SIGPOLL */

or with a "uintptr_t _flags" added in before _ignored_bits if we go with that.

> > > Using unused bits in the signal number to turn on new functionality
> > > feels risky.  As currently specified, this is just a number.  Since
> > > today a successful sigaction(n ...) guarantees that n is a valid signal
> > > number, reasonable code like the following would trigger a buffer
> > > overrun if we start trying to encode anything else in there:
> > >
> > > struct sigaction actions[NSIG];
> > >
> > > int do_something( ... )
> > > {
> > >         ...
> > >
> > >         if (!sigaction(n, sa, ...)) {
> > >                 actions[n] = *sa;
> > >                 return 0;
> > >         }
> > >
> > >         ...
> > > }
> >
> > I imagine the bit in the signal number being set by the direct caller
> > to sigaction, and we could specifically recommend that calling
> > pattern. In that case, your "n" wouldn't have the bit set in it. It
>
> I can imagine this too, but that doesn't mean that software does it.
>
> If the above kind of thing exists in a framework or library somewhere,
> we could get problems.  Similarly, a pre-existing LD_PRELOAD framework
> that provides a wrapper for sigaction may now go wrong even if your
> pattern is followed -- i.e., the caller thinks it's calling sigaction
> directly but in fact it isn't.

I'm aware of one library like that. It's called libsigchain, and it
has an early bounds check:
https://cs.android.com/android/platform/superproject/+/master:art/sigchainlib/sigchain.cc;l=371

Until the library is changed to recognize the flag, calling code would
see the return value of -1 as if the kernel failed the syscall, and
would fall back to the code for old kernels.

In general I think that any library like this with independent
tracking of the kernel's purported signal handler state would need to
be very sensitive to which syscalls are capable of setting signal
handlers, what their semantics are, and so on. This applies to any
change that we might make to the signal handler interface. So for
example, if we introduced a new syscall as you propose below, and the
library hasn't been updated to recognize the new syscall, it will
silently miss changes in signal handler state caused by the new
syscall.

At the end of this argument lies "we can never change anything about
how signal handlers work because it could break some interposing
library somewhere" -- replace "signal handlers" with any kernel
feature whose behavior may be modified by an interposing library if
you like -- and I don't think we want to go that far. As far as I
know, this isn't really the kernel's business anyway -- the kernel's
stable ABI contract starts and ends with the syscall interface and not
some library on top.

That being said, we should perhaps try to define our interface so that
something reasonable will probably happen if there is such a library
and it hasn't been updated. With the new syscall, the library will
sometimes silently fail to work in some non-local fashion. With the
flag bit in the signal number, the library will either cause the
caller to fall back to the old kernel code path (if there is a bounds
check) or likely crash loudly (if there is no bounds check). To me,
the "flag bit in the signal number" behavior seems more reasonable,
since either something correct or something easy to debug will
probably happen at runtime.

> > could only appear in newly-written code that doesn't follow our
> > recommendations, and there are already plenty of much more likely ways
> > to cause buffer overflows in C code that doesn't follow
> > recommendations anyway. (And even if such a buffer overflow occurred,
> > it would very likely be caught early in development by the MMU due to
> > the magnitude of the number 1<<30.)
>
> Choosing the bit value is hard.  If shifting it overflows, this can
> trigger random undefined behaviour in the compiler in addition to (or
> perhaps instead of) an out-of-bounds access or segfault.

It wouldn't overflow on a 64-bit architecture assuming normal array
indexing (the index would be promoted to pointer width before being
scaled to the array element size), and to begin with the users of this
would be 64-bit.

> If shifting it doesn't overflow, we might still fall into a valid
> mapping, though I'd agree a segfault is more likely.
>
> >
> > > I think it would be cleaner to add a single flag field that can be
> > > used for detecting other extensions, and request it via a new sa_flags
> > > bit.  This removes the need for semantically useless zeroing of unused
> > > fields (though for hygiene and backwards compatibility reasons we would
> > > probably want to carry on zeroing them anyway).
> > >
> > > I can see no simpler way to add supplementary siginfo fields for
> > > existing si_codes.  For si_codes that didn't exist before the zeroing
> > > came in we could still detect optional si_code-specific fields via
> > > zeroing, but it seems messy to have two ways of detecting extensions.
> >
> > That would certainly be cleaner if it worked, but that would only be
> > the case if old kernels rejected unknown bits in sa_flags, and
> > unfortunately they don't. With the bit in the signal number, the "old
>
> Hmm, that is a problem I wasn't aware of.
>
> > kernels reject" behavior admits relatively straightforward usage code:
> >
> > void set_segv_handler(void) {
> >   struct sigaction sa;
> >   sa.sa_sigaction = handle_segv;
> >   sa.sa_flags = SA_SIGINFO;
> >   if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
> > succeeds in new kernels, fails in old kernels
> >     sa.sa_sigaction = clear_fields_and_handle_segv;
> >     if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
> >       perror("sigaction");
> >   }
> > }
> >
> > void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
> >   sa->si_future_field = 0;
> >   handle_segv(signum, sa, ctx);
> > }
> >
> > void handle_segv(int signum, siginfo_t *sa, void *ctx) {
> >   // At this point, si_future_field will have the value 0 in old
> > kernels and the kernel-supplied value in new kernels.
> > }
> >
> > Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
> > number to sa_flags. In that case, the first sigaction would succeed in
> > old kernels so handle_segv wouldn't know whether it can safely read
> > from si_future_field. With the sa_flags approach, you would need
> > kernel version number checking via uname before setting the flag in
> > sa_flags, and at that point why even have the flag in sa_flags at all
> > since you could just have the signal handler conditionally read from
> > si_future_field based on the uname?
>
> Software setting SA_SIFLAGS (or whatever) is new by definition, since
> it would be using a new #define.  So it might be reasonable to put the
> burden on that software to verify that the flag was really accepted by
> the kernel, by reading it back.

That doesn't seem like a good idea even if it worked, because it could
lead to race conditions. If the si_flags-reading signal handler were
invoked in response to a signal between when you set it and when you
ended up replacing it with the fallback signal handler for old
kernels, the handler may end up reading garbage data from si_flags.

> Unfortunately, even relatively recent kernels blindly store sa_flags
> in the kernel without validating it, and so it looks like duff flags
> can be read back out via a sigaction() call.  Dang.
>
>
> Perhaps a new frontend syscall could be added.  A new libc that knows
> about this "sigaction2" could use it and mask off problem bits from
> sa_flags in its sigaction() wrapper before calling sigaction2.  An old
> libc would call the old sigaction syscall, where we would ignore these
> new sa_flags bits as before.

I'm not currently in favor of the new syscall but if we do this I
would keep sigaction and sigaction2 separate. That is, libc sigaction
should always use the sigaction syscall, and libc sigaction2 should
always use the sigaction2 syscall. We should avoid libc's sigaction
having different behavior based on the libc version and kernel
version, as that would make it harder to reason about its behavior.
Calling code would need to check for presence of sigaction2 in both
libc and the kernel, e.g.

__attribute__((weak)) decltype(sigaction2) sigaction2;

void set_segv_handler(void) {
  struct sigaction sa;
  sa.sa_sigaction = handle_segv;
  sa.sa_flags = SA_SIGINFO | SA_SIFLAGS;
  if (!sigaction2 || sigaction2(SIGSEGV, &sa, 0) < 0) {
    sa.sa_sigaction = clear_fields_and_handle_segv;
    sa.sa_flags = SA_SIGINFO;
    if (sigaction(SIGSEGV, &sa, 0) < 0)
      perror("sigaction");
  }
}

> This may not be a popular approach though, and software wouldn't be able
> to use our new features until libc is updated to match.
>
> If we go down this route, it may provide additional opportunities to fix
> annoying defects in the old interface.
>
>
> > Note that the same applies to a flag indicating the availability of a
> > si_flags field in sigaction (just
> > s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
> > s/si_future_field/si_flags/ in the usage code above). In terms of
> > SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.
> >
> > Another thought that occurred to me is that we may consider
> > generalizing this a step further and introducing a single flag bit in
> > the signal number that means "reject unknown flags in sa_flags". This
> > would mean that we wouldn't need to add any more flag bits to the
> > signal number in the future, thus limiting this signal number hack to
> > a single bit; all future mandatory behavior changes could just be put
> > behind a flag in sa_flags and userspace code would easily be able to
> > detect missing support for a flag and fall back if necessary. In our
> > case, this would imply usage code like this:
> >
> > void set_segv_handler(void) {
> >   struct sigaction sa;
> >   sa.sa_sigaction = handle_segv;
> >   sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
> >   // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
> >   // Fails in kernels with SF_CHECK_SA_FLAGS support but no
> > SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
> >   // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
> > the bounds check on the signal number).
> >   if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
> >     sa.sa_sigaction = clear_fields_and_handle_segv;
> >     sa.sa_flags = SA_SIGINFO;
> >     // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
> > we're using sa_flags from the beginning of time.
> >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> >       perror("sigaction");
> >   }
> > }
>
> As with the other options this could work, but looks like it could
> break the ABI due to violating the original semantics for the signal
> number argument.  Perhaps I'm being too paranoid.

There's no ABI being broken here, as long as we consider syscalls to
be the stable ABI layer. Old kernels are simply rejecting arguments
that they don't know about yet. By that argument, any introduction of
a new syscall is an ABI break because it changes the semantics of a
previously-unallocated syscall number.

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-07 19:07                                                                   ` Peter Collingbourne
@ 2020-07-08 11:00                                                                     ` Dave Martin
  2020-07-08 13:58                                                                       ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-07-08 11:00 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Linux ARM, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Tue, Jul 07, 2020 at 12:07:09PM -0700, Peter Collingbourne wrote:
> On Tue, Jul 7, 2020 at 7:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Mon, Jul 06, 2020 at 12:20:33PM -0700, Peter Collingbourne wrote:
> > > On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > >
> > > > On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > > > > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > >
> > > > > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > >
> > > > > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > > > > <ebiederm@xmission.com> wrote:
> > > > > > > > > >
> > > > > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > > > > >
> > > > > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > > > > >>
> > > > > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > > > > >> >  }
> > > > > > > > > > >> >
> > > > > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > > > > >> >                       const char *str)
> > > > > > > > > > >> >  {
> > > > > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > > > > >> > -  else
> > > > > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > > > > >> > +  } else {
> > > > > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > > > > >> > +          info.si_signo = signo;
> > > > > > > > > > >> > +          info.si_errno = 0;
> > > > > > > > > > >> > +          info.si_code = code;
> > > > > > > > > > >> > +          info.si_addr = addr;
> > > > > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > > > > >> > +          force_sig_info(&info);
> > > > > > > > > > >> > +  }
> > > > > > > > > > >> >  }
> > > > > > > > > > >> >
> > > > > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > > > > >> > -                      const char *str)
> > > > > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > > > > >> >  {
> > > > > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > > > > >> > +
> > > > > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > > > > >> > +
> > > > > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > > > > >> > +  info.si_errno = 0;
> > > > > > > > > > >> > +  info.si_code = code;
> > > > > > > > > > >> > +  info.si_addr = addr;
> > > > > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > > > > >> > +  force_sig_info(&info);
> > > > > > > > > > >> >  }
> > > > > > > > > > >>
> > > > > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > > > > >> that takes it's parameters.
> > > > > > > > > > >
> > > > > > > > > > > > Can you elaborate?  How would you do this kind of thing?
> > > > > > > > > >
> > > > > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > > > > >
> > > > > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > > > > > union members simultaneously.
> > > > > > > > >
> > > > > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > > > > being said, this is probably moot with my proposed changes below
> > > > > > > > > though.)
> > > > > > > >
> > > > > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > > > > different union member from the one previously written.
> > > > > > > >
> > > > > > > > Writing a different member from the last one written can still splatter
> > > > > > > > on the other members IIUC.
> > > > > > > >
> > > > > > > > It would be better to keep things separate rather than risk
> > > > > > > > incorrectness just to save a few bytes.
> > > > > > > >
> > > > > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > > > > >
> > > > > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > > > > >
> > > > > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > > > > >
> > > > > > > > > So you want something like:
> > > > > > > > >
> > > > > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > >
> > > > > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > > > > >
> > > > > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > > > > signal generation site...
> > > > > > > > > > >
> > > > > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > > > > >
> > > > > > > > > > > Garbled sentence?
> > > > > > > > > >
> > > > > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > > > > can be properly decoded and made sense of.
> > > > > > > > > >
> > > > > > > > > > I am not seeing anything like that.
> > > > > > > > >
> > > > > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > > > > on 32-bit ARM.
> > > > > > > > >
> > > > > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > > > > >> >                            __u32 _pkey;
> > > > > > > > > > >> >                    } _addr_pkey;
> > > > > > > > > > >> > +#ifdef __aarch64__
> > > > > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > > > > >> > +                  struct {
> > > > > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > > > > >
> > > > > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > > > > >
> > > > > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > > > > >> > +#endif
> > > > > > > > > > >> >            };
> > > > > > > > > > >> >    } _sigfault;
> > > > > > > > > > >> >
> > > > > > > > > > >>
> > > > > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > > > > >>
> > > > > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > > > > > > >> is built for a single architecture.
> > > > > > > > >
> > > > > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > > > > considering a similar feature:
> > > > > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > > > > I would have opted to expand this to other architectures on an
> > > > > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > > > > architectures from the start.
> > > > > > > > >
> > > > > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > > > > this point to accommodate the new fields.
> > > > > > > > >
> > > > > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > > > > >
> > > > > > > > I think what we need here is basically a flags word.
> > > > > > > >
> > > > > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > > > > flags word, we can extend as needed.
> > > > > > > >
> > > > > > > > How the existence of the first flags words is detected is another
> > > > > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > > > > I guess si_code may be sufficient.
> > > > > > >
> > > > > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > > > > data structure. The zero-initialization of the padding at the end of
> > > > > > > the struct is done by the clear_user call here:
> > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > > > > >
> > > > > > > and the zero-initialization of the padding between fields and unused
> > > > > > > union members is done by the clear_siginfo function which the kernel
> > > > > > > calls when initializing the data structure:
> > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > > > > >
> > > > > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > > > > support for flagged fields.
> > > > > >
> > > > > > It's not enough that we do this today.  We would have had to do it back
> > > > > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > > > > back to when the arch/arm64 was merged).
> > > > > >
> > > > > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > > > > always the case, so unused parts of siginfo could be full of old junk
> > > > > > from the user stack, if the kernel is sufficiently old.
> > > > > >
> > > > > > If we're trying to do something generic that makes sense on all arches,
> > > > > > this matters.  I may have misunderstood something about the code though.
> > > > >
> > > > > Hmm, I think you're right. The current behavior was introduced by
> > > > > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > > > > released in 4.18. So if an application wants to be compatible with
> > > > > pre-4.18 kernels then there would need to be some other way to
> > > > > indicate that the fields are valid. Probably the simplest way would be
> > > > > to have the application issue a uname(2) syscall and check the kernel
> > > > > version before reading these fields. I have a couple of other ideas
> > > > > that don't rely on version detection, if we'd prefer to avoid that.
> > > > > (They are somewhat ugly, but our hand is forced by backwards
> > > > > compatibility.)
> > > > >
> > > > > One idea is to re-purpose the si_errno field as a flags field for
> > > > > certain signal numbers. I checked a few kernel releases going back to
> > > > > 2.6.18 and it looks like the field is set to 0 except in the following
> > > > > circumstances:
> > > > > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > > > > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > > > > - user-defined signal via kill_pid_usb_asyncio
> > > > > - SIGSWI in 3.18 and before (code since removed)
> > > > >
> > > > > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > > > > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > > > > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > > > > we would need to stop the kernel from setting si_errno to EFAULT for
> > > > > this signal before the 5.8 release.
> > > > >
> > > > > Another idea was to have userspace set a flag in sa_flags when
> > > > > registering a signal handler meaning "this signal handler requires
> > > > > unknown siginfo fields to be zeroed", and have existing kernels reject
> > > > > the syscall due to an unknown flag being set, but unfortunately this
> > > > > won't work because existing kernels do not reject sigaction syscalls
> > > > > with unknown flags set in sa_flags. A perhaps more radical idea in
> > > > > this vein would be to claim some of the upper bits of the signal
> > > > > number as flags that will cause the syscall to be rejected if set and
> > > > > unknown to the kernel. Existing kernels (going back to at least
> > > > > 2.6.18) contain this code in do_sigaction:
> > > > >
> > > > >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> > > > >                 return -EINVAL;
> > > > >
> > > > > > and valid_signal is defined as:
> > > > >
> > > > > static inline int valid_signal(unsigned long sig)
> > > > > {
> > > > >         return sig <= _NSIG ? 1 : 0;
> > > > > }
> > > > >
> > > > > All architectures define _NSIG as a value <= 128, so they will reject
> > > > > a signal number with any of bits 8-31 set. This means that we can use
> > > > > any of those bits for mandatory flags. Most likely we could use bit 30
> > > > > (expanding down as necessary), as it keeps the signal number positive
> > > > > and permits future expansion of the signal number range.
> > > >
> > > > > Does the signal core code actually guarantee to zero the unused fields?
> > > > > Unless the fields are poked in by hand this is fraught with subtleties,
> > > > especially when unions are involved.  (I'm sure the code tries to do it,
> > > > but I've not eyeballed it in detail...)
> > >
> > > It memsets the siginfo structure before setting the fields and sending
> > > the signal (grep for clear_siginfo which is just a memset; you should
> > > find a call before all callers of force_sig_info). Memset is the right
> > > approach here since unlike setting fields by hand it clears padding
> > > which could lead to information leaks from the kernel. IIUC this is
> > > the reason why Eric wants all of the signals to be raised via wrappers
> > > in kernel/signal.c instead of via force_sig_info directly (to make
> > > this aspect easier to audit).
> >
> > My impression was that the reason for this model is partly to ensure
> > that siginfo fields are populated more consistently.  When it was all
> > > down to the individual callers, inconsistencies crept in.
> >
> > With regard to memset(), this is not a complete defence against data
> > leakage.  Assigning to a struct member can set any or all padding in
> > the struct to random garbage (consider write-combining of neighboring
> > member writes into a single larger accesses in asm for example).  The
> 
> I don't believe that LLVM will store to padding like this. I don't
> know about GCC, though, but I wouldn't be surprised if this is
> something that the kernel would want to turn off in "kernel C" (like
> it turns off strict aliasing) specifically because of the information
> leak issue.

Again, the issue is not future kernel builds -- we can always find a way
to fix the behaviour for those -- but past kernel builds.

> > only way to avoid this is to ensure that the struct is 100%
> > padding-free, and that each member of a union is the same size.  A
> > > quick glance at <uapi/asm-generic/siginfo.h> confirms that this is not
> > the case.
> >
> > This might need to be looked at separately.
> >
> > But it does mean, strictly speaking, that we can't reliably add new
> > fields anywhere that there was previously padding: assigning to
> > neighboring members can still fill those with garbage after the
> > memset().
> 
> ...but this is largely moot because I'm not proposing to add new
> fields in the padding any more (because the fields needed to become
> larger in order to accommodate future hypothetical architectures which
> might want to use the fields, and thus they wouldn't fit in the
> padding). The siginfo.h diff would be something like:
> 
> diff --git a/include/uapi/asm-generic/siginfo.h
> b/include/uapi/asm-generic/siginfo.h
> index cb3d6c267181..4a2fe257415d 100644
> --- a/include/uapi/asm-generic/siginfo.h
> +++ b/include/uapi/asm-generic/siginfo.h
> @@ -91,7 +91,10 @@ union __sifields {
>                                 char _dummy_pkey[__ADDR_BND_PKEY_PAD];
>                                 __u32 _pkey;
>                         } _addr_pkey;
> +                       void *_pad[6];
>                 };
> +               uintptr_t _ignored_bits;
> +               uintptr_t _ignored_bits_mask;

This _is_ in padding: the tail-padding of the (previously smaller)
_sigfault.  Again, the compiler was allowed to populate this area with
junk before these fields were added.

I agree that it seems fairly unlikely that the compiler would have been
overwriting this in normal circumstances, but that's not a guarantee.
My worry is that if this goes wrong, it will go wrong silently and
unpredictably.

>         } _sigfault;
> 
>         /* SIGPOLL */
> 
> or with a "uintptr_t _flags" added in before _ignored_bits if we go with that.
> 
> > > > Using unused bits in the signal number to turn on new functionality
> > > > feels risky.  As currently specified, this is just a number.  Since
> > > > today a successful sigaction(n ...) guarantees that n is a valid signal
> > > > number, reasonable code like the following would trigger a buffer
> > > > overrun if we start trying to encode anything else in there:
> > > >
> > > > struct sigaction actions[NSIG];
> > > >
> > > > int do_something( ... )
> > > > {
> > > >         ...
> > > >
> > > >         if (!sigaction(n, sa, ...)) {
> > > >                 actions[n] = *sa;
> > > >                 return 0;
> > > >         }
> > > >
> > > >         ...
> > > > }
> > >
> > > I imagine the bit in the signal number being set by the direct caller
> > > to sigaction, and we could specifically recommend that calling
> > > pattern. In that case, your "n" wouldn't have the bit set in it. It
> >
> > I can imagine this too, but that doesn't mean that software does it.
> >
> > If the above kind of thing exists in a framework or library somewhere,
> > we could get problems.  Similarly, a pre-existing LD_PRELOAD framework
> > that provides a wrapper for sigaction may now go wrong even if your
> > pattern is followed -- i.e., the caller thinks it's calling sigaction
> > directly but in fact it isn't.
> 
> I'm aware of one library like that. It's called libsigchain, and it
> has an early bounds check:
> https://cs.android.com/android/platform/superproject/+/master:art/sigchainlib/sigchain.cc;l=371
> 
> Until the library is changed to recognize the flag, calling code would
> see the return value of -1 as if the kernel failed the syscall, and
> would fall back to the code for old kernels.

But only after some bad dereferences.  If these were writes, this means
that memory _may_ be silently corrupted (I don't say it's likely in a
given case, and we cannot pick a flag bit that makes this impossible).

So, _even though the user program is correct_, our change may trigger
the corruption of arbitrary user memory.  This is what I mean by an ABI
break.  The fact that the corruption is not done by the syscall itself
is no excuse.

We also fail to notice failures in sigaddset() etc., though in this code
it looks like that should not matter.

> In general I think that any library like this with independent
> tracking of the kernel's purported signal handler state would need to
> be very sensitive to which syscalls are capable of setting signal
> handlers, what their semantics are, and so on. This applies to any
> change that we might make to the signal handler interface. So for
> example, if we introduced a new syscall as you propose below, and the
> library hasn't been updated to recognize the new syscall, it will
> silently miss changes in signal handler state caused by the new
> syscall.
> 
> At the end of this argument lies "we can never change anything about
> how signal handlers work because it could break some interposing
> library somewhere" -- replace "signal handlers" with any kernel
> feature whose behavior may be modified by an interposing library if
> you like -- and I don't think we want to go that far. As far as I
> know, this isn't really the kernel's business anyway -- the kernel's
> stable ABI contract starts and ends with the syscall interface and not
> some library on top.
> 
> That being said, we should perhaps try to define our interface so that
> something reasonable will probably happen if there is such a library
> and it hasn't been updated. With the new syscall, the library will
> sometimes silently fail to work in some non-local fashion. With the
> flag bit in the signal number, the library will either cause the
> caller to fall back to the old kernel code path (if there is a bounds
> check) or likely crash loudly (if there is no bounds check). To me,
> the "flag bit in the signal number" behavior seems more reasonable,
> since either something correct or something easy to debug will
> probably happen at runtime.
> 
> > > could only appear in newly-written code that doesn't follow our
> > > recommendations, and there are already plenty of much more likely ways
> > > to cause buffer overflows in C code that doesn't follow
> > > recommendations anyway. (And even if such a buffer overflow occurred,
> > > it would very likely be caught early in development by the MMU due to
> > > the magnitude of the number 1<<30.)
> >
> > > Choosing the bit value is hard.  If shifting it overflows, this can
> > trigger random undefined behaviour in the compiler in addition to (or
> > perhaps instead of) an out-of-bounds access or segfault.
> 
> It wouldn't overflow on a 64-bit architecture assuming normal array
> indexing (the index would be promoted to pointer width before being
> scaled to the array element size), and to begin with the users of this
> would be 64-bit.

Unless we don't offer this feature for 32-bit at all (possible, if ugly)
we can't stop people using it.

> > If shifting it doesn't overflow, we might still fall into a valid
> > mapping, though I'd agree a segfault is more likely.
> >
> > >
> > > > > I think it would be cleaner to add a single flag field that can be
> > > > > used for detecting other extensions, and request it via a new sa_flags
> > > > > bit.  This removes the need for semantically useless zeroing of unused
> > > > fields (though for hygiene and backwards compatibility reasons we would
> > > > probably want to carry on zeroing them anyway).
> > > >
> > > > I can see no simpler way to add supplementary siginfo fields for
> > > > existing si_codes.  For si_codes that didn't exist before the zeroing
> > > > came in we could still detect optional si_code-specific fields via
> > > > > zeroing, but it seems messy to have two ways of detecting extensions.
> > >
> > > That would certainly be cleaner if it worked, but that would only be
> > > the case if old kernels rejected unknown bits in sa_flags, and
> > > unfortunately they don't. With the bit in the signal number, the "old
> >
> > Hmm, that is a problem I wasn't aware of.
> >
> > > kernels reject" behavior admits relatively straightforward usage code:
> > >
> > > void set_segv_handler(void) {
> > >   struct sigaction sa;
> > >   sa.sa_sigaction = handle_segv;
> > >   sa.sa_flags = SA_SIGINFO;
> > >   if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
> > > succeeds in new kernels, fails in old kernels
> > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > >     if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
> > >       perror("sigaction");
> > >   }
> > > }
> > >
> > > void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > >   sa->si_future_field = 0;
> > >   handle_segv(signum, sa, ctx);
> > > }
> > >
> > > void handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > >   // At this point, si_future_field will have the value 0 in old
> > > kernels and the kernel-supplied value in new kernels.
> > > }
> > >
> > > Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
> > > number to sa_flags. In that case, the first sigaction would succeed in
> > > old kernels so handle_segv wouldn't know whether it can safely read
> > > from si_future_field. With the sa_flags approach, you would need
> > > kernel version number checking via uname before setting the flag in
> > > sa_flags, and at that point why even have the flag in sa_flags at all
> > > since you could just have the signal handler conditionally read from
> > > si_future_field based on the uname?
> >
> > Software setting SA_SIFLAGS (or whatever) is new by definition, since
> > it would be using a new #define.  So it might be reasonable to put the
> > burden on that software to verify that the flag was really accepted by
> > the kernel, by reading it back.
> 
> That doesn't seem like a good idea even if it worked, because it could
> lead to race conditions. If the si_flags-reading signal handler were
> invoked in response to a signal between when you set it and when you
> ended up replacing it with the fallback signal handler for old
> kernels, the handler may end up reading garbage data from si_flags.

Not really.  My example may have this problem, but the signal handler
can be written to support both scenarios, based on testing a flag that
the main program sets after verifying that the flag could be set.  Or
the signal could be blocked around establishment (often a good idea for
other reasons).

But I agree it's a bit gross, and anyway doesn't work due to the fact
that the kernel doesn't filter out unrecognised flags anyway.

> > Unfortunately, even relatively recent kernels blindly store sa_flags
> > in the kernel without validating it, and so it looks like duff flags
> > can be read back out via a sigaction() call.  Dang.
> >
> >
> > Perhaps a new frontend syscall could be added.  A new libc that knows
> > about this "sigaction2" could use it and mask off problem bits from
> > sa_flags in its sigaction() wrapper before calling sigaction2.  An old
> > libc would call the old sigaction syscall, where we would ignore these
> > new sa_flags bits as before.
> 
> I'm not currently in favor of the new syscall but if we do this I
> would keep sigaction and sigaction2 separate. That is, libc sigaction
> should always use the sigaction syscall, and libc sigaction2 should
> always use the sigaction2 syscall. We should avoid libc's sigaction
> having different behavior based on the libc version and kernel
> version, as that would make it harder to reason about its behavior.
> Calling code would need to check for presence of sigaction2 in both
> libc and the kernel, e.g.
> 
> __attribute__((weak)) decltype(sigaction2) sigaction2;
> 
> void set_segv_handler(void) {
>   struct sigaction sa;
>   sa.sa_sigaction = handle_segv;
>   sa.sa_flags = SA_SIGINFO | SA_SIFLAGS;
>   if (!sigaction2 || sigaction2(SIGSEGV, &sa, 0) < 0) {
>     sa.sa_sigaction = clear_fields_and_handle_segv;
>     sa.sa_flags = SA_SIGINFO;
>     if (sigaction(SIGSEGV, &sa, 0) < 0)
>       perror("sigaction");
>   }
> }

I guess.  But I share your distaste for adding a new syscall.

> 
> > This may not be a popular approach though, and software wouldn't be able
> > to use our new features until libc is updated to match.
> >
> > If we go down this route, it may provide additional opportunities to fix
> > annoying defects in the old interface.
> >
> >
> > > Note that the same applies to a flag indicating the availability of a
> > > si_flags field in sigaction (just
> > > s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
> > > s/si_future_field/si_flags/ in the usage code above). In terms of
> > > SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.
> > >
> > > Another thought that occurred to me is that we may consider
> > > generalizing this a step further and introducing a single flag bit in
> > > the signal number that means "reject unknown flags in sa_flags". This
> > > would mean that we wouldn't need to add any more flag bits to the
> > > signal number in the future, thus limiting this signal number hack to
> > > a single bit; all future mandatory behavior changes could just be put
> > > behind a flag in sa_flags and userspace code would easily be able to
> > > detect missing support for a flag and fall back if necessary. In our
> > > case, this would imply usage code like this:
> > >
> > > void set_segv_handler(void) {
> > >   struct sigaction sa;
> > >   sa.sa_sigaction = handle_segv;
> > >   sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
> > >   // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
> > >   // Fails in kernels with SF_CHECK_SA_FLAGS support but no
> > > SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
> > >   // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
> > > the bounds check on the signal number).
> > >   if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
> > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > >     sa.sa_flags = SA_SIGINFO;
> > >     // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
> > > we're using sa_flags from the beginning of time.
> > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > >       perror("sigaction");
> > >   }
> > > }
> >
> > As with the other options this could work, but looks like it could
> > break the ABI due to violating the original semantics for the signal
> > number argument.  Perhaps I'm being too paranoid.
> 
> There's no ABI being broken here, as long as we consider syscalls to
> be the stable ABI layer. Old kernels are simply rejecting arguments
> that they don't know about yet. By that argument, any introduction of
> a new syscall is an ABI break because it changes the semantics of a
> previously-unallocated syscall number.

As argued above, I think this is an invalid argument.

Although any addition will change behaviour (so is a break in some
sense), the key is not to make "surprising" changes.

Having something random happen when setting a previously reserved flag
bit, or when issuing a syscall with an unknown syscall number, is not
surprising at all.

Making fundamental changes to the encoding of an existing argument is
highly surprising, on the other hand: as your example shows, it is
reasonable to index an array using a signal number.

I agree that this doesn't get us closer to a practical solution though.


But we do seem to need some mechanism in addition to (or instead of)
sa_flags.

Here's another thought:

Since si_flags would be either always present or always absent, it
could make sense to have a global property to report this, rather than
an sa_flags or signal number bit to request it per-signal.

Requiring software to parse uname() might be reasonable for that, if
cumbersome (did you suggest this previously?).  If we're concerned that
the awkwardness of this would encourage people not to bother (or
encourage people to do it wrong) then we might opt for something simpler
like an AT_FLAGS bit.

Ultimately libc could provide a more portable interface for discovery,
such as via sysconf().

Thoughts?

---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-08 11:00                                                                     ` Dave Martin
@ 2020-07-08 13:58                                                                       ` Dave Martin
  2020-07-08 22:21                                                                         ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-07-08 13:58 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Catalin Marinas, Kevin Brodsky, Oleg Nesterov, Evgenii Stepanov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Linux ARM, Richard Henderson

On Wed, Jul 08, 2020 at 12:00:22PM +0100, Dave Martin wrote:
> On Tue, Jul 07, 2020 at 12:07:09PM -0700, Peter Collingbourne wrote:
> > On Tue, Jul 7, 2020 at 7:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >
> > > On Mon, Jul 06, 2020 at 12:20:33PM -0700, Peter Collingbourne wrote:
> > > > On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > >
> > > > > On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > > > > > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > >
> > > > > > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > > > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > >
> > > > > > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > > > > > <ebiederm@xmission.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > > > > > >
> > > > > > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > > > > > >>
> > > > > > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > > > > > >> >  }
> > > > > > > > > > > >> >
> > > > > > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > > > > > >> >                       const char *str)
> > > > > > > > > > > >> >  {
> > > > > > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > > > > > >> > -  else
> > > > > > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > > > > > >> > +  } else {
> > > > > > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > > > > > >> > +          info.si_signo = signo;
> > > > > > > > > > > >> > +          info.si_errno = 0;
> > > > > > > > > > > >> > +          info.si_code = code;
> > > > > > > > > > > >> > +          info.si_addr = addr;
> > > > > > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > > > > > >> > +          force_sig_info(&info);
> > > > > > > > > > > >> > +  }
> > > > > > > > > > > >> >  }
> > > > > > > > > > > >> >
> > > > > > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > > > > > >> > -                      const char *str)
> > > > > > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > > > > > >> >  {
> > > > > > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > > > > > >> > +
> > > > > > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > > > > > >> > +
> > > > > > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > > > > > >> > +  info.si_errno = 0;
> > > > > > > > > > > >> > +  info.si_code = code;
> > > > > > > > > > > >> > +  info.si_addr = addr;
> > > > > > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > > > > > >> > +  force_sig_info(&info);
> > > > > > > > > > > >> >  }
> > > > > > > > > > > >>
> > > > > > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > > > > > > >> that takes its parameters.
> > > > > > > > > > > >
> > > > > > > > > > > > > Can you elaborate?  How would you do this kind of thing?
> > > > > > > > > > >
> > > > > > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > > > > > >
> > > > > > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > > > > > > union members simultaneously.
> > > > > > > > > >
> > > > > > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > > > > > being said, this is probably moot with my proposed changes below
> > > > > > > > > > though.)
> > > > > > > > >
> > > > > > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > > > > > different union member from the one previously written.
> > > > > > > > >
> > > > > > > > > Writing a different member from the last one written can still splatter
> > > > > > > > > on the other members IIUC.
> > > > > > > > >
> > > > > > > > > It would be better to keep things separate rather than risk
> > > > > > > > > incorrectness just to save a few bytes.
> > > > > > > > >
> > > > > > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > > > > > >
> > > > > > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > > > > > >
> > > > > > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > > > > > >
> > > > > > > > > > So you want something like:
> > > > > > > > > >
> > > > > > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > >
> > > > > > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > > > > > >
> > > > > > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > > > > > signal generation site...
> > > > > > > > > > > >
> > > > > > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > > > > > >
> > > > > > > > > > > > Garbled sentence?
> > > > > > > > > > >
> > > > > > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > > > > > can be properly decoded and made sense of.
> > > > > > > > > > >
> > > > > > > > > > > I am not seeing anything like that.
> > > > > > > > > >
> > > > > > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > > > > > on 32-bit ARM.
> > > > > > > > > >
> > > > > > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > > > > > >> >                            __u32 _pkey;
> > > > > > > > > > > >> >                    } _addr_pkey;
> > > > > > > > > > > >> > +#ifdef __aarch64__
> > > > > > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > > > > > >> > +                  struct {
> > > > > > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > > > > > >
> > > > > > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > > > > > >
> > > > > > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > > > > > >> > +#endif
> > > > > > > > > > > >> >            };
> > > > > > > > > > > >> >    } _sigfault;
> > > > > > > > > > > >> >
> > > > > > > > > > > >>
> > > > > > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > > > > > >>
> > > > > > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > > > > > > > >> is built for a single architecture.
> > > > > > > > > >
> > > > > > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > > > > > considering a similar feature:
> > > > > > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > > > > > I would have opted to expand this to other architectures on an
> > > > > > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > > > > > architectures from the start.
> > > > > > > > > >
> > > > > > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > > > > > this point to accommodate the new fields.
> > > > > > > > > >
> > > > > > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > > > > > >
> > > > > > > > > I think what we need here is basically a flags word.
> > > > > > > > >
> > > > > > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > > > > > flags word, we can extend as needed.
> > > > > > > > >
> > > > > > > > > How the existence of the first flags words is detected is another
> > > > > > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > > > > > I guess si_code may be sufficient.
> > > > > > > >
> > > > > > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > > > > > data structure. The zero-initialization of the padding at the end of
> > > > > > > > the struct is done by the clear_user call here:
> > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > > > > > >
> > > > > > > > and the zero-initialization of the padding between fields and unused
> > > > > > > > union members is done by the clear_siginfo function which the kernel
> > > > > > > > calls when initializing the data structure:
> > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > > > > > >
> > > > > > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > > > > > support for flagged fields.
> > > > > > >
> > > > > > > It's not enough that we do this today.  We would have had to do it back
> > > > > > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > > > > > back to when the arch/arm64 was merged).
> > > > > > >
> > > > > > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > > > > > always the case, so unused parts of siginfo could be full of old junk
> > > > > > > from the user stack, if the kernel is sufficiently old.
> > > > > > >
> > > > > > > If we're trying to do something generic that makes sense on all arches,
> > > > > > > this matters.  I may have misunderstood something about the code though.
> > > > > >
> > > > > > Hmm, I think you're right. The current behavior was introduced by
> > > > > > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > > > > > released in 4.18. So if an application wants to be compatible with
> > > > > > pre-4.18 kernels then there would need to be some other way to
> > > > > > indicate that the fields are valid. Probably the simplest way would be
> > > > > > to have the application issue a uname(2) syscall and check the kernel
> > > > > > version before reading these fields. I have a couple of other ideas
> > > > > > that don't rely on version detection, if we'd prefer to avoid that.
> > > > > > (They are somewhat ugly, but our hand is forced by backwards
> > > > > > compatibility.)
> > > > > >
> > > > > > One idea is to re-purpose the si_errno field as a flags field for
> > > > > > certain signal numbers. I checked a few kernel releases going back to
> > > > > > 2.6.18 and it looks like the field is set to 0 except in the following
> > > > > > circumstances:
> > > > > > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > > > > > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > > > > > - user-defined signal via kill_pid_usb_asyncio
> > > > > > - SIGSWI in 3.18 and before (code since removed)
> > > > > >
> > > > > > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > > > > > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > > > > > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > > > > > we would need to stop the kernel from setting si_errno to EFAULT for
> > > > > > this signal before the 5.8 release.
> > > > > >
> > > > > > Another idea was to have userspace set a flag in sa_flags when
> > > > > > registering a signal handler meaning "this signal handler requires
> > > > > > unknown siginfo fields to be zeroed", and have existing kernels reject
> > > > > > the syscall due to an unknown flag being set, but unfortunately this
> > > > > > won't work because existing kernels do not reject sigaction syscalls
> > > > > > with unknown flags set in sa_flags. A perhaps more radical idea in
> > > > > > this vein would be to claim some of the upper bits of the signal
> > > > > > number as flags that will cause the syscall to be rejected if set and
> > > > > > unknown to the kernel. Existing kernels (going back to at least
> > > > > > 2.6.18) contain this code in do_sigaction:
> > > > > >
> > > > > >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> > > > > >                 return -EINVAL;
> > > > > >
> > > > > > > and valid_signal is defined as:
> > > > > >
> > > > > > static inline int valid_signal(unsigned long sig)
> > > > > > {
> > > > > >         return sig <= _NSIG ? 1 : 0;
> > > > > > }
> > > > > >
> > > > > > All architectures define _NSIG as a value <= 128, so they will reject
> > > > > > a signal number with any of bits 8-31 set. This means that we can use
> > > > > > any of those bits for mandatory flags. Most likely we could use bit 30
> > > > > > (expanding down as necessary), as it keeps the signal number positive
> > > > > > and permits future expansion of the signal number range.
> > > > >
> > > > > > Does the signal core code actually guarantee to zero the unused fields?
> > > > > > Unless the fields are poked in by hand this is fraught with subtleties,
> > > > > especially when unions are involved.  (I'm sure the code tries to do it,
> > > > > but I've not eyeballed it in detail...)
> > > >
> > > > It memsets the siginfo structure before setting the fields and sending
> > > > the signal (grep for clear_siginfo which is just a memset; you should
> > > > find a call before all callers of force_sig_info). Memset is the right
> > > > approach here since unlike setting fields by hand it clears padding
> > > > which could lead to information leaks from the kernel. IIUC this is
> > > > the reason why Eric wants all of the signals to be raised via wrappers
> > > > in kernel/signal.c instead of via force_sig_info directly (to make
> > > > this aspect easier to audit).
> > >
> > > My impression was that the reason for this model is partly to ensure
> > > that siginfo fields are populated more consistently.  When it was all
> > > down to the individual callers, inconsistencies crept in.
> > >
> > > With regard to memset(), this is not a complete defence against data
> > > leakage.  Assigning to a struct member can set any or all padding in
> > > the struct to random garbage (consider write-combining of neighboring
> > > member writes into a single larger access in asm for example).  The
> > 
> > I don't believe that LLVM will store to padding like this. I don't
> > know about GCC, though, but I wouldn't be surprised if this is
> > something that the kernel would want to turn off in "kernel C" (like
> > it turns off strict aliasing) specifically because of the information
> > leak issue.
> 
> Again, the issue is not future kernel builds -- we can always find a way
> to fix the behaviour for those -- but past kernel builds.
> 
> > > only way to avoid this is to ensure that the struct is 100%
> > > padding-free, and that each member of a union is the same size.  A
> > > quick glance at <uapi/asm-generic/siginfo.h> confirms that this is not
> > > the case.
> > >
> > > This might need to be looked at separately.
> > >
> > > But it does mean, strictly speaking, that we can't reliably add new
> > > fields anywhere that there was previously padding: assigning to
> > > neighboring members can still fill those with garbage after the
> > > memset().
> > 
> > ...but this is largely moot because I'm not proposing to add new
> > fields in the padding any more (because the fields needed to become
> > larger in order to accommodate future hypothetical architectures which
> > might want to use the fields, and thus they wouldn't fit in the
> > padding). The siginfo.h diff would be something like:
> > 
> > diff --git a/include/uapi/asm-generic/siginfo.h
> > b/include/uapi/asm-generic/siginfo.h
> > index cb3d6c267181..4a2fe257415d 100644
> > --- a/include/uapi/asm-generic/siginfo.h
> > +++ b/include/uapi/asm-generic/siginfo.h
> > @@ -91,7 +91,10 @@ union __sifields {
> >                                 char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> >                                 __u32 _pkey;
> >                         } _addr_pkey;
> > +                       void *_pad[6];
> >                 };
> > +               uintptr_t _ignored_bits;
> > +               uintptr_t _ignored_bits_mask;
> 
> This _is_ in padding: the tail-padding of the (previously smaller)
> _sigfault.  Again, the compiler was allowed to populate this area with
> junk before these fields were added.
> 
> I agree that it seems fairly unlikely that the compiler would have been
> overwriting this in normal circumstances, but that's not a guarantee.
> My worry is that if this goes wrong, it will go wrong silently and
> unpredictably.
> 
> >         } _sigfault;
> > 
> >         /* SIGPOLL */
> > 
> > or with a "uintptr_t _flags" added in before _ignored_bits if we go with that.
> > 
> > > > > Using unused bits in the signal number to turn on new functionality
> > > > > feels risky.  As currently specified, this is just a number.  Since
> > > > > today a successful sigaction(n ...) guarantees that n is a valid signal
> > > > > number, reasonable code like the following would trigger a buffer
> > > > > overrun if we start trying to encode anything else in there:
> > > > >
> > > > > struct sigaction actions[NSIG];
> > > > >
> > > > > int do_something( ... )
> > > > > {
> > > > >         ...
> > > > >
> > > > >         if (!sigaction(n, sa, ...)) {
> > > > >                 actions[n] = *sa;
> > > > >                 return 0;
> > > > >         }
> > > > >
> > > > >         ...
> > > > > }
> > > >
> > > > I imagine the bit in the signal number being set by the direct caller
> > > > to sigaction, and we could specifically recommend that calling
> > > > pattern. In that case, your "n" wouldn't have the bit set in it. It
> > >
> > > I can imagine this too, but that doesn't mean that software does it.
> > >
> > > If the above kind of thing exists in a framework or library somewhere,
> > > we could get problems.  Similarly, a pre-existing LD_PRELOAD framework
> > > that provides a wrapper for sigaction may now go wrong even if your
> > > pattern is followed -- i.e., the caller thinks it's calling sigaction
> > > directly but in fact it isn't.
> > 
> > I'm aware of one library like that. It's called libsigchain, and it
> > has an early bounds check:
> > https://cs.android.com/android/platform/superproject/+/master:art/sigchainlib/sigchain.cc;l=371
> > 
> > Until the library is changed to recognize the flag, calling code would
> > see the return value of -1 as if the kernel failed the syscall, and
> > would fall back to the code for old kernels.
> 
> But only after some bad dereferences.  If these were writes, this means
> that memory _may_ be silently corrupted (I don't say it's likely in a
> given case, and we cannot pick a flag bit that makes this impossible).
> 
> So, _even though the user program is correct_, our change may trigger
> the corruption of arbitrary user memory.  This is what I mean by an ABI
> break.  The fact that the corruption is not done by the syscall itself
> is no excuse.
> 
> We also fail to notice failures in sigaddset() etc., though in this code
> it looks like that should not matter.
> 
> > In general I think that any library like this with independent
> > tracking of the kernel's purported signal handler state would need to
> > be very sensitive to which syscalls are capable of setting signal
> > handlers, what their semantics are, and so on. This applies to any
> > change that we might make to the signal handler interface. So for
> > example, if we introduced a new syscall as you propose below, and the
> > library hasn't been updated to recognize the new syscall, it will
> > silently miss changes in signal handler state caused by the new
> > syscall.
> > 
> > At the end of this argument lies "we can never change anything about
> > how signal handlers work because it could break some interposing
> > library somewhere" -- replace "signal handlers" with any kernel
> > feature whose behavior may be modified by an interposing library if
> > you like -- and I don't think we want to go that far. As far as I
> > know, this isn't really the kernel's business anyway -- the kernel's
> > stable ABI contract starts and ends with the syscall interface and not
> > some library on top.
> > 
> > That being said, we should perhaps try to define our interface so that
> > something reasonable will probably happen if there is such a library
> > and it hasn't been updated. With the new syscall, the library will
> > sometimes silently fail to work in some non-local fashion. With the
> > flag bit in the signal number, the library will either cause the
> > caller to fall back to the old kernel code path (if there is a bounds
> > check) or likely crash loudly (if there is no bounds check). To me,
> > the "flag bit in the signal number" behavior seems more reasonable,
> > since either something correct or something easy to debug will
> > probably happen at runtime.
> > 
> > > > could only appear in newly-written code that doesn't follow our
> > > > recommendations, and there are already plenty of much more likely ways
> > > > to cause buffer overflows in C code that doesn't follow
> > > > recommendations anyway. (And even if such a buffer overflow occurred,
> > > > it would very likely be caught early in development by the MMU due to
> > > > the magnitude of the number 1<<30.)
> > >
> > > Choosing the bit value is hard.  If shifting it overflows, this can
> > > trigger random undefined behaviour in the compiler in addition to (or
> > > perhaps instead of) an out-of-bounds access or segfault.
> > 
> > It wouldn't overflow on a 64-bit architecture assuming normal array
> > indexing (the index would be promoted to pointer width before being
> > scaled to the array element size), and to begin with the users of this
> > would be 64-bit.
> 
> Unless we don't offer this feature for 32-bit at all (possible, if ugly)
> we can't stop people using it.
> 
> > > If shifting it doesn't overflow, we might still fall into a valid
> > > mapping, though I'd agree a segfault is more likely.
> > >
> > > >
> > > > > I think it would be cleaner to add a single flag field that can be
> > > > > used for detecting other extensions, and request it via a new sa_flags
> > > > > bit.  This removes the need for semantically useless zeroing of unused
> > > > > fields (though for hygiene and backwards compatibility reasons we would
> > > > > probably want to carry on zeroing them anyway).
> > > > >
> > > > > I can see no simpler way to add supplementary siginfo fields for
> > > > > existing si_codes.  For si_codes that didn't exist before the zeroing
> > > > > came in we could still detect optional si_code-specific fields via
> > > > > zeroing, but it seems messy to have two ways of detecting extensions.
> > > >
> > > > That would certainly be cleaner if it worked, but that would only be
> > > > the case if old kernels rejected unknown bits in sa_flags, and
> > > > unfortunately they don't. With the bit in the signal number, the "old
> > >
> > > Hmm, that is a problem I wasn't aware of.
> > >
> > > > kernels reject" behavior admits relatively straightforward usage code:
> > > >
> > > > void set_segv_handler(void) {
> > > >   struct sigaction sa;
> > > >   sa.sa_sigaction = handle_segv;
> > > >   sa.sa_flags = SA_SIGINFO;
> > > >   if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
> > > > succeeds in new kernels, fails in old kernels
> > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > >     if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
> > > >       perror("sigaction");
> > > >   }
> > > > }
> > > >
> > > > void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > >   sa->si_future_field = 0;
> > > >   handle_segv(signum, sa, ctx);
> > > > }
> > > >
> > > > void handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > >   // At this point, si_future_field will have the value 0 in old
> > > > kernels and the kernel-supplied value in new kernels.
> > > > }
> > > >
> > > > Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
> > > > number to sa_flags. In that case, the first sigaction would succeed in
> > > > old kernels so handle_segv wouldn't know whether it can safely read
> > > > from si_future_field. With the sa_flags approach, you would need
> > > > kernel version number checking via uname before setting the flag in
> > > > sa_flags, and at that point why even have the flag in sa_flags at all
> > > > since you could just have the signal handler conditionally read from
> > > > si_future_field based on the uname?
> > >
> > > Software setting SA_SIFLAGS (or whatever) is new by definition, since
> > > it would be using a new #define.  So it might be reasonable to put the
> > > burden on that software to verify that the flag was really accepted by
> > > the kernel, by reading it back.
> > 
> > That doesn't seem like a good idea even if it worked, because it could
> > lead to race conditions. If the si_flags-reading signal handler were
> > invoked in response to a signal between when you set it and when you
> > ended up replacing it with the fallback signal handler for old
> > kernels, the handler may end up reading garbage data from si_flags.
> 
> Not really.  My example may have this problem, but the signal handler
> can be written to support both scenarios, based on testing a flag that
> the main program sets after verifying that the flag could be set.  Or
> the signal could be blocked around establishment (often a good idea for
> other reasons).
> 
> But I agree it's a bit gross, and anyway doesn't work due to the fact
> that the kernel doesn't filter out unrecognised flags anyway.
> 
> > > Unfortunately, even relatively recent kernels blindly store sa_flags
> > > in the kernel without validating it, and so it looks like duff flags
> > > can be read back out via a sigaction() call.  Dang.
> > >
> > >
> > > Perhaps a new frontend syscall could be added.  A new libc that knows
> > > about this "sigaction2" could use it and mask off problem bits from
> > > sa_flags in its sigaction() wrapper before calling sigaction2.  An old
> > > libc would call the old sigaction syscall, where we would ignore these
> > > new sa_flags bits as before.
> > 
> > I'm not currently in favor of the new syscall but if we do this I
> > would keep sigaction and sigaction2 separate. That is, libc sigaction
> > should always use the sigaction syscall, and libc sigaction2 should
> > always use the sigaction2 syscall. We should avoid libc's sigaction
> > having different behavior based on the libc version and kernel
> > version, as that would make it harder to reason about its behavior.
> > Calling code would need to check for presence of sigaction2 in both
> > libc and the kernel, e.g.
> > 
> > __attribute__((weak)) decltype(sigaction2) sigaction2;
> > 
> > void set_segv_handler(void) {
> >   struct sigaction sa;
> >   sa.sa_sigaction = handle_segv;
> >   sa.sa_flags = SA_SIGINFO | SA_SIFLAGS;
> >   if (!sigaction2 || sigaction2(SIGSEGV, &sa, 0) < 0) {
> >     sa.sa_sigaction = clear_fields_and_handle_segv;
> >     sa.sa_flags = SA_SIGINFO;
> >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> >       perror("sigaction");
> >   }
> > }
> 
> I guess.  But I share your distaste for adding a new syscall.
> 
> > 
> > > This may not be a popular approach though, and software wouldn't be able
> > > to use our new features until libc is updated to match.
> > >
> > > If we go down this route, it may provide additional opportunities to fix
> > > annoying defects in the old interface.
> > >
> > >
> > > > Note that the same applies to a flag indicating the availability of a
> > > > si_flags field in sigaction (just
> > > > s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
> > > > s/si_future_field/si_flags/ in the usage code above). In terms of
> > > > SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.
> > > >
> > > > Another thought that occurred to me is that we may consider
> > > > generalizing this a step further and introducing a single flag bit in
> > > > the signal number that means "reject unknown flags in sa_flags". This
> > > > would mean that we wouldn't need to add any more flag bits to the
> > > > signal number in the future, thus limiting this signal number hack to
> > > > a single bit; all future mandatory behavior changes could just be put
> > > > behind a flag in sa_flags and userspace code would easily be able to
> > > > detect missing support for a flag and fall back if necessary. In our
> > > > case, this would imply usage code like this:
> > > >
> > > > void set_segv_handler(void) {
> > > >   struct sigaction sa;
> > > >   sa.sa_sigaction = handle_segv;
> > > >   sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
> > > >   // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
> > > >   // Fails in kernels with SF_CHECK_SA_FLAGS support but no
> > > > SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
> > > >   // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
> > > > the bounds check on the signal number).
> > > >   if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
> > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > >     sa.sa_flags = SA_SIGINFO;
> > > >     // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
> > > > we're using sa_flags from the beginning of time.
> > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > >       perror("sigaction");
> > > >   }
> > > > }
> > >
> > > As with the other options this could work, but looks like it could
> > > break the ABI due to violating the original semantics for the signal
> > > number argument.  Perhaps I'm being too paranoid.
> > 
> > There's no ABI being broken here, as long as we consider syscalls to
> > be the stable ABI layer. Old kernels are simply rejecting arguments
> > that they don't know about yet. By that argument, any introduction of
> > a new syscall is an ABI break because it changes the semantics of a
> > previously-unallocated syscall number.
> 
> As argued above, I think this is an invalid argument.
> 
> Although any addition will change behaviour (so is a break in some
> sense), the key is not to make "surprising" changes.
> 
> Having something random happen when setting a previously reserved flag
> bit, or when issuing a syscall with an unknown syscall number, is not
> surprising at all.
> 
> Making fundamental changes to the encoding of an existing argument is
> highly surprising, on the other hand: as your example shows, it is
> reasonable to index an array using a signal number.
> 
> I agree that this doesn't get us closer to a practical solution though.
> 
> 
> But we do seem to need some mechanism in addition to (or instead of)
> sa_flags.
> 
> Here's another thought:
> 
> Since si_flags would be either always present or always absent, it
> could make sense to have a global property to report this, rather than
> an sa_flags or signal number bit to request it per-signal.
> 
> Requiring software to parse uname() might be reasonable for that, if
> cumbersome (did you suggest this previously?).  If we're concerned that
> the awkwardness of this would encourage people not to bother (or
> encourage people to do it wrong) then we might opt for something simpler
> like an AT_FLAGS bit.
> 
> Ultimately libc could provide a more portable interface for discovery,
> such as via sysconf().
> 
> Thoughts?


While you're thinking about that, here's another idea:

It occurs to me that there are spare bits in si_code.  si_code is an
enum, but unlike the signal number there are no specific bounds for
this value, so we may have an easier time recycling bits here.

The high bits of si_code are usually sign-extension and so not always
0, but we can XOR flags into them provided we don't forget the real sign.

Software that isn't expecting twiddled bits would get confused, so we
need a new SA_ flag to enable this.  But this flag (SA_CODEX, below) is
now just a request.  If the kernel doesn't understand it (or without
SA_SIGINFO) then no flags would be reported in si_code, which is
backwards-compatible.  


A handler would now do

void handler(int n, siginfo_t *si, ...)
{
	int flags = SI_FLAGS(si->si_code);
	int code = SI_CODE(si->si_code);

	if (!(flags & SIF_CODEX)) {
		/* flags not supported */
		/* Careful assignment of flag meanings may make this
		   check unnecessary, but it's probably useful for
		   developers for testing their code. */
	}

	/* Handle signal based on n, code and flags */
}


If the kernel doesn't report any flags (perhaps because it's too old)
then SI_FLAGS() will yield 0 and SI_CODE() will just return si_code
unchanged.  This means that even non-SA_CODEX handlers can use these
macros, which may ease migration.

Cheers
---Dave

--8<--

diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index cb3d6c2..4e77c71 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -176,6 +176,18 @@ typedef struct siginfo {
 #define SI_DETHREAD	-7		/* sent by execve() killing subsidiary threads */
 #define SI_ASYNCNL	-60		/* sent by glibc async name lookup completion */
 
+#define __SI_FLAGS	0x7ffff000	/* optional code extension flags */
+#define SIF_CODEX	0x40000000	/* code extension flags supported */
+
+/*
+ * Extract value and extension flags from si_code.
+ * These are only required in handlers registered with SA_CODEX.
+ */
+#define SI_CODE(sicode) \
+	((sicode) >= 0 ? (sicode) & ~__SI_FLAGS : (sicode) | __SI_FLAGS)
+#define SI_FLAGS(sicode) \
+	(((sicode) >= 0 ? (sicode) : ~(sicode)) & __SI_FLAGS)
+
 #define SI_FROMUSER(siptr)	((siptr)->si_code <= 0)
 #define SI_FROMKERNEL(siptr)	((siptr)->si_code > 0)
 
diff --git a/include/uapi/asm-generic/signal.h b/include/uapi/asm-generic/signal.h
index 5c716a9..c20f5f61 100644
--- a/include/uapi/asm-generic/signal.h
+++ b/include/uapi/asm-generic/signal.h
@@ -61,6 +61,7 @@
  * SA_RESETHAND clears the handler when the signal is delivered.
  * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
  * SA_NODEFER prevents the current signal from being masked in the handler.
+ * SA_CODEX allows extension flag reporting in si_code.
  *
  * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
  * Unix names RESETHAND and NODEFER respectively.
@@ -68,6 +69,7 @@
 #define SA_NOCLDSTOP	0x00000001
 #define SA_NOCLDWAIT	0x00000002
 #define SA_SIGINFO	0x00000004
+#define SA_CODEX	0x04000000
 #define SA_ONSTACK	0x08000000
 #define SA_RESTART	0x10000000
 #define SA_NODEFER	0x40000000
diff --git a/kernel/signal.c b/kernel/signal.c
index ee22ec7..8e8550a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-08 13:58                                                                       ` Dave Martin
@ 2020-07-08 22:21                                                                         ` Peter Collingbourne
  2020-07-13 13:24                                                                           ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-07-08 22:21 UTC (permalink / raw)
  To: Dave Martin
  Cc: Catalin Marinas, Kevin Brodsky, Oleg Nesterov, Evgenii Stepanov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Linux ARM, Richard Henderson

On Wed, Jul 8, 2020 at 6:58 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Wed, Jul 08, 2020 at 12:00:22PM +0100, Dave Martin wrote:
> > On Tue, Jul 07, 2020 at 12:07:09PM -0700, Peter Collingbourne wrote:
> > > On Tue, Jul 7, 2020 at 7:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > >
> > > > On Mon, Jul 06, 2020 at 12:20:33PM -0700, Peter Collingbourne wrote:
> > > > > On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > >
> > > > > > On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > > > > > > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > >
> > > > > > > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > > > > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > > > > > > <ebiederm@xmission.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > > > > > > >
> > > > > > > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > > > > > > >>
> > > > > > > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > > > > > > >> >  }
> > > > > > > > > > > > >> >
> > > > > > > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > > > > > > >> >                       const char *str)
> > > > > > > > > > > > >> >  {
> > > > > > > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > > > > > > >> > -  else
> > > > > > > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > > > > > > >> > +  } else {
> > > > > > > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > > > > > > >> > +          info.si_signo = signo;
> > > > > > > > > > > > >> > +          info.si_errno = 0;
> > > > > > > > > > > > >> > +          info.si_code = code;
> > > > > > > > > > > > >> > +          info.si_addr = addr;
> > > > > > > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > > > > > > >> > +          force_sig_info(&info);
> > > > > > > > > > > > >> > +  }
> > > > > > > > > > > > >> >  }
> > > > > > > > > > > > >> >
> > > > > > > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > > > > > > >> > -                      const char *str)
> > > > > > > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > > > > > > >> >  {
> > > > > > > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > > > > > > >> > +
> > > > > > > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > > > > > > >> > +
> > > > > > > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > > > > > > >> > +  info.si_errno = 0;
> > > > > > > > > > > > >> > +  info.si_code = code;
> > > > > > > > > > > > >> > +  info.si_addr = addr;
> > > > > > > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > > > > > > >> > +  force_sig_info(&info);
> > > > > > > > > > > > >> >  }
> > > > > > > > > > > > >>
> > > > > > > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > > > > > > > >> that takes its parameters.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > Can you elaborate?  How would you do this kind of thing.
> > > > > > > > > > > >
> > > > > > > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > > > > > > >
> > > > > > > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > > > > > > > union members simultaneously.
> > > > > > > > > > >
> > > > > > > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > > > > > > being said, this is probably moot with my proposed changes below
> > > > > > > > > > > though.)
> > > > > > > > > >
> > > > > > > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > > > > > > different union member from the one previously written.
> > > > > > > > > >
> > > > > > > > > > Writing a different member from the last one written can still splatter
> > > > > > > > > > on the other members IIUC.
> > > > > > > > > >
> > > > > > > > > > It would be better to keep things separate rather than risk
> > > > > > > > > > incorrectness just to save a few bytes.
> > > > > > > > > >
> > > > > > > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > > > > > > >
> > > > > > > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > > > > > > >
> > > > > > > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > > > > > > >
> > > > > > > > > > > So you want something like:
> > > > > > > > > > >
> > > > > > > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > >
> > > > > > > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > > > > > > >
> > > > > > > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > > > > > > signal generation site...
> > > > > > > > > > > > >
> > > > > > > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Garbled sentence?
> > > > > > > > > > > >
> > > > > > > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > > > > > > can be properly decoded and made sense of.
> > > > > > > > > > > >
> > > > > > > > > > > > I am not seeing anything like that.
> > > > > > > > > > >
> > > > > > > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > > > > > > on 32-bit ARM.
> > > > > > > > > > >
> > > > > > > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > > > > > > >> >                            __u32 _pkey;
> > > > > > > > > > > > >> >                    } _addr_pkey;
> > > > > > > > > > > > >> > +#ifdef __aarch64__
> > > > > > > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > > > > > > >> > +                  struct {
> > > > > > > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > > > > > > >
> > > > > > > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > > > > > > >
> > > > > > > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > > > > > > >> > +#endif
> > > > > > > > > > > > >> >            };
> > > > > > > > > > > > >> >    } _sigfault;
> > > > > > > > > > > > >> >
> > > > > > > > > > > > >>
> > > > > > > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > > > > > > >>
> > > > > > > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > > > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > > > > > > > > >> is built for a single architecture.
> > > > > > > > > > >
> > > > > > > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > > > > > > considering a similar feature:
> > > > > > > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > > > > > > I would have opted to expand this to other architectures on an
> > > > > > > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > > > > > > architectures from the start.
> > > > > > > > > > >
> > > > > > > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > > > > > > this point to accommodate the new fields.
> > > > > > > > > > >
> > > > > > > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > > > > > > >
> > > > > > > > > > I think what we need here is basically a flags word.
> > > > > > > > > >
> > > > > > > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > > > > > > flags word, we can extend as needed.
> > > > > > > > > >
> > > > > > > > > > How the existence of the first flags words is detected is another
> > > > > > > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > > > > > > I guess si_code may be sufficient.
> > > > > > > > >
> > > > > > > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > > > > > > data structure. The zero-initialization of the padding at the end of
> > > > > > > > > the struct is done by the clear_user call here:
> > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > > > > > > >
> > > > > > > > > and the zero-initialization of the padding between fields and unused
> > > > > > > > > union members is done by the clear_siginfo function which the kernel
> > > > > > > > > calls when initializing the data structure:
> > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > > > > > > >
> > > > > > > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > > > > > > support for flagged fields.
> > > > > > > >
> > > > > > > > It's not enough that we do this today.  We would have had to do it back
> > > > > > > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > > > > > > back to when the arch/arm64 was merged).
> > > > > > > >
> > > > > > > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > > > > > > always the case, so unused parts of siginfo could be full of old junk
> > > > > > > > from the user stack, if the kernel is sufficiently old.
> > > > > > > >
> > > > > > > > If we're trying to do something generic that makes sense on all arches,
> > > > > > > > this matters.  I may have misunderstood something about the code though.
> > > > > > >
> > > > > > > Hmm, I think you're right. The current behavior was introduced by
> > > > > > > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > > > > > > released in 4.18. So if an application wants to be compatible with
> > > > > > > pre-4.18 kernels then there would need to be some other way to
> > > > > > > indicate that the fields are valid. Probably the simplest way would be
> > > > > > > to have the application issue a uname(2) syscall and check the kernel
> > > > > > > version before reading these fields. I have a couple of other ideas
> > > > > > > that don't rely on version detection, if we'd prefer to avoid that.
> > > > > > > (They are somewhat ugly, but our hand is forced by backwards
> > > > > > > compatibility.)
> > > > > > >
> > > > > > > One idea is to re-purpose the si_errno field as a flags field for
> > > > > > > certain signal numbers. I checked a few kernel releases going back to
> > > > > > > 2.6.18 and it looks like the field is set to 0 except in the following
> > > > > > > circumstances:
> > > > > > > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > > > > > > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > > > > > > - user-defined signal via kill_pid_usb_asyncio
> > > > > > > - SIGSWI in 3.18 and before (code since removed)
> > > > > > >
> > > > > > > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > > > > > > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > > > > > > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > > > > > > we would need to stop the kernel from setting si_errno to EFAULT for
> > > > > > > this signal before the 5.8 release.
> > > > > > >
> > > > > > > Another idea was to have userspace set a flag in sa_flags when
> > > > > > > registering a signal handler meaning "this signal handler requires
> > > > > > > unknown siginfo fields to be zeroed", and have existing kernels reject
> > > > > > > the syscall due to an unknown flag being set, but unfortunately this
> > > > > > > won't work because existing kernels do not reject sigaction syscalls
> > > > > > > with unknown flags set in sa_flags. A perhaps more radical idea in
> > > > > > > this vein would be to claim some of the upper bits of the signal
> > > > > > > number as flags that will cause the syscall to be rejected if set and
> > > > > > > unknown to the kernel. Existing kernels (going back to at least
> > > > > > > 2.6.18) contain this code in do_sigaction:
> > > > > > >
> > > > > > >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> > > > > > >                 return -EINVAL;
> > > > > > >
> > > > > > > > and valid_signal is defined as:
> > > > > > >
> > > > > > > static inline int valid_signal(unsigned long sig)
> > > > > > > {
> > > > > > >         return sig <= _NSIG ? 1 : 0;
> > > > > > > }
> > > > > > >
> > > > > > > All architectures define _NSIG as a value <= 128, so they will reject
> > > > > > > a signal number with any of bits 8-31 set. This means that we can use
> > > > > > > any of those bits for mandatory flags. Most likely we could use bit 30
> > > > > > > (expanding down as necessary), as it keeps the signal number positive
> > > > > > > and permits future expansion of the signal number range.
> > > > > >
> > > > > > > Does the signal core code actually guarantee to zero the unused fields?
> > > > > > > Unless the fields are poked in by hand this is fraught with subtleties,
> > > > > > especially when unions are involved.  (I'm sure the code tries to do it,
> > > > > > but I've not eyeballed it in detail...)
> > > > >
> > > > > It memsets the siginfo structure before setting the fields and sending
> > > > > the signal (grep for clear_siginfo which is just a memset; you should
> > > > > find a call before all callers of force_sig_info). Memset is the right
> > > > > approach here since unlike setting fields by hand it clears padding
> > > > > which could lead to information leaks from the kernel. IIUC this is
> > > > > the reason why Eric wants all of the signals to be raised via wrappers
> > > > > in kernel/signal.c instead of via force_sig_info directly (to make
> > > > > this aspect easier to audit).
> > > >
> > > > My impression was that the reason for this model is partly to ensure
> > > > that siginfo fields are populated more consistently.  When it was all
> > > > > down to the individual callers, inconsistencies crept in.
> > > >
> > > > With regard to memset(), this is not a complete defence against data
> > > > leakage.  Assigning to a struct member can set any or all padding in
> > > > the struct to random garbage (consider write-combining of neighboring
> > > > > member writes into a single larger access in asm for example).  The
> > >
> > > I don't believe that LLVM will store to padding like this. I don't
> > > know about GCC, though, but I wouldn't be surprised if this is
> > > something that the kernel would want to turn off in "kernel C" (like
> > > it turns off strict aliasing) specifically because of the information
> > > leak issue.
> >
> > Again, the issue is not future kernel builds -- we can always find a way
> > to fix the behaviour for those -- but past kernel builds.

I thought that the whole point of the "bit in the signal number" (or
SI_CODEX or whatever) was that we didn't need to worry about the
behavior of past kernel builds?

> >
> > > > only way to avoid this is to ensure that the struct is 100%
> > > > padding-free, and that each member of a union is the same size.  A
> > > > > quick glance at <uapi/asm-generic/siginfo.h> confirms that this is not
> > > > the case.
> > > >
> > > > This might need to be looked at separately.
> > > >
> > > > But it does mean, strictly speaking, that we can't reliably add new
> > > > fields anywhere that there was previously padding: assigning to
> > > > neighboring members can still fill those with garbage after the
> > > > memset().
> > >
> > > ...but this is largely moot because I'm not proposing to add new
> > > fields in the padding any more (because the fields needed to become
> > > larger in order to accommodate future hypothetical architectures which
> > > might want to use the fields, and thus they wouldn't fit in the
> > > padding). The siginfo.h diff would be something like:
> > >
> > > diff --git a/include/uapi/asm-generic/siginfo.h
> > > b/include/uapi/asm-generic/siginfo.h
> > > index cb3d6c267181..4a2fe257415d 100644
> > > --- a/include/uapi/asm-generic/siginfo.h
> > > +++ b/include/uapi/asm-generic/siginfo.h
> > > @@ -91,7 +91,10 @@ union __sifields {
> > >                                 char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > >                                 __u32 _pkey;
> > >                         } _addr_pkey;
> > > +                       void *_pad[6];
> > >                 };
> > > +               uintptr_t _ignored_bits;
> > > +               uintptr_t _ignored_bits_mask;
> >
> > This _is_ in padding: the tail-padding of the (previously smaller)
> > _sigfault.  Again, the compiler was allowed to populate this area with
> > junk before these fields were added.
> >
> > I agree that it seems fairly unlikely that the compiler would have been
> > overwriting this in normal circumstances, but that's not a guarantee.
> > My worry is that if this goes wrong, it will go wrong silently and
> > unpredictably.
> >
> > >         } _sigfault;
> > >
> > >         /* SIGPOLL */
> > >
> > > or with a "uintptr_t _flags" added in before _ignored_bits if we go with that.
> > >
> > > > > > Using unused bits in the signal number to turn on new functionality
> > > > > > feels risky.  As currently specified, this is just a number.  Since
> > > > > > today a successful sigaction(n ...) guarantees that n is a valid signal
> > > > > > number, reasonable code like the following would trigger a buffer
> > > > > > overrun if we start trying to encode anything else in there:
> > > > > >
> > > > > > struct sigaction actions[NSIG];
> > > > > >
> > > > > > int do_something( ... )
> > > > > > {
> > > > > >         ...
> > > > > >
> > > > > >         if (!sigaction(n, sa, ...)) {
> > > > > >                 actions[n] = *sa;
> > > > > >                 return 0;
> > > > > >         }
> > > > > >
> > > > > >         ...
> > > > > > }
> > > > >
> > > > > I imagine the bit in the signal number being set by the direct caller
> > > > > to sigaction, and we could specifically recommend that calling
> > > > > pattern. In that case, your "n" wouldn't have the bit set in it. It
> > > >
> > > > I can imagine this too, but that doesn't mean that software does it.
> > > >
> > > > If the above kind of thing exists in a framework or library somewhere,
> > > > we could get problems.  Similarly, a pre-existing LD_PRELOAD framework
> > > > that provides a wrapper for sigaction may now go wrong even if your
> > > > pattern is followed -- i.e., the caller thinks it's calling sigaction
> > > > directly but in fact it isn't.
> > >
> > > I'm aware of one library like that. It's called libsigchain, and it
> > > has an early bounds check:
> > > https://cs.android.com/android/platform/superproject/+/master:art/sigchainlib/sigchain.cc;l=371
> > >
> > > Until the library is changed to recognize the flag, calling code would
> > > see the return value of -1 as if the kernel failed the syscall, and
> > > would fall back to the code for old kernels.
> >
> > But only after some bad dereferences.  If these were writes, this means
> > that memory _may_ be silently corrupted (I don't say it's likely in a
> > given case, and we cannot pick a flag bit that makes this impossible).

You're talking about libsigchain, right? I don't see any bad
references, the function returns after noticing the bounds check
failure.

> > So, _even though the user program is correct_, our change may trigger

Let's say that you were talking about some other library and not
libsigchain. Such an interceptor wouldn't be correct though, it failed
to account for our change to the syscall semantics. If the accesses
were before the syscall (or the bounds check), then the interceptor
would not have been correct in the first place because POSIX requires
returning -1 with errno=EINVAL (and not crashing) if the signal number
is invalid.

> > the corruption of arbitrary user memory.  This what I mean by an ABI
> > break.  The fact that the corruption is not done by the syscall itself
> > is no excuse.

At some point, though, accommodating interceptors becomes pretty much
tantamount to saying "we can never change anything". Even just adding
a field to __sifields (which is pretty much required for what we need
to do) could break things in the presence of some interceptors because
the interceptor could be copying the fields manually to a new data
structure before calling the user's signal handler (e.g. because it
wants to defer the signal until later) and miss our new field. I think
most of the other ideas we're discussing fail to meet this bar as well
and I'll go into more details later on.

> > We also fail to notice failures in sigaddset() etc., though in this code
> > it looks like that should not matter.

Maybe you're looking at the handler ("SignalChain::Handler")? The bit
wouldn't be set in the signo argument to the handler. I'm talking
about line 371 of the code I linked, in the sigaction interceptor
"__sigaction" (it looks like sometimes the link doesn't take you to
the correct line for some reason).

> >
> > > In general I think that any library like this with independent
> > > tracking of the kernel's purported signal handler state would need to
> > > be very sensitive to which syscalls are capable of setting signal
> > > handlers, what their semantics are, and so on. This applies to any
> > > change that we might make to the signal handler interface. So for
> > > example, if we introduced a new syscall as you propose below, and the
> > > library hasn't been updated to recognize the new syscall, it will
> > > silently miss changes in signal handler state caused by the new
> > > syscall.
> > >
> > > At the end of this argument lies "we can never change anything about
> > > how signal handlers work because it could break some interposing
> > > library somewhere" -- replace "signal handlers" with any kernel
> > > feature whose behavior may be modified by an interposing library if
> > > you like -- and I don't think we want to go that far. As far as I
> > > know, this isn't really the kernel's business anyway -- the kernel's
> > > stable ABI contract starts and ends with the syscall interface and not
> > > some library on top.
> > >
> > > That being said, we should perhaps try to define our interface so that
> > > something reasonable will probably happen if there is such a library
> > > and it hasn't been updated. With the new syscall, the library will
> > > sometimes silently fail to work in some non-local fashion. With the
> > > flag bit in the signal number, the library will either cause the
> > > caller to fall back to the old kernel code path (if there is a bounds
> > > check) or likely crash loudly (if there is no bounds check). To me,
> > > the "flag bit in the signal number" behavior seems more reasonable,
> > > since either something correct or something easy to debug will
> > > probably happen at runtime.
> > >
> > > > > could only appear in newly-written code that doesn't follow our
> > > > > recommendations, and there are already plenty of much more likely ways
> > > > > to cause buffer overflows in C code that doesn't follow
> > > > > recommendations anyway. (And even if such a buffer overflow occurred,
> > > > > it would very likely be caught early in development by the MMU due to
> > > > > the magnitude of the number 1<<30.)
> > > >
> > > > Choosing the bit value is hard.  If shifting it overflows, this can
> > > > trigger random undefined behaviour in the compiler in addition to (or
> > > > perhaps instead of) an out-of-bounds access or segfault.
> > >
> > > It wouldn't overflow on a 64-bit architecture assuming normal array
> > > indexing (the index would be promoted to pointer width before being
> > > scaled to the array element size), and to begin with the users of this
> > > would be 64-bit.
> >
> > Unless we don't offer this feature for 32-bit at all (possible, if ugly)
> > we can't stop people using it.

My point is that the problem in the interceptor library would probably
be noticed on 64-bit (since that's what most people use these days),
which would probably result in it being fixed by the time it reaches
32-bit users.

> >
> > > > If shifting it doesn't overflow, we might still fall into a valid
> > > > mapping, though I'd agree a segfault is more likely.
> > > >
> > > > >
> > > > > > I think it would be cleaner to add a single flag field that can be
> > > > > > used for detecting other extensions, and request it via a new sa_flags
> > > > > > bit.  This removes the need for semantically useless zeroing of unused
> > > > > > fields (though for hygiene and backwards compatibility reasons we would
> > > > > > probably want to carry on zeroing them anyway).
> > > > > >
> > > > > > I can see no simpler way to add supplementary siginfo fields for
> > > > > > existing si_codes.  For si_codes that didn't exist before the zeroing
> > > > > > came in we could still detect optional si_code-specific fields via
> > > > > > zeroing, but it seems messy to have two ways of detecting extensions.
> > > > >
> > > > > That would certainly be cleaner if it worked, but that would only be
> > > > > the case if old kernels rejected unknown bits in sa_flags, and
> > > > > unfortunately they don't. With the bit in the signal number, the "old
> > > >
> > > > Hmm, that is a problem I wasn't aware of.
> > > >
> > > > > kernels reject" behavior admits relatively straightforward usage code:
> > > > >
> > > > > void set_segv_handler(void) {
> > > > >   struct sigaction sa;
> > > > >   sa.sa_sigaction = handle_segv;
> > > > >   sa.sa_flags = SA_SIGINFO;
> > > > >   if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
> > > > > succeeds in new kernels, fails in old kernels
> > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
> > > > >       perror("sigaction");
> > > > >   }
> > > > > }
> > > > >
> > > > > void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > >   sa->si_future_field = 0;
> > > > >   handle_segv(signum, sa, ctx);
> > > > > }
> > > > >
> > > > > void handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > >   // At this point, si_future_field will have the value 0 in old
> > > > > kernels and the kernel-supplied value in new kernels.
> > > > > }
> > > > >
> > > > > Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
> > > > > number to sa_flags. In that case, the first sigaction would succeed in
> > > > > old kernels so handle_segv wouldn't know whether it can safely read
> > > > > from si_future_field. With the sa_flags approach, you would need
> > > > > kernel version number checking via uname before setting the flag in
> > > > > sa_flags, and at that point why even have the flag in sa_flags at all
> > > > > since you could just have the signal handler conditionally read from
> > > > > si_future_field based on the uname?
> > > >
> > > > Software setting SA_SIFLAGS (or whatever) is new by definition, since
> > > > it would be using a new #define.  So it might be reasonable to put the
> > > > burden on that software to verify that the flag was really accepted by
> > > > the kernel, by reading it back.
> > >
> > > That doesn't seem like a good idea even if it worked, because it could
> > > lead to race conditions. If the si_flags-reading signal handler were
> > > invoked in response to a signal between when you set it and when you
> > > ended up replacing it with the fallback signal handler for old
> > > kernels, the handler may end up reading garbage data from si_flags.
> >
> > Not really.  My example may have this problem, but the signal handler
> > can be written to support both scenarios, based on testing a flag that
> > the main program sets after verifying that the flag could be set.  Or
> > the signal could be blocked around establishment (often a good idea for
> > other reasons).
> >
> > But I agree it's a bit gross, and anyway doesn't work due to the fact
> > that the kernel doesn't filter out unrecognised flags anyway.
> >
> > > > Unfortunately, even relatively recent kernels blindly store sa_flags
> > > > in the kernel without validating it, and so it looks like duff flags
> > > > can be read back out via a sigaction() call.  Dang.
> > > >
> > > >
> > > > Perhaps a new frontend syscall could be added.  A new libc that knows
> > > > about this "sigaction2" could use it and mask off problem bits from
> > > > sa_flags in its sigaction() wrapper before calling sigaction2.  An old
> > > > libc would call the old sigaction syscall, where we would ignore these
> > > > new sa_flags bits as before.
> > >
> > > I'm not currently in favor of the new syscall but if we do this I
> > > would keep sigaction and sigaction2 separate. That is, libc sigaction
> > > should always use the sigaction syscall, and libc sigaction2 should
> > > always use the sigaction2 syscall. We should avoid libc's sigaction
> > > having different behavior based on the libc version and kernel
> > > version, as that would make it harder to reason about its behavior.
> > > Calling code would need to check for presence of sigaction2 in both
> > > libc and the kernel, e.g.
> > >
> > > __attribute__((weak)) decltype(sigaction2) sigaction2;
> > >
> > > void set_segv_handler(void) {
> > >   struct sigaction sa;
> > >   sa.sa_sigaction = handle_segv;
> > >   sa.sa_flags = SA_SIGINFO | SA_SIFLAGS;
> > >   if (!sigaction2 || sigaction2(SIGSEGV, &sa, 0) < 0) {
> > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > >     sa.sa_flags = SA_SIGINFO;
> > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > >       perror("sigaction");
> > >   }
> > > }
> >
> > I guess.  But I share your distaste for adding a new syscall.
> >
> > >
> > > > This may not be a popular approach though, and software wouldn't be able
> > > > to use our new features until libc is updated to match.
> > > >
> > > > If we go down this route, it may provide additional opportunities to fix
> > > > annoying defects in the old interface.
> > > >
> > > >
> > > > > Note that the same applies to a flag indicating the availability of a
> > > > > si_flags field in sigaction (just
> > > > > s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
> > > > > s/si_future_field/si_flags/ in the usage code above). In terms of
> > > > > SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.
> > > > >
> > > > > Another thought that occurred to me is that we may consider
> > > > > generalizing this a step further and introducing a single flag bit in
> > > > > the signal number that means "reject unknown flags in sa_flags". This
> > > > > would mean that we wouldn't need to add any more flag bits to the
> > > > > signal number in the future, thus limiting this signal number hack to
> > > > > a single bit; all future mandatory behavior changes could just be put
> > > > > behind a flag in sa_flags and userspace code would easily be able to
> > > > > detect missing support for a flag and fall back if necessary. In our
> > > > > case, this would imply usage code like this:
> > > > >
> > > > > void set_segv_handler(void) {
> > > > >   struct sigaction sa;
> > > > >   sa.sa_sigaction = handle_segv;
> > > > >   sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
> > > > >   // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
> > > > >   // Fails in kernels with SF_CHECK_SA_FLAGS support but no
> > > > > SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
> > > > >   // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
> > > > > the bounds check on the signal number).
> > > > >   if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
> > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > >     sa.sa_flags = SA_SIGINFO;
> > > > >     // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
> > > > > we're using sa_flags from the beginning of time.
> > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > > >       perror("sigaction");
> > > > >   }
> > > > > }
> > > >
> > > > As with the other options this could work, but looks like it could
> > > > break the ABI due to violating the original semantics for the signal
> > > > number argument.  Perhaps I'm being too paranoid.
> > >
> > > There's no ABI being broken here, as long as we consider syscalls to
> > > be the stable ABI layer. Old kernels are simply rejecting arguments
> > > that they don't know about yet. By that argument, any introduction of
> > > a new syscall is an ABI break because it changes the semantics of a
> > > previously-unallocated syscall number.
> >
> > As argued above, I think this is an invalid argument.
> >
> > Although any addition will change behaviour (so is a break in some
> > sense), the key is not to make "surprising" changes.

If we care about interceptors then I don't think "surprising" comes
into it. It's more a question of "does the anticipated behavior of the
interceptor match our desired behavior", where "desired" means "most
likely to avoid silent breakage". We would need to get into the head
of a potential interceptor author and think about how they would have
handled the signal number argument, as well as other arguments like
sa_flags if we want to go that route, and see whether that behavior
would lead to the desired result.

In this case, I think we exactly want the interceptor author to have
thought "oh, it's just a number, I'll (possibly do a bounds check and
then) use the number as an index into an array". This will lead to one
of two outcomes: crashing (yes, yes, it won't always crash, but if the
alternative is that it never crashes and we get silently incorrect
behavior all of the time, I'll take sometimes crashing) or fail the
bounds check and pretend to be an old kernel (the latter is
anticipated by POSIX which requires returning -1/EINVAL for an invalid
signal number). Each of these behaviors is desirable, as they are
observable failures, which are more likely to result in fixes than
silent ones.

> > Having something random happen when setting a previously reserved flag
> > bit, or when issuing a syscall with an unknown syscall number, is not
> > surprising at all.

Introducing a new syscall is right out in this model. The interceptor
author wouldn't have anticipated our introducing a new syscall, so the
new syscall wouldn't be intercepted and calls to the new syscall would
silently bypass the interceptor. For example, adding sigaction2 could
result in signal handlers being set without the interceptor's
knowledge.

Regarding a sa_flags bit, let's get inside the head of the interceptor
author again. How would they handle a flag bit that they don't
recognize when replacing the signal handler? It wouldn't be correct to
just pass it through to the kernel, or drop the flag on the floor, as
it might be semantically meaningful (and thus could change the calling
convention as SA_SIGINFO does, or change the meaning of fields in
siginfo, as SA_CODEX would do). A correctly written sigaction
interceptor should probably abort the program upon encountering an
unknown flag (thus giving a human a chance to update the interceptor),
but chances are that they don't. Ignoring all but a few flags (and
passing a fixed set of flags to the kernel) seems to be what
libsigchain does, and in the case of SA_CODEX it would seem to result
in desirable behavior (but I suspect that it isn't handling the other
flags correctly), but I could also see an interceptor author just
passing it unchanged to the kernel without checking it (perhaps
because they didn't think about these issues, and because that didn't
matter until now, with the exception of from-the-beginning-of-time
flags like SA_SIGINFO). And with SA_CODEX that could lead to silent
misreading of si_code in the interceptor's signal handler, if it
hasn't been updated to use the new macros.

> > Making fundamental changes to the encoding of an existing argument is
> > highly surprising, on the other hand: as your example shows, it is
> > reasonable to index an array using a signal number.
> >
> > I agree that this doesn't get us closer to a practical solution though.
> >
> >
> > But we do seem to need some mechanism in addition to (or instead of)
> > sa_flags.
> >
> > Here's another thought:
> >
> > Since si_flags would be either always present or always absent, it
> > could make sense to have a global property to report this, rather than
> > an sa_flags or signal number bit to request it per-signal.
> >
> > Requiring software to parse uname() might be reasonable for that, if
> > cumbersome (did you suggest this previously?).  If we're concerned that
> > the awkwardness of this would encourage people not to bother (or
> > encourage people to do it wrong) then we might opt for something simpler
> > like an AT_FLAGS bit.
> >
> > Ultimately libc could provide a more portable interface for discovery,
> > such as via sysconf().
> >
> > Thoughts?

Yes, this was all to avoid the userspace code needing to contain a
version check (or equivalent). Maybe the version check would be better
than the alternatives though (although it's still vulnerable to
non-updated interceptors not copying our new fields). AT_FLAGS sounds
good to me.

> While you're thinking about that, here's another idea:
>
> It occurs to me that there are spare bits in si_code.  si_code is an
> enum, but unlike the signal number there are no specific bounds for
> this value, so we may have an easier time recycling bits here.
>
> The high bits of si_code are usually sign-extension and so not always
> 0, but we can XOR flags into them provided we don't forget the real sign.
>
> Software that isn't expecting twiddled bits would get confused, so we
> need a new SA_ flag to enable this.  But this flag (SA_CODEX) below is
> now just a request.  If the kernel doesn't understand it (or without
> SA_SIGINFO) then no flags would be reported in si_code, which is
> backwards-compatible.
>
> A handler would now do
>
> void handler(int n, siginfo_t *si, ...)
> {
>         int flags = SI_FLAGS(si->si_code);
>         int code = SI_CODE(si->si_code);
>
>         if (!(flags & SIF_CODEX) {
>                 /* flags not supported */
>                 /* Careful assignment of flag meanings may make this
>                    check unnecessary, but it's probably useful for
>                    developers for testing their code. */
>         }
>
>         /* Handle signal based on n, code and flags */
> }
>
>
> If the kernel doesn't report any flags (perhaps because it's too old)
> then SI_FLAGS() will yield 0 and SI_CODE() will just return si_code
> unchanged.  This means that even non-SA_CODEX handlers can use these
> macros, which may ease migration.

Thanks, this seems more appealing to me than the bit in the signal
number idea. It uses the sa_flags field as intended and doesn't abuse
the fields of the siginfo data structure too much. I don't think we
should put as much weight into interceptor concerns as you do, so you
can consider my above argumentation to be from a devil's advocate
perspective.

>
> Cheers
> ---Dave
>
> --8<--
>
> diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> index cb3d6c2..4e77c71 100644
> --- a/include/uapi/asm-generic/siginfo.h
> +++ b/include/uapi/asm-generic/siginfo.h
> @@ -176,6 +176,18 @@ typedef struct siginfo {
>  #define SI_DETHREAD    -7              /* sent by execve() killing subsidiary threads */
>  #define SI_ASYNCNL     -60             /* sent by glibc async name lookup completion */
>
> +#define __SI_FLAGS     0x7ffff000      /* optional code extension flags */
> +#define SIF_CODEX      0x40000000      /* code extension flags supported */

I don't think we would need this flag because even old kernels
"support" the extension flags under this scheme (they just don't
report any features). We would only need to introduce flags for actual
features.

Peter

> +
> +/*
> + * Extract value and extension flags from si_code.
> + * These are only required in handlers registered with SA_CODEX.
> + */
> +#define SI_CODE(sicode)
> +       ((sicode) >= 0 ? (sicode) & ~__SI_FLAGS : (sicode) | __SI_FLAGS)
> +#define SI_FLAGS(sicode) \
> +       (((sicode) >= 0 ? (sicode) : ~(sicode)) & __SI_FLAGS)
> +
>  #define SI_FROMUSER(siptr)     ((siptr)->si_code <= 0)
>  #define SI_FROMKERNEL(siptr)   ((siptr)->si_code > 0)
>
> diff --git a/include/uapi/asm-generic/signal.h b/include/uapi/asm-generic/signal.h
> index 5c716a9..c20f5f61 100644
> --- a/include/uapi/asm-generic/signal.h
> +++ b/include/uapi/asm-generic/signal.h
> @@ -61,6 +61,7 @@
>   * SA_RESETHAND clears the handler when the signal is delivered.
>   * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
>   * SA_NODEFER prevents the current signal from being masked in the handler.
> + * SA_CODEX allows extension flag reporting in si_code.
>   *
>   * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
>   * Unix names RESETHAND and NODEFER respectively.
> @@ -68,6 +69,7 @@
>  #define SA_NOCLDSTOP   0x00000001
>  #define SA_NOCLDWAIT   0x00000002
>  #define SA_SIGINFO     0x00000004
> +#define SA_CODEX       0x04000000
>  #define SA_ONSTACK     0x08000000
>  #define SA_RESTART     0x10000000
>  #define SA_NODEFER     0x40000000
> diff --git a/kernel/signal.c b/kernel/signal.c
> index ee22ec7..8e8550a 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-08 22:21                                                                         ` Peter Collingbourne
@ 2020-07-13 13:24                                                                           ` Dave Martin
  2020-07-13 20:50                                                                             ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-07-13 13:24 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Linux ARM, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Wed, Jul 08, 2020 at 03:21:13PM -0700, Peter Collingbourne wrote:
> On Wed, Jul 8, 2020 at 6:58 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Wed, Jul 08, 2020 at 12:00:22PM +0100, Dave Martin wrote:
> > > On Tue, Jul 07, 2020 at 12:07:09PM -0700, Peter Collingbourne wrote:
> > > > On Tue, Jul 7, 2020 at 7:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > >
> > > > > On Mon, Jul 06, 2020 at 12:20:33PM -0700, Peter Collingbourne wrote:
> > > > > > On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > >
> > > > > > > On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > > > > > > > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > >
> > > > > > > > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > > > > > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > > > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > > > > > > > <ebiederm@xmission.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > > > > > > > >
> > > > > > > > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > > > > > > > >>
> > > > > > > > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > >> >
> > > > > > > > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > > > > > > > >> >                       const char *str)
> > > > > > > > > > > > > >> >  {
> > > > > > > > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > > > > > > > >> > -  else
> > > > > > > > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > > > > > > > >> > +  } else {
> > > > > > > > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > > > > > > > >> > +          info.si_signo = signo;
> > > > > > > > > > > > > >> > +          info.si_errno = 0;
> > > > > > > > > > > > > >> > +          info.si_code = code;
> > > > > > > > > > > > > >> > +          info.si_addr = addr;
> > > > > > > > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > > > > > > > >> > +          force_sig_info(&info);
> > > > > > > > > > > > > >> > +  }
> > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > >> >
> > > > > > > > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > > > > > > > >> > -                      const char *str)
> > > > > > > > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > > > > > > > >> >  {
> > > > > > > > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > > > > > > > >> > +
> > > > > > > > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > > > > > > > >> > +
> > > > > > > > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > > > > > > > >> > +  info.si_errno = 0;
> > > > > > > > > > > > > >> > +  info.si_code = code;
> > > > > > > > > > > > > >> > +  info.si_addr = addr;
> > > > > > > > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > > > > > > > >> > +  force_sig_info(&info);
> > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > >>
> > > > > > > > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > > > > > > > > >> that takes its parameters.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Can you elaborate?  How would you do this kind of thing?
> > > > > > > > > > > > >
> > > > > > > > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > > > > > > > > union members simultaneously.
> > > > > > > > > > > >
> > > > > > > > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > > > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > > > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > > > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > > > > > > > being said, this is probably moot with my proposed changes below
> > > > > > > > > > > > though.)
> > > > > > > > > > >
> > > > > > > > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > > > > > > > different union member from the one previously written.
> > > > > > > > > > >
> > > > > > > > > > > Writing a different member from the last one written can still splatter
> > > > > > > > > > > on the other members IIUC.
> > > > > > > > > > >
> > > > > > > > > > > It would be better to keep things separate rather than risk
> > > > > > > > > > > incorrectness just to save a few bytes.
> > > > > > > > > > >
> > > > > > > > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > > > > > > > >
> > > > > > > > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > > > > > > > >
> > > > > > > > > > > > So you want something like:
> > > > > > > > > > > >
> > > > > > > > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > > > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > > > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > > >
> > > > > > > > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > > > > > > > >
> > > > > > > > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > > > > > > > signal generation site...
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Garbled sentence?
> > > > > > > > > > > > >
> > > > > > > > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > > > > > > > can be properly decoded and made sense of.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I am not seeing anything like that.
> > > > > > > > > > > >
> > > > > > > > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > > > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > > > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > > > > > > > on 32-bit ARM.
> > > > > > > > > > > >
> > > > > > > > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > > > > > > > >> >                            __u32 _pkey;
> > > > > > > > > > > > > >> >                    } _addr_pkey;
> > > > > > > > > > > > > >> > +#ifdef __aarch64__
> > > > > > > > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > > > > > > > >> > +                  struct {
> > > > > > > > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > > > > > > > >
> > > > > > > > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > > > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > > > > > > > >> > +#endif
> > > > > > > > > > > > > >> >            };
> > > > > > > > > > > > > >> >    } _sigfault;
> > > > > > > > > > > > > >> >
> > > > > > > > > > > > > >>
> > > > > > > > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > > > > > > > >>
> > > > > > > > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > > > > > > > > >> code.  This kind of code does not get enough attention/maintenance if it
> > > > > > > > > > > > > >> is built for a single architecture.
> > > > > > > > > > > >
> > > > > > > > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > > > > > > > considering a similar feature:
> > > > > > > > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > > > > > > > I would have opted to expand this to other architectures on an
> > > > > > > > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > > > > > > > architectures from the start.
> > > > > > > > > > > >
> > > > > > > > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > > > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > > > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > > > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > > > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > > > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > > > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > > > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > > > > > > > this point to accommodate the new fields.
> > > > > > > > > > > >
> > > > > > > > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > > > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > > > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > > > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > > > > > > > >
> > > > > > > > > > > I think what we need here is basically a flags word.
> > > > > > > > > > >
> > > > > > > > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > > > > > > > flags word, we can extend as needed.
> > > > > > > > > > >
> > > > > > > > > > > > How the existence of the first flags word is detected is another
> > > > > > > > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > > > > > > > I guess si_code may be sufficient.
> > > > > > > > > >
> > > > > > > > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > > > > > > > data structure. The zero-initialization of the padding at the end of
> > > > > > > > > > the struct is done by the clear_user call here:
> > > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > > > > > > > >
> > > > > > > > > > and the zero-initialization of the padding between fields and unused
> > > > > > > > > > union members is done by the clear_siginfo function which the kernel
> > > > > > > > > > calls when initializing the data structure:
> > > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > > > > > > > >
> > > > > > > > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > > > > > > > support for flagged fields.
> > > > > > > > >
> > > > > > > > > It's not enough that we do this today.  We would have had to do it back
> > > > > > > > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > > > > > > > back to when the arch/arm64 was merged).
> > > > > > > > >
> > > > > > > > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > > > > > > > always the case, so unused parts of siginfo could be full of old junk
> > > > > > > > > from the user stack, if the kernel is sufficiently old.
> > > > > > > > >
> > > > > > > > > If we're trying to do something generic that makes sense on all arches,
> > > > > > > > > this matters.  I may have misunderstood something about the code though.
> > > > > > > >
> > > > > > > > Hmm, I think you're right. The current behavior was introduced by
> > > > > > > > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > > > > > > > released in 4.18. So if an application wants to be compatible with
> > > > > > > > pre-4.18 kernels then there would need to be some other way to
> > > > > > > > indicate that the fields are valid. Probably the simplest way would be
> > > > > > > > to have the application issue a uname(2) syscall and check the kernel
> > > > > > > > version before reading these fields. I have a couple of other ideas
> > > > > > > > that don't rely on version detection, if we'd prefer to avoid that.
> > > > > > > > (They are somewhat ugly, but our hand is forced by backwards
> > > > > > > > compatibility.)
> > > > > > > >
> > > > > > > > One idea is to re-purpose the si_errno field as a flags field for
> > > > > > > > certain signal numbers. I checked a few kernel releases going back to
> > > > > > > > 2.6.18 and it looks like the field is set to 0 except in the following
> > > > > > > > circumstances:
> > > > > > > > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > > > > > > > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > > > > > > > - user-defined signal via kill_pid_usb_asyncio
> > > > > > > > - SIGSWI in 3.18 and before (code since removed)
> > > > > > > >
> > > > > > > > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > > > > > > > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > > > > > > > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > > > > > > > we would need to stop the kernel from setting si_errno to EFAULT for
> > > > > > > > this signal before the 5.8 release.
> > > > > > > >
> > > > > > > > Another idea was to have userspace set a flag in sa_flags when
> > > > > > > > registering a signal handler meaning "this signal handler requires
> > > > > > > > unknown siginfo fields to be zeroed", and have existing kernels reject
> > > > > > > > the syscall due to an unknown flag being set, but unfortunately this
> > > > > > > > won't work because existing kernels do not reject sigaction syscalls
> > > > > > > > with unknown flags set in sa_flags. A perhaps more radical idea in
> > > > > > > > this vein would be to claim some of the upper bits of the signal
> > > > > > > > number as flags that will cause the syscall to be rejected if set and
> > > > > > > > unknown to the kernel. Existing kernels (going back to at least
> > > > > > > > 2.6.18) contain this code in do_sigaction:
> > > > > > > >
> > > > > > > >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> > > > > > > >                 return -EINVAL;
> > > > > > > >
> > > > > > > > > and valid_signal is defined as:
> > > > > > > >
> > > > > > > > static inline int valid_signal(unsigned long sig)
> > > > > > > > {
> > > > > > > >         return sig <= _NSIG ? 1 : 0;
> > > > > > > > }
> > > > > > > >
> > > > > > > > All architectures define _NSIG as a value <= 128, so they will reject
> > > > > > > > a signal number with any of bits 8-31 set. This means that we can use
> > > > > > > > any of those bits for mandatory flags. Most likely we could use bit 30
> > > > > > > > (expanding down as necessary), as it keeps the signal number positive
> > > > > > > > and permits future expansion of the signal number range.
> > > > > > >
> > > > > > > > Does the signal core code actually guarantee to zero the unused fields?
> > > > > > > > Unless the fields are poked in by hand this is fraught with subtleties,
> > > > > > > especially when unions are involved.  (I'm sure the code tries to do it,
> > > > > > > but I've not eyeballed it in detail...)
> > > > > >
> > > > > > It memsets the siginfo structure before setting the fields and sending
> > > > > > the signal (grep for clear_siginfo which is just a memset; you should
> > > > > > find a call before all callers of force_sig_info). Memset is the right
> > > > > > approach here since unlike setting fields by hand it clears padding
> > > > > > which could lead to information leaks from the kernel. IIUC this is
> > > > > > the reason why Eric wants all of the signals to be raised via wrappers
> > > > > > in kernel/signal.c instead of via force_sig_info directly (to make
> > > > > > this aspect easier to audit).
> > > > >
> > > > > My impression was that the reason for this model is partly to ensure
> > > > > that siginfo fields are populated more consistently.  When it was all
> > > > > > down to the individual callers, inconsistencies crept in.
> > > > >
> > > > > With regard to memset(), this is not a complete defence against data
> > > > > leakage.  Assigning to a struct member can set any or all padding in
> > > > > the struct to random garbage (consider write-combining of neighboring
> > > > > > member writes into a single larger access in asm for example).  The
> > > >
> > > > I don't believe that LLVM will store to padding like this. I don't
> > > > know about GCC, though, but I wouldn't be surprised if this is
> > > > something that the kernel would want to turn off in "kernel C" (like
> > > > it turns off strict aliasing) specifically because of the information
> > > > leak issue.
> > >
> > > Again, the issue is not future kernel builds -- we can always find a way
> > > to fix the behaviour for those -- but past kernel builds.
> 
> I thought that the whole point of the "bit in the signal number" (or
> SI_CODEX or whatever) was that we didn't need to worry about the
> behavior of past kernel builds?

It depends on what we use the new flag(s) for.

If the flag means just that unused padding is safely zeroed, that could
work -- but we'd want high confidence that it really is zeroed even in
wacky configurations.

> > > > > only way to avoid this is to ensure that the struct is 100%
> > > > > padding-free, and that each member of a union is the same size.  A
> > > > > > quick glance at <uapi/asm-generic/siginfo.h> confirms that this is not
> > > > > the case.
> > > > >
> > > > > This might need to be looked at separately.
> > > > >
> > > > > But it does mean, strictly speaking, that we can't reliably add new
> > > > > fields anywhere that there was previously padding: assigning to
> > > > > neighboring members can still fill those with garbage after the
> > > > > memset().
> > > >
> > > > ...but this is largely moot because I'm not proposing to add new
> > > > fields in the padding any more (because the fields needed to become
> > > > larger in order to accommodate future hypothetical architectures which
> > > > might want to use the fields, and thus they wouldn't fit in the
> > > > padding). The siginfo.h diff would be something like:
> > > >
> > > > diff --git a/include/uapi/asm-generic/siginfo.h
> > > > b/include/uapi/asm-generic/siginfo.h
> > > > index cb3d6c267181..4a2fe257415d 100644
> > > > --- a/include/uapi/asm-generic/siginfo.h
> > > > +++ b/include/uapi/asm-generic/siginfo.h
> > > > @@ -91,7 +91,10 @@ union __sifields {
> > > >                                 char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > >                                 __u32 _pkey;
> > > >                         } _addr_pkey;
> > > > +                       void *_pad[6];
> > > >                 };
> > > > +               uintptr_t _ignored_bits;
> > > > +               uintptr_t _ignored_bits_mask;
> > >
> > > This _is_ in padding: the tail-padding of the (previously smaller)
> > > _sigfault.  Again, the compiler was allowed to populate this area with
> > > junk before these fields were added.
> > >
> > > I agree that it seems fairly unlikely that the compiler would have been
> > > overwriting this in normal circumstances, but that's not a guarantee.
> > > My worry is that if this goes wrong, it will go wrong silently and
> > > unpredictably.
> > >
> > > >         } _sigfault;
> > > >
> > > >         /* SIGPOLL */
> > > >
> > > > or with a "uintptr_t _flags" added in before _ignored_bits if we go with that.
> > > >
> > > > > > > Using unused bits in the signal number to turn on new functionality
> > > > > > > feels risky.  As currently specified, this is just a number.  Since
> > > > > > > today a successful sigaction(n ...) guarantees that n is a valid signal
> > > > > > > number, reasonable code like the following would trigger a buffer
> > > > > > > overrun if we start trying to encode anything else in there:
> > > > > > >
> > > > > > > struct sigaction actions[NSIG];
> > > > > > >
> > > > > > > int do_something( ... )
> > > > > > > {
> > > > > > >         ...
> > > > > > >
> > > > > > >         if (!sigaction(n, sa, ...)) {
> > > > > > >                 actions[n] = *sa;
> > > > > > >                 return 0;
> > > > > > >         }
> > > > > > >
> > > > > > >         ...
> > > > > > > }
> > > > > >
> > > > > > I imagine the bit in the signal number being set by the direct caller
> > > > > > to sigaction, and we could specifically recommend that calling
> > > > > > pattern. In that case, your "n" wouldn't have the bit set in it. It
> > > > >
> > > > > I can imagine this too, but that doesn't mean that software does it.
> > > > >
> > > > > If the above kind of thing exists in a framework or library somewhere,
> > > > > we could get problems.  Similarly, a pre-existing LD_PRELOAD framework
> > > > > that provides a wrapper for sigaction may now go wrong even if your
> > > > > pattern is followed -- i.e., the caller thinks it's calling sigaction
> > > > > directly but in fact it isn't.
> > > >
> > > > I'm aware of one library like that. It's called libsigchain, and it
> > > > has an early bounds check:
> > > > https://cs.android.com/android/platform/superproject/+/master:art/sigchainlib/sigchain.cc;l=371
> > > >
> > > > Until the library is changed to recognize the flag, calling code would
> > > > see the return value of -1 as if the kernel failed the syscall, and
> > > > would fall back to the code for old kernels.
> > >
> > > But only after some bad dereferences.  If these were writes, this means
> > > that memory _may_ be silently corrupted (I don't say it's likely in a
> > > given case, and we cannot pick a flag bit that makes this impossible).
> 
> You're talking about libsigchain, right? I don't see any bad
> references, the function returns after noticing the bounds check
> failure.

Yes, I confused myself by reading Handler() out of context.  The kernel
will invoke this with signo to a real signal number (without any flags).

The sigaction wrapper does the bounds check before doing anything else,
just as you say -- so that looks fine.

(Side question: is all this thread-safe?  Is there some implicit locking
somewhere?)

> > > So, _even though the user program is correct_, our change may trigger
> 
> Let's say that you were talking about some other library and not
> libsigchain. Such an interceptor wouldn't be correct though, it failed
> to account for our change to the syscall semantics. If the accesses
> were before the syscall (or the bounds check), then the interceptor
> would not have been correct in the first place because POSIX requires
> returning -1 with errno=EINVAL (and not crashing) if the signal number
> is invalid.
> 
> > > the corruption of arbitrary user memory.  This is what I mean by an ABI
> > > break.  The fact that the corruption is not done by the syscall itself
> > > is no excuse.
> 
> At some point, though, accommodating interceptors becomes pretty much
> tantamount to saying "we can never change anything". Even just adding
> a field to __sifields (which is pretty much required for what we need
> to do) could break things in the presence of some interceptors because
> the interceptor could be copying the fields manually to a new data
> structure before calling the user's signal handler (e.g. because it
> wants to defer the signal until later) and miss our new field. I think
> most of the other ideas we're discussing fail to meet this bar as well
> and I'll go into more details later on.

I agree we cannot always avoid breaking such things.  But we should do
our best to avoid it.

> > > We also fail to notice failures in sigaddset() etc., though in this code
> > > it looks like that should not matter.
> 
> Maybe you're looking at the handler ("SignalChain::Handler")? The bit
> wouldn't be set in the signo argument to the handler. I'm talking
> about line 371 of the code I linked, in the sigaction interceptor
> "__sigaction" (it looks like sometimes the link doesn't take you to
> the correct line for some reason).

Ack, I confused myself.

> > > > In general I think that any library like this with independent
> > > > tracking of the kernel's purported signal handler state would need to
> > > > be very sensitive to which syscalls are capable of setting signal
> > > > handlers, what their semantics are, and so on. This applies to any
> > > > change that we might make to the signal handler interface. So for
> > > > example, if we introduced a new syscall as you propose below, and the
> > > > library hasn't been updated to recognize the new syscall, it will
> > > > silently miss changes in signal handler state caused by the new
> > > > syscall.
> > > >
> > > > At the end of this argument lies "we can never change anything about
> > > > how signal handlers work because it could break some interposing
> > > > library somewhere" -- replace "signal handlers" with any kernel
> > > > feature whose behavior may be modified by an interposing library if
> > > > you like -- and I don't think we want to go that far. As far as I
> > > > know, this isn't really the kernel's business anyway -- the kernel's
> > > > stable ABI contract starts and ends with the syscall interface and not
> > > > some library on top.
> > > >
> > > > That being said, we should perhaps try to define our interface so that
> > > > something reasonable will probably happen if there is such a library
> > > > and it hasn't been updated. With the new syscall, the library will
> > > > sometimes silently fail to work in some non-local fashion. With the
> > > > flag bit in the signal number, the library will either cause the
> > > > caller to fall back to the old kernel code path (if there is a bounds
> > > > check) or likely crash loudly (if there is no bounds check). To me,
> > > > the "flag bit in the signal number" behavior seems more reasonable,
> > > > since either something correct or something easy to debug will
> > > > probably happen at runtime.
> > > >
> > > > > > could only appear in newly-written code that doesn't follow our
> > > > > > recommendations, and there are already plenty of much more likely ways
> > > > > > to cause buffer overflows in C code that doesn't follow
> > > > > > recommendations anyway. (And even if such a buffer overflow occurred,
> > > > > > it would very likely be caught early in development by the MMU due to
> > > > > > the magnitude of the number 1<<30.)
> > > > >
> > > > > > Choosing the bit value is hard.  If shifting it overflows, this can
> > > > > trigger random undefined behaviour in the compiler in addition to (or
> > > > > perhaps instead of) an out-of-bounds access or segfault.
> > > >
> > > > It wouldn't overflow on a 64-bit architecture assuming normal array
> > > > indexing (the index would be promoted to pointer width before being
> > > > scaled to the array element size), and to begin with the users of this
> > > > would be 64-bit.
> > >
> > > Unless we don't offer this feature for 32-bit at all (possible, if ugly)
> > > we can't stop people using it.
> 
> My point is that the problem in the interceptor library would probably
> be noticed on 64-bit (since that's what most people use these days),
> which would probably result in it being fixed by the time it reaches
> 32-bit users.

Agreed.  But we shouldn't take such bets unless we really have to.

> > > > > If shifting it doesn't overflow, we might still fall into a valid
> > > > > mapping, though I'd agree a segfault is more likely.
> > > > >
> > > > > >
> > > > > > > > I think it would be cleaner to add a single flag field that can be
> > > > > > > used for detecting other extensions, and request it via a new sa_flags
> > > > > > > > bit.  This removes the need for semantically useless zeroing of unused
> > > > > > > fields (though for hygiene and backwards compatibility reasons we would
> > > > > > > probably want to carry on zeroing them anyway).
> > > > > > >
> > > > > > > I can see no simpler way to add supplementary siginfo fields for
> > > > > > > existing si_codes.  For si_codes that didn't exist before the zeroing
> > > > > > > came in we could still detect optional si_code-specific fields via
> > > > > > > > zeroing, but it seems messy to have two ways of detecting extensions.
> > > > > >
> > > > > > That would certainly be cleaner if it worked, but that would only be
> > > > > > the case if old kernels rejected unknown bits in sa_flags, and
> > > > > > unfortunately they don't. With the bit in the signal number, the "old
> > > > >
> > > > > Hmm, that is a problem I wasn't aware of.
> > > > >
> > > > > > kernels reject" behavior admits relatively straightforward usage code:
> > > > > >
> > > > > > void set_segv_handler(void) {
> > > > > >   struct sigaction sa;
> > > > > >   sa.sa_sigaction = handle_segv;
> > > > > >   sa.sa_flags = SA_SIGINFO;
> > > > > >   if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
> > > > > > succeeds in new kernels, fails in old kernels
> > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
> > > > > >       perror("sigaction");
> > > > > >   }
> > > > > > }
> > > > > >
> > > > > > void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > > >   sa->si_future_field = 0;
> > > > > >   handle_segv(signum, sa, ctx);
> > > > > > }
> > > > > >
> > > > > > void handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > > >   // At this point, si_future_field will have the value 0 in old
> > > > > > kernels and the kernel-supplied value in new kernels.
> > > > > > }
> > > > > >
> > > > > > Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
> > > > > > number to sa_flags. In that case, the first sigaction would succeed in
> > > > > > old kernels so handle_segv wouldn't know whether it can safely read
> > > > > > from si_future_field. With the sa_flags approach, you would need
> > > > > > kernel version number checking via uname before setting the flag in
> > > > > > sa_flags, and at that point why even have the flag in sa_flags at all
> > > > > > since you could just have the signal handler conditionally read from
> > > > > > si_future_field based on the uname?
> > > > >
> > > > > Software setting SA_SIFLAGS (or whatever) is new by definition, since
> > > > > it would be using a new #define.  So it might be reasonable to put the
> > > > > burden on that software to verify that the flag was really accepted by
> > > > > the kernel, by reading it back.
> > > >
> > > > That doesn't seem like a good idea even if it worked, because it could
> > > > lead to race conditions. If the si_flags-reading signal handler were
> > > > invoked in response to a signal between when you set it and when you
> > > > ended up replacing it with the fallback signal handler for old
> > > > kernels, the handler may end up reading garbage data from si_flags.
> > >
> > > Not really.  My example may have this problem, but the signal handler
> > > can be written to support both scenarios, based on testing a flag that
> > > the main program sets after verifying that the flag could be set.  Or
> > > the signal could be blocked around establishment (often a good idea for
> > > other reasons).
> > >
> > > But I agree it's a bit gross, and anyway doesn't work due to the fact
> > > that the kernel doesn't filter out unrecognised flags anyway.
> > >
> > > > > Unfortunately, even relatively recent kernels blindly store sa_flags
> > > > > in the kernel without validating it, and so it looks like duff flags
> > > > > can be read back out via a sigaction() call.  Dang.
> > > > >
> > > > >
> > > > > Perhaps a new frontend syscall could be added.  A new libc that knows
> > > > > about this "sigaction2" could use it and mask off problem bits from
> > > > > sa_flags in its sigaction() wrapper before calling sigaction2.  An old
> > > > > libc would call the old sigaction syscall, where we would ignore these
> > > > > new sa_flags bits as before.
> > > >
> > > > I'm not currently in favor of the new syscall but if we do this I
> > > > would keep sigaction and sigaction2 separate. That is, libc sigaction
> > > > should always use the sigaction syscall, and libc sigaction2 should
> > > > always use the sigaction2 syscall. We should avoid libc's sigaction
> > > > having different behavior based on the libc version and kernel
> > > > version, as that would make it harder to reason about its behavior.
> > > > Calling code would need to check for presence of sigaction2 in both
> > > > libc and the kernel, e.g.
> > > >
> > > > __attribute__((weak)) decltype(sigaction2) sigaction2;
> > > >
> > > > void set_segv_handler(void) {
> > > >   struct sigaction sa;
> > > >   sa.sa_sigaction = handle_segv;
> > > >   sa.sa_flags = SA_SIGINFO | SA_SIFLAGS;
> > > >   if (!sigaction2 || sigaction2(SIGSEGV, &sa, 0) < 0) {
> > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > >     sa.sa_flags = SA_SIGINFO;
> > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > >       perror("sigaction");
> > > >   }
> > > > }
> > >
> > > I guess.  But I share your distaste for adding a new syscall.
> > >
> > > >
> > > > > This may not be a popular approach though, and software wouldn't be able
> > > > > to use our new features until libc is updated to match.
> > > > >
> > > > > If we go down this route, it may provide additional opportunities to fix
> > > > > annoying defects in the old interface.
> > > > >
> > > > >
> > > > > > Note that the same applies to a flag indicating the availability of a
> > > > > > si_flags field in sigaction (just
> > > > > > s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
> > > > > > s/si_future_field/si_flags/ in the usage code above). In terms of
> > > > > > SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.
> > > > > >
> > > > > > Another thought that occurred to me is that we may consider
> > > > > > generalizing this a step further and introducing a single flag bit in
> > > > > > the signal number that means "reject unknown flags in sa_flags". This
> > > > > > would mean that we wouldn't need to add any more flag bits to the
> > > > > > signal number in the future, thus limiting this signal number hack to
> > > > > > a single bit; all future mandatory behavior changes could just be put
> > > > > > behind a flag in sa_flags and userspace code would easily be able to
> > > > > > detect missing support for a flag and fall back if necessary. In our
> > > > > > case, this would imply usage code like this:
> > > > > >
> > > > > > void set_segv_handler(void) {
> > > > > >   struct sigaction sa;
> > > > > >   sa.sa_sigaction = handle_segv;
> > > > > >   sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
> > > > > >   // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
> > > > > >   // Fails in kernels with SF_CHECK_SA_FLAGS support but no
> > > > > > SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
> > > > > >   // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
> > > > > > the bounds check on the signal number).
> > > > > >   if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
> > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > >     sa.sa_flags = SA_SIGINFO;
> > > > > >     // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
> > > > > > we're using sa_flags from the beginning of time.
> > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > > > >       perror("sigaction");
> > > > > >   }
> > > > > > }
> > > > >
> > > > > As with the other options this could work, but looks like it could
> > > > > break the ABI due to violating the original semantics for the signal
> > > > > number argument.  Perhaps I'm being too paranoid.
> > > >
> > > > There's no ABI being broken here, as long as we consider syscalls to
> > > > be the stable ABI layer. Old kernels are simply rejecting arguments
> > > > that they don't know about yet. By that argument, any introduction of
> > > > a new syscall is an ABI break because it changes the semantics of a
> > > > previously-unallocated syscall number.
> > >
> > > As argued above, I think this is an invalid argument.
> > >
> > > Although any addition will change behaviour (so is a break in some
> > > sense), the key is not to make "surprising" changes.
> 
> If we care about interceptors then I don't think "surprising" comes
> into it. It's more a question of "does the anticipated behavior of the
> interceptor match our desired behavior", where "desired" means "most
> likely to avoid silent breakage". We would need to get into the head
> of a potential interceptor author and think about how they would have
> handled the signal number argument, as well as other arguments like
> sa_flags if we want to go that route, and see whether that behavior
> would lead to the desired result.

That's exactly what I mean by "surprising".  However, not every
interceptor author will be making the same assumptions, and not every
bit of software affected will be an interceptor.  So some judgement
needs to be applied.

> In this case, I think we exactly want the interceptor author to have
> thought "oh, it's just a number, I'll (possibly do a bounds check and
> then) use the number as an index into an array". This will lead to one
> of two outcomes: crashing (yes, yes, it won't always crash, but if the
> alternative is that it never crashes and we get silently incorrect
> behavior all of the time, I'll take sometimes crashing) or fail the
> bounds check and pretend to be an old kernel (the latter is
> anticipated by POSIX which requires returning -1/EINVAL for an invalid
> signal number). Each of these behaviors is desirable, as they are
> observable failures, which are more likely to result in fixes than
> silent ones.

Agreed, except wanting the author to have thought something doesn't
ensure that they actually did think that.

> > > Having something random happen when setting a previously reserved flag
> > > bit, or when issuing a syscall when an unknown syscall number, or not
> > > surprising at all.
> 
> Introducing a new syscall is right out in this model. The interceptor
> author wouldn't have anticipated our introducing a new syscall, so the
> new syscall wouldn't be intercepted and calls to the new syscall would
> silently bypass the interceptor. For example, adding sigaction2 could
> result in signal handlers being set without the interceptor's
> knowledge.

Agreed.  My sentence was a bit mangled: I mean to say "Having something
random happen when [...] issuing a syscall *with* an unknown syscall
number *is* not surprising at all."

I agree that adding a new syscall is problematic if we want to avoid
breaking existing interceptors in particular.  Other types of code are
much less likely to be affected by the addition of new syscalls.

> Regarding a sa_flags bit, let's get inside the head of the interceptor
> author again. How would they handle a flag bit that they don't
> recognize when replacing the signal handler? It wouldn't be correct to
> just pass it through to the kernel, or drop the flag on the floor, as
> it might be semantically meaningful (and thus could change the calling
> convention as SA_SIGINFO does, or change the meaning of fields in
> siginfo, as SA_CODEX would do). A correctly written sigaction
> interceptor should probably abort the program upon encountering an
> unknown flag (thus giving a human a chance to update the interceptor),
> but chances are that they don't. Ignoring all but a few flags (and
> passing a fixed set of flags to the kernel) seems to be what
> libsigchain does, and in the case of SA_CODEX it would seem to result
> in desirable behavior (but I suspect that it isn't handling the other
> flags correctly), but I could also see an interceptor author just
> passing it unchanged to the kernel without checking it (perhaps
> because they didn't think about these issues, and because that didn't
> matter until now, with the exception of from-the-beginning-of-time
> flags like SA_SIGINFO). And with SA_CODEX that could lead to silent
> misreading of si_code in the interceptor's signal handler, if it
> hasn't been updated to use the new macros.

Agreed.  I've tried to implement things rather like this in the past,
and how to interpret the flags is a tricky issue.  Some of the flags are
impossible to emulate even when you know what they mean, in particular
SA_NODEFER and SA_RESTART.

Making new flags safe to ignore and harmless to set if you don't know
what they mean is the safest approach, but not always possible (I think
I managed this with my suggestion below, though).

Ideally, a flags field should be specified with rules that say exactly
what to do with flags you don't recognise.  Sadly this is usually not
thought about until it's too late.

> > > Making fundamental changes to the encoding of an existing argument is
> > > highly surprising, on the other hand: as your example shows, it is
> > > reasonable to index an array using a signal number.
> > >
> > > I agree that this doesn't get us closer to a practical solution though.
> > >
> > >
> > > But we do seem to need some mechanism in addition to (or instead of)
> > > sa_flags.
> > >
> > > Here's another thought:
> > >
> > > Since si_flags would be either always present or always absent, it
> > > could make sense to have a global property to report this, rather than
> > > an sa_flags or signal number bit to request it per-signal.
> > >
> > > Requiring software to parse uname() might be reasonable for that, if
> > > cumbersome (did you suggest this previously?).  If we're concerned that
> > > the awkwardness of this would encourage people not to bother (or
> > > encourage people to do it wrong) then we might opt for something simpler
> > > like an AT_FLAGS bit.
> > >
> > > Ultimately libc could provide a more portable interface for discovery,
> > > such as via sysconf().
> > >
> > > Thoughts?
> 
> Yes, this was all to avoid the userspace code needing to contain a
> version check (or equivalent). Maybe the version check would be better
> than the alternatives though (although it's still vulnerable to
> non-updated interceptors not copying our new fields). AT_FLAGS sounds
> good to me.

Ack (though not needed if we use the approach outlined below).

> 
> > While you're thinking about that, here's another idea:
> >
> > It occurs to me that there are spare bits in si_code.  si_code is an
> > enum, but unlike the signal number there are no specific bounds for
> > this value, so we may have an easier time recycling bits here.
> >
> > The high bits of si_code are usually sign-extension and so not always
> > 0, but we can XOR flags into them provided we don't forget the real sign.
> >
> > Software that isn't expecting twiddled bits would get confused, so we
> > need a new SA_ flag to enable this.  But this flag (SA_CODEX) below is
> > now just a request.  If the kernel doesn't understand it (or without
> > SA_SIGINFO) then no flags would be reported in si_code, which is
> > backwards-compatible.
> >
> > A handler would now do
> >
> > void handler(int n, siginfo_t *si, ...)
> > {
> >         int flags = SI_FLAGS(si->si_code);
> >         int code = SI_CODE(si->si_code);
> >
> > >         if (!(flags & SIF_CODEX)) {
> >                 /* flags not supported */
> >                 /* Careful assignment of flag meanings may make this
> >                    check unnecessary, but it's probably useful for
> >                    developers for testing their code. */
> >         }
> >
> >         /* Handle signal based on n, code and flags */
> > }
> >
> >
> > If the kernel doesn't report any flags (perhaps because it's too old)
> > then SI_FLAGS() will yield 0 and SI_CODE() will just return si_code
> > unchanged.  This means that even non-SA_CODEX handlers can use these
> > macros, which may ease migration.
> 
> Thanks, this seems more appealing to me than the bit in the signal
> number idea. It uses the sa_flags field as intended and doesn't abuse
> the fields of the siginfo data structure too much. I don't think we
> should put as much weight into interceptor concerns as you do, so you
> can consider my above argumentation to be from a devil's advocate
> perspective.

Ditto: we shouldn't break forwards compatibility wherever we can avoid
doing so, but with APIs that are not well designed for forwards
compatibility (the signal API included) we have to be realistic.

The interceptor case is just one example of something that could break,
but I'll admit I couldn't come up with a more convincing example so far.

> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > index cb3d6c2..4e77c71 100644
> > --- a/include/uapi/asm-generic/siginfo.h
> > +++ b/include/uapi/asm-generic/siginfo.h
> > @@ -176,6 +176,18 @@ typedef struct siginfo {
> >  #define SI_DETHREAD    -7              /* sent by execve() killing subsidiary threads */
> >  #define SI_ASYNCNL     -60             /* sent by glibc async name lookup completion */
> >
> > +#define __SI_FLAGS     0x7ffff000      /* optional code extension flags */
> > +#define SIF_CODEX      0x40000000      /* code extension flags supported */
> 
> I don't think we would need this flag because even old kernels
> "support" the extension flags under this scheme (they just don't
> report any features). We would only need to introduce flags for actual
> features.

Agreed, it's not strictly required.  It occurred to me that someone
writing code in userspace would want a way to verify that they'd set the
flag correctly and that the kernel supported it.  But there ought to be
other ways to do that once real si_code flags get added.

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-13 13:24                                                                           ` Dave Martin
@ 2020-07-13 20:50                                                                             ` Peter Collingbourne
  2020-07-14 17:36                                                                               ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-07-13 20:50 UTC (permalink / raw)
  To: Dave Martin
  Cc: Linux ARM, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Mon, Jul 13, 2020 at 6:24 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Wed, Jul 08, 2020 at 03:21:13PM -0700, Peter Collingbourne wrote:
> > On Wed, Jul 8, 2020 at 6:58 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >
> > > On Wed, Jul 08, 2020 at 12:00:22PM +0100, Dave Martin wrote:
> > > > On Tue, Jul 07, 2020 at 12:07:09PM -0700, Peter Collingbourne wrote:
> > > > > On Tue, Jul 7, 2020 at 7:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > >
> > > > > > On Mon, Jul 06, 2020 at 12:20:33PM -0700, Peter Collingbourne wrote:
> > > > > > > On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > >
> > > > > > > > On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > > > > > > > > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > > > > > > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > > > > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > > > > > > > > <ebiederm@xmission.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > > > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > > > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > > >> >
> > > > > > > > > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > > > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > > > > > > > > >> >                       const char *str)
> > > > > > > > > > > > > > >> >  {
> > > > > > > > > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > > > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > > > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > > > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > > > > > > > > >> > -  else
> > > > > > > > > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > > > > > > > > >> > +  } else {
> > > > > > > > > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > > > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > > > > > > > > >> > +          info.si_signo = signo;
> > > > > > > > > > > > > > >> > +          info.si_errno = 0;
> > > > > > > > > > > > > > >> > +          info.si_code = code;
> > > > > > > > > > > > > > >> > +          info.si_addr = addr;
> > > > > > > > > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > > > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > > > > > > > > >> > +          force_sig_info(&info);
> > > > > > > > > > > > > > >> > +  }
> > > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > > >> >
> > > > > > > > > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > > > > > > > > >> > -                      const char *str)
> > > > > > > > > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > > > > > > > > >> >  {
> > > > > > > > > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > > > > > > > > >> > +
> > > > > > > > > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > > > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > > > > > > > > >> > +
> > > > > > > > > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > > > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > > > > > > > > >> > +  info.si_errno = 0;
> > > > > > > > > > > > > > >> > +  info.si_code = code;
> > > > > > > > > > > > > > >> > +  info.si_addr = addr;
> > > > > > > > > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > > > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > > > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > > > > > > > > >> > +  force_sig_info(&info);
> > > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > > > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > > > > > > > > > >> that takes its parameters.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Can you elaborate?  How would you do this kind of thing?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > > > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > > > > > > > > > union members simultaneously.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > > > > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > > > > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > > > > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > > > > > > > > being said, this is probably moot with my proposed changes below
> > > > > > > > > > > > > though.)
> > > > > > > > > > > >
> > > > > > > > > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > > > > > > > > different union member from the one previously written.
> > > > > > > > > > > >
> > > > > > > > > > > > Writing a different member from the last one written can still splatter
> > > > > > > > > > > > on the other members IIUC.
> > > > > > > > > > > >
> > > > > > > > > > > > It would be better to keep things separate rather than risk
> > > > > > > > > > > > incorrectness just to save a few bytes.
> > > > > > > > > > > >
> > > > > > > > > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > > > > > > > > >
> > > > > > > > > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > > > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > > > > > > > > >
> > > > > > > > > > > > > So you want something like:
> > > > > > > > > > > > >
> > > > > > > > > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > > > > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > > > > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > > > >
> > > > > > > > > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > > > > > > > > >
> > > > > > > > > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > > > > > > > > signal generation site...
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Garbled sentence?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > > > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > > > > > > > > can be properly decoded and made sense of.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I am not seeing anything like that.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > > > > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > > > > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > > > > > > > > on 32-bit ARM.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > > > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > > > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > > > > > > > > >> >                            __u32 _pkey;
> > > > > > > > > > > > > > >> >                    } _addr_pkey;
> > > > > > > > > > > > > > >> > +#ifdef __aarch64__
> > > > > > > > > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > > > > > > > > >> > +                  struct {
> > > > > > > > > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > > > > > > > > >
> > > > > > > > > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > > > > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > > > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > > > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > > > > > > > > >> > +#endif
> > > > > > > > > > > > > > >> >            };
> > > > > > > > > > > > > > >> >    } _sigfault;
> > > > > > > > > > > > > > >> >
> > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > > > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > > > > > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > > > > > > > > > > >> is built for a single architecture.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > > > > > > > > considering a similar feature:
> > > > > > > > > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > > > > > > > > I would have opted to expand this to other architectures on an
> > > > > > > > > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > > > > > > > > architectures from the start.
> > > > > > > > > > > > >
> > > > > > > > > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > > > > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > > > > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > > > > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > > > > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > > > > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > > > > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > > > > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > > > > > > > > this point to accommodate the new fields.
> > > > > > > > > > > > >
> > > > > > > > > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > > > > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > > > > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > > > > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > > > > > > > > >
> > > > > > > > > > > > I think what we need here is basically a flags word.
> > > > > > > > > > > >
> > > > > > > > > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > > > > > > > > flags word, we can extend as needed.
> > > > > > > > > > > >
> > > > > > > > > > > > How the existence of the first flags words is detected is another
> > > > > > > > > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > > > > > > > > I guess si_code may be sufficient.
> > > > > > > > > > >
> > > > > > > > > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > > > > > > > > data structure. The zero-initialization of the padding at the end of
> > > > > > > > > > > the struct is done by the clear_user call here:
> > > > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > > > > > > > > >
> > > > > > > > > > > and the zero-initialization of the padding between fields and unused
> > > > > > > > > > > union members is done by the clear_siginfo function which the kernel
> > > > > > > > > > > calls when initializing the data structure:
> > > > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > > > > > > > > >
> > > > > > > > > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > > > > > > > > support for flagged fields.
> > > > > > > > > >
> > > > > > > > > > It's not enough that we do this today.  We would have had to do it back
> > > > > > > > > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > > > > > > > > back to when the arch/arm64 was merged).
> > > > > > > > > >
> > > > > > > > > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > > > > > > > > always the case, so unused parts of siginfo could be full of old junk
> > > > > > > > > > from the user stack, if the kernel is sufficiently old.
> > > > > > > > > >
> > > > > > > > > > If we're trying to do something generic that makes sense on all arches,
> > > > > > > > > > this matters.  I may have misunderstood something about the code though.
> > > > > > > > >
> > > > > > > > > Hmm, I think you're right. The current behavior was introduced by
> > > > > > > > > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > > > > > > > > released in 4.18. So if an application wants to be compatible with
> > > > > > > > > pre-4.18 kernels then there would need to be some other way to
> > > > > > > > > indicate that the fields are valid. Probably the simplest way would be
> > > > > > > > > to have the application issue a uname(2) syscall and check the kernel
> > > > > > > > > version before reading these fields. I have a couple of other ideas
> > > > > > > > > that don't rely on version detection, if we'd prefer to avoid that.
> > > > > > > > > (They are somewhat ugly, but our hand is forced by backwards
> > > > > > > > > compatibility.)
> > > > > > > > >
> > > > > > > > > One idea is to re-purpose the si_errno field as a flags field for
> > > > > > > > > certain signal numbers. I checked a few kernel releases going back to
> > > > > > > > > 2.6.18 and it looks like the field is set to 0 except in the following
> > > > > > > > > circumstances:
> > > > > > > > > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > > > > > > > > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > > > > > > > > - user-defined signal via kill_pid_usb_asyncio
> > > > > > > > > - SIGSWI in 3.18 and before (code since removed)
> > > > > > > > >
> > > > > > > > > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > > > > > > > > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > > > > > > > > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > > > > > > > > we would need to stop the kernel from setting si_errno to EFAULT for
> > > > > > > > > this signal before the 5.8 release.
> > > > > > > > >
> > > > > > > > > Another idea was to have userspace set a flag in sa_flags when
> > > > > > > > > registering a signal handler meaning "this signal handler requires
> > > > > > > > > unknown siginfo fields to be zeroed", and have existing kernels reject
> > > > > > > > > the syscall due to an unknown flag being set, but unfortunately this
> > > > > > > > > won't work because existing kernels do not reject sigaction syscalls
> > > > > > > > > with unknown flags set in sa_flags. A perhaps more radical idea in
> > > > > > > > > this vein would be to claim some of the upper bits of the signal
> > > > > > > > > number as flags that will cause the syscall to be rejected if set and
> > > > > > > > > unknown to the kernel. Existing kernels (going back to at least
> > > > > > > > > 2.6.18) contain this code in do_sigaction:
> > > > > > > > >
> > > > > > > > >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> > > > > > > > >                 return -EINVAL;
> > > > > > > > >
> > > > > > > > > > and valid_signal is defined as:
> > > > > > > > >
> > > > > > > > > static inline int valid_signal(unsigned long sig)
> > > > > > > > > {
> > > > > > > > >         return sig <= _NSIG ? 1 : 0;
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > All architectures define _NSIG as a value <= 128, so they will reject
> > > > > > > > > a signal number with any of bits 8-31 set. This means that we can use
> > > > > > > > > any of those bits for mandatory flags. Most likely we could use bit 30
> > > > > > > > > (expanding down as necessary), as it keeps the signal number positive
> > > > > > > > > and permits future expansion of the signal number range.
> > > > > > > >
> > > > > > > > > Does the signal core code actually guarantee to zero the unused fields?
> > > > > > > > > Unless the fields are poked in by hand this is fraught with subtleties,
> > > > > > > > especially when unions are involved.  (I'm sure the code tries to do it,
> > > > > > > > but I've not eyeballed it in detail...)
> > > > > > >
> > > > > > > It memsets the siginfo structure before setting the fields and sending
> > > > > > > the signal (grep for clear_siginfo which is just a memset; you should
> > > > > > > find a call before all callers of force_sig_info). Memset is the right
> > > > > > > approach here since unlike setting fields by hand it clears padding
> > > > > > > which could lead to information leaks from the kernel. IIUC this is
> > > > > > > the reason why Eric wants all of the signals to be raised via wrappers
> > > > > > > in kernel/signal.c instead of via force_sig_info directly (to make
> > > > > > > this aspect easier to audit).
> > > > > >
> > > > > > My impression was that the reason for this model is partly to ensure
> > > > > > that siginfo fields are populated more consistently.  When it was all
> > > > > > > down to the individual callers, inconsistencies crept in.
> > > > > >
> > > > > > With regard to memset(), this is not a complete defence against data
> > > > > > leakage.  Assigning to a struct member can set any or all padding in
> > > > > > the struct to random garbage (consider write-combining of neighboring
> > > > > > > member writes into a single larger access in asm for example).  The
> > > > >
> > > > > I don't believe that LLVM will store to padding like this. I don't
> > > > > know about GCC, though, but I wouldn't be surprised if this is
> > > > > something that the kernel would want to turn off in "kernel C" (like
> > > > > it turns off strict aliasing) specifically because of the information
> > > > > leak issue.
> > > >
> > > > Again, the issue is not future kernel builds -- we can always find a way
> > > > to fix the behaviour for those -- but past kernel builds.
> >
> > I thought that the whole point of the "bit in the signal number" (or
> > SI_CODEX or whatever) was that we didn't need to worry about the
> > behavior of past kernel builds?
>
> It depends on what we use the new flag(s) for.
>
> If the flag means just that unused padding is safely zeroed, that could
> work -- but we'd want high confidence that it really is zeroed even in
> wacky configurations.
>
> > > > > > only way to avoid this is to ensure that the struct is 100%
> > > > > > padding-free, and that each member of a union is the same size.  A
> > > > > > > quick glance at <uapi/asm-generic/siginfo.h> confirms that this is not
> > > > > > the case.
> > > > > >
> > > > > > This might need to be looked at separately.
> > > > > >
> > > > > > But it does mean, strictly speaking, that we can't reliably add new
> > > > > > fields anywhere that there was previously padding: assigning to
> > > > > > neighboring members can still fill those with garbage after the
> > > > > > memset().
> > > > >
> > > > > ...but this is largely moot because I'm not proposing to add new
> > > > > fields in the padding any more (because the fields needed to become
> > > > > larger in order to accommodate future hypothetical architectures which
> > > > > might want to use the fields, and thus they wouldn't fit in the
> > > > > padding). The siginfo.h diff would be something like:
> > > > >
> > > > > diff --git a/include/uapi/asm-generic/siginfo.h
> > > > > b/include/uapi/asm-generic/siginfo.h
> > > > > index cb3d6c267181..4a2fe257415d 100644
> > > > > --- a/include/uapi/asm-generic/siginfo.h
> > > > > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > @@ -91,7 +91,10 @@ union __sifields {
> > > > >                                 char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > >                                 __u32 _pkey;
> > > > >                         } _addr_pkey;
> > > > > +                       void *_pad[6];
> > > > >                 };
> > > > > +               uintptr_t _ignored_bits;
> > > > > +               uintptr_t _ignored_bits_mask;
> > > >
> > > > This _is_ in padding: the tail-padding of the (previously smaller)
> > > > _sigfault.  Again, the compiler was allowed to populate this area with
> > > > junk before these fields were added.
> > > >
> > > > I agree that it seems fairly unlikely that the compiler would have been
> > > > overwriting this in normal circumstances, but that's not a guarantee.
> > > > My worry is that if this goes wrong, it will go wrong silently and
> > > > unpredictably.
> > > >
> > > > >         } _sigfault;
> > > > >
> > > > >         /* SIGPOLL */
> > > > >
> > > > > or with a "uintptr_t _flags" added in before _ignored_bits if we go with that.
> > > > >
> > > > > > > > Using unused bits in the signal number to turn on new functionality
> > > > > > > > feels risky.  As currently specified, this is just a number.  Since
> > > > > > > > today a successful sigaction(n ...) guarantees that n is a valid signal
> > > > > > > > number, reasonable code like the following would trigger a buffer
> > > > > > > > overrun if we start trying to encode anything else in there:
> > > > > > > >
> > > > > > > > struct sigaction actions[NSIG];
> > > > > > > >
> > > > > > > > int do_something( ... )
> > > > > > > > {
> > > > > > > >         ...
> > > > > > > >
> > > > > > > >         if (!sigaction(n, sa, ...)) {
> > > > > > > >                 actions[n] = *sa;
> > > > > > > >                 return 0;
> > > > > > > >         }
> > > > > > > >
> > > > > > > >         ...
> > > > > > > > }
> > > > > > >
> > > > > > > I imagine the bit in the signal number being set by the direct caller
> > > > > > > to sigaction, and we could specifically recommend that calling
> > > > > > > pattern. In that case, your "n" wouldn't have the bit set in it. It
> > > > > >
> > > > > > I can imagine this too, but that doesn't mean that software does it.
> > > > > >
> > > > > > If the above kind of thing exists in a framework or library somewhere,
> > > > > > we could get problems.  Similarly, a pre-existing LD_PRELOAD framework
> > > > > > that provides a wrapper for sigaction may now go wrong even if your
> > > > > > pattern is followed -- i.e., the caller thinks it's calling sigaction
> > > > > > directly but in fact it isn't.
> > > > >
> > > > > I'm aware of one library like that. It's called libsigchain, and it
> > > > > has an early bounds check:
> > > > > https://cs.android.com/android/platform/superproject/+/master:art/sigchainlib/sigchain.cc;l=371
> > > > >
> > > > > Until the library is changed to recognize the flag, calling code would
> > > > > see the return value of -1 as if the kernel failed the syscall, and
> > > > > would fall back to the code for old kernels.
> > > >
> > > > But only after some bad dereferences.  If these were writes, this means
> > > > that memory _may_ be silently corrupted (I don't say it's likely in a
> > > > given case, and we cannot pick a flag bit that makes this impossible).
> >
> > You're talking about libsigchain, right? I don't see any bad
> > references, the function returns after noticing the bounds check
> > failure.
>
> Yes, I confused myself by reading Handler() out of context.  The kernel
> will invoke this with signo set to a real signal number (without any flags).
>
> The sigaction wrapper does the bounds check before doing anything else,
> just as you say -- so that looks fine.
>
> (Side question: is all this thread-safe?  Is there some implicit locking
> somewhere?)

I think maybe it isn't? There seem to be possible races on the
handler_ field. One possibility is that the function could race with
itself on another thread, which could be fixed via locking, but it
would also need to handle races between itself and the signal handler,
most likely by blocking the signal while setting it.

> > > > So, _even though the user program is correct_, our change may trigger
> >
> > Let's say that you were talking about some other library and not
> > libsigchain. Such an interceptor wouldn't be correct though, it failed
> > to account for our change to the syscall semantics. If the accesses
> > were before the syscall (or the bounds check), then the interceptor
> > would not have been correct in the first place because POSIX requires
> > returning -1 with errno=EINVAL (and not crashing) if the signal number
> > is invalid.
> >
> > > > the corruption of arbitrary user memory.  This is what I mean by an ABI
> > > > break.  The fact that the corruption is not done by the syscall itself
> > > > is no excuse.
> >
> > At some point, though, accommodating interceptors becomes pretty much
> > tantamount to saying "we can never change anything". Even just adding
> > a field to __sifields (which is pretty much required for what we need
> > to do) could break things in the presence of some interceptors because
> > the interceptor could be copying the fields manually to a new data
> > structure before calling the user's signal handler (e.g. because it
> > wants to defer the signal until later) and miss our new field. I think
> > most of the other ideas we're discussing fail to meet this bar as well
> > and I'll go into more details later on.
>
> I agree we cannot always avoid breaking such things.  But we should do
> our best to avoid it.

I think that given the hand that we've been dealt, no matter what we
do, we can't really avoid risking breaking something. The relevant
questions are "what are we going to risk breaking", "how much risk is
there", "will it be easily noticed/fixable", and "once we're on the
other side of the potential breakage, will we find ourselves in a
position where changing things involves less breakage risk".

> > > > We also fail to notice failures in sigaddset() etc., though in this code
> > > > it looks like that should not matter.
> >
> > Maybe you're looking at the handler ("SignalChain::Handler")? The bit
> > wouldn't be set in the signo argument to the handler. I'm talking
> > about line 371 of the code I linked, in the sigaction interceptor
> > "__sigaction" (it looks like sometimes the link doesn't take you to
> > the correct line for some reason).
>
> Ack, I confused myself.
>
> > > > > In general I think that any library like this with independent
> > > > > tracking of the kernel's purported signal handler state would need to
> > > > > be very sensitive to which syscalls are capable of setting signal
> > > > > handlers, what their semantics are, and so on. This applies to any
> > > > > change that we might make to the signal handler interface. So for
> > > > > example, if we introduced a new syscall as you propose below, and the
> > > > > library hasn't been updated to recognize the new syscall, it will
> > > > > silently miss changes in signal handler state caused by the new
> > > > > syscall.
> > > > >
> > > > > At the end of this argument lies "we can never change anything about
> > > > > how signal handlers work because it could break some interposing
> > > > > library somewhere" -- replace "signal handlers" with any kernel
> > > > > feature whose behavior may be modified by an interposing library if
> > > > > you like -- and I don't think we want to go that far. As far as I
> > > > > know, this isn't really the kernel's business anyway -- the kernel's
> > > > > stable ABI contract starts and ends with the syscall interface and not
> > > > > some library on top.
> > > > >
> > > > > That being said, we should perhaps try to define our interface so that
> > > > > something reasonable will probably happen if there is such a library
> > > > > and it hasn't been updated. With the new syscall, the library will
> > > > > sometimes silently fail to work in some non-local fashion. With the
> > > > > flag bit in the signal number, the library will either cause the
> > > > > caller to fall back to the old kernel code path (if there is a bounds
> > > > > check) or likely crash loudly (if there is no bounds check). To me,
> > > > > the "flag bit in the signal number" behavior seems more reasonable,
> > > > > since either something correct or something easy to debug will
> > > > > probably happen at runtime.
> > > > >
> > > > > > > could only appear in newly-written code that doesn't follow our
> > > > > > > recommendations, and there are already plenty of much more likely ways
> > > > > > > to cause buffer overflows in C code that doesn't follow
> > > > > > > recommendations anyway. (And even if such a buffer overflow occurred,
> > > > > > > it would very likely be caught early in development by the MMU due to
> > > > > > > the magnitude of the number 1<<30.)
> > > > > >
> > > > > > > Choosing the bit value is hard.  If shifting it overflows, this can
> > > > > > trigger random undefined behaviour in the compiler in addition to (or
> > > > > > perhaps instead of) an out-of-bounds access or segfault.
> > > > >
> > > > > It wouldn't overflow on a 64-bit architecture assuming normal array
> > > > > indexing (the index would be promoted to pointer width before being
> > > > > scaled to the array element size), and to begin with the users of this
> > > > > would be 64-bit.
> > > >
> > > > Unless we don't offer this feature for 32-bit at all (possible, if ugly)
> > > > we can't stop people using it.
> >
> > My point is that the problem in the interceptor library would probably
> > be noticed on 64-bit (since that's what most people use these days),
> > which would probably result in it being fixed by the time it reaches
> > 32-bit users.
>
> Agreed.  But we shouldn't take such bets unless we really have to.
>
> > > > > > If shifting it doesn't overflow, we might still fall into a valid
> > > > > > mapping, though I'd agree a segfault is more likely.
> > > > > >
> > > > > > >
> > > > > > > > > I think it would be cleaner to add a single flag field that can be
> > > > > > > > > used for detecting other extensions, and request it via a new sa_flags
> > > > > > > > > bit.  This removes the need for semantically useless zeroing of unused
> > > > > > > > fields (though for hygiene and backwards compatibility reasons we would
> > > > > > > > probably want to carry on zeroing them anyway).
> > > > > > > >
> > > > > > > > I can see no simpler way to add supplementary siginfo fields for
> > > > > > > > existing si_codes.  For si_codes that didn't exist before the zeroing
> > > > > > > > came in we could still detect optional si_code-specific fields via
> > > > > > > > > zeroing, but it seems messy to have two ways of detecting extensions.
> > > > > > >
> > > > > > > That would certainly be cleaner if it worked, but that would only be
> > > > > > > the case if old kernels rejected unknown bits in sa_flags, and
> > > > > > > unfortunately they don't. With the bit in the signal number, the "old
> > > > > >
> > > > > > Hmm, that is a problem I wasn't aware of.
> > > > > >
> > > > > > > kernels reject" behavior admits relatively straightforward usage code:
> > > > > > >
> > > > > > > void set_segv_handler(void) {
> > > > > > >   struct sigaction sa;
> > > > > > >   sa.sa_sigaction = handle_segv;
> > > > > > >   sa.sa_flags = SA_SIGINFO;
> > > > > > >   if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
> > > > > > > succeeds in new kernels, fails in old kernels
> > > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
> > > > > > >       perror("sigaction");
> > > > > > >   }
> > > > > > > }
> > > > > > >
> > > > > > > void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > > > >   sa->si_future_field = 0;
> > > > > > >   handle_segv(signum, sa, ctx);
> > > > > > > }
> > > > > > >
> > > > > > > void handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > > > >   // At this point, si_future_field will have the value 0 in old
> > > > > > > kernels and the kernel-supplied value in new kernels.
> > > > > > > }
> > > > > > >
> > > > > > > Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
> > > > > > > number to sa_flags. In that case, the first sigaction would succeed in
> > > > > > > old kernels so handle_segv wouldn't know whether it can safely read
> > > > > > > from si_future_field. With the sa_flags approach, you would need
> > > > > > > kernel version number checking via uname before setting the flag in
> > > > > > > sa_flags, and at that point why even have the flag in sa_flags at all
> > > > > > > since you could just have the signal handler conditionally read from
> > > > > > > si_future_field based on the uname?
> > > > > >
> > > > > > Software setting SA_SIFLAGS (or whatever) is new by definition, since
> > > > > > it would be using a new #define.  So it might be reasonable to put the
> > > > > > burden on that software to verify that the flag was really accepted by
> > > > > > the kernel, by reading it back.
> > > > >
> > > > > That doesn't seem like a good idea even if it worked, because it could
> > > > > lead to race conditions. If the si_flags-reading signal handler were
> > > > > invoked in response to a signal between when you set it and when you
> > > > > ended up replacing it with the fallback signal handler for old
> > > > > kernels, the handler may end up reading garbage data from si_flags.
> > > >
> > > > Not really.  My example may have this problem, but the signal handler
> > > > can be written to support both scenarios, based on testing a flag that
> > > > the main program sets after verifying that the flag could be set.  Or
> > > > the signal could be blocked around establishment (often a good idea for
> > > > other reasons).
> > > >
> > > > But I agree it's a bit gross, and anyway doesn't work due to the fact
> > > > that the kernel doesn't filter out unrecognised flags anyway.
> > > >
> > > > > > Unfortunately, even relatively recent kernels blindly store sa_flags
> > > > > > in the kernel without validating it, and so it looks like duff flags
> > > > > > can be read back out via a sigaction() call.  Dang.
> > > > > >
> > > > > >
> > > > > > Perhaps a new frontend syscall could be added.  A new libc that knows
> > > > > > about this "sigaction2" could use it and mask off problem bits from
> > > > > > sa_flags in its sigaction() wrapper before calling sigaction2.  An old
> > > > > > libc would call the old sigaction syscall, where we would ignore these
> > > > > > new sa_flags bits as before.
> > > > >
> > > > > I'm not currently in favor of the new syscall but if we do this I
> > > > > would keep sigaction and sigaction2 separate. That is, libc sigaction
> > > > > should always use the sigaction syscall, and libc sigaction2 should
> > > > > always use the sigaction2 syscall. We should avoid libc's sigaction
> > > > > having different behavior based on the libc version and kernel
> > > > > version, as that would make it harder to reason about its behavior.
> > > > > Calling code would need to check for presence of sigaction2 in both
> > > > > libc and the kernel, e.g.
> > > > >
> > > > > __attribute__((weak)) decltype(sigaction2) sigaction2;
> > > > >
> > > > > void set_segv_handler(void) {
> > > > >   struct sigaction sa;
> > > > >   sa.sa_sigaction = handle_segv;
> > > > >   sa.sa_flags = SA_SIGINFO | SA_SIFLAGS;
> > > > >   if (!sigaction2 || sigaction2(SIGSEGV, &sa, 0) < 0) {
> > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > >     sa.sa_flags = SA_SIGINFO;
> > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > > >       perror("sigaction");
> > > > >   }
> > > > > }
> > > >
> > > > I guess.  But I share your distaste for adding a new syscall.
> > > >
> > > > >
> > > > > > This may not be a popular approach though, and software wouldn't be able
> > > > > > to use our new features until libc is updated to match.
> > > > > >
> > > > > > If we go down this route, it may provide additional opportunities to fix
> > > > > > annoying defects in the old interface.
> > > > > >
> > > > > >
> > > > > > > Note that the same applies to a flag indicating the availability of a
> > > > > > > si_flags field in sigaction (just
> > > > > > > s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
> > > > > > > s/si_future_field/si_flags/ in the usage code above). In terms of
> > > > > > > SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.
> > > > > > >
> > > > > > > Another thought that occurred to me is that we may consider
> > > > > > > generalizing this a step further and introducing a single flag bit in
> > > > > > > the signal number that means "reject unknown flags in sa_flags". This
> > > > > > > would mean that we wouldn't need to add any more flag bits to the
> > > > > > > signal number in the future, thus limiting this signal number hack to
> > > > > > > a single bit; all future mandatory behavior changes could just be put
> > > > > > > behind a flag in sa_flags and userspace code would easily be able to
> > > > > > > detect missing support for a flag and fall back if necessary. In our
> > > > > > > case, this would imply usage code like this:
> > > > > > >
> > > > > > > void set_segv_handler(void) {
> > > > > > >   struct sigaction sa;
> > > > > > >   sa.sa_sigaction = handle_segv;
> > > > > > >   sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
> > > > > > >   // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
> > > > > > >   // Fails in kernels with SF_CHECK_SA_FLAGS support but no
> > > > > > > SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
> > > > > > >   // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
> > > > > > > the bounds check on the signal number).
> > > > > > >   if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
> > > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > > >     sa.sa_flags = SA_SIGINFO;
> > > > > > >     // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
> > > > > > > we're using sa_flags from the beginning of time.
> > > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > > > > >       perror("sigaction");
> > > > > > >   }
> > > > > > > }
> > > > > >
> > > > > > As with the other options this could work, but looks like it could
> > > > > > break the ABI due to violating the original semantics for the signal
> > > > > > number argument.  Perhaps I'm being too paranoid.
> > > > >
> > > > > There's no ABI being broken here, as long as we consider syscalls to
> > > > > be the stable ABI layer. Old kernels are simply rejecting arguments
> > > > > that they don't know about yet. By that argument, any introduction of
> > > > > a new syscall is an ABI break because it changes the semantics of a
> > > > > previously-unallocated syscall number.
> > > >
> > > > As argued above, I think this is an invalid argument.
> > > >
> > > > Although any addition will change behaviour (so is a break in some
> > > > sense), the key is not to make "surprising" changes.
> >
> > If we care about interceptors then I don't think "surprising" comes
> > into it. It's more a question of "does the anticipated behavior of the
> > interceptor match our desired behavior", where "desired" means "most
> > likely to avoid silent breakage". We would need to get into the head
> > of a potential interceptor author and think about how they would have
> > handled the signal number argument, as well as other arguments like
> > sa_flags if we want to go that route, and see whether that behavior
> > would lead to the desired result.
>
> That's exactly what I mean by "surprising".

Not quite, see below.

> However, not every
> interceptor author will be making the same assumptions, and not every
> bit of software affected will be an interceptor.

I can see a couple of ways in which non-interceptor software could be affected:

- It's doing something like "call sigaction on every possible signal
number in the 31-bit range and end up failing if the syscall
succeeded" (e.g. with an OOB write). Perhaps software could be doing
something like this in a loop to collect all currently registered
signal handlers. That being said, this program:

#include <limits.h>
#include <signal.h>

int main() {
  struct sigaction act;
  for (int i = 1; i != INT_MAX; ++i) {
    sigaction(i, 0, &act);
  }
}

takes around 5 seconds to run on my relatively-fast machine, so I
would expect any such code to be noticed as a performance issue and
either be changed to be bounded on _NSIG or break on EINVAL.

This is probably the largest potential flaw that I can currently see
in the "bit in the signal number" idea, since it could conceivably
result in userspace code being broken without having first required it
to have been changed to make use of the new feature. I'm not convinced
that it would be an ABI break though, because the code seems unlikely
to exist in this form in the wild because of the performance issue,
and you could anyway make the argument that the code is incorrect
because, in order to contain a loop like this, it would need to be
able to handle large, previously-unknown signal numbers somehow. If we
accept that the code is incorrect, a similar line of argument applies
as for interceptors (i.e. likely to result in an OOB access which will
fail loudly and be easily debugged and fixed).

- If we do something that involves introducing a new flag in sa_flags,
the flag may be exposed to unaware software via the oldact argument to
sigaction, and I suppose that it's conceivable that exposing a
previously-unknown flag like this could somehow break something. But
this seems like an unreasonable restriction because it would mean that
we can never add a flag to sa_flags no matter what.

>  So some judgement
> needs to be applied.

Of course. We need to agree *how* to apply the judgement though.

> > In this case, I think we exactly want the interceptor author to have
> > thought "oh, it's just a number, I'll (possibly do a bounds check and
> > then) use the number as an index into an array". This will lead to one
> > of two outcomes: crashing (yes, yes, it won't always crash, but if the
> > alternative is that it never crashes and we get silently incorrect
> > behavior all of the time, I'll take sometimes crashing) or fail the
> > bounds check and pretend to be an old kernel (the latter is
> > anticipated by POSIX which requires returning -1/EINVAL for an invalid
> > signal number). Each of these behaviors are desirable, as they are
> > observable failures, which are more likely to result in fixes than
> > silent ones.
>
> Agreed, except wanting the author to have thought something doesn't
> ensure that they actually did think that.

True, but if our goal is only to accommodate reasonably written
interceptors, we don't actually need to ensure anything here.

> > > > Having something random happen when setting a previously reserved flag
> > > > bit, or when issuing a syscall when an unknown syscall number, or not
> > > > surprising at all.
> >
> > Introducing a new syscall is right out in this model. The interceptor
> > author wouldn't have anticipated our introducing a new syscall, so the
> > new syscall wouldn't be intercepted and calls to the new syscall would
> > silently bypass the interceptor. For example, adding sigaction2 could
> > result in signal handlers being set without the interceptor's
> > knowledge.
>
> Agreed.  My sentence was a bit mangled: I mean to say "Having something
> random happen when [...] issuing a syscall *with* an unknown syscall
> number *is* not surprising at all."
>
> I agree that adding a new syscall is problematic if we want to avoid
> breaking existing interceptors in particular.  Other types of code are
> much less likely to be affected by the addition of new syscalls.

Right, and this to me is a case in point for why I would say that
"surprising" isn't the right frame of analysis here. My analysis seems
to generally be that "anticipated interceptor behavior matches desired
behavior" is positively correlated with "surprising" (i.e. the
interceptor viewpoint is the dual of the user viewpoint), so if we
care about interceptors we may end up making a "surprising" change
even though it doesn't intuitively seem like the right thing to do.

> > Regarding a sa_flags bit, let's get inside the head of the interceptor
> > author again. How would they handle a flag bit that they don't
> > recognize when replacing the signal handler? It wouldn't be correct to
> > just pass it through to the kernel, or drop the flag on the floor, as
> > it might be semantically meaningful (and thus could change the calling
> > convention as SA_SIGINFO does, or change the meaning of fields in
> > siginfo, as SA_CODEX would do). A correctly written sigaction
> > interceptor should probably abort the program upon encountering an
> > unknown flag (thus giving a human a chance to update the interceptor),
> > but chances are that they don't. Ignoring all but a few flags (and
> > passing a fixed set of flags to the kernel) seems to be what
> > libsigchain does, and in the case of SA_CODEX it would seem to result
> > in desirable behavior (but I suspect that it isn't handling the other
> > flags correctly), but I could also see an interceptor author just
> > passing it unchanged to the kernel without checking it (perhaps
> > because they didn't think about these issues, and because that didn't
> > matter until now, with the exception of from-the-beginning-of-time
> > flags like SA_SIGINFO). And with SA_CODEX that could lead to silent
> > misreading of si_code in the interceptor's signal handler, if it
> > hasn't been updated to use the new macros.
>
> Agreed.  I've tried to implement things rather like this in the past,
> and how to interpret the flags is a tricky issue.  Some of the flags are
> impossible to emulate even when you know what they mean, in particular
> SA_NODEFER and SA_RESTART.
>
> Making new flags safe to ignore and harmless to set if you don't know
> what they mean is the safest approach, but not always possible (I think
> I managed this with my suggestion below, though).

Again, this suggestion could lead to silent failures in an interceptor, if:
- the interceptor passes the sa_flags through to the kernel unchanged
(or otherwise doesn't touch SA_CODEX)
- the interceptor replaces the user's sa_sigaction
- the interceptor's replacement sa_sigaction tests the provided si_code.

Maybe you're not concerned about that, though? At least to me it seems
in the same ballpark of likelihood as the ways in which things could
go wrong with the signal number bit.

> Ideally, a flags field should be specified with rules that say exactly
> what to do with flags you don't recognise.  Sadly this is usually not
> thought about until it's too late.

It perhaps isn't too late to introduce such rules for sigaction if we
adopt the signal number bit and we make it mean "reject unknown
flags".

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-13 20:50                                                                             ` Peter Collingbourne
@ 2020-07-14 17:36                                                                               ` Dave Martin
  2020-08-18  3:16                                                                                 ` Peter Collingbourne
  0 siblings, 1 reply; 64+ messages in thread
From: Dave Martin @ 2020-07-14 17:36 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Catalin Marinas, Kevin Brodsky, Oleg Nesterov, Evgenii Stepanov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Linux ARM, Richard Henderson

On Mon, Jul 13, 2020 at 01:50:30PM -0700, Peter Collingbourne wrote:
> On Mon, Jul 13, 2020 at 6:24 AM Dave Martin <Dave.Martin@arm.com> wrote:
> >
> > On Wed, Jul 08, 2020 at 03:21:13PM -0700, Peter Collingbourne wrote:
> > > On Wed, Jul 8, 2020 at 6:58 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > >
> > > > On Wed, Jul 08, 2020 at 12:00:22PM +0100, Dave Martin wrote:
> > > > > On Tue, Jul 07, 2020 at 12:07:09PM -0700, Peter Collingbourne wrote:
> > > > > > On Tue, Jul 7, 2020 at 7:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jul 06, 2020 at 12:20:33PM -0700, Peter Collingbourne wrote:
> > > > > > > > On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > >
> > > > > > > > > On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > > > > > > > > > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > > > > > > > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > > > > > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > > > > > > > > > <ebiederm@xmission.com> wrote:
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > > > > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > > > > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > > > >> >
> > > > > > > > > > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > > > > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > > > > > > > > > >> >                       const char *str)
> > > > > > > > > > > > > > > >> >  {
> > > > > > > > > > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > > > > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > > > > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > > > > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > > > > > > > > > >> > -  else
> > > > > > > > > > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > > > > > > > > > >> > +  } else {
> > > > > > > > > > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > > > > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > > > > > > > > > >> > +          info.si_signo = signo;
> > > > > > > > > > > > > > > >> > +          info.si_errno = 0;
> > > > > > > > > > > > > > > >> > +          info.si_code = code;
> > > > > > > > > > > > > > > >> > +          info.si_addr = addr;
> > > > > > > > > > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > > > > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > > > > > > > > > >> > +          force_sig_info(&info);
> > > > > > > > > > > > > > > >> > +  }
> > > > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > > > >> >
> > > > > > > > > > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > > > > > > > > > >> > -                      const char *str)
> > > > > > > > > > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > > > > > > > > > >> >  {
> > > > > > > > > > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > > > > > > > > > >> > +
> > > > > > > > > > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > > > > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > > > > > > > > > >> > +
> > > > > > > > > > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > > > > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > > > > > > > > > >> > +  info.si_errno = 0;
> > > > > > > > > > > > > > > >> > +  info.si_code = code;
> > > > > > > > > > > > > > > >> > +  info.si_addr = addr;
> > > > > > > > > > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > > > > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > > > > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > > > > > > > > > >> > +  force_sig_info(&info);
> > > > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > > > > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > > > > > > > > > > >> that takes its parameters.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > Can you elaborate?  How would you do this kind of thing?
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > > > > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > > > > > > > > > union members simultaneously.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > > > > > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > > > > > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > > > > > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > > > > > > > > > being said, this is probably moot with my proposed changes below
> > > > > > > > > > > > > > though.)
> > > > > > > > > > > > >
> > > > > > > > > > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > > > > > > > > > different union member from the one previously written.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Writing a different member from the last one written can still splatter
> > > > > > > > > > > > > on the other members IIUC.
> > > > > > > > > > > > >
> > > > > > > > > > > > > It would be better to keep things separate rather than risk
> > > > > > > > > > > > > incorrectness just to save a few bytes.
> > > > > > > > > > > > >
> > > > > > > > > > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > > > > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So you want something like:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > > > > > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > > > > > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > > > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > > > > > > > > > signal generation site...
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Garbled sentence?
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > > > > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > > > > > > > > > can be properly decoded and made sense of.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > I am not seeing anything like that.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > > > > > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > > > > > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > > > > > > > > > on 32-bit ARM.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > > > > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > > > > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > > > > > > > > > >> >                            __u32 _pkey;
> > > > > > > > > > > > > > > >> >                    } _addr_pkey;
> > > > > > > > > > > > > > > >> > +#ifdef __aarch64__
> > > > > > > > > > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > > > > > > > > > >> > +                  struct {
> > > > > > > > > > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > > > > > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > > > > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > > > > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > > > > > > > > > >> > +#endif
> > > > > > > > > > > > > > > >> >            };
> > > > > > > > > > > > > > > >> >    } _sigfault;
> > > > > > > > > > > > > > > >> >
> > > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > > > > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > > > > > > > > > > >> code.  This kind of code does not get enough attention/maintenance if it
> > > > > > > > > > > > > > > >> is built for a single architecture.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > > > > > > > > > considering a similar feature:
> > > > > > > > > > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > > > > > > > > > I would have opted to expand this to other architectures on an
> > > > > > > > > > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > > > > > > > > > architectures from the start.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > > > > > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > > > > > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > > > > > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > > > > > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > > > > > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > > > > > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > > > > > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > > > > > > > > > this point to accommodate the new fields.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > > > > > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > > > > > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > > > > > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > > > > > > > > > >
> > > > > > > > > > > > > I think what we need here is basically a flags word.
> > > > > > > > > > > > >
> > > > > > > > > > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > > > > > > > > > flags word, we can extend as needed.
> > > > > > > > > > > > >
> > > > > > > > > > > > > How the existence of the first flags words is detected is another
> > > > > > > > > > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > > > > > > > > > I guess si_code may be sufficient.
> > > > > > > > > > > >
> > > > > > > > > > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > > > > > > > > > data structure. The zero-initialization of the padding at the end of
> > > > > > > > > > > > the struct is done by the clear_user call here:
> > > > > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > > > > > > > > > >
> > > > > > > > > > > > and the zero-initialization of the padding between fields and unused
> > > > > > > > > > > > union members is done by the clear_siginfo function which the kernel
> > > > > > > > > > > > calls when initializing the data structure:
> > > > > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > > > > > > > > > >
> > > > > > > > > > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > > > > > > > > > support for flagged fields.
> > > > > > > > > > >
> > > > > > > > > > > It's not enough that we do this today.  We would have had to do it back
> > > > > > > > > > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > > > > > > > > > back to when the arch/arm64 was merged).
> > > > > > > > > > >
> > > > > > > > > > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > > > > > > > > > always the case, so unused parts of siginfo could be full of old junk
> > > > > > > > > > > from the user stack, if the kernel is sufficiently old.
> > > > > > > > > > >
> > > > > > > > > > > If we're trying to do something generic that makes sense on all arches,
> > > > > > > > > > > this matters.  I may have misunderstood something about the code though.
> > > > > > > > > >
> > > > > > > > > > Hmm, I think you're right. The current behavior was introduced by
> > > > > > > > > > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > > > > > > > > > released in 4.18. So if an application wants to be compatible with
> > > > > > > > > > pre-4.18 kernels then there would need to be some other way to
> > > > > > > > > > indicate that the fields are valid. Probably the simplest way would be
> > > > > > > > > > to have the application issue a uname(2) syscall and check the kernel
> > > > > > > > > > version before reading these fields. I have a couple of other ideas
> > > > > > > > > > that don't rely on version detection, if we'd prefer to avoid that.
> > > > > > > > > > (They are somewhat ugly, but our hand is forced by backwards
> > > > > > > > > > compatibility.)
> > > > > > > > > >
> > > > > > > > > > One idea is to re-purpose the si_errno field as a flags field for
> > > > > > > > > > certain signal numbers. I checked a few kernel releases going back to
> > > > > > > > > > 2.6.18 and it looks like the field is set to 0 except in the following
> > > > > > > > > > circumstances:
> > > > > > > > > > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > > > > > > > > > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > > > > > > > > > - user-defined signal via kill_pid_usb_asyncio
> > > > > > > > > > - SIGSWI in 3.18 and before (code since removed)
> > > > > > > > > >
> > > > > > > > > > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > > > > > > > > > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > > > > > > > > > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > > > > > > > > > we would need to stop the kernel from setting si_errno to EFAULT for
> > > > > > > > > > this signal before the 5.8 release.
> > > > > > > > > >
> > > > > > > > > > Another idea was to have userspace set a flag in sa_flags when
> > > > > > > > > > registering a signal handler meaning "this signal handler requires
> > > > > > > > > > unknown siginfo fields to be zeroed", and have existing kernels reject
> > > > > > > > > > the syscall due to an unknown flag being set, but unfortunately this
> > > > > > > > > > won't work because existing kernels do not reject sigaction syscalls
> > > > > > > > > > with unknown flags set in sa_flags. A perhaps more radical idea in
> > > > > > > > > > this vein would be to claim some of the upper bits of the signal
> > > > > > > > > > number as flags that will cause the syscall to be rejected if set and
> > > > > > > > > > unknown to the kernel. Existing kernels (going back to at least
> > > > > > > > > > 2.6.18) contain this code in do_sigaction:
> > > > > > > > > >
> > > > > > > > > >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> > > > > > > > > >                 return -EINVAL;
> > > > > > > > > >
> > > > > > > > > > > and valid_signal is defined as:
> > > > > > > > > >
> > > > > > > > > > static inline int valid_signal(unsigned long sig)
> > > > > > > > > > {
> > > > > > > > > >         return sig <= _NSIG ? 1 : 0;
> > > > > > > > > > }
> > > > > > > > > >
> > > > > > > > > > All architectures define _NSIG as a value <= 128, so they will reject
> > > > > > > > > > a signal number with any of bits 8-31 set. This means that we can use
> > > > > > > > > > any of those bits for mandatory flags. Most likely we could use bit 30
> > > > > > > > > > (expanding down as necessary), as it keeps the signal number positive
> > > > > > > > > > and permits future expansion of the signal number range.
> > > > > > > > >
> > > > > > > > > > Does the signal core code actually guarantee to zero the unused fields?
> > > > > > > > > > Unless the fields are poked in by hand this is fraught with subtleties,
> > > > > > > > > especially when unions are involved.  (I'm sure the code tries to do it,
> > > > > > > > > but I've not eyeballed it in detail...)
> > > > > > > >
> > > > > > > > It memsets the siginfo structure before setting the fields and sending
> > > > > > > > the signal (grep for clear_siginfo which is just a memset; you should
> > > > > > > > find a call before all callers of force_sig_info). Memset is the right
> > > > > > > > approach here since unlike setting fields by hand it clears padding
> > > > > > > > which could lead to information leaks from the kernel. IIUC this is
> > > > > > > > the reason why Eric wants all of the signals to be raised via wrappers
> > > > > > > > in kernel/signal.c instead of via force_sig_info directly (to make
> > > > > > > > this aspect easier to audit).
> > > > > > >
> > > > > > > My impression was that the reason for this model is partly to ensure
> > > > > > > that siginfo fields are populated more consistently.  When it was all
> > > > > > > > down to the individual callers, inconsistencies crept in.
> > > > > > >
> > > > > > > With regard to memset(), this is not a complete defence against data
> > > > > > > leakage.  Assigning to a struct member can set any or all padding in
> > > > > > > the struct to random garbage (consider write-combining of neighboring
> > > > > > > > member writes into a single larger access in asm for example).  The
> > > > > >
> > > > > > I don't believe that LLVM will store to padding like this. I don't
> > > > > > know about GCC, though, but I wouldn't be surprised if this is
> > > > > > something that the kernel would want to turn off in "kernel C" (like
> > > > > > it turns off strict aliasing) specifically because of the information
> > > > > > leak issue.
> > > > >
> > > > > Again, the issue is not future kernel builds -- we can always find a way
> > > > > to fix the behaviour for those -- but past kernel builds.
> > >
> > > I thought that the whole point of the "bit in the signal number" (or
> > > SI_CODEX or whatever) was that we didn't need to worry about the
> > > behavior of past kernel builds?
> >
> > It depends on what we use the new flag(s) for.
> >
> > If the flag means just that unused padding is safely zeroed, that could
> > work -- but we'd want high confidence that it really is zeroed even in
> > wacky configurations.
> >
> > > > > > > only way to avoid this is to ensure that the struct is 100%
> > > > > > > padding-free, and that each member of a union is the same size.  A
> > > > > > > > quick glance at <uapi/asm-generic/siginfo.h> confirms that this is not
> > > > > > > the case.
> > > > > > >
> > > > > > > This might need to be looked at separately.
> > > > > > >
> > > > > > > But it does mean, strictly speaking, that we can't reliably add new
> > > > > > > fields anywhere that there was previously padding: assigning to
> > > > > > > neighboring members can still fill those with garbage after the
> > > > > > > memset().
> > > > > >
> > > > > > ...but this is largely moot because I'm not proposing to add new
> > > > > > fields in the padding any more (because the fields needed to become
> > > > > > larger in order to accommodate future hypothetical architectures which
> > > > > > might want to use the fields, and thus they wouldn't fit in the
> > > > > > padding). The siginfo.h diff would be something like:
> > > > > >
> > > > > > diff --git a/include/uapi/asm-generic/siginfo.h
> > > > > > b/include/uapi/asm-generic/siginfo.h
> > > > > > index cb3d6c267181..4a2fe257415d 100644
> > > > > > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > @@ -91,7 +91,10 @@ union __sifields {
> > > > > >                                 char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > >                                 __u32 _pkey;
> > > > > >                         } _addr_pkey;
> > > > > > +                       void *_pad[6];
> > > > > >                 };
> > > > > > +               uintptr_t _ignored_bits;
> > > > > > +               uintptr_t _ignored_bits_mask;
> > > > >
> > > > > This _is_ in padding: the tail-padding of the (previously smaller)
> > > > > _sigfault.  Again, the compiler was allowed to populate this area with
> > > > > junk before these fields were added.
> > > > >
> > > > > I agree that it seems fairly unlikely that the compiler would have been
> > > > > overwriting this in normal circumstances, but that's not a guarantee.
> > > > > My worry is that if this goes wrong, it will go wrong silently and
> > > > > unpredictably.
> > > > >
> > > > > >         } _sigfault;
> > > > > >
> > > > > >         /* SIGPOLL */
> > > > > >
> > > > > > or with a "uintptr_t _flags" added in before _ignored_bits if we go with that.
> > > > > >
> > > > > > > > > Using unused bits in the signal number to turn on new functionality
> > > > > > > > > feels risky.  As currently specified, this is just a number.  Since
> > > > > > > > > today a successful sigaction(n ...) guarantees that n is a valid signal
> > > > > > > > > number, reasonable code like the following would trigger a buffer
> > > > > > > > > overrun if we start trying to encode anything else in there:
> > > > > > > > >
> > > > > > > > > struct sigaction actions[NSIG];
> > > > > > > > >
> > > > > > > > > int do_something( ... )
> > > > > > > > > {
> > > > > > > > >         ...
> > > > > > > > >
> > > > > > > > >         if (!sigaction(n, sa, ...)) {
> > > > > > > > >                 actions[n] = *sa;
> > > > > > > > >                 return 0;
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > >         ...
> > > > > > > > > }
> > > > > > > >
> > > > > > > > I imagine the bit in the signal number being set by the direct caller
> > > > > > > > to sigaction, and we could specifically recommend that calling
> > > > > > > > pattern. In that case, your "n" wouldn't have the bit set in it. It
> > > > > > >
> > > > > > > I can imagine this too, but that doesn't mean that software does it.
> > > > > > >
> > > > > > > If the above kind of thing exists in a framework or library somewhere,
> > > > > > > we could get problems.  Similarly, a pre-existing LD_PRELOAD framework
> > > > > > > that provides a wrapper for sigaction may now go wrong even if your
> > > > > > > pattern is followed -- i.e., the caller thinks it's calling sigaction
> > > > > > > directly but in fact it isn't.
> > > > > >
> > > > > > I'm aware of one library like that. It's called libsigchain, and it
> > > > > > has an early bounds check:
> > > > > > https://cs.android.com/android/platform/superproject/+/master:art/sigchainlib/sigchain.cc;l=371
> > > > > >
> > > > > > Until the library is changed to recognize the flag, calling code would
> > > > > > see the return value of -1 as if the kernel failed the syscall, and
> > > > > > would fall back to the code for old kernels.
> > > > >
> > > > > But only after some bad dereferences.  If these were writes, this means
> > > > > > that memory _may_ be silently corrupted (I don't say it's likely in a
> > > > > given case, and we cannot pick a flag bit that makes this impossible).
> > >
> > > You're talking about libsigchain, right? I don't see any bad
> > > references, the function returns after noticing the bounds check
> > > failure.
> >
> > Yes, I confused myself by reading Handler() out of context.  The kernel
> > will invoke this with signo set to a real signal number (without any flags).
> >
> > The sigaction wrapper does the bounds check before doing anything else,
> > just as you say -- so that looks fine.
> >
> > (Side question: is all this thread-safe?  Is there some implicit locking
> > somewhere?)
> 
> I think maybe it isn't? There seem to be possible races on the
> handler_ field. One possibility is that the function could race with
> itself on another thread, which could be fixed via locking, but it
> would also need to handle races between itself and the signal handler,
> most likely by blocking the signal while setting it.

Hmmm, tricky... anyway, that's not my problem ;)

> > > > > So, _even though the user program is correct_, our change may trigger
> > >
> > > Let's say that you were talking about some other library and not
> > > libsigchain. Such an interceptor wouldn't be correct though, it failed
> > > to account for our change to the syscall semantics. If the accesses
> > > were before the syscall (or the bounds check), then the interceptor
> > > would not have been correct in the first place because POSIX requires
> > > returning -1 with errno=EINVAL (and not crashing) if the signal number
> > > is invalid.
> > >
> > > > > > the corruption of arbitrary user memory.  This is what I mean by an ABI
> > > > > break.  The fact that the corruption is not done by the syscall itself
> > > > > is no excuse.
> > >
> > > At some point, though, accommodating interceptors becomes pretty much
> > > tantamount to saying "we can never change anything". Even just adding
> > > a field to __sifields (which is pretty much required for what we need
> > > to do) could break things in the presence of some interceptors because
> > > the interceptor could be copying the fields manually to a new data
> > > structure before calling the user's signal handler (e.g. because it
> > > wants to defer the signal until later) and miss our new field. I think
> > > most of the other ideas we're discussing fail to meet this bar as well
> > > and I'll go into more details later on.
> >
> > I agree we cannot always avoid breaking such things.  But we should do
> > our best to avoid it.
> 
> I think that given the hand that we've been dealt, no matter what we
> do, we can't really avoid risking breaking something. The relevant
> questions are "what are we going to risk breaking", "how much risk is
> there", "will it be easily noticed/fixable", and "once we're on the
> other side of the potential breakage, will we find ourselves in a
> position where changing things involves less breakage risk".
> 
> > > > > We also fail to notice failures in sigaddset() etc., though in this code
> > > > > it looks like that should not matter.
> > >
> > > Maybe you're looking at the handler ("SignalChain::Handler")? The bit
> > > wouldn't be set in the signo argument to the handler. I'm talking
> > > about line 371 of the code I linked, in the sigaction interceptor
> > > "__sigaction" (it looks like sometimes the link doesn't take you to
> > > the correct line for some reason).
> >
> > Ack, I confused myself.
> >
> > > > > > In general I think that any library like this with independent
> > > > > > tracking of the kernel's purported signal handler state would need to
> > > > > > be very sensitive to which syscalls are capable of setting signal
> > > > > > handlers, what their semantics are, and so on. This applies to any
> > > > > > change that we might make to the signal handler interface. So for
> > > > > > example, if we introduced a new syscall as you propose below, and the
> > > > > > library hasn't been updated to recognize the new syscall, it will
> > > > > > silently miss changes in signal handler state caused by the new
> > > > > > syscall.
> > > > > >
> > > > > > At the end of this argument lies "we can never change anything about
> > > > > > how signal handlers work because it could break some interposing
> > > > > > library somewhere" -- replace "signal handlers" with any kernel
> > > > > > feature whose behavior may be modified by an interposing library if
> > > > > > you like -- and I don't think we want to go that far. As far as I
> > > > > > know, this isn't really the kernel's business anyway -- the kernel's
> > > > > > stable ABI contract starts and ends with the syscall interface and not
> > > > > > some library on top.
> > > > > >
> > > > > > That being said, we should perhaps try to define our interface so that
> > > > > > something reasonable will probably happen if there is such a library
> > > > > > and it hasn't been updated. With the new syscall, the library will
> > > > > > sometimes silently fail to work in some non-local fashion. With the
> > > > > > flag bit in the signal number, the library will either cause the
> > > > > > caller to fall back to the old kernel code path (if there is a bounds
> > > > > > check) or likely crash loudly (if there is no bounds check). To me,
> > > > > > the "flag bit in the signal number" behavior seems more reasonable,
> > > > > > since either something correct or something easy to debug will
> > > > > > probably happen at runtime.
> > > > > >
> > > > > > > > could only appear in newly-written code that doesn't follow our
> > > > > > > > recommendations, and there are already plenty of much more likely ways
> > > > > > > > to cause buffer overflows in C code that doesn't follow
> > > > > > > > recommendations anyway. (And even if such a buffer overflow occurred,
> > > > > > > > it would very likely be caught early in development by the MMU due to
> > > > > > > > the magnitude of the number 1<<30.)
> > > > > > >
> > > > > > > > Choosing the bit value is hard.  If shifting it overflows, this can
> > > > > > > trigger random undefined behaviour in the compiler in addition to (or
> > > > > > > perhaps instead of) an out-of-bounds access or segfault.
> > > > > >
> > > > > > It wouldn't overflow on a 64-bit architecture assuming normal array
> > > > > > indexing (the index would be promoted to pointer width before being
> > > > > > scaled to the array element size), and to begin with the users of this
> > > > > > would be 64-bit.
> > > > >
> > > > > Unless we don't offer this feature for 32-bit at all (possible, if ugly)
> > > > > we can't stop people using it.
> > >
> > > My point is that the problem in the interceptor library would probably
> > > be noticed on 64-bit (since that's what most people use these days),
> > > which would probably result in it being fixed by the time it reaches
> > > 32-bit users.
> >
> > Agreed.  But we shouldn't take such bets unless we really have to.
> >
> > > > > > > If shifting it doesn't overflow, we might still fall into a valid
> > > > > > > mapping, though I'd agree a segfault is more likely.
> > > > > > >
> > > > > > > >
> > > > > > > > > > I think it would be cleaner to add a single flag field that can be
> > > > > > > > > used for detecting other extensions, and request it via a new sa_flags
> > > > > > > > > > bit.  This removes the need for semantically useless zeroing of unused
> > > > > > > > > fields (though for hygiene and backwards compatibility reasons we would
> > > > > > > > > probably want to carry on zeroing them anyway).
> > > > > > > > >
> > > > > > > > > I can see no simpler way to add supplementary siginfo fields for
> > > > > > > > > existing si_codes.  For si_codes that didn't exist before the zeroing
> > > > > > > > > came in we could still detect optional si_code-specific fields via
> > > > > > > > > > zeroing, but it seems messy to have two ways of detecting extensions.
> > > > > > > >
> > > > > > > > That would certainly be cleaner if it worked, but that would only be
> > > > > > > > the case if old kernels rejected unknown bits in sa_flags, and
> > > > > > > > unfortunately they don't. With the bit in the signal number, the "old
> > > > > > >
> > > > > > > Hmm, that is a problem I wasn't aware of.
> > > > > > >
> > > > > > > > kernels reject" behavior admits relatively straightforward usage code:
> > > > > > > >
> > > > > > > > void set_segv_handler(void) {
> > > > > > > >   struct sigaction sa;
> > > > > > > >   sa.sa_sigaction = handle_segv;
> > > > > > > >   sa.sa_flags = SA_SIGINFO;
> > > > > > > >   if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
> > > > > > > > succeeds in new kernels, fails in old kernels
> > > > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
> > > > > > > >       perror("sigaction");
> > > > > > > >   }
> > > > > > > > }
> > > > > > > >
> > > > > > > > void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > > > > >   sa->si_future_field = 0;
> > > > > > > >   handle_segv(signum, sa, ctx);
> > > > > > > > }
> > > > > > > >
> > > > > > > > void handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > > > > >   // At this point, si_future_field will have the value 0 in old
> > > > > > > > kernels and the kernel-supplied value in new kernels.
> > > > > > > > }
> > > > > > > >
> > > > > > > > Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
> > > > > > > > number to sa_flags. In that case, the first sigaction would succeed in
> > > > > > > > old kernels so handle_segv wouldn't know whether it can safely read
> > > > > > > > from si_future_field. With the sa_flags approach, you would need
> > > > > > > > kernel version number checking via uname before setting the flag in
> > > > > > > > sa_flags, and at that point why even have the flag in sa_flags at all
> > > > > > > > since you could just have the signal handler conditionally read from
> > > > > > > > si_future_field based on the uname?
> > > > > > >
> > > > > > > Software setting SA_SIFLAGS (or whatever) is new by definition, since
> > > > > > > it would be using a new #define.  So it might be reasonable to put the
> > > > > > > burden on that software to verify that the flag was really accepted by
> > > > > > > the kernel, by reading it back.
> > > > > >
> > > > > > That doesn't seem like a good idea even if it worked, because it could
> > > > > > lead to race conditions. If the si_flags-reading signal handler were
> > > > > > invoked in response to a signal between when you set it and when you
> > > > > > ended up replacing it with the fallback signal handler for old
> > > > > > kernels, the handler may end up reading garbage data from si_flags.
> > > > >
> > > > > Not really.  My example may have this problem, but the signal handler
> > > > > can be written to support both scenarios, based on testing a flag that
> > > > > the main program sets after verifying that the flag could be set.  Or
> > > > > the signal could be blocked around establishment (often a good idea for
> > > > > other reasons).
> > > > >
> > > > > But I agree it's a bit gross, and anyway doesn't work due to the fact
> > > > > that the kernel doesn't filter out unrecognised flags anyway.
> > > > >
> > > > > > > Unfortunately, even relatively recent kernels blindly store sa_flags
> > > > > > > in the kernel without validating it, and so it looks like duff flags
> > > > > > > can be read back out via a sigaction() call.  Dang.
> > > > > > >
> > > > > > >
> > > > > > > Perhaps a new frontend syscall could be added.  A new libc that knows
> > > > > > > about this "sigaction2" could use it and mask off problem bits from
> > > > > > > sa_flags in its sigaction() wrapper before calling sigaction2.  An old
> > > > > > > libc would call the old sigaction syscall, where we would ignore these
> > > > > > > new sa_flags bits as before.
> > > > > >
> > > > > > I'm not currently in favor of the new syscall but if we do this I
> > > > > > would keep sigaction and sigaction2 separate. That is, libc sigaction
> > > > > > should always use the sigaction syscall, and libc sigaction2 should
> > > > > > always use the sigaction2 syscall. We should avoid libc's sigaction
> > > > > > having different behavior based on the libc version and kernel
> > > > > > version, as that would make it harder to reason about its behavior.
> > > > > > Calling code would need to check for presence of sigaction2 in both
> > > > > > libc and the kernel, e.g.
> > > > > >
> > > > > > __attribute__((weak)) decltype(sigaction2) sigaction2;
> > > > > >
> > > > > > void set_segv_handler(void) {
> > > > > >   struct sigaction sa;
> > > > > >   sa.sa_sigaction = handle_segv;
> > > > > >   sa.sa_flags = SA_SIGINFO | SA_SIFLAGS;
> > > > > >   if (!sigaction2 || sigaction2(SIGSEGV, &sa, 0) < 0) {
> > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > >     sa.sa_flags = SA_SIGINFO;
> > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > > > >       perror("sigaction");
> > > > > >   }
> > > > > > }
> > > > >
> > > > > I guess.  But I share your distaste for adding a new syscall.
> > > > >
> > > > > >
> > > > > > > This may not be a popular approach though, and software wouldn't be able
> > > > > > > to use our new features until libc is updated to match.
> > > > > > >
> > > > > > > If we go down this route, it may provide additional opportunities to fix
> > > > > > > annoying defects in the old interface.
> > > > > > >
> > > > > > >
> > > > > > > > Note that the same applies to a flag indicating the availability of a
> > > > > > > > si_flags field in sigaction (just
> > > > > > > > s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
> > > > > > > > s/si_future_field/si_flags/ in the usage code above). In terms of
> > > > > > > > SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.
> > > > > > > >
> > > > > > > > Another thought that occurred to me is that we may consider
> > > > > > > > generalizing this a step further and introducing a single flag bit in
> > > > > > > > the signal number that means "reject unknown flags in sa_flags". This
> > > > > > > > would mean that we wouldn't need to add any more flag bits to the
> > > > > > > > signal number in the future, thus limiting this signal number hack to
> > > > > > > > a single bit; all future mandatory behavior changes could just be put
> > > > > > > > behind a flag in sa_flags and userspace code would easily be able to
> > > > > > > > detect missing support for a flag and fall back if necessary. In our
> > > > > > > > case, this would imply usage code like this:
> > > > > > > >
> > > > > > > > void set_segv_handler(void) {
> > > > > > > >   struct sigaction sa;
> > > > > > > >   sa.sa_sigaction = handle_segv;
> > > > > > > >   sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
> > > > > > > >   // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
> > > > > > > >   // Fails in kernels with SF_CHECK_SA_FLAGS support but no
> > > > > > > > SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
> > > > > > > >   // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
> > > > > > > > the bounds check on the signal number).
> > > > > > > >   if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
> > > > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > > > >     sa.sa_flags = SA_SIGINFO;
> > > > > > > >     // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
> > > > > > > > we're using sa_flags from the beginning of time.
> > > > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > > > > > >       perror("sigaction");
> > > > > > > >   }
> > > > > > > > }
> > > > > > >
> > > > > > > As with the other options this could work, but looks like it could
> > > > > > > break the ABI due to violating the original semantics for the signal
> > > > > > > number argument.  Perhaps I'm being too paranoid.
> > > > > >
> > > > > > There's no ABI being broken here, as long as we consider syscalls to
> > > > > > be the stable ABI layer. Old kernels are simply rejecting arguments
> > > > > > that they don't know about yet. By that argument, any introduction of
> > > > > > a new syscall is an ABI break because it changes the semantics of a
> > > > > > previously-unallocated syscall number.
> > > > >
> > > > > As argued above, I think this is an invalid argument.
> > > > >
> > > > > Although any addition will change behaviour (so is a break in some
> > > > > sense), the key is not to make "surprising" changes.
> > >
> > > If we care about interceptors then I don't think "surprising" comes
> > > into it. It's more a question of "does the anticipated behavior of the
> > > interceptor match our desired behavior", where "desired" means "most
> > > likely to avoid silent breakage". We would need to get into the head
> > > of a potential interceptor author and think about how they would have
> > > handled the signal number argument, as well as other arguments like
> > > sa_flags if we want to go that route, and see whether that behavior
> > > would lead to the desired result.
> >
> > That's exactly what I mean by "surprising".
> 
> Not quite, see below.
> 
> > However, not every
> > interceptor author will be making the same assumptions, and not every
> > bit of software affected will be an interceptor.
> 
> I can see a couple of ways in which non-interceptor software could be affected:
> 
> - It's doing something like "call sigaction on every possible signal
> number in the 31-bit range and end up failing if the syscall
> succeeded" (e.g. with an OOB write). Perhaps software could be doing
> something like this in a loop to collect all currently registered
> signal handlers. That being said, this program:
> 
> #include <limits.h>
> #include <signal.h>
> 
> int main() {
>   struct sigaction act;
>   for (int i = 1; i != INT_MAX; ++i) {
>     sigaction(i, 0, &act);
>   }
> }
> 
> takes around 5 seconds to run on my relatively-fast machine, so I
> would expect any such code to be noticed as a performance issue and
> either be changed to be bounded on _NSIG or break on EINVAL.
> 
> This is probably the largest potential flaw that I can currently see
> in the "bit in the signal number" idea, since it could conceivably
> result in userspace code being broken without having first required it
> to have been changed to make use of the new feature. I'm not convinced
> that it would be an ABI break though, because the code seems unlikely
> to exist in this form in the wild because of the performance issue,
> and you could anyway make the argument that the code is incorrect
> because, in order to contain a loop like this, it would need to be
> able to handle large, previously-unknown signal numbers somehow. If we
> accept that the code is incorrect, a similar line of argument applies
> as for interceptors (i.e. likely to result in an OOB access which will
> fail loudly and be easily debugged and fixed).
> 
> - If we do something that involves introducing a new flag in sa_flags,
> the flag may be exposed to unaware software via the oldact argument to
> sigaction, and I suppose that it's conceivable that exposing a
> previously-unknown flag like this could somehow break something. But
> this seems like an unreasonable restriction because it would mean that
> we can never add a flag to sa_flags no matter what.
> 
> >  So some judgement
> > needs to be applied.
> 
> Of course. We need to agree *how* to apply the judgement though.
> 
> > > In this case, I think we exactly want the interceptor author to have
> > > thought "oh, it's just a number, I'll (possibly do a bounds check and
> > > then) use the number as an index into an array". This will lead to one
> > > of two outcomes: crashing (yes, yes, it won't always crash, but if the
> > > alternative is that it never crashes and we get silently incorrect
> > > behavior all of the time, I'll take sometimes crashing) or fail the
> > > bounds check and pretend to be an old kernel (the latter is
> > > anticipated by POSIX which requires returning -1/EINVAL for an invalid
> > > signal number). Each of these behaviors are desirable, as they are
> > > observable failures, which are more likely to result in fixes than
> > > silent ones.
> >
> > Agreed, except wanting the author to have thought something doesn't
> > ensure that they actually did think that.
> 
> True, but if our goal is only to accommodate reasonably written
> interceptors, we don't actually need to ensure anything here.
> 
> > > > > Having something random happen when setting a previously reserved flag
> > > > > bit, or when issuing a syscall when an unknown syscall number, or not
> > > > > surprising at all.
> > >
> > > Introducing a new syscall is right out in this model. The interceptor
> > > author wouldn't have anticipated our introducing a new syscall, so the
> > > new syscall wouldn't be intercepted and calls to the new syscall would
> > > silently bypass the interceptor. For example, adding sigaction2 could
> > > result in signal handlers being set without the interceptor's
> > > knowledge.
> >
> > Agreed.  My sentence was a bit mangled: I mean to say "Having something
> > random happen when [...] issuing a syscall *with* an unknown syscall
> > number *is* not surprising at all."
> >
> > I agree that adding a new syscall is problematic if we want to avoid
> > breaking existing interceptors in particular.  Other types of code are
> > much less likely to be affected by the addition of new syscalls.
> 
> Right, and this to me is a case in point for why I would say that
> "surprising" isn't the right frame of analysis here. My analysis seems
> to generally be that "anticipated interceptor behavior matches desired
> behavior" is positively correlated with "surprising" (i.e. the
> interceptor viewpoint is the dual of the user viewpoint), so if we
> care about interceptors we may end up making a "surprising" change
> even though it doesn't intuitively seem like the right thing to do.

You're right that interceptors are different from normal callers.  I'm
not sure I follow your argument, but an alternative way of looking at
it might be to say that an interceptor is both an implementation of an
interface and a caller of the same interface.  Since API specs are
rarely complete enough to cover the corner cases that arise from this,
full portability is hard to achieve on top of an evolving kernel.

However, I think this interceptor thing is a bit of a red herring.  I
just intended that as an illustration of the kind of code that might
fall foul.  This doesn't mean that it's 100% certain that no other
software can be affected.

The starting points for this discussion were: "is it reasonable for a
caller to pass an unvalidated signal number to sigaction(), and rely on
sigaction() to validate it?" and "is it reasonable to assume that a
signal number accepted by sigaction() fits the POSIX specification of
a valid signal number?"

I think yes; you aren't (or weren't) convinced.

The mere fact that it's hard to agree suggests to me that the
specification is too weak to extend safely in this area.  Unfortunately,
it's rather weak for sa_flags too, although a non-full flags argument
does at least suggest that future extensions might appear.

> > > Regarding a sa_flags bit, let's get inside the head of the interceptor
> > > author again. How would they handle a flag bit that they don't
> > > recognize when replacing the signal handler? It wouldn't be correct to
> > > just pass it through to the kernel, or drop the flag on the floor, as
> > > it might be semantically meaningful (and thus could change the calling
> > > convention as SA_SIGINFO does, or change the meaning of fields in
> > > siginfo, as SA_CODEX would do). A correctly written sigaction
> > > interceptor should probably abort the program upon encountering an
> > > unknown flag (thus giving a human a chance to update the interceptor),
> > > but chances are that they don't. Ignoring all but a few flags (and
> > > passing a fixed set of flags to the kernel) seems to be what
> > > libsigchain does, and in the case of SA_CODEX it would seem to result
> > > in desirable behavior (but I suspect that it isn't handling the other
> > > flags correctly), but I could also see an interceptor author just
> > > passing it unchanged to the kernel without checking it (perhaps
> > > because they didn't think about these issues, and because that didn't
> > > matter until now, with the exception of from-the-beginning-of-time
> > > flags like SA_SIGINFO). And with SA_CODEX that could lead to silent
> > > misreading of si_code in the interceptor's signal handler, if it
> > > hasn't been updated to use the new macros.
> >
> > Agreed.  I've tried to implement things rather like this in the past,
> > and how to interpret the flags is a tricky issue.  Some of the flags are
> > impossible to emulate even when you know what they mean, in particular
> > SA_NODEFER and SA_RESTART.
> >
> > Making new flags safe to ignore and harmless to set if you don't know
> > what they mean is the safest approach, but not always possible (I think
> > I managed this with my suggestion below, though).
> 
> Again, this suggestion could lead to silent failures in an interceptor, if:
> - the interceptor passes the sa_flags through to the kernel unchanged
> (or otherwise doesn't touch SA_CODEX)
> - the interceptor replaces the user's sa_sigaction
> - the interceptor's replacement sa_sigaction tests the provided si_code.
> 
> Maybe you're not concerned about that, though? At least to me it seems
> in the same ballpark of likelihood as the ways in which things could
> go wrong with the signal number bit.

I agree this is a concern, and perhaps a bit nastier in practice than
side-effects of setting random bits in the signal number.

Personally I do tend to be paranoid about flags arguments and try to
police them in any code that isn't a trivial pass-through, but this
doesn't mean that all code out there does it.  (Including the kernel's
sigaction()!)

> > Ideally, a flags field should be specified with rules that say exactly
> > what to do with flags you don't recognise.  Sadly this is usually not
> > thought about until it's too late.
> 
> It perhaps isn't too late to introduce such rules for sigaction if we
> adopt the signal number bit and we make it mean "reject unknown
> flags".

If Eric likes the idea then fair enough, but as I've tried to argue this
may still just be moving the problem around rather than solving it.


As a final random idea to add to the mix, we could add two or more
flags in sa_flags, and require the kernel to transform them in a
specific way, say:

#define SA_WANT_FLAGS 0x00c700000
#define SA_HAVE_FLAGS 0x009200000
#define SA_FLAGS_MASK 0x00ff00000

volatile sig_atomic_t have_flags = 0;

	sa.sa_flags |= SA_WANT_FLAGS;
	if (sigaction(n, &sa, NULL))
		if (!sigaction(n, NULL, &sa) &&
				(sa.sa_flags & SA_FLAGS_MASK) == SA_HAVE_FLAGS)
			have_flags = 1;

This is at least proof against "dumb readback".

Provided that the handler can cope with the have_flags == 0 case and
just reads the flag once per call, I don't think we would need to worry
about races.

Of course, an interceptor that doesn't understand this mechanism and
munges or manufactures its own siginfo might still fail to properly
initialise our new field before passing it on to a signal handler that
is expecting it.  But that's already broken: such an interceptor might
also not understand new si_codes that the client code absolutely relies
on.  And new si_codes _do_ get added (that's another extensibility fail
in the existing signal API).


So... overall, maybe a bit in the signal number isn't a lot worse, and
perhaps it _will_ lead to cleaner failures.

Really, I don't see a way to solve it properly without a new API.


In the meantime, can I suggest:

 (1) Come up with an extensible way of encoding supplementary
     information in siginfo.  If the consensus is that zeroing unused
     fields is sufficient and that the kernel and compiler will
     reliably do it, then great.  Otherwise, we might need explicit
     flags fields or something.
     
 (2) Hack up any simple mechanism (such as your signal number flag) for
     requesting/detecting the extra information.

Along with an illustration of an application of the mechanism (i.e.,
reporting address tag bits), this should at least provide a basis for
further review.

We can then try to swap in a different mechanism for (2) if people
still have concerns (or if not, keep it).

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-07-14 17:36                                                                               ` Dave Martin
@ 2020-08-18  3:16                                                                                 ` Peter Collingbourne
  2020-08-18 13:50                                                                                   ` Dave Martin
  0 siblings, 1 reply; 64+ messages in thread
From: Peter Collingbourne @ 2020-08-18  3:16 UTC (permalink / raw)
  To: Dave Martin
  Cc: Catalin Marinas, Kevin Brodsky, Oleg Nesterov, Evgenii Stepanov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Linux ARM, Richard Henderson

On Tue, Jul 14, 2020 at 10:36 AM Dave Martin <Dave.Martin@arm.com> wrote:
>
> On Mon, Jul 13, 2020 at 01:50:30PM -0700, Peter Collingbourne wrote:
> > On Mon, Jul 13, 2020 at 6:24 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > >
> > > On Wed, Jul 08, 2020 at 03:21:13PM -0700, Peter Collingbourne wrote:
> > > > On Wed, Jul 8, 2020 at 6:58 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > >
> > > > > On Wed, Jul 08, 2020 at 12:00:22PM +0100, Dave Martin wrote:
> > > > > > On Tue, Jul 07, 2020 at 12:07:09PM -0700, Peter Collingbourne wrote:
> > > > > > > On Tue, Jul 7, 2020 at 7:19 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Jul 06, 2020 at 12:20:33PM -0700, Peter Collingbourne wrote:
> > > > > > > > > On Mon, Jul 6, 2020 at 9:41 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Wed, Jun 24, 2020 at 12:51:43PM -0700, Peter Collingbourne wrote:
> > > > > > > > > > > On Wed, Jun 24, 2020 at 10:12 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Wed, Jun 24, 2020 at 09:51:49AM -0700, Peter Collingbourne wrote:
> > > > > > > > > > > > > On Wed, Jun 24, 2020 at 2:28 AM Dave Martin <Dave.Martin@arm.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > On Tue, Jun 23, 2020 at 05:40:08PM -0700, Peter Collingbourne wrote:
> > > > > > > > > > > > > > > On Tue, Jun 23, 2020 at 10:52 AM Eric W. Biederman
> > > > > > > > > > > > > > > <ebiederm@xmission.com> wrote:
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Dave Martin <Dave.Martin@arm.com> writes:
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > On Tue, Jun 23, 2020 at 07:54:59AM -0500, Eric W. Biederman wrote:
> > > > > > > > > > > > > > > > >> Peter Collingbourne <pcc@google.com> writes:
> > > > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > > > >> > diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > > > > >> > index 47f651df781c..a8380a2b6361 100644
> > > > > > > > > > > > > > > > >> > --- a/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > > > > >> > +++ b/arch/arm64/kernel/traps.c
> > > > > > > > > > > > > > > > >> > @@ -235,20 +235,41 @@ static void arm64_show_signal(int signo, const char *str)
> > > > > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > > > > >> >
> > > > > > > > > > > > > > > > >> >  void arm64_force_sig_fault(int signo, int code, void __user *addr,
> > > > > > > > > > > > > > > > >> > +                     unsigned long far, unsigned char far_tb_mask,
> > > > > > > > > > > > > > > > >> >                       const char *str)
> > > > > > > > > > > > > > > > >> >  {
> > > > > > > > > > > > > > > > >> >    arm64_show_signal(signo, str);
> > > > > > > > > > > > > > > > >> > -  if (signo == SIGKILL)
> > > > > > > > > > > > > > > > >> > +  if (signo == SIGKILL) {
> > > > > > > > > > > > > > > > >> >            force_sig(SIGKILL);
> > > > > > > > > > > > > > > > >> > -  else
> > > > > > > > > > > > > > > > >> > -          force_sig_fault(signo, code, addr);
> > > > > > > > > > > > > > > > >> > +  } else {
> > > > > > > > > > > > > > > > >> > +          struct kernel_siginfo info;
> > > > > > > > > > > > > > > > >> > +          clear_siginfo(&info);
> > > > > > > > > > > > > > > > >> > +          info.si_signo = signo;
> > > > > > > > > > > > > > > > >> > +          info.si_errno = 0;
> > > > > > > > > > > > > > > > >> > +          info.si_code = code;
> > > > > > > > > > > > > > > > >> > +          info.si_addr = addr;
> > > > > > > > > > > > > > > > >> > +          info.si_addr_top_byte = (far >> 56) & far_tb_mask;
> > > > > > > > > > > > > > > > >> > +          info.si_addr_top_byte_mask = far_tb_mask;
> > > > > > > > > > > > > > > > >> > +          force_sig_info(&info);
> > > > > > > > > > > > > > > > >> > +  }
> > > > > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > > > > >> >
> > > > > > > > > > > > > > > > >> >  void arm64_force_sig_mceerr(int code, void __user *addr, short lsb,
> > > > > > > > > > > > > > > > >> > -                      const char *str)
> > > > > > > > > > > > > > > > >> > +                      unsigned long far, const char *str)
> > > > > > > > > > > > > > > > >> >  {
> > > > > > > > > > > > > > > > >> > +  struct kernel_siginfo info;
> > > > > > > > > > > > > > > > >> > +
> > > > > > > > > > > > > > > > >> >    arm64_show_signal(SIGBUS, str);
> > > > > > > > > > > > > > > > >> > -  force_sig_mceerr(code, addr, lsb);
> > > > > > > > > > > > > > > > >> > +
> > > > > > > > > > > > > > > > >> > +  clear_siginfo(&info);
> > > > > > > > > > > > > > > > >> > +  info.si_signo = SIGBUS;
> > > > > > > > > > > > > > > > >> > +  info.si_errno = 0;
> > > > > > > > > > > > > > > > >> > +  info.si_code = code;
> > > > > > > > > > > > > > > > >> > +  info.si_addr = addr;
> > > > > > > > > > > > > > > > >> > +  info.si_addr_lsb = lsb;
> > > > > > > > > > > > > > > > >> > +  info.si_addr_top_byte = far >> 56;
> > > > > > > > > > > > > > > > >> > +  info.si_addr_top_byte_mask = 0xff;
> > > > > > > > > > > > > > > > >> > +  force_sig_info(&info);
> > > > > > > > > > > > > > > > >> >  }
> > > > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > > > >> I have a real problem with this construction.  force_sig_info is not an
> > > > > > > > > > > > > > > > >> interface that should be used for anything except to define a wrapper
> > > > > > > > > > > > > > > > > >> that takes its parameters.
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > > Can you elaborate?  How would you do this kind of thing?
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > There are no other uses of force_sig_info in architecture code.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > I just removed them _all_ because they were almost all broken.
> > > > > > > > > > > > > > > > In fact your mcerr case is broken because it uses two different
> > > > > > > > > > > > > > > > union members simultaneously.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Is that really broken? I thought that the Linux kernel deliberately
> > > > > > > > > > > > > > > didn't care about strict aliasing rules (the top-level Makefile passes
> > > > > > > > > > > > > > > -fno-strict-aliasing) so I thought that it was valid in "Linux kernel
> > > > > > > > > > > > > > > C" even though from a standards point of view it is invalid. (That
> > > > > > > > > > > > > > > being said, this is probably moot with my proposed changes below
> > > > > > > > > > > > > > > though.)
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I have a feeling that -fno-strict-aliasing only allows you to _read_ a
> > > > > > > > > > > > > > different union member from the one previously written.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Writing a different member from the last one written can still splatter
> > > > > > > > > > > > > > on the other members IIUC.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > It would be better to keep things separate rather than risk
> > > > > > > > > > > > > > incorrectness just to save a few bytes.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > IMHO -fno-strict-aliasing is no excuse for gratuitous type-punning.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > So I am looking for something like force_sig_mcerr or force_sig_fault
> > > > > > > > > > > > > > > > that includes your new information that then calls force_sig_info.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > I know of no other way to safely use the siginfo struct.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > So you want something like:
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > int force_sig_fault_with_ignored_bits(int signo, int code, void __user
> > > > > > > > > > > > > > > *addr, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > > > > > > int force_sig_mceerr_with_ignored_bits(int code, void __user *addr,
> > > > > > > > > > > > > > > short lsb, uintptr_t addr_ignored, uintptr_t addr_ignored_mask);
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > in kernel/signal.c and the code in arch/arm64 would call that?
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > AIUI we absolutely need a forced signal here, we need to supply
> > > > > > > > > > > > > > > > > metadata, and we don't have to open-code all that at every relevant
> > > > > > > > > > > > > > > > > signal generation site...
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >> It is not clear to me that if you have adapted siginfo_layout.
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > Garbled sentence?
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Looks like.  One of the pieces of code that needs to change
> > > > > > > > > > > > > > > > when siginfo gets updated is siginfo_layout so that the structure
> > > > > > > > > > > > > > > > can be properly decoded and made sense of.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > I am not seeing anything like that.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Okay, this has to do with copying between the compat and non-compat
> > > > > > > > > > > > > > > versions of the struct? Sure, I can update that, although the code
> > > > > > > > > > > > > > > would be basically non-functional on arm64 because TBI isn't supported
> > > > > > > > > > > > > > > on 32-bit ARM.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >> > diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > > > > >> > index cb3d6c267181..6dd82373eb2d 100644
> > > > > > > > > > > > > > > > >> > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > > > > >> > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > > > > > > > > > > >> > @@ -91,6 +91,14 @@ union __sifields {
> > > > > > > > > > > > > > > > >> >                            char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > > > > > > > > > > > >> >                            __u32 _pkey;
> > > > > > > > > > > > > > > > >> >                    } _addr_pkey;
> > > > > > > > > > > > > > > > >> > +#ifdef __aarch64__
> > > > > > > > > > > > > > > > >> > +                  /* used with all si_codes */
> > > > > > > > > > > > > > > > >> > +                  struct {
> > > > > > > > > > > > > > > > >> > +                          short _dummy_top_byte;
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > ^ What's this for?  I don't have Eric's insight here.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > We would need a short's worth of padding in order to prevent the
> > > > > > > > > > > > > > > fields from occupying the same address as si_addr_lsb.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >> > +                          unsigned char _top_byte;
> > > > > > > > > > > > > > > > >> > +                          unsigned char _top_byte_mask;
> > > > > > > > > > > > > > > > >> > +                  } _addr_top_byte;
> > > > > > > > > > > > > > > > >> > +#endif
> > > > > > > > > > > > > > > > >> >            };
> > > > > > > > > > > > > > > > >> >    } _sigfault;
> > > > > > > > > > > > > > > > >> >
> > > > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > > > >> Why the _dummy_top_byte?  Oh I see it should be spelled "short _addr_lsb;".
> > > > > > > > > > > > > > > > >>
> > > > > > > > > > > > > > > > >> Please remove the "#ifdef __aarch64__".  If at all possible we want to
> > > > > > > > > > > > > > > > >> design this so any other architecture who has this challenge can use the
> > > > > > > > > > > > > > > > >> code.  The kind of code does not get enough attention/maintenance if it
> > > > > > > > > > > > > > > > >> is built for a single architecture.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Seems reasonable. I was recently made aware that RISC-V was
> > > > > > > > > > > > > > > considering a similar feature:
> > > > > > > > > > > > > > > https://lists.riscv.org/g/tech-tee/topic/risc_v_tbi_proposal/72855478
> > > > > > > > > > > > > > > I would have opted to expand this to other architectures on an
> > > > > > > > > > > > > > > as-needed basis, but I'd also be fine with having it on all
> > > > > > > > > > > > > > > architectures from the start.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > If we make this arch-independent, we have an additional concern, which
> > > > > > > > > > > > > > > is "what if some future architecture wants more than one byte here?"
> > > > > > > > > > > > > > > For example, an architecture may have a "top-two-bytes-ignore"
> > > > > > > > > > > > > > > feature, which would imply two-byte (misnamed) "si_addr_top_byte" and
> > > > > > > > > > > > > > > "si_addr_top_byte_mask" fields. And the RISC-V proposal potentially
> > > > > > > > > > > > > > > implies many more ignored bits (see slide 13 of the presentation). The
> > > > > > > > > > > > > > > maximum size that these fields can possibly be is the size of a
> > > > > > > > > > > > > > > pointer, and with that there wouldn't be enough room in the padding at
> > > > > > > > > > > > > > > this point to accommodate the new fields.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > That basically implies your earlier suggestion of adding a union
> > > > > > > > > > > > > > > member here to accommodate future expansion of the union, and adding
> > > > > > > > > > > > > > > the new fields after the union. I'm happy to make that change, with
> > > > > > > > > > > > > > > the fields renamed "si_addr_ignored" and "si_addr_ignored_mask".
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I think what we need here is basically a flags word.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So long as we keep a flag spare to indicate the existence of a further
> > > > > > > > > > > > > > flags word, we can extend as needed.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > How the existence of the first flags words is detected is another
> > > > > > > > > > > > > > problem.  If it only applies for newly-defined si_code values, then
> > > > > > > > > > > > > > I guess si_code may be sufficient.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Existing kernels will zero-initialize unused regions of the siginfo
> > > > > > > > > > > > > data structure. The zero-initialization of the padding at the end of
> > > > > > > > > > > > > the struct is done by the clear_user call here:
> > > > > > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/kernel/signal.c#L3193
> > > > > > > > > > > > >
> > > > > > > > > > > > > and the zero-initialization of the padding between fields and unused
> > > > > > > > > > > > > union members is done by the clear_siginfo function which the kernel
> > > > > > > > > > > > > calls when initializing the data structure:
> > > > > > > > > > > > > https://github.com/torvalds/linux/blob/3e08a95294a4fb3702bb3d35ed08028433c37fe6/include/linux/signal.h#L20
> > > > > > > > > > > > >
> > > > > > > > > > > > > Therefore, a flag word value of 0 may be used to detect a lack of
> > > > > > > > > > > > > support for flagged fields.
> > > > > > > > > > > >
> > > > > > > > > > > > It's not enough that we do this today.  We would have had to do it back
> > > > > > > > > > > > to the dawn of time (though in the arm64 case I guess we just need to go
> > > > > > > > > > > > back to when the arch/arm64 was merged).
> > > > > > > > > > > >
> > > > > > > > > > > > v2.6.12:kernel/signal.c:copy_siginfo_to_user() suggests that this wasn't
> > > > > > > > > > > > always the case, so unused parts of siginfo could be full of old junk
> > > > > > > > > > > > from the user stack, if the kernel is sufficiently old.
> > > > > > > > > > > >
> > > > > > > > > > > > If we're trying to do something generic that makes sense on all arches,
> > > > > > > > > > > > this matters.  I may have misunderstood something about the code though.
> > > > > > > > > > >
> > > > > > > > > > > Hmm, I think you're right. The current behavior was introduced by
> > > > > > > > > > > commit c999b933faa5e281e3af2e110eccaf91698b0a81 which was first
> > > > > > > > > > > released in 4.18. So if an application wants to be compatible with
> > > > > > > > > > > pre-4.18 kernels then there would need to be some other way to
> > > > > > > > > > > indicate that the fields are valid. Probably the simplest way would be
> > > > > > > > > > > to have the application issue a uname(2) syscall and check the kernel
> > > > > > > > > > > version before reading these fields. I have a couple of other ideas
> > > > > > > > > > > that don't rely on version detection, if we'd prefer to avoid that.
> > > > > > > > > > > (They are somewhat ugly, but our hand is forced by backwards
> > > > > > > > > > > compatibility.)
> > > > > > > > > > >
> > > > > > > > > > > One idea is to re-purpose the si_errno field as a flags field for
> > > > > > > > > > > certain signal numbers. I checked a few kernel releases going back to
> > > > > > > > > > > 2.6.18 and it looks like the field is set to 0 except in the following
> > > > > > > > > > > circumstances:
> > > > > > > > > > > - sending a hardware breakpoint (SIGTRAP/TRAP_HWBKPT)
> > > > > > > > > > > - seccomp failures (SIGSYS/SYS_SECCOMP)
> > > > > > > > > > > - user-defined signal via kill_pid_usb_asyncio
> > > > > > > > > > > - SIGSWI in 3.18 and before (code since removed)
> > > > > > > > > > >
> > > > > > > > > > > It is also set to EFAULT for certain SIGSEGV/SEGV_MAPERR signals on
> > > > > > > > > > > powerpc since commit c96c4436aba4c12f1f48369f2f90bc43e12fe36c, which
> > > > > > > > > > > is currently unreleased. So if we wanted to go this route for SIGSEGV
> > > > > > > > > > > we would need to stop the kernel from setting si_errno to EFAULT for
> > > > > > > > > > > this signal before the 5.8 release.
> > > > > > > > > > >
> > > > > > > > > > > Another idea was to have userspace set a flag in sa_flags when
> > > > > > > > > > > registering a signal handler meaning "this signal handler requires
> > > > > > > > > > > unknown siginfo fields to be zeroed", and have existing kernels reject
> > > > > > > > > > > the syscall due to an unknown flag being set, but unfortunately this
> > > > > > > > > > > won't work because existing kernels do not reject sigaction syscalls
> > > > > > > > > > > with unknown flags set in sa_flags. A perhaps more radical idea in
> > > > > > > > > > > this vein would be to claim some of the upper bits of the signal
> > > > > > > > > > > number as flags that will cause the syscall to be rejected if set and
> > > > > > > > > > > unknown to the kernel. Existing kernels (going back to at least
> > > > > > > > > > > 2.6.18) contain this code in do_sigaction:
> > > > > > > > > > >
> > > > > > > > > > >         if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
> > > > > > > > > > >                 return -EINVAL;
> > > > > > > > > > >
> > > > > > > > > > > > and valid_signal is defined as:
> > > > > > > > > > >
> > > > > > > > > > > static inline int valid_signal(unsigned long sig)
> > > > > > > > > > > {
> > > > > > > > > > >         return sig <= _NSIG ? 1 : 0;
> > > > > > > > > > > }
> > > > > > > > > > >
> > > > > > > > > > > All architectures define _NSIG as a value <= 128, so they will reject
> > > > > > > > > > > a signal number with any of bits 8-31 set. This means that we can use
> > > > > > > > > > > any of those bits for mandatory flags. Most likely we could use bit 30
> > > > > > > > > > > (expanding down as necessary), as it keeps the signal number positive
> > > > > > > > > > > and permits future expansion of the signal number range.
> > > > > > > > > >
> > > > > > > > > > > Does the signal core code actually guarantee to zero the unused fields?
> > > > > > > > > > > Unless the fields are poked in by hand this is fraught with subtleties,
> > > > > > > > > > especially when unions are involved.  (I'm sure the code tries to do it,
> > > > > > > > > > but I've not eyeballed it in detail...)
> > > > > > > > >
> > > > > > > > > It memsets the siginfo structure before setting the fields and sending
> > > > > > > > > the signal (grep for clear_siginfo which is just a memset; you should
> > > > > > > > > find a call before all callers of force_sig_info). Memset is the right
> > > > > > > > > approach here since unlike setting fields by hand it clears padding
> > > > > > > > > which could lead to information leaks from the kernel. IIUC this is
> > > > > > > > > the reason why Eric wants all of the signals to be raised via wrappers
> > > > > > > > > in kernel/signal.c instead of via force_sig_info directly (to make
> > > > > > > > > this aspect easier to audit).
> > > > > > > >
> > > > > > > > My impression was that the reason for this model is partly to ensure
> > > > > > > > that siginfo fields are populated more consistently.  When it was all
> > > > > > > > > down to the individual callers, inconsistencies crept in.
> > > > > > > >
> > > > > > > > With regard to memset(), this is not a complete defence against data
> > > > > > > > leakage.  Assigning to a struct member can set any or all padding in
> > > > > > > > the struct to random garbage (consider write-combining of neighboring
> > > > > > > > > member writes into a single larger access in asm for example).  The
> > > > > > >
> > > > > > > I don't believe that LLVM will store to padding like this. I don't
> > > > > > > know about GCC, though, but I wouldn't be surprised if this is
> > > > > > > something that the kernel would want to turn off in "kernel C" (like
> > > > > > > it turns off strict aliasing) specifically because of the information
> > > > > > > leak issue.
> > > > > >
> > > > > > Again, the issue is not future kernel builds -- we can always find a way
> > > > > > to fix the behaviour for those -- but past kernel builds.
> > > >
> > > > I thought that the whole point of the "bit in the signal number" (or
> > > > SI_CODEX or whatever) was that we didn't need to worry about the
> > > > behavior of past kernel builds?
> > >
> > > It depends on what we use the new flag(s) for.
> > >
> > > If the flag means just that unused padding is safely zeroed, that could
> > > work -- but we'd want high confidence that it really is zeroed even in
> > > wacky configurations.
> > >
> > > > > > > > only way to avoid this is to ensure that the struct is 100%
> > > > > > > > padding-free, and that each member of a union is the same size.  A
> > > > > > > > > quick glance at <uapi/asm-generic/siginfo.h> confirms that this is not
> > > > > > > > the case.
> > > > > > > >
> > > > > > > > This might need to be looked at separately.
> > > > > > > >
> > > > > > > > But it does mean, strictly speaking, that we can't reliably add new
> > > > > > > > fields anywhere that there was previously padding: assigning to
> > > > > > > > neighboring members can still fill those with garbage after the
> > > > > > > > memset().
> > > > > > >
> > > > > > > ...but this is largely moot because I'm not proposing to add new
> > > > > > > fields in the padding any more (because the fields needed to become
> > > > > > > larger in order to accommodate future hypothetical architectures which
> > > > > > > might want to use the fields, and thus they wouldn't fit in the
> > > > > > > padding). The siginfo.h diff would be something like:
> > > > > > >
> > > > > > > diff --git a/include/uapi/asm-generic/siginfo.h
> > > > > > > b/include/uapi/asm-generic/siginfo.h
> > > > > > > index cb3d6c267181..4a2fe257415d 100644
> > > > > > > --- a/include/uapi/asm-generic/siginfo.h
> > > > > > > +++ b/include/uapi/asm-generic/siginfo.h
> > > > > > > @@ -91,7 +91,10 @@ union __sifields {
> > > > > > >                                 char _dummy_pkey[__ADDR_BND_PKEY_PAD];
> > > > > > >                                 __u32 _pkey;
> > > > > > >                         } _addr_pkey;
> > > > > > > +                       void *_pad[6];
> > > > > > >                 };
> > > > > > > +               uintptr_t _ignored_bits;
> > > > > > > +               uintptr_t _ignored_bits_mask;
> > > > > >
> > > > > > This _is_ in padding: the tail-padding of the (previously smaller)
> > > > > > _sigfault.  Again, the compiler was allowed to populate this area with
> > > > > > junk before these fields were added.
> > > > > >
> > > > > > I agree that it seems fairly unlikely that the compiler would have been
> > > > > > overwriting this in normal circumstances, but that's not a guarantee.
> > > > > > My worry is that if this goes wrong, it will go wrong silently and
> > > > > > unpredictably.
> > > > > >
> > > > > > >         } _sigfault;
> > > > > > >
> > > > > > >         /* SIGPOLL */
> > > > > > >
> > > > > > > or with a "uintptr_t _flags" added in before _ignored_bits if we go with that.
> > > > > > >
> > > > > > > > > > Using unused bits in the signal number to turn on new functionality
> > > > > > > > > > feels risky.  As currently specified, this is just a number.  Since
> > > > > > > > > > today a successful sigaction(n ...) guarantees that n is a valid signal
> > > > > > > > > > number, reasonable code like the following would trigger a buffer
> > > > > > > > > > overrun if we start trying to encode anything else in there:
> > > > > > > > > >
> > > > > > > > > > struct sigaction actions[NSIG];
> > > > > > > > > >
> > > > > > > > > > int do_something( ... )
> > > > > > > > > > {
> > > > > > > > > >         ...
> > > > > > > > > >
> > > > > > > > > >         if (!sigaction(n, sa, ...)) {
> > > > > > > > > >                 actions[n] = *sa;
> > > > > > > > > >                 return 0;
> > > > > > > > > >         }
> > > > > > > > > >
> > > > > > > > > >         ...
> > > > > > > > > > }
> > > > > > > > >
> > > > > > > > > I imagine the bit in the signal number being set by the direct caller
> > > > > > > > > to sigaction, and we could specifically recommend that calling
> > > > > > > > > pattern. In that case, your "n" wouldn't have the bit set in it. It
> > > > > > > >
> > > > > > > > I can imagine this too, but that doesn't mean that software does it.
> > > > > > > >
> > > > > > > > If the above kind of thing exists in a framework or library somewhere,
> > > > > > > > we could get problems.  Similarly, a pre-existing LD_PRELOAD framework
> > > > > > > > that provides a wrapper for sigaction may now go wrong even if your
> > > > > > > > pattern is followed -- i.e., the caller thinks it's calling sigaction
> > > > > > > > directly but in fact it isn't.
> > > > > > >
> > > > > > > I'm aware of one library like that. It's called libsigchain, and it
> > > > > > > has an early bounds check:
> > > > > > > https://cs.android.com/android/platform/superproject/+/master:art/sigchainlib/sigchain.cc;l=371
> > > > > > >
> > > > > > > Until the library is changed to recognize the flag, calling code would
> > > > > > > see the return value of -1 as if the kernel failed the syscall, and
> > > > > > > would fall back to the code for old kernels.
> > > > > >
> > > > > > But only after some bad dereferences.  If these were writes, this means
> > > > > > > that memory _may_ be silently corrupted (I don't say it's likely in a
> > > > > > given case, and we cannot pick a flag bit that makes this impossible).
> > > >
> > > > You're talking about libsigchain, right? I don't see any bad
> > > > references, the function returns after noticing the bounds check
> > > > failure.
> > >
> > > Yes, I confused myself by reading Handler() out of context.  The kernel
> > > will invoke this with signo to a real signal number (without any flags).
> > >
> > > The sigaction wrapper does the bounds check before doing anything else,
> > > just as you say -- so that looks fine.
> > >
> > > (Side question: is all this thread-safe?  Is there some implicit locking
> > > somewhere?)
> >
> > I think maybe it isn't? There seem to be possible races on the
> > handler_ field. One possibility is that the function could race with
> > itself on another thread, which could be fixed via locking, but it
> > would also need to handle races between itself and the signal handler,
> > most likely by blocking the signal while setting it.
>
> Hmmm, tricky... anyway, that's not my problem ;)
>
> > > > > > So, _even though the user program is correct_, our change may trigger
> > > >
> > > > Let's say that you were talking about some other library and not
> > > > libsigchain. Such an interceptor wouldn't be correct though, it failed
> > > > to account for our change to the syscall semantics. If the accesses
> > > > were before the syscall (or the bounds check), then the interceptor
> > > > would not have been correct in the first place because POSIX requires
> > > > returning -1 with errno=EINVAL (and not crashing) if the signal number
> > > > is invalid.
> > > >
> > > > > > > the corruption of arbitrary user memory.  This is what I mean by an ABI
> > > > > > break.  The fact that the corruption is not done by the syscall itself
> > > > > > is no excuse.
> > > >
> > > > At some point, though, accommodating interceptors becomes pretty much
> > > > tantamount to saying "we can never change anything". Even just adding
> > > > a field to __sifields (which is pretty much required for what we need
> > > > to do) could break things in the presence of some interceptors because
> > > > the interceptor could be copying the fields manually to a new data
> > > > structure before calling the user's signal handler (e.g. because it
> > > > wants to defer the signal until later) and miss our new field. I think
> > > > most of the other ideas we're discussing fail to meet this bar as well
> > > > and I'll go into more details later on.
> > >
> > > I agree we cannot always avoid breaking such things.  But we should do
> > > our best to avoid it.
> >
> > I think that given the hand that we've been dealt, no matter what we
> > do, we can't really avoid risking breaking something. The relevant
> > questions are "what are we going to risk breaking", "how much risk is
> > there", "will it be easily noticed/fixable", and "once we're on the
> > other side of the potential breakage, will we find ourselves in a
> > position where changing things involves less breakage risk".
> >
> > > > > > We also fail to notice failures in sigaddset() etc., though in this code
> > > > > > it looks like that should not matter.
> > > >
> > > > Maybe you're looking at the handler ("SignalChain::Handler")? The bit
> > > > wouldn't be set in the signo argument to the handler. I'm talking
> > > > about line 371 of the code I linked, in the sigaction interceptor
> > > > "__sigaction" (it looks like sometimes the link doesn't take you to
> > > > the correct line for some reason).
> > >
> > > Ack, I confused myself.
> > >
> > > > > > > In general I think that any library like this with independent
> > > > > > > tracking of the kernel's purported signal handler state would need to
> > > > > > > be very sensitive to which syscalls are capable of setting signal
> > > > > > > handlers, what their semantics are, and so on. This applies to any
> > > > > > > change that we might make to the signal handler interface. So for
> > > > > > > example, if we introduced a new syscall as you propose below, and the
> > > > > > > library hasn't been updated to recognize the new syscall, it will
> > > > > > > silently miss changes in signal handler state caused by the new
> > > > > > > syscall.
> > > > > > >
> > > > > > > At the end of this argument lies "we can never change anything about
> > > > > > > how signal handlers work because it could break some interposing
> > > > > > > library somewhere" -- replace "signal handlers" with any kernel
> > > > > > > feature whose behavior may be modified by an interposing library if
> > > > > > > you like -- and I don't think we want to go that far. As far as I
> > > > > > > know, this isn't really the kernel's business anyway -- the kernel's
> > > > > > > stable ABI contract starts and ends with the syscall interface and not
> > > > > > > some library on top.
> > > > > > >
> > > > > > > That being said, we should perhaps try to define our interface so that
> > > > > > > something reasonable will probably happen if there is such a library
> > > > > > > and it hasn't been updated. With the new syscall, the library will
> > > > > > > sometimes silently fail to work in some non-local fashion. With the
> > > > > > > flag bit in the signal number, the library will either cause the
> > > > > > > caller to fall back to the old kernel code path (if there is a bounds
> > > > > > > check) or likely crash loudly (if there is no bounds check). To me,
> > > > > > > the "flag bit in the signal number" behavior seems more reasonable,
> > > > > > > since either something correct or something easy to debug will
> > > > > > > probably happen at runtime.
> > > > > > >
> > > > > > > > > could only appear in newly-written code that doesn't follow our
> > > > > > > > > recommendations, and there are already plenty of much more likely ways
> > > > > > > > > to cause buffer overflows in C code that doesn't follow
> > > > > > > > > recommendations anyway. (And even if such a buffer overflow occurred,
> > > > > > > > > it would very likely be caught early in development by the MMU due to
> > > > > > > > > the magnitude of the number 1<<30.)
> > > > > > > >
> > > > > > > > > Choosing the bit value is hard.  If shifting it overflows, this can
> > > > > > > > trigger random undefined behaviour in the compiler in addition to (or
> > > > > > > > perhaps instead of) an out-of-bounds access or segfault.
> > > > > > >
> > > > > > > It wouldn't overflow on a 64-bit architecture assuming normal array
> > > > > > > indexing (the index would be promoted to pointer width before being
> > > > > > > scaled to the array element size), and to begin with the users of this
> > > > > > > would be 64-bit.
> > > > > >
> > > > > > Unless we don't offer this feature for 32-bit at all (possible, if ugly)
> > > > > > we can't stop people using it.
> > > >
> > > > My point is that the problem in the interceptor library would probably
> > > > be noticed on 64-bit (since that's what most people use these days),
> > > > which would probably result in it being fixed by the time it reaches
> > > > 32-bit users.
> > >
> > > Agreed.  But we shouldn't take such bets unless we really have to.
> > >
> > > > > > > > If shifting it doesn't overflow, we might still fall into a valid
> > > > > > > > mapping, though I'd agree a segfault is more likely.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > > I think it would be cleaner to add a single flag field that can be
> > > > > > > > > > > used for detecting other extensions, and request it via a new sa_flags
> > > > > > > > > > > bit.  This removes the need for semantically useless zeroing of unused
> > > > > > > > > > fields (though for hygiene and backwards compatibility reasons we would
> > > > > > > > > > probably want to carry on zeroing them anyway).
> > > > > > > > > >
> > > > > > > > > > I can see no simpler way to add supplementary siginfo fields for
> > > > > > > > > > existing si_codes.  For si_codes that didn't exist before the zeroing
> > > > > > > > > > came in we could still detect optional si_code-specific fields via
> > > > > > > > > > > zeroing, but it seems messy to have two ways of detecting extensions.
> > > > > > > > >
> > > > > > > > > That would certainly be cleaner if it worked, but that would only be
> > > > > > > > > the case if old kernels rejected unknown bits in sa_flags, and
> > > > > > > > > unfortunately they don't. With the bit in the signal number, the "old
> > > > > > > >
> > > > > > > > Hmm, that is a problem I wasn't aware of.
> > > > > > > >
> > > > > > > > > kernels reject" behavior admits relatively straightforward usage code:
> > > > > > > > >
> > > > > > > > > void set_segv_handler(void) {
> > > > > > > > >   struct sigaction sa;
> > > > > > > > >   sa.sa_sigaction = handle_segv;
> > > > > > > > >   sa.sa_flags = SA_SIGINFO;
> > > > > > > > >   if (sigaction(SIGSEGV | SF_CLEAR_UNKNOWN_FIELDS, &sa, 0) < 0) { //
> > > > > > > > > succeeds in new kernels, fails in old kernels
> > > > > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0) // succeeds in old kernels
> > > > > > > > >       perror("sigaction");
> > > > > > > > >   }
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > void clear_fields_and_handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > > > > > >   sa->si_future_field = 0;
> > > > > > > > >   handle_segv(signum, sa, ctx);
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > void handle_segv(int signum, siginfo_t *sa, void *ctx) {
> > > > > > > > >   // At this point, si_future_field will have the value 0 in old
> > > > > > > > > kernels and the kernel-supplied value in new kernels.
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > Imagine if we moved the flag SF_CLEAR_UNKNOWN_FIELDS from the signal
> > > > > > > > > number to sa_flags. In that case, the first sigaction would succeed in
> > > > > > > > > old kernels so handle_segv wouldn't know whether it can safely read
> > > > > > > > > from si_future_field. With the sa_flags approach, you would need
> > > > > > > > > kernel version number checking via uname before setting the flag in
> > > > > > > > > sa_flags, and at that point why even have the flag in sa_flags at all
> > > > > > > > > since you could just have the signal handler conditionally read from
> > > > > > > > > si_future_field based on the uname?
> > > > > > > >
> > > > > > > > Software setting SA_SIFLAGS (or whatever) is new by definition, since
> > > > > > > > it would be using a new #define.  So it might be reasonable to put the
> > > > > > > > burden on that software to verify that the flag was really accepted by
> > > > > > > > the kernel, by reading it back.
> > > > > > >
> > > > > > > That doesn't seem like a good idea even if it worked, because it could
> > > > > > > lead to race conditions. If the si_flags-reading signal handler were
> > > > > > > invoked in response to a signal between when you set it and when you
> > > > > > > ended up replacing it with the fallback signal handler for old
> > > > > > > kernels, the handler may end up reading garbage data from si_flags.
> > > > > >
> > > > > > Not really.  My example may have this problem, but the signal handler
> > > > > > can be written to support both scenarios, based on testing a flag that
> > > > > > the main program sets after verifying that the flag could be set.  Or
> > > > > > the signal could be blocked around establishment (often a good idea for
> > > > > > other reasons).
> > > > > >
> > > > > > But I agree it's a bit gross, and anyway doesn't work due to the fact
> > > > > > that the kernel doesn't filter out unrecognised flags anyway.
> > > > > >
> > > > > > > > Unfortunately, even relatively recent kernels blindly store sa_flags
> > > > > > > > in the kernel without validating it, and so it looks like duff flags
> > > > > > > > can be read back out via a sigaction() call.  Dang.
> > > > > > > >
> > > > > > > >
> > > > > > > > Perhaps a new frontend syscall could be added.  A new libc that knows
> > > > > > > > about this "sigaction2" could use it and mask off problem bits from
> > > > > > > > sa_flags in its sigaction() wrapper before calling sigaction2.  An old
> > > > > > > > libc would call the old sigaction syscall, where we would ignore these
> > > > > > > > new sa_flags bits as before.
> > > > > > >
> > > > > > > I'm not currently in favor of the new syscall but if we do this I
> > > > > > > would keep sigaction and sigaction2 separate. That is, libc sigaction
> > > > > > > should always use the sigaction syscall, and libc sigaction2 should
> > > > > > > always use the sigaction2 syscall. We should avoid libc's sigaction
> > > > > > > having different behavior based on the libc version and kernel
> > > > > > > version, as that would make it harder to reason about its behavior.
> > > > > > > Calling code would need to check for presence of sigaction2 in both
> > > > > > > libc and the kernel, e.g.
> > > > > > >
> > > > > > > __attribute__((weak)) decltype(sigaction2) sigaction2;
> > > > > > >
> > > > > > > void set_segv_handler(void) {
> > > > > > >   struct sigaction sa;
> > > > > > >   sa.sa_sigaction = handle_segv;
> > > > > > >   sa.sa_flags = SA_SIGINFO | SA_SIFLAGS;
> > > > > > >   if (!sigaction2 || sigaction2(SIGSEGV, &sa, 0) < 0) {
> > > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > > >     sa.sa_flags = SA_SIGINFO;
> > > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > > > > >       perror("sigaction");
> > > > > > >   }
> > > > > > > }
> > > > > >
> > > > > > I guess.  But I share your distaste for adding a new syscall.
> > > > > >
> > > > > > >
> > > > > > > > This may not be a popular approach though, and software wouldn't be able
> > > > > > > > to use our new features until libc is updated to match.
> > > > > > > >
> > > > > > > > If we go down this route, it may provide additional opportunities to fix
> > > > > > > > annoying defects in the old interface.
> > > > > > > >
> > > > > > > >
> > > > > > > > > Note that the same applies to a flag indicating the availability of a
> > > > > > > > > si_flags field in sigaction (just
> > > > > > > > > s/SF_CLEAR_UNKNOWN_FIELDS/SF_HAS_SI_FLAGS/ and
> > > > > > > > > s/si_future_field/si_flags/ in the usage code above). In terms of
> > > > > > > > > SF_CLEAR_UNKNOWN_FIELDS versus SF_HAS_SI_FLAGS I'd be fine either way.
> > > > > > > > >
> > > > > > > > > Another thought that occurred to me is that we may consider
> > > > > > > > > generalizing this a step further and introducing a single flag bit in
> > > > > > > > > the signal number that means "reject unknown flags in sa_flags". This
> > > > > > > > > would mean that we wouldn't need to add any more flag bits to the
> > > > > > > > > signal number in the future, thus limiting this signal number hack to
> > > > > > > > > a single bit; all future mandatory behavior changes could just be put
> > > > > > > > > behind a flag in sa_flags and userspace code would easily be able to
> > > > > > > > > detect missing support for a flag and fall back if necessary. In our
> > > > > > > > > case, this would imply usage code like this:
> > > > > > > > >
> > > > > > > > > void set_segv_handler(void) {
> > > > > > > > >   struct sigaction sa;
> > > > > > > > >   sa.sa_sigaction = handle_segv;
> > > > > > > > >   sa.sa_flags = SA_SIGINFO | SA_CLEAR_UNKNOWN_FIELDS;
> > > > > > > > >   // Succeeds in kernels with SA_CLEAR_UNKNOWN_FIELDS support.
> > > > > > > > >   // Fails in kernels with SF_CHECK_SA_FLAGS support but no
> > > > > > > > > SA_CLEAR_UNKNOWN_FIELDS support (because of the unknown flags check).
> > > > > > > > >   // Fails in kernels without SF_CHECK_SA_FLAGS support (because of
> > > > > > > > > the bounds check on the signal number).
> > > > > > > > >   if (sigaction(SIGSEGV | SF_CHECK_SA_FLAGS, &sa, 0) < 0) {
> > > > > > > > >     sa.sa_sigaction = clear_fields_and_handle_segv;
> > > > > > > > >     sa.sa_flags = SA_SIGINFO;
> > > > > > > > >     // Succeeds in old kernels, no need to use SF_CHECK_SA_FLAGS since
> > > > > > > > > we're using sa_flags from the beginning of time.
> > > > > > > > >     if (sigaction(SIGSEGV, &sa, 0) < 0)
> > > > > > > > >       perror("sigaction");
> > > > > > > > >   }
> > > > > > > > > }
> > > > > > > >
> > > > > > > > As with the other options this could work, but looks like it could
> > > > > > > > break the ABI due to violating the original semantics for the signal
> > > > > > > > number argument.  Perhaps I'm being too paranoid.
> > > > > > >
> > > > > > > There's no ABI being broken here, as long as we consider syscalls to
> > > > > > > be the stable ABI layer. Old kernels are simply rejecting arguments
> > > > > > > that they don't know about yet. By that argument, any introduction of
> > > > > > > a new syscall is an ABI break because it changes the semantics of a
> > > > > > > previously-unallocated syscall number.
> > > > > >
> > > > > > As argued above, I think this is an invalid argument.
> > > > > >
> > > > > > Although any addition will change behaviour (so is a break in some
> > > > > > sense), the key is not to make "surprising" changes.
> > > >
> > > > If we care about interceptors then I don't think "surprising" comes
> > > > into it. It's more a question of "does the anticipated behavior of the
> > > > interceptor match our desired behavior", where "desired" means "most
> > > > likely to avoid silent breakage". We would need to get into the head
> > > > of a potential interceptor author and think about how they would have
> > > > handled the signal number argument, as well as other arguments like
> > > > sa_flags if we want to go that route, and see whether that behavior
> > > > would lead to the desired result.
> > >
> > > That's exactly what I mean by "surprising".
> >
> > Not quite, see below.
> >
> > > However, not every
> > > interceptor author will be making the same assumptions, and not every
> > > bit of software affected will be an interceptor.
> >
> > I can see a couple of ways in which non-interceptor software could be affected:
> >
> > - It's doing something like "call sigaction on every possible signal
> > number in the 31-bit range and end up failing if the syscall
> > succeeded" (e.g. with an OOB write). Perhaps software could be doing
> > something like this in a loop to collect all currently registered
> > signal handlers. That being said, this program:
> >
> > #include <limits.h>
> > #include <signal.h>
> >
> > int main() {
> >   struct sigaction act;
> >   for (int i = 1; i != INT_MAX; ++i) {
> >     sigaction(i, 0, &act);
> >   }
> > }
> >
> > takes around 5 seconds to run on my relatively-fast machine, so I
> > would expect any such code to be noticed as a performance issue and
> > either be changed to be bounded on _NSIG or break on EINVAL.
> >
> > This is probably the largest potential flaw that I can currently see
> > in the "bit in the signal number" idea, since it could conceivably
> > result in userspace code being broken without having first required it
> > to have been changed to make use of the new feature. I'm not convinced
> > that it would be an ABI break though, because the code seems unlikely
> > to exist in this form in the wild because of the performance issue,
> > and you could anyway make the argument that the code is incorrect
> > because, in order to contain a loop like this, it would need to be
> > able to handle large, previously-unknown signal numbers somehow. If we
> > accept that the code is incorrect, a similar line of argument applies
> > as for interceptors (i.e. likely to result in an OOB access which will
> > fail loudly and be easily debugged and fixed).
> >
> > - If we do something that involves introducing a new flag in sa_flags,
> > the flag may be exposed to unaware software via the oldact argument to
> > sigaction, and I suppose that it's conceivable that exposing a
> > previously-unknown flag like this could somehow break something. But
> > this seems like an unreasonable restriction because it would mean that
> > we can never add a flag to sa_flags no matter what.
> >
> > >  So some judgement
> > > needs to be applied.
> >
> > Of course. We need to agree *how* to apply the judgement though.
> >
> > > > In this case, I think we exactly want the interceptor author to have
> > > > thought "oh, it's just a number, I'll (possibly do a bounds check and
> > > > then) use the number as an index into an array". This will lead to one
> > > > of two outcomes: crashing (yes, yes, it won't always crash, but if the
> > > > alternative is that it never crashes and we get silently incorrect
> > > > behavior all of the time, I'll take sometimes crashing) or fail the
> > > > bounds check and pretend to be an old kernel (the latter is
> > > > anticipated by POSIX which requires returning -1/EINVAL for an invalid
> > > > > signal number). Each of these behaviors is desirable, as they are
> > > > observable failures, which are more likely to result in fixes than
> > > > silent ones.
> > >
> > > Agreed, except wanting the author to have thought something doesn't
> > > ensure that they actually did think that.
> >
> > True, but if our goal is only to accommodate reasonably written
> > interceptors, we don't actually need to ensure anything here.
> >
> > > > > > Having something random happen when setting a previously reserved flag
> > > > > > bit, or when issuing a syscall when an unknown syscall number, or not
> > > > > > surprising at all.
> > > >
> > > > Introducing a new syscall is right out in this model. The interceptor
> > > > author wouldn't have anticipated our introducing a new syscall, so the
> > > > new syscall wouldn't be intercepted and calls to the new syscall would
> > > > silently bypass the interceptor. For example, adding sigaction2 could
> > > > result in signal handlers being set without the interceptor's
> > > > knowledge.
> > >
> > > Agreed.  My sentence was a bit mangled: I mean to say "Having something
> > > random happen when [...] issuing a syscall *with* an unknown syscall
> > > number *is* not surprising at all."
> > >
> > > I agree that adding a new syscall is problematic if we want to avoid
> > > breaking existing interceptors in particular.  Other types of code are
> > > much less likely to be affected by the addition of new syscalls.
> >
> > Right, and this to me is a case in point for why I would say that
> > "surprising" isn't the right frame of analysis here. My analysis seems
> > to generally be that "anticipated interceptor behavior matches desired
> > behavior" is positively correlated with "surprising" (i.e. the
> > interceptor viewpoint is the dual of the user viewpoint), so if we
> > care about interceptors we may end up making a "surprising" change
> > even though it doesn't intuitively seem like the right thing to do.
>
> You're right that interceptors are different from normal callers.  I'm
> not sure I follow your argument, but an alternative way of looking at
> it might be to say that an interceptor is both an implementation of an
> interface and a caller of the same interface.  Since API specs are
> rarely complete enough to cover the corner cases that arise from this,
> full portability is hard to achieve on top of an evolving kernel.
>
> However, I think this interceptor thing is a bit of a red herring.  I
> just intended that as an illustration of the kind of code that might
> fall foul.  This doesn't mean that it's 100% certain that no other
> software can be affected.
>
> The starting points for this discussion were: "is it reasonable for a
> caller to pass an unvalidated signal number to sigaction(), and rely on
> sigaction() to validate it?" and "is it reasonable to assume that a
> signal number accepted by sigaction() fits the POSIX specification of
> a valid signal number?"
>
> I think yes; you aren't (or weren't) convinced.
>
> The mere fact that it's hard to agree suggests to me that the
> specification is too weak to extend safely in this area.  Unfortunately,
> it's rather weak for sa_flags too, although a non-full flags argument
> does at least suggest that future extensions might appear.
>
> > > > Regarding a sa_flags bit, let's get inside the head of the interceptor
> > > > author again. How would they handle a flag bit that they don't
> > > > recognize when replacing the signal handler? It wouldn't be correct to
> > > > just pass it through to the kernel, or drop the flag on the floor, as
> > > > it might be semantically meaningful (and thus could change the calling
> > > > convention as SA_SIGINFO does, or change the meaning of fields in
> > > > siginfo, as SA_CODEX would do). A correctly written sigaction
> > > > interceptor should probably abort the program upon encountering an
> > > > unknown flag (thus giving a human a chance to update the interceptor),
> > > > but chances are that they don't. Ignoring all but a few flags (and
> > > > passing a fixed set of flags to the kernel) seems to be what
> > > > libsigchain does, and in the case of SA_CODEX it would seem to result
> > > > in desirable behavior (but I suspect that it isn't handling the other
> > > > flags correctly), but I could also see an interceptor author just
> > > > passing it unchanged to the kernel without checking it (perhaps
> > > > because they didn't think about these issues, and because that didn't
> > > > matter until now, with the exception of from-the-beginning-of-time
> > > > flags like SA_SIGINFO). And with SA_CODEX that could lead to silent
> > > > misreading of si_code in the interceptor's signal handler, if it
> > > > hasn't been updated to use the new macros.
> > >
> > > Agreed.  I've tried to implement things rather like this in the past,
> > > and how to interpret the flags is a tricky issue.  Some of the flags are
> > > impossible to emulate even when you know what they mean, in particular
> > > SA_NODEFER and SA_RESTART.
> > >
> > > Making new flags safe to ignore and harmless to set if you don't know
> > > what they mean is the safest approach, but not always possible (I think
> > > I managed this with my suggestion below, though).
> >
> > Again, this suggestion could lead to silent failures in an interceptor, if:
> > - the interceptor passes the sa_flags through to the kernel unchanged
> > (or otherwise doesn't touch SA_CODEX)
> > - the interceptor replaces the user's sa_sigaction
> > - the interceptor's replacement sa_sigaction tests the provided si_code.
> >
> > Maybe you're not concerned about that, though? At least to me it seems
> > in the same ballpark of likelihood as the ways in which things could
> > go wrong with the signal number bit.
>
> I agree this is a concern, and perhaps a bit nastier in practice than
> side-effects of setting random bits in the signal number.
>
> Personally I do tend to be paranoid about flags arguments and try to
> police them in any code that isn't a trivial pass-through, but this
> doesn't mean that all code out there does it.  (Including the kernel's
> sigaction()!)
>
> > > Ideally, a flags field should be specified with rules that say exactly
> > > what to do with flags you don't recognise.  Sadly this is usually not
> > > thought about until it's too late.
> >
> > It perhaps isn't too late to introduce such rules for sigaction if we
> > adopt the signal number bit and we make it mean "reject unknown
> > flags".
>
> If Eric likes the idea then fair enough, but as I've tried to argue this
> may still just be moving the problem around rather than solving it.
>
>
> As a final random idea to add to the mix, we could add two or more
> flags in sa_flags, and require the kernel to transform them in a
> specific way, say:
>
> #define SA_WANT_FLAGS 0x00c700000
> #define SA_HAVE_FLAGS 0x009200000
> #define SA_FLAGS_MASK 0x00ff00000
>
> volatile sig_atomic_t have_flags = 0;
>
>         sa.sa_flags |= SA_WANT_FLAGS;
>         if (sigaction(n, &sa, NULL))
>                 if (!sigaction(n, NULL, &sa) &&
>                                 (sa.sa_flags & SA_FLAGS_MASK) == SA_HAVE_FLAGS)
>                         have_flags = 1;
>
> This is at least proof against "dumb readback".
>
> Provided that the handler can cope with the have_flags == 0 case and
> just reads the flag once per call, I don't think we would need to worry
> about races.
>
> Of course, an interceptor that doesn't understand this mechanism and
> munges or manufactures its own siginfo might still fail to properly
> initialise our new field before passing it on to a signal handler that
> is expecting it.  But that's already broken: such an interceptor might
> also not understand new si_codes that the client code absolutely relies
> on.  And new si_codes _do_ get added (that's another extensibility fail
> in the existing signal API).
>
>
> So... overall, maybe a bit in the signal number isn't a lot worse, and
> perhaps it _will_ lead to cleaner failures.
>
> Really, I don't see a way to solve it properly without a new API.

I started on implementing my signal number bit idea, and in the
process of doing so came up with another idea that may be better from
the "don't abuse existing arguments" perspective. It involves a
sigaction protocol similar to the one that you describe above, but it
only requires one new bit (plus one bit per new flag) so it is less
wasteful of sa_flags bits.

The idea is twofold:

1. Require the kernel to clear unknown flag bits in sa_flags when
passing them back in oldact. I suppose that this is technically a
behavior change for sigaction, but critically, this change in behavior
only applies to unallocated flags, which we are free to change the
meaning of. We can simply define each existing unallocated flag bit to
mean "clear this bit in oldact (unless the bit becomes supported in
the future)". There was already code doing something similar in a
limited fashion on x86, which we can remove by using this approach.

2. Define a flag bit SA_UNSUPPORTED which will never be supported by
the kernel. Now userspace can use the fact that the bit has been
cleared to mean that it can trust that other unsupported bits have
also been cleared.

Now we may have code like this:

#define SA_UNSUPPORTED 0x400
#define SA_XFLAGS 0x800

volatile sig_atomic_t have_xflags = 0;

         sa.sa_flags |= SA_UNSUPPORTED | SA_XFLAGS;
         if (sigaction(n, &sa, NULL))
                 if (!sigaction(n, NULL, &sa) &&
                                 !(sa.sa_flags & SA_UNSUPPORTED) &&
(sa.sa_flags & SA_XFLAGS))
                         have_xflags = 1;

> In the meantime, can I suggest:
>
>  (1) Come up with an extensible way of encoding supplementary
>      information in siginfo.  If the consensus is that zeroing unused
>      fields is sufficient and that the kernel and compiler will
>      reliably do it, then great.  Otherwise, we might need explicit
>      flags fields or something.

I thought about this for a while and concluded that we probably want a
flags field anyway. si_addr_ignored_bits is something of a special
case in the sense that we can define the zero value to mean
"unknown" by taking advantage of the mask field (which I suppose is
something of a flags field), but we can't necessarily say that the
same is true for any fields that we may add in the future. For
example, if we wanted to communicate whether the failing access is a
read or a write, we would need a tristate: read, write and "unknown"
(and arrange for old kernels' behavior to be interpreted as
"unknown"). If we rely on zeroing then we may implement this by adding
a field like:

char si_access_type; // 0 = unknown, 1 = read, 2 = write

But that's really just a (slightly wasteful, because we use the entire
byte) flags field, so we may as well define an actual flags field to
begin with and let people add their flags there.

Unfortunately we can't name it si_flags because ia64 got there first.
We may consider making the ia64 field generic though (ia64 only uses
one bit of their field, so we would have 31 free bits). In the
meantime, I added a separate field, si_xflags.

>  (2) Hack up any simple mechanism (such as your signal number flag) for
>      requesting/detecting the extra information.
>
> Along with an illustration of an application of the mechanism (i.e.,
> reporting address tag bits), this should at least provide a basis for
> further review.
>
> We can then try to swap in a different mechanism for (2) if people
> still have concerns (or if not, keep it).

Sounds good. Apologies for not replying sooner, I was hoping that Eric
would chime in so that I would get a sense of which approach he would
prefer (so that I wouldn't spend as much time implementing in an
undesired direction), then this fell off my radar. I decided to go
with the SA_UNSUPPORTED approach that I mentioned above for now, and
I'll send a v9 with that implemented shortly. Most of the change is
about letting the architecture-independent code know which bits are
supported, so it should be easy to replace the detection mechanism
with another idea like the signal number bit.

Peter

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

* Re: [PATCH v8] arm64: Expose FAR_EL1 tag bits in siginfo
  2020-08-18  3:16                                                                                 ` Peter Collingbourne
@ 2020-08-18 13:50                                                                                   ` Dave Martin
  0 siblings, 0 replies; 64+ messages in thread
From: Dave Martin @ 2020-08-18 13:50 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Linux ARM, Catalin Marinas, Kevin Brodsky, Oleg Nesterov,
	Kostya Serebryany, Eric W. Biederman, Andrey Konovalov,
	Vincenzo Frascino, Will Deacon, Evgenii Stepanov,
	Richard Henderson

On Mon, Aug 17, 2020 at 08:16:42PM -0700, Peter Collingbourne wrote:
> On Tue, Jul 14, 2020 at 10:36 AM Dave Martin <Dave.Martin@arm.com> wrote:

[...]

> > As a final random idea to add to the mix, we could add two or more
> > flags in sa_flags, and require the kernel to transform them in a
> > specific way, say:
> >
> > #define SA_WANT_FLAGS 0x00c700000
> > #define SA_HAVE_FLAGS 0x009200000
> > #define SA_FLAGS_MASK 0x00ff00000
> >
> > volatile sig_atomic_t have_flags = 0;
> >
> >         sa.sa_flags |= SA_WANT_FLAGS;
> >         if (sigaction(n, &sa, NULL))
> >                 if (!sigaction(n, NULL, &sa) &&
> >                                 (sa.sa_flags & SA_FLAGS_MASK) == SA_HAVE_FLAGS)
> >                         have_flags = 1;
> >
> > This is at least proof against "dumb readback".
> >
> > Provided that the handler can cope with the have_flags == 0 case and
> > just reads the flag once per call, I don't think we would need to worry
> > about races.
> >
> > Of course, an interceptor that doesn't understand this mechanism and
> > munges or manufactures its own siginfo might still fail to properly
> > initialise our new field before passing it on to a signal handler that
> > is expecting it.  But that's already broken: such an interceptor might
> > also not understand new si_codes that the client code absolutely relies
> > on.  And new si_codes _do_ get added (that's another extensibility fail
> > in the existing signal API).
> >
> >
> > So... overall, maybe a bit in the signal number isn't a lot worse, and
> > perhaps it _will_ lead to cleaner failures.
> >
> > Really, I don't see a way to solve it properly without a new API.
> 
> I started on implementing my signal number bit idea, and in the
> process of doing so came up with another idea that may be better from
> the "don't abuse existing arguments" perspective. It involves a
> sigaction protocol similar to the one that you describe above, but it
> only requires one new bit (plus one bit per new flag) so it is less
> wasteful of sa_flags bits.
> 
> The idea is twofold:
> 
> 1. Require the kernel to clear unknown flag bits in sa_flags when
> passing them back in oldact. I suppose that this is technically a
> behavior change for sigaction, but critically, this change in behavior
> only applies to unallocated flags, which we are free to change the
> meaning of. We can simply define each existing unallocated flag bit to
> mean "clear this bit in oldact (unless the bit becomes supported in
> the future)". There was already code doing something similar in a
> limited fashion on x86, which we can remove by using this approach.

Sounds reasonable.  It's quite hard to imagine how software could
accidentally rely on unallocated sa_flags bits reading back out
unmodified through sigaction().  Software that deliberately relies on
this for some bit that is never allocated would be obviously
non-POSIX-compliant.

> 2. Define a flag bit SA_UNSUPPORTED which will never be supported by
> the kernel. Now userspace can use the fact that the bit has been
> cleared to mean that it can trust that other unsupported bits have
> also been cleared.
> 
> Now we may have code like this:
> 
> #define SA_UNSUPPORTED 0x400
> #define SA_XFLAGS 0x800
> 
> volatile sig_atomic_t have_xflags = 0;
> 
>          sa.sa_flags |= SA_UNSUPPORTED | SA_XFLAGS;
>          if (sigaction(n, &sa, NULL))
>                  if (!sigaction(n, NULL, &sa) &&
>                                  !(sa.sa_flags & SA_UNSUPPORTED) &&
>                                  (sa.sa_flags & SA_XFLAGS))
>                          have_xflags = 1;

OK, so I think the novelty here is that we detect support by requiring
the kernel to clear one bit while preserving another.  That does indeed
seem more robust: an old kernel (unless bizarrely buggy) would either
clear all unsupported bits or preserve them all.  Other OSes that extend
their sigaction() in line with POSIX would also be highly unlikely to
exhibit this behaviour by accident IMHO, making it easier for this to
coexist with other people's extensions.

Interceptors still may not transparently work with this approach, but I
think that's a reasonable price to pay.

> 
> > In the meantime, can I suggest:
> >
> >  (1) Come up with an extensible way of encoding supplementary
> >      information in siginfo.  If the consensus is that zeroing unused
> >      fields is sufficient and that the kernel and compiler will
> >      reliably do it, then great.  Otherwise, we might need explicit
> >      flags fields or something.
> 
> I thought about this for a while and concluded that we probably want a
> flags field anyway. si_addr_ignored_bits is something of a special
> case in the sense that we can define the zero value to mean
> "unknown" by taking advantage of the mask field (which I suppose is
> something of a flags field), but we can't necessarily say that the
> same is true for any fields that we may add in the future. For
> example, if we wanted to communicate whether the failing access is a
> read or a write, we would need a tristate: read, write and "unknown"
> (and arrange for old kernels' behavior to be interpreted as
> "unknown"). If we rely on zeroing then we may implement this by adding
> a field like:
> 
> char si_access_type; // 0 = unknown, 1 = read, 2 = write
> 
> But that's really just a (slightly wasteful, because we use the entire
> byte) flags field, so we may as well define an actual flags field to
> begin with and let people add their flags there.
> 
> Unfortunately we can't name it sa_flags because ia64 got there first.
> We may consider making the ia64 field generic though (ia64 only uses
> one bit of their field, so we would have 31 free bits). In the
> meantime, I added a separate field, sa_xflags.

Seems fair enough to me.  We still have the option to use zeroing for
detection of fields where it works.

Would SA_XFLAGS imply zeroing of unallocated siginfo fields?  This may
just be a matter of documentation, if the kernel already does the
zeroing today.

> 
> >  (2) Hack up any simple mechanism (such as your signal number flag) for
> >      requesting/detecting the extra information.
> >
> > Along with an illustration of an application of the mechanism (i.e.,
> > reporting address tag bits), this should at least provide a basis for
> > further review.
> >
> > We can then try to swap in a different mechanism for (2) if people
> > still have concerns (or if not, keep it).
> 
> Sounds good. Apologies for not replying sooner, I was hoping that Eric
> would chime in so that I would get a sense of which approach he would
> prefer (so that I wouldn't spend as much time implementing in an
> undesired direction), then this fell off my radar. I decided to go
> with the SA_UNSUPPORTED approach that I mentioned above for now, and
> I'll send a v9 with that implemented shortly. Most of the change is
> about letting the architecture-independent code know which bits are
> supported, so it should be easy to replace the detection mechanism
> with another idea like the signal number bit.

No worries, I think various people have had distractions (I certainly
have ... but I digress).

I'll take a look at your v9.

Cheers
---Dave

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 64+ messages in thread

end of thread, other threads:[~2020-08-18 13:51 UTC | newest]

Thread overview: 64+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-12 17:17 [PATCH] arm64: Expose original FAR_EL1 value in sigcontext Peter Collingbourne
2020-03-25 13:10 ` Catalin Marinas
2020-03-25 17:41   ` Peter Collingbourne
2020-03-25 17:40 ` [PATCH v2] " Peter Collingbourne
2020-03-26 16:45   ` Catalin Marinas
2020-03-27  7:56     ` Will Deacon
2020-03-27 11:39       ` Catalin Marinas
2020-03-27 19:26         ` Peter Collingbourne
2020-03-27 19:19   ` [PATCH v3] " Peter Collingbourne
2020-04-22 14:25     ` Catalin Marinas
2020-04-29 21:08     ` Will Deacon
2020-04-29 21:42       ` Peter Collingbourne
2020-05-04 17:03         ` Will Deacon
2020-05-07 17:57           ` [PATCH v4] arm64: Expose FAR_EL1 tag bits " Peter Collingbourne
2020-05-08  2:01             ` [PATCH v5] " Peter Collingbourne
2020-05-12 16:25               ` Catalin Marinas
2020-05-13 18:09               ` [PATCH v6] " Peter Collingbourne
2020-05-13 20:28                 ` Dave Martin
2020-05-15  0:58                   ` Peter Collingbourne
2020-05-18  9:53                     ` Dave Martin
2020-05-19 22:00                       ` Peter Collingbourne
2020-05-20  8:55                         ` Will Deacon
2020-05-20  9:26                           ` Dave Martin
2020-05-21  2:28                             ` Peter Collingbourne
2020-05-21  2:29                               ` [PATCH v6 0/3] " Peter Collingbourne
2020-05-21  2:29                                 ` [PATCH v6 1/3] signal: Allow architectures to store arch-specific data in kernel_siginfo Peter Collingbourne
2020-05-21  2:29                                 ` [PATCH v6 2/3] arm64: Move fault address and fault code into kernel_siginfo Peter Collingbourne
2020-05-21 13:34                                   ` kbuild test robot
2020-05-21 13:34                                     ` kbuild test robot
2020-05-21  2:29                                 ` [PATCH v6 3/3] arm64: Expose FAR_EL1 tag bits in sigcontext Peter Collingbourne
2020-05-21 12:35                               ` [PATCH v6] " Eric W. Biederman
2020-05-21 18:03                                 ` Peter Collingbourne
2020-05-21 19:24                                   ` Eric W. Biederman
2020-05-21 20:48                                     ` Peter Collingbourne
2020-06-08 18:12                                       ` Peter Collingbourne
2020-06-08 18:14                                         ` [PATCH v7] arm64: Expose FAR_EL1 tag bits in siginfo Peter Collingbourne
     [not found]                                           ` <20200623020134.16655-1-pcc@google.com>
     [not found]                                             ` <87sgemrlgc.fsf@x220.int.ebiederm.org>
2020-06-23 14:38                                               ` [PATCH v8] " Dave Martin
2020-06-23 17:47                                                 ` Eric W. Biederman
2020-06-24  0:40                                                   ` Peter Collingbourne
2020-06-24  9:28                                                     ` Dave Martin
2020-06-24 16:51                                                       ` Peter Collingbourne
2020-06-24 17:12                                                         ` Dave Martin
2020-06-24 19:51                                                           ` Peter Collingbourne
2020-07-06 16:41                                                             ` Dave Martin
2020-07-06 19:20                                                               ` Peter Collingbourne
2020-07-07 14:19                                                                 ` Dave Martin
2020-07-07 19:07                                                                   ` Peter Collingbourne
2020-07-08 11:00                                                                     ` Dave Martin
2020-07-08 13:58                                                                       ` Dave Martin
2020-07-08 22:21                                                                         ` Peter Collingbourne
2020-07-13 13:24                                                                           ` Dave Martin
2020-07-13 20:50                                                                             ` Peter Collingbourne
2020-07-14 17:36                                                                               ` Dave Martin
2020-08-18  3:16                                                                                 ` Peter Collingbourne
2020-08-18 13:50                                                                                   ` Dave Martin
2020-06-23 14:57                                             ` Dave Martin
2020-05-26 13:03                                     ` [PATCH v6] arm64: Expose FAR_EL1 tag bits in sigcontext Dave Martin
2020-04-30  9:50       ` [PATCH v3] arm64: Expose original FAR_EL1 value " Catalin Marinas
2020-04-30  9:59         ` Will Deacon
2020-04-30 13:34           ` Catalin Marinas
2020-05-04 10:19     ` Dave Martin
2020-05-07 17:55       ` Peter Collingbourne
2020-05-13 17:27         ` Dave Martin
2020-05-13 18:00           ` Peter Collingbourne

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.