Linux-Fsdevel Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 4.4 0/7] fs/proc: Stop reporting eip and esp in
@ 2019-12-02  8:35 zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 1/7] sched/core: Allow putting thread_info into task_struct zhangyi (F)
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: zhangyi (F) @ 2019-12-02  8:35 UTC (permalink / raw)
  To: gregkh
  Cc: linux-kernel, linux-fsdevel, luto, adobriyan, akpm, vbabka,
	peterz, bigeasy, mhocko, john.ogness, yi.zhang, nixiaoming

Reporting eip and esp fields on a non-current task is dangerous,
so backport this series for 4.4 to fix protential oops and info leak
problems. The first 3 patch are depended on the 6/7 patch.

Alexey Dobriyan (1):
  proc: fix coredump vs read /proc/*/stat race

Andy Lutomirski (3):
  sched/core: Allow putting thread_info into task_struct
  sched/core: Add try_get_task_stack() and put_task_stack()
  fs/proc: Stop reporting eip and esp in /proc/PID/stat

Heiko Carstens (1):
  sched/core, x86: Make struct thread_info arch specific again

John Ogness (2):
  fs/proc: Report eip/esp in /prod/PID/stat for coredumping
  fs/proc/array.c: allow reporting eip/esp for all coredumping threads

 fs/proc/array.c             | 18 ++++++++++---
 include/linux/init_task.h   |  9 +++++++
 include/linux/sched.h       | 52 +++++++++++++++++++++++++++++++++++--
 include/linux/thread_info.h |  4 +++
 init/Kconfig                | 10 +++++++
 init/init_task.c            |  7 +++--
 kernel/sched/sched.h        |  4 +++
 7 files changed, 97 insertions(+), 7 deletions(-)

-- 
2.17.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 4.4 1/7] sched/core: Allow putting thread_info into task_struct
  2019-12-02  8:35 [PATCH 4.4 0/7] fs/proc: Stop reporting eip and esp in zhangyi (F)
@ 2019-12-02  8:35 ` zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 2/7] sched/core: Add try_get_task_stack() and put_task_stack() zhangyi (F)
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: zhangyi (F) @ 2019-12-02  8:35 UTC (permalink / raw)
  To: gregkh
  Cc: linux-kernel, linux-fsdevel, luto, adobriyan, akpm, vbabka,
	peterz, bigeasy, mhocko, john.ogness, yi.zhang, nixiaoming

From: Andy Lutomirski <luto@kernel.org>

commit c65eacbe290b8141554c71b2c94489e73ade8c8d upstream.

If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT,
then thread_info is defined as a single 'u32 flags' and is the first
entry of task_struct.  thread_info::task is removed (it serves no
purpose if thread_info is embedded in task_struct), and
thread_info::cpu gets its own slot in task_struct.

This is heavily based on a patch written by Linus.

Originally-from: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jann Horn <jann@thejh.net>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a0898196f0476195ca02713691a5037a14f2aac5.1473801993.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
---
 include/linux/init_task.h   |  9 +++++++++
 include/linux/sched.h       | 36 ++++++++++++++++++++++++++++++++++--
 include/linux/thread_info.h | 15 +++++++++++++++
 init/Kconfig                |  7 +++++++
 init/init_task.c            |  7 +++++--
 kernel/sched/sched.h        |  4 ++++
 6 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1c1ff7e4faa4..d25d3f70ee99 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,8 @@
 #include <net/net_namespace.h>
 #include <linux/sched/rt.h>
 
+#include <asm/thread_info.h>
+
 #ifdef CONFIG_SMP
 # define INIT_PUSHABLE_TASKS(tsk)					\
 	.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
@@ -183,12 +185,19 @@ extern struct task_group root_task_group;
 # define INIT_KASAN(tsk)
 #endif
 
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk),
+#else
+# define INIT_TASK_TI(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
  */
 #define INIT_TASK(tsk)	\
 {									\
+	INIT_TASK_TI(tsk)						\
 	.state		= 0,						\
 	.stack		= &init_thread_info,				\
 	.usage		= ATOMIC_INIT(2),				\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1218980f53de..d6b2e77aad3f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1389,6 +1389,13 @@ struct tlbflush_unmap_batch {
 };
 
 struct task_struct {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	/*
+	 * For reasons of header soup (see current_thread_info()), this
+	 * must be the first element of task_struct.
+	 */
+	struct thread_info thread_info;
+#endif
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
 	atomic_t usage;
@@ -1398,6 +1405,9 @@ struct task_struct {
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
 	int on_cpu;
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	unsigned int cpu;	/* current CPU */
+#endif
 	unsigned int wakee_flips;
 	unsigned long wakee_flip_decay_ts;
 	struct task_struct *last_wakee;
@@ -2440,7 +2450,9 @@ extern void set_curr_task(int cpu, struct task_struct *p);
 void yield(void);
 
 union thread_union {
+#ifndef CONFIG_THREAD_INFO_IN_TASK
 	struct thread_info thread_info;
+#endif
 	unsigned long stack[THREAD_SIZE/sizeof(long)];
 };
 
@@ -2840,10 +2852,26 @@ static inline void threadgroup_change_end(struct task_struct *tsk)
 	cgroup_threadgroup_change_end(tsk);
 }
 
-#ifndef __HAVE_THREAD_FUNCTIONS
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+
+static inline struct thread_info *task_thread_info(struct task_struct *task)
+{
+	return &task->thread_info;
+}
+static inline void *task_stack_page(const struct task_struct *task)
+{
+	return task->stack;
+}
+#define setup_thread_stack(new,old)	do { } while(0)
+static inline unsigned long *end_of_stack(const struct task_struct *task)
+{
+	return task->stack;
+}
+
+#elif !defined(__HAVE_THREAD_FUNCTIONS)
 
 #define task_thread_info(task)	((struct thread_info *)(task)->stack)
-#define task_stack_page(task)	((task)->stack)
+#define task_stack_page(task)	((void *)(task)->stack)
 
 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
 {
@@ -3135,7 +3163,11 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
 
 static inline unsigned int task_cpu(const struct task_struct *p)
 {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	return p->cpu;
+#else
 	return task_thread_info(p)->cpu;
+#endif
 }
 
 static inline int task_node(const struct task_struct *p)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 646891f3bc1e..2813de75a96e 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -13,6 +13,21 @@
 struct timespec;
 struct compat_timespec;
 
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+struct thread_info {
+	u32			flags;		/* low level flags */
+};
+
+#define INIT_THREAD_INFO(tsk)			\
+{						\
+	.flags		= 0,			\
+}
+#endif
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+#define current_thread_info() ((struct thread_info *)current)
+#endif
+
 /*
  * System call restart block.
  */
diff --git a/init/Kconfig b/init/Kconfig
index 47b0bdcf33c2..5dec7d424980 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -26,6 +26,13 @@ config IRQ_WORK
 config BUILDTIME_EXTABLE_SORT
 	bool
 
+config THREAD_INFO_IN_TASK
+	bool
+	help
+	  Select this to move thread_info off the stack into task_struct.  To
+	  make this work, an arch will need to remove all thread_info fields
+	  except flags and fix any runtime bugs.
+
 menu "General setup"
 
 config BROKEN
diff --git a/init/init_task.c b/init/init_task.c
index ba0a7f362d9e..11f83be1fa79 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -22,5 +22,8 @@ EXPORT_SYMBOL(init_task);
  * Initial thread structure. Alignment of this is handled by a special
  * linker map entry.
  */
-union thread_union init_thread_union __init_task_data =
-	{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data = {
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+	INIT_THREAD_INFO(init_task)
+#endif
+};
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8b96df04ba78..8afd9d62c56e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -978,7 +978,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 	 * per-task data have been completed by this moment.
 	 */
 	smp_wmb();
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	p->cpu = cpu;
+#else
 	task_thread_info(p)->cpu = cpu;
+#endif
 	p->wake_cpu = cpu;
 #endif
 }
-- 
2.17.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 4.4 2/7] sched/core: Add try_get_task_stack() and put_task_stack()
  2019-12-02  8:35 [PATCH 4.4 0/7] fs/proc: Stop reporting eip and esp in zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 1/7] sched/core: Allow putting thread_info into task_struct zhangyi (F)
@ 2019-12-02  8:35 ` zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 3/7] sched/core, x86: Make struct thread_info arch specific again zhangyi (F)
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: zhangyi (F) @ 2019-12-02  8:35 UTC (permalink / raw)
  To: gregkh
  Cc: linux-kernel, linux-fsdevel, luto, adobriyan, akpm, vbabka,
	peterz, bigeasy, mhocko, john.ogness, yi.zhang, nixiaoming

From: Andy Lutomirski <luto@kernel.org>

commit c6c314a613cd7d03fb97713e0d642b493de42e69 upstream.

There are a few places in the kernel that access stack memory
belonging to a different task.  Before we can start freeing task
stacks before the task_struct is freed, we need a way for those code
paths to pin the stack.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jann Horn <jann@thejh.net>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/17a434f50ad3d77000104f21666575e10a9c1fbd.1474003868.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
---
 include/linux/sched.h | 16 ++++++++++++++++
 init/Kconfig          |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d6b2e77aad3f..761247c966a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2858,11 +2858,19 @@ static inline struct thread_info *task_thread_info(struct task_struct *task)
 {
 	return &task->thread_info;
 }
+
+/*
+ * When accessing the stack of a non-current task that might exit, use
+ * try_get_task_stack() instead.  task_stack_page will return a pointer
+ * that could get freed out from under you.
+ */
 static inline void *task_stack_page(const struct task_struct *task)
 {
 	return task->stack;
 }
+
 #define setup_thread_stack(new,old)	do { } while(0)
+
 static inline unsigned long *end_of_stack(const struct task_struct *task)
 {
 	return task->stack;
@@ -2898,6 +2906,14 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
 }
 
 #endif
+
+static inline void *try_get_task_stack(struct task_struct *tsk)
+{
+	return task_stack_page(tsk);
+}
+
+static inline void put_task_stack(struct task_struct *tsk) {}
+
 #define task_stack_end_corrupted(task) \
 		(*(end_of_stack(task)) != STACK_END_MAGIC)
 
diff --git a/init/Kconfig b/init/Kconfig
index 5dec7d424980..f9fb621c9562 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -33,6 +33,9 @@ config THREAD_INFO_IN_TASK
 	  make this work, an arch will need to remove all thread_info fields
 	  except flags and fix any runtime bugs.
 
+	  One subtle change that will be needed is to use try_get_task_stack()
+	  and put_task_stack() in save_thread_stack_tsk() and get_wchan().
+
 menu "General setup"
 
 config BROKEN
-- 
2.17.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 4.4 3/7] sched/core, x86: Make struct thread_info arch specific again
  2019-12-02  8:35 [PATCH 4.4 0/7] fs/proc: Stop reporting eip and esp in zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 1/7] sched/core: Allow putting thread_info into task_struct zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 2/7] sched/core: Add try_get_task_stack() and put_task_stack() zhangyi (F)
@ 2019-12-02  8:35 ` zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 4/7] fs/proc: Stop reporting eip and esp in /proc/PID/stat zhangyi (F)
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: zhangyi (F) @ 2019-12-02  8:35 UTC (permalink / raw)
  To: gregkh
  Cc: linux-kernel, linux-fsdevel, luto, adobriyan, akpm, vbabka,
	peterz, bigeasy, mhocko, john.ogness, yi.zhang, nixiaoming

From: Heiko Carstens <heiko.carstens@de.ibm.com>

commit c8061485a0d7569a865a3cc3c63347b0f42b3765 upstream.

The following commit:

  c65eacbe290b ("sched/core: Allow putting thread_info into task_struct")

... made 'struct thread_info' a generic struct with only a
single ::flags member, if CONFIG_THREAD_INFO_IN_TASK_STRUCT=y is
selected.

This change however seems to be quite x86 centric, since at least the
generic preemption code (asm-generic/preempt.h) assumes that struct
thread_info also has a preempt_count member, which apparently was not
true for x86.

We could add a bit more #ifdefs to solve this problem too, but it seems
to be much simpler to make struct thread_info arch specific
again. This also makes the conversion to THREAD_INFO_IN_TASK_STRUCT a
bit easier for architectures that have a couple of arch specific stuff
in their thread_info definition.

The arch specific stuff _could_ be moved to thread_struct. However
keeping them in thread_info makes it easier: accessing thread_info
members is simple, since it is at the beginning of the task_struct,
while the thread_struct is at the end. At least on s390 the offsets
needed to access members of the thread_struct (with task_struct as
base) are too large for various asm instructions.  This is not a
problem when keeping these members within thread_info.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: keescook@chromium.org
Cc: linux-arch@vger.kernel.org
Link: http://lkml.kernel.org/r/1476901693-8492-2-git-send-email-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

[ zhangyi: skip defination of INIT_THREAD_INFO and struct thread_info ]

Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
---
 include/linux/thread_info.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 2813de75a96e..897e835379d8 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -13,17 +13,6 @@
 struct timespec;
 struct compat_timespec;
 
-#ifdef CONFIG_THREAD_INFO_IN_TASK
-struct thread_info {
-	u32			flags;		/* low level flags */
-};
-
-#define INIT_THREAD_INFO(tsk)			\
-{						\
-	.flags		= 0,			\
-}
-#endif
-
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 #define current_thread_info() ((struct thread_info *)current)
 #endif
-- 
2.17.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 4.4 4/7] fs/proc: Stop reporting eip and esp in /proc/PID/stat
  2019-12-02  8:35 [PATCH 4.4 0/7] fs/proc: Stop reporting eip and esp in zhangyi (F)
                   ` (2 preceding siblings ...)
  2019-12-02  8:35 ` [PATCH 4.4 3/7] sched/core, x86: Make struct thread_info arch specific again zhangyi (F)
@ 2019-12-02  8:35 ` zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 5/7] fs/proc: Report eip/esp in /prod/PID/stat for coredumping zhangyi (F)
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: zhangyi (F) @ 2019-12-02  8:35 UTC (permalink / raw)
  To: gregkh
  Cc: linux-kernel, linux-fsdevel, luto, adobriyan, akpm, vbabka,
	peterz, bigeasy, mhocko, john.ogness, yi.zhang, nixiaoming

From: Andy Lutomirski <luto@kernel.org>

commit 0a1eb2d474edfe75466be6b4677ad84e5e8ca3f5 upstream.

Reporting these fields on a non-current task is dangerous.  If the
task is in any state other than normal kernel code, they may contain
garbage or even kernel addresses on some architectures.  (x86_64
used to do this.  I bet lots of architectures still do.)  With
CONFIG_THREAD_INFO_IN_TASK=y, it can OOPS, too.

As far as I know, there are no use programs that make any material
use of these fields, so just get rid of them.

Reported-by: Jann Horn <jann@thejh.net>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linux API <linux-api@vger.kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Tycho Andersen <tycho.andersen@canonical.com>
Link: http://lkml.kernel.org/r/a5fed4c3f4e33ed25d4bb03567e329bc5a712bcc.1475257877.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
---
 fs/proc/array.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 60cbaa821164..618c83f1866d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -425,10 +425,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	mm = get_task_mm(task);
 	if (mm) {
 		vsize = task_vsize(mm);
-		if (permitted) {
-			eip = KSTK_EIP(task);
-			esp = KSTK_ESP(task);
-		}
+		/*
+		 * esp and eip are intentionally zeroed out.  There is no
+		 * non-racy way to read them without freezing the task.
+		 * Programs that need reliable values can use ptrace(2).
+		 */
 	}
 
 	get_task_comm(tcomm, task);
-- 
2.17.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 4.4 5/7] fs/proc: Report eip/esp in /prod/PID/stat for coredumping
  2019-12-02  8:35 [PATCH 4.4 0/7] fs/proc: Stop reporting eip and esp in zhangyi (F)
                   ` (3 preceding siblings ...)
  2019-12-02  8:35 ` [PATCH 4.4 4/7] fs/proc: Stop reporting eip and esp in /proc/PID/stat zhangyi (F)
@ 2019-12-02  8:35 ` zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 6/7] proc: fix coredump vs read /proc/*/stat race zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 7/7] fs/proc/array.c: allow reporting eip/esp for all coredumping threads zhangyi (F)
  6 siblings, 0 replies; 8+ messages in thread
From: zhangyi (F) @ 2019-12-02  8:35 UTC (permalink / raw)
  To: gregkh
  Cc: linux-kernel, linux-fsdevel, luto, adobriyan, akpm, vbabka,
	peterz, bigeasy, mhocko, john.ogness, yi.zhang, nixiaoming

From: John Ogness <john.ogness@linutronix.de>

commit fd7d56270b526ca3ed0c224362e3c64a0f86687a upstream.

Commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in
/proc/PID/stat") stopped reporting eip/esp because it is
racy and dangerous for executing tasks. The comment adds:

    As far as I know, there are no use programs that make any
    material use of these fields, so just get rid of them.

However, existing userspace core-dump-handler applications (for
example, minicoredumper) are using these fields since they
provide an excellent cross-platform interface to these valuable
pointers. So that commit introduced a user space visible
regression.

Partially revert the change and make the readout possible for
tasks with the proper permissions and only if the target task
has the PF_DUMPCORE flag set.

Fixes: 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in> /proc/PID/stat")
Reported-by: Marco Felsch <marco.felsch@preh.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Cc: Tycho Andersen <tycho.andersen@canonical.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: stable@vger.kernel.org
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Linux API <linux-api@vger.kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/87poatfwg6.fsf@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

[ zhangyi: 68db0cf10678 does not merged, skip the task_stack.h for 4.4]

Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
---
 fs/proc/array.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 618c83f1866d..c3e1732bcaa5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -429,7 +429,15 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		 * esp and eip are intentionally zeroed out.  There is no
 		 * non-racy way to read them without freezing the task.
 		 * Programs that need reliable values can use ptrace(2).
+		 *
+		 * The only exception is if the task is core dumping because
+		 * a program is not able to use ptrace(2) in that case. It is
+		 * safe because the task has stopped executing permanently.
 		 */
+		if (permitted && (task->flags & PF_DUMPCORE)) {
+			eip = KSTK_EIP(task);
+			esp = KSTK_ESP(task);
+		}
 	}
 
 	get_task_comm(tcomm, task);
-- 
2.17.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 4.4 6/7] proc: fix coredump vs read /proc/*/stat race
  2019-12-02  8:35 [PATCH 4.4 0/7] fs/proc: Stop reporting eip and esp in zhangyi (F)
                   ` (4 preceding siblings ...)
  2019-12-02  8:35 ` [PATCH 4.4 5/7] fs/proc: Report eip/esp in /prod/PID/stat for coredumping zhangyi (F)
@ 2019-12-02  8:35 ` zhangyi (F)
  2019-12-02  8:35 ` [PATCH 4.4 7/7] fs/proc/array.c: allow reporting eip/esp for all coredumping threads zhangyi (F)
  6 siblings, 0 replies; 8+ messages in thread
From: zhangyi (F) @ 2019-12-02  8:35 UTC (permalink / raw)
  To: gregkh
  Cc: linux-kernel, linux-fsdevel, luto, adobriyan, akpm, vbabka,
	peterz, bigeasy, mhocko, john.ogness, yi.zhang, nixiaoming

From: Alexey Dobriyan <adobriyan@gmail.com>

commit 8bb2ee192e482c5d500df9f2b1b26a560bd3026f upstream.

do_task_stat() accesses IP and SP of a task without bumping reference
count of a stack (which became an entity with independent lifetime at
some point).

Steps to reproduce:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <sys/time.h>
    #include <sys/resource.h>
    #include <unistd.h>
    #include <sys/wait.h>

    int main(void)
    {
    	setrlimit(RLIMIT_CORE, &(struct rlimit){});

    	while (1) {
    		char buf[64];
    		char buf2[4096];
    		pid_t pid;
    		int fd;

    		pid = fork();
    		if (pid == 0) {
    			*(volatile int *)0 = 0;
    		}

    		snprintf(buf, sizeof(buf), "/proc/%u/stat", pid);
    		fd = open(buf, O_RDONLY);
    		read(fd, buf2, sizeof(buf2));
    		close(fd);

    		waitpid(pid, NULL, 0);
    	}
    	return 0;
    }

    BUG: unable to handle kernel paging request at 0000000000003fd8
    IP: do_task_stat+0x8b4/0xaf0
    PGD 800000003d73e067 P4D 800000003d73e067 PUD 3d558067 PMD 0
    Oops: 0000 [#1] PREEMPT SMP PTI
    CPU: 0 PID: 1417 Comm: a.out Not tainted 4.15.0-rc8-dirty #2
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1.fc27 04/01/2014
    RIP: 0010:do_task_stat+0x8b4/0xaf0
    Call Trace:
     proc_single_show+0x43/0x70
     seq_read+0xe6/0x3b0
     __vfs_read+0x1e/0x120
     vfs_read+0x84/0x110
     SyS_read+0x3d/0xa0
     entry_SYSCALL_64_fastpath+0x13/0x6c
    RIP: 0033:0x7f4d7928cba0
    RSP: 002b:00007ffddb245158 EFLAGS: 00000246
    Code: 03 b7 a0 01 00 00 4c 8b 4c 24 70 4c 8b 44 24 78 4c 89 74 24 18 e9 91 f9 ff ff f6 45 4d 02 0f 84 fd f7 ff ff 48 8b 45 40 48 89 ef <48> 8b 80 d8 3f 00 00 48 89 44 24 20 e8 9b 97 eb ff 48 89 44 24
    RIP: do_task_stat+0x8b4/0xaf0 RSP: ffffc90000607cc8
    CR2: 0000000000003fd8

John Ogness said: for my tests I added an else case to verify that the
race is hit and correctly mitigated.

Link: http://lkml.kernel.org/r/20180116175054.GA11513@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reported-by: "Kohli, Gaurav" <gkohli@codeaurora.org>
Tested-by: John Ogness <john.ogness@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
---
 fs/proc/array.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index c3e1732bcaa5..42e33ea50d1c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -435,8 +435,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		 * safe because the task has stopped executing permanently.
 		 */
 		if (permitted && (task->flags & PF_DUMPCORE)) {
-			eip = KSTK_EIP(task);
-			esp = KSTK_ESP(task);
+			if (try_get_task_stack(task)) {
+				eip = KSTK_EIP(task);
+				esp = KSTK_ESP(task);
+				put_task_stack(task);
+			}
 		}
 	}
 
-- 
2.17.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 4.4 7/7] fs/proc/array.c: allow reporting eip/esp for all coredumping threads
  2019-12-02  8:35 [PATCH 4.4 0/7] fs/proc: Stop reporting eip and esp in zhangyi (F)
                   ` (5 preceding siblings ...)
  2019-12-02  8:35 ` [PATCH 4.4 6/7] proc: fix coredump vs read /proc/*/stat race zhangyi (F)
@ 2019-12-02  8:35 ` zhangyi (F)
  6 siblings, 0 replies; 8+ messages in thread
From: zhangyi (F) @ 2019-12-02  8:35 UTC (permalink / raw)
  To: gregkh
  Cc: linux-kernel, linux-fsdevel, luto, adobriyan, akpm, vbabka,
	peterz, bigeasy, mhocko, john.ogness, yi.zhang, nixiaoming

From: John Ogness <john.ogness@linutronix.de>

commit cb8f381f1613cafe3aec30809991cd56e7135d92 upstream.

0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat")
stopped reporting eip/esp and fd7d56270b52 ("fs/proc: Report eip/esp in
/prod/PID/stat for coredumping") reintroduced the feature to fix a
regression with userspace core dump handlers (such as minicoredumper).

Because PF_DUMPCORE is only set for the primary thread, this didn't fix
the original problem for secondary threads.  Allow reporting the eip/esp
for all threads by checking for PF_EXITING as well.  This is set for all
the other threads when they are killed.  coredump_wait() waits for all the
tasks to become inactive before proceeding to invoke a core dumper.

Link: http://lkml.kernel.org/r/87y32p7i7a.fsf@linutronix.de
Link: http://lkml.kernel.org/r/20190522161614.628-1-jlu@pengutronix.de
Fixes: fd7d56270b526ca3 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping")
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reported-by: Jan Luebbe <jlu@pengutronix.de>
Tested-by: Jan Luebbe <jlu@pengutronix.de>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
---
 fs/proc/array.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 42e33ea50d1c..6238f45eed02 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -434,7 +434,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		 * a program is not able to use ptrace(2) in that case. It is
 		 * safe because the task has stopped executing permanently.
 		 */
-		if (permitted && (task->flags & PF_DUMPCORE)) {
+		if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
 			if (try_get_task_stack(task)) {
 				eip = KSTK_EIP(task);
 				esp = KSTK_ESP(task);
-- 
2.17.2


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, back to index

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-02  8:35 [PATCH 4.4 0/7] fs/proc: Stop reporting eip and esp in zhangyi (F)
2019-12-02  8:35 ` [PATCH 4.4 1/7] sched/core: Allow putting thread_info into task_struct zhangyi (F)
2019-12-02  8:35 ` [PATCH 4.4 2/7] sched/core: Add try_get_task_stack() and put_task_stack() zhangyi (F)
2019-12-02  8:35 ` [PATCH 4.4 3/7] sched/core, x86: Make struct thread_info arch specific again zhangyi (F)
2019-12-02  8:35 ` [PATCH 4.4 4/7] fs/proc: Stop reporting eip and esp in /proc/PID/stat zhangyi (F)
2019-12-02  8:35 ` [PATCH 4.4 5/7] fs/proc: Report eip/esp in /prod/PID/stat for coredumping zhangyi (F)
2019-12-02  8:35 ` [PATCH 4.4 6/7] proc: fix coredump vs read /proc/*/stat race zhangyi (F)
2019-12-02  8:35 ` [PATCH 4.4 7/7] fs/proc/array.c: allow reporting eip/esp for all coredumping threads zhangyi (F)

Linux-Fsdevel Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-fsdevel/0 linux-fsdevel/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-fsdevel linux-fsdevel/ https://lore.kernel.org/linux-fsdevel \
		linux-fsdevel@vger.kernel.org
	public-inbox-index linux-fsdevel

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-fsdevel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git