linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks
@ 2015-05-27 21:47 Alexey Dobriyan
  2015-05-27 21:49 ` [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline Alexey Dobriyan
  2015-05-27 22:02 ` [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks Cyrill Gorcunov
  0 siblings, 2 replies; 14+ messages in thread
From: Alexey Dobriyan @ 2015-05-27 21:47 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, gorcunov, jarod, jstancek

Individual prctl(PR_SET_MM_*) calls do some checking to maintain
consistent view of mm->arg_start et al fields, but not enough.
In particular PR_SET_MM_ARG_START/PR_SET_MM_ARG_END/PR_SET_MM_ENV_START/PR_SET_MM_ENV_END
only check that address lies in existent VMA, but doesn't check that
start address is lower that end address _at all_.

Consolidate all consistency checks, so there will be no difference in
the future between PR_SET_MM_MAP and individual PR_SET_MM_* calls.

The program below makes both ARGV and ENVP areas reverted,
makes /proc/$PID/cmdline show garbage (doesn't oops by luck).

#include <sys/mman.h>
#include <sys/prctl.h>
#include <unistd.h>

enum {PAGE_SIZE=4096};

int main(void)
{
	void *p;

	p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

#define PR_SET_MM               35
#define PR_SET_MM_ARG_START     8
#define PR_SET_MM_ARG_END       9
#define PR_SET_MM_ENV_START     10
#define PR_SET_MM_ENV_END       11
	prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long)p + PAGE_SIZE - 1, 0, 0);
	prctl(PR_SET_MM, PR_SET_MM_ARG_END,   (unsigned long)p, 0, 0);
	prctl(PR_SET_MM, PR_SET_MM_ENV_START, (unsigned long)p + PAGE_SIZE - 1, 0, 0);
	prctl(PR_SET_MM, PR_SET_MM_ENV_END,   (unsigned long)p, 0, 0);

	pause();
	return 0;
}

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 kernel/sys.c |  159 ++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 92 insertions(+), 67 deletions(-)

--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1722,7 +1722,6 @@ exit_err:
 	goto exit;
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
 /*
  * WARNING: we don't require any capability here so be very careful
  * in what is allowed for modification from userspace.
@@ -1818,6 +1817,7 @@ out:
 	return error;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
 static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
 {
 	struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
@@ -1902,10 +1902,42 @@ out:
 }
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
+static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, unsigned long len)
+{
+	/*
+	 * This doesn't move auxiliary vector itself
+	 * since it's pinned to mm_struct, but allow
+	 * to fill vector with new values. It's up
+	 * to a caller to provide sane values here
+	 * otherwise user space tools which use this
+	 * vector might be unhappy.
+	 */
+	unsigned long user_auxv[AT_VECTOR_SIZE];
+
+	if (len > sizeof(user_auxv))
+		return -EINVAL;
+
+	if (copy_from_user(user_auxv, (const void __user *)addr, len))
+		return -EFAULT;
+
+	/* Make sure the last entry is always AT_NULL */
+	user_auxv[AT_VECTOR_SIZE - 2] = 0;
+	user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+	BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+	task_lock(current);
+	memcpy(mm->saved_auxv, user_auxv, len);
+	task_unlock(current);
+
+	return 0;
+}
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
 	struct mm_struct *mm = current->mm;
+	struct prctl_mm_map prctl_map;
 	struct vm_area_struct *vma;
 	int error;
 
@@ -1925,6 +1957,9 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	if (opt == PR_SET_MM_EXE_FILE)
 		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
+	if (opt == PR_SET_MM_AUXV)
+		return prctl_set_auxv(mm, addr, arg4);
+
 	if (addr >= TASK_SIZE || addr < mmap_min_addr)
 		return -EINVAL;
 
@@ -1933,42 +1968,64 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, addr);
 
+	prctl_map.start_code	= mm->start_code;
+	prctl_map.end_code	= mm->end_code;
+	prctl_map.start_data	= mm->start_data;
+	prctl_map.end_data	= mm->end_data;
+	prctl_map.start_brk	= mm->start_brk;
+	prctl_map.brk		= mm->brk;
+	prctl_map.start_stack	= mm->start_stack;
+	prctl_map.arg_start	= mm->arg_start;
+	prctl_map.arg_end	= mm->arg_end;
+	prctl_map.env_start	= mm->env_start;
+	prctl_map.env_end	= mm->env_end;
+	prctl_map.auxv		= NULL;
+	prctl_map.auxv_size	= 0;
+	prctl_map.exe_fd	= -1;
+
 	switch (opt) {
 	case PR_SET_MM_START_CODE:
-		mm->start_code = addr;
+		prctl_map.start_code = addr;
 		break;
 	case PR_SET_MM_END_CODE:
-		mm->end_code = addr;
+		prctl_map.end_code = addr;
 		break;
 	case PR_SET_MM_START_DATA:
-		mm->start_data = addr;
+		prctl_map.start_data = addr;
 		break;
 	case PR_SET_MM_END_DATA:
-		mm->end_data = addr;
+		prctl_map.end_data = addr;
+		break;
+	case PR_SET_MM_START_STACK:
+		prctl_map.start_stack = addr;
 		break;
-
 	case PR_SET_MM_START_BRK:
-		if (addr <= mm->end_data)
-			goto out;
-
-		if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
-				      mm->end_data, mm->start_data))
-			goto out;
-
-		mm->start_brk = addr;
+		prctl_map.start_brk = addr;
 		break;
-
 	case PR_SET_MM_BRK:
-		if (addr <= mm->end_data)
-			goto out;
-
-		if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
-				      mm->end_data, mm->start_data))
-			goto out;
-
-		mm->brk = addr;
+		prctl_map.brk = addr;
+		break;
+	case PR_SET_MM_ARG_START:
+		prctl_map.arg_start = addr;
 		break;
+	case PR_SET_MM_ARG_END:
+		prctl_map.arg_end = addr;
+		break;
+	case PR_SET_MM_ENV_START:
+		prctl_map.env_start = addr;
+		break;
+	case PR_SET_MM_ENV_END:
+		prctl_map.env_end = addr;
+		break;
+	default:
+		goto out;
+	}
 
+	error = validate_prctl_map(&prctl_map);
+	if (error)
+		goto out;
+
+	switch (opt) {
 	/*
 	 * If command line arguments and environment
 	 * are placed somewhere else on stack, we can
@@ -1985,52 +2042,20 @@ static int prctl_set_mm(int opt, unsigned long addr,
 			error = -EFAULT;
 			goto out;
 		}
-		if (opt == PR_SET_MM_START_STACK)
-			mm->start_stack = addr;
-		else if (opt == PR_SET_MM_ARG_START)
-			mm->arg_start = addr;
-		else if (opt == PR_SET_MM_ARG_END)
-			mm->arg_end = addr;
-		else if (opt == PR_SET_MM_ENV_START)
-			mm->env_start = addr;
-		else if (opt == PR_SET_MM_ENV_END)
-			mm->env_end = addr;
-		break;
-
-	/*
-	 * This doesn't move auxiliary vector itself
-	 * since it's pinned to mm_struct, but allow
-	 * to fill vector with new values. It's up
-	 * to a caller to provide sane values here
-	 * otherwise user space tools which use this
-	 * vector might be unhappy.
-	 */
-	case PR_SET_MM_AUXV: {
-		unsigned long user_auxv[AT_VECTOR_SIZE];
-
-		if (arg4 > sizeof(user_auxv))
-			goto out;
-		up_read(&mm->mmap_sem);
-
-		if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
-			return -EFAULT;
-
-		/* Make sure the last entry is always AT_NULL */
-		user_auxv[AT_VECTOR_SIZE - 2] = 0;
-		user_auxv[AT_VECTOR_SIZE - 1] = 0;
-
-		BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
-
-		task_lock(current);
-		memcpy(mm->saved_auxv, user_auxv, arg4);
-		task_unlock(current);
-
-		return 0;
-	}
-	default:
-		goto out;
 	}
 
+	mm->start_code	= prctl_map.start_code;
+	mm->end_code	= prctl_map.end_code;
+	mm->start_data	= prctl_map.start_data;
+	mm->end_data	= prctl_map.end_data;
+	mm->start_brk	= prctl_map.start_brk;
+	mm->brk		= prctl_map.brk;
+	mm->start_stack	= prctl_map.start_stack;
+	mm->arg_start	= prctl_map.arg_start;
+	mm->arg_end	= prctl_map.arg_end;
+	mm->env_start	= prctl_map.env_start;
+	mm->env_end	= prctl_map.env_end;
+
 	error = 0;
 out:
 	up_read(&mm->mmap_sem);

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-27 21:47 [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks Alexey Dobriyan
@ 2015-05-27 21:49 ` Alexey Dobriyan
  2015-05-27 22:14   ` Cyrill Gorcunov
  2015-05-28 11:57   ` Jan Stancek
  2015-05-27 22:02 ` [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks Cyrill Gorcunov
  1 sibling, 2 replies; 14+ messages in thread
From: Alexey Dobriyan @ 2015-05-27 21:49 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, gorcunov, jarod, jstancek

/proc/$PID/cmdline truncates output at PAGE_SIZE. It is easy to see with

	$ cat /proc/self/cmdline $(seq 1037) 2>/dev/null

However, command line size was never limited to PAGE_SIZE but to 128 KB and
relatively recently limitation was removed altogether.

People noticed and ask questions:
http://stackoverflow.com/questions/199130/how-do-i-increase-the-proc-pid-cmdline-4096-byte-limit

seq file interface is not OK, because it kmalloc's for whole output and
open + read(, 1) + sleep will pin arbitrary amounts of kernel memory.
To not do that, limit must be imposed which is incompatible with
arbitrary sized command lines.

I apologize for hairy code, but this it direct consequence of command line
layout in memory and hacks to support things like "init [3]".

The loops are "unrolled" otherwise it is either macros which hide
control flow or functions with 7-8 arguments with equal line count.

There should be real setproctitle(2) or something.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Tested-by: Jarod Wilson <jarod@redhat.com>
Acked-by: Jarod Wilson <jarod@redhat.com>
---

	v5: fix BUG_ON(env_start > env_end)

 fs/proc/base.c |  205 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 196 insertions(+), 9 deletions(-)

--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -196,18 +196,205 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
 	return result;
 }
 
-static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
-			    struct pid *pid, struct task_struct *task)
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+				     size_t _count, loff_t *pos)
 {
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	char *page;
+	unsigned long count = _count;
+	unsigned long arg_start, arg_end, env_start, env_end;
+	unsigned long len1, len2, len;
+	unsigned long p;
+	char c;
+	ssize_t rv;
+
+	BUG_ON(*pos < 0);
+
+	tsk = get_proc_task(file_inode(file));
+	if (!tsk)
+		return -ESRCH;
+	mm = get_task_mm(tsk);
+	put_task_struct(tsk);
+	if (!mm)
+		return 0;
+	/* Check if process spawned far enough to have cmdline. */
+	if (!mm->env_end) {
+		rv = 0;
+		goto out_mmput;
+	}
+
+	page = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!page) {
+		rv = -ENOMEM;
+		goto out_mmput;
+	}
+
+	down_read(&mm->mmap_sem);
+	arg_start = mm->arg_start;
+	arg_end = mm->arg_end;
+	env_start = mm->env_start;
+	env_end = mm->env_end;
+	up_read(&mm->mmap_sem);
+
+	BUG_ON(arg_start > arg_end);
+	BUG_ON(env_start > env_end);
+
+	len1 = arg_end - arg_start;
+	len2 = env_end - env_start;
+
 	/*
-	 * Rely on struct seq_operations::show() being called once
-	 * per internal buffer allocation. See single_open(), traverse().
+	 * Inherently racy -- command line shares address space
+	 * with code and data.
 	 */
-	BUG_ON(m->size < PAGE_SIZE);
-	m->count += get_cmdline(task, m->buf, PAGE_SIZE);
-	return 0;
+	rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
+	if (rv <= 0)
+		goto out_free_page;
+
+	rv = 0;
+
+	if (c == '\0') {
+		/* Command line (set of strings) occupies whole ARGV. */
+		if (len1 <= *pos)
+			goto out_free_page;
+
+		p = arg_start + *pos;
+		len = len1 - *pos;
+		while (count > 0 && len > 0) {
+			unsigned int _count;
+			int nr_read;
+
+			_count = min3(count, len, PAGE_SIZE);
+			nr_read = access_remote_vm(mm, p, page, _count, 0);
+			if (nr_read < 0)
+				rv = nr_read;
+			if (nr_read <= 0)
+				goto out_free_page;
+
+			if (copy_to_user(buf, page, nr_read)) {
+				rv = -EFAULT;
+				goto out_free_page;
+			}
+
+			p	+= nr_read;
+			len	-= nr_read;
+			buf	+= nr_read;
+			count	-= nr_read;
+			rv	+= nr_read;
+		}
+	} else {
+		/*
+		 * Command line (1 string) occupies ARGV and maybe
+		 * extends into ENVP.
+		 */
+		if (len1 + len2 <= *pos)
+			goto skip_argv_envp;
+		if (len1 <= *pos)
+			goto skip_argv;
+
+		p = arg_start + *pos;
+		len = len1 - *pos;
+		while (count > 0 && len > 0) {
+			unsigned int _count, l;
+			int nr_read;
+			bool final;
+
+			_count = min3(count, len, PAGE_SIZE);
+			nr_read = access_remote_vm(mm, p, page, _count, 0);
+			if (nr_read < 0)
+				rv = nr_read;
+			if (nr_read <= 0)
+				goto out_free_page;
+
+			/*
+			 * Command line can be shorter than whole ARGV
+			 * even if last "marker" byte says it is not.
+			 */
+			final = false;
+			l = strnlen(page, nr_read);
+			if (l < nr_read) {
+				nr_read = l;
+				final = true;
+			}
+
+			if (copy_to_user(buf, page, nr_read)) {
+				rv = -EFAULT;
+				goto out_free_page;
+			}
+
+			p	+= nr_read;
+			len	-= nr_read;
+			buf	+= nr_read;
+			count	-= nr_read;
+			rv	+= nr_read;
+
+			if (final)
+				goto out_free_page;
+		}
+skip_argv:
+		/*
+		 * Command line (1 string) occupies ARGV and
+		 * extends into ENVP.
+		 */
+		if (len1 <= *pos) {
+			p = env_start + *pos - len1;
+			len = len1 + len2 - *pos;
+		} else {
+			p = env_start;
+			len = len2;
+		}
+		while (count > 0 && len > 0) {
+			unsigned int _count, l;
+			int nr_read;
+			bool final;
+
+			_count = min3(count, len, PAGE_SIZE);
+			nr_read = access_remote_vm(mm, p, page, _count, 0);
+			if (nr_read < 0)
+				rv = nr_read;
+			if (nr_read <= 0)
+				goto out_free_page;
+
+			/* Find EOS. */
+			final = false;
+			l = strnlen(page, nr_read);
+			if (l < nr_read) {
+				nr_read = l;
+				final = true;
+			}
+
+			if (copy_to_user(buf, page, nr_read)) {
+				rv = -EFAULT;
+				goto out_free_page;
+			}
+
+			p	+= nr_read;
+			len	-= nr_read;
+			buf	+= nr_read;
+			count	-= nr_read;
+			rv	+= nr_read;
+
+			if (final)
+				goto out_free_page;
+		}
+skip_argv_envp:
+		;
+	}
+
+out_free_page:
+	free_page((unsigned long)page);
+out_mmput:
+	mmput(mm);
+	if (rv > 0)
+		*pos += rv;
+	return rv;
 }
 
+static const struct file_operations proc_pid_cmdline_ops = {
+	.read	= proc_pid_cmdline_read,
+	.llseek	= generic_file_llseek,
+};
+
 static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
 			 struct pid *pid, struct task_struct *task)
 {
@@ -2572,7 +2759,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	ONE("syscall",    S_IRUSR, proc_pid_syscall),
 #endif
-	ONE("cmdline",    S_IRUGO, proc_pid_cmdline),
+	REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
 	ONE("stat",       S_IRUGO, proc_tgid_stat),
 	ONE("statm",      S_IRUGO, proc_pid_statm),
 	REG("maps",       S_IRUGO, proc_pid_maps_operations),
@@ -2918,7 +3105,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	ONE("syscall",   S_IRUSR, proc_pid_syscall),
 #endif
-	ONE("cmdline",   S_IRUGO, proc_pid_cmdline),
+	REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
 	ONE("stat",      S_IRUGO, proc_tid_stat),
 	ONE("statm",     S_IRUGO, proc_pid_statm),
 	REG("maps",      S_IRUGO, proc_tid_maps_operations),

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks
  2015-05-27 21:47 [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks Alexey Dobriyan
  2015-05-27 21:49 ` [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline Alexey Dobriyan
@ 2015-05-27 22:02 ` Cyrill Gorcunov
  2015-05-27 22:12   ` Alexey Dobriyan
  1 sibling, 1 reply; 14+ messages in thread
From: Cyrill Gorcunov @ 2015-05-27 22:02 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: akpm, linux-kernel, jarod, jstancek

On Thu, May 28, 2015 at 12:47:57AM +0300, Alexey Dobriyan wrote:
> Individual prctl(PR_SET_MM_*) calls do some checking to maintain
> consistent view of mm->arg_start et al fields, but not enough.
> In particular PR_SET_MM_ARG_START/PR_SET_MM_ARG_END/PR_SET_MM_ENV_START/PR_SET_MM_ENV_END
> only check that address lies in existent VMA, but doesn't check that
> start address is lower that end address _at all_.
> 
> Consolidate all consistency checks, so there will be no difference in
> the future between PR_SET_MM_MAP and individual PR_SET_MM_* calls.
> 
> The program below makes both ARGV and ENVP areas reverted,
> makes /proc/$PID/cmdline show garbage (doesn't oops by luck).

Why should it oops? Actually i would really love to drop off old
PR_SET_MM_ interface completely (which requires CAP_SYS_RESOURCE),
leaving only MM_MAP. But thank you!

Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks
  2015-05-27 22:02 ` [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks Cyrill Gorcunov
@ 2015-05-27 22:12   ` Alexey Dobriyan
  2015-05-27 22:20     ` Cyrill Gorcunov
  0 siblings, 1 reply; 14+ messages in thread
From: Alexey Dobriyan @ 2015-05-27 22:12 UTC (permalink / raw)
  To: Cyrill Gorcunov; +Cc: akpm, linux-kernel, jarod, jstancek

On Thu, May 28, 2015 at 01:02:29AM +0300, Cyrill Gorcunov wrote:
> On Thu, May 28, 2015 at 12:47:57AM +0300, Alexey Dobriyan wrote:
> > Individual prctl(PR_SET_MM_*) calls do some checking to maintain
> > consistent view of mm->arg_start et al fields, but not enough.
> > In particular PR_SET_MM_ARG_START/PR_SET_MM_ARG_END/PR_SET_MM_ENV_START/PR_SET_MM_ENV_END
> > only check that address lies in existent VMA, but doesn't check that
> > start address is lower that end address _at all_.
> > 
> > Consolidate all consistency checks, so there will be no difference in
> > the future between PR_SET_MM_MAP and individual PR_SET_MM_* calls.
> > 
> > The program below makes both ARGV and ENVP areas reverted,
> > makes /proc/$PID/cmdline show garbage (doesn't oops by luck).
> 
> Why should it oops?

Anything can happen if you constantly write code like this

	unsigned long len = mm->arg_end - mm->arg_start;

and expect result to be positive.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-27 21:49 ` [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline Alexey Dobriyan
@ 2015-05-27 22:14   ` Cyrill Gorcunov
  2015-05-27 22:29     ` Alexey Dobriyan
  2015-05-28 11:57   ` Jan Stancek
  1 sibling, 1 reply; 14+ messages in thread
From: Cyrill Gorcunov @ 2015-05-27 22:14 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: akpm, linux-kernel, jarod, jstancek

On Thu, May 28, 2015 at 12:49:53AM +0300, Alexey Dobriyan wrote:
...
> -static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
> -			    struct pid *pid, struct task_struct *task)
> +static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
> +				     size_t _count, loff_t *pos)
>  {
> +	struct task_struct *tsk;
> +	struct mm_struct *mm;
> +	char *page;
> +	unsigned long count = _count;
> +	unsigned long arg_start, arg_end, env_start, env_end;
> +	unsigned long len1, len2, len;
> +	unsigned long p;
> +	char c;
> +	ssize_t rv;
> +
> +	BUG_ON(*pos < 0);
> +
> +	tsk = get_proc_task(file_inode(file));
> +	if (!tsk)
> +		return -ESRCH;
> +	mm = get_task_mm(tsk);
> +	put_task_struct(tsk);
> +	if (!mm)
> +		return 0;
> +	/* Check if process spawned far enough to have cmdline. */
> +	if (!mm->env_end) {
> +		rv = 0;
> +		goto out_mmput;
> +	}
> +
> +	page = (char *)__get_free_page(GFP_TEMPORARY);
> +	if (!page) {
> +		rv = -ENOMEM;
> +		goto out_mmput;
> +	}
> +
> +	down_read(&mm->mmap_sem);
> +	arg_start = mm->arg_start;
> +	arg_end = mm->arg_end;
> +	env_start = mm->env_start;
> +	env_end = mm->env_end;
> +	up_read(&mm->mmap_sem);

Could you please explain why this down/up is needed?
(the rest is hard to review at night :)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks
  2015-05-27 22:12   ` Alexey Dobriyan
@ 2015-05-27 22:20     ` Cyrill Gorcunov
  0 siblings, 0 replies; 14+ messages in thread
From: Cyrill Gorcunov @ 2015-05-27 22:20 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: akpm, linux-kernel, jarod, jstancek

On Thu, May 28, 2015 at 01:12:14AM +0300, Alexey Dobriyan wrote:
> On Thu, May 28, 2015 at 01:02:29AM +0300, Cyrill Gorcunov wrote:
> > On Thu, May 28, 2015 at 12:47:57AM +0300, Alexey Dobriyan wrote:
> > > Individual prctl(PR_SET_MM_*) calls do some checking to maintain
> > > consistent view of mm->arg_start et al fields, but not enough.
> > > In particular PR_SET_MM_ARG_START/PR_SET_MM_ARG_END/PR_SET_MM_ENV_START/PR_SET_MM_ENV_END
> > > only check that address lies in existent VMA, but doesn't check that
> > > start address is lower that end address _at all_.
> > > 
> > > Consolidate all consistency checks, so there will be no difference in
> > > the future between PR_SET_MM_MAP and individual PR_SET_MM_* calls.
> > > 
> > > The program below makes both ARGV and ENVP areas reverted,
> > > makes /proc/$PID/cmdline show garbage (doesn't oops by luck).
> > 
> > Why should it oops?
> 
> Anything can happen if you constantly write code like this
> 
> 	unsigned long len = mm->arg_end - mm->arg_start;
> 
> and expect result to be positive.

It won't 'cause proc code will limit it (this where 'luck'
comes from :). Anyway, switching to validate() helper will
make code more robust, so thank you!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-27 22:14   ` Cyrill Gorcunov
@ 2015-05-27 22:29     ` Alexey Dobriyan
  2015-05-27 22:48       ` Cyrill Gorcunov
  0 siblings, 1 reply; 14+ messages in thread
From: Alexey Dobriyan @ 2015-05-27 22:29 UTC (permalink / raw)
  To: Cyrill Gorcunov; +Cc: akpm, linux-kernel, jarod, jstancek

On Thu, May 28, 2015 at 01:14:35AM +0300, Cyrill Gorcunov wrote:
> On Thu, May 28, 2015 at 12:49:53AM +0300, Alexey Dobriyan wrote:
> ...
> > -static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
> > -			    struct pid *pid, struct task_struct *task)
> > +static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
> > +				     size_t _count, loff_t *pos)
> >  {
> > +	struct task_struct *tsk;
> > +	struct mm_struct *mm;
> > +	char *page;
> > +	unsigned long count = _count;
> > +	unsigned long arg_start, arg_end, env_start, env_end;
> > +	unsigned long len1, len2, len;
> > +	unsigned long p;
> > +	char c;
> > +	ssize_t rv;
> > +
> > +	BUG_ON(*pos < 0);
> > +
> > +	tsk = get_proc_task(file_inode(file));
> > +	if (!tsk)
> > +		return -ESRCH;
> > +	mm = get_task_mm(tsk);
> > +	put_task_struct(tsk);
> > +	if (!mm)
> > +		return 0;
> > +	/* Check if process spawned far enough to have cmdline. */
> > +	if (!mm->env_end) {
> > +		rv = 0;
> > +		goto out_mmput;
> > +	}
> > +
> > +	page = (char *)__get_free_page(GFP_TEMPORARY);
> > +	if (!page) {
> > +		rv = -ENOMEM;
> > +		goto out_mmput;
> > +	}
> > +
> > +	down_read(&mm->mmap_sem);
> > +	arg_start = mm->arg_start;
> > +	arg_end = mm->arg_end;
> > +	env_start = mm->env_start;
> > +	env_end = mm->env_end;
> > +	up_read(&mm->mmap_sem);
> 
> Could you please explain why this down/up is needed?

Code is written this way to get constistent snapshot of data.
If you look at PR_SET_MM_* code, you'll notice down_read(&mm->mmap_sem)
as well which is a separate bug because you're _writing_ those fields
eventually in prctl_set_mm(), yuck!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-27 22:29     ` Alexey Dobriyan
@ 2015-05-27 22:48       ` Cyrill Gorcunov
  2015-05-27 23:12         ` Alexey Dobriyan
  0 siblings, 1 reply; 14+ messages in thread
From: Cyrill Gorcunov @ 2015-05-27 22:48 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: akpm, linux-kernel, jarod, jstancek

On Thu, May 28, 2015 at 01:29:42AM +0300, Alexey Dobriyan wrote:
> > > +
> > > +	page = (char *)__get_free_page(GFP_TEMPORARY);
> > > +	if (!page) {
> > > +		rv = -ENOMEM;
> > > +		goto out_mmput;
> > > +	}
> > > +
> > > +	down_read(&mm->mmap_sem);
> > > +	arg_start = mm->arg_start;
> > > +	arg_end = mm->arg_end;
> > > +	env_start = mm->env_start;
> > > +	env_end = mm->env_end;
> > > +	up_read(&mm->mmap_sem);
> > 
> > Could you please explain why this down/up is needed?
> 
> Code is written this way to get constistent snapshot of data.

it does not. you fetch data into local variables which is the
same as simply read them locklessly in general (because later
you refer to local vars).

> If you look at PR_SET_MM_* code, you'll notice down_read(&mm->mmap_sem)
> as well which is a separate bug because you're _writing_ those fields
> eventually in prctl_set_mm(), yuck!

yes, there members are modified under read-lock and initially
i didn't see any problem with that except one can have inconsistent
statistics output because another process modified these fields
(we validate that new members are having sane values at least
 in new interface and after your first patch). But now I think
that down_write may be more suitable here.

	Cyrill

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-27 22:48       ` Cyrill Gorcunov
@ 2015-05-27 23:12         ` Alexey Dobriyan
  2015-05-28  6:59           ` Cyrill Gorcunov
  0 siblings, 1 reply; 14+ messages in thread
From: Alexey Dobriyan @ 2015-05-27 23:12 UTC (permalink / raw)
  To: Cyrill Gorcunov; +Cc: akpm, linux-kernel, jarod, jstancek

On Thu, May 28, 2015 at 01:48:25AM +0300, Cyrill Gorcunov wrote:
> On Thu, May 28, 2015 at 01:29:42AM +0300, Alexey Dobriyan wrote:
> > > > +
> > > > +	page = (char *)__get_free_page(GFP_TEMPORARY);
> > > > +	if (!page) {
> > > > +		rv = -ENOMEM;
> > > > +		goto out_mmput;
> > > > +	}
> > > > +
> > > > +	down_read(&mm->mmap_sem);
> > > > +	arg_start = mm->arg_start;
> > > > +	arg_end = mm->arg_end;
> > > > +	env_start = mm->env_start;
> > > > +	env_end = mm->env_end;
> > > > +	up_read(&mm->mmap_sem);
> > > 
> > > Could you please explain why this down/up is needed?
> > 
> > Code is written this way to get constistent snapshot of data.
> 
> it does not. you fetch data into local variables which is the
> same as simply read them locklessly in general (because later
> you refer to local vars).

It is snapshot w.r.t getting both pairs not snapshot w.r.t atomicity or
something (unsigned long access is atomic after all). Once down_write()
is used in the other place, it even becomes obviously correct code!

> > If you look at PR_SET_MM_* code, you'll notice down_read(&mm->mmap_sem)
> > as well which is a separate bug because you're _writing_ those fields
> > eventually in prctl_set_mm(), yuck!
> 
> yes, there members are modified under read-lock and initially
> i didn't see any problem with that except one can have inconsistent
> statistics output because another process modified these fields
> (we validate that new members are having sane values at least
>  in new interface and after your first patch). But now I think
> that down_write may be more suitable here.
> 
> 	Cyrill

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-27 23:12         ` Alexey Dobriyan
@ 2015-05-28  6:59           ` Cyrill Gorcunov
  2015-05-28 11:14             ` Alexey Dobriyan
  0 siblings, 1 reply; 14+ messages in thread
From: Cyrill Gorcunov @ 2015-05-28  6:59 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: akpm, linux-kernel, jarod, jstancek

On Thu, May 28, 2015 at 02:12:07AM +0300, Alexey Dobriyan wrote:
> > > > 
> > > > Could you please explain why this down/up is needed?
> > > 
> > > Code is written this way to get constistent snapshot of data.
> > 
> > it does not. you fetch data into local variables which is the
> > same as simply read them locklessly in general (because later
> > you refer to local vars).
> 
> It is snapshot w.r.t getting both pairs not snapshot w.r.t atomicity or
> something (unsigned long access is atomic after all). Once down_write()
> is used in the other place, it even becomes obviously correct code!

Not at all. It is correct if and only if you're operating under lock
taken, once you fetch the pair and left the lock it simply local copies
of values the descriptor had when lock was taken.

	Cyrill

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-28  6:59           ` Cyrill Gorcunov
@ 2015-05-28 11:14             ` Alexey Dobriyan
  2015-05-28 11:35               ` Cyrill Gorcunov
  0 siblings, 1 reply; 14+ messages in thread
From: Alexey Dobriyan @ 2015-05-28 11:14 UTC (permalink / raw)
  To: Cyrill Gorcunov; +Cc: Andrew Morton, Linux Kernel, Jarod Wilson, Jan Stancek

On Thu, May 28, 2015 at 9:59 AM, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> On Thu, May 28, 2015 at 02:12:07AM +0300, Alexey Dobriyan wrote:
>> > > >
>> > > > Could you please explain why this down/up is needed?
>> > >
>> > > Code is written this way to get constistent snapshot of data.
>> >
>> > it does not. you fetch data into local variables which is the
>> > same as simply read them locklessly in general (because later
>> > you refer to local vars).
>>
>> It is snapshot w.r.t getting both pairs not snapshot w.r.t atomicity or
>> something (unsigned long access is atomic after all). Once down_write()
>> is used in the other place, it even becomes obviously correct code!
>
> Not at all. It is correct if and only if you're operating under lock
> taken, once you fetch the pair and left the lock it simply local copies
> of values the descriptor had when lock was taken.

Yes, and?

You do not complain that signal statistics is collected
under sighand lock but printed for /proc/*/status without, do you?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-28 11:14             ` Alexey Dobriyan
@ 2015-05-28 11:35               ` Cyrill Gorcunov
  2015-05-28 12:00                 ` Cyrill Gorcunov
  0 siblings, 1 reply; 14+ messages in thread
From: Cyrill Gorcunov @ 2015-05-28 11:35 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: Andrew Morton, Linux Kernel, Jarod Wilson, Jan Stancek

On Thu, May 28, 2015 at 02:14:41PM +0300, Alexey Dobriyan wrote:
> >>
> >> It is snapshot w.r.t getting both pairs not snapshot w.r.t atomicity or
> >> something (unsigned long access is atomic after all). Once down_write()
> >> is used in the other place, it even becomes obviously correct code!
> >
> > Not at all. It is correct if and only if you're operating under lock
> > taken, once you fetch the pair and left the lock it simply local copies
> > of values the descriptor had when lock was taken.
> 
> Yes, and?
> 
> You do not complain that signal statistics is collected
> under sighand lock but printed for /proc/*/status without, do you?

They are different, in signal statistics we fetch the _complete_
entries, ie full copies of data. In turn you fetch only pointers
to data. Don't you see the difference? IOW, prctl modifies members
under the lock testing both the pointers and the data they points
to are valid _under_ the lock, but you instead fetch the addresses
under the lock, leave lock and continue operating with addresses,
which is wrong imho.

(I don't see some serious problem here because in worst scenario
 (if someone set up bad pointers here) we simply get page fault
 or some related error for application, not the kernel itself,
 so I won't insist).

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-27 21:49 ` [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline Alexey Dobriyan
  2015-05-27 22:14   ` Cyrill Gorcunov
@ 2015-05-28 11:57   ` Jan Stancek
  1 sibling, 0 replies; 14+ messages in thread
From: Jan Stancek @ 2015-05-28 11:57 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: akpm, linux-kernel, gorcunov, jarod





----- Original Message -----
> From: "Alexey Dobriyan" <adobriyan@gmail.com>
> To: akpm@linux-foundation.org
> Cc: linux-kernel@vger.kernel.org, gorcunov@openvz.org, jarod@redhat.com, jstancek@redhat.com
> Sent: Wednesday, 27 May, 2015 11:49:53 PM
> Subject: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
> 
> /proc/$PID/cmdline truncates output at PAGE_SIZE. It is easy to see with
> 
> 	$ cat /proc/self/cmdline $(seq 1037) 2>/dev/null
> 
> However, command line size was never limited to PAGE_SIZE but to 128 KB and
> relatively recently limitation was removed altogether.
> 
> People noticed and ask questions:
> http://stackoverflow.com/questions/199130/how-do-i-increase-the-proc-pid-cmdline-4096-byte-limit
> 
> seq file interface is not OK, because it kmalloc's for whole output and
> open + read(, 1) + sleep will pin arbitrary amounts of kernel memory.
> To not do that, limit must be imposed which is incompatible with
> arbitrary sized command lines.
> 
> I apologize for hairy code, but this it direct consequence of command line
> layout in memory and hacks to support things like "init [3]".
> 
> The loops are "unrolled" otherwise it is either macros which hide
> control flow or functions with 7-8 arguments with equal line count.
> 
> There should be real setproctitle(2) or something.
> 
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> Tested-by: Jarod Wilson <jarod@redhat.com>
> Acked-by: Jarod Wilson <jarod@redhat.com>
> ---
> 
> 	v5: fix BUG_ON(env_start > env_end)

I can no longer trigger this BUG_ON() with v5 of the patch.

Regards,
Jan

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline
  2015-05-28 11:35               ` Cyrill Gorcunov
@ 2015-05-28 12:00                 ` Cyrill Gorcunov
  0 siblings, 0 replies; 14+ messages in thread
From: Cyrill Gorcunov @ 2015-05-28 12:00 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: Andrew Morton, Linux Kernel, Jarod Wilson, Jan Stancek

On Thu, May 28, 2015 at 02:35:07PM +0300, Cyrill Gorcunov wrote:
> On Thu, May 28, 2015 at 02:14:41PM +0300, Alexey Dobriyan wrote:
> > >>
> > >> It is snapshot w.r.t getting both pairs not snapshot w.r.t atomicity or
> > >> something (unsigned long access is atomic after all). Once down_write()
> > >> is used in the other place, it even becomes obviously correct code!
> > >
> > > Not at all. It is correct if and only if you're operating under lock
> > > taken, once you fetch the pair and left the lock it simply local copies
> > > of values the descriptor had when lock was taken.
> > 
> > Yes, and?
> > 
> > You do not complain that signal statistics is collected
> > under sighand lock but printed for /proc/*/status without, do you?
> 

After thinking more, right, you're correct, sorry for noise.

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2015-05-28 12:00 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-27 21:47 [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks Alexey Dobriyan
2015-05-27 21:49 ` [PATCH 2/2] proc: fix PAGE_SIZE limit of /proc/$PID/cmdline Alexey Dobriyan
2015-05-27 22:14   ` Cyrill Gorcunov
2015-05-27 22:29     ` Alexey Dobriyan
2015-05-27 22:48       ` Cyrill Gorcunov
2015-05-27 23:12         ` Alexey Dobriyan
2015-05-28  6:59           ` Cyrill Gorcunov
2015-05-28 11:14             ` Alexey Dobriyan
2015-05-28 11:35               ` Cyrill Gorcunov
2015-05-28 12:00                 ` Cyrill Gorcunov
2015-05-28 11:57   ` Jan Stancek
2015-05-27 22:02 ` [PATCH 1/2] prctl: more prctl(PR_SET_MM_*) checks Cyrill Gorcunov
2015-05-27 22:12   ` Alexey Dobriyan
2015-05-27 22:20     ` Cyrill Gorcunov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).