linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
@ 2006-06-23 10:54 Peter Zijlstra
  2006-06-26 16:57 ` Andrew Morton
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2006-06-23 10:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Andrew Morton, Arjan van de Ven, Pavel Machek

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

Some folks find 128KB of env+arg space too little. Solaris provides them with
1MB. Manually changing MAX_ARG_PAGES worked for them so far, however they
would like to run the supported vendor kernel.

In the interest of not penalising everybody with the overhead of just
setting it larger, provide a sysctl to change it.

Compiles and boots on i386.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

---
 arch/ia64/ia32/binfmt_elf32.c  |    4 +--
 arch/mips/kernel/sysirix.c     |    2 -
 arch/x86_64/ia32/ia32_binfmt.c |    4 +--
 fs/binfmt_elf.c                |   10 +++++----
 fs/binfmt_elf_fdpic.c          |   15 +++++++------
 fs/binfmt_flat.c               |    2 -
 fs/compat.c                    |   18 +++++++++++++---
 fs/exec.c                      |   44 +++++++++++++++++++++++++++++------------
 include/linux/binfmts.h        |   14 ++++---------
 include/linux/sysctl.h         |    1 
 kernel/sysctl.c                |   14 +++++++++++++
 11 files changed, 86 insertions(+), 42 deletions(-)

Index: linux-2.6/fs/binfmt_elf.c
===================================================================
--- linux-2.6.orig/fs/binfmt_elf.c	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/fs/binfmt_elf.c	2006-06-14 16:00:36.000000000 +0200
@@ -255,8 +255,9 @@ create_elf_tables(struct linux_binprm *b
 	while (argc-- > 0) {
 		size_t len;
 		__put_user((elf_addr_t)p, argv++);
-		len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES);
-		if (!len || len > PAGE_SIZE*MAX_ARG_PAGES)
+		len = strnlen_user((void __user *)p,
+				PAGE_SIZE*bprm->max_arg_pages);
+		if (!len || len > PAGE_SIZE*bprm->max_arg_pages)
 			return 0;
 		p += len;
 	}
@@ -266,8 +267,9 @@ create_elf_tables(struct linux_binprm *b
 	while (envc-- > 0) {
 		size_t len;
 		__put_user((elf_addr_t)p, envp++);
-		len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES);
-		if (!len || len > PAGE_SIZE*MAX_ARG_PAGES)
+		len = strnlen_user((void __user *)p,
+				PAGE_SIZE*bprm->max_arg_pages);
+		if (!len || len > PAGE_SIZE*bprm->max_arg_pages)
 			return 0;
 		p += len;
 	}
Index: linux-2.6/fs/binfmt_elf_fdpic.c
===================================================================
--- linux-2.6.orig/fs/binfmt_elf_fdpic.c	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/fs/binfmt_elf_fdpic.c	2006-06-14 16:00:36.000000000 +0200
@@ -578,14 +578,15 @@ static int create_elf_fdpic_tables(struc
 #ifdef CONFIG_MMU
 	current->mm->arg_start = bprm->p;
 #else
-	current->mm->arg_start = current->mm->start_stack - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p);
+	current->mm->arg_start = current->mm->start_stack -
+		(bprm->max_arg_pages * PAGE_SIZE - bprm->p);
 #endif
 
 	p = (char *) current->mm->arg_start;
 	for (loop = bprm->argc; loop > 0; loop--) {
 		__put_user((elf_caddr_t) p, argv++);
-		len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES);
-		if (!len || len > PAGE_SIZE * MAX_ARG_PAGES)
+		len = strnlen_user(p, PAGE_SIZE * bprm->max_arg_pages);
+		if (!len || len > PAGE_SIZE * bprm->max_arg_pages)
 			return -EINVAL;
 		p += len;
 	}
@@ -596,8 +597,8 @@ static int create_elf_fdpic_tables(struc
 	current->mm->env_start = (unsigned long) p;
 	for (loop = bprm->envc; loop > 0; loop--) {
 		__put_user((elf_caddr_t)(unsigned long) p, envp++);
-		len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES);
-		if (!len || len > PAGE_SIZE * MAX_ARG_PAGES)
+		len = strnlen_user(p, PAGE_SIZE * bprm->max_arg_pages);
+		if (!len || len > PAGE_SIZE * bprm->max_arg_pages)
 			return -EINVAL;
 		p += len;
 	}
@@ -623,7 +624,7 @@ static int elf_fdpic_transfer_args_to_st
 	stop = bprm->p >> PAGE_SHIFT;
 	sp = *_sp;
 
-	for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
+	for (index = bprm->max_arg_pages - 1; index >= stop; index--) {
 		src = kmap(bprm->page[index]);
 		sp -= PAGE_SIZE;
 		if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0)
@@ -633,7 +634,7 @@ static int elf_fdpic_transfer_args_to_st
 			goto out;
 	}
 
-	*_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15;
+	*_sp = (*_sp - (bprm->max_arg_pages * PAGE_SIZE - bprm->p)) & ~15;
 
  out:
 	return ret;
Index: linux-2.6/fs/binfmt_flat.c
===================================================================
--- linux-2.6.orig/fs/binfmt_flat.c	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/fs/binfmt_flat.c	2006-06-14 16:00:36.000000000 +0200
@@ -840,7 +840,7 @@ static int load_flat_binary(struct linux
 	 * pedantic and include space for the argv/envp array as it may have
 	 * a lot of entries.
 	 */
-#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *))
+#define TOP_OF_ARGS (PAGE_SIZE * bprm->max_arg_pages - sizeof(void *))
 	stack_len = TOP_OF_ARGS - bprm->p;             /* the strings */
 	stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
 	stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
Index: linux-2.6/fs/compat.c
===================================================================
--- linux-2.6.orig/fs/compat.c	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/fs/compat.c	2006-06-14 16:04:11.000000000 +0200
@@ -1476,7 +1476,7 @@ static inline void free_arg_pages(struct
 {
 	int i;
 
-	for (i = 0; i < MAX_ARG_PAGES; i++) {
+	for (i = 0; i < bprm->max_arg_pages; i++) {
 		if (bprm->page[i])
 			__free_page(bprm->page[i]);
 		bprm->page[i] = NULL;
@@ -1504,14 +1504,20 @@ int compat_do_execve(char * filename,
 	if (!bprm)
 		goto out_ret;
 
+	bprm->max_arg_pages = max_arg_pages;
+	bprm->page = kzalloc(bprm->max_arg_pages*sizeof(struct page *),
+			GFP_KERNEL);
+	if (!bprm->page)
+		goto out_kfree;
+
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
-		goto out_kfree;
+		goto out_kfree_page;
 
 	sched_exec();
 
-	bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
+	bprm->p = PAGE_SIZE*bprm->max_arg_pages-sizeof(void *);
 	bprm->file = file;
 	bprm->filename = filename;
 	bprm->interp = filename;
@@ -1560,13 +1566,14 @@ int compat_do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
+		kfree(bprm->page);
 		kfree(bprm);
 		return retval;
 	}
 
 out:
 	/* Something went wrong, return the inode and free the argument pages*/
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
+	for (i = 0 ; i < bprm->max_arg_pages ; i++) {
 		struct page * page = bprm->page[i];
 		if (page)
 			__free_page(page);
@@ -1585,6 +1592,9 @@ out_file:
 		fput(bprm->file);
 	}
 
+out_kfree_page:
+	kfree(bprm->page);
+
 out_kfree:
 	kfree(bprm);
 
Index: linux-2.6/fs/exec.c
===================================================================
--- linux-2.6.orig/fs/exec.c	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/fs/exec.c	2006-06-14 16:03:43.000000000 +0200
@@ -64,6 +64,16 @@ int suid_dumpable = 0;
 EXPORT_SYMBOL(suid_dumpable);
 /* The maximal length of core_pattern is also specified in sysctl.c */
 
+/*
+ * MAX_ARG_PAGES defines the number of pages allocated for arguments
+ * and envelope for the new program. 32 should suffice, this gives
+ * a maximum env+arg of 128kB w/4KB pages!
+ */
+int max_arg_pages = 32;
+int max_arg_pages_min = 32;
+int max_arg_pages_max = PAGE_SIZE/sizeof(struct page *);
+EXPORT_SYMBOL(max_arg_pages);
+
 static struct linux_binfmt *formats;
 static DEFINE_RWLOCK(binfmt_lock);
 
@@ -355,7 +365,7 @@ int setup_arg_pages(struct linux_binprm 
 
 	/* Start by shifting all the pages down */
 	i = 0;
-	for (j = 0; j < MAX_ARG_PAGES; j++) {
+	for (j = 0; j < bprm->max_arg_pages; j++) {
 		struct page *page = bprm->page[j];
 		if (!page)
 			continue;
@@ -388,10 +398,11 @@ int setup_arg_pages(struct linux_binprm 
 	arg_size = i << PAGE_SHIFT;
 
 	/* zero pages that were copied above */
-	while (i < MAX_ARG_PAGES)
+	while (i < bprm->max_arg_pages)
 		bprm->page[i++] = NULL;
 #else
-	stack_base = arch_align_stack(stack_top - MAX_ARG_PAGES*PAGE_SIZE);
+	stack_base = arch_align_stack(stack_top -
+			bprm->max_arg_pages*PAGE_SIZE);
 	stack_base = PAGE_ALIGN(stack_base);
 	bprm->p += stack_base;
 	mm->arg_start = bprm->p;
@@ -439,7 +450,7 @@ int setup_arg_pages(struct linux_binprm 
 		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
 	}
 
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
+	for (i = 0 ; i < bprm->max_arg_pages ; i++) {
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
@@ -462,7 +473,7 @@ static inline void free_arg_pages(struct
 {
 	int i;
 
-	for (i = 0; i < MAX_ARG_PAGES; i++) {
+	for (i = 0; i < bprm->max_arg_pages; i++) {
 		if (bprm->page[i])
 			__free_page(bprm->page[i]);
 		bprm->page[i] = NULL;
@@ -1058,7 +1069,7 @@ int search_binary_handler(struct linux_b
 		fput(bprm->file);
 		bprm->file = NULL;
 
-	        loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
+	        loader = PAGE_SIZE*bprm->max_arg_pages-sizeof(void *);
 
 		file = open_exec("/sbin/loader");
 		retval = PTR_ERR(file);
@@ -1153,14 +1164,20 @@ int do_execve(char * filename,
 	if (!bprm)
 		goto out_ret;
 
+	bprm->max_arg_pages = max_arg_pages;
+	bprm->page = kzalloc(bprm->max_arg_pages*sizeof(struct page*),
+			GFP_KERNEL);
+	if (!bprm->page)
+		goto out_kfree;
+
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
-		goto out_kfree;
+		goto out_kfree_page;
 
 	sched_exec();
 
-	bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
+	bprm->p = PAGE_SIZE*bprm->max_arg_pages-sizeof(void *);
 
 	bprm->file = file;
 	bprm->filename = filename;
@@ -1210,16 +1227,16 @@ int do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
+		kfree(bprm->page);
 		kfree(bprm);
 		return retval;
 	}
 
 out:
 	/* Something went wrong, return the inode and free the argument pages*/
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
-		struct page * page = bprm->page[i];
-		if (page)
-			__free_page(page);
+	for (i = 0; i < bprm->max_arg_pages; i++) {
+		if (bprm->page[i])
+			__free_page(bprm->page[i]);
 	}
 
 	if (bprm->security)
@@ -1235,6 +1252,9 @@ out_file:
 		fput(bprm->file);
 	}
 
+out_kfree_page:
+	kfree(bprm->page);
+
 out_kfree:
 	kfree(bprm);
 
Index: linux-2.6/include/linux/binfmts.h
===================================================================
--- linux-2.6.orig/include/linux/binfmts.h	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/include/linux/binfmts.h	2006-06-14 16:00:36.000000000 +0200
@@ -5,13 +5,6 @@
 
 struct pt_regs;
 
-/*
- * MAX_ARG_PAGES defines the number of pages allocated for arguments
- * and envelope for the new program. 32 should suffice, this gives
- * a maximum env+arg of 128kB w/4KB pages!
- */
-#define MAX_ARG_PAGES 32
-
 /* sizeof(linux_binprm->buf) */
 #define BINPRM_BUF_SIZE 128
 
@@ -20,9 +13,10 @@ struct pt_regs;
 /*
  * This structure is used to hold the arguments that are used when loading binaries.
  */
-struct linux_binprm{
+struct linux_binprm {
 	char buf[BINPRM_BUF_SIZE];
-	struct page *page[MAX_ARG_PAGES];
+	struct page **page;
+	int max_arg_pages;
 	struct mm_struct *mm;
 	unsigned long p; /* current top of mem */
 	int sh_bang;
@@ -40,6 +34,8 @@ struct linux_binprm{
 	unsigned long loader, exec;
 };
 
+extern int max_arg_pages;
+
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
 
Index: linux-2.6/include/linux/sysctl.h
===================================================================
--- linux-2.6.orig/include/linux/sysctl.h	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/include/linux/sysctl.h	2006-06-14 16:00:36.000000000 +0200
@@ -148,6 +148,7 @@ enum
 	KERN_SPIN_RETRY=70,	/* int: number of spinlock retries */
 	KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
 	KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
+	KERN_MAX_ARG_PAGES=73, /* int: max env+arg in pages */
 };
 

Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/kernel/sysctl.c	2006-06-14 16:00:36.000000000 +0200
@@ -131,6 +131,9 @@ extern int acct_parm[];
 extern int no_unaligned_warning;
 #endif
 
+extern int max_arg_pages;
+extern int max_arg_pages_min, max_arg_pages_max;
+
 static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -683,6 +686,17 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= KERN_MAX_ARG_PAGES,
+		.procname	= "max_arg_pages",
+		.data		= &max_arg_pages,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &max_arg_pages_min,
+		.extra2		= &max_arg_pages_max,
+	},
 	{ .ctl_name = 0 }
 };
 
Index: linux-2.6/arch/ia64/ia32/binfmt_elf32.c
===================================================================
--- linux-2.6.orig/arch/ia64/ia32/binfmt_elf32.c	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/arch/ia64/ia32/binfmt_elf32.c	2006-06-14 16:00:36.000000000 +0200
@@ -207,7 +207,7 @@ ia32_setup_arg_pages (struct linux_binpr
 	struct mm_struct *mm = current->mm;
 	int i, ret;
 
-	stack_base = IA32_STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;
+	stack_base = IA32_STACK_TOP - bprm->max_arg_pages*PAGE_SIZE;
 	mm->arg_start = bprm->p + stack_base;
 
 	bprm->p += stack_base;
@@ -242,7 +242,7 @@ ia32_setup_arg_pages (struct linux_binpr
 		current->mm->stack_vm = current->mm->total_vm = vma_pages(mpnt);
 	}
 
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
+	for (i = 0 ; i < bprm->max_arg_pages ; i++) {
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
Index: linux-2.6/arch/mips/kernel/sysirix.c
===================================================================
--- linux-2.6.orig/arch/mips/kernel/sysirix.c	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/arch/mips/kernel/sysirix.c	2006-06-14 16:16:12.000000000 +0200
@@ -344,7 +344,7 @@ asmlinkage int irix_syssgi(struct pt_reg
 	case SGI_SYSCONF: {
 		switch(regs->regs[base + 5]) {
 		case 1:
-			retval = (MAX_ARG_PAGES >> 4); /* XXX estimate... */
+			retval = (max_arg_pages >> 4); /* XXX estimate... */
 			goto out;
 		case 2:
 			retval = max_threads;
Index: linux-2.6/arch/x86_64/ia32/ia32_binfmt.c
===================================================================
--- linux-2.6.orig/arch/x86_64/ia32/ia32_binfmt.c	2006-06-14 15:56:40.000000000 +0200
+++ linux-2.6/arch/x86_64/ia32/ia32_binfmt.c	2006-06-14 16:00:36.000000000 +0200
@@ -339,7 +339,7 @@ int ia32_setup_arg_pages(struct linux_bi
 	struct mm_struct *mm = current->mm;
 	int i, ret;
 
-	stack_base = stack_top - MAX_ARG_PAGES * PAGE_SIZE;
+	stack_base = stack_top - bprm->max_arg_pages * PAGE_SIZE;
 	mm->arg_start = bprm->p + stack_base;
 
 	bprm->p += stack_base;
@@ -374,7 +374,7 @@ int ia32_setup_arg_pages(struct linux_bi
 		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
 	} 
 
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
+	for (i = 0 ; i < bprm->max_arg_pages ; i++) {
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-23 10:54 [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable Peter Zijlstra
@ 2006-06-26 16:57 ` Andrew Morton
  2006-06-26 17:21   ` Linus Torvalds
  0 siblings, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2006-06-26 16:57 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, torvalds, arjan, pavel

On Fri, 23 Jun 2006 12:54:49 +0200
Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Some folks find 128KB of env+arg space too little. Solaris provides them with
> 1MB. Manually changing MAX_ARG_PAGES worked for them so far, however they
> would like to run the supported vendor kernel.
> 
> In the interest of not penalising everybody with the overhead of just
> setting it larger, provide a sysctl to change it.
> 
> Compiles and boots on i386.

AFAICS, the main downside of simply increasing MAX_ARG_PAGES is that
fixed-size array in `struct linux_binprm'.  You've solved that via kmalloc,
so can we avoid the sysctl?  We can now increase MAX_ARG_PAGES to something
ridiculous with basically no cost?  It's swappable memory and should be
limited by the RLIMIT_RSS which we don't implement ;)

Also, I'm not sure that we need max_arg_pages_min and max_arg_pages_max -
it's a privileged operation and we can just let root decide whether or not
to screw up the machine.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-26 16:57 ` Andrew Morton
@ 2006-06-26 17:21   ` Linus Torvalds
  2006-06-26 22:35     ` Ingo Molnar
  0 siblings, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2006-06-26 17:21 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Peter Zijlstra, linux-kernel, arjan, pavel



On Mon, 26 Jun 2006, Andrew Morton wrote:
> 
> AFAICS, the main downside of simply increasing MAX_ARG_PAGES is that
> fixed-size array in `struct linux_binprm'.  You've solved that via kmalloc,
> so can we avoid the sysctl?  We can now increase MAX_ARG_PAGES to something
> ridiculous with basically no cost?  It's swappable memory and should be
> limited by the RLIMIT_RSS which we don't implement ;)

It's _not_ swappable memory, sadly. It's swappable before it hits 
linux_binprm, and it's swappable after it's mapped into the destination, 
but it is _not_ swappable while it's held in the bprm->page[] array ;/

I totally re-organized how execve() allocates the new mm at an execve 
several years ago (it used to re-use the old MM if it could), and that was 
so that we count just remove the brpm->page array, and just install the 
pages directly into the destination.

That was in 2002. I never actually got around to doing it ;(.

		Linus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-26 17:21   ` Linus Torvalds
@ 2006-06-26 22:35     ` Ingo Molnar
  2006-06-26 22:46       ` Ingo Molnar
  2006-06-26 23:02       ` Linus Torvalds
  0 siblings, 2 replies; 11+ messages in thread
From: Ingo Molnar @ 2006-06-26 22:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Peter Zijlstra, linux-kernel, arjan, pavel,
	Ulrich Drepper


* Linus Torvalds <torvalds@osdl.org> wrote:

> I totally re-organized how execve() allocates the new mm at an execve 
> several years ago (it used to re-use the old MM if it could), and that 
> was so that we count just remove the brpm->page array, and just 
> install the pages directly into the destination.
> 
> That was in 2002. I never actually got around to doing it ;(.

i thought about your "map execve pages directly into target" (since the 
source gets destroyed anyway) suggestion back then, and unfortunately it 
gets quite complex.

Firstly, if setenv is done and the array of strings gets larger, glibc 
realloc()s it and the layout of the environment gets 'fragmented'. 
Ulrich was uneasy about passing a fragmented environment to the target 
task - it's not sure that no app would break. Secondly, for security 
reasons we have to memset all the memory around fragmented strings. So 
we might end up doing _alot_ of memsetting in some cases, if the string 
space happens to be fragmented. So while the current method is slow and 
uses persistent memory, it at least "compresses" the layout of the 
environment (and arguments) at every exec() time and thus avoids these 
sorts of problems.

And this is a real problem for real applications and is being complained 
about alot by shops that do alot of development and have scrips around 
large filesystem hierarchies. (and who got used to their scripts working 
on other unices just fine)

Lets at least give root the chance to increase this limit and go with 
the dumb and easy patch i posted years ago. It worked just fine - and 
it's not like root cannot cause lots of persistent memory to be 
allocated by processes: for example by setting the per-socket wmem limit 
to 10MB. Please? I personally run into this stupid limit every couple of 
days during development while working with the kernel tree, just do a 
simple thing like in the kernel tree's top directory:

 $ ls -lS `find .`
 -bash: /bin/ls: Argument list too long

The phrase "that ridiculous 128K limit" often comes to my mind :-|

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-26 22:35     ` Ingo Molnar
@ 2006-06-26 22:46       ` Ingo Molnar
  2006-06-26 23:02       ` Linus Torvalds
  1 sibling, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2006-06-26 22:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Peter Zijlstra, linux-kernel, arjan, pavel,
	Ulrich Drepper


* Ingo Molnar <mingo@elte.hu> wrote:

> * Linus Torvalds <torvalds@osdl.org> wrote:
> 
> > I totally re-organized how execve() allocates the new mm at an execve 
> > several years ago (it used to re-use the old MM if it could), and that 
> > was so that we count just remove the brpm->page array, and just 
> > install the pages directly into the destination.
> > 
> > That was in 2002. I never actually got around to doing it ;(.
> 
> i thought about your "map execve pages directly into target" (since the 
> source gets destroyed anyway) suggestion back then, and unfortunately it 
> gets quite complex.
> 
> Firstly, if setenv is done and the array of strings gets larger, glibc 
> realloc()s it and the layout of the environment gets 'fragmented'. 
> Ulrich was uneasy about passing a fragmented environment to the target 
> task - it's not sure that no app would break. Secondly, for security 
> reasons we have to memset all the memory around fragmented strings. So 
> we might end up doing _alot_ of memsetting in some cases, if the string 
> space happens to be fragmented. So while the current method is slow and 
> uses persistent memory, it at least "compresses" the layout of the 
> environment (and arguments) at every exec() time and thus avoids these 
> sorts of problems.
> 
> And this is a real problem for real applications and is being complained 
> about alot by shops that do alot of development and have scrips around 
> large filesystem hierarchies. (and who got used to their scripts working 
> on other unices just fine)
> 
> Lets at least give root the chance to increase this limit and go with 
> the dumb and easy patch i posted years ago. [...]

it's almost 5 years old:

  http://people.redhat.com/mingo/execve-patches/exec-argsize-2.4.10-A3

( purely making the limit dynamic is not enough - bprm->pages needs to
  become kmalloc()ed, plus i added a bprm->nr_pages so that decreasing 
  the limit becomes safe too.)

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-26 22:35     ` Ingo Molnar
  2006-06-26 22:46       ` Ingo Molnar
@ 2006-06-26 23:02       ` Linus Torvalds
  2006-06-27 11:09         ` Ingo Molnar
  1 sibling, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2006-06-26 23:02 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Peter Zijlstra, linux-kernel, arjan, pavel,
	Ulrich Drepper



On Tue, 27 Jun 2006, Ingo Molnar wrote:
>
> i thought about your "map execve pages directly into target" (since the 
> source gets destroyed anyway) suggestion back then, and unfortunately it 
> gets quite complex.

No, you misunderstood.

I wasn't actually suggesting mapping pages directly from the source into 
the destination.  That is indeed horribly horribly complicated.

I really only wanted to avoid the "brpm->pages[]" indirection.

So right now, we copy the argument strings into new temporary pages and 
put them in the ->pages[] array.

The "copy argument strings into new temporary pages" part is _fine_. I 
wouldn't change that part at all.

I'd only really change the "into the ->pages[] array" part, and instead 
move them directly into the destination page tables.

Why?

Two reasons:
 - right now ->pages[] array is unswappable. Avoiding the temporary array 
   would allow the swapper to actually swap the pages out (no special 
   cases: it's a perfectly normal page table, it just hasn't actually 
   gotten activated yet).
 - And the whole reason for having a limited array  basically goes away 
   (the swappable thing is part of it, but the fact that the page tables 
   themselves are just a lot more extensible than the silly array is just 
   fundamentally a part of it too)

So it's literally just the array I'd get rid of. Instead of insertign the 
page into the array, just insert it directly into the page table with 
"install_arg_page()".

			Linus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-26 23:02       ` Linus Torvalds
@ 2006-06-27 11:09         ` Ingo Molnar
  2006-06-27 12:16           ` Arjan van de Ven
  2006-06-28  1:07           ` Linus Torvalds
  0 siblings, 2 replies; 11+ messages in thread
From: Ingo Molnar @ 2006-06-27 11:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Peter Zijlstra, linux-kernel, arjan, pavel,
	Ulrich Drepper


* Linus Torvalds <torvalds@osdl.org> wrote:

> On Tue, 27 Jun 2006, Ingo Molnar wrote:
> >
> > i thought about your "map execve pages directly into target" (since the 
> > source gets destroyed anyway) suggestion back then, and unfortunately it 
> > gets quite complex.
> 
> No, you misunderstood.
> 
> I wasn't actually suggesting mapping pages directly from the source 
> into the destination.  That is indeed horribly horribly complicated.

ok, that's good news :-)

>  - And the whole reason for having a limited array basically goes away
>    (the swappable thing is part of it, but the fact that the page 
>    tables themselves are just a lot more extensible than the silly 
>    array is just fundamentally a part of it too)
> 
> So it's literally just the array I'd get rid of. Instead of insertign 
> the page into the array, just insert it directly into the page table 
> with "install_arg_page()".

ok, but there are a few logistical issues:

at copy_strings_kernel() time we dont yet know where in the target VM to 
install the pages. A binformat might want to install all sorts of stuff 
on the stack first, before it constructs the envp and copies the strings 
themselves. So we dont know the precise alignment needed.

delaying the copying to setup_arg_pages() time does not seem to work 
either, because that gets called after the old MM has been destroyed.

[ delaying the copying will also change behavior in error cases - 
  instead of returning with an error if the string pointers are bad 
  we'll have to kill the execve()ing process. ]

am i missing something?

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-27 12:16           ` Arjan van de Ven
@ 2006-06-27 12:14             ` Ingo Molnar
  2006-07-18 12:28             ` Peter Zijlstra
  1 sibling, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2006-06-27 12:14 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Linus Torvalds, Andrew Morton, Peter Zijlstra, linux-kernel,
	pavel, Ulrich Drepper


* Arjan van de Ven <arjan@linux.intel.com> wrote:

> Ingo Molnar wrote:
> >at copy_strings_kernel() time we dont yet know where in the target VM to 
> >install the pages. A binformat might want to install all sorts of stuff 
> >on the stack first, before it constructs the envp and copies the strings 
> >themselves. So we dont know the precise alignment needed.
> >
> >delaying the copying to setup_arg_pages() time does not seem to work 
> >either, because that gets called after the old MM has been destroyed.
> >
> >[ delaying the copying will also change behavior in error cases - 
> >  instead of returning with an error if the string pointers are bad 
> >  we'll have to kill the execve()ing process. ]
> >
> >am i missing something?
> 
> we could always just have the binfmt use mremap() equivalent to move 
> it into the place it wants...

i dont think so, these things on the stack are not page aligned.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-27 11:09         ` Ingo Molnar
@ 2006-06-27 12:16           ` Arjan van de Ven
  2006-06-27 12:14             ` Ingo Molnar
  2006-07-18 12:28             ` Peter Zijlstra
  2006-06-28  1:07           ` Linus Torvalds
  1 sibling, 2 replies; 11+ messages in thread
From: Arjan van de Ven @ 2006-06-27 12:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Andrew Morton, Peter Zijlstra, linux-kernel,
	pavel, Ulrich Drepper

Ingo Molnar wrote:
> at copy_strings_kernel() time we dont yet know where in the target VM to 
> install the pages. A binformat might want to install all sorts of stuff 
> on the stack first, before it constructs the envp and copies the strings 
> themselves. So we dont know the precise alignment needed.
> 
> delaying the copying to setup_arg_pages() time does not seem to work 
> either, because that gets called after the old MM has been destroyed.
> 
> [ delaying the copying will also change behavior in error cases - 
>   instead of returning with an error if the string pointers are bad 
>   we'll have to kill the execve()ing process. ]
> 
> am i missing something?

we could always just have the  binfmt use mremap() equivalent to move it 
into the place it wants...

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-27 11:09         ` Ingo Molnar
  2006-06-27 12:16           ` Arjan van de Ven
@ 2006-06-28  1:07           ` Linus Torvalds
  1 sibling, 0 replies; 11+ messages in thread
From: Linus Torvalds @ 2006-06-28  1:07 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Peter Zijlstra, linux-kernel, arjan, pavel,
	Ulrich Drepper



On Tue, 27 Jun 2006, Ingo Molnar wrote:
> 
> at copy_strings_kernel() time we dont yet know where in the target VM to 
> install the pages. A binformat might want to install all sorts of stuff 
> on the stack first, before it constructs the envp and copies the strings 
> themselves. So we dont know the precise alignment needed.
> 
> delaying the copying to setup_arg_pages() time does not seem to work 
> either, because that gets called after the old MM has been destroyed.

Well, I think we could just change the place the MM is destroyed. We 
shouldn't destroy it until we're done with it ;)

However, I don't think that actually matters. The only thing we _need_ to 
know is what the stack top is going to be, and we know that fairly early. 
The fact that we _also_ add various magic flags onto the stack is 
independent of copying the strings themselves, since the flags ordering is 
not actually something that matters for the _strings_, but for the actual 
setup of the pointers _to_ the strings.

And that's actually where we could potentially be better off without the 
current page[] array, because we could just do it using the user space 
image (and the hardware support for a TLB) itself rather than walk the 
array by hand.

I don't think it's a huge deal. In the end, we'll have to put a limit on 
the argument space _somewhere_. I don't like the current page[] array, and 
it's silly to make the limit so low, but on the other hand, in order to 
avoid excessive time in the kernel by badly behaved apps, and because any 
good application _needs_ to be able to handle the limited argument space 
anyway, I actually suspect it really doesn't matter in the end.

			Linus

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable
  2006-06-27 12:16           ` Arjan van de Ven
  2006-06-27 12:14             ` Ingo Molnar
@ 2006-07-18 12:28             ` Peter Zijlstra
  1 sibling, 0 replies; 11+ messages in thread
From: Peter Zijlstra @ 2006-07-18 12:28 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, Linus Torvalds, Andrew Morton, linux-kernel, pavel,
	Ulrich Drepper

Hi,

So where are we on this? Some ppl (Linus, Ingo) suggested rather
involved solutions that would take some time to mature.

Could we run with the current patch, or one where the sysctl is taken
out and MAX_ARG_PAGES is fixed to PAGE_SIZE/sizeof(void*)?

Peter


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2006-07-18 12:41 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-06-23 10:54 [PATCH] binfmt: turn MAX_ARG_PAGES into a sysctl tunable Peter Zijlstra
2006-06-26 16:57 ` Andrew Morton
2006-06-26 17:21   ` Linus Torvalds
2006-06-26 22:35     ` Ingo Molnar
2006-06-26 22:46       ` Ingo Molnar
2006-06-26 23:02       ` Linus Torvalds
2006-06-27 11:09         ` Ingo Molnar
2006-06-27 12:16           ` Arjan van de Ven
2006-06-27 12:14             ` Ingo Molnar
2006-07-18 12:28             ` Peter Zijlstra
2006-06-28  1:07           ` Linus Torvalds

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).