linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness()
@ 2010-10-25  3:26 KOSAKI Motohiro
  2010-10-25  3:27 ` [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable" KOSAKI Motohiro
                   ` (3 more replies)
  0 siblings, 4 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-10-25  3:26 UTC (permalink / raw)
  To: Andrew Morton, Linus Torvalds; +Cc: kosaki.motohiro, LKML, linux-mm

The current oom_score_adj is completely broken because it is strongly bound
to Google's use case and ignores all others.

1) Priority inversion
   As Kamezawa-san pointed out, this breaks cgroup and LXC environments.
   He said:
	> Assume 2 processes, A and B, which have oom_score_adj of 300 and 0.
	> A uses 200M and B uses 1G of memory on a 4G system.
	>
	> Under the whole system:
	> 	A's score = (200M * 1000)/4G + 300 = 350
	> 	B's score = (1G * 1000)/4G = 250.
	>
	> In the cpuset, which has 2G of memory:
	> 	A's score = (200M * 1000)/2G + 300 = 400
	> 	B's score = (1G * 1000)/2G = 500
	>
	> This priority inversion doesn't happen in the current system.

2) Ratio-based points don't work on large machines
   oom_score_adj normalizes the oom score to the 0-1000 range,
   but if the machine has 1TB of memory, 1 point (i.e. 0.1%) means
   1GB. This is not suitable as a tuning parameter.
   As I said, a proportional-value-oriented tuning parameter carries
   a scalability risk.

3) No reason to implement ABI breakage.
   The old tuning parameter means:
	oom-score = oom-base-score x 2^oom_adj
   The new tuning parameter means:
	oom-score = oom-base-score + oom_score_adj / (totalram + totalswap)
   but "oom_score_adj / (totalram + totalswap)" can be calculated in
   userland too, because both totalram and totalswap are exposed via
   /proc. So there is no reason to introduce a funny new equation.

4) totalram-based normalization assumes a flat memory model.
   For example, on an asymmetric NUMA machine, fat-node memory and
   thin-node memory might have different weights.
   In other words, totalram-based priority is just one possible policy.
   A fixed, workload-dependent policy probably shouldn't be embedded in
   the kernel.

Therefore, this patch completely removes the *UGLY* total_pages
normalization. Googlers can calculate it in userland!

Cc: stable@kernel.org
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
 fs/proc/base.c        |   35 +++----------
 include/linux/oom.h   |   16 +-----
 include/linux/sched.h |    2 +-
 mm/oom_kill.c         |  142 ++++++++++++++++++++-----------------------------
 4 files changed, 69 insertions(+), 126 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index dc5d5f5..86c402e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -434,8 +434,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL,
-					totalram_pages + total_swap_pages);
+		points = oom_badness(task, NULL, NULL);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -1056,15 +1055,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 			current->comm, task_pid_nr(current),
 			task_pid_nr(task), task_pid_nr(task));
 	task->signal->oom_adj = oom_adjust;
-	/*
-	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
-	 * value is always attainable.
-	 */
-	if (task->signal->oom_adj == OOM_ADJUST_MAX)
-		task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
-	else
-		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
-								-OOM_DISABLE;
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 
@@ -1077,12 +1067,14 @@ static const struct file_operations proc_oom_adjust_operations = {
 	.llseek		= generic_file_llseek,
 };
 
+#define TMPBUFLEN 21
+
 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 					size_t count, loff_t *ppos)
 {
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	char buffer[PROC_NUMBUF];
-	int oom_score_adj = OOM_SCORE_ADJ_MIN;
+	char buffer[TMPBUFLEN];
+	long oom_score_adj = 0;
 	unsigned long flags;
 	size_t len;
 
@@ -1093,7 +1085,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 		unlock_task_sighand(task, &flags);
 	}
 	put_task_struct(task);
-	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+	len = snprintf(buffer, sizeof(buffer), "%ld\n", oom_score_adj);
 	return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
 
@@ -1101,7 +1093,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 					size_t count, loff_t *ppos)
 {
 	struct task_struct *task;
-	char buffer[PROC_NUMBUF];
+	char buffer[TMPBUFLEN];
 	unsigned long flags;
 	long oom_score_adj;
 	int err;
@@ -1115,9 +1107,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
 	if (err)
 		return -EINVAL;
-	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
-			oom_score_adj > OOM_SCORE_ADJ_MAX)
-		return -EINVAL;
 
 	task = get_proc_task(file->f_path.dentry->d_inode);
 	if (!task)
@@ -1134,15 +1123,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	}
 
 	task->signal->oom_score_adj = oom_score_adj;
-	/*
-	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
-	 * always attainable.
-	 */
-	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-		task->signal->oom_adj = OOM_DISABLE;
-	else
-		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
-							OOM_SCORE_ADJ_MAX;
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 	return count;
@@ -1155,7 +1135,6 @@ static const struct file_operations proc_oom_score_adj_operations = {
 };
 
 #ifdef CONFIG_AUDITSYSCALL
-#define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 				  size_t count, loff_t *ppos)
 {
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 5e3aa83..21006dc 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -12,13 +12,6 @@
 #define OOM_ADJUST_MIN (-16)
 #define OOM_ADJUST_MAX 15
 
-/*
- * /proc/<pid>/oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for
- * pid.
- */
-#define OOM_SCORE_ADJ_MIN	(-1000)
-#define OOM_SCORE_ADJ_MAX	1000
-
 #ifdef __KERNEL__
 
 #include <linux/sched.h>
@@ -40,8 +33,9 @@ enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
-extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
-			const nodemask_t *nodemask, unsigned long totalpages);
+/* The badness from the OOM killer */
+extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+				 const nodemask_t *nodemask);
 extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
@@ -62,10 +56,6 @@ static inline void oom_killer_enable(void)
 	oom_killer_disabled = false;
 }
 
-/* The badness from the OOM killer */
-extern unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
-		      const nodemask_t *nodemask, unsigned long uptime);
-
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
 /* sysctls */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 56154bb..74ed859 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -625,7 +625,7 @@ struct signal_struct {
 #endif
 
 	int oom_adj;		/* OOM kill score adjustment (bit shift) */
-	int oom_score_adj;	/* OOM kill score adjustment */
+	long oom_score_adj;	/* OOM kill score adjustment */
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4029583..d58925e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -143,55 +143,41 @@ static bool oom_unkillable_task(struct task_struct *p,
 /**
  * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
- * @totalpages: total present RAM allowed for page allocation
  *
  * The heuristic for determining which task to kill is made to be as simple and
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
-		      const nodemask_t *nodemask, unsigned long totalpages)
+unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+			  const nodemask_t *nodemask)
 {
-	int points;
+	unsigned long points;
+	unsigned long points_orig;
+	int oom_adj = p->signal->oom_adj;
+	long oom_score_adj = p->signal->oom_score_adj;
 
-	if (oom_unkillable_task(p, mem, nodemask))
-		return 0;
 
-	p = find_lock_task_mm(p);
-	if (!p)
+	if (oom_unkillable_task(p, mem, nodemask))
 		return 0;
-
-	/*
-	 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
-	 * need to be executed for something that cannot be killed.
-	 */
-	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-		task_unlock(p);
+	if (oom_adj == OOM_DISABLE)
 		return 0;
-	}
 
 	/*
 	 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
 	 * priority for oom killing.
 	 */
-	if (p->flags & PF_OOM_ORIGIN) {
-		task_unlock(p);
-		return 1000;
-	}
+	if (p->flags & PF_OOM_ORIGIN)
+		return ULONG_MAX;
 
-	/*
-	 * The memory controller may have a limit of 0 bytes, so avoid a divide
-	 * by zero, if necessary.
-	 */
-	if (!totalpages)
-		totalpages = 1;
+	p = find_lock_task_mm(p);
+	if (!p)
+		return 0;
 
 	/*
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss and swap space use.
 	 */
-	points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
-			totalpages;
+	points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS));
 	task_unlock(p);
 
 	/*
@@ -199,14 +185,26 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 	 * implementation used by LSMs.
 	 */
 	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
-		points -= 30;
+		points -= points / 32;
 
 	/*
-	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
-	 * either completely disable oom killing or always prefer a certain
-	 * task.
+	 * Adjust the score by oom_adj and oom_score_adj.
 	 */
-	points += p->signal->oom_score_adj;
+	points_orig = points;
+	points += oom_score_adj;
+	if ((oom_score_adj > 0) && (points < points_orig))
+		points = ULONG_MAX;	/* may be overflow */
+	if ((oom_score_adj < 0) && (points > points_orig))
+		points = 0;		/* may be underflow */
+
+	if (oom_adj) {
+		if (oom_adj > 0) {
+			if (!points)
+				points = 1;
+			points <<= oom_adj;
+		} else
+			points >>= -(oom_adj);
+	}
 
 	/*
 	 * Never return 0 for an eligible task that may be killed since it's
@@ -215,7 +213,7 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 	 */
 	if (points <= 0)
 		return 1;
-	return (points < 1000) ? points : 1000;
+	return points;
 }
 
 /*
@@ -223,17 +221,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
  */
 #ifdef CONFIG_NUMA
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-				gfp_t gfp_mask, nodemask_t *nodemask,
-				unsigned long *totalpages)
+				gfp_t gfp_mask, nodemask_t *nodemask)
 {
 	struct zone *zone;
 	struct zoneref *z;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	bool cpuset_limited = false;
-	int nid;
-
-	/* Default to all available memory */
-	*totalpages = totalram_pages + total_swap_pages;
 
 	if (!zonelist)
 		return CONSTRAINT_NONE;
@@ -250,33 +242,21 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 	 * the page allocator means a mempolicy is in effect.  Cpuset policy
 	 * is enforced in get_page_from_freelist().
 	 */
-	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
-		*totalpages = total_swap_pages;
-		for_each_node_mask(nid, *nodemask)
-			*totalpages += node_spanned_pages(nid);
+	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
 		return CONSTRAINT_MEMORY_POLICY;
-	}
 
 	/* Check this allocation failure is caused by cpuset's wall function */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 			high_zoneidx, nodemask)
 		if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
-			cpuset_limited = true;
+			return CONSTRAINT_CPUSET;
 
-	if (cpuset_limited) {
-		*totalpages = total_swap_pages;
-		for_each_node_mask(nid, cpuset_current_mems_allowed)
-			*totalpages += node_spanned_pages(nid);
-		return CONSTRAINT_CPUSET;
-	}
 	return CONSTRAINT_NONE;
 }
 #else
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-				gfp_t gfp_mask, nodemask_t *nodemask,
-				unsigned long *totalpages)
+				gfp_t gfp_mask, nodemask_t *nodemask)
 {
-	*totalpages = totalram_pages + total_swap_pages;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -287,16 +267,16 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned int *ppoints,
-		unsigned long totalpages, struct mem_cgroup *mem,
-		const nodemask_t *nodemask)
+static struct task_struct *select_bad_process(unsigned long *ppoints,
+					      struct mem_cgroup *mem,
+					      const nodemask_t *nodemask)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
 	*ppoints = 0;
 
 	for_each_process(p) {
-		unsigned int points;
+		unsigned long points;
 
 		if (oom_unkillable_task(p, mem, nodemask))
 			continue;
@@ -328,10 +308,10 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 				return ERR_PTR(-1UL);
 
 			chosen = p;
-			*ppoints = 1000;
+			*ppoints = ULONG_MAX;
 		}
 
-		points = oom_badness(p, mem, nodemask, totalpages);
+		points = oom_badness(p, mem, nodemask);
 		if (points > *ppoints) {
 			chosen = p;
 			*ppoints = points;
@@ -374,7 +354,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %3u     %3d         %5d %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %3u     %3d         %5ld %s\n",
 			task->pid, task_uid(task), task->tgid,
 			task->mm->total_vm, get_mm_rss(task->mm),
 			task_cpu(task), task->signal->oom_adj,
@@ -388,7 +368,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 {
 	task_lock(current);
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-		"oom_adj=%d, oom_score_adj=%d\n",
+		"oom_adj=%d, oom_score_adj=%ld\n",
 		current->comm, gfp_mask, order, current->signal->oom_adj,
 		current->signal->oom_score_adj);
 	cpuset_print_task_mems_allowed(current);
@@ -429,14 +409,13 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 #undef K
 
 static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-			    unsigned int points, unsigned long totalpages,
-			    struct mem_cgroup *mem, nodemask_t *nodemask,
-			    const char *message)
+			    unsigned long points, struct mem_cgroup *mem,
+			    nodemask_t *nodemask, const char *message)
 {
 	struct task_struct *victim = p;
 	struct task_struct *child;
 	struct task_struct *t = p;
-	unsigned int victim_points = 0;
+	unsigned long victim_points = 0;
 
 	if (printk_ratelimit())
 		dump_header(p, gfp_mask, order, mem, nodemask);
@@ -452,7 +431,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	}
 
 	task_lock(p);
-	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
+	pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n",
 		message, task_pid_nr(p), p->comm, points);
 	task_unlock(p);
 
@@ -464,13 +443,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 */
 	do {
 		list_for_each_entry(child, &t->children, sibling) {
-			unsigned int child_points;
+			unsigned long child_points;
 
 			/*
 			 * oom_badness() returns 0 if the thread is unkillable
 			 */
-			child_points = oom_badness(child, mem, nodemask,
-								totalpages);
+			child_points = oom_badness(child, mem, nodemask);
 			if (child_points > victim_points) {
 				victim = child;
 				victim_points = child_points;
@@ -508,19 +486,17 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 {
-	unsigned long limit;
-	unsigned int points = 0;
+	unsigned long points = 0;
 	struct task_struct *p;
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
-	limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, limit, mem, NULL);
+	p = select_bad_process(&points, mem, NULL);
 	if (!p || PTR_ERR(p) == -1UL)
 		goto out;
 
-	if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
+	if (oom_kill_process(p, gfp_mask, 0, points, mem, NULL,
 				"Memory cgroup out of memory"))
 		goto retry;
 out:
@@ -646,9 +622,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 {
 	const nodemask_t *mpol_mask;
 	struct task_struct *p;
-	unsigned long totalpages;
 	unsigned long freed = 0;
-	unsigned int points;
+	unsigned long points;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 	int killed = 0;
 
@@ -672,8 +647,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
 	 */
-	constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
-						&totalpages);
+	constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
 	mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
 	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
 
@@ -686,14 +660,14 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		 * non-zero, current could not be killed so we must fallback to
 		 * the tasklist scan.
 		 */
-		if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
+		if (!oom_kill_process(current, gfp_mask, order, 0,
 				NULL, nodemask,
 				"Out of memory (oom_kill_allocating_task)"))
 			goto out;
 	}
 
 retry:
-	p = select_bad_process(&points, totalpages, NULL, mpol_mask);
+	p = select_bad_process(&points, NULL, mpol_mask);
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -704,7 +678,7 @@ retry:
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+	if (oom_kill_process(p, gfp_mask, order, points, NULL,
 				nodemask, "Out of memory"))
 		goto retry;
 	killed = 1;
-- 
1.6.5.2




^ permalink raw reply related	[flat|nested] 109+ messages in thread

* [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-10-25  3:26 [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness() KOSAKI Motohiro
@ 2010-10-25  3:27 ` KOSAKI Motohiro
  2010-10-25 20:40   ` David Rientjes
  2010-10-25  3:28 ` [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct KOSAKI Motohiro
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-10-25  3:27 UTC (permalink / raw)
  To: Andrew Morton, Linus Torvalds; +Cc: kosaki.motohiro, LKML, linux-mm

oom_adj is used not only as a kernel knob but also as an
application interface. Adding a new knob is therefore not a good
reason to deprecate it. Don't do stupid things!

Also, after the previous patch, oom_score_adj can no longer be used to set
OOM_DISABLE. We still need the "echo -17 > /proc/<pid>/oom_adj" interface.

This reverts commit 51b1bd2ace1595b72956224deda349efa880b693.

Cc: stable@kernel.org
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
 Documentation/feature-removal-schedule.txt |   25 -------------------------
 Documentation/filesystems/proc.txt         |    3 ---
 fs/proc/base.c                             |    8 --------
 include/linux/oom.h                        |    3 ---
 4 files changed, 0 insertions(+), 39 deletions(-)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 9961f15..1cba5b8 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -151,31 +151,6 @@ Who:	Eric Biederman <ebiederm@xmission.com>
 
 ---------------------------
 
-What:	/proc/<pid>/oom_adj
-When:	August 2012
-Why:	/proc/<pid>/oom_adj allows userspace to influence the oom killer's
-	badness heuristic used to determine which task to kill when the kernel
-	is out of memory.
-
-	The badness heuristic has since been rewritten since the introduction of
-	this tunable such that its meaning is deprecated.  The value was
-	implemented as a bitshift on a score generated by the badness()
-	function that did not have any precise units of measure.  With the
-	rewrite, the score is given as a proportion of available memory to the
-	task allocating pages, so using a bitshift which grows the score
-	exponentially is, thus, impossible to tune with fine granularity.
-
-	A much more powerful interface, /proc/<pid>/oom_score_adj, was
-	introduced with the oom killer rewrite that allows users to increase or
-	decrease the badness() score linearly.  This interface will replace
-	/proc/<pid>/oom_adj.
-
-	A warning will be emitted to the kernel log if an application uses this
-	deprecated interface.  After it is printed once, future warnings will be
-	suppressed until the kernel is rebooted.
-
----------------------------
-
 What:	remove EXPORT_SYMBOL(kernel_thread)
 When:	August 2006
 Files:	arch/*/kernel/*_ksyms.c
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a6aca87..cf1295c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1285,9 +1285,6 @@ scaled linearly with /proc/<pid>/oom_score_adj.
 Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
 other with its scaled value.
 
-NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
-Documentation/feature-removal-schedule.txt.
-
 Caveat: when a parent task is selected, the oom killer will sacrifice any first
 generation children with seperate address spaces instead, if possible.  This
 avoids servers and important system daemons from being killed and loses the
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 86c402e..0d2ce21 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1046,14 +1046,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		return -EACCES;
 	}
 
-	/*
-	 * Warn that /proc/pid/oom_adj is deprecated, see
-	 * Documentation/feature-removal-schedule.txt.
-	 */
-	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, "
-			"please use /proc/%d/oom_score_adj instead.\n",
-			current->comm, task_pid_nr(current),
-			task_pid_nr(task), task_pid_nr(task));
 	task->signal->oom_adj = oom_adjust;
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 21006dc..394f2e6 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -2,9 +2,6 @@
 #define __INCLUDE_LINUX_OOM_H
 
 /*
- * /proc/<pid>/oom_adj is deprecated, see
- * Documentation/feature-removal-schedule.txt.
- *
  * /proc/<pid>/oom_adj set to -17 protects from the oom-killer
  */
 #define OOM_DISABLE (-17)
-- 
1.6.5.2




^ permalink raw reply related	[flat|nested] 109+ messages in thread

* [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct
  2010-10-25  3:26 [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness() KOSAKI Motohiro
  2010-10-25  3:27 ` [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable" KOSAKI Motohiro
@ 2010-10-25  3:28 ` KOSAKI Motohiro
  2010-10-25 17:26   ` Roland McGrath
  2010-10-25  3:29 ` [resend][PATCH 4/4] oom: don't ignore rss in nascent mm KOSAKI Motohiro
  2010-10-25 20:37 ` [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness() David Rientjes
  3 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-10-25  3:28 UTC (permalink / raw)
  To: Andrew Morton, Linus Torvalds
  Cc: kosaki.motohiro, LKML, linux-mm, Oleg Nesterov, Roland McGrath

Changelog
  o since v1
    - function comments are also changed from current->cred_guard_mutex
      to current->signal->cred_guard_mutex.

---------------------------------------------------------------------------
Oleg Nesterov pointed out that we have to prevent multiple threads from
being inside exec at the same time, and that we can reuse
->cred_guard_mutex for this. Indeed, concurrent execve() is worthless.

Let's move ->cred_guard_mutex from task_struct to signal_struct. This
naturally prevents multiple-threads-inside-exec.

Cc: stable@kernel.org
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
 fs/exec.c                 |   10 +++++-----
 fs/proc/base.c            |    8 ++++----
 include/linux/init_task.h |    4 ++--
 include/linux/sched.h     |    7 ++++---
 include/linux/tracehook.h |    2 +-
 kernel/cred.c             |    4 +---
 kernel/fork.c             |    2 ++
 kernel/ptrace.c           |    4 ++--
 8 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 6d2b6f9..94dabd2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1078,14 +1078,14 @@ EXPORT_SYMBOL(setup_new_exec);
  */
 int prepare_bprm_creds(struct linux_binprm *bprm)
 {
-	if (mutex_lock_interruptible(&current->cred_guard_mutex))
+	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
 		return -ERESTARTNOINTR;
 
 	bprm->cred = prepare_exec_creds();
 	if (likely(bprm->cred))
 		return 0;
 
-	mutex_unlock(&current->cred_guard_mutex);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 	return -ENOMEM;
 }
 
@@ -1093,7 +1093,7 @@ void free_bprm(struct linux_binprm *bprm)
 {
 	free_arg_pages(bprm);
 	if (bprm->cred) {
-		mutex_unlock(&current->cred_guard_mutex);
+		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
 	kfree(bprm);
@@ -1114,13 +1114,13 @@ void install_exec_creds(struct linux_binprm *bprm)
 	 * credentials; any time after this it may be unlocked.
 	 */
 	security_bprm_committed_creds(bprm);
-	mutex_unlock(&current->cred_guard_mutex);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
 
 /*
  * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_guard_mutex to protect against
+ * - the caller must hold ->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
 int check_unsafe_exec(struct linux_binprm *bprm)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0d2ce21..d3ea8b0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -226,7 +226,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	struct mm_struct *mm;
 
-	if (mutex_lock_killable(&task->cred_guard_mutex))
+	if (mutex_lock_killable(&task->signal->cred_guard_mutex))
 		return NULL;
 
 	mm = get_task_mm(task);
@@ -235,7 +235,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 		mmput(mm);
 		mm = NULL;
 	}
-	mutex_unlock(&task->cred_guard_mutex);
+	mutex_unlock(&task->signal->cred_guard_mutex);
 
 	return mm;
 }
@@ -2277,14 +2277,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 		goto out_free;
 
 	/* Guard against adverse ptrace interaction */
-	length = mutex_lock_interruptible(&task->cred_guard_mutex);
+	length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
 	if (length < 0)
 		goto out_free;
 
 	length = security_setprocattr(task,
 				      (char*)file->f_path.dentry->d_name.name,
 				      (void*)page, count);
-	mutex_unlock(&task->cred_guard_mutex);
+	mutex_unlock(&task->signal->cred_guard_mutex);
 out_free:
 	free_page((unsigned long) page);
 out:
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 2fea6c8..1f8c06c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -29,6 +29,8 @@ extern struct fs_struct init_fs;
 		.running = 0,						\
 		.lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
 	},								\
+	.cred_guard_mutex =						\
+		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
 }
 
 extern struct nsproxy init_nsproxy;
@@ -145,8 +147,6 @@ extern struct cred init_cred;
 	.group_leader	= &tsk,						\
 	RCU_INIT_POINTER(.real_cred, &init_cred),			\
 	RCU_INIT_POINTER(.cred, &init_cred),				\
-	.cred_guard_mutex =						\
-		 __MUTEX_INITIALIZER(tsk.cred_guard_mutex),		\
 	.comm		= "swapper",					\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 74ed859..ac65605 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -626,6 +626,10 @@ struct signal_struct {
 
 	int oom_adj;		/* OOM kill score adjustment (bit shift) */
 	long oom_score_adj;	/* OOM kill score adjustment */
+
+	struct mutex cred_guard_mutex;	/* guard against foreign influences on
+					 * credential calculations
+					 * (notably. ptrace) */
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
@@ -1305,9 +1309,6 @@ struct task_struct {
 					 * credentials (COW) */
 	const struct cred __rcu *cred;	/* effective (overridable) subjective task
 					 * credentials (COW) */
-	struct mutex cred_guard_mutex;	/* guard against foreign influences on
-					 * credential calculations
-					 * (notably. ptrace) */
 	struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */
 
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 10db010..3a2e66d 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -150,7 +150,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
  *
  * Return %LSM_UNSAFE_* bits applied to an exec because of tracing.
  *
- * @task->cred_guard_mutex is held by the caller through the do_execve().
+ * @task->signal->cred_guard_mutex is held by the caller through the do_execve().
  */
 static inline int tracehook_unsafe_exec(struct task_struct *task)
 {
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e226..6a1aa00 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
 
 /*
  * Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_guard_mutex
+ * - The caller must hold ->cred_guard_mutex
  */
 struct cred *prepare_exec_creds(void)
 {
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	struct cred *new;
 	int ret;
 
-	mutex_init(&p->cred_guard_mutex);
-
 	if (
 #ifdef CONFIG_KEYS
 		!p->cred->thread_keyring &&
diff --git a/kernel/fork.c b/kernel/fork.c
index c445f8c..8c09cf9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -904,6 +904,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
 
+	mutex_init(&sig->cred_guard_mutex);
+
 	return 0;
 }
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798..ac5013a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
 	 * under ptrace.
 	 */
 	retval = -ERESTARTNOINTR;
-	if (mutex_lock_interruptible(&task->cred_guard_mutex))
+	if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
 		goto out;
 
 	task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
 unlock_tasklist:
 	write_unlock_irq(&tasklist_lock);
 unlock_creds:
-	mutex_unlock(&task->cred_guard_mutex);
+	mutex_unlock(&task->signal->cred_guard_mutex);
 out:
 	return retval;
 }
-- 
1.6.5.2




^ permalink raw reply related	[flat|nested] 109+ messages in thread

* [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-10-25  3:26 [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness() KOSAKI Motohiro
  2010-10-25  3:27 ` [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable" KOSAKI Motohiro
  2010-10-25  3:28 ` [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct KOSAKI Motohiro
@ 2010-10-25  3:29 ` KOSAKI Motohiro
  2010-10-25 11:28   ` pageexec
  2010-11-23 14:34   ` Oleg Nesterov
  2010-10-25 20:37 ` [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness() David Rientjes
  3 siblings, 2 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-10-25  3:29 UTC (permalink / raw)
  To: Andrew Morton, Linus Torvalds
  Cc: kosaki.motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Oleg Nesterov, Roland McGrath

ChangeLog
 o since v2
   - Move ->in_exec_mm from task_struct to signal_struct
   - clean up oom_rss_swap_usage()
 o since v1
   - Always use thread group leader's ->in_exec_mm.
     This makes OOM handling slightly more efficient when a process has many threads.
   - Add the link of Brad's explanation to the description.


-----------------------------------------------------------
Brad Spengler published a local memory-allocation DoS that
evades the OOM-killer (though not the virtual memory RLIMIT):
http://www.grsecurity.net/~spender/64bit_dos.c

execve() creates a new mm struct, sets up the stack in it and copies
argv there, so the task temporarily has two mms during execve().
Unfortunately this nascent mm is not pointed to by any task, so the
OOM-killer can't detect its memory usage. The OOM-killer may therefore
kill the wrong task.

Thus, this patch adds a signal->in_exec_mm member to track
nascent mm usage.

Cc: stable@kernel.org
Cc: pageexec@freemail.hu
Cc: Roland McGrath <roland@redhat.com>
Cc: Solar Designer <solar@openwall.com>
Cc: Eugene Teo <eteo@redhat.com>
Reported-by: Brad Spengler <spender@grsecurity.net>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
 fs/compat.c             |    4 +++-
 fs/exec.c               |   16 +++++++++++++++-
 include/linux/binfmts.h |    1 +
 include/linux/sched.h   |    1 +
 mm/oom_kill.c           |   26 +++++++++++++++++++-------
 5 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index 0644a15..a85b196 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1567,8 +1567,10 @@ int compat_do_execve(char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
+	if (bprm->mm) {
+		set_exec_mm(NULL);
 		mmput(bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {
diff --git a/fs/exec.c b/fs/exec.c
index 94dabd2..2395d10 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -347,6 +347,8 @@ int bprm_mm_init(struct linux_binprm *bprm)
 	if (err)
 		goto err;
 
+	set_exec_mm(mm);
+
 	return 0;
 
 err:
@@ -759,6 +761,7 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	tsk->signal->in_exec_mm = NULL;
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
@@ -1328,6 +1331,15 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 
 EXPORT_SYMBOL(search_binary_handler);
 
+void set_exec_mm(struct mm_struct *mm)
+{
+	struct task_struct *leader = current->group_leader;
+
+	task_lock(leader);
+	leader->signal->in_exec_mm = mm;
+	task_unlock(leader);
+}
+
 /*
  * sys_execve() executes a new program.
  */
@@ -1416,8 +1428,10 @@ int do_execve(const char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
+	if (bprm->mm) {
+		set_exec_mm(NULL);
 		mmput (bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index a065612..2fde1ba 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -133,6 +133,7 @@ extern void install_exec_creds(struct linux_binprm *bprm);
 extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
 extern void set_binfmt(struct linux_binfmt *new);
 extern void free_bprm(struct linux_binprm *);
+extern void set_exec_mm(struct mm_struct *mm);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_BINFMTS_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ac65605..b880931 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -630,6 +630,7 @@ struct signal_struct {
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
 					 * (notably. ptrace) */
+	struct mm_struct *in_exec_mm;	/* temporary nascent mm in execve */
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d58925e..830065f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -120,6 +120,15 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
 	return NULL;
 }
 
+/*
+ * The baseline for the badness score is the proportion of RAM that each
+ * task's rss and swap space use.
+ */
+static unsigned long oom_rss_swap_usage(struct mm_struct *mm)
+{
+	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS);
+}
+
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p,
 		const struct mem_cgroup *mem, const nodemask_t *nodemask)
@@ -151,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p,
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 			  const nodemask_t *nodemask)
 {
-	unsigned long points;
+	unsigned long points = 0;
 	unsigned long points_orig;
 	int oom_adj = p->signal->oom_adj;
 	long oom_score_adj = p->signal->oom_score_adj;
@@ -169,15 +178,18 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 	if (p->flags & PF_OOM_ORIGIN)
 		return ULONG_MAX;
 
+	/* The task is now processing execve(). then it has second mm */
+	if (unlikely(p->signal->in_exec_mm)) {
+		task_lock(p->group_leader);
+		if (p->signal->in_exec_mm)
+			points = oom_rss_swap_usage(p->signal->in_exec_mm);
+		task_unlock(p->group_leader);
+	}
+
 	p = find_lock_task_mm(p);
 	if (!p)
 		return 0;
-
-	/*
-	 * The baseline for the badness score is the proportion of RAM that each
-	 * task's rss and swap space use.
-	 */
-	points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS));
+	points += oom_rss_swap_usage(p->mm);
 	task_unlock(p);
 
 	/*
-- 
1.6.5.2




^ permalink raw reply related	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-10-25  3:29 ` [resend][PATCH 4/4] oom: don't ignore rss in nascent mm KOSAKI Motohiro
@ 2010-10-25 11:28   ` pageexec
  2010-10-26  7:25     ` KOSAKI Motohiro
  2010-11-23 14:34   ` Oleg Nesterov
  1 sibling, 1 reply; 109+ messages in thread
From: pageexec @ 2010-10-25 11:28 UTC (permalink / raw)
  To: Andrew Morton, Linus Torvalds, KOSAKI Motohiro
  Cc: LKML, linux-mm, Solar Designer, Eugene Teo, Brad Spengler,
	Oleg Nesterov, Roland McGrath

On 25 Oct 2010 at 12:29, KOSAKI Motohiro wrote:

hi,

i've got a few comments/questions about the whole approach, see them inline.

> index 0644a15..a85b196 100644
> --- a/fs/compat.c
> +++ b/fs/compat.c
> @@ -1567,8 +1567,10 @@ int compat_do_execve(char * filename,
>  	return retval;
>  
>  out:
> -	if (bprm->mm)
> +	if (bprm->mm) {
> +		set_exec_mm(NULL);
>  		mmput(bprm->mm);
> +	}
>  
>  out_file:
>  	if (bprm->file) {
> diff --git a/fs/exec.c b/fs/exec.c
> index 94dabd2..2395d10 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -347,6 +347,8 @@ int bprm_mm_init(struct linux_binprm *bprm)
>  	if (err)
>  		goto err;
>  
> +	set_exec_mm(mm);
> +
>  	return 0;
>  
>  err:
> @@ -1416,8 +1428,10 @@ int do_execve(const char * filename,
>  	return retval;
>  
>  out:
> -	if (bprm->mm)
> +	if (bprm->mm) {
> +		set_exec_mm(NULL);
>  		mmput (bprm->mm);
> +	}
>  
>  out_file:
>  	if (bprm->file) {

what happens when two (or more) threads in the same process call execve? the
above set_exec_mm calls will race (de_thread doesn't happen until much later
in execve) and overwrite each other's ->in_exec_mm which will still lead to
problems since there will be at most one temporary mm accounted for in the
oom killer.

[update: since i don't seem to have been cc'd on the other patch that
serializes execve, the above point is moot ;)]

worse, even if each temporary mm was tracked separately there'd still be a
race where the oom killer can get triggered with the culprit thread long
gone (and reset ->in_exec_mm) and never to be found, so the oom killer would
find someone else as guilty.

now all this leads me to suggest a simpler solution, at least for the first
problem mentioned above (i don't know what to do with the second one yet as
it seems to be a generic issue with the oom killer, probably it should verify
the oom situation once again after it took the task_list lock).

[update: while the serialized execve solves the first problem, i still think
that my idea is simpler and worth considering, so i leave it here even if for
just documentation purposes ;)]

given that all the oom killer needs from the mm struct is either ->total_pages
(in .35 and before, so be careful with the stable backport) or some ->rss_stat
counters, wouldn't it be much easier to simply transfer the bprm->mm counters
into current->mm for the duration of the execve (say, add them in get_arg_page
and remove them when bprm->mm is mmput in the do_execve failure path, etc)? the
transfer can be either to the existing counters or to new ones (obviously in
the latter case the oom code needs a small change to take the new counters into
account as well).

cheers,

 PaX Team


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct
  2010-10-25  3:28 ` [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct KOSAKI Motohiro
@ 2010-10-25 17:26   ` Roland McGrath
  2010-10-25 17:42     ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: Roland McGrath @ 2010-10-25 17:26 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm, Oleg Nesterov

This has my ACK if Oleg doesn't see any problems.

Thanks,
Roland

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct
  2010-10-25 17:26   ` Roland McGrath
@ 2010-10-25 17:42     ` Oleg Nesterov
  2010-10-25 17:51       ` Roland McGrath
  0 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-10-25 17:42 UTC (permalink / raw)
  To: Roland McGrath
  Cc: KOSAKI Motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

On 10/25, Roland McGrath wrote:
>
> This has my ACK if Oleg doesn't see any problems.

I believe the patch is fine (it already has my reviewed-by).

Except: I am not sure about -stable. At least, this patch should
not go into the <2.6.35 kernels, it relies on misc changes which
changed the scope of task->signal. Before 2.6.35 almost any user
of ->cred_guard_mutex can race with exit and hit ->signal == NULL.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct
  2010-10-25 17:42     ` Oleg Nesterov
@ 2010-10-25 17:51       ` Roland McGrath
  2010-10-26 13:04         ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: Roland McGrath @ 2010-10-25 17:51 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: KOSAKI Motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

> Except: I am not sure about -stable. At least, this patch should
> not go into the <2.6.35 kernels, it relies on misc changes which
> changed the scope of task->signal. Before 2.6.35 almost any user
> of ->cred_guard_mutex can race with exit and hit ->signal == NULL.

I see no justification for a change like this in any -stable tree.  It's
just a cleanup, right?  If it's a prerequisite for the fix we like for an
"important" bug, then that's a different story.  In its own right, it's
clearly not appropriate for backporting.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness()
  2010-10-25  3:26 [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness() KOSAKI Motohiro
                   ` (2 preceding siblings ...)
  2010-10-25  3:29 ` [resend][PATCH 4/4] oom: don't ignore rss in nascent mm KOSAKI Motohiro
@ 2010-10-25 20:37 ` David Rientjes
  3 siblings, 0 replies; 109+ messages in thread
From: David Rientjes @ 2010-10-25 20:37 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm

On Mon, 25 Oct 2010, KOSAKI Motohiro wrote:

> Current oom_score_adj is completely broken because It is strongly bound
> google usecase and ignore other all.
> 

NACK.

Same response as the previous three times this patch has been proposed:

	http://marc.info/?t=128461666500001
	http://marc.info/?t=128324705200002
	http://marc.info/?t=128272938200002

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-10-25  3:27 ` [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable" KOSAKI Motohiro
@ 2010-10-25 20:40   ` David Rientjes
  2010-10-26 13:01     ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: David Rientjes @ 2010-10-25 20:40 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm

On Mon, 25 Oct 2010, KOSAKI Motohiro wrote:

> oom_adj is not only used for kernel knob, but also used for
> application interface. Then, adding new knob is no good
> reason to deprecate it. Don't do stupid!
> 

NACK as a logical follow-up to my NACK for "oom: remove totalpage 
normalization from oom_badness()"

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-10-25 11:28   ` pageexec
@ 2010-10-26  7:25     ` KOSAKI Motohiro
  0 siblings, 0 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-10-26  7:25 UTC (permalink / raw)
  To: pageexec
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	Solar Designer, Eugene Teo, Brad Spengler, Oleg Nesterov,
	Roland McGrath

Hi

Thank you for reviewing.

> what happens when two (or more) threads in the same process call execve? the
> above set_exec_mm calls will race (de_thread doesn't happen until much later
> in execve) and overwrite each other's ->in_exec_mm which will still lead to
> problems since there will be at most one temporary mm accounted for in the
> oom killer.

Patch 3/4 prevents this race :)
3/4 moves cred_guard_mutex into the signal struct, and execve() takes
signal->cred_guard_mutex to protect against concurrent execve races.


> [update: since i don't seem to have been cc'd on the other patch that
> serializes execve, the above point is moot ;)]

Ah, sorry. that's my mistake. I thought you've reviewed this one at
my last posting. 

can you please see 3/4? the URL is below.

http://www.gossamer-threads.com/lists/linux/kernel/1293297?do=post_view_threaded

> worse, even if each temporary mm was tracked separately there'd still be a
> race where the oom killer can get triggered with the culprit thread long
> gone (and reset ->in_exec_mm) and never to be found, so the oom killer would
> find someone else as guilty.

Sorry, I haven't got this point. can you please elaborate this worse scenario? 


> now all this leads me to suggest a simpler solution, at least for the first
> problem mentioned above (i don't know what to do with the second one yet as
> it seems to be a generic issue with the oom killer, probably it should verify
> the oom situation once again after it took the task_list lock).
> 
> [update: while the serialized execve solves the first problem, i still think
> that my idea is simpler and worth considering, so i leave it here even if for
> just documentation purposes ;)]
> 
> given that all the oom killer needs from the mm struct is either ->total_pages
> (in .35 and before, so be careful with the stable backport) or some ->rss_stat
> counters, wouldn't it be much easier to simply transfer the bprm->mm counters
> into current->mm for the duration of the execve (say, add them in get_arg_page
> and remove them when bprm->mm is mmput in the do_execve failure path, etc)? the
> transfer can be either to the existing counters or to new ones (obviously in
> the latter case the oom code needs a small change to take the new counters into
> account as well).

As I said in the previous discussion, that is possible and is one of the
options. I also wrote a patch that way once. But it is messier than the
current one, because pages in the nascent mm are also swappable, so swapping
out such a page needs to update both mm->rss_stat and nascent_mm->rss_stat.
IOW, we would need to change the VM core. But an execve vs. OOM race is
actually a very rare event, so I don't want to add new branches and
complexity for it.

Note: before 2.6.35, oom_kill.c tracked the amount of process virtual address
space, so changing get_arg_page() was enough. But on 2.6.36 or later,
oom_kill.c tracks the amount of process rss, so we can't ignore swap in/out
events, and changing get_arg_page() is not enough. Or do you propose a new
OOM accounting of mm->rss + nascent_mm->total_vm? That would be easy, but
more tricky.

So I think this is a trade-off issue. If you have a better patch than
mine, I'm glad to accept yours and help review it. However, I myself
don't plan to take this approach.


Thanks.




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-10-25 20:40   ` David Rientjes
@ 2010-10-26 13:01     ` KOSAKI Motohiro
  2010-10-26 19:37       ` David Rientjes
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-10-26 13:01 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

> On Mon, 25 Oct 2010, KOSAKI Motohiro wrote:
> 
> > oom_adj is not only used for kernel knob, but also used for
> > application interface. Then, adding new knob is no good
> > reason to deprecate it. Don't do stupid!
> > 
> 
> NACK as a logical follow-up to my NACK for "oom: remove totalpage 
> normalization from oom_badness()"

Huh?

I requested you show us justification. BUT YOU DIDNT. If you have any 
usecase, show us RIGHT NOW. 

Don't speaking sucking crap. If you don't want ignore you. you are busy
than you.

DONT STUPID.



^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct
  2010-10-25 17:51       ` Roland McGrath
@ 2010-10-26 13:04         ` KOSAKI Motohiro
  2010-10-26 13:18           ` Roland McGrath
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-10-26 13:04 UTC (permalink / raw)
  To: Roland McGrath
  Cc: kosaki.motohiro, Oleg Nesterov, Andrew Morton, Linus Torvalds,
	LKML, linux-mm

Hello,

> > Except: I am not sure about -stable. At least, this patch should
> > not go into the <2.6.35 kernels, it relies on misc changes which
> > changed the scope of task->signal. Before 2.6.35 almost any user
> > of ->cred_guard_mutex can race with exit and hit ->signal == NULL.
> 
> I see no justification for a change like this in any -stable tree.  It's
> just a cleanup, right?  If it's a prerequisite for the fix we like for an
> "important" bug, then that's a different story.  In its own right, it's
> clearly not appropriate for backporting.

Because [4/4] depends on [3/4], and I hope to backport it. Do you dislike
that too?


Thanks.




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct
  2010-10-26 13:04         ` KOSAKI Motohiro
@ 2010-10-26 13:18           ` Roland McGrath
  0 siblings, 0 replies; 109+ messages in thread
From: Roland McGrath @ 2010-10-26 13:18 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Oleg Nesterov, Andrew Morton, Linus Torvalds, LKML, linux-mm

> Because [4/4] depend on [3/4] and I hope to backport it. Do you dislike it
> too?

Ah, OK.  That is indeed a fix for an important bug.  Not knowing the mm
code very well, I'm not in a position to judge whether it's safe enough
for a -stable stream or not.  If it is and it could be done safely
without relying on 3/4, that would seem safer to me, but it is not a
strong opinion.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-10-26 13:01     ` KOSAKI Motohiro
@ 2010-10-26 19:37       ` David Rientjes
  2010-11-01  7:06         ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: David Rientjes @ 2010-10-26 19:37 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm

On Tue, 26 Oct 2010, KOSAKI Motohiro wrote:

> > NACK as a logical follow-up to my NACK for "oom: remove totalpage 
> > normalization from oom_badness()"
> 
> Huh?
> 
> I requested you show us justification. BUT YOU DIDNT. If you have any 
> usecase, show us RIGHT NOW. 
> 

The new tunable added in 2.6.36, /proc/pid/oom_score_adj, is necessary for 
the units that the badness score now uses.  We need a tunable with a much 
higher resolution than the oom_adj scale from -16 to +15, and one that 
scales linearly as opposed to exponentially.  Since that tunable is much 
more powerful than the oom_adj implementation, which never made any real 
sense for defining oom killing priority for any purpose other than 
polarization, the old tunable is deprecated for two years.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-10-26 19:37       ` David Rientjes
@ 2010-11-01  7:06         ` KOSAKI Motohiro
  2010-11-01 19:36           ` David Rientjes
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-01  7:06 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

> On Tue, 26 Oct 2010, KOSAKI Motohiro wrote:
> 
> > > NACK as a logical follow-up to my NACK for "oom: remove totalpage 
> > > normalization from oom_badness()"
> > 
> > Huh?
> > 
> > I requested you show us justification. BUT YOU DIDNT. If you have any 
> > usecase, show us RIGHT NOW. 
> > 
> 
> The new tunable added in 2.6.36, /proc/pid/oom_score_adj, is necessary for 
> the units that the badness score now uses.  We need a tunable with a much 

Who we?

> higher resolution than the oom_adj scale from -16 to +15, and one that 
> scales linearly as opposed to exponentially.  Since that tunable is much 
> more powerful than the oom_adj implementation, which never made any real 

The reason you were NAKed was not that you introduced a powerful new feature.
It was that you broke an old feature that applications are using.


> sense for defining oom killing priority for any purpose other than 
> polarization, the old tunable is deprecated for two years.

You haven't tested your patch at all. Distros' initramfs scripts use the
oom_adj interface, and the latest kernel shows the pointless warning
"/proc/xx/oom_adj is deprecated, please use /proc/xx/oom_score_adj instead."
at _every_ boot.

As I said, DON'T SEND UNTESTED PATCH! DON'T BREAK USERLAND CARELESSLY!



^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-01  7:06         ` KOSAKI Motohiro
@ 2010-11-01 19:36           ` David Rientjes
  2010-11-09  2:26             ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: David Rientjes @ 2010-11-01 19:36 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm

On Mon, 1 Nov 2010, KOSAKI Motohiro wrote:

> > The new tunable added in 2.6.36, /proc/pid/oom_score_adj, is necessary for 
> > the units that the badness score now uses.  We need a tunable with a much 
> 
> Who we?
> 

Linux users who care about prioritizing tasks for oom kill with a tunable 
that (1) has a unit, (2) has a higher resolution, and (3) is linear and 
not exponential.  Memcg doesn't solve this issue without incurring a 1% 
memory cost.

> > higher resolution than the oom_adj scale from -16 to +15, and one that 
> > scales linearly as opposed to exponentially.  Since that tunable is much 
> > more powerful than the oom_adj implementation, which never made any real 
> 
> The reason that you ware NAKed was not to introduce new powerful feature.
> It was caused to break old and used feature from applications.
> 

No, it doesn't, and you completely and utterly failed to show a single 
usecase that broke as a result of this because nobody can currently use 
oom_adj for anything other than polarization.  Thus, there's no backwards 
compatibility issue.

> > sense for defining oom killing priority for any purpose other than 
> > polarization, the old tunable is deprecated for two years.
> 
> You haven't tested your patch at all. Distro's initram script are using
> oom_adj interface and latest kernel show pointless warnings 
> "/proc/xx/oom_adj is deprecated, please use /proc/xx/oom_score_adj instead."
> at _every_ boot time.
> 

Yes, I've tested it, and it deprecates the tunable as expected.  A single 
warning message serves the purpose well: let users know one time without 
being overly verbose that the tunable is deprecated and give them 
sufficient time (2 years) to start using the new tunable.  That's how 
deprecation is done.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-01 19:36           ` David Rientjes
@ 2010-11-09  2:26             ` KOSAKI Motohiro
  2010-11-09  3:28               ` KOSAKI Motohiro
  2010-11-09 23:33               ` David Rientjes
  0 siblings, 2 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-09  2:26 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

> On Mon, 1 Nov 2010, KOSAKI Motohiro wrote:
> 
> > > The new tunable added in 2.6.36, /proc/pid/oom_score_adj, is necessary for 
> > > the units that the badness score now uses.  We need a tunable with a much 
> > 
> > Who we?
> > 
> 
> Linux users who care about prioritizing tasks for oom kill with a tunable 
> that (1) has a unit, (2) has a higher resolution, and (3) is linear and 
> not exponential.  

No. The majority of users don't care. You are only talking about your own
case. Don't ignore end users.


> Memcg doesn't solve this issue without incurring a 1% 
> memory cost.

Look at reality.
All major distributions have already turned on memcg. End users don't need
to pay any additional cost.



> 
> > > higher resolution than the oom_adj scale from -16 to +15, and one that 
> > > scales linearly as opposed to exponentially.  Since that tunable is much 
> > > more powerful than the oom_adj implementation, which never made any real 
> > 
> > The reason that you ware NAKed was not to introduce new powerful feature.
> > It was caused to break old and used feature from applications.
> > 
> 
> No, it doesn't, and you completely and utterly failed to show a single 
> usecase that broke as a result of this because nobody can currently use 
> oom_adj for anything other than polarization.  Thus, there's no backwards 
> compatibility issue.

No. I showed. 
1) Google code search showed some application are using this feature.
	http://www.google.com/codesearch?as_q=oom_adj&btnG=Search+Code&hl=ja&as_package=&as_lang=&as_filename=&as_class=&as_function=&as_license=&as_case=

2) Not body use oom_adj other than polarization even though there are a few.
   example, kde are using.
	http://www.google.com/codesearch/p?hl=ja#MPJuLvSvNYM/pub/kde/unstable/snapshots/kdelibs.tar.bz2%7CWClmGVN5niU/kdelibs-1164923/kinit/start_kdeinit.c&q=oom_adj%20kde%205

When you are talking polarization issue, you blind a real. Don't talk your dream.

3) udev uses this feature. It's one of the major Linux components, and you broke it.

http://www.google.com/codesearch/p?hl=ja#KVTjzuVpblQ/pub/linux/utils/kernel/hotplug/udev-072.tar.bz2%7CwUSE-Ay3lLI/udev-072/udevd.c&q=oom_adj

You must not break our userland. You can't rewrite or deprecate the
old knob. It's in use! You can only add an orthogonal new knob.


> > > sense for defining oom killing priority for any purpose other than 
> > > polarization, the old tunable is deprecated for two years.
> > 
> > You haven't tested your patch at all. Distro's initram script are using
> > oom_adj interface and latest kernel show pointless warnings 
> > "/proc/xx/oom_adj is deprecated, please use /proc/xx/oom_score_adj instead."
> > at _every_ boot time.
> > 
> 
> Yes, I've tested it, and it deprecates the tunable as expected.  A single 
> warning message serves the purpose well: let users know one time without 
> being overly verbose that the tunable is deprecated and give them 
> sufficient time (2 years) to start using the new tunable.  That's how 
> deprecation is done.

no sense.

Why do they need to rewrite their applications for *YOU*? Okay, you will
benefit from your new knob. But NOBODY uses the new one, and people would
need to rewrite their applications even though they get no benefit.

Don't do selfish userland breakage!




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-09  2:26             ` KOSAKI Motohiro
@ 2010-11-09  3:28               ` KOSAKI Motohiro
  2010-11-15  0:24                 ` KOSAKI Motohiro
  2010-11-09 23:33               ` David Rientjes
  1 sibling, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-09  3:28 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: kosaki.motohiro, David Rientjes, Andrew Morton, Linus Torvalds,
	LKML, linux-mm

> > Yes, I've tested it, and it deprecates the tunable as expected.  A single 
> > warning message serves the purpose well: let users know one time without 
> > being overly verbose that the tunable is deprecated and give them 
> > sufficient time (2 years) to start using the new tunable.  That's how 
> > deprecation is done.
> 
> no sense.
> 
> Why do their application need to rewrite for *YOU*? Okey, you will got
> benefit from your new knob. But NOBDOY use the new one. and People need
> to rewrite their application even though no benefit. 
> 
> Don't do selfish userland breakage!

And you said you ignore bug even though you have seen it. It suck!




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-09  2:26             ` KOSAKI Motohiro
  2010-11-09  3:28               ` KOSAKI Motohiro
@ 2010-11-09 23:33               ` David Rientjes
  2010-11-09 23:35                 ` Alan Cox
  2010-11-14  5:07                 ` [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable" KOSAKI Motohiro
  1 sibling, 2 replies; 109+ messages in thread
From: David Rientjes @ 2010-11-09 23:33 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm

On Tue, 9 Nov 2010, KOSAKI Motohiro wrote:

> > > > The new tunable added in 2.6.36, /proc/pid/oom_score_adj, is necessary for 
> > > > the units that the badness score now uses.  We need a tunable with a much 
> > > 
> > > Who we?
> > > 
> > 
> > Linux users who care about prioritizing tasks for oom kill with a tunable 
> > that (1) has a unit, (2) has a higher resolution, and (3) is linear and 
> > not exponential.  
> 
> No. Majority user don't care. You only talk about your case. Don't ignore
> end user.
> 

If they don't care, then they won't be using oom_adj, so your point
about its deprecation is irrelevant.

Other users do want a more powerful userspace interface with a unit and 
higher resolution (I am one of them), there's no requirement that those 
users need to be in the majority.

> > Memcg doesn't solve this issue without incurring a 1% 
> > memory cost.
> 
> Look at a real.
> All major distributions has already turn on memcg. End user don't need
> to pay additional cost.
> 

Memcg also has a command-line disabling option to avoid incurring this 1% 
memory cost when you're not going to be using it.

> > No, it doesn't, and you completely and utterly failed to show a single 
> > usecase that broke as a result of this because nobody can currently use 
> > oom_adj for anything other than polarization.  Thus, there's no backwards 
> > compatibility issue.
> 
> No. I showed. 
> 1) Google code search showed some application are using this feature.
> 	http://www.google.com/codesearch?as_q=oom_adj&btnG=Search+Code&hl=ja&as_package=&as_lang=&as_filename=&as_class=&as_function=&as_license=&as_case=
> 

oom_adj isn't removed, it's deprecated.  These users are using a 
deprecated interface and have a few years to convert to using the new 
interface (if it ever is actually removed).

> 2) Not body use oom_adj other than polarization even though there are a few.
>    example, kde are using.
> 	http://www.google.com/codesearch/p?hl=ja#MPJuLvSvNYM/pub/kde/unstable/snapshots/kdelibs.tar.bz2%7CWClmGVN5niU/kdelibs-1164923/kinit/start_kdeinit.c&q=oom_adj%20kde%205
> 
> When you are talking polarization issue, you blind a real. Don't talk your dream.
> 

I don't understand what you're trying to say here, but the current users 
of oom_adj that aren't +15 or -16 (or OOM_DISABLE) are arbitrary based 
relative to other tasks such as +5, +10, etc.  They don't have any 
semantics other than being arbitrarily relative because it doesn't work in 
a linear way or with a scale.

> 3) udev are using this feature. It's one of major linux component and you broke.
> 
> http://www.google.com/codesearch/p?hl=ja#KVTjzuVpblQ/pub/linux/utils/kernel/hotplug/udev-072.tar.bz2%7CwUSE-Ay3lLI/udev-072/udevd.c&q=oom_adj
> 
> You don't have to break our userland. you can't rewrite or deprecate 
> old one. It's used! You can only add orthogonal new knob.
> 

That's incorrect, I didn't break anything by deprecating a tunable for a 
few years.  oom_adj gets converted roughly into an equivalent (but linear) 
oom_score_adj.

Unfortunately for your argument, you can't show a single example of a 
current oom_adj user that has a scientific calculation behind its value 
that is now broken on the linear scale.

> > Yes, I've tested it, and it deprecates the tunable as expected.  A single 
> > warning message serves the purpose well: let users know one time without 
> > being overly verbose that the tunable is deprecated and give them 
> > sufficient time (2 years) to start using the new tunable.  That's how 
> > deprecation is done.
> 
> no sense.
> 
> Why do their application need to rewrite for *YOU*? Okey, you will got
> benefit from your new knob. But NOBDOY use the new one. and People need
> to rewrite their application even though no benefit. 
> 
> Don't do selfish userland breakage!
> 

It's deprecated for a few years so users can gradually convert to the new 
tunable, it wasn't removed when the new one was introduced.  A higher 
resolution tunable that scales linearly with a unit is an advantage for 
Linux (for the minority of users who care about oom killing priority 
beyond the heuristic) and I think a few years is enough time for users to 
do a simple conversion to the new tunable.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-09 23:33               ` David Rientjes
@ 2010-11-09 23:35                 ` Alan Cox
  2010-11-09 23:48                   ` David Rientjes
  2010-11-14  5:07                 ` [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable" KOSAKI Motohiro
  1 sibling, 1 reply; 109+ messages in thread
From: Alan Cox @ 2010-11-09 23:35 UTC (permalink / raw)
  To: David Rientjes
  Cc: KOSAKI Motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

> It's deprecated for a few years so users can gradually convert to the new 
> tunable, it wasn't removed when the new one was introduced.  A higher 
> resolution tunable that scales linearly with a unit is an advantage for 
> Linux (for the minority of users who care about oom killing priority 
> beyond the heuristic) and I think a few years is enough time for users to 
> do a simple conversion to the new tunable.

Documentation/ABI/obsolete/

should have all obsoletes in it.

Alan

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-09 23:35                 ` Alan Cox
@ 2010-11-09 23:48                   ` David Rientjes
  2010-11-09 23:55                     ` [patch] oom: document obsolete oom_adj tunable David Rientjes
  0 siblings, 1 reply; 109+ messages in thread
From: David Rientjes @ 2010-11-09 23:48 UTC (permalink / raw)
  To: Alan Cox; +Cc: KOSAKI Motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

On Tue, 9 Nov 2010, Alan Cox wrote:

> > It's deprecated for a few years so users can gradually convert to the new 
> > tunable, it wasn't removed when the new one was introduced.  A higher 
> > resolution tunable that scales linearly with a unit is an advantage for 
> > Linux (for the minority of users who care about oom killing priority 
> > beyond the heuristic) and I think a few years is enough time for users to 
> > do a simple conversion to the new tunable.
> 
> Documentation/ABI/obsolete/
> 
> should have all obsoletes in it.
> 

Good point, the only documentation right now is in 
Documentation/feature-removal-schedule.txt and in the kernel log the first 
time oom_adj is written.  I'll generate a patch, thanks!

^ permalink raw reply	[flat|nested] 109+ messages in thread

* [patch] oom: document obsolete oom_adj tunable
  2010-11-09 23:48                   ` David Rientjes
@ 2010-11-09 23:55                     ` David Rientjes
  2010-11-15  0:22                       ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: David Rientjes @ 2010-11-09 23:55 UTC (permalink / raw)
  To: Alan Cox
  Cc: KOSAKI Motohiro, Andrew Morton, Linus Torvalds, linux-kernel, linux-mm

/proc/pid/oom_adj was deprecated in August 2010 with the introduction of
the new oom killer heuristic.

This patch copies the Documentation/feature-removal-schedule.txt entry
for this tunable to the Documentation/ABI/obsolete directory so nobody
misses it.

Reported-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 Documentation/ABI/obsolete/proc-pid-oom_adj |   22 ++++++++++++++++++++++
 1 files changed, 22 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/ABI/obsolete/proc-pid-oom_adj

diff --git a/Documentation/ABI/obsolete/proc-pid-oom_adj b/Documentation/ABI/obsolete/proc-pid-oom_adj
new file mode 100644
--- /dev/null
+++ b/Documentation/ABI/obsolete/proc-pid-oom_adj
@@ -0,0 +1,22 @@
+What:	/proc/<pid>/oom_adj
+When:	August 2012
+Why:	/proc/<pid>/oom_adj allows userspace to influence the oom killer's
+	badness heuristic used to determine which task to kill when the kernel
+	is out of memory.
+
+	The badness heuristic has been rewritten since the introduction of
+	this tunable such that its meaning is deprecated.  The value was
+	implemented as a bitshift on a score generated by the badness()
+	function that did not have any precise units of measure.  With the
+	rewrite, the score is given as a proportion of available memory to the
+	task allocating pages, so using a bitshift which grows the score
+	exponentially is, thus, impossible to tune with fine granularity.
+
+	A much more powerful interface, /proc/<pid>/oom_score_adj, was
+	introduced with the oom killer rewrite that allows users to increase or
+	decrease the badness() score linearly.  This interface will replace
+	/proc/<pid>/oom_adj.
+
+	A warning will be emitted to the kernel log if an application uses this
+	deprecated interface.  After it is printed once, future warnings will be
+	suppressed until the kernel is rebooted.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-09 23:33               ` David Rientjes
  2010-11-09 23:35                 ` Alan Cox
@ 2010-11-14  5:07                 ` KOSAKI Motohiro
  2010-11-14 21:39                   ` David Rientjes
  1 sibling, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-14  5:07 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

> On Tue, 9 Nov 2010, KOSAKI Motohiro wrote:
> 
> > > > > The new tunable added in 2.6.36, /proc/pid/oom_score_adj, is necessary for 
> > > > > the units that the badness score now uses.  We need a tunable with a much 
> > > > 
> > > > Who we?
> > > > 
> > > 
> > > Linux users who care about prioritizing tasks for oom kill with a tunable 
> > > that (1) has a unit, (2) has a higher resolution, and (3) is linear and 
> > > not exponential.  
> > 
> > No. Majority user don't care. You only talk about your case. Don't ignore
> > end user.
> 
> If they don't care, then they won't be using oom_adj, so you're point 
> about it's deprecation is irrelevant.

No irrelevant. Your patch break their environment even though
they don't use oom_adj explicitly. because their application are using it.


> 
> Other users do want a more powerful userspace interface with a unit and 
> higher resolution (I am one of them), there's no requirement that those 
> users need to be in the majority.

But, they only live in your DREAM. You couldn't show who needs it.


>
> > > Memcg doesn't solve this issue without incurring a 1% 
> > > memory cost.
> > 
> > Look at a real.
> > All major distributions has already turn on memcg. End user don't need
> > to pay additional cost.
> 
> Memcg also has a command-line disabling option to avoid incurring this 1% 
> memory cost when you're not going to be using it.

Look at real. who use it?



> > > No, it doesn't, and you completely and utterly failed to show a single 
> > > usecase that broke as a result of this because nobody can currently use 
> > > oom_adj for anything other than polarization.  Thus, there's no backwards 
> > > compatibility issue.
> > 
> > No. I showed. 
> > 1) Google code search showed some application are using this feature.
> > 	http://www.google.com/codesearch?as_q=oom_adj&btnG=Search+Code&hl=ja&as_package=&as_lang=&as_filename=&as_class=&as_function=&as_license=&as_case=
> > 
> 
> oom_adj isn't removed, it's deprecated.  These users are using a 
> deprecated interface and have a few years to convert to using the new 
> interface (if it ever is actually removed).

No. There is no reason to force a rewrite of tons of applications.



> 
> > 2) Not body use oom_adj other than polarization even though there are a few.
> >    example, kde are using.
> > 	http://www.google.com/codesearch/p?hl=ja#MPJuLvSvNYM/pub/kde/unstable/snapshots/kdelibs.tar.bz2%7CWClmGVN5niU/kdelibs-1164923/kinit/start_kdeinit.c&q=oom_adj%20kde%205
> > 
> > When you are talking polarization issue, you blind a real. Don't talk your dream.
> > 
> 
> I don't understand what you're trying to say here, but the current users 
> of oom_adj that aren't +15 or -16 (or OOM_DISABLE) are arbitrary based 
> relative to other tasks such as +5, +10, etc.  They don't have any 
> semantics other than being arbitrarily relative because it doesn't work in 
> a linear way or with a scale.

Even if you don't understand, they are IN THE WORLD. you don't have to
ignore a real.


> > 3) udev are using this feature. It's one of major linux component and you broke.
> > 
> > http://www.google.com/codesearch/p?hl=ja#KVTjzuVpblQ/pub/linux/utils/kernel/hotplug/udev-072.tar.bz2%7CwUSE-Ay3lLI/udev-072/udevd.c&q=oom_adj
> > 
> > You don't have to break our userland. you can't rewrite or deprecate 
> > old one. It's used! You can only add orthogonal new knob.
> > 
> 
> That's incorrect, I didn't break anything by deprecating a tunable for a 
> few years.  oom_adj gets converted roughly into an equivalent (but linear) 
> oom_score_adj.
> 
> Unfortunately for your argument, you can't show a single example of a 
> current oom_adj user that has a scientific calculation behind its value 
> that is now broken on the linear scale.

you are talking unrelated thing.

> 
> > > Yes, I've tested it, and it deprecates the tunable as expected.  A single 
> > > warning message serves the purpose well: let users know one time without 
> > > being overly verbose that the tunable is deprecated and give them 
> > > sufficient time (2 years) to start using the new tunable.  That's how 
> > > deprecation is done.
> > 
> > no sense.
> > 
> > Why do their application need to rewrite for *YOU*? Okey, you will got
> > benefit from your new knob. But NOBDOY use the new one. and People need
> > to rewrite their application even though no benefit. 
> > 
> > Don't do selfish userland breakage!
> > 
> 
> It's deprecated for a few years so users can gradually convert to the new 
> tunable, it wasn't removed when the new one was introduced.  A higher 
> resolution tunable that scales linearly with a unit is an advantage for 
> Linux (for the minority of users who care about oom killing priority 
> beyond the heuristic) and I think a few years is enough time for users to 
> do a simple conversion to the new tunable.

no sense.




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-14  5:07                 ` [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable" KOSAKI Motohiro
@ 2010-11-14 21:39                   ` David Rientjes
  2010-11-23  7:16                     ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: David Rientjes @ 2010-11-14 21:39 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm

On Sun, 14 Nov 2010, KOSAKI Motohiro wrote:

> No irrelevant. Your patch break their environment even though
> they don't use oom_adj explicitly. because their application are using it.
> 

The _only_ difference to oom_adj since the rewrite is that it is now 
mapped on a linear scale rather than an exponential scale.  That's because 
the heuristic itself has a defined range [0, 1000] that characterizes the 
memory usage of the application it is ranking.  To show any breakage, you 
would have to show how oom_adj values being used by applications are based 
on a calculated value that prioritizes those tasks amongst each other.  
With the exponential scale, that's nearly impossible because of the number 
of arbitrary heuristics that were used before oom_adj was considered 
(runtime, nice level, CAP_SYS_RAWIO, etc).

So don't talk about userspace breakage when you can't even describe it or 
present a single usecase.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [patch] oom: document obsolete oom_adj tunable
  2010-11-09 23:55                     ` [patch] oom: document obsolete oom_adj tunable David Rientjes
@ 2010-11-15  0:22                       ` KOSAKI Motohiro
  2010-11-15 10:38                         ` David Rientjes
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-15  0:22 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Alan Cox, Andrew Morton, Linus Torvalds,
	linux-kernel, linux-mm

> /proc/pid/oom_adj was deprecated in August 2010 with the introduction of
> the new oom killer heuristic.
> 
> This patch copies the Documentation/feature-removal-schedule.txt entry
> for this tunable to the Documentation/ABI/obsolete directory so nobody
> misses it.
> 
> Reported-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
> Signed-off-by: David Rientjes <rientjes@google.com>

NAK. You seems to think shouting claim makes some effect. but It's incorrect.
Your childish shout doesn't solve any real world issue. Only code fix does.


> ---
>  Documentation/ABI/obsolete/proc-pid-oom_adj |   22 ++++++++++++++++++++++
>  1 files changed, 22 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/ABI/obsolete/proc-pid-oom_adj
> 
> diff --git a/Documentation/ABI/obsolete/proc-pid-oom_adj b/Documentation/ABI/obsolete/proc-pid-oom_adj
> new file mode 100644
> --- /dev/null
> +++ b/Documentation/ABI/obsolete/proc-pid-oom_adj
> @@ -0,0 +1,22 @@
> +What:	/proc/<pid>/oom_adj
> +When:	August 2012
> +Why:	/proc/<pid>/oom_adj allows userspace to influence the oom killer's
> +	badness heuristic used to determine which task to kill when the kernel
> +	is out of memory.
> +
> +	The badness heuristic has since been rewritten since the introduction of
> +	this tunable such that its meaning is deprecated.  The value was
> +	implemented as a bitshift on a score generated by the badness()
> +	function that did not have any precise units of measure.  With the
> +	rewrite, the score is given as a proportion of available memory to the
> +	task allocating pages, so using a bitshift which grows the score
> +	exponentially is, thus, impossible to tune with fine granularity.
> +
> +	A much more powerful interface, /proc/<pid>/oom_score_adj, was
> +	introduced with the oom killer rewrite that allows users to increase or
> +	decrease the badness() score linearly.  This interface will replace
> +	/proc/<pid>/oom_adj.

Incorrect. oom_adj and oom_score_adj have different concept and different abstraction.
One can't replace another.

> +
> +	A warning will be emitted to the kernel log if an application uses this
> +	deprecated interface.  After it is printed once, future warnings will be
> +	suppressed until the kernel is rebooted.






^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-09  3:28               ` KOSAKI Motohiro
@ 2010-11-15  0:24                 ` KOSAKI Motohiro
  2010-11-15  9:59                   ` David Rientjes
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-15  0:24 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: kosaki.motohiro, David Rientjes, Andrew Morton, Linus Torvalds,
	LKML, linux-mm

> > > Yes, I've tested it, and it deprecates the tunable as expected.  A single 
> > > warning message serves the purpose well: let users know one time without 
> > > being overly verbose that the tunable is deprecated and give them 
> > > sufficient time (2 years) to start using the new tunable.  That's how 
> > > deprecation is done.
> > 
> > no sense.
> > 
> > Why do their application need to rewrite for *YOU*? Okey, you will got
> > benefit from your new knob. But NOBDOY use the new one. and People need
> > to rewrite their application even though no benefit. 
> > 
> > Don't do selfish userland breakage!
> 
> And you said you ignore bug even though you have seen it. It suck!


At v2.6.36-rc1, oom-killer doesn't work at all because YOU BROKE.
And I was working on fixing it.

2010-08-19
http://marc.info/?t=128223176900001&r=1&w=2
http://marc.info/?t=128221532700003&r=1&w=2
http://marc.info/?t=128221532500008&r=1&w=2

However, You submitted new crap before the fixing. 

2010-08-15
http://marc.info/?t=128184669600001&r=1&w=2

If you tested mainline a bit, you could find the problem quickly.
You should have fixed mainline kernel at first.


	Again, YOU HAVEN'T TESTED YOUR OWN PATCH AT ALL.






^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-15  0:24                 ` KOSAKI Motohiro
@ 2010-11-15  9:59                   ` David Rientjes
  0 siblings, 0 replies; 109+ messages in thread
From: David Rientjes @ 2010-11-15  9:59 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm

On Mon, 15 Nov 2010, KOSAKI Motohiro wrote:

> At v2.6.36-rc1, oom-killer doesn't work at all because YOU BROKE.
> And I was working on fixing it.
> 
> 2010-08-19
> http://marc.info/?t=128223176900001&r=1&w=2

This existed before my oom killer rewrite, it was only noticed because the 
rewrite enabled oom_dump_tasks by default.

> http://marc.info/?t=128221532700003&r=1&w=2

Yes, tasklist_lock was dropped in a mismerge of my patches when posting 
them.  Thanks for finding it and posting a patch, I appreciate it.

> http://marc.info/?t=128221532500008&r=1&w=2
> 

Yes, if a task was racing between oom_kill_process() and oom_kill_task() 
and all threads had dropped its mm between calls then there was a NULL 
pointer dereference, thanks for fixing that as well.

> However, You submitted new crap before the fixing. 
> 
> 2010-08-15
> http://marc.info/?t=128184669600001&r=1&w=2
> 

This isn't "crap", this is a necessary bit to ensure that tasks that share 
an ->mm with a task immune from kill aren't killed themselves since we 
can't free the memory.  We came to the consensus that it would be better 
to count the tasks that are OOM_DISABLE in the mm_struct to avoid the 
O(2*n) tasklist scan.

> If you tested mainline a bit, you could find the problem quickly.
> You should have fixed mainline kernel at first.
> 

Thanks for finding a couple fixes during the 2.6.36-rc1 when the rewrite 
was first merged, it's much appreciated!

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [patch] oom: document obsolete oom_adj tunable
  2010-11-15  0:22                       ` KOSAKI Motohiro
@ 2010-11-15 10:38                         ` David Rientjes
  2010-11-23  7:16                           ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: David Rientjes @ 2010-11-15 10:38 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Alan Cox, Andrew Morton, Linus Torvalds, linux-kernel, linux-mm

On Mon, 15 Nov 2010, KOSAKI Motohiro wrote:

> > /proc/pid/oom_adj was deprecated in August 2010 with the introduction of
> > the new oom killer heuristic.
> > 
> > This patch copies the Documentation/feature-removal-schedule.txt entry
> > for this tunable to the Documentation/ABI/obsolete directory so nobody
> > misses it.
> > 
> > Reported-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
> > Signed-off-by: David Rientjes <rientjes@google.com>
> 
> NAK. You seems to think shouting claim makes some effect. but It's incorrect.
> Your childish shout doesn't solve any real world issue. Only code fix does.
> 

The tunable is deprecated.  If you are really that concerned about the 
existing users who you don't think can convert in the next two years, why 
don't you help them convert?  That fixes the issue, but you're not 
interested in that.  I offered to convert any open-source users you can 
list (the hardest part of the conversion is finding who to send patches to 
:).  You're only interested in continuing to assert your position as 
correct even when the kernel is obviously moving in a different direction.

Others may have a different opinion of who is being childish in this whole 
ordeal.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-14 21:39                   ` David Rientjes
@ 2010-11-23  7:16                     ` KOSAKI Motohiro
  2010-11-28  1:41                       ` David Rientjes
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-23  7:16 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

> On Sun, 14 Nov 2010, KOSAKI Motohiro wrote:
> 
> > No irrelevant. Your patch break their environment even though
> > they don't use oom_adj explicitly. because their application are using it.
> > 
> 
> The _only_ difference too oom_adj since the rewrite is that it is now 
> mapped on a linear scale rather than an exponential scale.  

_only_ mean don't ZERO different. Why do userland application need to rewrite?


> That's because 
> the heuristic itself has a defined range [0, 1000] that characterizes the 
> memory usage of the application it is ranking.  To show any breakge, you 
> would have to show how oom_adj values being used by applications are based 
> on a calculated value that prioritizes those tasks amongst each other.  
> With the exponential scale, that's nearly impossible because of the number 
> of arbitrary heuristics that were used before oom_adj were considered 
> (runtime, nice level, CAP_SYS_RAWIO, etc).

But, No people have agreed your powerfulness even though you talked about
the same explanation a lot of times.

Again, IF you need to [0 .. 1000] range, you can calculate it by your
application. current oom score can be get from /proc/pid/oom_score and
total memory can be get from /proc/meminfo. You shouldn't have break
anything.


> So don't talk about userspace breakage when you can't even describe it or 
> present a single usecase.

Huh? Remember! your feature have ZERO user.




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [patch] oom: document obsolete oom_adj tunable
  2010-11-15 10:38                         ` David Rientjes
@ 2010-11-23  7:16                           ` KOSAKI Motohiro
  0 siblings, 0 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-23  7:16 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Alan Cox, Andrew Morton, Linus Torvalds,
	linux-kernel, linux-mm

> On Mon, 15 Nov 2010, KOSAKI Motohiro wrote:
> 
> > > /proc/pid/oom_adj was deprecated in August 2010 with the introduction of
> > > the new oom killer heuristic.
> > > 
> > > This patch copies the Documentation/feature-removal-schedule.txt entry
> > > for this tunable to the Documentation/ABI/obsolete directory so nobody
> > > misses it.
> > > 
> > > Reported-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
> > > Signed-off-by: David Rientjes <rientjes@google.com>
> > 
> > NAK. You seems to think shouting claim makes some effect. but It's incorrect.
> > Your childish shout doesn't solve any real world issue. Only code fix does.
> > 
> 
> The tunable is deprecated.  If you are really that concerned about the 
> existing users who you don't think can convert in the next two years, why 
> don't you help them convert?  That fixes the issue, but you're not 
> interested in that.  I offered to convert any open-source users you can 
> list (the hardest part of the conversion is finding who to send patches to 
> :).  You're only interested in continuing to assert your position as 
> correct even when the kernel is obviously moving in a different direction.

Why don't you change by _your_ hand? 

_Usually_ userland software changed at first _by_ who wanted the change.
Example, we fujitsu changed elf core file format when vma are >65536, but
It was not made any breakage. we changed gdb, binutils, elfutils and etc etc
_at_ first.




> 
> Others may have a different opinion of who is being childish in this whole 
> ordeal.




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-10-25  3:29 ` [resend][PATCH 4/4] oom: don't ignore rss in nascent mm KOSAKI Motohiro
  2010-10-25 11:28   ` pageexec
@ 2010-11-23 14:34   ` Oleg Nesterov
  2010-11-24  0:24     ` KOSAKI Motohiro
  1 sibling, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-23 14:34 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath

On 10/25, KOSAKI Motohiro wrote:
>
> Because execve() makes new mm struct and setup stack and
> copy argv. It mean the task have two mm while execve() temporary.
> Unfortunately this nascent mm is not pointed any tasks, then
> OOM-killer can't detect this memory usage. therefore OOM-killer
> may kill incorrect task.
>
> Thus, this patch added signal->in_exec_mm member and track
> nascent mm usage.

Stupid question.

Can't we just account these allocations in the old -mm temporary?

IOW. Please look at the "patch" below. It is of course incomplete
and wrong (to the point inc_mm_counter() is not safe without
SPLIT_RSS_COUNTING), and copy_strings/flush_old_exec are not the
best places to play with mm-counters, just to explain what I mean.

It is very simple. copy_strings() increments MM_ANONPAGES every
time we add a new page into bprm->vma. This makes this memory
visible to select_bad_process().

When exec changes ->mm (or if it fails), we change MM_ANONPAGES
counter back.

Most probably I missed something, but what do you think?

Oleg.

--- x/include/linux/binfmts.h
+++ x/include/linux/binfmts.h
@@ -29,6 +29,7 @@ struct linux_binprm{
 	char buf[BINPRM_BUF_SIZE];
 #ifdef CONFIG_MMU
 	struct vm_area_struct *vma;
+	unsigned long mm_anonpages;
 #else
 # define MAX_ARG_PAGES	32
 	struct page *page[MAX_ARG_PAGES];
--- x/fs/exec.c
+++ x/fs/exec.c
@@ -457,6 +457,9 @@ static int copy_strings(int argc, const 
 					goto out;
 				}
 
+				bmrp->mm_anonpages--;
+				inc_mm_counter(current->mm, MM_ANONPAGES);
+
 				if (kmapped_page) {
 					flush_kernel_dcache_page(kmapped_page);
 					kunmap(kmapped_page);
@@ -1003,6 +1006,7 @@ int flush_old_exec(struct linux_binprm *
 	/*
 	 * Release all of the old mmap stuff
 	 */
+	add_mm_counter(current->mm, bprm->mm_anonpages);
 	retval = exec_mmap(bprm->mm);
 	if (retval)
 		goto out;
@@ -1426,8 +1430,10 @@ int do_execve(const char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
-		mmput (bprm->mm);
+	if (bprm->mm) {
+		add_mm_counter(current->mm, bprm->mm_anonpages);
+		mmput(bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-11-23 14:34   ` Oleg Nesterov
@ 2010-11-24  0:24     ` KOSAKI Motohiro
  2010-11-24 11:09       ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-24  0:24 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath

Hi

> On 10/25, KOSAKI Motohiro wrote:
> >
> > Because execve() makes new mm struct and setup stack and
> > copy argv. It mean the task have two mm while execve() temporary.
> > Unfortunately this nascent mm is not pointed any tasks, then
> > OOM-killer can't detect this memory usage. therefore OOM-killer
> > may kill incorrect task.
> >
> > Thus, this patch added signal->in_exec_mm member and track
> > nascent mm usage.
> 
> Stupid question.
> 
> Can't we just account these allocations in the old -mm temporary?
> 
> IOW. Please look at the "patch" below. It is of course incomplete
> and wrong (to the point inc_mm_counter() is not safe without
> SPLIT_RSS_COUNTING), and copy_strings/flush_old_exec are not the
> best places to play with mm-counters, just to explain what I mean.
> 
> It is very simple. copy_strings() increments MM_ANONPAGES every
> time we add a new page into bprm->vma. This makes this memory
> visible to select_bad_process().
> 
> When exec changes ->mm (or if it fails), we change MM_ANONPAGES
> counter back.
> 
> Most probably I missed something, but what do you think?

Because, if the pages of argv are swapped out while processing execve,
this accounting doesn't work.

Of course, changing the swapping-out logic is one way. But I did hope
for no VM core logic change. Implicitly mlocking the argv area during execve
is also an option. But I did think implicit mlocking is more risky.

Is this enough explanation? Please don't hesitate say "no". If people
don't like my approach, I don't hesitate change my thinking.

Thanks.



^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-11-24  0:24     ` KOSAKI Motohiro
@ 2010-11-24 11:09       ` Oleg Nesterov
  2010-11-25 11:06         ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-24 11:09 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath

On 11/24, KOSAKI Motohiro wrote:
>
> Hi
>
> > On 10/25, KOSAKI Motohiro wrote:
> > >
> > > Because execve() makes new mm struct and setup stack and
> > > copy argv. It mean the task have two mm while execve() temporary.
> > > Unfortunately this nascent mm is not pointed any tasks, then
> > > OOM-killer can't detect this memory usage. therefore OOM-killer
> > > may kill incorrect task.
> > >
> > > Thus, this patch added signal->in_exec_mm member and track
> > > nascent mm usage.
> >
> > Stupid question.
> >
> > Can't we just account these allocations in the old -mm temporary?
> >
> > IOW. Please look at the "patch" below. It is of course incomplete
> > and wrong (to the point inc_mm_counter() is not safe without
> > SPLIT_RSS_COUNTING), and copy_strings/flush_old_exec are not the
> > best places to play with mm-counters, just to explain what I mean.
> >
> > It is very simple. copy_strings() increments MM_ANONPAGES every
> > time we add a new page into bprm->vma. This makes this memory
> > visible to select_bad_process().
> >
> > When exec changes ->mm (or if it fails), we change MM_ANONPAGES
> > counter back.
> >
> > Most probably I missed something, but what do you think?
>
> Because, If the pages of argv is swapping out when processing execve,
> This accouing doesn't work.

Why?

If copy_strings() inserts the new page into bprm->vma and then
this page is swapped out, inc_mm_counter(current->mm, MM_ANONPAGES)
becomes incorrect, yes. And we can't turn it into MM_SWAPENTS.

But does this really matter? oom_badness() counts MM_ANONPAGES +
MM_SWAPENTS, and result is the same.

> Is this enough explanation? Please don't hesitate say "no". If people
> don't like my approach, I don't hesitate change my thinking.

Well, certainly I can't say no ;)

But it would be nice to find a more simple fix (if it can work,
of course).


And. I need a simple solution for the older kernels.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-11-24 11:09       ` Oleg Nesterov
@ 2010-11-25 11:06         ` KOSAKI Motohiro
  2010-11-25 14:02           ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-25 11:06 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath

> > > Stupid question.
> > >
> > > Can't we just account these allocations in the old -mm temporary?
> > >
> > > IOW. Please look at the "patch" below. It is of course incomplete
> > > and wrong (to the point inc_mm_counter() is not safe without
> > > SPLIT_RSS_COUNTING), and copy_strings/flush_old_exec are not the
> > > best places to play with mm-counters, just to explain what I mean.
> > >
> > > It is very simple. copy_strings() increments MM_ANONPAGES every
> > > time we add a new page into bprm->vma. This makes this memory
> > > visible to select_bad_process().
> > >
> > > When exec changes ->mm (or if it fails), we change MM_ANONPAGES
> > > counter back.
> > >
> > > Most probably I missed something, but what do you think?
> >
> > Because, If the pages of argv is swapping out when processing execve,
> > This accouing doesn't work.
> 
> Why?
> 
> If copy_strings() inserts the new page into bprm->vma and then
> this page is swapped out, inc_mm_counter(current->mm, MM_ANONPAGES)
> becomes incorrect, yes. And we can't turn it into MM_SWAPENTS.
> 
> But does this really matter? oom_badness() counts MM_ANONPAGES +
> MM_SWAPENTS, and result is the same.

Ah, I got it. I was too fixated on exact accounting, but
you mean it's not a must.

Okay, I'll tackle this one this weekend, hopefully.



> > Is this enough explanation? Please don't hesitate say "no". If people
> > don't like my approach, I don't hesitate change my thinking.
> 
> Well, certainly I can't say no ;)
> 
> But it would be nice to find a more simple fix (if it can work,
> of course).
> 
> 
> And. I need a simple solution for the older kernels.

Alright. That is certainly worth considering.



^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-11-25 11:06         ` KOSAKI Motohiro
@ 2010-11-25 14:02           ` Oleg Nesterov
  2010-11-25 19:36             ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-25 14:02 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath

On 11/25, KOSAKI Motohiro wrote:
>
> > > > It is very simple. copy_strings() increments MM_ANONPAGES every
> > > > time we add a new page into bprm->vma. This makes this memory
> > > > visible to select_bad_process().
> > > >
> > > > When exec changes ->mm (or if it fails), we change MM_ANONPAGES
> > > > counter back.
> > > >
> > > > Most probably I missed something, but what do you think?
> > >
> > > Because, If the pages of argv is swapping out when processing execve,
> > > This accouing doesn't work.
> >
> > Why?
> >
> > If copy_strings() inserts the new page into bprm->vma and then
> > this page is swapped out, inc_mm_counter(current->mm, MM_ANONPAGES)
> > becomes incorrect, yes. And we can't turn it into MM_SWAPENTS.
> >
> > But does this really matter? oom_badness() counts MM_ANONPAGES +
> > MM_SWAPENTS, and result is the same.
>
> Ah, I got it. I did too strongly get stucked correct accounting. but
> you mean it's not must.

Yes. In fact, I _think_ this patch makes accounting better, even if
the extra MM_ANONPAGES numbers are not 100% correct.

Even if we add signal->in_exec_mm, nobody except oom_badness() will
look at it.

With this patch, say, /proc/pid/statm or /proc/pid/status will report
the memory allocated by the execing task. Even if technically this is
not correct (and 'swap' part may be wrong), this makes sense imho.
Otherwise, there is no way to see that this task allocates (maybe
a lot of) memory.

This can "confuse" update_hiwater_rss(), but imho this is fine too.


> > > Is this enough explanation? Please don't hesitate say "no". If people
> > > don't like my approach, I don't hesitate change my thinking.
> >
> > Well, certainly I can't say no ;)
> >
> > But it would be nice to find a more simple fix (if it can work,
> > of course).
> >
> >
> > And. I need a simple solution for the older kernels.
>
> Alright. It is certinally considerable one.

Great! I'll send the patch tomorrow.

Even if you prefer another fix for 2.6.37/stable, I'd like to see
your review to know if it is correct or not (for backporting).

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-11-25 14:02           ` Oleg Nesterov
@ 2010-11-25 19:36             ` Oleg Nesterov
  2010-11-29  5:25               ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-25 19:36 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath

On 11/25, Oleg Nesterov wrote:
>
> Great! I'll send the patch tomorrow.
>
> Even if you prefer another fix for 2.6.37/stable, I'd like to see
> your review to know if it is correct or not (for backporting).

OK, what do you think about the patch below?

Seems to work, with this patch the test-case doesn't kill the
system (sysctl_oom_kill_allocating_task == 0).

I didn't dare to change !CONFIG_MMU case, I do not know how to
test it.

The patch is not complete, compat_copy_strings() needs changes.
But, shouldn't it use get_arg_page() too? Otherwise, where do
we check RLIMIT_STACK?

The patch asks for the cleanups. In particular, I think exec_mmap()
should accept bprm, not mm. But I'd prefer to do this later.

Oleg.

 include/linux/binfmts.h |    1 +
 fs/exec.c               |   28 ++++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

--- K/include/linux/binfmts.h~acct_exec_mem	2010-08-19 11:35:00.000000000 +0200
+++ K/include/linux/binfmts.h	2010-11-25 20:19:33.000000000 +0100
@@ -33,6 +33,7 @@ struct linux_binprm{
 # define MAX_ARG_PAGES	32
 	struct page *page[MAX_ARG_PAGES];
 #endif
+	unsigned long vma_pages;
 	struct mm_struct *mm;
 	unsigned long p; /* current top of mem */
 	unsigned int
--- K/fs/exec.c~acct_exec_mem	2010-11-25 15:16:56.000000000 +0100
+++ K/fs/exec.c	2010-11-25 20:20:49.000000000 +0100
@@ -162,6 +162,25 @@ out:
   	return error;
 }
 
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+	struct mm_struct *mm = current->mm;
+	long diff = pages - bprm->vma_pages;
+
+	if (!mm || !diff)
+		return;
+
+	bprm->vma_pages += diff;
+
+#ifdef SPLIT_RSS_COUNTING
+	add_mm_counter(mm, MM_ANONPAGES, diff);
+#else
+	spin_lock(&mm->page_table_lock);
+	add_mm_counter(mm, MM_ANONPAGES, diff);
+	spin_unlock(&mm->page_table_lock);
+#endif
+}
+
 #ifdef CONFIG_MMU
 
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct 
 		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
 		struct rlimit *rlim;
 
+		acct_arg_size(bprm, size / PAGE_SIZE);
+
 		/*
 		 * We've historically supported up to 32 pages (ARG_MAX)
 		 * of argument strings even with small stacks
@@ -1003,6 +1024,7 @@ int flush_old_exec(struct linux_binprm *
 	/*
 	 * Release all of the old mmap stuff
 	 */
+	acct_arg_size(bprm, 0);
 	retval = exec_mmap(bprm->mm);
 	if (retval)
 		goto out;
@@ -1426,8 +1448,10 @@ int do_execve(const char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
-		mmput (bprm->mm);
+	if (bprm->mm) {
+		acct_arg_size(bprm, 0);
+		mmput(bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-23  7:16                     ` KOSAKI Motohiro
@ 2010-11-28  1:41                       ` David Rientjes
  2010-11-30 13:03                         ` KOSAKI Motohiro
  0 siblings, 1 reply; 109+ messages in thread
From: David Rientjes @ 2010-11-28  1:41 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm

On Tue, 23 Nov 2010, KOSAKI Motohiro wrote:

> > > No irrelevant. Your patch break their environment even though
> > > they don't use oom_adj explicitly. because their application are using it.
> > > 
> > 
> > The _only_ difference too oom_adj since the rewrite is that it is now 
> > mapped on a linear scale rather than an exponential scale.  
> 
> _only_ mean don't ZERO different. Why do userland application need to rewrite?
> 

Because NOTHING breaks with the new mapping.  Eight months later since 
this was initially proposed on linux-mm, you still cannot show a single 
example that depended on the exponential mapping of oom_adj.  I'm not 
going to continue responding to your criticism about this point since your 
argument is completely and utterly baseless.

> Again, IF you need to [0 .. 1000] range, you can calculate it by your
> application. current oom score can be get from /proc/pid/oom_score and
> total memory can be get from /proc/meminfo. You shouldn't have break
> anything.
> 

That would require the userspace tunable to be adjusted anytime a task's 
mempolicy changes, its nodemask changes, its cpuset attachment changes, 
its mems change, a memcg limit changes, etc.  The only constant is the 
task's priority, and the current oom_score_adj implementation preserves 
that unless explicitly changed later by the user.  I completely understand 
that you may not have a use for this.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-11-25 19:36             ` Oleg Nesterov
@ 2010-11-29  5:25               ` KOSAKI Motohiro
  2010-11-29 11:33                 ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-29  5:25 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath

> On 11/25, Oleg Nesterov wrote:
> >
> > Great! I'll send the patch tomorrow.
> >
> > Even if you prefer another fix for 2.6.37/stable, I'd like to see
> > your review to know if it is correct or not (for backporting).
> 
> OK, what do you think about the patch below?

Great. Thanks a lot.


> 
> Seems to work, with this patch the test-case doesn't kill the
> system (sysctl_oom_kill_allocating_task == 0).
> 
> I didn't dare to change !CONFIG_MMU case, I do not know how to
> test it.
> 
> The patch is not complete, compat_copy_strings() needs changes.
> But, shouldn't it use get_arg_page() too? Otherwise, where do
> we check RLIMIT_STACK?
> 

Because NOMMU doesn't have variable-length argv. Instead, it still
uses MAX_ARG_STRLEN, as the old MMU code did.

The hard-coded 32-page argv limit naturally prevents this nascent mm
issue.


> The patch asks for the cleanups. In particular, I think exec_mmap()
> should accept bprm, not mm. But I'd prefer to do this later.
> 
> Oleg.

General request: please consider keeping Brad's Reported-by tag.


> 
>  include/linux/binfmts.h |    1 +
>  fs/exec.c               |   28 ++++++++++++++++++++++++++--
>  2 files changed, 27 insertions(+), 2 deletions(-)
> 
> --- K/include/linux/binfmts.h~acct_exec_mem	2010-08-19 11:35:00.000000000 +0200
> +++ K/include/linux/binfmts.h	2010-11-25 20:19:33.000000000 +0100
> @@ -33,6 +33,7 @@ struct linux_binprm{
>  # define MAX_ARG_PAGES	32
>  	struct page *page[MAX_ARG_PAGES];
>  #endif
> +	unsigned long vma_pages;
>  	struct mm_struct *mm;
>  	unsigned long p; /* current top of mem */
>  	unsigned int
> --- K/fs/exec.c~acct_exec_mem	2010-11-25 15:16:56.000000000 +0100
> +++ K/fs/exec.c	2010-11-25 20:20:49.000000000 +0100
> @@ -162,6 +162,25 @@ out:
>    	return error;
>  }
>  
> +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)

Please move this function into #ifdef CONFIG_MMU. nommu code doesn't use it.

> +{
> +	struct mm_struct *mm = current->mm;
> +	long diff = pages - bprm->vma_pages;

I'd prefer casting to signed before the assignment. It's safer.


> +
> +	if (!mm || !diff)
> +		return;
> +
> +	bprm->vma_pages += diff;
> +
> +#ifdef SPLIT_RSS_COUNTING
> +	add_mm_counter(mm, MM_ANONPAGES, diff);
> +#else
> +	spin_lock(&mm->page_table_lock);
> +	add_mm_counter(mm, MM_ANONPAGES, diff);
> +	spin_unlock(&mm->page_table_lock);
> +#endif

OK, looks good.


> +}
> +
>  #ifdef CONFIG_MMU
>  
>  static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
> @@ -186,6 +205,8 @@ static struct page *get_arg_page(struct 
>  		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
>  		struct rlimit *rlim;
>  
> +		acct_arg_size(bprm, size / PAGE_SIZE);
> +
>  		/*
>  		 * We've historically supported up to 32 pages (ARG_MAX)
>  		 * of argument strings even with small stacks
> @@ -1003,6 +1024,7 @@ int flush_old_exec(struct linux_binprm *
>  	/*
>  	 * Release all of the old mmap stuff
>  	 */
> +	acct_arg_size(bprm, 0);

Why do we need this unaccounting here? I mean 1) if exec_mmap() succeeds,
we don't need to unaccount at all; 2) if exec_mmap() fails, the epilogue of
do_execve() does the unaccounting.


>  	retval = exec_mmap(bprm->mm);
>  	if (retval)
>  		goto out;
> @@ -1426,8 +1448,10 @@ int do_execve(const char * filename,
>  	return retval;
>  
>  out:
> -	if (bprm->mm)
> -		mmput (bprm->mm);
> +	if (bprm->mm) {
> +		acct_arg_size(bprm, 0);
> +		mmput(bprm->mm);
> +	}
>  
>  out_file:
>  	if (bprm->file) {
> 




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-11-29  5:25               ` KOSAKI Motohiro
@ 2010-11-29 11:33                 ` Oleg Nesterov
  2010-11-29 18:23                   ` Oleg Nesterov
  2010-11-30  0:06                   ` [resend][PATCH 4/4] oom: don't ignore rss in nascent mm KOSAKI Motohiro
  0 siblings, 2 replies; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-29 11:33 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath

On 11/29, KOSAKI Motohiro wrote:
>
> > The patch is not complete, compat_copy_strings() needs changes.
> > But, shouldn't it use get_arg_page() too? Otherwise, where do
> > we check RLIMIT_STACK?
>
> Because NOMMU doesn't have variable length argv. Instead it is still
> using MAX_ARG_STRLEN as old MMU code.
>
> 32 pages hard coded argv limitation naturally prevent this nascent mm
> issue.

Ah, I didn't mean NOMMU. I meant compat_execve()->compat_copy_strings().
If a 32bit process execs we seem to miss the RLIMIT_STACK check, no?

> > The patch asks for the cleanups. In particular, I think exec_mmap()
> > should accept bprm, not mm. But I'd prefer to do this later.
> >
> > Oleg.
>
> General request. Please consider to keep Brad's reported-by tag.

Yes, yes, sure.

> > +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)

OK.

> Please move this function into #ifdef CONFIG_MMU. nommu code doesn't use it.

Well it does, to revert the MM_ANONPAGES counter. I'll add the empty
function for NOMMU.

> > +{
> > +	struct mm_struct *mm = current->mm;
> > +	long diff = pages - bprm->vma_pages;
>
> I prefer to cast signed before assignment. It's safer more.

OK.

> > @@ -1003,6 +1024,7 @@ int flush_old_exec(struct linux_binprm *
> >  	/*
> >  	 * Release all of the old mmap stuff
> >  	 */
> > +	acct_arg_size(bprm, 0);
>
> Why do we need this unacct here? I mean 1) if exec_mmap() is success,
> we don't need unaccount at all

Yes, we already killed all sub-threads. But this doesn't mean nobody
else can use current->mm, think about CLONE_VM. The simplest example
is vfork().

> 2) if exec_mmap() is failure, an epilogue of
> do_execve() does unaccount thing.

Yes.

Thanks Kosaki!

I'll resend v2 today. I am still not sure about compat_copy_strings()...

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-11-29 11:33                 ` Oleg Nesterov
@ 2010-11-29 18:23                   ` Oleg Nesterov
  2010-11-30 19:54                     ` [PATCH 0/2] exec: more excessive argument size fixes for 2.6.37/stable Oleg Nesterov
  2010-11-30  0:06                   ` [resend][PATCH 4/4] oom: don't ignore rss in nascent mm KOSAKI Motohiro
  1 sibling, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-29 18:23 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath

On 11/29, Oleg Nesterov wrote:
>
> I'll resend v2 today.

OK, please see below, just for your review.

I was going to send it "officially" with the changelog/etc, but

> I am still not sure about compat_copy_strings()...

Yes, I think it needs the same checks. It should use get_arg_page()
or we need more copy-and-paste code, I think it should also check
fatal_signal_pending() like copy_strings() does.

I was going to export get_arg_page/acct_arg_size, but it is so
ugly. I'll try to find the way to unify copy_strings and
compat_copy_strings, not sure it is possible to do cleanly.

Probably this needs a separate patch in any case.

Oleg.

Changes:

	- move acct_arg_size() under CONFIG_MMU

	- add the "nop" version for NOMMU

 include/linux/binfmts.h |    1 +
 fs/exec.c               |   32 ++++++++++++++++++++++++++++++--
 2 files changed, 31 insertions(+), 2 deletions(-)

--- K/include/linux/binfmts.h~acct_exec_mem	2010-08-19 11:35:00.000000000 +0200
+++ K/include/linux/binfmts.h	2010-11-29 17:29:35.000000000 +0100
@@ -29,6 +29,7 @@ struct linux_binprm{
 	char buf[BINPRM_BUF_SIZE];
 #ifdef CONFIG_MMU
 	struct vm_area_struct *vma;
+	unsigned long vma_pages;
 #else
 # define MAX_ARG_PAGES	32
 	struct page *page[MAX_ARG_PAGES];
--- K/fs/exec.c~acct_exec_mem	2010-11-25 15:16:56.000000000 +0100
+++ K/fs/exec.c	2010-11-29 17:51:43.000000000 +0100
@@ -164,6 +164,25 @@ out:
 
 #ifdef CONFIG_MMU
 
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+	struct mm_struct *mm = current->mm;
+	long diff = (long)(pages - bprm->vma_pages);
+
+	if (!mm || !diff)
+		return;
+
+	bprm->vma_pages = pages;
+
+#ifdef SPLIT_RSS_COUNTING
+	add_mm_counter(mm, MM_ANONPAGES, diff);
+#else
+	spin_lock(&mm->page_table_lock);
+	add_mm_counter(mm, MM_ANONPAGES, diff);
+	spin_unlock(&mm->page_table_lock);
+#endif
+}
+
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct 
 		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
 		struct rlimit *rlim;
 
+		acct_arg_size(bprm, size / PAGE_SIZE);
+
 		/*
 		 * We've historically supported up to 32 pages (ARG_MAX)
 		 * of argument strings even with small stacks
@@ -276,6 +297,10 @@ static bool valid_arg_len(struct linux_b
 
 #else
 
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+}
+
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
@@ -1003,6 +1028,7 @@ int flush_old_exec(struct linux_binprm *
 	/*
 	 * Release all of the old mmap stuff
 	 */
+	acct_arg_size(bprm, 0);
 	retval = exec_mmap(bprm->mm);
 	if (retval)
 		goto out;
@@ -1426,8 +1452,10 @@ int do_execve(const char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
-		mmput (bprm->mm);
+	if (bprm->mm) {
+		acct_arg_size(bprm, 0);
+		mmput(bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 4/4] oom: don't ignore rss in nascent mm
  2010-11-29 11:33                 ` Oleg Nesterov
  2010-11-29 18:23                   ` Oleg Nesterov
@ 2010-11-30  0:06                   ` KOSAKI Motohiro
  1 sibling, 0 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-30  0:06 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath

> On 11/29, KOSAKI Motohiro wrote:
> >
> > > The patch is not complete, compat_copy_strings() needs changes.
> > > But, shouldn't it use get_arg_page() too? Otherwise, where do
> > > we check RLIMIT_STACK?
> >
> > Because NOMMU doesn't have variable length argv. Instead it is still
> > using MAX_ARG_STRLEN as old MMU code.
> >
> > 32 pages hard coded argv limitation naturally prevent this nascent mm
> > issue.
> 
> Ah, I didn't mean NOMMU. I meant compat_execve()->compat_copy_strings().
> If a 32bit process execs we seem to miss the RLIMIT_STACK check, no?

Ah, yes. That's a bug. You have found a more serious issue ;)



> > > The patch asks for the cleanups. In particular, I think exec_mmap()
> > > should accept bprm, not mm. But I'd prefer to do this later.
> > >
> > > Oleg.
> >
> > General request. Please consider to keep Brad's reported-by tag.
> 
> Yes, yes, sure.
> 
> > > +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
> 
> OK.
> 
> > Please move this function into #ifdef CONFIG_MMU. nommu code doesn't use it.
> 
> Well it does, to revert the MM_ANONPAGES counter. I'll add the empty
> function for NOMMU.
> 
> > > +{
> > > +	struct mm_struct *mm = current->mm;
> > > +	long diff = pages - bprm->vma_pages;
> >
> > I prefer to cast signed before assignment. It's safer more.
> 
> OK.
> 
> > > @@ -1003,6 +1024,7 @@ int flush_old_exec(struct linux_binprm *
> > >  	/*
> > >  	 * Release all of the old mmap stuff
> > >  	 */
> > > +	acct_arg_size(bprm, 0);
> >
> > Why do we need this unacct here? I mean 1) if exec_mmap() is success,
> > we don't need unaccount at all
> 
> Yes, we already killed all sub-threads. But this doesn't mean nobody
> else can use current->mm, think about CLONE_VM. The simplest example
> is vfork().

Right you are.


> > 2) if exec_mmap() is failure, an epilogue of
> > do_execve() does unaccount thing.
> 
> Yes.
> 
> Thanks Kosaki!
> 
> I'll resend v2 today. I am still not sure about compat_copy_strings()...
> 
> Oleg.
> 




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-28  1:41                       ` David Rientjes
@ 2010-11-30 13:03                         ` KOSAKI Motohiro
  2010-11-30 20:07                           ` David Rientjes
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-11-30 13:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm

> On Tue, 23 Nov 2010, KOSAKI Motohiro wrote:
> 
> > > > No irrelevant. Your patch break their environment even though
> > > > they don't use oom_adj explicitly. because their application are using it.
> > > > 
> > > 
> > > The _only_ difference too oom_adj since the rewrite is that it is now 
> > > mapped on a linear scale rather than an exponential scale.  
> > 
> > _only_ mean don't ZERO different. Why do userland application need to rewrite?
> > 
> 
> Because NOTHING breaks with the new mapping.  Eight months later since 
> this was initially proposed on linux-mm, you still cannot show a single 
> example that depended on the exponential mapping of oom_adj.  I'm not 
> going to continue responding to your criticism about this point since your 
> argument is completely and utterly baseless.

No regression means no breakage. Not a single one, nor multiple. See?


> 
> > Again, IF you need to [0 .. 1000] range, you can calculate it by your
> > application. current oom score can be get from /proc/pid/oom_score and
> > total memory can be get from /proc/meminfo. You shouldn't have break
> > anything.
> > 
> 
> That would require the userspace tunable to be adjusted anytime a task's 
> mempolicy changes, its nodemask changes, it's cpuset attachment changes, 

All of these situations can be calculated in userland. A user process can
know its bindings.



> its mems change, a memcg limit changes, etc.  The only constant is the 
> task's priority, and the current oom_score_adj implementation preserves 
> that unless explicitly changed later by the user.  I completely understand 
> that you may not have a use for this.




^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 0/2] exec: more excessive argument size fixes for 2.6.37/stable
  2010-11-29 18:23                   ` Oleg Nesterov
@ 2010-11-30 19:54                     ` Oleg Nesterov
  2010-11-30 19:55                       ` [PATCH 1/2] exec: make argv/envp memory visible to oom-killer Oleg Nesterov
                                         ` (2 more replies)
  0 siblings, 3 replies; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-30 19:54 UTC (permalink / raw)
  To: KOSAKI Motohiro, Andrew Morton, Linus Torvalds
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath, stable

On 11/29, Oleg Nesterov wrote:
>
> I was going to export get_arg_page/acct_arg_size, but it is so
> ugly.

But I think this is the only option for 2.6.37/stable.

So I am sending 2 patches; hopefully they fix the problems
and they are simple enough for 2.6.37/stable.

> I'll try to find the way to unify copy_strings and
> compat_copy_strings, not sure it is possible to do cleanly.

I'll send the cleanups which unify compat/non-compat code on
top of these fixes, this is not stable material.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 1/2] exec: make argv/envp memory visible to oom-killer
  2010-11-30 19:54                     ` [PATCH 0/2] exec: more excessive argument size fixes for 2.6.37/stable Oleg Nesterov
@ 2010-11-30 19:55                       ` Oleg Nesterov
  2010-12-01  0:12                         ` KOSAKI Motohiro
  2010-11-30 19:56                       ` [PATCH 2/2] exec: copy-and-paste the fixes into compat_do_execve() paths Oleg Nesterov
  2010-11-30 20:00                       ` [PATCH 0/4] exec: unify compat/non-compat code Oleg Nesterov
  2 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-30 19:55 UTC (permalink / raw)
  To: KOSAKI Motohiro, Andrew Morton, Linus Torvalds
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath, stable

Brad Spengler published a local memory-allocation DoS that
evades the OOM-killer (though not the virtual memory RLIMIT):
http://www.grsecurity.net/~spender/64bit_dos.c

execve()->copy_strings() can allocate a lot of memory, but
this is not visible to oom-killer, nobody can see the nascent
bprm->mm and take it into account.

With this patch get_arg_page() increments current's MM_ANONPAGES
counter every time we allocate the new page for argv/envp. When
do_execve() succeeds or fails, we change this counter back.

Technically this is not 100% correct, we can't know if the new
page is swapped out and turn MM_ANONPAGES into MM_SWAPENTS, but
I don't think this really matters and everything becomes correct
once exec changes ->mm or fails.

Reported-by: Brad Spengler <spender@grsecurity.net>
By-discussion-with: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 include/linux/binfmts.h |    1 +
 fs/exec.c               |   32 ++++++++++++++++++++++++++++++--
 2 files changed, 31 insertions(+), 2 deletions(-)

--- K/include/linux/binfmts.h~acct_exec_mem	2010-11-30 18:27:15.000000000 +0100
+++ K/include/linux/binfmts.h	2010-11-30 18:28:54.000000000 +0100
@@ -29,6 +29,7 @@ struct linux_binprm{
 	char buf[BINPRM_BUF_SIZE];
 #ifdef CONFIG_MMU
 	struct vm_area_struct *vma;
+	unsigned long vma_pages;
 #else
 # define MAX_ARG_PAGES	32
 	struct page *page[MAX_ARG_PAGES];
--- K/fs/exec.c~acct_exec_mem	2010-11-30 18:27:15.000000000 +0100
+++ K/fs/exec.c	2010-11-30 18:28:54.000000000 +0100
@@ -164,6 +164,25 @@ out:
 
 #ifdef CONFIG_MMU
 
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+	struct mm_struct *mm = current->mm;
+	long diff = (long)(pages - bprm->vma_pages);
+
+	if (!mm || !diff)
+		return;
+
+	bprm->vma_pages = pages;
+
+#ifdef SPLIT_RSS_COUNTING
+	add_mm_counter(mm, MM_ANONPAGES, diff);
+#else
+	spin_lock(&mm->page_table_lock);
+	add_mm_counter(mm, MM_ANONPAGES, diff);
+	spin_unlock(&mm->page_table_lock);
+#endif
+}
+
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct 
 		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
 		struct rlimit *rlim;
 
+		acct_arg_size(bprm, size / PAGE_SIZE);
+
 		/*
 		 * We've historically supported up to 32 pages (ARG_MAX)
 		 * of argument strings even with small stacks
@@ -276,6 +297,10 @@ static bool valid_arg_len(struct linux_b
 
 #else
 
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+}
+
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
@@ -1003,6 +1028,7 @@ int flush_old_exec(struct linux_binprm *
 	/*
 	 * Release all of the old mmap stuff
 	 */
+	acct_arg_size(bprm, 0);
 	retval = exec_mmap(bprm->mm);
 	if (retval)
 		goto out;
@@ -1426,8 +1452,10 @@ int do_execve(const char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
-		mmput (bprm->mm);
+	if (bprm->mm) {
+		acct_arg_size(bprm, 0);
+		mmput(bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 2/2] exec: copy-and-paste the fixes into compat_do_execve() paths
  2010-11-30 19:54                     ` [PATCH 0/2] exec: more excessive argument size fixes for 2.6.37/stable Oleg Nesterov
  2010-11-30 19:55                       ` [PATCH 1/2] exec: make argv/envp memory visible to oom-killer Oleg Nesterov
@ 2010-11-30 19:56                       ` Oleg Nesterov
  2010-12-01  3:04                         ` KOSAKI Motohiro
  2010-11-30 20:00                       ` [PATCH 0/4] exec: unify compat/non-compat code Oleg Nesterov
  2 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-30 19:56 UTC (permalink / raw)
  To: KOSAKI Motohiro, Andrew Morton, Linus Torvalds
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath, stable

Note: this patch targets 2.6.37 and tries to be as simple as possible.
That is why it adds more copy-and-paste horror into fs/compat.c and
uglifies fs/exec.c, this will be cleaned up later.

compat_copy_strings() plays with bprm->vma/mm directly and thus has
two problems: it lacks the RLIMIT_STACK check and argv/envp memory
is not visible to oom killer.

Export acct_arg_size() and get_arg_page(), change compat_copy_strings()
to use get_arg_page(), change compat_do_execve() to do acct_arg_size(0)
as do_execve() does.

Add the fatal_signal_pending/cond_resched checks into compat_count() and
compat_copy_strings(), this matches the code in fs/exec.c and certainly
makes sense.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 include/linux/binfmts.h |    4 ++++
 fs/exec.c               |    8 ++++----
 fs/compat.c             |   28 +++++++++++++++-------------
 3 files changed, 23 insertions(+), 17 deletions(-)

--- K/include/linux/binfmts.h~compat_get_arg_page	2010-11-30 18:28:54.000000000 +0100
+++ K/include/linux/binfmts.h	2010-11-30 18:30:45.000000000 +0100
@@ -60,6 +60,10 @@ struct linux_binprm{
 	unsigned long loader, exec;
 };
 
+extern void acct_arg_size(struct linux_binprm *bprm, unsigned long pages);
+extern struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+					int write);
+
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
 
--- K/fs/exec.c~compat_get_arg_page	2010-11-30 18:28:54.000000000 +0100
+++ K/fs/exec.c	2010-11-30 18:30:45.000000000 +0100
@@ -164,7 +164,7 @@ out:
 
 #ifdef CONFIG_MMU
 
-static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -183,7 +183,7 @@ static void acct_arg_size(struct linux_b
 #endif
 }
 
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -297,11 +297,11 @@ static bool valid_arg_len(struct linux_b
 
 #else
 
-static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
--- K/fs/compat.c~compat_get_arg_page	2010-11-30 17:55:20.000000000 +0100
+++ K/fs/compat.c	2010-11-30 18:30:45.000000000 +0100
@@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __
 			argv++;
 			if (i++ >= max)
 				return -E2BIG;
+
+			if (fatal_signal_pending(current))
+				return -ERESTARTNOHAND;
+			cond_resched();
 		}
 	}
 	return i;
@@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc,
 		while (len > 0) {
 			int offset, bytes_to_copy;
 
+			if (fatal_signal_pending(current)) {
+				ret = -ERESTARTNOHAND;
+				goto out;
+			}
+			cond_resched();
+
 			offset = pos % PAGE_SIZE;
 			if (offset == 0)
 				offset = PAGE_SIZE;
@@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc,
 			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
 				struct page *page;
 
-#ifdef CONFIG_STACK_GROWSUP
-				ret = expand_stack_downwards(bprm->vma, pos);
-				if (ret < 0) {
-					/* We've exceed the stack rlimit. */
-					ret = -E2BIG;
-					goto out;
-				}
-#endif
-				ret = get_user_pages(current, bprm->mm, pos,
-						     1, 1, 1, &page, NULL);
-				if (ret <= 0) {
-					/* We've exceed the stack rlimit. */
+				page = get_arg_page(bprm, pos, 1);
+				if (!page) {
 					ret = -E2BIG;
 					goto out;
 				}
@@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
+	if (bprm->mm) {
+		acct_arg_size(bprm, 0);
 		mmput(bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 0/4] exec: unify compat/non-compat code
  2010-11-30 19:54                     ` [PATCH 0/2] exec: more excessive argument size fixes for 2.6.37/stable Oleg Nesterov
  2010-11-30 19:55                       ` [PATCH 1/2] exec: make argv/envp memory visible to oom-killer Oleg Nesterov
  2010-11-30 19:56                       ` [PATCH 2/2] exec: copy-and-paste the fixes into compat_do_execve() paths Oleg Nesterov
@ 2010-11-30 20:00                       ` Oleg Nesterov
  2010-11-30 20:00                         ` [PATCH 1/4] exec: introduce get_arg_ptr() helper Oleg Nesterov
                                           ` (4 more replies)
  2 siblings, 5 replies; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-30 20:00 UTC (permalink / raw)
  To: KOSAKI Motohiro, Andrew Morton, Linus Torvalds
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath

(remove stable)

On 11/30, Oleg Nesterov wrote:
>
> I'll send the cleanups which unify compat/non-compat code on
> top of these fixes, this is not stable material.

On top of

	[PATCH 1/2] exec: make argv/envp memory visible to oom-killer
	[PATCH 2/2] exec: copy-and-paste the fixes into compat_do_execve() paths

Imho, execve code in fs/compat.c must die. It is very hard to
maintain this copy-and-paste horror.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 1/4] exec: introduce get_arg_ptr() helper
  2010-11-30 20:00                       ` [PATCH 0/4] exec: unify compat/non-compat code Oleg Nesterov
@ 2010-11-30 20:00                         ` Oleg Nesterov
  2010-11-30 20:01                         ` [PATCH 2/4] exec: introduce "bool compat" argument Oleg Nesterov
                                           ` (3 subsequent siblings)
  4 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-30 20:00 UTC (permalink / raw)
  To: KOSAKI Motohiro, Andrew Morton, Linus Torvalds
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath

Introduce get_arg_ptr() helper, convert count() and copy_strings()
to use it.

No functional changes, preparation. This helper is trivial, it just
reads the pointer from argv/envp user-space array.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |   36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

--- K/fs/exec.c~1_get_arg_ptr	2010-11-30 18:30:45.000000000 +0100
+++ K/fs/exec.c	2010-11-30 19:14:54.000000000 +0100
@@ -390,6 +390,17 @@ err:
 	return err;
 }
 
+static const char __user *
+get_arg_ptr(const char __user * const __user *argv, int argc)
+{
+	const char __user *ptr;
+
+	if (get_user(ptr, argv + argc))
+		return ERR_PTR(-EFAULT);
+
+	return ptr;
+}
+
 /*
  * count() counts the number of strings in array ARGV.
  */
@@ -399,13 +410,14 @@ static int count(const char __user * con
 
 	if (argv != NULL) {
 		for (;;) {
-			const char __user * p;
+			const char __user *p = get_arg_ptr(argv, i);
 
-			if (get_user(p, argv))
-				return -EFAULT;
 			if (!p)
 				break;
-			argv++;
+
+			if (IS_ERR(p))
+				return -EFAULT;
+
 			if (i++ >= max)
 				return -E2BIG;
 
@@ -435,16 +447,18 @@ static int copy_strings(int argc, const 
 		int len;
 		unsigned long pos;
 
-		if (get_user(str, argv+argc) ||
-				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		str = get_arg_ptr(argv, argc);
+		if (IS_ERR(str))
 			goto out;
-		}
 
-		if (!valid_arg_len(bprm, len)) {
-			ret = -E2BIG;
+		len = strnlen_user(str, MAX_ARG_STRLEN);
+		if (!len)
+			goto out;
+
+		ret = -E2BIG;
+		if (!valid_arg_len(bprm, len))
 			goto out;
-		}
 
 		/* We're going to work our way backwords. */
 		pos = bprm->p;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 2/4] exec: introduce "bool compat" argument
  2010-11-30 20:00                       ` [PATCH 0/4] exec: unify compat/non-compat code Oleg Nesterov
  2010-11-30 20:00                         ` [PATCH 1/4] exec: introduce get_arg_ptr() helper Oleg Nesterov
@ 2010-11-30 20:01                         ` Oleg Nesterov
  2010-11-30 20:01                         ` [PATCH 3/4] exec: unify compat_do_execve() code Oleg Nesterov
                                           ` (2 subsequent siblings)
  4 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-30 20:01 UTC (permalink / raw)
  To: KOSAKI Motohiro, Andrew Morton, Linus Torvalds
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath

No functional changes, preparation to simplify the review.

Add the new (and currently unused) "bool compat" argument to
get_arg_ptr(), count(), and copy_strings().

Add this argument to do_execve() as well, and rename it to
do_execve_common().

Reintroduce do_execve() as a trivial wrapper on top of
do_execve_common(compat => false).

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |   33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

--- K/fs/exec.c~2_is_compat_arg	2010-11-30 19:14:54.000000000 +0100
+++ K/fs/exec.c	2010-11-30 19:47:24.000000000 +0100
@@ -391,7 +391,7 @@ err:
 }
 
 static const char __user *
-get_arg_ptr(const char __user * const __user *argv, int argc)
+get_arg_ptr(const char __user * const __user *argv, int argc, bool compat)
 {
 	const char __user *ptr;
 
@@ -404,13 +404,13 @@ get_arg_ptr(const char __user * const __
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(const char __user * const __user * argv, int max)
+static int count(const char __user * const __user *argv, int max, bool compat)
 {
 	int i = 0;
 
 	if (argv != NULL) {
 		for (;;) {
-			const char __user *p = get_arg_ptr(argv, i);
+			const char __user *p = get_arg_ptr(argv, i, compat);
 
 			if (!p)
 				break;
@@ -435,7 +435,7 @@ static int count(const char __user * con
  * ensures the destination page is created and not swapped out.
  */
 static int copy_strings(int argc, const char __user *const __user *argv,
-			struct linux_binprm *bprm)
+			struct linux_binprm *bprm, bool compat)
 {
 	struct page *kmapped_page = NULL;
 	char *kaddr = NULL;
@@ -448,7 +448,7 @@ static int copy_strings(int argc, const 
 		unsigned long pos;
 
 		ret = -EFAULT;
-		str = get_arg_ptr(argv, argc);
+		str = get_arg_ptr(argv, argc, compat);
 		if (IS_ERR(str))
 			goto out;
 
@@ -531,7 +531,8 @@ int copy_strings_kernel(int argc, const 
 	int r;
 	mm_segment_t oldfs = get_fs();
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
+	r = copy_strings(argc, (const char __user *const  __user *)argv,
+				bprm, false);
 	set_fs(oldfs);
 	return r;
 }
@@ -1382,10 +1383,10 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(const char * filename,
+static int do_execve_common(const char *filename,
 	const char __user *const __user *argv,
 	const char __user *const __user *envp,
-	struct pt_regs * regs)
+	struct pt_regs *regs, bool compat)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1427,11 +1428,11 @@ int do_execve(const char * filename,
 	if (retval)
 		goto out_file;
 
-	bprm->argc = count(argv, MAX_ARG_STRINGS);
+	bprm->argc = count(argv, MAX_ARG_STRINGS, compat);
 	if ((retval = bprm->argc) < 0)
 		goto out;
 
-	bprm->envc = count(envp, MAX_ARG_STRINGS);
+	bprm->envc = count(envp, MAX_ARG_STRINGS, compat);
 	if ((retval = bprm->envc) < 0)
 		goto out;
 
@@ -1444,11 +1445,11 @@ int do_execve(const char * filename,
 		goto out;
 
 	bprm->exec = bprm->p;
-	retval = copy_strings(bprm->envc, envp, bprm);
+	retval = copy_strings(bprm->envc, envp, bprm, compat);
 	if (retval < 0)
 		goto out;
 
-	retval = copy_strings(bprm->argc, argv, bprm);
+	retval = copy_strings(bprm->argc, argv, bprm, compat);
 	if (retval < 0)
 		goto out;
 
@@ -1492,6 +1493,14 @@ out_ret:
 	return retval;
 }
 
+int do_execve(const char *filename,
+	const char __user *const __user *argv,
+	const char __user *const __user *envp,
+	struct pt_regs *regs)
+{
+	return do_execve_common(filename, argv, envp, regs, false);
+}
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 3/4] exec: unify compat_do_execve() code
  2010-11-30 20:00                       ` [PATCH 0/4] exec: unify compat/non-compat code Oleg Nesterov
  2010-11-30 20:00                         ` [PATCH 1/4] exec: introduce get_arg_ptr() helper Oleg Nesterov
  2010-11-30 20:01                         ` [PATCH 2/4] exec: introduce "bool compat" argument Oleg Nesterov
@ 2010-11-30 20:01                         ` Oleg Nesterov
  2010-12-01 17:37                           ` (No subject header) Milton Miller
  2010-11-30 20:01                         ` [PATCH 4/4] exec: unexport acct_arg_size() and get_arg_page() Oleg Nesterov
  2010-12-01  3:09                         ` [PATCH 0/4] exec: unify compat/non-compat code KOSAKI Motohiro
  4 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-30 20:01 UTC (permalink / raw)
  To: KOSAKI Motohiro, Andrew Morton, Linus Torvalds
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath

Teach get_arg_ptr() to handle compat = T case correctly.

This allows us to remove the compat_do_execve() code from fs/compat.c
and reimplement compat_do_execve() as the trivial wrapper on top of
do_execve_common(compat => true).

In fact, this fixes another (minor) bug. "compat_uptr_t str" can
overflow after "str += len" in compat_copy_strings() if a 64bit
application execs via sys32_execve().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c   |   25 ++++++
 fs/compat.c |  235 ------------------------------------------------------------
 2 files changed, 25 insertions(+), 235 deletions(-)

--- K/fs/exec.c~3_use_compat	2010-11-30 19:47:24.000000000 +0100
+++ K/fs/exec.c	2010-11-30 20:15:11.000000000 +0100
@@ -55,6 +55,7 @@
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -395,6 +396,18 @@ get_arg_ptr(const char __user * const __
 {
 	const char __user *ptr;
 
+#ifdef CONFIG_COMPAT
+	if (unlikely(compat)) {
+		compat_uptr_t __user *a = (void __user*)argv;
+		compat_uptr_t p;
+
+		if (get_user(p, a + argc))
+			return ERR_PTR(-EFAULT);
+
+		return compat_ptr(p);
+	}
+#endif
+
 	if (get_user(ptr, argv + argc))
 		return ERR_PTR(-EFAULT);
 
@@ -1501,6 +1514,18 @@ int do_execve(const char *filename,
 	return do_execve_common(filename, argv, envp, regs, false);
 }
 
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char * filename,
+	compat_uptr_t __user *argv,
+	compat_uptr_t __user *envp,
+	struct pt_regs * regs)
+{
+	return do_execve_common(filename,
+				(void __user*)argv, (void __user*)envp,
+				regs, true);
+}
+#endif
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;
--- K/fs/compat.c~3_use_compat	2010-11-30 18:30:45.000000000 +0100
+++ K/fs/compat.c	2010-11-30 20:17:28.000000000 +0100
@@ -1330,241 +1330,6 @@ compat_sys_openat(unsigned int dfd, cons
 	return do_sys_open(dfd, filename, flags, mode);
 }
 
-/*
- * compat_count() counts the number of arguments/envelopes. It is basically
- * a copy of count() from fs/exec.c, except that it works with 32 bit argv
- * and envp pointers.
- */
-static int compat_count(compat_uptr_t __user *argv, int max)
-{
-	int i = 0;
-
-	if (argv != NULL) {
-		for (;;) {
-			compat_uptr_t p;
-
-			if (get_user(p, argv))
-				return -EFAULT;
-			if (!p)
-				break;
-			argv++;
-			if (i++ >= max)
-				return -E2BIG;
-
-			if (fatal_signal_pending(current))
-				return -ERESTARTNOHAND;
-			cond_resched();
-		}
-	}
-	return i;
-}
-
-/*
- * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
- * except that it works with 32 bit argv and envp pointers.
- */
-static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
-				struct linux_binprm *bprm)
-{
-	struct page *kmapped_page = NULL;
-	char *kaddr = NULL;
-	unsigned long kpos = 0;
-	int ret;
-
-	while (argc-- > 0) {
-		compat_uptr_t str;
-		int len;
-		unsigned long pos;
-
-		if (get_user(str, argv+argc) ||
-		    !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		if (len > MAX_ARG_STRLEN) {
-			ret = -E2BIG;
-			goto out;
-		}
-
-		/* We're going to work our way backwords. */
-		pos = bprm->p;
-		str += len;
-		bprm->p -= len;
-
-		while (len > 0) {
-			int offset, bytes_to_copy;
-
-			if (fatal_signal_pending(current)) {
-				ret = -ERESTARTNOHAND;
-				goto out;
-			}
-			cond_resched();
-
-			offset = pos % PAGE_SIZE;
-			if (offset == 0)
-				offset = PAGE_SIZE;
-
-			bytes_to_copy = offset;
-			if (bytes_to_copy > len)
-				bytes_to_copy = len;
-
-			offset -= bytes_to_copy;
-			pos -= bytes_to_copy;
-			str -= bytes_to_copy;
-			len -= bytes_to_copy;
-
-			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
-				struct page *page;
-
-				page = get_arg_page(bprm, pos, 1);
-				if (!page) {
-					ret = -E2BIG;
-					goto out;
-				}
-
-				if (kmapped_page) {
-					flush_kernel_dcache_page(kmapped_page);
-					kunmap(kmapped_page);
-					put_page(kmapped_page);
-				}
-				kmapped_page = page;
-				kaddr = kmap(kmapped_page);
-				kpos = pos & PAGE_MASK;
-				flush_cache_page(bprm->vma, kpos,
-						 page_to_pfn(kmapped_page));
-			}
-			if (copy_from_user(kaddr+offset, compat_ptr(str),
-						bytes_to_copy)) {
-				ret = -EFAULT;
-				goto out;
-			}
-		}
-	}
-	ret = 0;
-out:
-	if (kmapped_page) {
-		flush_kernel_dcache_page(kmapped_page);
-		kunmap(kmapped_page);
-		put_page(kmapped_page);
-	}
-	return ret;
-}
-
-/*
- * compat_do_execve() is mostly a copy of do_execve(), with the exception
- * that it processes 32 bit argv and envp pointers.
- */
-int compat_do_execve(char * filename,
-	compat_uptr_t __user *argv,
-	compat_uptr_t __user *envp,
-	struct pt_regs * regs)
-{
-	struct linux_binprm *bprm;
-	struct file *file;
-	struct files_struct *displaced;
-	bool clear_in_exec;
-	int retval;
-
-	retval = unshare_files(&displaced);
-	if (retval)
-		goto out_ret;
-
-	retval = -ENOMEM;
-	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
-	if (!bprm)
-		goto out_files;
-
-	retval = prepare_bprm_creds(bprm);
-	if (retval)
-		goto out_free;
-
-	retval = check_unsafe_exec(bprm);
-	if (retval < 0)
-		goto out_free;
-	clear_in_exec = retval;
-	current->in_execve = 1;
-
-	file = open_exec(filename);
-	retval = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out_unmark;
-
-	sched_exec();
-
-	bprm->file = file;
-	bprm->filename = filename;
-	bprm->interp = filename;
-
-	retval = bprm_mm_init(bprm);
-	if (retval)
-		goto out_file;
-
-	bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
-	if ((retval = bprm->argc) < 0)
-		goto out;
-
-	bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
-	if ((retval = bprm->envc) < 0)
-		goto out;
-
-	retval = prepare_binprm(bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = copy_strings_kernel(1, &bprm->filename, bprm);
-	if (retval < 0)
-		goto out;
-
-	bprm->exec = bprm->p;
-	retval = compat_copy_strings(bprm->envc, envp, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = compat_copy_strings(bprm->argc, argv, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = search_binary_handler(bprm, regs);
-	if (retval < 0)
-		goto out;
-
-	/* execve succeeded */
-	current->fs->in_exec = 0;
-	current->in_execve = 0;
-	acct_update_integrals(current);
-	free_bprm(bprm);
-	if (displaced)
-		put_files_struct(displaced);
-	return retval;
-
-out:
-	if (bprm->mm) {
-		acct_arg_size(bprm, 0);
-		mmput(bprm->mm);
-	}
-
-out_file:
-	if (bprm->file) {
-		allow_write_access(bprm->file);
-		fput(bprm->file);
-	}
-
-out_unmark:
-	if (clear_in_exec)
-		current->fs->in_exec = 0;
-	current->in_execve = 0;
-
-out_free:
-	free_bprm(bprm);
-
-out_files:
-	if (displaced)
-		reset_files_struct(displaced);
-out_ret:
-	return retval;
-}
-
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
 static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 4/4] exec: unexport acct_arg_size() and get_arg_page()
  2010-11-30 20:00                       ` [PATCH 0/4] exec: unify compat/non-compat code Oleg Nesterov
                                           ` (2 preceding siblings ...)
  2010-11-30 20:01                         ` [PATCH 3/4] exec: unify compat_do_execve() code Oleg Nesterov
@ 2010-11-30 20:01                         ` Oleg Nesterov
  2010-12-01  3:09                         ` [PATCH 0/4] exec: unify compat/non-compat code KOSAKI Motohiro
  4 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2010-11-30 20:01 UTC (permalink / raw)
  To: KOSAKI Motohiro, Andrew Morton, Linus Torvalds
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath

Unexport acct_arg_size() and get_arg_page(), fs/compat.c doesn't
need them any longer.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 include/linux/binfmts.h |    4 ----
 fs/exec.c               |    8 ++++----
 2 files changed, 4 insertions(+), 8 deletions(-)

--- K/include/linux/binfmts.h~4_unexport_arg_helpers	2010-11-30 18:30:45.000000000 +0100
+++ K/include/linux/binfmts.h	2010-11-30 20:38:13.000000000 +0100
@@ -60,10 +60,6 @@ struct linux_binprm{
 	unsigned long loader, exec;
 };
 
-extern void acct_arg_size(struct linux_binprm *bprm, unsigned long pages);
-extern struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
-					int write);
-
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
 
--- K/fs/exec.c~4_unexport_arg_helpers	2010-11-30 20:15:11.000000000 +0100
+++ K/fs/exec.c	2010-11-30 20:38:13.000000000 +0100
@@ -165,7 +165,7 @@ out:
 
 #ifdef CONFIG_MMU
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -184,7 +184,7 @@ void acct_arg_size(struct linux_binprm *
 #endif
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -298,11 +298,11 @@ static bool valid_arg_len(struct linux_b
 
 #else
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable"
  2010-11-30 13:03                         ` KOSAKI Motohiro
@ 2010-11-30 20:07                           ` David Rientjes
  0 siblings, 0 replies; 109+ messages in thread
From: David Rientjes @ 2010-11-30 20:07 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm

On Tue, 30 Nov 2010, KOSAKI Motohiro wrote:

> > Because NOTHING breaks with the new mapping.  Eight months later since 
> > this was initially proposed on linux-mm, you still cannot show a single 
> > example that depended on the exponential mapping of oom_adj.  I'm not 
> > going to continue responding to your criticism about this point since your 
> > argument is completely and utterly baseless.
> 
> No regression mean no break. Not single nor multiple. see?
> 

Nothing breaks.  If something did, you could respond to my answer above
and provide a single real-world example that broke as a result of the
new linear mapping.

> All situation can be calculated on userland. User process can be know
> their bindings.
> 

Yes, but the proportional priority-based oom_score_adj values allow users 
to avoid recalculating and writing that value anytime a mempolicy 
attachment changes, its nodemask changes, it moves to another cpuset, its 
set of mems changes, its memcg attachment changes, its limit is modified, 
etc.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 1/2] exec: make argv/envp memory visible to oom-killer
  2010-11-30 19:55                       ` [PATCH 1/2] exec: make argv/envp memory visible to oom-killer Oleg Nesterov
@ 2010-12-01  0:12                         ` KOSAKI Motohiro
  2010-12-01 18:07                           ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-12-01  0:12 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath, stable

> Brad Spengler published a local memory-allocation DoS that
> evades the OOM-killer (though not the virtual memory RLIMIT):
> http://www.grsecurity.net/~spender/64bit_dos.c
> 
> execve()->copy_strings() can allocate a lot of memory, but
> this is not visible to oom-killer, nobody can see the nascent
> bprm->mm and take it into account.
> 
> With this patch get_arg_page() increments current's MM_ANONPAGES
> counter every time we allocate the new page for argv/envp. When
> do_execve() succeds or fails, we change this counter back.
> 
> Technically this is not 100% correct, we can't know if the new
> page is swapped out and turn MM_ANONPAGES into MM_SWAPENTS, but
> I don't think this really matters and everything becomes correct
> once exec changes ->mm or fails.
> 
> Reported-by: Brad Spengler <spender@grsecurity.net>
> By-discussion-with: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>

Looks good to me.
	Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>


Thank you very much.


> --- K/fs/exec.c~acct_exec_mem	2010-11-30 18:27:15.000000000 +0100
> +++ K/fs/exec.c	2010-11-30 18:28:54.000000000 +0100
> @@ -164,6 +164,25 @@ out:
>  
>  #ifdef CONFIG_MMU
>  
> +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)

One minor request.

I think this function could easily confuse a code reader, so I hope you
will add a short function comment describing:
 - what the OOM issue with the nascent bprm->mm is
 - why we think the inaccurate accounting is OK


> +{
> +	struct mm_struct *mm = current->mm;
> +	long diff = (long)(pages - bprm->vma_pages);
> +
> +	if (!mm || !diff)
> +		return;
> +
> +	bprm->vma_pages = pages;
> +
> +#ifdef SPLIT_RSS_COUNTING
> +	add_mm_counter(mm, MM_ANONPAGES, diff);
> +#else
> +	spin_lock(&mm->page_table_lock);
> +	add_mm_counter(mm, MM_ANONPAGES, diff);
> +	spin_unlock(&mm->page_table_lock);
> +#endif
> +}
> +





^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 2/2] exec: copy-and-paste the fixes into compat_do_execve() paths
  2010-11-30 19:56                       ` [PATCH 2/2] exec: copy-and-paste the fixes into compat_do_execve() paths Oleg Nesterov
@ 2010-12-01  3:04                         ` KOSAKI Motohiro
  0 siblings, 0 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-12-01  3:04 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath, stable

> Note: this patch targets 2.6.37 and tries to be as simple as possible.
> That is why it adds more copy-and-paste horror into fs/compat.c and
> uglifies fs/exec.c, this will be cleanuped later.
> 
> compat_copy_strings() plays with bprm->vma/mm directly and thus has
> two problems: it lacks the RLIMIT_STACK check and argv/envp memory
> is not visible to oom killer.
> 
> Export acct_arg_size() and get_arg_page(), change compat_copy_strings()
> to use get_arg_page(), change compat_do_execve() to do acct_arg_size(0)
> as do_execve() does.
> 
> Add the fatal_signal_pending/cond_resched checks into compat_count() and
> compat_copy_strings(), this matches the code in fs/exec.c and certainly
> makes sense.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>

Looks good to me.
	Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 0/4] exec: unify compat/non-compat code
  2010-11-30 20:00                       ` [PATCH 0/4] exec: unify compat/non-compat code Oleg Nesterov
                                           ` (3 preceding siblings ...)
  2010-11-30 20:01                         ` [PATCH 4/4] exec: unexport acct_arg_size() and get_arg_page() Oleg Nesterov
@ 2010-12-01  3:09                         ` KOSAKI Motohiro
  4 siblings, 0 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2010-12-01  3:09 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath

> (remove stable)
> 
> On 11/30, Oleg Nesterov wrote:
> >
> > I'll send the cleanups which unify compat/non-compat code on
> > top of these fixes, this is not stable material.
> 
> On top of
> 
> 	[PATCH 1/2] exec: make argv/envp memory visible to oom-killer
> 	[PATCH 2/2] exec: copy-and-paste the fixes into compat_do_execve() paths
> 
> Imho, execve code in fs/compat.c must die. It is very hard to
> maintain this copy-and-paste horror.

I really like this series. (Yes, I have made the mistake of forgetting to
change compat.c multiple times ;)

Unfortunately, this is a bit large and I have no time now. I expect I
can review it this weekend or next.....
Hopefully, someone else will review this and ignore me....




^ permalink raw reply	[flat|nested] 109+ messages in thread

* (No subject header)
  2010-11-30 20:01                         ` [PATCH 3/4] exec: unify compat_do_execve() code Oleg Nesterov
@ 2010-12-01 17:37                           ` Milton Miller
  2010-12-01 18:27                             ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: Milton Miller @ 2010-12-01 17:37 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: KOSAKI Motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath

On Tue, 30 Nov 2010 about 20:01:29 -0000, Oleg Nesterov wrote:
> Teach get_arg_ptr() to handle compat = T case correctly.

>  #include <asm/uaccess.h>
>  #include <asm/mmu_context.h>
> @@ -395,6 +396,18 @@ get_arg_ptr(const char __user * const __
>  {
>  	const char __user *ptr;
>  
> +#ifdef CONFIG_COMPAT
> +	if (unlikely(compat)) {

This should not be marked unlikely.  unlikely() tells gcc which path is
taken with over 99% confidence and disables branch predictors on some
architectures.  If called from a compat process this will result
in a mispredicted branch every iteration.  Just use if (compat)
and let the hardware branch predictors do their job.

> +		compat_uptr_t __user *a = (void __user*)argv;
> +		compat_uptr_t p;
> +
> +		if (get_user(p, a + argc))
> +			return ERR_PTR(-EFAULT);
> +
> +		return compat_ptr(p);
> +	}
> +#endif
> +
>  	if (get_user(ptr, argv + argc))
>  		return ERR_PTR(-EFAULT);
>  
> @@ -1501,6 +1514,18 @@ int do_execve(const char *filename,
>  	return do_execve_common(filename, argv, envp, regs, false);
>  }
>  
> +#ifdef CONFIG_COMPAT
> +int compat_do_execve(char * filename,
> +	compat_uptr_t __user *argv,
> +	compat_uptr_t __user *envp,
> +	struct pt_regs * regs)
> +{
> +	return do_execve_common(filename,
> +				(void __user*)argv, (void __user*)envp,

Shouldn't these be compat_ptr(argv)?  (makes a difference on s390)

> +				regs, true);
> +}
> +#endif
> +
>  void set_binfmt(struct linux_binfmt *new)
>  {
>  	struct mm_struct *mm = current->mm;

Thanks,
milton

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 1/2] exec: make argv/envp memory visible to oom-killer
  2010-12-01  0:12                         ` KOSAKI Motohiro
@ 2010-12-01 18:07                           ` Oleg Nesterov
  0 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2010-12-01 18:07 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	stable

On 12/01, KOSAKI Motohiro wrote:
>
> > +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
>
> One minor request.
>
> I guess this function can easily makes confusing to a code reader. So I
> hope you write small function comments. describe to
>  - What is oom nascent issue
>  - Why we think inaccurate account is ok

Agreed, this needs a comment.

The patch was already applied, I'll send a separate one on top
of the next "unify exec/compat" series. Or, I'll add the comments
into this series, depending on review.

Thanks,

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: (No subject header)
  2010-12-01 17:37                           ` (No subject header) Milton Miller
@ 2010-12-01 18:27                             ` Oleg Nesterov
  2011-02-25 17:52                               ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2010-12-01 18:27 UTC (permalink / raw)
  To: Milton Miller
  Cc: KOSAKI Motohiro, Andrew Morton, Linus Torvalds, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath

On 12/01, Milton Miller wrote:
>
> On Tue, 30 Nov 2010 about 20:01:29 -0000, Oleg Nesterov wrote:
> > Teach get_arg_ptr() to handle compat = T case correctly.
>
> >  #include <asm/uaccess.h>
> >  #include <asm/mmu_context.h>
> > @@ -395,6 +396,18 @@ get_arg_ptr(const char __user * const __
> >  {
> >  	const char __user *ptr;
> >
> > +#ifdef CONFIG_COMPAT
> > +	if (unlikely(compat)) {
>
> This should not be marked unlikely.  Unlikely tells gcc the path
> with over 99% confidence and disables branch predictors on some
> architectures.  If called from a compat processes this will result
> in a mispredicted branch every iteration.  Just use if (compat)
> and let the hardware branch predictors do their job.

This applies to almost every likely/unlikely, and I think that compat
processes should fall into "unlikely category". But I don't really mind,
I can remove this hint, I added it mostly as documentation.

> > +#ifdef CONFIG_COMPAT
> > +int compat_do_execve(char * filename,
> > +	compat_uptr_t __user *argv,
> > +	compat_uptr_t __user *envp,
> > +	struct pt_regs * regs)
> > +{
> > +	return do_execve_common(filename,
> > +				(void __user*)argv, (void __user*)envp,
>
> Shouldn't these be compat_ptr(argv)?  (makes a difference on s390)

I'll recheck, but I don't think so. Please note that compat_ptr()
accepts "compat_uptr_t", not "compat_uptr_t *".

argv should be correct as a pointer to user-space, otherwise the
current code is buggy. For example, compat_do_execve() passes
argv to compat_count() which does get_user(argv) without any
conversion.

IOW, even if this should be fixed, I think this has nothing to
do with this patch. But I'll recheck, thanks.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 0/4 RESEND] exec: unify compat/non-compat code
  2010-12-01 18:27                             ` Oleg Nesterov
@ 2011-02-25 17:52                               ` Oleg Nesterov
  2011-02-25 17:52                                 ` [PATCH 1/5] exec: introduce get_arg_ptr() helper Oleg Nesterov
                                                   ` (5 more replies)
  0 siblings, 6 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-25 17:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On 12/01, Oleg Nesterov wrote:
>
> On 12/01, Milton Miller wrote:
> >
> > > +#ifdef CONFIG_COMPAT
> > > +int compat_do_execve(char * filename,
> > > +	compat_uptr_t __user *argv,
> > > +	compat_uptr_t __user *envp,
> > > +	struct pt_regs * regs)
> > > +{
> > > +	return do_execve_common(filename,
> > > +				(void __user*)argv, (void __user*)envp,
> >
> > Shouldn't these be compat_ptr(argv)?  (makes a difference on s390)
>
> I'll recheck, but I don't think so. Please note that compat_ptr()
> accepts "compat_uptr_t", not "compat_uptr_t *".
>
> argv should be correct as a pointer to user-space, otherwise the
> current code is buggy. For example, compat_do_execve() passes
> argv to compat_count() which does get_user(argv) without any
> conversion.

So, once again, this should not (and can not) be compat_ptr(argv) afaics.

I don't understand the s390 asm, but compat_wrapper.S:sys32_execve_wrapper
looks correct. If not, the current code is already buggy and s390 should
be fixed. argv/envp are not compat ptrs, they just point to compat_ data,
we should not do any conversion.

I am resending this series unchanged, plus the trivial 5/5 to document
acct_arg_size().

----------------------------------------------------------------------

execve code in fs/compat.c must die. It is very hard to maintain this
copy-and-paste horror. And the only reason for this duplication is that
argv/envp point to char* or compat_uptr_t depending on compat. We can
add the trivial helper which hides the difference and unify the code.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 1/5] exec: introduce get_arg_ptr() helper
  2011-02-25 17:52                               ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Oleg Nesterov
@ 2011-02-25 17:52                                 ` Oleg Nesterov
  2011-02-25 17:52                                 ` [PATCH 2/5] exec: introduce "bool compat" argument Oleg Nesterov
                                                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-25 17:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

Introduce get_arg_ptr() helper, convert count() and copy_strings()
to use it.

No functional changes, preparation. This helper is trivial, it just
reads the pointer from argv/envp user-space array.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |   36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

--- 38/fs/exec.c~1_get_arg_ptr	2011-02-25 18:01:59.000000000 +0100
+++ 38/fs/exec.c	2011-02-25 18:04:50.000000000 +0100
@@ -395,6 +395,17 @@ err:
 	return err;
 }
 
+static const char __user *
+get_arg_ptr(const char __user * const __user *argv, int argc)
+{
+	const char __user *ptr;
+
+	if (get_user(ptr, argv + argc))
+		return ERR_PTR(-EFAULT);
+
+	return ptr;
+}
+
 /*
  * count() counts the number of strings in array ARGV.
  */
@@ -404,13 +415,14 @@ static int count(const char __user * con
 
 	if (argv != NULL) {
 		for (;;) {
-			const char __user * p;
+			const char __user *p = get_arg_ptr(argv, i);
 
-			if (get_user(p, argv))
-				return -EFAULT;
 			if (!p)
 				break;
-			argv++;
+
+			if (IS_ERR(p))
+				return -EFAULT;
+
 			if (i++ >= max)
 				return -E2BIG;
 
@@ -440,16 +452,18 @@ static int copy_strings(int argc, const 
 		int len;
 		unsigned long pos;
 
-		if (get_user(str, argv+argc) ||
-				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		str = get_arg_ptr(argv, argc);
+		if (IS_ERR(str))
 			goto out;
-		}
 
-		if (!valid_arg_len(bprm, len)) {
-			ret = -E2BIG;
+		len = strnlen_user(str, MAX_ARG_STRLEN);
+		if (!len)
+			goto out;
+
+		ret = -E2BIG;
+		if (!valid_arg_len(bprm, len))
 			goto out;
-		}
 
 		/* We're going to work our way backwords. */
 		pos = bprm->p;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 2/5] exec: introduce "bool compat" argument
  2011-02-25 17:52                               ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Oleg Nesterov
  2011-02-25 17:52                                 ` [PATCH 1/5] exec: introduce get_arg_ptr() helper Oleg Nesterov
@ 2011-02-25 17:52                                 ` Oleg Nesterov
  2011-02-25 18:57                                   ` Linus Torvalds
  2011-02-25 17:53                                 ` [PATCH 3/5] exec: unify compat_do_execve() code Oleg Nesterov
                                                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-25 17:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

No functional changes, preparation to simplify the review.

Add the new (and currently unused) "bool compat" argument to
get_arg_ptr(), count(), and copy_strings().

Add this argument to do_execve() as well, and rename it to
do_execve_common().

Reintroduce do_execve() as a trivial wrapper on top of
do_execve_common(compat => false).

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |   33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

--- 38/fs/exec.c~2_is_compat_arg	2011-02-25 18:04:50.000000000 +0100
+++ 38/fs/exec.c	2011-02-25 18:05:05.000000000 +0100
@@ -396,7 +396,7 @@ err:
 }
 
 static const char __user *
-get_arg_ptr(const char __user * const __user *argv, int argc)
+get_arg_ptr(const char __user * const __user *argv, int argc, bool compat)
 {
 	const char __user *ptr;
 
@@ -409,13 +409,13 @@ get_arg_ptr(const char __user * const __
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(const char __user * const __user * argv, int max)
+static int count(const char __user * const __user *argv, int max, bool compat)
 {
 	int i = 0;
 
 	if (argv != NULL) {
 		for (;;) {
-			const char __user *p = get_arg_ptr(argv, i);
+			const char __user *p = get_arg_ptr(argv, i, compat);
 
 			if (!p)
 				break;
@@ -440,7 +440,7 @@ static int count(const char __user * con
  * ensures the destination page is created and not swapped out.
  */
 static int copy_strings(int argc, const char __user *const __user *argv,
-			struct linux_binprm *bprm)
+			struct linux_binprm *bprm, bool compat)
 {
 	struct page *kmapped_page = NULL;
 	char *kaddr = NULL;
@@ -453,7 +453,7 @@ static int copy_strings(int argc, const 
 		unsigned long pos;
 
 		ret = -EFAULT;
-		str = get_arg_ptr(argv, argc);
+		str = get_arg_ptr(argv, argc, compat);
 		if (IS_ERR(str))
 			goto out;
 
@@ -536,7 +536,8 @@ int copy_strings_kernel(int argc, const 
 	int r;
 	mm_segment_t oldfs = get_fs();
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
+	r = copy_strings(argc, (const char __user *const  __user *)argv,
+				bprm, false);
 	set_fs(oldfs);
 	return r;
 }
@@ -1387,10 +1388,10 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(const char * filename,
+static int do_execve_common(const char *filename,
 	const char __user *const __user *argv,
 	const char __user *const __user *envp,
-	struct pt_regs * regs)
+	struct pt_regs *regs, bool compat)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1432,11 +1433,11 @@ int do_execve(const char * filename,
 	if (retval)
 		goto out_file;
 
-	bprm->argc = count(argv, MAX_ARG_STRINGS);
+	bprm->argc = count(argv, MAX_ARG_STRINGS, compat);
 	if ((retval = bprm->argc) < 0)
 		goto out;
 
-	bprm->envc = count(envp, MAX_ARG_STRINGS);
+	bprm->envc = count(envp, MAX_ARG_STRINGS, compat);
 	if ((retval = bprm->envc) < 0)
 		goto out;
 
@@ -1449,11 +1450,11 @@ int do_execve(const char * filename,
 		goto out;
 
 	bprm->exec = bprm->p;
-	retval = copy_strings(bprm->envc, envp, bprm);
+	retval = copy_strings(bprm->envc, envp, bprm, compat);
 	if (retval < 0)
 		goto out;
 
-	retval = copy_strings(bprm->argc, argv, bprm);
+	retval = copy_strings(bprm->argc, argv, bprm, compat);
 	if (retval < 0)
 		goto out;
 
@@ -1497,6 +1498,14 @@ out_ret:
 	return retval;
 }
 
+int do_execve(const char *filename,
+	const char __user *const __user *argv,
+	const char __user *const __user *envp,
+	struct pt_regs *regs)
+{
+	return do_execve_common(filename, argv, envp, regs, false);
+}
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 3/5] exec: unify compat_do_execve() code
  2011-02-25 17:52                               ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Oleg Nesterov
  2011-02-25 17:52                                 ` [PATCH 1/5] exec: introduce get_arg_ptr() helper Oleg Nesterov
  2011-02-25 17:52                                 ` [PATCH 2/5] exec: introduce "bool compat" argument Oleg Nesterov
@ 2011-02-25 17:53                                 ` Oleg Nesterov
  2011-02-25 19:10                                   ` Linus Torvalds
  2011-02-25 17:53                                 ` [PATCH 4/5] exec: unexport acct_arg_size() and get_arg_page() Oleg Nesterov
                                                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-25 17:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

Teach get_arg_ptr() to handle compat = T case correctly.

This allows us to remove the compat_do_execve() code from fs/compat.c
and reimplement compat_do_execve() as the trivial wrapper on top of
do_execve_common(compat => true).

In fact, this fixes another (minor) bug. "compat_uptr_t str" can
overflow after "str += len" in compat_copy_strings() if a 64bit
application execs via sys32_execve().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c   |   25 ++++++
 fs/compat.c |  235 ------------------------------------------------------------
 2 files changed, 25 insertions(+), 235 deletions(-)

--- 38/fs/exec.c~3_use_compat	2011-02-25 18:05:05.000000000 +0100
+++ 38/fs/exec.c	2011-02-25 18:05:17.000000000 +0100
@@ -55,6 +55,7 @@
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -400,6 +401,18 @@ get_arg_ptr(const char __user * const __
 {
 	const char __user *ptr;
 
+#ifdef CONFIG_COMPAT
+	if (unlikely(compat)) {
+		compat_uptr_t __user *a = (void __user *)argv;
+		compat_uptr_t p;
+
+		if (get_user(p, a + argc))
+			return ERR_PTR(-EFAULT);
+
+		return compat_ptr(p);
+	}
+#endif
+
 	if (get_user(ptr, argv + argc))
 		return ERR_PTR(-EFAULT);
 
@@ -1506,6 +1519,18 @@ int do_execve(const char *filename,
 	return do_execve_common(filename, argv, envp, regs, false);
 }
 
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+	compat_uptr_t __user *argv,
+	compat_uptr_t __user *envp,
+	struct pt_regs *regs)
+{
+	return do_execve_common(filename,
+				(void __user *)argv, (void __user*)envp,
+				regs, true);
+}
+#endif
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;
--- 38/fs/compat.c~3_use_compat	2011-02-25 18:01:58.000000000 +0100
+++ 38/fs/compat.c	2011-02-25 18:05:17.000000000 +0100
@@ -1330,241 +1330,6 @@ compat_sys_openat(unsigned int dfd, cons
 	return do_sys_open(dfd, filename, flags, mode);
 }
 
-/*
- * compat_count() counts the number of arguments/envelopes. It is basically
- * a copy of count() from fs/exec.c, except that it works with 32 bit argv
- * and envp pointers.
- */
-static int compat_count(compat_uptr_t __user *argv, int max)
-{
-	int i = 0;
-
-	if (argv != NULL) {
-		for (;;) {
-			compat_uptr_t p;
-
-			if (get_user(p, argv))
-				return -EFAULT;
-			if (!p)
-				break;
-			argv++;
-			if (i++ >= max)
-				return -E2BIG;
-
-			if (fatal_signal_pending(current))
-				return -ERESTARTNOHAND;
-			cond_resched();
-		}
-	}
-	return i;
-}
-
-/*
- * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
- * except that it works with 32 bit argv and envp pointers.
- */
-static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
-				struct linux_binprm *bprm)
-{
-	struct page *kmapped_page = NULL;
-	char *kaddr = NULL;
-	unsigned long kpos = 0;
-	int ret;
-
-	while (argc-- > 0) {
-		compat_uptr_t str;
-		int len;
-		unsigned long pos;
-
-		if (get_user(str, argv+argc) ||
-		    !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		if (len > MAX_ARG_STRLEN) {
-			ret = -E2BIG;
-			goto out;
-		}
-
-		/* We're going to work our way backwords. */
-		pos = bprm->p;
-		str += len;
-		bprm->p -= len;
-
-		while (len > 0) {
-			int offset, bytes_to_copy;
-
-			if (fatal_signal_pending(current)) {
-				ret = -ERESTARTNOHAND;
-				goto out;
-			}
-			cond_resched();
-
-			offset = pos % PAGE_SIZE;
-			if (offset == 0)
-				offset = PAGE_SIZE;
-
-			bytes_to_copy = offset;
-			if (bytes_to_copy > len)
-				bytes_to_copy = len;
-
-			offset -= bytes_to_copy;
-			pos -= bytes_to_copy;
-			str -= bytes_to_copy;
-			len -= bytes_to_copy;
-
-			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
-				struct page *page;
-
-				page = get_arg_page(bprm, pos, 1);
-				if (!page) {
-					ret = -E2BIG;
-					goto out;
-				}
-
-				if (kmapped_page) {
-					flush_kernel_dcache_page(kmapped_page);
-					kunmap(kmapped_page);
-					put_page(kmapped_page);
-				}
-				kmapped_page = page;
-				kaddr = kmap(kmapped_page);
-				kpos = pos & PAGE_MASK;
-				flush_cache_page(bprm->vma, kpos,
-						 page_to_pfn(kmapped_page));
-			}
-			if (copy_from_user(kaddr+offset, compat_ptr(str),
-						bytes_to_copy)) {
-				ret = -EFAULT;
-				goto out;
-			}
-		}
-	}
-	ret = 0;
-out:
-	if (kmapped_page) {
-		flush_kernel_dcache_page(kmapped_page);
-		kunmap(kmapped_page);
-		put_page(kmapped_page);
-	}
-	return ret;
-}
-
-/*
- * compat_do_execve() is mostly a copy of do_execve(), with the exception
- * that it processes 32 bit argv and envp pointers.
- */
-int compat_do_execve(char * filename,
-	compat_uptr_t __user *argv,
-	compat_uptr_t __user *envp,
-	struct pt_regs * regs)
-{
-	struct linux_binprm *bprm;
-	struct file *file;
-	struct files_struct *displaced;
-	bool clear_in_exec;
-	int retval;
-
-	retval = unshare_files(&displaced);
-	if (retval)
-		goto out_ret;
-
-	retval = -ENOMEM;
-	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
-	if (!bprm)
-		goto out_files;
-
-	retval = prepare_bprm_creds(bprm);
-	if (retval)
-		goto out_free;
-
-	retval = check_unsafe_exec(bprm);
-	if (retval < 0)
-		goto out_free;
-	clear_in_exec = retval;
-	current->in_execve = 1;
-
-	file = open_exec(filename);
-	retval = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out_unmark;
-
-	sched_exec();
-
-	bprm->file = file;
-	bprm->filename = filename;
-	bprm->interp = filename;
-
-	retval = bprm_mm_init(bprm);
-	if (retval)
-		goto out_file;
-
-	bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
-	if ((retval = bprm->argc) < 0)
-		goto out;
-
-	bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
-	if ((retval = bprm->envc) < 0)
-		goto out;
-
-	retval = prepare_binprm(bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = copy_strings_kernel(1, &bprm->filename, bprm);
-	if (retval < 0)
-		goto out;
-
-	bprm->exec = bprm->p;
-	retval = compat_copy_strings(bprm->envc, envp, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = compat_copy_strings(bprm->argc, argv, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = search_binary_handler(bprm, regs);
-	if (retval < 0)
-		goto out;
-
-	/* execve succeeded */
-	current->fs->in_exec = 0;
-	current->in_execve = 0;
-	acct_update_integrals(current);
-	free_bprm(bprm);
-	if (displaced)
-		put_files_struct(displaced);
-	return retval;
-
-out:
-	if (bprm->mm) {
-		acct_arg_size(bprm, 0);
-		mmput(bprm->mm);
-	}
-
-out_file:
-	if (bprm->file) {
-		allow_write_access(bprm->file);
-		fput(bprm->file);
-	}
-
-out_unmark:
-	if (clear_in_exec)
-		current->fs->in_exec = 0;
-	current->in_execve = 0;
-
-out_free:
-	free_bprm(bprm);
-
-out_files:
-	if (displaced)
-		reset_files_struct(displaced);
-out_ret:
-	return retval;
-}
-
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
 static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 4/5] exec: unexport acct_arg_size() and get_arg_page()
  2011-02-25 17:52                               ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Oleg Nesterov
                                                   ` (2 preceding siblings ...)
  2011-02-25 17:53                                 ` [PATCH 3/5] exec: unify compat_do_execve() code Oleg Nesterov
@ 2011-02-25 17:53                                 ` Oleg Nesterov
  2011-02-25 17:54                                 ` [PATCH 5/5] exec: document acct_arg_size() Oleg Nesterov
  2011-02-25 18:54                                 ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Linus Torvalds
  5 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-25 17:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

Unexport acct_arg_size() and get_arg_page(), fs/compat.c doesn't
need them any longer.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 include/linux/binfmts.h |    4 ----
 fs/exec.c               |    8 ++++----
 2 files changed, 4 insertions(+), 8 deletions(-)

--- 38/include/linux/binfmts.h~4_unexport_arg_helpers	2011-02-25 18:01:57.000000000 +0100
+++ 38/include/linux/binfmts.h	2011-02-25 18:05:27.000000000 +0100
@@ -60,10 +60,6 @@ struct linux_binprm {
 	unsigned long loader, exec;
 };
 
-extern void acct_arg_size(struct linux_binprm *bprm, unsigned long pages);
-extern struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
-					int write);
-
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
 
--- 38/fs/exec.c~4_unexport_arg_helpers	2011-02-25 18:05:17.000000000 +0100
+++ 38/fs/exec.c	2011-02-25 18:05:27.000000000 +0100
@@ -165,7 +165,7 @@ out:
 
 #ifdef CONFIG_MMU
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -184,7 +184,7 @@ void acct_arg_size(struct linux_binprm *
 #endif
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -303,11 +303,11 @@ static bool valid_arg_len(struct linux_b
 
 #else
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH 5/5] exec: document acct_arg_size()
  2011-02-25 17:52                               ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Oleg Nesterov
                                                   ` (3 preceding siblings ...)
  2011-02-25 17:53                                 ` [PATCH 4/5] exec: unexport acct_arg_size() and get_arg_page() Oleg Nesterov
@ 2011-02-25 17:54                                 ` Oleg Nesterov
  2011-02-25 18:54                                 ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Linus Torvalds
  5 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-25 17:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, Linus Torvalds, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

Add the comment to explain acct_arg_size().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

--- 38/fs/exec.c~5_doc_acct_arg_size	2011-02-25 18:05:27.000000000 +0100
+++ 38/fs/exec.c	2011-02-25 18:05:34.000000000 +0100
@@ -164,7 +164,12 @@ out:
 }
 
 #ifdef CONFIG_MMU
-
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, account these pages in current->mm temporary
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
 static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 0/4 RESEND] exec: unify compat/non-compat code
  2011-02-25 17:52                               ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Oleg Nesterov
                                                   ` (4 preceding siblings ...)
  2011-02-25 17:54                                 ` [PATCH 5/5] exec: document acct_arg_size() Oleg Nesterov
@ 2011-02-25 18:54                                 ` Linus Torvalds
  2011-02-26 12:35                                   ` Oleg Nesterov
  5 siblings, 1 reply; 109+ messages in thread
From: Linus Torvalds @ 2011-02-25 18:54 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On Fri, Feb 25, 2011 at 9:52 AM, Oleg Nesterov <oleg@redhat.com> wrote:
>> On 12/01, Milton Miller wrote:
>> >
>> > > +#ifdef CONFIG_COMPAT
>> > > +int compat_do_execve(char * filename,
>> > > + compat_uptr_t __user *argv,
>> > > + compat_uptr_t __user *envp,
>> > > + struct pt_regs * regs)
>> > > +{
>> > > + return do_execve_common(filename,
>> > > +                         (void __user*)argv, (void __user*)envp,
>> >
>> > Shouldn't these be compat_ptr(argv)?  (makes a difference on s390)

Indeed. The "compat_uptr_t __user *argv" is wrong, and it should be just

    compat_uptr_t argv;

and then every time you turn it into a pointer, it should use
"compat_ptr(argv)".

Then, since it's a pointer to an array of pointers, when you do that,
you should turn it into a pointer to "compat_uptr_t", so you actually
have this:

 - user passes "compat_uptr_t"

 - the kernel can turn that into "compat_uptr_t __user *" by doing

       compat_uptr_t __user *pptr;
       pptr = compat_ptr(argv);

 - the kernel needs to fetch the individual entries with

       compat_uptr_t cuptr = get_user(pptr);

 - the kernel can then turn _those_ into the actual pointers to the string with

       const char __user *str = compat_ptr(cuptr);

so you need two levels of compat_ptr() conversion.

> So, once again, this should not (and can not) be compat_ptr(argv) afaics.

It can be, and probably should. But the low-level s390 wrapper
function may have done one of the levels already. It probably
shouldn't, and we _should_ do the "compat_ptr()" thing at the generic C
level. That's what we do with all the other pointers, after all.

                          Linus

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 2/5] exec: introduce "bool compat" argument
  2011-02-25 17:52                                 ` [PATCH 2/5] exec: introduce "bool compat" argument Oleg Nesterov
@ 2011-02-25 18:57                                   ` Linus Torvalds
  2011-02-26 12:37                                     ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: Linus Torvalds @ 2011-02-25 18:57 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On Fri, Feb 25, 2011 at 9:52 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> No functional changes, preparation to simplify the review.

I think this is wrong.

If you introduce the "bool compat" thing, you should also change the
type of the argument pointers to some opaque type at the same time.
It's no longer really a

  const char __user *const __user *

pointer at that point. Trying to claim it is, is just wrong. The type
suddenly becomes conditional on that 'compat' variable.

                        Linus

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 3/5] exec: unify compat_do_execve() code
  2011-02-25 17:53                                 ` [PATCH 3/5] exec: unify compat_do_execve() code Oleg Nesterov
@ 2011-02-25 19:10                                   ` Linus Torvalds
  2011-02-26 12:37                                     ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: Linus Torvalds @ 2011-02-25 19:10 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On Fri, Feb 25, 2011 at 9:53 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> Teach get_arg_ptr() to handle compat = T case correctly.

Does it?

> +#ifdef CONFIG_COMPAT
> +int compat_do_execve(char *filename,
> +       compat_uptr_t __user *argv,
> +       compat_uptr_t __user *envp,
> +       struct pt_regs *regs)
> +{
> +       return do_execve_common(filename,
> +                               (void __user *)argv, (void __user*)envp,
> +                               regs, true);
> +}
> +#endif

I really suspect this should be something like

  typedef union {
     compat_uptr_t compat;
     const char __user *native;
   } conditional_user_ptr_t;

  ...

  int compat_do_execve(char *filename,
                  compat_uptr_t argv,
                  compat_uptr_t envp,
                  struct pt_regs *regs)
   {
             return do_execve_common(filename,
                      compat_ptr(argv), compat_ptr(envp), regs);

where that 'do_execve_common()' takes its arguments as

    union conditional_user_ptr_t __user *argv,
    union conditional_user_ptr_t __user *envp

and then in get_arg_ptr() we do the proper union member dereference
depending on the "compat" flag.

THAT would actually have the type system help us track what is
actually going on, and would clarify the rules. It would also make it
clear that "do_execve_common()" does *not* take some kind of random
pointer to user space (much less a "const char __user *const
__user *"). It really does take a pointer to user space, but what that
pointer contains in turn depends on the "compat" flag.

IOW, it really acts as a pointer to a user-space union, and I think
we'd be better off having the type show that.

                  Linus

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 0/4 RESEND] exec: unify compat/non-compat code
  2011-02-25 18:54                                 ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Linus Torvalds
@ 2011-02-26 12:35                                   ` Oleg Nesterov
  0 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-26 12:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On 02/25, Linus Torvalds wrote:
>
> On Fri, Feb 25, 2011 at 9:52 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> >> On 12/01, Milton Miller wrote:
> >> >
> >> > > +#ifdef CONFIG_COMPAT
> >> > > +int compat_do_execve(char * filename,
> >> > > + compat_uptr_t __user *argv,
> >> > > + compat_uptr_t __user *envp,
> >> > > + struct pt_regs * regs)
> >> > > +{
> >> > > + return do_execve_common(filename,
> >> > > +                         (void __user*)argv, (void __user*)envp,
> >> >
> >> > Shouldn't these be compat_ptr(argv)?  (makes a difference on s390)
>
> Indeed. The "compat_uptr_t __user *argv" is wrong, and it should be just
>
>     compat_uptr_t argv;
>
> and then every time you turn it into a pointer, it should use
> "compat_ptr(argv)".

Oh, perhaps, and I was thinking about this too. But this is another
issue, no? Or I misunderstood.

First of all, I agree that perhaps it makes sense to change the
signature of compat_do_execve()

	-	compat_do_execve(compat_uptr_t __user *argv)
	+	compat_do_execve(compat_uptr_t argv)

but this has nothing to do with this series. We can do this before
or after ("after" seems simpler).

>  - user passes "compat_uptr_t"

Yes,

>  - the kernel can turn that into "compat_uptr_t __user *" by doing
>
>        compat_uptr_t __user *pptr;
>        pptr = compat_ptr(argv);

Yes! and the kernel already does this before it calls compat_do_execve(),
iow compat_do_execve() gets the result of compat_ptr(compat_ptr_from_user).

>  - the kernel needs to fetch the individual entries with
>
>        compat_uptr_t cuptr = get_user(pptr);
>
>  - the kernel can then turn _those_ into the actual pointers to the string with
>
>        const char __user *str = compat_ptr(cuptr);

Yes, and this is exactly what get_arg_ptr(compat => true) does.

> > So, once again, this should not (and can not) be compat_ptr(argv) afaics.
>
> It can be, and probably should.

Only if we change the signature of compat_do_execve(). With the current
code yet another compat_ptr() is not needed and it is simply wrong, this
is what I meant when I replied to Milton.

> But the low-level s390 wrapper
> function may have done one of the levels already. It probably
> shouldn't, and we _should_ do the "compat_ptr()" thing a the generic C
> level.

Agreed, but currently this compat_ptr() thing belongs to the caller.

IOW. Lets look at the current code. arch/ calls
compat_do_execve(compat_uptr_t __user *argv)->compat_count(argv) which
does get_user(argv) without any conversion, because argv was already
converted or arch/ is buggy.

Both do_execve() and compat_do_execve() accept the valid pointer
which does not need any conversion. But this pointer points to different
things, either to "char*" or "compat_uptr_t".

However, please see my reply to 2-3/5, I agree that this is confusing
and can be cleaned up.

Or do you think I missed something else?

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 2/5] exec: introduce "bool compat" argument
  2011-02-25 18:57                                   ` Linus Torvalds
@ 2011-02-26 12:37                                     ` Oleg Nesterov
  0 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-26 12:37 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On 02/25, Linus Torvalds wrote:
>
> On Fri, Feb 25, 2011 at 9:52 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> > No functional changes, preparation to simplify the review.
>
> I think this is wrong.
>
> If you introduce the "bool compat" thing, you should also change the
> type of the argument pointers to some opaque type at the same time.
> It's no longer really a
>
>   const char __user *const __user *
>
> pointer at that point. Trying to claim it is, is just wrong. The type
> suddently becomes conditional on that 'compat' variable.

Yes, this is true.

And I agree this could be done in a cleaner way, we just need more
changes. Please see the next email.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 3/5] exec: unify compat_do_execve() code
  2011-02-25 19:10                                   ` Linus Torvalds
@ 2011-02-26 12:37                                     ` Oleg Nesterov
  2011-02-26 12:57                                       ` Oleg Nesterov
  2011-02-26 15:55                                       ` Linus Torvalds
  0 siblings, 2 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-26 12:37 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On 02/25, Linus Torvalds wrote:
>
> On Fri, Feb 25, 2011 at 9:53 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> > Teach get_arg_ptr() to handle compat = T case correctly.
>
> Does it?

I think it does.

> > +#ifdef CONFIG_COMPAT
> > +int compat_do_execve(char *filename,
> > +       compat_uptr_t __user *argv,
> > +       compat_uptr_t __user *envp,
> > +       struct pt_regs *regs)
> > +{
> > +       return do_execve_common(filename,
> > +                               (void __user *)argv, (void __user*)envp,
> > +                               regs, true);
> > +}
> > +#endif
>
> I really suspect this should be something like
>
>   typedef union {
>      compat_uptr_t compat;
>      const char __user *native;
>    } conditional_user_ptr_t;

Personally I don't really like this union, to me "void __user*" looks
better, but I won't insist.

>   int compat_do_execve(char *filename,
>                   compat_uptr_t argv,
>                   compat_uptr_t envp,
>    {
>              return do_execve_common(filename,
>                       compat_ptr(argv), compat_ptr(envp), regs);

Indeed! But, again, this has nothing to do with this series. We can
do this later and change the callers in arch/.

> where that 'do_execve_common()' takes it's arguments as
>
>     union conditional_user_ptr_t __user *argv,
>     union conditional_user_ptr_t __user *envp
>
> and then in get_arg_ptr() we do the proper union member dereference
> depending on the "compat" flag.

Once again, to me "void __user*" looks better (just simpler). In this
case get_arg_ptr() becomes (without const/__user for the clarity)

	void *get_arg_ptr(void **argv, int argc, bool compat)
	{
		char *ptr;

	#ifdef CONFIG_COMPAT
		if (unlikely(compat)) {
			compat_uptr_t *a = argv;
			compat_uptr_t p;

			if (get_user(p, a + argc))
				return ERR_PTR(-EFAULT);

			return compat_ptr(p);
		}
	#endif

		if (get_user(ptr, argv + argc))
			return ERR_PTR(-EFAULT);

		return ptr;
	}

Otherwise, get_arg_ptr() should return conditional_user_ptr_t as well,
this looks like the unnecessary complication to me, but of course this
is subjective.

So, what do you think?

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 3/5] exec: unify compat_do_execve() code
  2011-02-26 12:37                                     ` Oleg Nesterov
@ 2011-02-26 12:57                                       ` Oleg Nesterov
  2011-02-26 15:55                                       ` Linus Torvalds
  1 sibling, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-26 12:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On 02/26, Oleg Nesterov wrote:
>
> Once again, to me "void __user*" looks better (just simpler). In this
> case get_arg_ptr() becomes (without const/__user for the clarity)
>
> 	void *get_arg_ptr(void **argv, int argc, bool compat)
> 	{
> 		char *ptr;
>
> 	#ifdef CONFIG_COMPAT
> 		if (unlikely(compat)) {
> 			compat_uptr_t *a = argv;
> 			compat_uptr_t p;
>
> 			if (get_user(p, a + argc))
> 				return ERR_PTR(-EFAULT);
>
> 			return compat_ptr(p);
> 		}
> 	#endif
>
> 		if (get_user(ptr, argv + argc))
> 			return ERR_PTR(-EFAULT);
>
> 		return ptr;
> 	}
>
> Otherwise, get_arg_ptr() should return conditional_user_ptr_t as well,

No, this is not true, I am stupid.

Still,

> this looks like the unnecessary complication to me, but of course this
> is subjective.
>
> So, what do you think?

Yes, please.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 3/5] exec: unify compat_do_execve() code
  2011-02-26 12:37                                     ` Oleg Nesterov
  2011-02-26 12:57                                       ` Oleg Nesterov
@ 2011-02-26 15:55                                       ` Linus Torvalds
  2011-02-26 17:44                                         ` Oleg Nesterov
  1 sibling, 1 reply; 109+ messages in thread
From: Linus Torvalds @ 2011-02-26 15:55 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On Sat, Feb 26, 2011 at 4:37 AM, Oleg Nesterov <oleg@redhat.com> wrote:
>>
>>   typedef union {
>>      compat_uptr_t compat;
>>      const char __user *native;
>>    } conditional_user_ptr_t;
>
> Personally I don't really like this union, to me "void __user*" looks
> better, but I won't insist.

Umm. "void __user *" may look simpler/better, but it's WRONG.

Using "const char __user *const __user *" is correct - but only for
the non-compat case.

And using "void __user *" may result in compiling code, but it will
have lost all actual information about the type. We don't do that in
the kernel if we can avoid it, because "void *" basically does no type
checking. Sure, sometimes it's the only thing we can do, but _if_ we
have a type, we should use it.

And that "union" really is the true type. You are passing a user
pointer down that can be either of those members.

So if you think it looks ugly, then you shouldn't do that "conditional
compat argument at run-time at all". Because the real ugliness of the
type comes not from the type, but from the fact that you pass a
pointer that can contain two different things.


> Once again, to me "void __user*" looks better (just simpler). In this
> case get_arg_ptr() becomes (without const/__user for the clarity)

No.

I simply won't apply that. It's WRONG. It's wrong because you've
dropped all the type information.

With the right union,

>        void *get_arg_ptr(void **argv, int argc, bool compat)
>        {
>                char *ptr;
>
>        #ifdef CONFIG_COMPAT
>                if (unlikely(compat)) {
>                        compat_uptr_t *a = argv;
>                        compat_uptr_t p;
>
>                        if (get_user(p, a + argc))
>                                return ERR_PTR(-EFAULT);
>
>                        return compat_ptr(p);
>                }
>        #endif
>
>                if (get_user(ptr, &argv. + argc))
>                        return ERR_PTR(-EFAULT);
>
>                return ptr;
>        }
>
> Otherwise, get_arg_ptr() should return conditional_user_ptr_t as well,

No it shouldn't. The get_arg_ptr() should always just return the
actual pointer. It will have _resolved_ the ambiguity! That's what the
"compat_ptr()" thing does in the return case inside the CONFIG_COMPAT.

So the correct way to do this is something like the following (yeah,
maybe I got the syntax wrong, I didn't test this, I just wrote it in
my MUA):

       void *get_arg_ptr(const union compat_ptr_union __user *argv,
int argc, bool compat)
       {
               char *ptr;

        #ifdef CONFIG_COMPAT
               if (unlikely(compat)) {
                       compat_uptr_t p;

                       if (get_user(p, &argv->compat + argc))
                               return ERR_PTR(-EFAULT);

                       return compat_ptr(p);
               }
        #endif

               if (get_user(ptr, &argv->noncompat +argc))
                       return ERR_PTR(-EFAULT);

               return ptr;
       }

and notice how it gets the types right, and it even has one line LESS
than your version, exactly because it gets the types right and doesn't
need that implied cast in your

     compat_uptr_t *a = argv;

(in fact, I think your version needs an _explicit_ cast in order to
not get a warning: you can't just cast "void **" to something else).

See? The advantage of the union is that the types are correct, which
means that the casts are unnecessary.

The advantage of the union is also that you see what is going on, and
it's clear from the function prototype that this doesn't just take a
random user pointer, it takes a user pointer to something that can be
two different types.

See? Correct typing is important.

                            Linus

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH 3/5] exec: unify compat_do_execve() code
  2011-02-26 15:55                                       ` Linus Torvalds
@ 2011-02-26 17:44                                         ` Oleg Nesterov
  2011-03-01 20:47                                           ` [PATCH v2 0/5] exec: unify native/compat code Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-02-26 17:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On 02/26, Linus Torvalds wrote:
>
> On Sat, Feb 26, 2011 at 4:37 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> >>
> > Otherwise, get_arg_ptr() should return conditional_user_ptr_t as well,
>
> No it shouldn't.

(Yes I am stupid, see the next email).

> and notice how it gets the types right, and it even has one line LESS
> than your version, exactly because it gets the types right and doesn't
> need that implied cast in your
>
>      compat_uptr_t *a = argv;
>
> (in fact, I think your version needs an _explicit_ cast in order to
> not get a warning: you can't just cast "void **" to something else).

Yes, and get_user(argv) in the !compat case doesn't look nice, I agree.

> See? The advantage of the union is that the types are correct, which
> means that the casts are unnecessary.

My point was, apart from the trivial get_arg_ptr() helper, nobody else
uses this argv/envp, so I thought it is OK to drop the type info and
use "void *".

But as I said, I won't insist. I'll redo/resend.

Thanks.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v2 0/5] exec: unify native/compat code
  2011-02-26 17:44                                         ` Oleg Nesterov
@ 2011-03-01 20:47                                           ` Oleg Nesterov
  2011-03-01 20:48                                             ` [PATCH v2 1/5] exec: introduce get_arg_ptr() helper Oleg Nesterov
                                                               ` (5 more replies)
  0 siblings, 6 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-01 20:47 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

On 02/26, Oleg Nesterov wrote:
>
> On 02/26, Linus Torvalds wrote:
> >
> > See? The advantage of the union is that the types are correct, which
> > means that the casts are unnecessary.
>
> My point was, apart from the trivial get_arg_ptr() helper, nobody else
> uses this argv/envp, so I thought it is OK to drop the type info and
> use "void *".
>
> But as I said, I won't insist. I'll redo/resend.

Well, yes... But it turns out I didn't actually read what you proposed.

	typedef union {
		compat_uptr_t compat;
		const char __user *native;
	} conditional_user_ptr_t;

	...

	where that 'do_execve_common()' takes it's arguments as

		union conditional_user_ptr_t __user *argv,
		union conditional_user_ptr_t __user *envp

I hope you didn't really mean this...

OK, we have two kinds of pointers, the union makes sense. But I think
we do not want the 3rd kind, pointer to the union. This can't help to
avoid the casts. Yes, get_arg_ptr() can do

	&argv->native

but this still means the cast even if it looks different (and tricky).

And. How can we pass "argv" from do_execve() to do_execve_common() ?
We need another cast.

So. If you insist you prefer the pointer to the union - no need to
convince me. Just say this and I'll redo again.

This patch does:

	typedef union {
		const char __user *const __user *native;
		compat_uptr_t __user *compat;
	} conditional_user_ptr_t;

	static int do_execve_common(const char *filename,
			conditional_user_ptr_t argv,
			conditional_user_ptr_t envp,
			struct pt_regs *regs, bool compat)

get_arg_ptr() does argv.native/compat, this looks more understandable.

Do you agree?

copy_strings_kernel() still needs the cast, but this is only because
we want to add "__user" for annotation.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v2 1/5] exec: introduce get_arg_ptr() helper
  2011-03-01 20:47                                           ` [PATCH v2 0/5] exec: unify native/compat code Oleg Nesterov
@ 2011-03-01 20:48                                             ` Oleg Nesterov
  2011-03-01 20:48                                             ` [PATCH v2 2/5] exec: introduce "bool compat" argument Oleg Nesterov
                                                               ` (4 subsequent siblings)
  5 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-01 20:48 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

Introduce get_arg_ptr() helper, convert count() and copy_strings()
to use it.

No functional changes, preparation. This helper is trivial, it just
reads the pointer from argv/envp user-space array.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |   36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

--- 38/fs/exec.c~1_get_arg_ptr	2011-03-01 21:15:47.000000000 +0100
+++ 38/fs/exec.c	2011-03-01 21:17:45.000000000 +0100
@@ -395,6 +395,17 @@ err:
 	return err;
 }
 
+static const char __user *
+get_arg_ptr(const char __user * const __user *argv, int argc)
+{
+	const char __user *ptr;
+
+	if (get_user(ptr, argv + argc))
+		return ERR_PTR(-EFAULT);
+
+	return ptr;
+}
+
 /*
  * count() counts the number of strings in array ARGV.
  */
@@ -404,13 +415,14 @@ static int count(const char __user * con
 
 	if (argv != NULL) {
 		for (;;) {
-			const char __user * p;
+			const char __user *p = get_arg_ptr(argv, i);
 
-			if (get_user(p, argv))
-				return -EFAULT;
 			if (!p)
 				break;
-			argv++;
+
+			if (IS_ERR(p))
+				return -EFAULT;
+
 			if (i++ >= max)
 				return -E2BIG;
 
@@ -440,16 +452,18 @@ static int copy_strings(int argc, const 
 		int len;
 		unsigned long pos;
 
-		if (get_user(str, argv+argc) ||
-				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		str = get_arg_ptr(argv, argc);
+		if (IS_ERR(str))
 			goto out;
-		}
 
-		if (!valid_arg_len(bprm, len)) {
-			ret = -E2BIG;
+		len = strnlen_user(str, MAX_ARG_STRLEN);
+		if (!len)
+			goto out;
+
+		ret = -E2BIG;
+		if (!valid_arg_len(bprm, len))
 			goto out;
-		}
 
 		/* We're going to work our way backwords. */
 		pos = bprm->p;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v2 2/5] exec: introduce "bool compat" argument
  2011-03-01 20:47                                           ` [PATCH v2 0/5] exec: unify native/compat code Oleg Nesterov
  2011-03-01 20:48                                             ` [PATCH v2 1/5] exec: introduce get_arg_ptr() helper Oleg Nesterov
@ 2011-03-01 20:48                                             ` Oleg Nesterov
  2011-03-01 20:48                                             ` [PATCH v2 3/5] exec: introduce conditional_user_ptr_t Oleg Nesterov
                                                               ` (3 subsequent siblings)
  5 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-01 20:48 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

No functional changes, preparation to simplify the review.

Add the new (and currently unused) "bool compat" argument to
get_arg_ptr(), count(), and copy_strings().

Add this argument to do_execve() as well, and rename it to
do_execve_common().

Reintroduce do_execve() as a trivial wrapper() on top of
do_execve_common(compat => false).

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |   33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

--- 38/fs/exec.c~2_is_compat_arg	2011-03-01 21:17:45.000000000 +0100
+++ 38/fs/exec.c	2011-03-01 21:17:46.000000000 +0100
@@ -396,7 +396,7 @@ err:
 }
 
 static const char __user *
-get_arg_ptr(const char __user * const __user *argv, int argc)
+get_arg_ptr(const char __user * const __user *argv, int argc, bool compat)
 {
 	const char __user *ptr;
 
@@ -409,13 +409,13 @@ get_arg_ptr(const char __user * const __
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(const char __user * const __user * argv, int max)
+static int count(const char __user * const __user *argv, int max, bool compat)
 {
 	int i = 0;
 
 	if (argv != NULL) {
 		for (;;) {
-			const char __user *p = get_arg_ptr(argv, i);
+			const char __user *p = get_arg_ptr(argv, i, compat);
 
 			if (!p)
 				break;
@@ -440,7 +440,7 @@ static int count(const char __user * con
  * ensures the destination page is created and not swapped out.
  */
 static int copy_strings(int argc, const char __user *const __user *argv,
-			struct linux_binprm *bprm)
+			struct linux_binprm *bprm, bool compat)
 {
 	struct page *kmapped_page = NULL;
 	char *kaddr = NULL;
@@ -453,7 +453,7 @@ static int copy_strings(int argc, const 
 		unsigned long pos;
 
 		ret = -EFAULT;
-		str = get_arg_ptr(argv, argc);
+		str = get_arg_ptr(argv, argc, compat);
 		if (IS_ERR(str))
 			goto out;
 
@@ -536,7 +536,8 @@ int copy_strings_kernel(int argc, const 
 	int r;
 	mm_segment_t oldfs = get_fs();
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
+	r = copy_strings(argc, (const char __user *const  __user *)argv,
+				bprm, false);
 	set_fs(oldfs);
 	return r;
 }
@@ -1387,10 +1388,10 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(const char * filename,
+static int do_execve_common(const char *filename,
 	const char __user *const __user *argv,
 	const char __user *const __user *envp,
-	struct pt_regs * regs)
+	struct pt_regs *regs, bool compat)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1432,11 +1433,11 @@ int do_execve(const char * filename,
 	if (retval)
 		goto out_file;
 
-	bprm->argc = count(argv, MAX_ARG_STRINGS);
+	bprm->argc = count(argv, MAX_ARG_STRINGS, compat);
 	if ((retval = bprm->argc) < 0)
 		goto out;
 
-	bprm->envc = count(envp, MAX_ARG_STRINGS);
+	bprm->envc = count(envp, MAX_ARG_STRINGS, compat);
 	if ((retval = bprm->envc) < 0)
 		goto out;
 
@@ -1449,11 +1450,11 @@ int do_execve(const char * filename,
 		goto out;
 
 	bprm->exec = bprm->p;
-	retval = copy_strings(bprm->envc, envp, bprm);
+	retval = copy_strings(bprm->envc, envp, bprm, compat);
 	if (retval < 0)
 		goto out;
 
-	retval = copy_strings(bprm->argc, argv, bprm);
+	retval = copy_strings(bprm->argc, argv, bprm, compat);
 	if (retval < 0)
 		goto out;
 
@@ -1497,6 +1498,14 @@ out_ret:
 	return retval;
 }
 
+int do_execve(const char *filename,
+	const char __user *const __user *argv,
+	const char __user *const __user *envp,
+	struct pt_regs *regs)
+{
+	return do_execve_common(filename, argv, envp, regs, false);
+}
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v2 3/5] exec: introduce conditional_user_ptr_t
  2011-03-01 20:47                                           ` [PATCH v2 0/5] exec: unify native/compat code Oleg Nesterov
  2011-03-01 20:48                                             ` [PATCH v2 1/5] exec: introduce get_arg_ptr() helper Oleg Nesterov
  2011-03-01 20:48                                             ` [PATCH v2 2/5] exec: introduce "bool compat" argument Oleg Nesterov
@ 2011-03-01 20:48                                             ` Oleg Nesterov
  2011-03-01 20:49                                             ` [PATCH v2 4/5] exec: unify do_execve/compat_do_execve code Oleg Nesterov
                                                               ` (2 subsequent siblings)
  5 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-01 20:48 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

No functional changes, preparation.

Introduce conditional_user_ptr_t, change do_execve() paths to use it
instead of "char __user * const __user *argv".

This makes the argv/envp arguments opaque, we are ready to handle the
compat case which needs argv pointing to compat_uptr_t.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |   33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

--- 38/fs/exec.c~3_typdef_for_argv	2011-03-01 21:17:46.000000000 +0100
+++ 38/fs/exec.c	2011-03-01 21:17:46.000000000 +0100
@@ -395,12 +395,16 @@ err:
 	return err;
 }
 
+typedef union {
+	const char __user *const __user *native;
+} conditional_user_ptr_t;
+
 static const char __user *
-get_arg_ptr(const char __user * const __user *argv, int argc, bool compat)
+get_arg_ptr(conditional_user_ptr_t argv, int argc, bool compat)
 {
 	const char __user *ptr;
 
-	if (get_user(ptr, argv + argc))
+	if (get_user(ptr, argv.native + argc))
 		return ERR_PTR(-EFAULT);
 
 	return ptr;
@@ -409,11 +413,11 @@ get_arg_ptr(const char __user * const __
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(const char __user * const __user *argv, int max, bool compat)
+static int count(conditional_user_ptr_t argv, int max, bool compat)
 {
 	int i = 0;
 
-	if (argv != NULL) {
+	if (argv.native != NULL) {
 		for (;;) {
 			const char __user *p = get_arg_ptr(argv, i, compat);
 
@@ -439,7 +443,7 @@ static int count(const char __user * con
  * processes's memory to the new process's stack.  The call to get_user_pages()
  * ensures the destination page is created and not swapped out.
  */
-static int copy_strings(int argc, const char __user *const __user *argv,
+static int copy_strings(int argc, conditional_user_ptr_t argv,
 			struct linux_binprm *bprm, bool compat)
 {
 	struct page *kmapped_page = NULL;
@@ -530,15 +534,19 @@ out:
 /*
  * Like copy_strings, but get argv and its values from kernel memory.
  */
-int copy_strings_kernel(int argc, const char *const *argv,
+int copy_strings_kernel(int argc, const char *const *ptr,
 			struct linux_binprm *bprm)
 {
 	int r;
 	mm_segment_t oldfs = get_fs();
+	conditional_user_ptr_t argv = {
+		.native = (const char __user *const  __user *)ptr,
+	};
+
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (const char __user *const  __user *)argv,
-				bprm, false);
+	r = copy_strings(argc, argv, bprm, false);
 	set_fs(oldfs);
+
 	return r;
 }
 EXPORT_SYMBOL(copy_strings_kernel);
@@ -1389,8 +1397,7 @@ EXPORT_SYMBOL(search_binary_handler);
  * sys_execve() executes a new program.
  */
 static int do_execve_common(const char *filename,
-	const char __user *const __user *argv,
-	const char __user *const __user *envp,
+	conditional_user_ptr_t argv, conditional_user_ptr_t envp,
 	struct pt_regs *regs, bool compat)
 {
 	struct linux_binprm *bprm;
@@ -1499,10 +1506,12 @@ out_ret:
 }
 
 int do_execve(const char *filename,
-	const char __user *const __user *argv,
-	const char __user *const __user *envp,
+	const char __user *const __user *__argv,
+	const char __user *const __user *__envp,
 	struct pt_regs *regs)
 {
+	conditional_user_ptr_t argv = { .native = __argv };
+	conditional_user_ptr_t envp = { .native = __envp };
 	return do_execve_common(filename, argv, envp, regs, false);
 }
 


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v2 4/5] exec: unify do_execve/compat_do_execve code
  2011-03-01 20:47                                           ` [PATCH v2 0/5] exec: unify native/compat code Oleg Nesterov
                                                               ` (2 preceding siblings ...)
  2011-03-01 20:48                                             ` [PATCH v2 3/5] exec: introduce conditional_user_ptr_t Oleg Nesterov
@ 2011-03-01 20:49                                             ` Oleg Nesterov
  2011-03-01 20:49                                             ` [PATCH v2 5/5] exec: document acct_arg_size() Oleg Nesterov
  2011-03-01 21:39                                             ` [PATCH v2 0/5] exec: unify native/compat code Linus Torvalds
  5 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-01 20:49 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

Add the appropriate member to the conditional_user_ptr_t union and
teach get_arg_ptr() to handle the compat = true case correctly.

This allows us to remove the compat_do_execve() code from fs/compat.c
and reimplement compat_do_execve() as the trivial wrapper on top of
do_execve_common(compat => true).

In fact, this fixes another (minor) bug. "compat_uptr_t str" can
overflow after "str += len" in compat_copy_strings() if a 64bit
application execs via sys32_execve().

Unexport acct_arg_size() and get_arg_page(), fs/compat.c doesn't
need them any longer.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 include/linux/binfmts.h |    4 
 fs/exec.c               |   33 +++++-
 fs/compat.c             |  235 ------------------------------------------------
 3 files changed, 29 insertions(+), 243 deletions(-)

--- 38/include/linux/binfmts.h~4_use_compat	2011-03-01 21:15:45.000000000 +0100
+++ 38/include/linux/binfmts.h	2011-03-01 21:17:47.000000000 +0100
@@ -60,10 +60,6 @@ struct linux_binprm {
 	unsigned long loader, exec;
 };
 
-extern void acct_arg_size(struct linux_binprm *bprm, unsigned long pages);
-extern struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
-					int write);
-
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
 
--- 38/fs/exec.c~4_use_compat	2011-03-01 21:17:46.000000000 +0100
+++ 38/fs/exec.c	2011-03-01 21:17:47.000000000 +0100
@@ -55,6 +55,7 @@
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -164,7 +165,7 @@ out:
 
 #ifdef CONFIG_MMU
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -183,7 +184,7 @@ void acct_arg_size(struct linux_binprm *
 #endif
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -302,11 +303,11 @@ static bool valid_arg_len(struct linux_b
 
 #else
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -397,6 +398,7 @@ err:
 
 typedef union {
 	const char __user *const __user *native;
+	compat_uptr_t __user *compat;
 } conditional_user_ptr_t;
 
 static const char __user *
@@ -404,6 +406,17 @@ get_arg_ptr(conditional_user_ptr_t argv,
 {
 	const char __user *ptr;
 
+#ifdef CONFIG_COMPAT
+	if (unlikely(compat)) {
+		compat_uptr_t p;
+
+		if (get_user(p, argv.compat + argc))
+			return ERR_PTR(-EFAULT);
+
+		return compat_ptr(p);
+	}
+#endif
+
 	if (get_user(ptr, argv.native + argc))
 		return ERR_PTR(-EFAULT);
 
@@ -1515,6 +1528,18 @@ int do_execve(const char *filename,
 	return do_execve_common(filename, argv, envp, regs, false);
 }
 
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+	compat_uptr_t __user *__argv,
+	compat_uptr_t __user *__envp,
+	struct pt_regs *regs)
+{
+	conditional_user_ptr_t argv = { .compat = __argv };
+	conditional_user_ptr_t envp = { .compat = __envp };
+	return do_execve_common(filename, argv, envp, regs, true);
+}
+#endif
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;
--- 38/fs/compat.c~4_use_compat	2011-03-01 21:15:45.000000000 +0100
+++ 38/fs/compat.c	2011-03-01 21:17:47.000000000 +0100
@@ -1330,241 +1330,6 @@ compat_sys_openat(unsigned int dfd, cons
 	return do_sys_open(dfd, filename, flags, mode);
 }
 
-/*
- * compat_count() counts the number of arguments/envelopes. It is basically
- * a copy of count() from fs/exec.c, except that it works with 32 bit argv
- * and envp pointers.
- */
-static int compat_count(compat_uptr_t __user *argv, int max)
-{
-	int i = 0;
-
-	if (argv != NULL) {
-		for (;;) {
-			compat_uptr_t p;
-
-			if (get_user(p, argv))
-				return -EFAULT;
-			if (!p)
-				break;
-			argv++;
-			if (i++ >= max)
-				return -E2BIG;
-
-			if (fatal_signal_pending(current))
-				return -ERESTARTNOHAND;
-			cond_resched();
-		}
-	}
-	return i;
-}
-
-/*
- * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
- * except that it works with 32 bit argv and envp pointers.
- */
-static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
-				struct linux_binprm *bprm)
-{
-	struct page *kmapped_page = NULL;
-	char *kaddr = NULL;
-	unsigned long kpos = 0;
-	int ret;
-
-	while (argc-- > 0) {
-		compat_uptr_t str;
-		int len;
-		unsigned long pos;
-
-		if (get_user(str, argv+argc) ||
-		    !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		if (len > MAX_ARG_STRLEN) {
-			ret = -E2BIG;
-			goto out;
-		}
-
-		/* We're going to work our way backwords. */
-		pos = bprm->p;
-		str += len;
-		bprm->p -= len;
-
-		while (len > 0) {
-			int offset, bytes_to_copy;
-
-			if (fatal_signal_pending(current)) {
-				ret = -ERESTARTNOHAND;
-				goto out;
-			}
-			cond_resched();
-
-			offset = pos % PAGE_SIZE;
-			if (offset == 0)
-				offset = PAGE_SIZE;
-
-			bytes_to_copy = offset;
-			if (bytes_to_copy > len)
-				bytes_to_copy = len;
-
-			offset -= bytes_to_copy;
-			pos -= bytes_to_copy;
-			str -= bytes_to_copy;
-			len -= bytes_to_copy;
-
-			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
-				struct page *page;
-
-				page = get_arg_page(bprm, pos, 1);
-				if (!page) {
-					ret = -E2BIG;
-					goto out;
-				}
-
-				if (kmapped_page) {
-					flush_kernel_dcache_page(kmapped_page);
-					kunmap(kmapped_page);
-					put_page(kmapped_page);
-				}
-				kmapped_page = page;
-				kaddr = kmap(kmapped_page);
-				kpos = pos & PAGE_MASK;
-				flush_cache_page(bprm->vma, kpos,
-						 page_to_pfn(kmapped_page));
-			}
-			if (copy_from_user(kaddr+offset, compat_ptr(str),
-						bytes_to_copy)) {
-				ret = -EFAULT;
-				goto out;
-			}
-		}
-	}
-	ret = 0;
-out:
-	if (kmapped_page) {
-		flush_kernel_dcache_page(kmapped_page);
-		kunmap(kmapped_page);
-		put_page(kmapped_page);
-	}
-	return ret;
-}
-
-/*
- * compat_do_execve() is mostly a copy of do_execve(), with the exception
- * that it processes 32 bit argv and envp pointers.
- */
-int compat_do_execve(char * filename,
-	compat_uptr_t __user *argv,
-	compat_uptr_t __user *envp,
-	struct pt_regs * regs)
-{
-	struct linux_binprm *bprm;
-	struct file *file;
-	struct files_struct *displaced;
-	bool clear_in_exec;
-	int retval;
-
-	retval = unshare_files(&displaced);
-	if (retval)
-		goto out_ret;
-
-	retval = -ENOMEM;
-	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
-	if (!bprm)
-		goto out_files;
-
-	retval = prepare_bprm_creds(bprm);
-	if (retval)
-		goto out_free;
-
-	retval = check_unsafe_exec(bprm);
-	if (retval < 0)
-		goto out_free;
-	clear_in_exec = retval;
-	current->in_execve = 1;
-
-	file = open_exec(filename);
-	retval = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out_unmark;
-
-	sched_exec();
-
-	bprm->file = file;
-	bprm->filename = filename;
-	bprm->interp = filename;
-
-	retval = bprm_mm_init(bprm);
-	if (retval)
-		goto out_file;
-
-	bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
-	if ((retval = bprm->argc) < 0)
-		goto out;
-
-	bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
-	if ((retval = bprm->envc) < 0)
-		goto out;
-
-	retval = prepare_binprm(bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = copy_strings_kernel(1, &bprm->filename, bprm);
-	if (retval < 0)
-		goto out;
-
-	bprm->exec = bprm->p;
-	retval = compat_copy_strings(bprm->envc, envp, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = compat_copy_strings(bprm->argc, argv, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = search_binary_handler(bprm, regs);
-	if (retval < 0)
-		goto out;
-
-	/* execve succeeded */
-	current->fs->in_exec = 0;
-	current->in_execve = 0;
-	acct_update_integrals(current);
-	free_bprm(bprm);
-	if (displaced)
-		put_files_struct(displaced);
-	return retval;
-
-out:
-	if (bprm->mm) {
-		acct_arg_size(bprm, 0);
-		mmput(bprm->mm);
-	}
-
-out_file:
-	if (bprm->file) {
-		allow_write_access(bprm->file);
-		fput(bprm->file);
-	}
-
-out_unmark:
-	if (clear_in_exec)
-		current->fs->in_exec = 0;
-	current->in_execve = 0;
-
-out_free:
-	free_bprm(bprm);
-
-out_files:
-	if (displaced)
-		reset_files_struct(displaced);
-out_ret:
-	return retval;
-}
-
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
 static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v2 5/5] exec: document acct_arg_size()
  2011-03-01 20:47                                           ` [PATCH v2 0/5] exec: unify native/compat code Oleg Nesterov
                                                               ` (3 preceding siblings ...)
  2011-03-01 20:49                                             ` [PATCH v2 4/5] exec: unify do_execve/compat_do_execve code Oleg Nesterov
@ 2011-03-01 20:49                                             ` Oleg Nesterov
  2011-03-01 21:39                                             ` [PATCH v2 0/5] exec: unify native/compat code Linus Torvalds
  5 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-01 20:49 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

Add the comment to explain acct_arg_size().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

--- 38/fs/exec.c~5_doc_acct_arg_size	2011-03-01 21:17:47.000000000 +0100
+++ 38/fs/exec.c	2011-03-01 21:17:47.000000000 +0100
@@ -164,7 +164,12 @@ out:
 }
 
 #ifdef CONFIG_MMU
-
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, account these pages in current->mm temporary
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
 static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v2 0/5] exec: unify native/compat code
  2011-03-01 20:47                                           ` [PATCH v2 0/5] exec: unify native/compat code Oleg Nesterov
                                                               ` (4 preceding siblings ...)
  2011-03-01 20:49                                             ` [PATCH v2 5/5] exec: document acct_arg_size() Oleg Nesterov
@ 2011-03-01 21:39                                             ` Linus Torvalds
  2011-03-02 16:26                                               ` [PATCH v3 0/4] " Oleg Nesterov
  5 siblings, 1 reply; 109+ messages in thread
From: Linus Torvalds @ 2011-03-01 21:39 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On Tue, Mar 1, 2011 at 12:47 PM, Oleg Nesterov <oleg@redhat.com> wrote:
>        where that 'do_execve_common()' takes its arguments as
>
>                union conditional_user_ptr_t __user *argv,
>                union conditional_user_ptr_t __user *envp
>
> I hope you didn't really mean this...

I really did mean that (although not the double "union" + "_t" thing
for the typedef).

But I'm not going to claim that it has to be done exactly that way,
the union can certainly be encapsulated differently too.

So I'm ok with your alternative

>        typedef union {
>                const char __user *const __user *native;
>                compat_uptr_t __user *compat;
>        } conditional_user_ptr_t;

model instead, which moves the pointer into the union.

However, if you do this, then I have one more suggestion: just move
the "compat" flag in there too!

Every time you pass the union, you're going to pass the compat flag to
distinguish the cases. So do it like this:

  struct conditional_ptr {
    int is_compat;
    union {
      const char __user *const __user *native;
      compat_uptr_t __user *compat;
    };
  };

and it will all look much cleaner, I bet.

                        Linus

^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v3 0/4] exec: unify native/compat code
  2011-03-01 21:39                                             ` [PATCH v2 0/5] exec: unify native/compat code Linus Torvalds
@ 2011-03-02 16:26                                               ` Oleg Nesterov
  2011-03-02 16:27                                                 ` [PATCH v3 1/4] exec: introduce get_arg_ptr() helper Oleg Nesterov
                                                                   ` (4 more replies)
  0 siblings, 5 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-02 16:26 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

On 03/01, Linus Torvalds wrote:
>
> So I'm ok with your alternative
>
> >        typedef union {
> >                const char __user *const __user *native;
> >                compat_uptr_t __user *compat;
> >        } conditional_user_ptr_t;
>
> model instead, which moves the pointer into the union.
>
> However, if you do this, then I have one more suggestion: just move
> the "compat" flag in there too!
>
> Every time you pass the union, you're going to pass the compat flag to
> distinguish the cases. So do it like this:
>
>   struct conditional_ptr {
>     int is_compat;
>     union {
>       const char __user *const __user *native;
>       compat_uptr_t __user *compat;
>     };
>   };
>
> and it will all look much cleaner, I bet.

Heh. I knew. I swear, I knew you would suggest this ;)

OK, please find v3. I had to deanonymize the union though, otherwise
the initializer in do_execve() becomes nontrivial.



But I don't think this is right. Not only does this add 200 bytes to exec.o;
to me, is_compat is not a private property of argv/envp. Yes, currently
nobody except get_arg_ptr() needs to know the difference. But who knows,
it is possible that we will need more "if (compat)" code in the future. IOW,
I think that the explicit argument is a win.

Never mind. I agree with everything as long as we can remove this
copy-and-paste compat_do_execve().

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v3 1/4] exec: introduce get_arg_ptr() helper
  2011-03-02 16:26                                               ` [PATCH v3 0/4] " Oleg Nesterov
@ 2011-03-02 16:27                                                 ` Oleg Nesterov
  2011-03-03  3:01                                                   ` KOSAKI Motohiro
  2011-03-02 16:27                                                 ` [PATCH v3 2/4] exec: introduce struct conditional_ptr Oleg Nesterov
                                                                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-02 16:27 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

Introduce get_arg_ptr() helper, convert count() and copy_strings()
to use it.

No functional changes, preparation. This helper is trivial: it just
reads the pointer from the argv/envp user-space array.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |   36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

--- 38/fs/exec.c~1_get_arg_ptr	2011-03-02 15:15:27.000000000 +0100
+++ 38/fs/exec.c	2011-03-02 15:16:44.000000000 +0100
@@ -395,6 +395,17 @@ err:
 	return err;
 }
 
+static const char __user *
+get_arg_ptr(const char __user * const __user *argv, int argc)
+{
+	const char __user *ptr;
+
+	if (get_user(ptr, argv + argc))
+		return ERR_PTR(-EFAULT);
+
+	return ptr;
+}
+
 /*
  * count() counts the number of strings in array ARGV.
  */
@@ -404,13 +415,14 @@ static int count(const char __user * con
 
 	if (argv != NULL) {
 		for (;;) {
-			const char __user * p;
+			const char __user *p = get_arg_ptr(argv, i);
 
-			if (get_user(p, argv))
-				return -EFAULT;
 			if (!p)
 				break;
-			argv++;
+
+			if (IS_ERR(p))
+				return -EFAULT;
+
 			if (i++ >= max)
 				return -E2BIG;
 
@@ -440,16 +452,18 @@ static int copy_strings(int argc, const 
 		int len;
 		unsigned long pos;
 
-		if (get_user(str, argv+argc) ||
-				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		str = get_arg_ptr(argv, argc);
+		if (IS_ERR(str))
 			goto out;
-		}
 
-		if (!valid_arg_len(bprm, len)) {
-			ret = -E2BIG;
+		len = strnlen_user(str, MAX_ARG_STRLEN);
+		if (!len)
+			goto out;
+
+		ret = -E2BIG;
+		if (!valid_arg_len(bprm, len))
 			goto out;
-		}
 
 		/* We're going to work our way backwords. */
 		pos = bprm->p;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v3 2/4] exec: introduce struct conditional_ptr
  2011-03-02 16:26                                               ` [PATCH v3 0/4] " Oleg Nesterov
  2011-03-02 16:27                                                 ` [PATCH v3 1/4] exec: introduce get_arg_ptr() helper Oleg Nesterov
@ 2011-03-02 16:27                                                 ` Oleg Nesterov
  2011-03-03  3:08                                                   ` KOSAKI Motohiro
  2011-03-02 16:27                                                 ` [PATCH v3 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
                                                                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-02 16:27 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

No functional changes, preparation.

Introduce struct conditional_ptr, change do_execve() paths to use it
instead of "char __user * const __user *argv".

This makes the argv/envp arguments opaque, so we are ready to handle the
compat case, which needs argv pointing to compat_uptr_t.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |   42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

--- 38/fs/exec.c~2_typedef_for_argv	2011-03-02 15:40:22.000000000 +0100
+++ 38/fs/exec.c	2011-03-02 15:40:44.000000000 +0100
@@ -395,12 +395,15 @@ err:
 	return err;
 }
 
-static const char __user *
-get_arg_ptr(const char __user * const __user *argv, int argc)
+struct conditional_ptr {
+	const char __user *const __user *native;
+};
+
+static const char __user *get_arg_ptr(struct conditional_ptr argv, int argc)
 {
 	const char __user *ptr;
 
-	if (get_user(ptr, argv + argc))
+	if (get_user(ptr, argv.native + argc))
 		return ERR_PTR(-EFAULT);
 
 	return ptr;
@@ -409,11 +412,11 @@ get_arg_ptr(const char __user * const __
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(const char __user * const __user * argv, int max)
+static int count(struct conditional_ptr argv, int max)
 {
 	int i = 0;
 
-	if (argv != NULL) {
+	if (argv.native != NULL) {
 		for (;;) {
 			const char __user *p = get_arg_ptr(argv, i);
 
@@ -439,7 +442,7 @@ static int count(const char __user * con
  * processes's memory to the new process's stack.  The call to get_user_pages()
  * ensures the destination page is created and not swapped out.
  */
-static int copy_strings(int argc, const char __user *const __user *argv,
+static int copy_strings(int argc, struct conditional_ptr argv,
 			struct linux_binprm *bprm)
 {
 	struct page *kmapped_page = NULL;
@@ -530,14 +533,19 @@ out:
 /*
  * Like copy_strings, but get argv and its values from kernel memory.
  */
-int copy_strings_kernel(int argc, const char *const *argv,
+int copy_strings_kernel(int argc, const char *const *__argv,
 			struct linux_binprm *bprm)
 {
 	int r;
 	mm_segment_t oldfs = get_fs();
+	struct conditional_ptr argv = {
+		.native = (const char __user *const  __user *)__argv,
+	};
+
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
+	r = copy_strings(argc, argv, bprm);
 	set_fs(oldfs);
+
 	return r;
 }
 EXPORT_SYMBOL(copy_strings_kernel);
@@ -1387,10 +1395,10 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(const char * filename,
-	const char __user *const __user *argv,
-	const char __user *const __user *envp,
-	struct pt_regs * regs)
+static int do_execve_common(const char *filename,
+				struct conditional_ptr argv,
+				struct conditional_ptr envp,
+				struct pt_regs *regs)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1497,6 +1505,16 @@ out_ret:
 	return retval;
 }
 
+int do_execve(const char *filename,
+	const char __user *const __user *__argv,
+	const char __user *const __user *__envp,
+	struct pt_regs *regs)
+{
+	struct conditional_ptr argv = { .native = __argv };
+	struct conditional_ptr envp = { .native = __envp };
+	return do_execve_common(filename, argv, envp, regs);
+}
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v3 3/4] exec: unify do_execve/compat_do_execve code
  2011-03-02 16:26                                               ` [PATCH v3 0/4] " Oleg Nesterov
  2011-03-02 16:27                                                 ` [PATCH v3 1/4] exec: introduce get_arg_ptr() helper Oleg Nesterov
  2011-03-02 16:27                                                 ` [PATCH v3 2/4] exec: introduce struct conditional_ptr Oleg Nesterov
@ 2011-03-02 16:27                                                 ` Oleg Nesterov
  2011-03-03  3:13                                                   ` KOSAKI Motohiro
  2011-03-02 16:28                                                 ` [PATCH v3 4/4] exec: document acct_arg_size() Oleg Nesterov
  2011-03-02 16:44                                                 ` [PATCH v3 0/4] exec: unify native/compat code Oleg Nesterov
  4 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-02 16:27 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

Add the appropriate members to struct conditional_ptr and teach
get_arg_ptr() to handle the is_compat = true case correctly.

This allows us to remove the compat_do_execve() code from fs/compat.c
and reimplement compat_do_execve() as the trivial wrapper on top of
do_execve_common(is_compat => true).

In fact, this fixes another (minor) bug. "compat_uptr_t str" can
overflow after "str += len" in compat_copy_strings() if a 64bit
application execs via sys32_execve().

Unexport acct_arg_size() and get_arg_page(), fs/compat.c doesn't
need them any longer.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 include/linux/binfmts.h |    4 
 fs/exec.c               |   58 +++++++++--
 fs/compat.c             |  235 ------------------------------------------------
 3 files changed, 46 insertions(+), 251 deletions(-)

--- 38/include/linux/binfmts.h~3_handle_compat_case	2011-03-02 15:15:25.000000000 +0100
+++ 38/include/linux/binfmts.h	2011-03-02 15:47:15.000000000 +0100
@@ -60,10 +60,6 @@ struct linux_binprm {
 	unsigned long loader, exec;
 };
 
-extern void acct_arg_size(struct linux_binprm *bprm, unsigned long pages);
-extern struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
-					int write);
-
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
 
--- 38/fs/exec.c~3_handle_compat_case	2011-03-02 15:40:44.000000000 +0100
+++ 38/fs/exec.c	2011-03-02 16:21:57.000000000 +0100
@@ -55,6 +55,7 @@
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -164,7 +165,7 @@ out:
 
 #ifdef CONFIG_MMU
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -183,7 +184,7 @@ void acct_arg_size(struct linux_binprm *
 #endif
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -302,11 +303,11 @@ static bool valid_arg_len(struct linux_b
 
 #else
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -396,17 +397,34 @@ err:
 }
 
 struct conditional_ptr {
-	const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+	bool is_compat;
+#endif
+	union {
+		const char __user *const __user *native;
+		compat_uptr_t __user *compat;
+	} ptr;
 };
 
 static const char __user *get_arg_ptr(struct conditional_ptr argv, int argc)
 {
-	const char __user *ptr;
+	const char __user *native;
 
-	if (get_user(ptr, argv.native + argc))
+#ifdef CONFIG_COMPAT
+	if (unlikely(argv.is_compat)) {
+		compat_uptr_t compat;
+
+		if (get_user(compat, argv.ptr.compat + argc))
+			return ERR_PTR(-EFAULT);
+
+		return compat_ptr(compat);
+	}
+#endif
+
+	if (get_user(native, argv.ptr.native + argc))
 		return ERR_PTR(-EFAULT);
 
-	return ptr;
+	return native;
 }
 
 /*
@@ -416,7 +434,7 @@ static int count(struct conditional_ptr 
 {
 	int i = 0;
 
-	if (argv.native != NULL) {
+	if (argv.ptr.native != NULL) {
 		for (;;) {
 			const char __user *p = get_arg_ptr(argv, i);
 
@@ -539,7 +557,7 @@ int copy_strings_kernel(int argc, const 
 	int r;
 	mm_segment_t oldfs = get_fs();
 	struct conditional_ptr argv = {
-		.native = (const char __user *const  __user *)__argv,
+		.ptr.native = (const char __user *const  __user *)__argv,
 	};
 
 	set_fs(KERNEL_DS);
@@ -1510,11 +1528,27 @@ int do_execve(const char *filename,
 	const char __user *const __user *__envp,
 	struct pt_regs *regs)
 {
-	struct conditional_ptr argv = { .native = __argv };
-	struct conditional_ptr envp = { .native = __envp };
+	struct conditional_ptr argv = { .ptr.native = __argv };
+	struct conditional_ptr envp = { .ptr.native = __envp };
 	return do_execve_common(filename, argv, envp, regs);
 }
 
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+	compat_uptr_t __user *__argv,
+	compat_uptr_t __user *__envp,
+	struct pt_regs *regs)
+{
+	struct conditional_ptr argv = {
+		.is_compat = true, .ptr.compat = __argv,
+	};
+	struct conditional_ptr envp = {
+		.is_compat = true, .ptr.compat = __envp,
+	};
+	return do_execve_common(filename, argv, envp, regs);
+}
+#endif
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;
--- 38/fs/compat.c~3_handle_compat_case	2011-03-02 15:15:25.000000000 +0100
+++ 38/fs/compat.c	2011-03-02 15:47:15.000000000 +0100
@@ -1330,241 +1330,6 @@ compat_sys_openat(unsigned int dfd, cons
 	return do_sys_open(dfd, filename, flags, mode);
 }
 
-/*
- * compat_count() counts the number of arguments/envelopes. It is basically
- * a copy of count() from fs/exec.c, except that it works with 32 bit argv
- * and envp pointers.
- */
-static int compat_count(compat_uptr_t __user *argv, int max)
-{
-	int i = 0;
-
-	if (argv != NULL) {
-		for (;;) {
-			compat_uptr_t p;
-
-			if (get_user(p, argv))
-				return -EFAULT;
-			if (!p)
-				break;
-			argv++;
-			if (i++ >= max)
-				return -E2BIG;
-
-			if (fatal_signal_pending(current))
-				return -ERESTARTNOHAND;
-			cond_resched();
-		}
-	}
-	return i;
-}
-
-/*
- * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
- * except that it works with 32 bit argv and envp pointers.
- */
-static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
-				struct linux_binprm *bprm)
-{
-	struct page *kmapped_page = NULL;
-	char *kaddr = NULL;
-	unsigned long kpos = 0;
-	int ret;
-
-	while (argc-- > 0) {
-		compat_uptr_t str;
-		int len;
-		unsigned long pos;
-
-		if (get_user(str, argv+argc) ||
-		    !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		if (len > MAX_ARG_STRLEN) {
-			ret = -E2BIG;
-			goto out;
-		}
-
-		/* We're going to work our way backwords. */
-		pos = bprm->p;
-		str += len;
-		bprm->p -= len;
-
-		while (len > 0) {
-			int offset, bytes_to_copy;
-
-			if (fatal_signal_pending(current)) {
-				ret = -ERESTARTNOHAND;
-				goto out;
-			}
-			cond_resched();
-
-			offset = pos % PAGE_SIZE;
-			if (offset == 0)
-				offset = PAGE_SIZE;
-
-			bytes_to_copy = offset;
-			if (bytes_to_copy > len)
-				bytes_to_copy = len;
-
-			offset -= bytes_to_copy;
-			pos -= bytes_to_copy;
-			str -= bytes_to_copy;
-			len -= bytes_to_copy;
-
-			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
-				struct page *page;
-
-				page = get_arg_page(bprm, pos, 1);
-				if (!page) {
-					ret = -E2BIG;
-					goto out;
-				}
-
-				if (kmapped_page) {
-					flush_kernel_dcache_page(kmapped_page);
-					kunmap(kmapped_page);
-					put_page(kmapped_page);
-				}
-				kmapped_page = page;
-				kaddr = kmap(kmapped_page);
-				kpos = pos & PAGE_MASK;
-				flush_cache_page(bprm->vma, kpos,
-						 page_to_pfn(kmapped_page));
-			}
-			if (copy_from_user(kaddr+offset, compat_ptr(str),
-						bytes_to_copy)) {
-				ret = -EFAULT;
-				goto out;
-			}
-		}
-	}
-	ret = 0;
-out:
-	if (kmapped_page) {
-		flush_kernel_dcache_page(kmapped_page);
-		kunmap(kmapped_page);
-		put_page(kmapped_page);
-	}
-	return ret;
-}
-
-/*
- * compat_do_execve() is mostly a copy of do_execve(), with the exception
- * that it processes 32 bit argv and envp pointers.
- */
-int compat_do_execve(char * filename,
-	compat_uptr_t __user *argv,
-	compat_uptr_t __user *envp,
-	struct pt_regs * regs)
-{
-	struct linux_binprm *bprm;
-	struct file *file;
-	struct files_struct *displaced;
-	bool clear_in_exec;
-	int retval;
-
-	retval = unshare_files(&displaced);
-	if (retval)
-		goto out_ret;
-
-	retval = -ENOMEM;
-	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
-	if (!bprm)
-		goto out_files;
-
-	retval = prepare_bprm_creds(bprm);
-	if (retval)
-		goto out_free;
-
-	retval = check_unsafe_exec(bprm);
-	if (retval < 0)
-		goto out_free;
-	clear_in_exec = retval;
-	current->in_execve = 1;
-
-	file = open_exec(filename);
-	retval = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out_unmark;
-
-	sched_exec();
-
-	bprm->file = file;
-	bprm->filename = filename;
-	bprm->interp = filename;
-
-	retval = bprm_mm_init(bprm);
-	if (retval)
-		goto out_file;
-
-	bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
-	if ((retval = bprm->argc) < 0)
-		goto out;
-
-	bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
-	if ((retval = bprm->envc) < 0)
-		goto out;
-
-	retval = prepare_binprm(bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = copy_strings_kernel(1, &bprm->filename, bprm);
-	if (retval < 0)
-		goto out;
-
-	bprm->exec = bprm->p;
-	retval = compat_copy_strings(bprm->envc, envp, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = compat_copy_strings(bprm->argc, argv, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = search_binary_handler(bprm, regs);
-	if (retval < 0)
-		goto out;
-
-	/* execve succeeded */
-	current->fs->in_exec = 0;
-	current->in_execve = 0;
-	acct_update_integrals(current);
-	free_bprm(bprm);
-	if (displaced)
-		put_files_struct(displaced);
-	return retval;
-
-out:
-	if (bprm->mm) {
-		acct_arg_size(bprm, 0);
-		mmput(bprm->mm);
-	}
-
-out_file:
-	if (bprm->file) {
-		allow_write_access(bprm->file);
-		fput(bprm->file);
-	}
-
-out_unmark:
-	if (clear_in_exec)
-		current->fs->in_exec = 0;
-	current->in_execve = 0;
-
-out_free:
-	free_bprm(bprm);
-
-out_files:
-	if (displaced)
-		reset_files_struct(displaced);
-out_ret:
-	return retval;
-}
-
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
 static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v3 4/4] exec: document acct_arg_size()
  2011-03-02 16:26                                               ` [PATCH v3 0/4] " Oleg Nesterov
                                                                   ` (2 preceding siblings ...)
  2011-03-02 16:27                                                 ` [PATCH v3 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
@ 2011-03-02 16:28                                                 ` Oleg Nesterov
  2011-03-03  3:09                                                   ` KOSAKI Motohiro
  2011-03-02 16:44                                                 ` [PATCH v3 0/4] exec: unify native/compat code Oleg Nesterov
  4 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-02 16:28 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

Add the comment to explain acct_arg_size().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 fs/exec.c |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

--- 38/fs/exec.c~4_doc_acct_arg_size	2011-03-02 16:21:57.000000000 +0100
+++ 38/fs/exec.c	2011-03-02 16:27:24.000000000 +0100
@@ -164,7 +164,12 @@ out:
 }
 
 #ifdef CONFIG_MMU
-
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, account these pages in current->mm temporary
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
 static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 0/4] exec: unify native/compat code
  2011-03-02 16:26                                               ` [PATCH v3 0/4] " Oleg Nesterov
                                                                   ` (3 preceding siblings ...)
  2011-03-02 16:28                                                 ` [PATCH v3 4/4] exec: document acct_arg_size() Oleg Nesterov
@ 2011-03-02 16:44                                                 ` Oleg Nesterov
  2011-03-02 18:00                                                   ` Linus Torvalds
  4 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-02 16:44 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller

On 03/02, Oleg Nesterov wrote:
>
> Never mind. I agree with everything as long as we can remove this c-a-p
> compat_do_execve().

forgot to mention...

And probably you meant we should pass "struct conditional_ptr*", not
by value. I can redo again.

And sorry for the duplicated 4/4 emails...

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 0/4] exec: unify native/compat code
  2011-03-02 16:44                                                 ` [PATCH v3 0/4] exec: unify native/compat code Oleg Nesterov
@ 2011-03-02 18:00                                                   ` Linus Torvalds
  2011-03-02 19:40                                                     ` David Miller
  0 siblings, 1 reply; 109+ messages in thread
From: Linus Torvalds @ 2011-03-02 18:00 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On Wed, Mar 2, 2011 at 8:44 AM, Oleg Nesterov <oleg@redhat.com> wrote:
>
> forgot to mention...
>
> And probably you meant we should pass "struct conditional_ptr*", not
> by value. I can redo again.

No, I think we're ok with passing the structure by value - it's a
small structure that would generally be passed in registers (at least
on some architectures, I guess it will depend on the ABI), and we do
the "struct-by-value" thing for other things too (notably the page
table entries), so it's not a new thing in the kernel.

So I think I finally have no complaints. Of course, I didn't actually
check whether it _works_, but I assume it does.

If the s390 people (who actually do special things with compat
pointers) can test, that would be ok, but I'm certainly happily going
to apply this series when the next merge window opens.

                            Linus

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 0/4] exec: unify native/compat code
  2011-03-02 18:00                                                   ` Linus Torvalds
@ 2011-03-02 19:40                                                     ` David Miller
  2011-03-02 19:48                                                       ` Linus Torvalds
  0 siblings, 1 reply; 109+ messages in thread
From: David Miller @ 2011-03-02 19:40 UTC (permalink / raw)
  To: torvalds
  Cc: oleg, akpm, kosaki.motohiro, linux-kernel, linux-mm, pageexec,
	solar, eteo, spender, roland, miltonm

From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 2 Mar 2011 10:00:23 -0800

> No, I think we're ok with passing the structure by value - it's a
> small structure that would generally be passed in registers (at least
> on some architectures, I guess it will depend on the ABI), and we do
> the "struct-by-value" thing for other things too (notably the page
> table entries), so it's not a new thing in the kernel.

We purposely don't do that "page table entry typedef'd to aggregate" stuff
on sparc32 because otherwise such values get passed on the stack.

Architectures can currently avoid this bad code generation for the
page table case, but with this new code they won't be able to avoid
pass-by-value.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 0/4] exec: unify native/compat code
  2011-03-02 19:40                                                     ` David Miller
@ 2011-03-02 19:48                                                       ` Linus Torvalds
  2011-03-02 19:54                                                         ` David Miller
  0 siblings, 1 reply; 109+ messages in thread
From: Linus Torvalds @ 2011-03-02 19:48 UTC (permalink / raw)
  To: David Miller
  Cc: oleg, akpm, kosaki.motohiro, linux-kernel, linux-mm, pageexec,
	solar, eteo, spender, roland, miltonm

On Wed, Mar 2, 2011 at 11:40 AM, David Miller <davem@davemloft.net> wrote:
>
> We purposely don't do that "page table entry typedef'd to aggregate" stuff
> on sparc32 because otherwise such values get passed on the stack.
>
> Architectures can currently avoid this bad code generation for the
> page table case, but with this new code they won't be able to avoid
> pass-by-value.

Well, the thing is, on architectures that _can_ pass by value, it
avoids one indirection.

And if you do pass it on stack, then the code generated will be the
same as if we passed a pointer. So sparc may not be able to take
advantage of the optimization, but I don't think the code generation
would be worse.

For the page table case, we don't have that kind of trade-off: the
trade-off there is literally just between "pass in registers, or pass
on stack". Here the trade-off is "pass as an aggregate value or pass
as a pointer to an aggregate value".

That said, since I suspect that the main user will always just get
inlined (ie the helper function that actually fetches the pointers), I
suspect even sparc will see the advantage of the pass-by-value model.

But you might want to actually test the difference and look at the
code generation.

                      Linus

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 0/4] exec: unify native/compat code
  2011-03-02 19:48                                                       ` Linus Torvalds
@ 2011-03-02 19:54                                                         ` David Miller
  0 siblings, 0 replies; 109+ messages in thread
From: David Miller @ 2011-03-02 19:54 UTC (permalink / raw)
  To: torvalds
  Cc: oleg, akpm, kosaki.motohiro, linux-kernel, linux-mm, pageexec,
	solar, eteo, spender, roland, miltonm

From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 2 Mar 2011 11:48:03 -0800

> Well, the thing is, on architectures that _can_ pass by value, it
> avoids one indirection.
> 
> And if you do pass it on stack, then the code generated will be the
> same as if we passed a pointer. So sparc may not be able to take
> advantage of the optimization, but I don't think the code generation
> would be worse.

That's a good point, the situation here is different than the page table
one.

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 1/4] exec: introduce get_arg_ptr() helper
  2011-03-02 16:27                                                 ` [PATCH v3 1/4] exec: introduce get_arg_ptr() helper Oleg Nesterov
@ 2011-03-03  3:01                                                   ` KOSAKI Motohiro
  2011-03-03 15:47                                                     ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2011-03-03  3:01 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Linus Torvalds, Andrew Morton, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath, Milton Miller

Hi

Sorry for the long delay. Right now I'm stuck with sucky paperwork. ;-)
In short, I didn't find any issues in this patch. So, I'll test it
this weekend if Linus hasn't merged it yet.

A few small, cosmetic comments are below, but in any case I don't want
to keep this up in the air.
	Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>



> Introduce get_arg_ptr() helper, convert count() and copy_strings()
> to use it.
> 
> No functional changes, preparation. This helper is trivial, it just
> reads the pointer from argv/envp user-space array.
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> 
>  fs/exec.c |   36 +++++++++++++++++++++++++-----------
>  1 file changed, 25 insertions(+), 11 deletions(-)
> 
> --- 38/fs/exec.c~1_get_arg_ptr	2011-03-02 15:15:27.000000000 +0100
> +++ 38/fs/exec.c	2011-03-02 15:16:44.000000000 +0100
> @@ -395,6 +395,17 @@ err:
>  	return err;
>  }
>  
> +static const char __user *
> +get_arg_ptr(const char __user * const __user *argv, int argc)
> +{

[argc, argv] is a more natural order to me than [argv, argc].
Also, the "get_" prefix is usually used for reference-count-incrementing
functions in Linux, so I _personally_ prefer to call it "user_arg_ptr".




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 2/4] exec: introduce struct conditional_ptr
  2011-03-02 16:27                                                 ` [PATCH v3 2/4] exec: introduce struct conditional_ptr Oleg Nesterov
@ 2011-03-03  3:08                                                   ` KOSAKI Motohiro
  0 siblings, 0 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2011-03-03  3:08 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Linus Torvalds, Andrew Morton, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath, Milton Miller

> No functional changes, preparation.
> 
> Introduce struct conditional_ptr, change do_execve() paths to use it
> instead of "char __user * const __user *argv".
> 
> This makes the argv/envp arguments opaque, we are ready to handle the
> compat case which needs argv pointing to compat_uptr_t.
> 
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> 
>  fs/exec.c |   42 ++++++++++++++++++++++++++++++------------
>  1 file changed, 30 insertions(+), 12 deletions(-)
> 
> --- 38/fs/exec.c~2_typedef_for_argv	2011-03-02 15:40:22.000000000 +0100
> +++ 38/fs/exec.c	2011-03-02 15:40:44.000000000 +0100
> @@ -395,12 +395,15 @@ err:
>  	return err;
>  }
>  
> -static const char __user *
> -get_arg_ptr(const char __user * const __user *argv, int argc)
> +struct conditional_ptr {

I _personally_ don't like "conditional". The name is based on code logic,
and it's unclear what "conditional" means. From a data structure point of
view, it is an "opaque userland pointer".

but again, it is my personal preference.

	Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 4/4] exec: document acct_arg_size()
  2011-03-02 16:28                                                 ` [PATCH v3 4/4] exec: document acct_arg_size() Oleg Nesterov
@ 2011-03-03  3:09                                                   ` KOSAKI Motohiro
  0 siblings, 0 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2011-03-03  3:09 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Linus Torvalds, Andrew Morton, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath, Milton Miller

> Add the comment to explain acct_arg_size().
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> 
>  fs/exec.c |    7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> --- 38/fs/exec.c~4_doc_acct_arg_size	2011-03-02 16:21:57.000000000 +0100
> +++ 38/fs/exec.c	2011-03-02 16:27:24.000000000 +0100
> @@ -164,7 +164,12 @@ out:
>  }
>  
>  #ifdef CONFIG_MMU
> -
> +/*
> + * The nascent bprm->mm is not visible until exec_mmap() but it can
> + * use a lot of memory, account these pages in current->mm temporary
> + * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
> + * change the counter back via acct_arg_size(0).
> + */
>  static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
>  {
>  	struct mm_struct *mm = current->mm;
> 

Yeah! Thank you very much for writing a proper and clear comment.
	Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 3/4] exec: unify do_execve/compat_do_execve code
  2011-03-02 16:27                                                 ` [PATCH v3 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
@ 2011-03-03  3:13                                                   ` KOSAKI Motohiro
  0 siblings, 0 replies; 109+ messages in thread
From: KOSAKI Motohiro @ 2011-03-03  3:13 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Linus Torvalds, Andrew Morton, LKML, linux-mm,
	pageexec, Solar Designer, Eugene Teo, Brad Spengler,
	Roland McGrath, Milton Miller

> @@ -1510,11 +1528,27 @@ int do_execve(const char *filename,
>  	const char __user *const __user *__envp,
>  	struct pt_regs *regs)
>  {
> -	struct conditional_ptr argv = { .native = __argv };
> -	struct conditional_ptr envp = { .native = __envp };
> +	struct conditional_ptr argv = { .ptr.native = __argv };
> +	struct conditional_ptr envp = { .ptr.native = __envp };
>  	return do_execve_common(filename, argv, envp, regs);
>  }
>  
> +#ifdef CONFIG_COMPAT
> +int compat_do_execve(char *filename,
> +	compat_uptr_t __user *__argv,
> +	compat_uptr_t __user *__envp,
> +	struct pt_regs *regs)
> +{
> +	struct conditional_ptr argv = {
> +		.is_compat = true, .ptr.compat = __argv,
> +	};

Please don't bother compressing this onto one line.

	struct conditional_ptr argv = {
		.is_compat = true,
		.ptr.compat = __argv,
	};

is more readable.


The other parts look very good to me.
	Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>




^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 1/4] exec: introduce get_arg_ptr() helper
  2011-03-03  3:01                                                   ` KOSAKI Motohiro
@ 2011-03-03 15:47                                                     ` Oleg Nesterov
  2011-03-03 16:07                                                       ` Linus Torvalds
  0 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-03 15:47 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Linus Torvalds, Andrew Morton, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On 03/03, KOSAKI Motohiro wrote:
>
> > +static const char __user *
> > +get_arg_ptr(const char __user * const __user *argv, int argc)
> > +{
>
> [argc, argv] is natural order to me than [argv, argc].

Yes... in fact, "argc" is misnamed here. It doesn't mean the number of
arguments, it is the index in the array. Perhaps this should be [argv, nr].

> and "get_" prefix are usually used for reference count incrementing
> function in linux. so, i _personally_ prefer to call "user_arg_ptr".

Agreed, the name is ugly. I'll rename and resend keeping your reviewed-by.

[2/4]
> I _personally_ don't like "conditional". Its name is based on code logic.
> It's unclear what mean "conditional". From data strucuture view, It is
> "opaque userland pointer".

I agree with any naming, just suggest a better name ;)

[3/4]
> > +     struct conditional_ptr argv = {
> > +             .is_compat = true, .ptr.compat = __argv,
> > +     };
>
> Please don't mind to compress a line.
>
>         struct conditional_ptr argv = {
>                 .is_compat = true,
>                 .ptr.compat = __argv,
>         };

OK, will do.

Thanks for review!

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v3 1/4] exec: introduce get_arg_ptr() helper
  2011-03-03 15:47                                                     ` Oleg Nesterov
@ 2011-03-03 16:07                                                       ` Linus Torvalds
  2011-03-05 20:30                                                         ` [PATCH v4 0/4] exec: unify native/compat code Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: Linus Torvalds @ 2011-03-03 16:07 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: KOSAKI Motohiro, Andrew Morton, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On Thu, Mar 3, 2011 at 7:47 AM, Oleg Nesterov <oleg@redhat.com> wrote:
>> I _personally_ don't like "conditional". Its name is based on code logic.
>> It's unclear what mean "conditional". From data strucuture view, It is
>> "opaque userland pointer".
>
> I agree with any naming, just suggest a better name ;)

Maybe just "struct user_arg_ptr" or something?

                        Linus

^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v4 0/4] exec: unify native/compat code
  2011-03-03 16:07                                                       ` Linus Torvalds
@ 2011-03-05 20:30                                                         ` Oleg Nesterov
  2011-03-05 20:31                                                           ` [PATCH v4 1/4] exec: introduce get_user_arg_ptr() helper Oleg Nesterov
                                                                             ` (4 more replies)
  0 siblings, 5 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-05 20:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller,
	Linus Torvalds

On 03/03, Linus Torvalds wrote:
>
> On Thu, Mar 3, 2011 at 7:47 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> >> I _personally_ don't like "conditional". Its name is based on code logic.
> >> It's unclear what mean "conditional". From data strucuture view, It is
> >> "opaque userland pointer".
> >
> > I agree with any naming, just suggest a better name ;)
>
> Maybe just "struct user_arg_ptr" or something?

OK, nothing else was suggested, so I assume Kosaki agrees.

So rename conditional_ptr to user_arg_ptr.

Also rename get_arg_ptr() to get_user_arg_ptr(). It was suggested to
use the same "user_arg_ptr" for this helper too, but this is not
grep-friendly. As for get_ in the name... Well, I can redo it again ;)
But this matches get_user(), and that is all this helper does.

Otherwise unchanged.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v4 1/4] exec: introduce get_user_arg_ptr() helper
  2011-03-05 20:30                                                         ` [PATCH v4 0/4] exec: unify native/compat code Oleg Nesterov
@ 2011-03-05 20:31                                                           ` Oleg Nesterov
  2011-03-05 20:31                                                           ` [PATCH v4 2/4] exec: introduce struct user_arg_ptr Oleg Nesterov
                                                                             ` (3 subsequent siblings)
  4 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-05 20:31 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller,
	Linus Torvalds

Introduce get_user_arg_ptr() helper, convert count() and copy_strings()
to use it.

No functional changes, preparation. This helper is trivial, it just
reads the pointer from argv/envp user-space array.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---

 fs/exec.c |   36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

--- 38/fs/exec.c~1_get_arg_ptr	2011-03-05 21:13:46.000000000 +0100
+++ 38/fs/exec.c	2011-03-05 21:21:56.000000000 +0100
@@ -395,6 +395,17 @@ err:
 	return err;
 }
 
+static const char __user *
+get_user_arg_ptr(const char __user * const __user *argv, int nr)
+{
+	const char __user *ptr;
+
+	if (get_user(ptr, argv + nr))
+		return ERR_PTR(-EFAULT);
+
+	return ptr;
+}
+
 /*
  * count() counts the number of strings in array ARGV.
  */
@@ -404,13 +415,14 @@ static int count(const char __user * con
 
 	if (argv != NULL) {
 		for (;;) {
-			const char __user * p;
+			const char __user *p = get_user_arg_ptr(argv, i);
 
-			if (get_user(p, argv))
-				return -EFAULT;
 			if (!p)
 				break;
-			argv++;
+
+			if (IS_ERR(p))
+				return -EFAULT;
+
 			if (i++ >= max)
 				return -E2BIG;
 
@@ -440,16 +452,18 @@ static int copy_strings(int argc, const 
 		int len;
 		unsigned long pos;
 
-		if (get_user(str, argv+argc) ||
-				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		str = get_user_arg_ptr(argv, argc);
+		if (IS_ERR(str))
 			goto out;
-		}
 
-		if (!valid_arg_len(bprm, len)) {
-			ret = -E2BIG;
+		len = strnlen_user(str, MAX_ARG_STRLEN);
+		if (!len)
+			goto out;
+
+		ret = -E2BIG;
+		if (!valid_arg_len(bprm, len))
 			goto out;
-		}
 
 		/* We're going to work our way backwords. */
 		pos = bprm->p;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v4 2/4] exec: introduce struct user_arg_ptr
  2011-03-05 20:30                                                         ` [PATCH v4 0/4] exec: unify native/compat code Oleg Nesterov
  2011-03-05 20:31                                                           ` [PATCH v4 1/4] exec: introduce get_user_arg_ptr() helper Oleg Nesterov
@ 2011-03-05 20:31                                                           ` Oleg Nesterov
  2011-03-05 20:31                                                           ` [PATCH v4 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
                                                                             ` (2 subsequent siblings)
  4 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-05 20:31 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller,
	Linus Torvalds

No functional changes, preparation.

Introduce struct user_arg_ptr, change do_execve() paths to use it
instead of "char __user * const __user *argv".

This makes the argv/envp arguments opaque, we are ready to handle the
compat case which needs argv pointing to compat_uptr_t.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---

 fs/exec.c |   42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

--- 38/fs/exec.c~2_typedef_for_argv	2011-03-05 21:21:56.000000000 +0100
+++ 38/fs/exec.c	2011-03-05 21:22:42.000000000 +0100
@@ -395,12 +395,15 @@ err:
 	return err;
 }
 
-static const char __user *
-get_user_arg_ptr(const char __user * const __user *argv, int nr)
+struct user_arg_ptr {
+	const char __user *const __user *native;
+};
+
+static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
 {
 	const char __user *ptr;
 
-	if (get_user(ptr, argv + nr))
+	if (get_user(ptr, argv.native + nr))
 		return ERR_PTR(-EFAULT);
 
 	return ptr;
@@ -409,11 +412,11 @@ get_user_arg_ptr(const char __user * con
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(const char __user * const __user * argv, int max)
+static int count(struct user_arg_ptr argv, int max)
 {
 	int i = 0;
 
-	if (argv != NULL) {
+	if (argv.native != NULL) {
 		for (;;) {
 			const char __user *p = get_user_arg_ptr(argv, i);
 
@@ -439,7 +442,7 @@ static int count(const char __user * con
  * processes's memory to the new process's stack.  The call to get_user_pages()
  * ensures the destination page is created and not swapped out.
  */
-static int copy_strings(int argc, const char __user *const __user *argv,
+static int copy_strings(int argc, struct user_arg_ptr argv,
 			struct linux_binprm *bprm)
 {
 	struct page *kmapped_page = NULL;
@@ -530,14 +533,19 @@ out:
 /*
  * Like copy_strings, but get argv and its values from kernel memory.
  */
-int copy_strings_kernel(int argc, const char *const *argv,
+int copy_strings_kernel(int argc, const char *const *__argv,
 			struct linux_binprm *bprm)
 {
 	int r;
 	mm_segment_t oldfs = get_fs();
+	struct user_arg_ptr argv = {
+		.native = (const char __user *const  __user *)__argv,
+	};
+
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
+	r = copy_strings(argc, argv, bprm);
 	set_fs(oldfs);
+
 	return r;
 }
 EXPORT_SYMBOL(copy_strings_kernel);
@@ -1387,10 +1395,10 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(const char * filename,
-	const char __user *const __user *argv,
-	const char __user *const __user *envp,
-	struct pt_regs * regs)
+static int do_execve_common(const char *filename,
+				struct user_arg_ptr argv,
+				struct user_arg_ptr envp,
+				struct pt_regs *regs)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1497,6 +1505,16 @@ out_ret:
 	return retval;
 }
 
+int do_execve(const char *filename,
+	const char __user *const __user *__argv,
+	const char __user *const __user *__envp,
+	struct pt_regs *regs)
+{
+	struct user_arg_ptr argv = { .native = __argv };
+	struct user_arg_ptr envp = { .native = __envp };
+	return do_execve_common(filename, argv, envp, regs);
+}
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v4 3/4] exec: unify do_execve/compat_do_execve code
  2011-03-05 20:30                                                         ` [PATCH v4 0/4] exec: unify native/compat code Oleg Nesterov
  2011-03-05 20:31                                                           ` [PATCH v4 1/4] exec: introduce get_user_arg_ptr() helper Oleg Nesterov
  2011-03-05 20:31                                                           ` [PATCH v4 2/4] exec: introduce struct user_arg_ptr Oleg Nesterov
@ 2011-03-05 20:31                                                           ` Oleg Nesterov
  2011-03-05 20:52                                                             ` Linus Torvalds
  2011-03-05 20:31                                                           ` [PATCH v4 4/4] exec: document acct_arg_size() Oleg Nesterov
  2011-03-06 12:04                                                           ` [PATCH v4 0/4] exec: unify native/compat code KOSAKI Motohiro
  4 siblings, 1 reply; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-05 20:31 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller,
	Linus Torvalds

Add the appropriate members into struct user_arg_ptr and teach
get_user_arg_ptr() to handle is_compat = T case correctly.

This allows us to remove the compat_do_execve() code from fs/compat.c
and reimplement compat_do_execve() as the trivial wrapper on top of
do_execve_common(is_compat => true).

In fact, this fixes another (minor) bug. "compat_uptr_t str" can
overflow after "str += len" in compat_copy_strings() if a 64bit
application execs via sys32_execve().

Unexport acct_arg_size() and get_arg_page(), fs/compat.c doesn't
need them any longer.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---

 include/linux/binfmts.h |    4 
 fs/exec.c               |   58 +++++++++--
 fs/compat.c             |  235 ------------------------------------------------
 3 files changed, 46 insertions(+), 251 deletions(-)

--- 38/include/linux/binfmts.h~3_handle_compat_case	2011-03-05 21:13:45.000000000 +0100
+++ 38/include/linux/binfmts.h	2011-03-05 21:23:15.000000000 +0100
@@ -60,10 +60,6 @@ struct linux_binprm {
 	unsigned long loader, exec;
 };
 
-extern void acct_arg_size(struct linux_binprm *bprm, unsigned long pages);
-extern struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
-					int write);
-
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
 
--- 38/fs/exec.c~3_handle_compat_case	2011-03-05 21:22:42.000000000 +0100
+++ 38/fs/exec.c	2011-03-05 21:23:15.000000000 +0100
@@ -55,6 +55,7 @@
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -164,7 +165,7 @@ out:
 
 #ifdef CONFIG_MMU
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -183,7 +184,7 @@ void acct_arg_size(struct linux_binprm *
 #endif
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -302,11 +303,11 @@ static bool valid_arg_len(struct linux_b
 
 #else
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -396,17 +397,34 @@ err:
 }
 
 struct user_arg_ptr {
-	const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+	bool is_compat;
+#endif
+	union {
+		const char __user *const __user *native;
+		compat_uptr_t __user *compat;
+	} ptr;
 };
 
 static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
 {
-	const char __user *ptr;
+	const char __user *native;
 
-	if (get_user(ptr, argv.native + nr))
+#ifdef CONFIG_COMPAT
+	if (unlikely(argv.is_compat)) {
+		compat_uptr_t compat;
+
+		if (get_user(compat, argv.ptr.compat + nr))
+			return ERR_PTR(-EFAULT);
+
+		return compat_ptr(compat);
+	}
+#endif
+
+	if (get_user(native, argv.ptr.native + nr))
 		return ERR_PTR(-EFAULT);
 
-	return ptr;
+	return native;
 }
 
 /*
@@ -416,7 +434,7 @@ static int count(struct user_arg_ptr arg
 {
 	int i = 0;
 
-	if (argv.native != NULL) {
+	if (argv.ptr.native != NULL) {
 		for (;;) {
 			const char __user *p = get_user_arg_ptr(argv, i);
 
@@ -539,7 +557,7 @@ int copy_strings_kernel(int argc, const 
 	int r;
 	mm_segment_t oldfs = get_fs();
 	struct user_arg_ptr argv = {
-		.native = (const char __user *const  __user *)__argv,
+		.ptr.native = (const char __user *const  __user *)__argv,
 	};
 
 	set_fs(KERNEL_DS);
@@ -1510,10 +1528,28 @@ int do_execve(const char *filename,
 	const char __user *const __user *__envp,
 	struct pt_regs *regs)
 {
-	struct user_arg_ptr argv = { .native = __argv };
-	struct user_arg_ptr envp = { .native = __envp };
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
+	return do_execve_common(filename, argv, envp, regs);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+	compat_uptr_t __user *__argv,
+	compat_uptr_t __user *__envp,
+	struct pt_regs *regs)
+{
+	struct user_arg_ptr argv = {
+		.is_compat = true,
+		.ptr.compat = __argv,
+	};
+	struct user_arg_ptr envp = {
+		.is_compat = true,
+		.ptr.compat = __envp,
+	};
 	return do_execve_common(filename, argv, envp, regs);
 }
+#endif
 
 void set_binfmt(struct linux_binfmt *new)
 {
--- 38/fs/compat.c~3_handle_compat_case	2011-03-05 21:13:45.000000000 +0100
+++ 38/fs/compat.c	2011-03-05 21:23:15.000000000 +0100
@@ -1330,241 +1330,6 @@ compat_sys_openat(unsigned int dfd, cons
 	return do_sys_open(dfd, filename, flags, mode);
 }
 
-/*
- * compat_count() counts the number of arguments/envelopes. It is basically
- * a copy of count() from fs/exec.c, except that it works with 32 bit argv
- * and envp pointers.
- */
-static int compat_count(compat_uptr_t __user *argv, int max)
-{
-	int i = 0;
-
-	if (argv != NULL) {
-		for (;;) {
-			compat_uptr_t p;
-
-			if (get_user(p, argv))
-				return -EFAULT;
-			if (!p)
-				break;
-			argv++;
-			if (i++ >= max)
-				return -E2BIG;
-
-			if (fatal_signal_pending(current))
-				return -ERESTARTNOHAND;
-			cond_resched();
-		}
-	}
-	return i;
-}
-
-/*
- * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
- * except that it works with 32 bit argv and envp pointers.
- */
-static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
-				struct linux_binprm *bprm)
-{
-	struct page *kmapped_page = NULL;
-	char *kaddr = NULL;
-	unsigned long kpos = 0;
-	int ret;
-
-	while (argc-- > 0) {
-		compat_uptr_t str;
-		int len;
-		unsigned long pos;
-
-		if (get_user(str, argv+argc) ||
-		    !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		if (len > MAX_ARG_STRLEN) {
-			ret = -E2BIG;
-			goto out;
-		}
-
-		/* We're going to work our way backwords. */
-		pos = bprm->p;
-		str += len;
-		bprm->p -= len;
-
-		while (len > 0) {
-			int offset, bytes_to_copy;
-
-			if (fatal_signal_pending(current)) {
-				ret = -ERESTARTNOHAND;
-				goto out;
-			}
-			cond_resched();
-
-			offset = pos % PAGE_SIZE;
-			if (offset == 0)
-				offset = PAGE_SIZE;
-
-			bytes_to_copy = offset;
-			if (bytes_to_copy > len)
-				bytes_to_copy = len;
-
-			offset -= bytes_to_copy;
-			pos -= bytes_to_copy;
-			str -= bytes_to_copy;
-			len -= bytes_to_copy;
-
-			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
-				struct page *page;
-
-				page = get_arg_page(bprm, pos, 1);
-				if (!page) {
-					ret = -E2BIG;
-					goto out;
-				}
-
-				if (kmapped_page) {
-					flush_kernel_dcache_page(kmapped_page);
-					kunmap(kmapped_page);
-					put_page(kmapped_page);
-				}
-				kmapped_page = page;
-				kaddr = kmap(kmapped_page);
-				kpos = pos & PAGE_MASK;
-				flush_cache_page(bprm->vma, kpos,
-						 page_to_pfn(kmapped_page));
-			}
-			if (copy_from_user(kaddr+offset, compat_ptr(str),
-						bytes_to_copy)) {
-				ret = -EFAULT;
-				goto out;
-			}
-		}
-	}
-	ret = 0;
-out:
-	if (kmapped_page) {
-		flush_kernel_dcache_page(kmapped_page);
-		kunmap(kmapped_page);
-		put_page(kmapped_page);
-	}
-	return ret;
-}
-
-/*
- * compat_do_execve() is mostly a copy of do_execve(), with the exception
- * that it processes 32 bit argv and envp pointers.
- */
-int compat_do_execve(char * filename,
-	compat_uptr_t __user *argv,
-	compat_uptr_t __user *envp,
-	struct pt_regs * regs)
-{
-	struct linux_binprm *bprm;
-	struct file *file;
-	struct files_struct *displaced;
-	bool clear_in_exec;
-	int retval;
-
-	retval = unshare_files(&displaced);
-	if (retval)
-		goto out_ret;
-
-	retval = -ENOMEM;
-	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
-	if (!bprm)
-		goto out_files;
-
-	retval = prepare_bprm_creds(bprm);
-	if (retval)
-		goto out_free;
-
-	retval = check_unsafe_exec(bprm);
-	if (retval < 0)
-		goto out_free;
-	clear_in_exec = retval;
-	current->in_execve = 1;
-
-	file = open_exec(filename);
-	retval = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out_unmark;
-
-	sched_exec();
-
-	bprm->file = file;
-	bprm->filename = filename;
-	bprm->interp = filename;
-
-	retval = bprm_mm_init(bprm);
-	if (retval)
-		goto out_file;
-
-	bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
-	if ((retval = bprm->argc) < 0)
-		goto out;
-
-	bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
-	if ((retval = bprm->envc) < 0)
-		goto out;
-
-	retval = prepare_binprm(bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = copy_strings_kernel(1, &bprm->filename, bprm);
-	if (retval < 0)
-		goto out;
-
-	bprm->exec = bprm->p;
-	retval = compat_copy_strings(bprm->envc, envp, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = compat_copy_strings(bprm->argc, argv, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = search_binary_handler(bprm, regs);
-	if (retval < 0)
-		goto out;
-
-	/* execve succeeded */
-	current->fs->in_exec = 0;
-	current->in_execve = 0;
-	acct_update_integrals(current);
-	free_bprm(bprm);
-	if (displaced)
-		put_files_struct(displaced);
-	return retval;
-
-out:
-	if (bprm->mm) {
-		acct_arg_size(bprm, 0);
-		mmput(bprm->mm);
-	}
-
-out_file:
-	if (bprm->file) {
-		allow_write_access(bprm->file);
-		fput(bprm->file);
-	}
-
-out_unmark:
-	if (clear_in_exec)
-		current->fs->in_exec = 0;
-	current->in_execve = 0;
-
-out_free:
-	free_bprm(bprm);
-
-out_files:
-	if (displaced)
-		reset_files_struct(displaced);
-out_ret:
-	return retval;
-}
-
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
 static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v4 4/4] exec: document acct_arg_size()
  2011-03-05 20:30                                                         ` [PATCH v4 0/4] exec: unify native/compat code Oleg Nesterov
                                                                             ` (2 preceding siblings ...)
  2011-03-05 20:31                                                           ` [PATCH v4 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
@ 2011-03-05 20:31                                                           ` Oleg Nesterov
  2011-03-06 12:04                                                           ` [PATCH v4 0/4] exec: unify native/compat code KOSAKI Motohiro
  4 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-05 20:31 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KOSAKI Motohiro, LKML, linux-mm, pageexec, Solar Designer,
	Eugene Teo, Brad Spengler, Roland McGrath, Milton Miller,
	Linus Torvalds

Add a comment to explain acct_arg_size().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---

 fs/exec.c |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

--- 38/fs/exec.c~4_doc_acct_arg_size	2011-03-05 21:23:15.000000000 +0100
+++ 38/fs/exec.c	2011-03-05 21:23:33.000000000 +0100
@@ -164,7 +164,12 @@ out:
 }
 
 #ifdef CONFIG_MMU
-
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, account these pages in current->mm temporary
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
 static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v4 3/4] exec: unify do_execve/compat_do_execve code
  2011-03-05 20:31                                                           ` [PATCH v4 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
@ 2011-03-05 20:52                                                             ` Linus Torvalds
  2011-03-05 21:20                                                               ` Oleg Nesterov
  0 siblings, 1 reply; 109+ messages in thread
From: Linus Torvalds @ 2011-03-05 20:52 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

Ok, everything looks fine to me.

Except looking at this, I don't think this part:

On Sat, Mar 5, 2011 at 12:31 PM, Oleg Nesterov <oleg@redhat.com> wrote:
>
>  struct user_arg_ptr {
> -       const char __user *const __user *native;
> +#ifdef CONFIG_COMPAT
> +       bool is_compat;
> +#endif
> +       union {
> +               const char __user *const __user *native;
> +               compat_uptr_t __user *compat;
> +       } ptr;
>  };

will necessarily even compile on an architecture that doesn't have any
'compat' support.

Do we even define 'compat_uptr_t' for that case? I don't think so.

So I suspect you need two of those annoying #ifdef's. Or we need to
have some way to guarantee that 'compat_uptr_t' exists.

                           Linus

^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v4 3/4] exec: unify do_execve/compat_do_execve code
  2011-03-05 20:52                                                             ` Linus Torvalds
@ 2011-03-05 21:20                                                               ` Oleg Nesterov
  0 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-05 21:20 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, KOSAKI Motohiro, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller

On 03/05, Linus Torvalds wrote:
>
> Ok, everything looks fine to me.
>
> Except looking at this, I don't think this part:
>
> On Sat, Mar 5, 2011 at 12:31 PM, Oleg Nesterov <oleg@redhat.com> wrote:
> >
> >  struct user_arg_ptr {
> > -       const char __user *const __user *native;
> > +#ifdef CONFIG_COMPAT
> > +       bool is_compat;
> > +#endif
> > +       union {
> > +               const char __user *const __user *native;
> > +               compat_uptr_t __user *compat;
> > +       } ptr;
> >  };
>
> will necessarily even compile on an architecture that doesn't have any
> 'compat' support.

Aaaaaaaaaaaaaaaaaah, now this is a really good point.

> Do we even define 'compat_uptr_t' for that case? I don't think so.

Indeed, you are right.

What was I thinking? I do not know.

> So I suspect you need two of those annoying #ifdef's.

please expect v5 tomorrow.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* Re: [PATCH v4 0/4] exec: unify native/compat code
  2011-03-05 20:30                                                         ` [PATCH v4 0/4] exec: unify native/compat code Oleg Nesterov
                                                                             ` (3 preceding siblings ...)
  2011-03-05 20:31                                                           ` [PATCH v4 4/4] exec: document acct_arg_size() Oleg Nesterov
@ 2011-03-06 12:04                                                           ` KOSAKI Motohiro
  2011-03-06 17:01                                                             ` [PATCH v5 " Oleg Nesterov
  4 siblings, 1 reply; 109+ messages in thread
From: KOSAKI Motohiro @ 2011-03-06 12:04 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: kosaki.motohiro, Andrew Morton, LKML, linux-mm, pageexec,
	Solar Designer, Eugene Teo, Brad Spengler, Roland McGrath,
	Milton Miller, Linus Torvalds

> On 03/03, Linus Torvalds wrote:
> >
> > On Thu, Mar 3, 2011 at 7:47 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> > >> I _personally_ don't like "conditional". Its name is based on code logic.
> > >> It's unclear what "conditional" means. From a data structure view, it is
> > >> "opaque userland pointer".
> > >
> > > I agree with any naming, just suggest a better name ;)
> >
> > Maybe just "struct user_arg_ptr" or something?
> 
> OK, nothing else was suggested, I assume Kosaki agrees.

Sure. :)

And I am happy to report that this series ran successfully against my testsuite.
Could you please add my tested-by tag?

thanks.


> 
> So rename conditional_ptr to user_arg_ptr.
> 
> Also rename get_user_ptr() to get_user_arg_ptr(). It was suggested to
> use the same "user_arg_ptr" for this helper too, but this is not
> grep-friendly. As for get_ in the name... Well, I can redo again ;)
> But this matches get_user() and this is all what this helper does.
> 
> Otherwise unchanged.
> 
> Oleg.
> 




^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v5 0/4] exec: unify native/compat code
  2011-03-06 12:04                                                           ` [PATCH v4 0/4] exec: unify native/compat code KOSAKI Motohiro
@ 2011-03-06 17:01                                                             ` Oleg Nesterov
  2011-03-06 17:02                                                               ` [PATCH v5 1/4] exec: introduce get_user_arg_ptr() helper Oleg Nesterov
                                                                                 ` (3 more replies)
  0 siblings, 4 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-06 17:01 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath, Milton Miller, Linus Torvalds,
	KOSAKI Motohiro

On 03/06, KOSAKI Motohiro wrote:
>
> And, I happily reported this series run successfully my testsuite.
> Could you please add my tested-by tag?

Sure, thanks a lot Kosaki.

I hope this is the last version. Changes:

	- as Linus pointed out, we do not have compat_uptr_t without
	  CONFIG_COMPAT. Add another ifdef into struct user_arg_ptr.

Oleg.


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v5 1/4] exec: introduce get_user_arg_ptr() helper
  2011-03-06 17:01                                                             ` [PATCH v5 " Oleg Nesterov
@ 2011-03-06 17:02                                                               ` Oleg Nesterov
  2011-03-06 17:02                                                               ` [PATCH v5 2/4] exec: introduce struct user_arg_ptr Oleg Nesterov
                                                                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-06 17:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath, Milton Miller, Linus Torvalds,
	KOSAKI Motohiro

Introduce get_user_arg_ptr() helper, convert count() and copy_strings()
to use it.

No functional changes, preparation. This helper is trivial, it just
reads the pointer from argv/envp user-space array.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---

 fs/exec.c |   36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

--- 38/fs/exec.c~1_get_arg_ptr	2011-03-06 17:48:00.000000000 +0100
+++ 38/fs/exec.c	2011-03-06 17:51:01.000000000 +0100
@@ -395,6 +395,17 @@ err:
 	return err;
 }
 
+static const char __user *
+get_user_arg_ptr(const char __user * const __user *argv, int nr)
+{
+	const char __user *ptr;
+
+	if (get_user(ptr, argv + nr))
+		return ERR_PTR(-EFAULT);
+
+	return ptr;
+}
+
 /*
  * count() counts the number of strings in array ARGV.
  */
@@ -404,13 +415,14 @@ static int count(const char __user * con
 
 	if (argv != NULL) {
 		for (;;) {
-			const char __user * p;
+			const char __user *p = get_user_arg_ptr(argv, i);
 
-			if (get_user(p, argv))
-				return -EFAULT;
 			if (!p)
 				break;
-			argv++;
+
+			if (IS_ERR(p))
+				return -EFAULT;
+
 			if (i++ >= max)
 				return -E2BIG;
 
@@ -440,16 +452,18 @@ static int copy_strings(int argc, const 
 		int len;
 		unsigned long pos;
 
-		if (get_user(str, argv+argc) ||
-				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		str = get_user_arg_ptr(argv, argc);
+		if (IS_ERR(str))
 			goto out;
-		}
 
-		if (!valid_arg_len(bprm, len)) {
-			ret = -E2BIG;
+		len = strnlen_user(str, MAX_ARG_STRLEN);
+		if (!len)
+			goto out;
+
+		ret = -E2BIG;
+		if (!valid_arg_len(bprm, len))
 			goto out;
-		}
 
 		/* We're going to work our way backwords. */
 		pos = bprm->p;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v5 2/4] exec: introduce struct user_arg_ptr
  2011-03-06 17:01                                                             ` [PATCH v5 " Oleg Nesterov
  2011-03-06 17:02                                                               ` [PATCH v5 1/4] exec: introduce get_user_arg_ptr() helper Oleg Nesterov
@ 2011-03-06 17:02                                                               ` Oleg Nesterov
  2011-03-06 17:02                                                               ` [PATCH v5 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
  2011-03-06 17:03                                                               ` [PATCH v5 4/4] exec: document acct_arg_size() Oleg Nesterov
  3 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-06 17:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath, Milton Miller, Linus Torvalds,
	KOSAKI Motohiro

No functional changes, preparation.

Introduce struct user_arg_ptr, change do_execve() paths to use it
instead of "char __user * const __user *argv".

This makes the argv/envp arguments opaque, we are ready to handle the
compat case which needs argv pointing to compat_uptr_t.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---

 fs/exec.c |   42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

--- 38/fs/exec.c~2_typedef_for_argv	2011-03-06 17:51:01.000000000 +0100
+++ 38/fs/exec.c	2011-03-06 17:51:44.000000000 +0100
@@ -395,12 +395,15 @@ err:
 	return err;
 }
 
-static const char __user *
-get_user_arg_ptr(const char __user * const __user *argv, int nr)
+struct user_arg_ptr {
+	const char __user *const __user *native;
+};
+
+static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
 {
 	const char __user *ptr;
 
-	if (get_user(ptr, argv + nr))
+	if (get_user(ptr, argv.native + nr))
 		return ERR_PTR(-EFAULT);
 
 	return ptr;
@@ -409,11 +412,11 @@ get_user_arg_ptr(const char __user * con
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(const char __user * const __user * argv, int max)
+static int count(struct user_arg_ptr argv, int max)
 {
 	int i = 0;
 
-	if (argv != NULL) {
+	if (argv.native != NULL) {
 		for (;;) {
 			const char __user *p = get_user_arg_ptr(argv, i);
 
@@ -439,7 +442,7 @@ static int count(const char __user * con
  * processes's memory to the new process's stack.  The call to get_user_pages()
  * ensures the destination page is created and not swapped out.
  */
-static int copy_strings(int argc, const char __user *const __user *argv,
+static int copy_strings(int argc, struct user_arg_ptr argv,
 			struct linux_binprm *bprm)
 {
 	struct page *kmapped_page = NULL;
@@ -530,14 +533,19 @@ out:
 /*
  * Like copy_strings, but get argv and its values from kernel memory.
  */
-int copy_strings_kernel(int argc, const char *const *argv,
+int copy_strings_kernel(int argc, const char *const *__argv,
 			struct linux_binprm *bprm)
 {
 	int r;
 	mm_segment_t oldfs = get_fs();
+	struct user_arg_ptr argv = {
+		.native = (const char __user *const  __user *)__argv,
+	};
+
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
+	r = copy_strings(argc, argv, bprm);
 	set_fs(oldfs);
+
 	return r;
 }
 EXPORT_SYMBOL(copy_strings_kernel);
@@ -1387,10 +1395,10 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(const char * filename,
-	const char __user *const __user *argv,
-	const char __user *const __user *envp,
-	struct pt_regs * regs)
+static int do_execve_common(const char *filename,
+				struct user_arg_ptr argv,
+				struct user_arg_ptr envp,
+				struct pt_regs *regs)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1497,6 +1505,16 @@ out_ret:
 	return retval;
 }
 
+int do_execve(const char *filename,
+	const char __user *const __user *__argv,
+	const char __user *const __user *__envp,
+	struct pt_regs *regs)
+{
+	struct user_arg_ptr argv = { .native = __argv };
+	struct user_arg_ptr envp = { .native = __envp };
+	return do_execve_common(filename, argv, envp, regs);
+}
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v5 3/4] exec: unify do_execve/compat_do_execve code
  2011-03-06 17:01                                                             ` [PATCH v5 " Oleg Nesterov
  2011-03-06 17:02                                                               ` [PATCH v5 1/4] exec: introduce get_user_arg_ptr() helper Oleg Nesterov
  2011-03-06 17:02                                                               ` [PATCH v5 2/4] exec: introduce struct user_arg_ptr Oleg Nesterov
@ 2011-03-06 17:02                                                               ` Oleg Nesterov
  2011-03-06 17:03                                                               ` [PATCH v5 4/4] exec: document acct_arg_size() Oleg Nesterov
  3 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-06 17:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath, Milton Miller, Linus Torvalds,
	KOSAKI Motohiro

Add the appropriate members into struct user_arg_ptr and teach
get_user_arg_ptr() to handle is_compat = T case correctly.

This allows us to remove the compat_do_execve() code from fs/compat.c
and reimplement compat_do_execve() as a trivial wrapper on top of
do_execve_common(is_compat => true).

In fact, this fixes another (minor) bug. "compat_uptr_t str" can
overflow after "str += len" in compat_copy_strings() if a 64bit
application execs via sys32_execve().

Unexport acct_arg_size() and get_arg_page(), fs/compat.c doesn't
need them any longer.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---

 include/linux/binfmts.h |    4 
 fs/exec.c               |   58 +++++++++--
 fs/compat.c             |  235 ------------------------------------------------
 3 files changed, 46 insertions(+), 251 deletions(-)

--- 38/include/linux/binfmts.h~3_handle_compat_case	2011-03-06 17:48:00.000000000 +0100
+++ 38/include/linux/binfmts.h	2011-03-06 17:52:26.000000000 +0100
@@ -60,10 +60,6 @@ struct linux_binprm {
 	unsigned long loader, exec;
 };
 
-extern void acct_arg_size(struct linux_binprm *bprm, unsigned long pages);
-extern struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
-					int write);
-
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
 
--- 38/fs/exec.c~3_handle_compat_case	2011-03-06 17:51:44.000000000 +0100
+++ 38/fs/exec.c	2011-03-06 17:56:26.000000000 +0100
@@ -55,6 +55,7 @@
 #include <linux/fs_struct.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -164,7 +165,7 @@ out:
 
 #ifdef CONFIG_MMU
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;
 	long diff = (long)(pages - bprm->vma_pages);
@@ -183,7 +184,7 @@ void acct_arg_size(struct linux_binprm *
 #endif
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -302,11 +303,11 @@ static bool valid_arg_len(struct linux_b
 
 #else
 
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 }
 
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		int write)
 {
 	struct page *page;
@@ -396,17 +397,36 @@ err:
 }
 
 struct user_arg_ptr {
-	const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+	bool is_compat;
+#endif
+	union {
+		const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+		compat_uptr_t __user *compat;
+#endif
+	} ptr;
 };
 
 static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
 {
-	const char __user *ptr;
+	const char __user *native;
 
-	if (get_user(ptr, argv.native + nr))
+#ifdef CONFIG_COMPAT
+	if (unlikely(argv.is_compat)) {
+		compat_uptr_t compat;
+
+		if (get_user(compat, argv.ptr.compat + nr))
+			return ERR_PTR(-EFAULT);
+
+		return compat_ptr(compat);
+	}
+#endif
+
+	if (get_user(native, argv.ptr.native + nr))
 		return ERR_PTR(-EFAULT);
 
-	return ptr;
+	return native;
 }
 
 /*
@@ -416,7 +436,7 @@ static int count(struct user_arg_ptr arg
 {
 	int i = 0;
 
-	if (argv.native != NULL) {
+	if (argv.ptr.native != NULL) {
 		for (;;) {
 			const char __user *p = get_user_arg_ptr(argv, i);
 
@@ -539,7 +559,7 @@ int copy_strings_kernel(int argc, const 
 	int r;
 	mm_segment_t oldfs = get_fs();
 	struct user_arg_ptr argv = {
-		.native = (const char __user *const  __user *)__argv,
+		.ptr.native = (const char __user *const  __user *)__argv,
 	};
 
 	set_fs(KERNEL_DS);
@@ -1510,11 +1530,29 @@ int do_execve(const char *filename,
 	const char __user *const __user *__envp,
 	struct pt_regs *regs)
 {
-	struct user_arg_ptr argv = { .native = __argv };
-	struct user_arg_ptr envp = { .native = __envp };
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
 	return do_execve_common(filename, argv, envp, regs);
 }
 
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+	compat_uptr_t __user *__argv,
+	compat_uptr_t __user *__envp,
+	struct pt_regs *regs)
+{
+	struct user_arg_ptr argv = {
+		.is_compat = true,
+		.ptr.compat = __argv,
+	};
+	struct user_arg_ptr envp = {
+		.is_compat = true,
+		.ptr.compat = __envp,
+	};
+	return do_execve_common(filename, argv, envp, regs);
+}
+#endif
+
 void set_binfmt(struct linux_binfmt *new)
 {
 	struct mm_struct *mm = current->mm;
--- 38/fs/compat.c~3_handle_compat_case	2011-03-06 17:48:00.000000000 +0100
+++ 38/fs/compat.c	2011-03-06 17:52:26.000000000 +0100
@@ -1330,241 +1330,6 @@ compat_sys_openat(unsigned int dfd, cons
 	return do_sys_open(dfd, filename, flags, mode);
 }
 
-/*
- * compat_count() counts the number of arguments/envelopes. It is basically
- * a copy of count() from fs/exec.c, except that it works with 32 bit argv
- * and envp pointers.
- */
-static int compat_count(compat_uptr_t __user *argv, int max)
-{
-	int i = 0;
-
-	if (argv != NULL) {
-		for (;;) {
-			compat_uptr_t p;
-
-			if (get_user(p, argv))
-				return -EFAULT;
-			if (!p)
-				break;
-			argv++;
-			if (i++ >= max)
-				return -E2BIG;
-
-			if (fatal_signal_pending(current))
-				return -ERESTARTNOHAND;
-			cond_resched();
-		}
-	}
-	return i;
-}
-
-/*
- * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
- * except that it works with 32 bit argv and envp pointers.
- */
-static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
-				struct linux_binprm *bprm)
-{
-	struct page *kmapped_page = NULL;
-	char *kaddr = NULL;
-	unsigned long kpos = 0;
-	int ret;
-
-	while (argc-- > 0) {
-		compat_uptr_t str;
-		int len;
-		unsigned long pos;
-
-		if (get_user(str, argv+argc) ||
-		    !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		if (len > MAX_ARG_STRLEN) {
-			ret = -E2BIG;
-			goto out;
-		}
-
-		/* We're going to work our way backwords. */
-		pos = bprm->p;
-		str += len;
-		bprm->p -= len;
-
-		while (len > 0) {
-			int offset, bytes_to_copy;
-
-			if (fatal_signal_pending(current)) {
-				ret = -ERESTARTNOHAND;
-				goto out;
-			}
-			cond_resched();
-
-			offset = pos % PAGE_SIZE;
-			if (offset == 0)
-				offset = PAGE_SIZE;
-
-			bytes_to_copy = offset;
-			if (bytes_to_copy > len)
-				bytes_to_copy = len;
-
-			offset -= bytes_to_copy;
-			pos -= bytes_to_copy;
-			str -= bytes_to_copy;
-			len -= bytes_to_copy;
-
-			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
-				struct page *page;
-
-				page = get_arg_page(bprm, pos, 1);
-				if (!page) {
-					ret = -E2BIG;
-					goto out;
-				}
-
-				if (kmapped_page) {
-					flush_kernel_dcache_page(kmapped_page);
-					kunmap(kmapped_page);
-					put_page(kmapped_page);
-				}
-				kmapped_page = page;
-				kaddr = kmap(kmapped_page);
-				kpos = pos & PAGE_MASK;
-				flush_cache_page(bprm->vma, kpos,
-						 page_to_pfn(kmapped_page));
-			}
-			if (copy_from_user(kaddr+offset, compat_ptr(str),
-						bytes_to_copy)) {
-				ret = -EFAULT;
-				goto out;
-			}
-		}
-	}
-	ret = 0;
-out:
-	if (kmapped_page) {
-		flush_kernel_dcache_page(kmapped_page);
-		kunmap(kmapped_page);
-		put_page(kmapped_page);
-	}
-	return ret;
-}
-
-/*
- * compat_do_execve() is mostly a copy of do_execve(), with the exception
- * that it processes 32 bit argv and envp pointers.
- */
-int compat_do_execve(char * filename,
-	compat_uptr_t __user *argv,
-	compat_uptr_t __user *envp,
-	struct pt_regs * regs)
-{
-	struct linux_binprm *bprm;
-	struct file *file;
-	struct files_struct *displaced;
-	bool clear_in_exec;
-	int retval;
-
-	retval = unshare_files(&displaced);
-	if (retval)
-		goto out_ret;
-
-	retval = -ENOMEM;
-	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
-	if (!bprm)
-		goto out_files;
-
-	retval = prepare_bprm_creds(bprm);
-	if (retval)
-		goto out_free;
-
-	retval = check_unsafe_exec(bprm);
-	if (retval < 0)
-		goto out_free;
-	clear_in_exec = retval;
-	current->in_execve = 1;
-
-	file = open_exec(filename);
-	retval = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out_unmark;
-
-	sched_exec();
-
-	bprm->file = file;
-	bprm->filename = filename;
-	bprm->interp = filename;
-
-	retval = bprm_mm_init(bprm);
-	if (retval)
-		goto out_file;
-
-	bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
-	if ((retval = bprm->argc) < 0)
-		goto out;
-
-	bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
-	if ((retval = bprm->envc) < 0)
-		goto out;
-
-	retval = prepare_binprm(bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = copy_strings_kernel(1, &bprm->filename, bprm);
-	if (retval < 0)
-		goto out;
-
-	bprm->exec = bprm->p;
-	retval = compat_copy_strings(bprm->envc, envp, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = compat_copy_strings(bprm->argc, argv, bprm);
-	if (retval < 0)
-		goto out;
-
-	retval = search_binary_handler(bprm, regs);
-	if (retval < 0)
-		goto out;
-
-	/* execve succeeded */
-	current->fs->in_exec = 0;
-	current->in_execve = 0;
-	acct_update_integrals(current);
-	free_bprm(bprm);
-	if (displaced)
-		put_files_struct(displaced);
-	return retval;
-
-out:
-	if (bprm->mm) {
-		acct_arg_size(bprm, 0);
-		mmput(bprm->mm);
-	}
-
-out_file:
-	if (bprm->file) {
-		allow_write_access(bprm->file);
-		fput(bprm->file);
-	}
-
-out_unmark:
-	if (clear_in_exec)
-		current->fs->in_exec = 0;
-	current->in_execve = 0;
-
-out_free:
-	free_bprm(bprm);
-
-out_files:
-	if (displaced)
-		reset_files_struct(displaced);
-out_ret:
-	return retval;
-}
-
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
 static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,


^ permalink raw reply	[flat|nested] 109+ messages in thread

* [PATCH v5 4/4] exec: document acct_arg_size()
  2011-03-06 17:01                                                             ` [PATCH v5 " Oleg Nesterov
                                                                                 ` (2 preceding siblings ...)
  2011-03-06 17:02                                                               ` [PATCH v5 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
@ 2011-03-06 17:03                                                               ` Oleg Nesterov
  3 siblings, 0 replies; 109+ messages in thread
From: Oleg Nesterov @ 2011-03-06 17:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LKML, linux-mm, pageexec, Solar Designer, Eugene Teo,
	Brad Spengler, Roland McGrath, Milton Miller, Linus Torvalds,
	KOSAKI Motohiro

Add a comment to explain acct_arg_size().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---

 fs/exec.c |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

--- 38/fs/exec.c~4_doc_acct_arg_size	2011-03-06 17:56:26.000000000 +0100
+++ 38/fs/exec.c	2011-03-06 17:56:47.000000000 +0100
@@ -164,7 +164,12 @@ out:
 }
 
 #ifdef CONFIG_MMU
-
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, so account these pages in current->mm temporarily
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
 static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
 {
 	struct mm_struct *mm = current->mm;


^ permalink raw reply	[flat|nested] 109+ messages in thread

end of thread, other threads:[~2011-03-06 17:12 UTC | newest]

Thread overview: 109+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-25  3:26 [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness() KOSAKI Motohiro
2010-10-25  3:27 ` [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable" KOSAKI Motohiro
2010-10-25 20:40   ` David Rientjes
2010-10-26 13:01     ` KOSAKI Motohiro
2010-10-26 19:37       ` David Rientjes
2010-11-01  7:06         ` KOSAKI Motohiro
2010-11-01 19:36           ` David Rientjes
2010-11-09  2:26             ` KOSAKI Motohiro
2010-11-09  3:28               ` KOSAKI Motohiro
2010-11-15  0:24                 ` KOSAKI Motohiro
2010-11-15  9:59                   ` David Rientjes
2010-11-09 23:33               ` David Rientjes
2010-11-09 23:35                 ` Alan Cox
2010-11-09 23:48                   ` David Rientjes
2010-11-09 23:55                     ` [patch] oom: document obsolete oom_adj tunable David Rientjes
2010-11-15  0:22                       ` KOSAKI Motohiro
2010-11-15 10:38                         ` David Rientjes
2010-11-23  7:16                           ` KOSAKI Motohiro
2010-11-14  5:07                 ` [resend][PATCH 2/4] Revert "oom: deprecate oom_adj tunable" KOSAKI Motohiro
2010-11-14 21:39                   ` David Rientjes
2010-11-23  7:16                     ` KOSAKI Motohiro
2010-11-28  1:41                       ` David Rientjes
2010-11-30 13:03                         ` KOSAKI Motohiro
2010-11-30 20:07                           ` David Rientjes
2010-10-25  3:28 ` [resend][PATCH 3/4] move cred_guard_mutex from task_struct to signal_struct KOSAKI Motohiro
2010-10-25 17:26   ` Roland McGrath
2010-10-25 17:42     ` Oleg Nesterov
2010-10-25 17:51       ` Roland McGrath
2010-10-26 13:04         ` KOSAKI Motohiro
2010-10-26 13:18           ` Roland McGrath
2010-10-25  3:29 ` [resend][PATCH 4/4] oom: don't ignore rss in nascent mm KOSAKI Motohiro
2010-10-25 11:28   ` pageexec
2010-10-26  7:25     ` KOSAKI Motohiro
2010-11-23 14:34   ` Oleg Nesterov
2010-11-24  0:24     ` KOSAKI Motohiro
2010-11-24 11:09       ` Oleg Nesterov
2010-11-25 11:06         ` KOSAKI Motohiro
2010-11-25 14:02           ` Oleg Nesterov
2010-11-25 19:36             ` Oleg Nesterov
2010-11-29  5:25               ` KOSAKI Motohiro
2010-11-29 11:33                 ` Oleg Nesterov
2010-11-29 18:23                   ` Oleg Nesterov
2010-11-30 19:54                     ` [PATCH 0/2] exec: more excessive argument size fixes for 2.6.37/stable Oleg Nesterov
2010-11-30 19:55                       ` [PATCH 1/2] exec: make argv/envp memory visible to oom-killer Oleg Nesterov
2010-12-01  0:12                         ` KOSAKI Motohiro
2010-12-01 18:07                           ` Oleg Nesterov
2010-11-30 19:56                       ` [PATCH 2/2] exec: copy-and-paste the fixes into compat_do_execve() paths Oleg Nesterov
2010-12-01  3:04                         ` KOSAKI Motohiro
2010-11-30 20:00                       ` [PATCH 0/4] exec: unify compat/non-compat code Oleg Nesterov
2010-11-30 20:00                         ` [PATCH 1/4] exec: introduce get_arg_ptr() helper Oleg Nesterov
2010-11-30 20:01                         ` [PATCH 2/4] exec: introduce "bool compat" argument Oleg Nesterov
2010-11-30 20:01                         ` [PATCH 3/4] exec: unify compat_do_execve() code Oleg Nesterov
2010-12-01 17:37                           ` (No subject header) Milton Miller
2010-12-01 18:27                             ` Oleg Nesterov
2011-02-25 17:52                               ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Oleg Nesterov
2011-02-25 17:52                                 ` [PATCH 1/5] exec: introduce get_arg_ptr() helper Oleg Nesterov
2011-02-25 17:52                                 ` [PATCH 2/5] exec: introduce "bool compat" argument Oleg Nesterov
2011-02-25 18:57                                   ` Linus Torvalds
2011-02-26 12:37                                     ` Oleg Nesterov
2011-02-25 17:53                                 ` [PATCH 3/5] exec: unify compat_do_execve() code Oleg Nesterov
2011-02-25 19:10                                   ` Linus Torvalds
2011-02-26 12:37                                     ` Oleg Nesterov
2011-02-26 12:57                                       ` Oleg Nesterov
2011-02-26 15:55                                       ` Linus Torvalds
2011-02-26 17:44                                         ` Oleg Nesterov
2011-03-01 20:47                                           ` [PATCH v2 0/5] exec: unify native/compat code Oleg Nesterov
2011-03-01 20:48                                             ` [PATCH v2 1/5] exec: introduce get_arg_ptr() helper Oleg Nesterov
2011-03-01 20:48                                             ` [PATCH v2 2/5] exec: introduce "bool compat" argument Oleg Nesterov
2011-03-01 20:48                                             ` [PATCH v2 3/5] exec: introduce conditional_user_ptr_t Oleg Nesterov
2011-03-01 20:49                                             ` [PATCH v2 4/5] exec: unify do_execve/compat_do_execve code Oleg Nesterov
2011-03-01 20:49                                             ` [PATCH v2 5/5] exec: document acct_arg_size() Oleg Nesterov
2011-03-01 21:39                                             ` [PATCH v2 0/5] exec: unify native/compat code Linus Torvalds
2011-03-02 16:26                                               ` [PATCH v3 0/4] " Oleg Nesterov
2011-03-02 16:27                                                 ` [PATCH v3 1/4] exec: introduce get_arg_ptr() helper Oleg Nesterov
2011-03-03  3:01                                                   ` KOSAKI Motohiro
2011-03-03 15:47                                                     ` Oleg Nesterov
2011-03-03 16:07                                                       ` Linus Torvalds
2011-03-05 20:30                                                         ` [PATCH v4 0/4] exec: unify native/compat code Oleg Nesterov
2011-03-05 20:31                                                           ` [PATCH v4 1/4] exec: introduce get_user_arg_ptr() helper Oleg Nesterov
2011-03-05 20:31                                                           ` [PATCH v4 2/4] exec: introduce struct user_arg_ptr Oleg Nesterov
2011-03-05 20:31                                                           ` [PATCH v4 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
2011-03-05 20:52                                                             ` Linus Torvalds
2011-03-05 21:20                                                               ` Oleg Nesterov
2011-03-05 20:31                                                           ` [PATCH v4 4/4] exec: document acct_arg_size() Oleg Nesterov
2011-03-06 12:04                                                           ` [PATCH v4 0/4] exec: unify native/compat code KOSAKI Motohiro
2011-03-06 17:01                                                             ` [PATCH v5 " Oleg Nesterov
2011-03-06 17:02                                                               ` [PATCH v5 1/4] exec: introduce get_user_arg_ptr() helper Oleg Nesterov
2011-03-06 17:02                                                               ` [PATCH v5 2/4] exec: introduce struct user_arg_ptr Oleg Nesterov
2011-03-06 17:02                                                               ` [PATCH v5 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
2011-03-06 17:03                                                               ` [PATCH v5 4/4] exec: document acct_arg_size() Oleg Nesterov
2011-03-02 16:27                                                 ` [PATCH v3 2/4] exec: introduce struct conditional_ptr Oleg Nesterov
2011-03-03  3:08                                                   ` KOSAKI Motohiro
2011-03-02 16:27                                                 ` [PATCH v3 3/4] exec: unify do_execve/compat_do_execve code Oleg Nesterov
2011-03-03  3:13                                                   ` KOSAKI Motohiro
2011-03-02 16:28                                                 ` [PATCH v3 4/4] exec: document acct_arg_size() Oleg Nesterov
2011-03-03  3:09                                                   ` KOSAKI Motohiro
2011-03-02 16:44                                                 ` [PATCH v3 0/4] exec: unify native/compat code Oleg Nesterov
2011-03-02 18:00                                                   ` Linus Torvalds
2011-03-02 19:40                                                     ` David Miller
2011-03-02 19:48                                                       ` Linus Torvalds
2011-03-02 19:54                                                         ` David Miller
2011-02-25 17:53                                 ` [PATCH 4/5] exec: unexport acct_arg_size() and get_arg_page() Oleg Nesterov
2011-02-25 17:54                                 ` [PATCH 5/5] exec: document acct_arg_size() Oleg Nesterov
2011-02-25 18:54                                 ` [PATCH 0/4 RESEND] exec: unify compat/non-compat code Linus Torvalds
2011-02-26 12:35                                   ` Oleg Nesterov
2010-11-30 20:01                         ` [PATCH 4/4] exec: unexport acct_arg_size() and get_arg_page() Oleg Nesterov
2010-12-01  3:09                         ` [PATCH 0/4] exec: unify compat/non-compat code KOSAKI Motohiro
2010-11-30  0:06                   ` [resend][PATCH 4/4] oom: don't ignore rss in nascent mm KOSAKI Motohiro
2010-10-25 20:37 ` [resend][PATCH 1/4] oom: remove totalpage normalization from oom_badness() David Rientjes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).