linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] ps command race fix
@ 2006-07-14 11:39 KAMEZAWA Hiroyuki
  2006-07-25  1:20 ` Andrew Morton
  0 siblings, 1 reply; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-07-14 11:39 UTC (permalink / raw)
  To: LKML; +Cc: Andrew Morton

Hi, this is an experimental patch for the problem
	- "ps command can miss some pids occasionally"
please comment.


The problem itself is a very rare case, but the result is sometimes terrible.

for example, when a user does

alive=`ps | grep command | grep -v command | wc -l`

to check process is alive or not (I think this user should use kill-0 ;)

-Kame
==
Now, proc_pid_readdir() uses direct access to the task and
indexing into the 'task list' as a fallback.
Of course, entries in this list can be removed randomly.

So, following can happen when using 'ps' command.
==
1. assume task_list as
....-(taskA)-(taskB)-(taskC)-(taskD)-(taskE)-(taskF)-(taskG)-...

2. at getdents() iteration 'N', ps command's getdents() reads entries before taskC
and remembers "I read X entries".

....-(taskA)-(taskB)-(taskC)-(taskD)-(taskE)-(taskF)-(taskG)-...
------(f_pos=X)---------^

getdents() remembers
	- "taskC is next candidate to be read"
	- "we already read X ents".

3. consider taskA and taskC exits, before next getdents(N+1)

....-(lost)-(taskB)-(lost)-(taskD)-(taskE)-(taskF)-(taskG)-...
------(f_pos=X)--------^

4. at getdents(N+1), because getdents() cannot find taskC, it skips 'X'
   ents in the list, counting from the head of the list.
....-(taskB)-(taskD)-(taskE)-(taskF)-(taskG)-..
------(f_pos=X)--------^

5. in this case, taskD is skipped.
==

This patch changes indexing in the list to indexing in a table.
The table is created only for storing valid tgids (not pids).
Tested on x86/ia64.

Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

 fs/proc/base.c          |  187 ++++++++++++++++++++++++++++++++----------------
 include/linux/proc_fs.h |    4 +
 include/linux/sched.h   |    6 +
 kernel/exit.c           |    1 
 kernel/fork.c           |    2 
 5 files changed, 138 insertions(+), 62 deletions(-)

Index: linux-2.6.18-rc1-kame/fs/proc/base.c
===================================================================
--- linux-2.6.18-rc1-kame.orig/fs/proc/base.c	2006-07-14 09:14:45.000000000 +0900
+++ linux-2.6.18-rc1-kame/fs/proc/base.c	2006-07-14 17:04:35.000000000 +0900
@@ -2121,59 +2121,126 @@
  * In the case of a seek we start with &init_task and walk nr
  * threads past it.
  */
-static struct task_struct *first_tgid(int tgid, unsigned int nr)
+
+rwlock_t	tgid_table_lock;
+struct tgid_buffer {
+	struct list_head list;
+	int base_pos;
+	int nr_free;
+	int cache;
+	int *data;
+};
+
+LIST_HEAD(tgid_table);
+DECLARE_RWSEM(tgid_table_sem);
+int nr_tgid_table;
+int tgid_end_pos;
+#define get_tgid_buf(p) ((struct tgid_buffer *)((unsigned long)p & PAGE_MASK))
+#define ENT_PER_TGIDBUF ((PAGE_SIZE - sizeof(struct tgid_buffer))/sizeof(int))
+
+static inline int tgid_ent_to_pos(int *tgid)
 {
-	struct task_struct *pos;
-	rcu_read_lock();
-	if (tgid && nr) {
-		pos = find_task_by_pid(tgid);
-		if (pos && thread_group_leader(pos))
+	struct tgid_buffer *buf =  get_tgid_buf(tgid);
+	return buf->base_pos + (tgid - buf->data);
+}
+
+static struct tgid_buffer *grow_tgid_buf(void)
+{
+	struct tgid_buffer *buf = (void *) get_zeroed_page(GFP_KERNEL);
+	BUG_ON(!buf);
+	INIT_LIST_HEAD(&buf->list);
+	buf->nr_free = ENT_PER_TGIDBUF;
+	buf->data = (void *)buf + sizeof(struct tgid_buffer);
+	buf->base_pos = nr_tgid_table * ENT_PER_TGIDBUF;
+	/* add to the tail */
+	list_add_tail(&buf->list,&tgid_table);
+	nr_tgid_table++;
+	tgid_end_pos += ENT_PER_TGIDBUF;
+	return buf;
+}
+
+void proc_register_tgid(struct task_struct *task)
+{
+	struct tgid_buffer *buf;
+	int tgid = task->tgid;
+	int *ent;
+	/* we just record thread group leader */
+	if (!thread_group_leader(task))
+		return;
+	down_write(&tgid_table_sem);
+	list_for_each_entry(buf, &tgid_table, list) {
+		if (buf->nr_free != 0)
 			goto found;
 	}
-	/* If nr exceeds the number of processes get out quickly */
-	pos = NULL;
-	if (nr && nr >= nr_processes())
-		goto done;
-
-	/* If we haven't found our starting place yet start with
-	 * the init_task and walk nr tasks forward.
-	 */
-	for (pos = next_task(&init_task); nr > 0; --nr) {
-		pos = next_task(pos);
-		if (pos == &init_task) {
-			pos = NULL;
-			goto done;
+	/* no free area... allocate new one */
+	buf = grow_tgid_buf();
+found:
+	ent = buf->data + buf->cache;
+	while (*ent) {
+		ent++;
+		if (ent - buf->data >= ENT_PER_TGIDBUF) {
+			ent = buf->data;
 		}
 	}
-found:
-	get_task_struct(pos);
-done:
-	rcu_read_unlock();
-	return pos;
+	*ent = tgid;
+	task->tgid_buf_pos = ent;
+	buf->cache = (ent - buf->data);
+	buf->nr_free--;
+	up_write(&tgid_table_sem);
+	return;
 }
 
-/*
- * Find the next task in the task list.
- * Return NULL if we loop or there is any error.
- *
- * The reference to the input task_struct is released.
- */
-static struct task_struct *next_tgid(struct task_struct *start)
+void proc_unregister_tgid(struct task_struct *task)
 {
-	struct task_struct *pos;
-	rcu_read_lock();
-	pos = start;
-	if (pid_alive(start))
-		pos = next_task(start);
-	if (pid_alive(pos) && (pos != &init_task)) {
-		get_task_struct(pos);
-		goto done;
+	struct tgid_buffer *buf;
+	int *ent;
+	if (task->tgid_buf_pos == NULL)
+		return;
+	down_write(&tgid_table_sem);
+	ent = task->tgid_buf_pos;
+	buf = get_tgid_buf(ent);
+	*ent = 0;
+	buf->cache = ent - buf->data;
+	buf->nr_free++;
+	up_write(&tgid_table_sem);
+}
+
+static int *next_tgid(int *ent)
+{
+	struct tgid_buffer *buf;
+	int pos;
+	buf = get_tgid_buf(ent);
+	/* search next valid entry */
+	ent++;
+	pos = ent - buf->data;
+	do {
+		while (pos < ENT_PER_TGIDBUF) {
+			ent = buf->data + pos;
+			if (*ent)
+				return ent;
+			pos++;
+		}
+		buf = list_entry(buf->list.next, struct tgid_buffer, list);
+		pos = 0;
+	} while (&buf->list != &tgid_table);
+	return NULL;
+}
+
+static int *first_tgid(int nr)
+{
+	int *ent;
+	struct tgid_buffer *buf;
+	list_for_each_entry(buf, &tgid_table, list) {
+		if (nr < ENT_PER_TGIDBUF)
+			goto found;
+		nr -= ENT_PER_TGIDBUF;
 	}
-	pos = NULL;
-done:
-	rcu_read_unlock();
-	put_task_struct(start);
-	return pos;
+	return NULL;
+found:
+	ent = buf->data + nr;
+	if (*ent)
+		return ent;
+	return next_tgid(ent);
 }
 
 /* for the /proc/ directory itself, after non-process stuff has been done */
@@ -2181,8 +2248,7 @@
 {
 	char buf[PROC_NUMBUF];
 	unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
-	struct task_struct *task;
-	int tgid;
+	int *tgid;
 
 	if (!nr) {
 		ino_t ino = fake_ino(0,PROC_TGID_INO);
@@ -2192,28 +2258,25 @@
 		nr++;
 	}
 	nr -= 1;
-
-	/* f_version caches the tgid value that the last readdir call couldn't
-	 * return. lseek aka telldir automagically resets f_version to 0.
-	 */
-	tgid = filp->f_version;
-	filp->f_version = 0;
-	for (task = first_tgid(tgid, nr);
-	     task;
-	     task = next_tgid(task), filp->f_pos++) {
+	if (nr >= tgid_end_pos)
+		return 0;
+	down_read(&tgid_table_sem);
+	for (tgid = first_tgid(nr);
+	     tgid;
+	     tgid = next_tgid(tgid)) {
 		int len;
 		ino_t ino;
-		tgid = task->pid;
-		len = snprintf(buf, sizeof(buf), "%d", tgid);
-		ino = fake_ino(tgid, PROC_TGID_INO);
+		len = snprintf(buf, sizeof(buf), "%d", *tgid);
+		ino = fake_ino(*tgid, PROC_TGID_INO);
+		filp->f_pos = tgid_ent_to_pos(tgid) + FIRST_PROCESS_ENTRY + 1;
 		if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) {
-			/* returning this tgid failed, save it as the first
-			 * pid for the next readir call */
-			filp->f_version = tgid;
-			put_task_struct(task);
 			break;
 		}
 	}
+	up_read(&tgid_table_sem);
+	if (!tgid) { /* EOF */
+		filp->f_pos = tgid_end_pos + FIRST_PROCESS_ENTRY + 1;
+	}
 	return 0;
 }
 
Index: linux-2.6.18-rc1-kame/include/linux/proc_fs.h
===================================================================
--- linux-2.6.18-rc1-kame.orig/include/linux/proc_fs.h	2006-07-06 13:09:49.000000000 +0900
+++ linux-2.6.18-rc1-kame/include/linux/proc_fs.h	2006-07-14 19:52:23.000000000 +0900
@@ -200,6 +200,8 @@
 	remove_proc_entry(name,proc_net);
 }
 
+extern void proc_register_tgid(struct task_struct *task);
+extern void proc_unregister_tgid(struct task_struct *task);
 #else
 
 #define proc_root_driver NULL
@@ -232,6 +234,8 @@
 struct tty_driver;
 static inline void proc_tty_register_driver(struct tty_driver *driver) {};
 static inline void proc_tty_unregister_driver(struct tty_driver *driver) {};
+static inline void proc_register_tgid(struct task_struct *task) {};
+static inline void proc_unregister_tgid(struct task_struct *task) {};
 
 extern struct proc_dir_entry proc_root;
 
Index: linux-2.6.18-rc1-kame/include/linux/sched.h
===================================================================
--- linux-2.6.18-rc1-kame.orig/include/linux/sched.h	2006-07-06 13:09:49.000000000 +0900
+++ linux-2.6.18-rc1-kame/include/linux/sched.h	2006-07-14 13:54:07.000000000 +0900
@@ -940,6 +940,12 @@
 
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
+/*
+ * for proc root directory
+ */
+#ifdef CONFIG_PROC_FS
+	int	*tgid_buf_pos;
+#endif
 
 	/*
 	 * cache last used pipe for splice
Index: linux-2.6.18-rc1-kame/kernel/exit.c
===================================================================
--- linux-2.6.18-rc1-kame.orig/kernel/exit.c	2006-07-06 13:09:49.000000000 +0900
+++ linux-2.6.18-rc1-kame/kernel/exit.c	2006-07-14 13:54:07.000000000 +0900
@@ -166,6 +166,7 @@
 
 	sched_exit(p);
 	write_unlock_irq(&tasklist_lock);
+	proc_unregister_tgid(p);
 	proc_flush_task(p);
 	release_thread(p);
 	call_rcu(&p->rcu, delayed_put_task_struct);
Index: linux-2.6.18-rc1-kame/kernel/fork.c
===================================================================
--- linux-2.6.18-rc1-kame.orig/kernel/fork.c	2006-07-06 13:09:49.000000000 +0900
+++ linux-2.6.18-rc1-kame/kernel/fork.c	2006-07-14 13:54:07.000000000 +0900
@@ -179,6 +179,7 @@
 	atomic_set(&tsk->fs_excl, 0);
 	tsk->btrace_seq = 0;
 	tsk->splice_pipe = NULL;
+	tsk->tgid_buf_pos = NULL;
 	return tsk;
 }
 
@@ -1244,6 +1245,7 @@
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
+	proc_register_tgid(p);
 	return p;
 
 bad_fork_cleanup_namespace:


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-14 11:39 [RFC] ps command race fix KAMEZAWA Hiroyuki
@ 2006-07-25  1:20 ` Andrew Morton
  2006-07-25  1:48   ` Paul Jackson
                     ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Andrew Morton @ 2006-07-25  1:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, Eric W. Biederman

On Fri, 14 Jul 2006 20:39:39 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> Hi, this is an experimental patch for the probelm
> 	- "ps command can miss some pid occationally"
> please comment 
> 
> 
> the problem itself is very rare case, but the result is sometimes terrible
> 
> for example, when a user does
> 
> alive=`ps | grep command | grep -v command | wc -l`
> 
> to check process is alive or not (I think this user should use kill-0 ;)
> 
> -Kame
> ==
> Now, prod_pid_readir() uses direct access to task and 
> indexing 'task list' as fallback.
> Of course, entries in this list can be removed randomly.
> 
> So, following can happen when using 'ps' command.
> ==
> 1. assume task_list as
> ....-(taskA)-(taskB)-(taskC)-(taskD)-(taskE)-(taskF)-(taskG)-...
> 
> 2. at getdents() iteration 'N', ps command's getdents() read entries before taskC.
> and remenbers "I read X entries".
> 
> ....-(taskA)-(taskB)-(taskC)-(taskD)-(taskE)-(taskF)-(taskG)-...
> ------(f_pos=X)---------^
> 
> getdents() remembers
> 	- "taskC is next candidate to be read"
> 	- "we already read X ents".
> 
> 3. consider taskA and taskC exits, before next getdents(N+1)
> 
> ....-(lost)-(taskB)-(lost)-(taskD)-(taskE)-(taskF)-(taskG)-...
> ------(f_pos=X)--------^
> 
> 4. at getdents(N+1), becasue getdents() cannot find taskC, it skips 'X'
>    ents in the list.
>    from head of the list.
> ....-(taskB)-(taskD)-(taskE)-(taskF)-(taskG)-..
> ------(f_pos=X)--------^
> 
> 5. in this case, taskD is skipped.
> ==
> 
> This patch changes indexing in the list to indexing in a table.
> Table is created only for storing valid tgid.(not pid)
> Tested on x86/ia64.
> 

It allocates a potentially-significant amount of memory per-task, until
that tasks exits (we could release it earlier, but the problem remains) and
it adds yet another global lock in the process exit path.

>  5 files changed, 138 insertions(+), 62 deletions(-)

And it adds complexity and code.

So I think we're still seeking a solution to this.

Options might be:

a) Pin the most-recently-visited task in some manner, so that it is
   still on the global task list when we return.  That's fairly simple to
   do (defer the release_task()) but it affects task lifetime and visibility
   in rare and worrisome ways.

b) Change proc_pid_readdir() so that it walks the pid_hash[] array
   instead of the task list.  Need to do something clever when traversing
   each bucket's list, but I'm not sure what ;) It's the same problem.

   Possibly what we could do here is to permit the task which is walking
   /proc to pin a particular `struct pid': take a ref on it then when we
   next start walking one of the pid_hash[] chains, we _know_ that the
   `struct pid' which we're looking for will still be there.  Even if it
   now refers to a departed process.

c) Nuke the pid_hash[], convert the whole thing to a radix-tree. 
   They're super-simple to traverse.  Not sure what we'd index it by
   though.

I guess b) is best.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  1:20 ` Andrew Morton
@ 2006-07-25  1:48   ` Paul Jackson
  2006-07-25  2:00     ` Andrew Morton
  2006-07-25  2:08     ` KAMEZAWA Hiroyuki
  2006-07-25  1:53   ` KAMEZAWA Hiroyuki
  2006-07-25  6:09   ` Eric W. Biederman
  2 siblings, 2 replies; 27+ messages in thread
From: Paul Jackson @ 2006-07-25  1:48 UTC (permalink / raw)
  To: Andrew Morton; +Cc: kamezawa.hiroyu, linux-kernel, ebiederm

Another possibility (perhaps a really stupid idea ;) would be to
snapshot the list of pids on the open, and let the readdir() just
access that fixed array.

The kernel/cpuset.c cpuset_tasks_open() routine that displays the
pids of tasks in a cpuset (the per-cpuset 'tasks' file) does this.

Then the seek and read and such semantics are nice and stable and
simple.

Throw out the snapshot on the last close.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  1:20 ` Andrew Morton
  2006-07-25  1:48   ` Paul Jackson
@ 2006-07-25  1:53   ` KAMEZAWA Hiroyuki
  2006-07-25  2:06     ` Andrew Morton
  2006-07-25  6:09   ` Eric W. Biederman
  2 siblings, 1 reply; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-07-25  1:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, ebiederm

On Mon, 24 Jul 2006 18:20:00 -0700
Andrew Morton <akpm@osdl.org> wrote:
> 
> It allocates a potentially-significant amount of memory per-task, until
> that tasks exits (we could release it earlier, but the problem remains) and
> it adds yet another global lock in the process exit path.
> 
I see.

> >  5 files changed, 138 insertions(+), 62 deletions(-)
> 
> And it adds complexity and code.
> 
> So I think we're still seeking a solution to this.
> 
> Options might be:
> 
> a) Pin the most-recently-visited task in some manner, so that it is
>    still on the global task list when we return.  That's fairly simple to
>    do (defer the release_task()) but it affects task lifetime and visibility
>    in rare and worrisome ways.
> 
> b) Change proc_pid_readdir() so that it walks the pid_hash[] array
>    instead of the task list.  Need to do something clever when traversing
>    each bucket's list, but I'm not sure what ;) It's the same problem.
> 
>    Possibly what we could do here is to permit the task which is walking
>    /proc to pin a particular `struct pid': take a ref on it then when we
>    next start walking one of the pid_hash[] chains, we _know_ that the
>    `struct pid' which we're looking for will still be there.  Even if it
>    now refers to a departed process.
> 
> c) Nuke the pid_hash[], convert the whole thing to a radix-tree. 
>    They're super-simple to traverse.  Not sure what we'd index it by
>    though.
> 
> I guess b) is best.
> 

I tried b) in the first place, but it was not very good because
proc_pid_readdir() has to traverse all pids, not tgids. So, I had to access
the task_struct of the pid. I wanted to avoid accessing the task struct itself,
so my patch implemented a table made only from tgids.

But as you say, my patch is too intrusive.
I'll dig into this more. Thank you for the advice.

Thanks,
-Kame





^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  1:48   ` Paul Jackson
@ 2006-07-25  2:00     ` Andrew Morton
  2006-07-25  2:08     ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 27+ messages in thread
From: Andrew Morton @ 2006-07-25  2:00 UTC (permalink / raw)
  To: Paul Jackson; +Cc: kamezawa.hiroyu, linux-kernel, ebiederm

On Mon, 24 Jul 2006 18:48:47 -0700
Paul Jackson <pj@sgi.com> wrote:

> Another possibility (perhaps a really stupid idea ;) would be to
> snapshot the list of pids on the open, and let the readdir() just
> access that fixed array.

The patch under discussion does precisely this.  (Awkwardly.  Using
kmalloc-pre-object might be better).

> The kernel/cpuset.c cpuset_tasks_open() routine that displays the
> pids of tasks in a cpuset (the per-cpuset 'tasks' file) does this.

Your faith in large kmalloc()s is touching ;) I guess the number of pids
will be smaller for cpusets.

> Then the seek and read and such semantics are nice and stable and
> simple.
> 
> Throw out the snapshot on the last close.

The patch under discussion didn't do this, although it could.  But it still
permits rather a lot of kernel memory to be pinned.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  1:53   ` KAMEZAWA Hiroyuki
@ 2006-07-25  2:06     ` Andrew Morton
  2006-07-25  2:34       ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 27+ messages in thread
From: Andrew Morton @ 2006-07-25  2:06 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, ebiederm

On Tue, 25 Jul 2006 10:53:39 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Mon, 24 Jul 2006 18:20:00 -0700
> Andrew Morton <akpm@osdl.org> wrote:
> > 
> > It allocates a potentially-significant amount of memory per-task, until
> > that tasks exits (we could release it earlier, but the problem remains) and
> > it adds yet another global lock in the process exit path.
> > 
> I see.
> 
> > >  5 files changed, 138 insertions(+), 62 deletions(-)
> > 
> > And it adds complexity and code.
> > 
> > So I think we're still seeking a solution to this.
> > 
> > Options might be:
> > 
> > a) Pin the most-recently-visited task in some manner, so that it is
> >    still on the global task list when we return.  That's fairly simple to
> >    do (defer the release_task()) but it affects task lifetime and visibility
> >    in rare and worrisome ways.
> > 
> > b) Change proc_pid_readdir() so that it walks the pid_hash[] array
> >    instead of the task list.  Need to do something clever when traversing
> >    each bucket's list, but I'm not sure what ;) It's the same problem.
> > 
> >    Possibly what we could do here is to permit the task which is walking
> >    /proc to pin a particular `struct pid': take a ref on it then when we
> >    next start walking one of the pid_hash[] chains, we _know_ that the
> >    `struct pid' which we're looking for will still be there.  Even if it
> >    now refers to a departed process.
> > 
> > c) Nuke the pid_hash[], convert the whole thing to a radix-tree. 
> >    They're super-simple to traverse.  Not sure what we'd index it by
> >    though.
> > 
> > I guess b) is best.
> > 
> 
> I tried b) at the first place.

OK.

> but it was not very good because 
> proc_pid_readdir() has to traverse all pids, not tgids.

You mean "all tgids, not pids".

> So, I had to access
> task_struct of the pid. I wanted to avoid to access task struct itself,

Why do you wish to avoid accessing the task_struct?

> my patch implemented a table made only from tgids.
> 
> But as you say, my patch is much intrusive. 
> I'll dig this more. thank you for advise.

Thanks.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  1:48   ` Paul Jackson
  2006-07-25  2:00     ` Andrew Morton
@ 2006-07-25  2:08     ` KAMEZAWA Hiroyuki
  2006-07-25  2:33       ` Andrew Morton
  1 sibling, 1 reply; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-07-25  2:08 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, linux-kernel, ebiederm

On Mon, 24 Jul 2006 18:48:47 -0700
Paul Jackson <pj@sgi.com> wrote:

> Another possibility (perhaps a really stupid idea ;) would be to
> snapshot the list of pids on the open, and let the readdir() just
> access that fixed array.
> 
> The kernel/cpuset.c cpuset_tasks_open() routine that displays the
> pids of tasks in a cpuset (the per-cpuset 'tasks' file) does this.
> 
Oh, thank you for informing me :) I didn't know about that.
I'll look into it.

> Then the seek and read and such semantics are nice and stable and
> simple.
> 
yes...
I think snapshot at open() is okay.

-Kame


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  2:08     ` KAMEZAWA Hiroyuki
@ 2006-07-25  2:33       ` Andrew Morton
  2006-07-25  2:50         ` KAMEZAWA Hiroyuki
  2006-07-25  7:22         ` Paul Jackson
  0 siblings, 2 replies; 27+ messages in thread
From: Andrew Morton @ 2006-07-25  2:33 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: pj, linux-kernel, ebiederm

On Tue, 25 Jul 2006 11:08:35 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> > Then the seek and read and such semantics are nice and stable and
> > simple.
> > 
> yes...
> I think snapshot at open() is okay.

We cannot do a single kmalloc() like cpuset does.

The kernel presently kind-of guarantees that a 32k kmalloc() will work,
although the VM might have to do very large amounts of work to achieve it.

But 32k is only 8192 processes, so a snapshot will need multiple
allocations and a list and trouble dropping and retaking tasklist_lock to
allocate memory and keeping things stable while doing that.  I suspect
it'll end up ugly.

And it permits userspace to pin rather a lot of memory.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  2:06     ` Andrew Morton
@ 2006-07-25  2:34       ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-07-25  2:34 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, ebiederm

On Mon, 24 Jul 2006 19:06:26 -0700
Andrew Morton <akpm@osdl.org> wrote:
> > but it was not very good because 
> > proc_pid_readdir() has to traverse all pids, not tgids.
> 
> You mean "all tgids, not pids".
> 
Ah, sorry. Precisely, traverse all pids and choose tgids from them because
pid_hash[] remembers all pids.


> > So, I had to access
> > task_struct of the pid. I wanted to avoid to access task struct itself,
> 
> Why do you wish to avoid accessing the task_struct?
> 
I thought accessing all task structs at random is heavy work and we would have to
take and release rcu_read_lock thousands of times.
I'm sorry if my concern is not a real problem.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  2:33       ` Andrew Morton
@ 2006-07-25  2:50         ` KAMEZAWA Hiroyuki
  2006-07-25  3:16           ` KAMEZAWA Hiroyuki
  2006-07-25  7:22         ` Paul Jackson
  1 sibling, 1 reply; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-07-25  2:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: pj, linux-kernel, ebiederm

On Mon, 24 Jul 2006 19:33:18 -0700
Andrew Morton <akpm@osdl.org> wrote:

> On Tue, 25 Jul 2006 11:08:35 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> > > Then the seek and read and such semantics are nice and stable and
> > > simple.
> > > 
> > yes...
> > I think snapshot at open() is okay.
> 
> We cannot do a single kmalloc() like cpuset does.
> 
> The kernel presently kind-of guarantees that a 32k kmalloc() will work,
> although the VM might have to do very large amounts of work to achieve it.
> 
> But 32k is only 8192 processes, so a snapshot will need multiple
> allocations and a list and trouble dropping and retaking tasklist_lock to
> allocate memory and keeping things stable while doing that.  I suspect
> it'll end up ugly.
> 

Hm, how about using a bitmap instead of a table? (we'll have many holes but...)
Implementing
- a system-wide bitmap of used tgids which is updated only when /proc is opened
seems not to be much of a problem. A 32k kmalloc can store 256k pids.

BTW, how large can pids be, and how many processes are there in a (heavy and big) system?

-Kame


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  2:50         ` KAMEZAWA Hiroyuki
@ 2006-07-25  3:16           ` KAMEZAWA Hiroyuki
  2006-08-13 16:29             ` Eric W. Biederman
  0 siblings, 1 reply; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-07-25  3:16 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: akpm, pj, linux-kernel, ebiederm

On Tue, 25 Jul 2006 11:50:04 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> BTW, how large pids and how many proccess in a (heavy and big) system ?
> 
I found
/*
 * A maximum of 4 million PIDs should be enough for a while.
 * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
 */
#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
        (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))

...we have to manage 4 million tids.

I'll try to make use of pidmap array which is already maintained in pid.c
and to avoid using extra memory.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  1:20 ` Andrew Morton
  2006-07-25  1:48   ` Paul Jackson
  2006-07-25  1:53   ` KAMEZAWA Hiroyuki
@ 2006-07-25  6:09   ` Eric W. Biederman
  2 siblings, 0 replies; 27+ messages in thread
From: Eric W. Biederman @ 2006-07-25  6:09 UTC (permalink / raw)
  To: Andrew Morton; +Cc: KAMEZAWA Hiroyuki, linux-kernel

Andrew Morton <akpm@osdl.org> writes:

> So I think we're still seeking a solution to this.
>
> Options might be:
>
> a) Pin the most-recently-visited task in some manner, so that it is
>    still on the global task list when we return.  That's fairly simple to
>    do (defer the release_task()) but it affects task lifetime and visibility
>    in rare and worrisome ways.
>
> b) Change proc_pid_readdir() so that it walks the pid_hash[] array
>    instead of the task list.  Need to do something clever when traversing
>    each bucket's list, but I'm not sure what ;) It's the same problem.
>
>    Possibly what we could do here is to permit the task which is walking
>    /proc to pin a particular `struct pid': take a ref on it then when we
>    next start walking one of the pid_hash[] chains, we _know_ that the
>    `struct pid' which we're looking for will still be there.  Even if it
>    now refers to a departed process.
>
> c) Nuke the pid_hash[], convert the whole thing to a radix-tree. 
>    They're super-simple to traverse.  Not sure what we'd index it by
>    though.
>
> I guess b) is best.

The advantage with walking the task list is new entries are always added
at the end.  Neither of the other two proposed orders does that.  So
in fact I think the problem will get worse leaving more tasks skipped,
because in those cases new tasks can be inserted before our cursor in
the list.  The proposed snapshot implementation has a similar problem
in that new process could get skipped.

The simplest version I can think of is to place a cousin of the
tasklist into struct pid.  Allowing us to use a very similar
algorithm to what we use now.  But since we can pin struct pid we
won't have the chance of our current position being deleted.

Eric

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  2:33       ` Andrew Morton
  2006-07-25  2:50         ` KAMEZAWA Hiroyuki
@ 2006-07-25  7:22         ` Paul Jackson
  1 sibling, 0 replies; 27+ messages in thread
From: Paul Jackson @ 2006-07-25  7:22 UTC (permalink / raw)
  To: Andrew Morton; +Cc: kamezawa.hiroyu, linux-kernel, ebiederm, Albert Cahalan

Andrew wrote:
> We cannot do a single kmalloc() like cpuset does.

Ok ...

Well, since you're so impressed with the studliness of that idea
<grin>, how about this:

  Add a 'false link' to the .next task list.

Each diropen on /proc and each open on a cpuset 'tasks' file would
add one such 'false link' to the task list, representing that file
descriptors current seek offset in the task list.

A 'false link' would be a task_struct that was almost entirely unused,
except to mark the offset in the task list of a file descriptor open
on it (for /proc or cpuset 'tasks' files.)

The 'normal' do_each_thread/while_each_thread and related macros would
silently skip over these false links.

The /proc and cpuset 'tasks' code would use special macros that could
see the false link representing its current seek offset, and be able
to implement read and seek operations relative to that position.

Remove this 'false link' on the final close of the file descriptor
holding it.

This reduces the memory cost of an open on /proc or a 'tasks' file to
the size of a single 'false link' task_struct.

It -would- add another test and jump to the critical do_each_thread and
while_each_thread macros, to skip over 'false links'.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-07-25  3:16           ` KAMEZAWA Hiroyuki
@ 2006-08-13 16:29             ` Eric W. Biederman
  2006-08-13 17:34               ` Andrew Morton
  2006-08-13 20:08               ` Albert Cahalan
  0 siblings, 2 replies; 27+ messages in thread
From: Eric W. Biederman @ 2006-08-13 16:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: akpm, pj, linux-kernel, Albert Cahalan

KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> writes:

> On Tue, 25 Jul 2006 11:50:04 +0900
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
>> BTW, how large pids and how many processes in a (heavy and big) system ?
>> 
> I found
> /*
>  * A maximum of 4 million PIDs should be enough for a while.
>  * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
>  */
> #define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
>         (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
>
> ...we have to manage 4 millions tids.
>
> I'll try to make use of pidmap array which is already maintained in pid.c
> and to avoid using extra memory.

Has any progress been made on this front?

I know I was initially discouraging.

I looked up the posix definition for readdir and thought about this
some more and I do agree that this is a problem that needs to be fixed.

The basic posix/susv guarantee is that in readdir if a directory
entry is neither deleted nor added between opendir and closedir of the
directory you should see the directory entry.   I could not
quite tell what the rules were with regards seekdir.

That is a sane set of guarantees to build a userspace implementation
on.

Since the current implementation does not provide those guarantees
it is hard to build a normal user space implementation, because
the guarantees provided are not the normal ones, and the current
guarantees provided (it works if you have a big enough readdir
buffer) aren't terribly useful as they require several hundred
megabytes in the worst case.

There are also other reasons to change to a pid-based traversal
of /proc.  It allows us to display information on process groups,
and sessions whose original leader has died.  If namespaces get
assigned a pid traversal by pid looks like a good way to display
namespaces that are not used by any process but are still alive.
Albert does that sound like a sane extension?

I also have always liked the idea of a different data structure
because hash tables make it ugly when it comes to implementing
namespaces.  But I believe the current hash table is generally better
than what I have seen proposed, as replacements.

In the normal case the current hash has 4096 entries and pid_max
is set to 32768.  In the normal case that means we have a single
hash table lookup, which is very cache friendly since we only have
the single cache line miss.  Not always but largely we can assume
any lookup in the data structure is going to be a cache miss.  pid
traversal might be different I'm not certain. 

Our worst case with the current hash table with a pid_max of 32768 is
traversing a linked list 9 items long. (I just computed this by brute
force).  Of course when you push pid_max to the maximum of
(4*1024*1024) the hash chains can get as long as 1024 entries which
is distressing, but at least uniformly distributed.

Comparing that with a radix tree with a height of 6.  For 32768 pids
we get a usual height of ceil(15/6) = 3 and in the worst case
ceil(22/6) = 4.

Our hash function distributes things well which is good.  It is a
known function which means an attacker can predict it, and with a
series of fork/exit can within some reasonable bounds pick which
pids are getting used. A cryptographic hash will not help because
the input space can be trivially brute force searched.

So for systems that are going to be using a larger number of pid
values I think we need a better data structure, and containers are
likely to push us in that area.  Which means either an extensible
hash table or radix tree look like the sane choices.


Eric


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-13 16:29             ` Eric W. Biederman
@ 2006-08-13 17:34               ` Andrew Morton
  2006-08-13 19:00                 ` Eric W. Biederman
  2006-08-13 20:08               ` Albert Cahalan
  1 sibling, 1 reply; 27+ messages in thread
From: Andrew Morton @ 2006-08-13 17:34 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: KAMEZAWA Hiroyuki, pj, linux-kernel, Albert Cahalan

On Sun, 13 Aug 2006 10:29:51 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:

> So for systems that are going to be using a larger number of pid
> values I think we need a better data structure, and containers are
> likely to push us in that area.  Which means either an extensible
> hash table or radix tree look like the sane choices.

radix-trees are nice because you can efficiently traverse them in-order
while the contents are changing (albeit potentially missing newly-added
things, but that's inevitable).

radix-trees are not-nice because they allocate memory at insertion time. 
If that's a problem then rbtrees could perhaps be used.

idr-trees have similar characteristics to the radix-trees, except a) the
idr-tree find-next-above feature could perhaps be used for the core pid
allocation and b) idr-trees don't presently have suitable search functions
for performing the iteration.

At least we have plenty of choices ;)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-13 17:34               ` Andrew Morton
@ 2006-08-13 19:00                 ` Eric W. Biederman
  2006-08-13 19:12                   ` Paul Jackson
  0 siblings, 1 reply; 27+ messages in thread
From: Eric W. Biederman @ 2006-08-13 19:00 UTC (permalink / raw)
  To: Andrew Morton; +Cc: KAMEZAWA Hiroyuki, pj, linux-kernel, Albert Cahalan

Andrew Morton <akpm@osdl.org> writes:

> On Sun, 13 Aug 2006 10:29:51 -0600
> ebiederm@xmission.com (Eric W. Biederman) wrote:
>
>> So for systems that are going to be using a larger number of pid
>> values I think we need a better data structure, and containers are
>> likely to push us in that area.  Which means either an extensible
>> hash table or radix tree look like the sane choices.
>
> radix-trees are nice because you can efficiently traverse them in-order
> while the contents are changing (albeit potentially missing newly-added
> things, but that's inevitable).

Actually except when we can't find the process we were just at
the current code doesn't miss any newly added processes.  So there
are implementations in which missing new entries isn't inevitable.

> radix-trees are not-nice because they allocate memory at insertion time. 
> If that's a problem then rbtrees could perhaps be used.

It isn't fundamental, as I do memory allocations on that path.  But
it does appear to be a implementation conflict as the current locking
is a spinlock with irqs disabled, and doing a GFP_ATOMIC allocation
to avoid that is fairly silly.

If we were to lose the potential to do rcu traversals when looking
up a single pid, that would make scaling the code much harder.

> idr-trees have similar characteristics to the radix-trees, except a) the
> idr-tree find-next-above feature could perhaps be used for the core pid
> allocation and b) idr-trees don't presently have suitable search functions
> for performing the iteration.

We have to be careful about changing the pid allocation algorithm.
We need to keep the logic where we ensure it will be a long time
before a newly freed pid is reused.  Which basically means the current
implementation that walks through possible pids allocating new ones.
The current implementation doesn't give us any guarantees, but it is
much better than the normal allocator algorithm of allocating the next
available pid.

> At least we have plenty of choices ;)

Yes, and I'm still not quite convinced we have a problem that needs a
new data structure.  But with everything becoming multi-core there are
serious pressures in that direction.

Anyway the important part is to get a traversal by pid.

Eric


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-13 19:00                 ` Eric W. Biederman
@ 2006-08-13 19:12                   ` Paul Jackson
  2006-08-16  1:23                     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 27+ messages in thread
From: Paul Jackson @ 2006-08-13 19:12 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: akpm, kamezawa.hiroyu, linux-kernel, acahalan

Eric wrote:
> Actually except when we can't find the process we were just at
> the current code doesn't miss any newly added processes. 

Random thought -- could we have file descriptors open on /proc put some
sort of 'hold' on whatever /proc entry they were just at, so it doesn't
disappear out from under them, even if that process has otherwise fully
exited?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-13 16:29             ` Eric W. Biederman
  2006-08-13 17:34               ` Andrew Morton
@ 2006-08-13 20:08               ` Albert Cahalan
  2006-08-16  2:20                 ` Kyle Moffett
  1 sibling, 1 reply; 27+ messages in thread
From: Albert Cahalan @ 2006-08-13 20:08 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: KAMEZAWA Hiroyuki, akpm, pj, linux-kernel

On 8/13/06, Eric W. Biederman <ebiederm@xmission.com> wrote:
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> writes:

> > /*
> >  * A maximum of 4 million PIDs should be enough for a while.
> >  * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
> >  */
> > #define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
> >         (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
> >
> > ...we have to manage 4 millions tids.

BTW, it looks like powerpc runs out of MMU contexts if
there are more than 32768 processes. Badness happens.

> The basic posix/susv guarantee is that in readdir if a directory
> entry is neither deleted nor added between opendir and closedir of the
> directory you should see the directory entry.   I could not
> quite tell what the rules were with regards seekdir.

Never minding the bare minimum, I think the user-visible
offset should be the PID plus some constant for all PIDs.
(sadly, the constant can't be 0 because of ".." and init)

> There are also other reasons to changing to a pid base traversal
> of /proc.  It allows us to display information on process groups,
> and sessions whose original leader has died.

Huh?

> If namespaces get
> assigned a pid traversal by pid looks like a good way to display
> namespaces that are not used by any process but are still alive.
> Albert does that sound like a sane extension?

You mean /proc/42 could be non-process data?
That will surely break lots and lots of things.

In general, process namespaces are useful for:

1. silly marketing (see Sun and FreeBSD)

2. the very obscure case of "root" account providers
   who are too clueless to use SE Linux or Xen

I don't think either case justifies the complexity.
I am not looking forward to the demands that I
support this mess in procps. I suspect I am not
alone; soon people will be asking for support in
pstools, gdb, fuser, killall... until every app which
interacts with other processes will need hacks.

If the cost were only an #ifdef in the kernel, there
would be no problem. Unfortunately, this is quite
a hack in the kernel and it has far-reaching
consequences in user space.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-13 19:12                   ` Paul Jackson
@ 2006-08-16  1:23                     ` KAMEZAWA Hiroyuki
  2006-08-17  4:59                       ` Eric W. Biederman
  0 siblings, 1 reply; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-08-16  1:23 UTC (permalink / raw)
  To: Paul Jackson; +Cc: ebiederm, akpm, linux-kernel, acahalan

On Sun, 13 Aug 2006 12:12:22 -0700
Paul Jackson <pj@sgi.com> wrote:

> Eric wrote:
> > Actually except when we can't find the process we were just at
> > the current code doesn't miss any newly added processes. 
> 
> Random thought -- could we have file descriptors open on /proc put some
> sort of 'hold' on whatever /proc entry they were just at, so it doesn't
> disappear out from under them, even if that process has otherwise fully
> exited?
> 

Sorry for the long absence, I was on vacation.

I and my colleague  are still working on ps command fix.

For holding, I have a patch to insert a token in a list to remember its
position, but my colleague may have another thought.

-Kame


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-13 20:08               ` Albert Cahalan
@ 2006-08-16  2:20                 ` Kyle Moffett
  0 siblings, 0 replies; 27+ messages in thread
From: Kyle Moffett @ 2006-08-16  2:20 UTC (permalink / raw)
  To: Albert Cahalan
  Cc: Eric W. Biederman, KAMEZAWA Hiroyuki, akpm, pj, linux-kernel

On Aug 13, 2006, at 16:08:20, Albert Cahalan wrote:
> In general, process namespaces are useful for:
>
> 1. silly marketing (see Sun and FreeBSD)
>
> 2. the very obscure case of "root" account providers who are too  
> clueless to use SE Linux or Xen

3. Kernel developers who need to test their kernel changes on  
multiple distributions can install those distributions unmodified in  
containers (requires process namespaces)

4. Server admins who want an absolutely unbreakable way (well, as  
close as one can get) to isolate virtual servers from each other and  
from the hardware.  This also doesn't have the overhead of running 2  
stacked memory allocators (Xen allocator and kernel allocator) as  
well as multiple copies of each kernel.

> I don't think either case justifies the complexity.  I am not  
> looking forward to the demands that I support this mess in procps.  
> I suspect I am not alone; soon people will be asking for support in  
> pstools, gdb, fuser, killall... until every app which interacts  
> with other processes will need hacks.

IMHO support for PID namespaces should/would/will be done without  
changing the existing /proc entries or underlying layout.  For  
example, one compatible solution would be to add a new "pidspace="  
mount option to procfs which specifies which pidspace will be  
represented.  Another method (also compatible with the pidspace=  
mount option and enabling hierarchical pid spaces would be to  
represent child pidspaces in their parent pidspace as a single  
process, such that sending signals to said process will send signals  
to the "init" process in the child pidspace as though they came from  
the kernel (IE: They aren't blocked-by-default), then add a  
"pidspace" subdirectory that contains all of the proc entries for  
processes in that space.  Example:

# mount -t proc proc /proc
# create_new_pidspace /bin/sleep 100000 &
[1] 1234
# ls /proc
...
...
1234
...
...
# ls /proc/1234/pidspace
1

There are obviously still problems with this scenario (what numbering  
schemes do pidspaces use and how is the pidspace= mount option  
interpreted?), but it's an example of a few ways to preserve ABI  
compatibility with existing userspace.  I think the idea behind all  
of this is to make it possible to run 6 different unmodified distros  
on a single system (but you need a few extra tools in the parent  
"boot" namespace).

> If the cost were only an #ifdef in the kernel, there would be no  
> problem. Unfortunately, this is quite a hack in the kernel and it  
> has far-reaching consequences in user space.

I believe it's been stated that rule #1 for this stuff is preserving  
backwards compatibility and kernel<=>userspace ABI with every  
change.  We might add new APIs (which then have to remain the same in  
future versions, barring outright showstopper bugs), but existing  
APIs will still work correctly.

Cheers,
Kyle Moffett


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-16  1:23                     ` KAMEZAWA Hiroyuki
@ 2006-08-17  4:59                       ` Eric W. Biederman
  2006-08-17  6:32                         ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 27+ messages in thread
From: Eric W. Biederman @ 2006-08-17  4:59 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Paul Jackson, akpm, linux-kernel, acahalan, Jean Delvare


So I just threw something together that seems to work.

All of the pid are listed in order and the next used pid is found by
scanning the pid bitmap. 

Scanning the pid bitmap might be a little heavy but actually is likely
quite cache friendly as the accesses are extremely predictable, and
everything should fit into 64 cache lines, which is fewer cache lines
than walking a tree, and more predictable.  Of course I turn around
and then do a hash table lookup which is at least one more cache line
and probably an unpredictable one at that. 

The point despite the brute force nature I have no reason to suspect
this will run any noticeably slower than the current implementation.

Look at this try it out and if this solves the problem we can push
this upstream.

Eric
---

 fs/proc/base.c      |   82 ++++++++--------------------------------------------
 include/linux/pid.h |    1 
 kernel/pid.c        |   35 ++++++++++++++++++++++
 3 files changed, 50 insertions(+), 68 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9943527..6f7433c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1813,70 +1813,17 @@ out:
 }
 
 /*
- * Find the first tgid to return to user space.
+ * Find the first task with tgid >= tgid
  *
- * Usually this is just whatever follows &init_task, but if the users
- * buffer was too small to hold the full list or there was a seek into
- * the middle of the directory we have more work to do.
- *
- * In the case of a short read we start with find_task_by_pid.
- *
- * In the case of a seek we start with &init_task and walk nr
- * threads past it.
  */
-static struct task_struct *first_tgid(int tgid, unsigned int nr)
+static struct task_struct *next_tgid(unsigned int tgid)
 {
-	struct task_struct *pos;
-	rcu_read_lock();
-	if (tgid && nr) {
-		pos = find_task_by_pid(tgid);
-		if (pos && thread_group_leader(pos))
-			goto found;
-	}
-	/* If nr exceeds the number of processes get out quickly */
-	pos = NULL;
-	if (nr && nr >= nr_processes())
-		goto done;
-
-	/* If we haven't found our starting place yet start with
-	 * the init_task and walk nr tasks forward.
-	 */
-	for (pos = next_task(&init_task); nr > 0; --nr) {
-		pos = next_task(pos);
-		if (pos == &init_task) {
-			pos = NULL;
-			goto done;
-		}
-	}
-found:
-	get_task_struct(pos);
-done:
-	rcu_read_unlock();
-	return pos;
-}
-
-/*
- * Find the next task in the task list.
- * Return NULL if we loop or there is any error.
- *
- * The reference to the input task_struct is released.
- */
-static struct task_struct *next_tgid(struct task_struct *start)
-{
-	struct task_struct *pos;
+	struct task_struct *task;
 	rcu_read_lock();
-	pos = start;
-	if (pid_alive(start))
-		pos = next_task(start);
-	if (pid_alive(pos) && (pos != &init_task)) {
-		get_task_struct(pos);
-		goto done;
-	}
-	pos = NULL;
-done:
+	task = get_pid_task(find_next_pid(tgid), PIDTYPE_PID);
 	rcu_read_unlock();
-	put_task_struct(start);
-	return pos;
+	return task;
+	
 }
 
 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
@@ -1888,6 +1835,8 @@ static int proc_pid_fill_cache(struct fi
 				proc_pid_instantiate, task, NULL);
 }
 
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff) - 1)
+
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
@@ -1906,23 +1855,20 @@ int proc_pid_readdir(struct file * filp,
 	}
 	nr -= ARRAY_SIZE(proc_base_stuff) - 1;
 
-	/* f_version caches the tgid value that the last readdir call couldn't
-	 * return. lseek aka telldir automagically resets f_version to 0.
-	 */
-	tgid = filp->f_version;
-	filp->f_version = 0;
-	for (task = first_tgid(tgid, nr);
+	tgid = filp->f_pos - TGID_OFFSET;
+	for (task = next_tgid(tgid);
 	     task;
-	     task = next_tgid(task), filp->f_pos++) {
+	     task = next_tgid(tgid + 1)) {
 		tgid = task->pid;
+		filp->f_pos = tgid + TGID_OFFSET;
 		if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) {
 			/* returning this tgid failed, save it as the first
 			 * pid for the next readir call */
-			filp->f_version = tgid;
 			put_task_struct(task);
-			break;
+			goto out;
 		}
 	}
+	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
 out:
 	put_task_struct(reaper);
 out_no_task:
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 9fd547f..d06d4ba 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -89,6 +89,7 @@ extern struct pid *FASTCALL(find_pid(int
  * Lookup a PID in the hash table, and return with it's count elevated.
  */
 extern struct pid *find_get_pid(int nr);
+extern struct pid *find_next_pid(int nr);
 
 extern struct pid *alloc_pid(void);
 extern void FASTCALL(free_pid(struct pid *pid));
diff --git a/kernel/pid.c b/kernel/pid.c
index 40e8e4d..112ff2a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -145,6 +145,23 @@ static int alloc_pidmap(void)
 	return -1;
 }
 
+static int next_pidmap(int last)
+{
+	int offset;
+	pidmap_t *map;
+
+	offset = (last + 1) & BITS_PER_PAGE_MASK;
+	map = &pidmap_array[(last + 1)/BITS_PER_PAGE];
+	for (; map < &pidmap_array[PIDMAP_ENTRIES]; map++, offset = 0) {
+		if (unlikely(!map->page))
+			continue;
+		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
+		if (offset < BITS_PER_PAGE) 
+			return mk_pid(map, offset);
+	}
+	return -1;
+}
+
 fastcall void put_pid(struct pid *pid)
 {
 	if (!pid)
@@ -307,6 +324,24 @@ struct pid *find_get_pid(pid_t nr)
 EXPORT_SYMBOL_GPL(find_get_pid);
 
 /*
+ * Used by proc to find the pid with the next greater number.
+ * Specifying nr is used to handle the seek case.
+ */
+struct pid *find_next_pid(int nr)
+{
+	struct pid *next;
+
+	next = find_pid(nr);
+	while (!next) {
+		nr = next_pidmap(nr);
+		if (nr <= 0)
+			break;
+		next = find_pid(nr);
+	}
+	return next;
+}
+
+/*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
  * more.

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-17  4:59                       ` Eric W. Biederman
@ 2006-08-17  6:32                         ` KAMEZAWA Hiroyuki
  2006-08-17 13:39                           ` Eric W. Biederman
  2006-08-18  3:53                           ` Eric W. Biederman
  0 siblings, 2 replies; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-08-17  6:32 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: pj, akpm, linux-kernel, acahalan, jdelvare

On Wed, 16 Aug 2006 22:59:50 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:

> 
> So I just threw something together that seems to work.
> 
> All of the pid are listed in order and the next used pid is found by
> scanning the pid bitmap. 
> 
> Scanning the pid bitmap might be a little heavy but actually is likely
> quite cache friendly as the accesses are extremely predictable, and
> everything should fit into 64 cache lines, which is fewer cache lines
> than walking a tree, and more predictable.  Of course I turn around
> and then do a hash table lookup which is at least one more cache line
> and probably an unpredictable one at that. 
> 
> The point despite the brute force nature I have no reason to suspect
> this will run any noticeably slower than the current implementation.
> 
> Look at this try it out and if this solves the problem we can push
> this upstream.
> 
At first, Thanks.

question:

	task = get_pid_task(find_next_pid(tgid), PIDTYPE_PID);

Does this return only "task/process" ? and never return "thread" ?

Another concern of mine is that newly-created processes while ps is running cannot be caught
by this kind of "table walking" approach (as in my previous work).
But if people say O.K, I have no complaint.

I(we)'ll post another list-based one in the next week, anyway.
(I can't go-ahead this week...)

-Kame


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-17  6:32                         ` KAMEZAWA Hiroyuki
@ 2006-08-17 13:39                           ` Eric W. Biederman
  2006-08-17 18:16                             ` Jean Delvare
  2006-08-18  0:21                             ` KAMEZAWA Hiroyuki
  2006-08-18  3:53                           ` Eric W. Biederman
  1 sibling, 2 replies; 27+ messages in thread
From: Eric W. Biederman @ 2006-08-17 13:39 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: pj, akpm, linux-kernel, acahalan, jdelvare

KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> writes:

> On Wed, 16 Aug 2006 22:59:50 -0600
> ebiederm@xmission.com (Eric W. Biederman) wrote:
>
>> 
>> So I just threw something together that seems to work.
>> 
>> All of the pid are listed in order and the next used pid is found by
>> scanning the pid bitmap. 
>> 
>> Scanning the pid bitmap might be a little heavy but actually is likely
>> quite cache friendly as the accesses are extremely predictable, and
>> everything should fit into 64 cache lines, which is fewer cache lines
>> than walking a tree, and more predictable.  Of course I turn around
>> and then do a hash table lookup which is at least one more cache line
>> and probably an unpredictable one at that. 
>> 
>> The point despite the brute force nature I have no reason to suspect
>> this will run any noticeably slower than the current implementation.
>> 
>> Look at this try it out and if this solves the problem we can push
>> this upstream.
>> 
> At first, Thanks.
>
> question:
>
> 	task = get_pid_task(find_next_pid(tgid), PIDTYPE_PID);
>
> Does this return only "task/process" ? and never return "thread" ?
>
> My another concern is that newly-created-process while ps running cannot be
> catched
> by this kind of "table walking" approach (as my previous work)
> But if people say O.K, I have no complaint.
>
> I(we)'ll post another list-based one in the next week, anyway.
> (I can't go-ahead this week...)
>

Here is a respin with a fixed version of next_tgid.  I believe
I have all of the silly corner cases handled.  So if the pid
I have found is only a process group or session id or not
a thread_group leader it should not be ignored.

> static struct task_struct *next_tgid(unsigned int tgid)
> {
> 	struct task_struct *task;
> 	struct pid *pid;
> 
> 	task = NULL;
> 	rcu_read_lock();
> retry:
> 	pid = find_next_pid(tgid);
> 	if (pid) {
> 		tgid = pid->nr + 1;
> 		task = pid_task(pid, PIDTYPE_PID);
> 		if (!task || !thread_group_leader(task))
> 			goto retry;
> 		get_task_struct(task);
> 	}
> 	rcu_read_unlock();
> 	return task;
> 	
> }

Seeking should now just work.

---
 fs/proc/base.c      |   89 +++++++++++++---------------------------------------
 include/linux/pid.h |    1 
 kernel/pid.c        |   35 ++++++++++++++++++++
 3 files changed, 59 insertions(+), 66 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9943527..05dc244 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1813,70 +1813,28 @@ out:
 }
 
 /*
- * Find the first tgid to return to user space.
+ * Find the first task with tgid >= tgid
  *
- * Usually this is just whatever follows &init_task, but if the users
- * buffer was too small to hold the full list or there was a seek into
- * the middle of the directory we have more work to do.
- *
- * In the case of a short read we start with find_task_by_pid.
- *
- * In the case of a seek we start with &init_task and walk nr
- * threads past it.
  */
-static struct task_struct *first_tgid(int tgid, unsigned int nr)
+static struct task_struct *next_tgid(unsigned int tgid)
 {
-	struct task_struct *pos;
-	rcu_read_lock();
-	if (tgid && nr) {
-		pos = find_task_by_pid(tgid);
-		if (pos && thread_group_leader(pos))
-			goto found;
-	}
-	/* If nr exceeds the number of processes get out quickly */
-	pos = NULL;
-	if (nr && nr >= nr_processes())
-		goto done;
-
-	/* If we haven't found our starting place yet start with
-	 * the init_task and walk nr tasks forward.
-	 */
-	for (pos = next_task(&init_task); nr > 0; --nr) {
-		pos = next_task(pos);
-		if (pos == &init_task) {
-			pos = NULL;
-			goto done;
-		}
-	}
-found:
-	get_task_struct(pos);
-done:
-	rcu_read_unlock();
-	return pos;
-}
+	struct task_struct *task;
+	struct pid *pid;
 
-/*
- * Find the next task in the task list.
- * Return NULL if we loop or there is any error.
- *
- * The reference to the input task_struct is released.
- */
-static struct task_struct *next_tgid(struct task_struct *start)
-{
-	struct task_struct *pos;
+	task = NULL;
 	rcu_read_lock();
-	pos = start;
-	if (pid_alive(start))
-		pos = next_task(start);
-	if (pid_alive(pos) && (pos != &init_task)) {
-		get_task_struct(pos);
-		goto done;
+retry:
+	pid = find_next_pid(tgid);
+	if (pid) {
+		tgid = pid->nr + 1;
+		task = pid_task(pid, PIDTYPE_PID);
+		if (!task || !thread_group_leader(task))
+			goto retry;
+		get_task_struct(task);
 	}
-	pos = NULL;
-done:
 	rcu_read_unlock();
-	put_task_struct(start);
-	return pos;
+	return task;
+	
 }
 
 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
@@ -1888,6 +1846,8 @@ static int proc_pid_fill_cache(struct fi
 				proc_pid_instantiate, task, NULL);
 }
 
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff) - 1)
+
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
@@ -1906,23 +1866,20 @@ int proc_pid_readdir(struct file * filp,
 	}
 	nr -= ARRAY_SIZE(proc_base_stuff) - 1;
 
-	/* f_version caches the tgid value that the last readdir call couldn't
-	 * return. lseek aka telldir automagically resets f_version to 0.
-	 */
-	tgid = filp->f_version;
-	filp->f_version = 0;
-	for (task = first_tgid(tgid, nr);
+	tgid = filp->f_pos - TGID_OFFSET;
+	for (task = next_tgid(tgid);
 	     task;
-	     task = next_tgid(task), filp->f_pos++) {
+	     task = next_tgid(tgid + 1)) {
 		tgid = task->pid;
+		filp->f_pos = tgid + TGID_OFFSET;
 		if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) {
 			/* returning this tgid failed, save it as the first
 			 * pid for the next readir call */
-			filp->f_version = tgid;
 			put_task_struct(task);
-			break;
+			goto out;
 		}
 	}
+	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
 out:
 	put_task_struct(reaper);
 out_no_task:
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 9fd547f..d06d4ba 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -89,6 +89,7 @@ extern struct pid *FASTCALL(find_pid(int
  * Lookup a PID in the hash table, and return with it's count elevated.
  */
 extern struct pid *find_get_pid(int nr);
+extern struct pid *find_next_pid(int nr);
 
 extern struct pid *alloc_pid(void);
 extern void FASTCALL(free_pid(struct pid *pid));
diff --git a/kernel/pid.c b/kernel/pid.c
index 40e8e4d..112ff2a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -145,6 +145,23 @@ static int alloc_pidmap(void)
 	return -1;
 }
 
+static int next_pidmap(int last)
+{
+	int offset;
+	pidmap_t *map;
+
+	offset = (last + 1) & BITS_PER_PAGE_MASK;
+	map = &pidmap_array[(last + 1)/BITS_PER_PAGE];
+	for (; map < &pidmap_array[PIDMAP_ENTRIES]; map++, offset = 0) {
+		if (unlikely(!map->page))
+			continue;
+		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
+		if (offset < BITS_PER_PAGE) 
+			return mk_pid(map, offset);
+	}
+	return -1;
+}
+
 fastcall void put_pid(struct pid *pid)
 {
 	if (!pid)
@@ -307,6 +324,24 @@ struct pid *find_get_pid(pid_t nr)
 EXPORT_SYMBOL_GPL(find_get_pid);
 
 /*
+ * Used by proc to find the pid with the next greater number.
+ * Specifying nr is used to handle the seek case.
+ */
+struct pid *find_next_pid(int nr)
+{
+	struct pid *next;
+
+	next = find_pid(nr);
+	while (!next) {
+		nr = next_pidmap(nr);
+		if (nr <= 0)
+			break;
+		next = find_pid(nr);
+	}
+	return next;
+}
+
+/*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
  * more.

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-17 13:39                           ` Eric W. Biederman
@ 2006-08-17 18:16                             ` Jean Delvare
  2006-08-18  0:21                             ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 27+ messages in thread
From: Jean Delvare @ 2006-08-17 18:16 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: KAMEZAWA Hiroyuki, pj, akpm, linux-kernel, acahalan

Eric,

On Thursday 17 August 2006 15:39, Eric W. Biederman wrote:
> Here is a respin with a fixed version of next_tgid.  I believe
> I have all of the silly corner cases handled.  So if the pid
> I have found is only a process group or session id or not
> a thread_group leader it should not be ignored.

On what tree is this patch based? It doesn't appear to apply to 
2.6.18-rc4, nor 2.6.18-rc4-mm1.

Thanks,
-- 
Jean Delvare

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-17 13:39                           ` Eric W. Biederman
  2006-08-17 18:16                             ` Jean Delvare
@ 2006-08-18  0:21                             ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-08-18  0:21 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: pj, akpm, linux-kernel, acahalan, jdelvare

On Thu, 17 Aug 2006 07:39:05 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:

> > static struct task_struct *next_tgid(unsigned int tgid)
> > {
> > 	struct task_struct *task;
> > 	struct pid *pid;
> > 
> > 	task = NULL;
> > 	rcu_read_lock();
> > retry:
> > 	pid = find_next_pid(tgid);
> > 	if (pid) {
> > 		tgid = pid->nr + 1;
> > 		task = pid_task(pid, PIDTYPE_PID);
> > 		if (!task || !thread_group_leader(task))
> > 			goto retry;
> > 		get_task_struct(task);
> > 	}
> > 	rcu_read_unlock();
> > 	return task;
> > 	
> > }
> 
> Seeking should now just work.
> 

The above function looks good. Is this patch based on the 'struct pid' patch?

BTW, the whole story is..
==
last_read_pid = find_next_pid(x);
possible_next_pid = last_read_pid + 1;
find_next_pid( possible_next_pid)
<call>-> if ( find_pid(possible_next_pid) == 0) {
		next = next_pidmap(possible_next_pid + 1);
	 } else {
		next = possible_next_pid;
	}
==

Why not just do this by
==
next = next_pidmap(last_read_pid) ?
==

I don't think existing pids are contiguous in usual system.

-Kame


> ---
>  fs/proc/base.c      |   89 +++++++++++++---------------------------------------
>  include/linux/pid.h |    1 
>  kernel/pid.c        |   35 ++++++++++++++++++++
>  3 files changed, 59 insertions(+), 66 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 9943527..05dc244 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1813,70 +1813,28 @@ out:
>  }
>  
>  /*
> - * Find the first tgid to return to user space.
> + * Find the first task with tgid >= tgid
>   *
> - * Usually this is just whatever follows &init_task, but if the users
> - * buffer was too small to hold the full list or there was a seek into
> - * the middle of the directory we have more work to do.
> - *
> - * In the case of a short read we start with find_task_by_pid.
> - *
> - * In the case of a seek we start with &init_task and walk nr
> - * threads past it.
>   */
> -static struct task_struct *first_tgid(int tgid, unsigned int nr)
> +static struct task_struct *next_tgid(unsigned int tgid)
>  {
> -	struct task_struct *pos;
> -	rcu_read_lock();
> -	if (tgid && nr) {
> -		pos = find_task_by_pid(tgid);
> -		if (pos && thread_group_leader(pos))
> -			goto found;
> -	}
> -	/* If nr exceeds the number of processes get out quickly */
> -	pos = NULL;
> -	if (nr && nr >= nr_processes())
> -		goto done;
> -
> -	/* If we haven't found our starting place yet start with
> -	 * the init_task and walk nr tasks forward.
> -	 */
> -	for (pos = next_task(&init_task); nr > 0; --nr) {
> -		pos = next_task(pos);
> -		if (pos == &init_task) {
> -			pos = NULL;
> -			goto done;
> -		}
> -	}
> -found:
> -	get_task_struct(pos);
> -done:
> -	rcu_read_unlock();
> -	return pos;
> -}
> +	struct task_struct *task;
> +	struct pid *pid;
>  
> -/*
> - * Find the next task in the task list.
> - * Return NULL if we loop or there is any error.
> - *
> - * The reference to the input task_struct is released.
> - */
> -static struct task_struct *next_tgid(struct task_struct *start)
> -{
> -	struct task_struct *pos;
> +	task = NULL;
>  	rcu_read_lock();
> -	pos = start;
> -	if (pid_alive(start))
> -		pos = next_task(start);
> -	if (pid_alive(pos) && (pos != &init_task)) {
> -		get_task_struct(pos);
> -		goto done;
> +retry:
> +	pid = find_next_pid(tgid);
> +	if (pid) {
> +		tgid = pid->nr + 1;
> +		task = pid_task(pid, PIDTYPE_PID);
> +		if (!task || !thread_group_leader(task))
> +			goto retry;
> +		get_task_struct(task);
>  	}
> -	pos = NULL;
> -done:
>  	rcu_read_unlock();
> -	put_task_struct(start);
> -	return pos;
> +	return task;
> +	
>  }
>  
>  static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
> @@ -1888,6 +1846,8 @@ static int proc_pid_fill_cache(struct fi
>  				proc_pid_instantiate, task, NULL);
>  }
>  
> +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff) - 1)
> +
>  /* for the /proc/ directory itself, after non-process stuff has been done */
>  int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
>  {
> @@ -1906,23 +1866,20 @@ int proc_pid_readdir(struct file * filp,
>  	}
>  	nr -= ARRAY_SIZE(proc_base_stuff) - 1;
>  
> -	/* f_version caches the tgid value that the last readdir call couldn't
> -	 * return. lseek aka telldir automagically resets f_version to 0.
> -	 */
> -	tgid = filp->f_version;
> -	filp->f_version = 0;
> -	for (task = first_tgid(tgid, nr);
> +	tgid = filp->f_pos - TGID_OFFSET;
> +	for (task = next_tgid(tgid);
>  	     task;
> -	     task = next_tgid(task), filp->f_pos++) {
> +	     task = next_tgid(tgid + 1)) {
>  		tgid = task->pid;
> +		filp->f_pos = tgid + TGID_OFFSET;
>  		if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) {
>  			/* returning this tgid failed, save it as the first
>  			 * pid for the next readir call */
> -			filp->f_version = tgid;
>  			put_task_struct(task);
> -			break;
> +			goto out;
>  		}
>  	}
> +	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
>  out:
>  	put_task_struct(reaper);
>  out_no_task:
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 9fd547f..d06d4ba 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -89,6 +89,7 @@ extern struct pid *FASTCALL(find_pid(int
>   * Lookup a PID in the hash table, and return with it's count elevated.
>   */
>  extern struct pid *find_get_pid(int nr);
> +extern struct pid *find_next_pid(int nr);
>  
>  extern struct pid *alloc_pid(void);
>  extern void FASTCALL(free_pid(struct pid *pid));
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 40e8e4d..112ff2a 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -145,6 +145,23 @@ static int alloc_pidmap(void)
>  	return -1;
>  }
>  
> +static int next_pidmap(int last)
> +{
> +	int offset;
> +	pidmap_t *map;
> +
> +	offset = (last + 1) & BITS_PER_PAGE_MASK;
> +	map = &pidmap_array[(last + 1)/BITS_PER_PAGE];
> +	for (; map < &pidmap_array[PIDMAP_ENTRIES]; map++, offset = 0) {
> +		if (unlikely(!map->page))
> +			continue;
> +		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
> +		if (offset < BITS_PER_PAGE) 
> +			return mk_pid(map, offset);
> +	}
> +	return -1;
> +}
> +
>  fastcall void put_pid(struct pid *pid)
>  {
>  	if (!pid)
> @@ -307,6 +324,24 @@ struct pid *find_get_pid(pid_t nr)
>  EXPORT_SYMBOL_GPL(find_get_pid);
>  
>  /*
> + * Used by proc to find the pid with the next greater number.
> + * Specifying nr is used to handle the seek case.
> + */
> +struct pid *find_next_pid(int nr)
> +{
> +	struct pid *next;
> +
> +	next = find_pid(nr);
> +	while (!next) {
> +		nr = next_pidmap(nr);
> +		if (nr <= 0)
> +			break;
> +		next = find_pid(nr);
> +	}
> +	return next;
> +}
> +
> +/*
>   * The pid hash table is scaled according to the amount of memory in the
>   * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
>   * more.
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
  2006-08-17  6:32                         ` KAMEZAWA Hiroyuki
  2006-08-17 13:39                           ` Eric W. Biederman
@ 2006-08-18  3:53                           ` Eric W. Biederman
  1 sibling, 0 replies; 27+ messages in thread
From: Eric W. Biederman @ 2006-08-18  3:53 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: pj, akpm, linux-kernel, acahalan, jdelvare


Hmm...  I forgot to hit send.

KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> writes:

> At first, Thanks.
>
> question:
>
> 	task = get_pid_task(find_next_pid(tgid), PIDTYPE_PID);
>
> Does this return only "task/process" ? and never return "thread" ?

Good point.  I don't think I'm filtering for thread group leaders here.
That should take a couple more lines of code.

> My another concern is that newly-created-process while ps running cannot be
> caught
> by this kind of "table walking" approach (as my previous work)
> But if people say O.K, I have no complaint.

Well it can but only if the newly created processes have a higher pid.

The guarantee that POSIX readdir makes is that you will see all directory
entries that are neither added nor deleted during the readdir.

> I(we)'ll post another list-based one in the next week, anyway.
> (I can't go-ahead this week...)

Where I see what I'm doing as being superior to that is that I'm
not writing to any global data structures.  Which means what I'm doing
should scale to large machines without problem.

Eric

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [RFC] ps command race fix
@ 2006-07-25  6:47 Albert Cahalan
  0 siblings, 0 replies; 27+ messages in thread
From: Albert Cahalan @ 2006-07-25  6:47 UTC (permalink / raw)
  To: kamezawa.hiroyu, linux-kernel, ebiederm, pj, akpm

Andrew Morton writes:
> KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

>> Hi, this is an experimental patch for the probelm
>>      - "ps command can miss some pid occationally"
...
> So I think we're still seeking a solution to this.

We have a solution, subject to some bit rot I'm sure.
The cookie/cursor hack should have been rejected.
I'm still wondering why that ever got accepted.

Hugh had a patch set containing a tree-based replacement
for the PID handling. It worked perfectly, letting /proc
look up the lowest-not-under PID for any given PID.

(can anybody find the patch set?)

BTW, here's a WONTFIX resolved bug that places the date
for the patch set as being prior to 2005-05-21.
https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=158277

> Options might be:
>
> a) Pin the most-recently-visited task in some manner, so that it is
>    still on the global task list when we return.  That's fairly simple to
>    do (defer the release_task()) but it affects task lifetime and visibility
>    in rare and worrisome ways.

In state X perhaps?

> b) Change proc_pid_readdir() so that it walks the pid_hash[] array
>    instead of the task list.  Need to do something clever when traversing
>    each bucket's list, but I'm not sure what ;) It's the same problem.
>
>    Possibly what we could do here is to permit the task which is walking
>    /proc to pin a particular 'struct pid': take a ref on it then when we
>    next start walking one of the pid_hash[] chains, we _know_ that the
>    'struct pid' which we're looking for will still be there.  Even if it
>    now refers to a departed process.

Well, we have to pin something if we don't use a tree.

If I remember right the cookie/cursor thing was mostly intended
to solve real-time problems. Obviously it fails at this too.
(not that attackable hashes are acceptable for real-time!)

> c) Nuke the pid_hash[], convert the whole thing to a radix-tree.
>    They're super-simple to traverse.  Not sure what we'd index it by
>    though.

With the right kind of tree, you just look up by PID and get
back the nearest result which is not less than the desired one.
This is what the older patch set did.

I like trees. They have nice cache properties. Decent trees are
immune to being turned into linked lists via hash function attacks.
The non-crypto hashes in the kernel ought to worry people.

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2006-08-18  3:54 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-07-14 11:39 [RFC] ps command race fix KAMEZAWA Hiroyuki
2006-07-25  1:20 ` Andrew Morton
2006-07-25  1:48   ` Paul Jackson
2006-07-25  2:00     ` Andrew Morton
2006-07-25  2:08     ` KAMEZAWA Hiroyuki
2006-07-25  2:33       ` Andrew Morton
2006-07-25  2:50         ` KAMEZAWA Hiroyuki
2006-07-25  3:16           ` KAMEZAWA Hiroyuki
2006-08-13 16:29             ` Eric W. Biederman
2006-08-13 17:34               ` Andrew Morton
2006-08-13 19:00                 ` Eric W. Biederman
2006-08-13 19:12                   ` Paul Jackson
2006-08-16  1:23                     ` KAMEZAWA Hiroyuki
2006-08-17  4:59                       ` Eric W. Biederman
2006-08-17  6:32                         ` KAMEZAWA Hiroyuki
2006-08-17 13:39                           ` Eric W. Biederman
2006-08-17 18:16                             ` Jean Delvare
2006-08-18  0:21                             ` KAMEZAWA Hiroyuki
2006-08-18  3:53                           ` Eric W. Biederman
2006-08-13 20:08               ` Albert Cahalan
2006-08-16  2:20                 ` Kyle Moffett
2006-07-25  7:22         ` Paul Jackson
2006-07-25  1:53   ` KAMEZAWA Hiroyuki
2006-07-25  2:06     ` Andrew Morton
2006-07-25  2:34       ` KAMEZAWA Hiroyuki
2006-07-25  6:09   ` Eric W. Biederman
2006-07-25  6:47 Albert Cahalan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).